author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-11 08:27:49 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-11 08:27:49 +0000
commit     ace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch)
tree       b2d64bc10158fdd5497876388cd68142ca374ed3 /arch/x86/kvm
parent     Initial commit. (diff)
Adding upstream version 6.6.15. (tag: upstream/6.6.15)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--  arch/x86/kvm/.gitignore | 2
-rw-r--r--  arch/x86/kvm/Kconfig | 157
-rw-r--r--  arch/x86/kvm/Makefile | 50
-rw-r--r--  arch/x86/kvm/cpuid.c | 1573
-rw-r--r--  arch/x86/kvm/cpuid.h | 281
-rw-r--r--  arch/x86/kvm/debugfs.c | 196
-rw-r--r--  arch/x86/kvm/emulate.c | 5508
-rw-r--r--  arch/x86/kvm/fpu.h | 140
-rw-r--r--  arch/x86/kvm/governed_features.h | 21
-rw-r--r--  arch/x86/kvm/hyperv.c | 2866
-rw-r--r--  arch/x86/kvm/hyperv.h | 241
-rw-r--r--  arch/x86/kvm/i8254.c | 751
-rw-r--r--  arch/x86/kvm/i8254.h | 65
-rw-r--r--  arch/x86/kvm/i8259.c | 660
-rw-r--r--  arch/x86/kvm/ioapic.c | 776
-rw-r--r--  arch/x86/kvm/ioapic.h | 123
-rw-r--r--  arch/x86/kvm/irq.c | 171
-rw-r--r--  arch/x86/kvm/irq.h | 114
-rw-r--r--  arch/x86/kvm/irq_comm.c | 442
-rw-r--r--  arch/x86/kvm/kvm-asm-offsets.c | 29
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h | 231
-rw-r--r--  arch/x86/kvm/kvm_emulate.h | 541
-rw-r--r--  arch/x86/kvm/kvm_onhyperv.c | 124
-rw-r--r--  arch/x86/kvm/kvm_onhyperv.h | 24
-rw-r--r--  arch/x86/kvm/lapic.c | 3316
-rw-r--r--  arch/x86/kvm/lapic.h | 281
-rw-r--r--  arch/x86/kvm/mmu.h | 307
-rw-r--r--  arch/x86/kvm/mmu/mmu.c | 7161
-rw-r--r--  arch/x86/kvm/mmu/mmu_internal.h | 350
-rw-r--r--  arch/x86/kvm/mmu/mmutrace.h | 451
-rw-r--r--  arch/x86/kvm/mmu/page_track.c | 307
-rw-r--r--  arch/x86/kvm/mmu/page_track.h | 58
-rw-r--r--  arch/x86/kvm/mmu/paging_tmpl.h | 985
-rw-r--r--  arch/x86/kvm/mmu/spte.c | 517
-rw-r--r--  arch/x86/kvm/mmu/spte.h | 504
-rw-r--r--  arch/x86/kvm/mmu/tdp_iter.c | 177
-rw-r--r--  arch/x86/kvm/mmu/tdp_iter.h | 138
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.c | 1817
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.h | 78
-rw-r--r--  arch/x86/kvm/mtrr.c | 720
-rw-r--r--  arch/x86/kvm/pmu.c | 941
-rw-r--r--  arch/x86/kvm/pmu.h | 257
-rw-r--r--  arch/x86/kvm/reverse_cpuid.h | 224
-rw-r--r--  arch/x86/kvm/smm.c | 648
-rw-r--r--  arch/x86/kvm/smm.h | 168
-rw-r--r--  arch/x86/kvm/svm/avic.c | 1221
-rw-r--r--  arch/x86/kvm/svm/hyperv.c | 18
-rw-r--r--  arch/x86/kvm/svm/hyperv.h | 45
-rw-r--r--  arch/x86/kvm/svm/nested.c | 1818
-rw-r--r--  arch/x86/kvm/svm/pmu.c | 250
-rw-r--r--  arch/x86/kvm/svm/sev.c | 3140
-rw-r--r--  arch/x86/kvm/svm/svm.c | 5381
-rw-r--r--  arch/x86/kvm/svm/svm.h | 725
-rw-r--r--  arch/x86/kvm/svm/svm_onhyperv.c | 41
-rw-r--r--  arch/x86/kvm/svm/svm_onhyperv.h | 116
-rw-r--r--  arch/x86/kvm/svm/svm_ops.h | 64
-rw-r--r--  arch/x86/kvm/svm/vmenter.S | 390
-rw-r--r--  arch/x86/kvm/trace.h | 1842
-rw-r--r--  arch/x86/kvm/tss.h | 60
-rw-r--r--  arch/x86/kvm/vmx/capabilities.h | 404
-rw-r--r--  arch/x86/kvm/vmx/hyperv.c | 676
-rw-r--r--  arch/x86/kvm/vmx/hyperv.h | 200
-rw-r--r--  arch/x86/kvm/vmx/nested.c | 7102
-rw-r--r--  arch/x86/kvm/vmx/nested.h | 293
-rw-r--r--  arch/x86/kvm/vmx/pmu_intel.c | 788
-rw-r--r--  arch/x86/kvm/vmx/posted_intr.c | 353
-rw-r--r--  arch/x86/kvm/vmx/posted_intr.h | 106
-rw-r--r--  arch/x86/kvm/vmx/run_flags.h | 8
-rw-r--r--  arch/x86/kvm/vmx/sgx.c | 509
-rw-r--r--  arch/x86/kvm/vmx/sgx.h | 34
-rw-r--r--  arch/x86/kvm/vmx/vmcs.h | 193
-rw-r--r--  arch/x86/kvm/vmx/vmcs12.c | 155
-rw-r--r--  arch/x86/kvm/vmx/vmcs12.h | 429
-rw-r--r--  arch/x86/kvm/vmx/vmcs_shadow_fields.h | 79
-rw-r--r--  arch/x86/kvm/vmx/vmenter.S | 362
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 8712
-rw-r--r--  arch/x86/kvm/vmx/vmx.h | 758
-rw-r--r--  arch/x86/kvm/vmx/vmx_ops.h | 369
-rw-r--r--  arch/x86/kvm/x86.c | 13695
-rw-r--r--  arch/x86/kvm/x86.h | 543
-rw-r--r--  arch/x86/kvm/xen.c | 2129
-rw-r--r--  arch/x86/kvm/xen.h | 224
82 files changed, 87723 insertions, 0 deletions
diff --git a/arch/x86/kvm/.gitignore b/arch/x86/kvm/.gitignore
new file mode 100644
index 000000000..615d6ff35
--- /dev/null
+++ b/arch/x86/kvm/.gitignore
@@ -0,0 +1,2 @@
+/kvm-asm-offsets.s
+/kvm-asm-offsets.h
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
new file mode 100644
index 000000000..ed90f1481
--- /dev/null
+++ b/arch/x86/kvm/Kconfig
@@ -0,0 +1,157 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# KVM configuration
+#
+
+source "virt/kvm/Kconfig"
+
+menuconfig VIRTUALIZATION
+ bool "Virtualization"
+ depends on HAVE_KVM || X86
+ default y
+ help
+ Say Y here to get to see options for using your Linux host to run other
+ operating systems inside virtual machines (guests).
+ This option alone does not add any kernel code.
+
+ If you say N, all options in this submenu will be skipped and disabled.
+
+if VIRTUALIZATION
+
+config KVM
+ tristate "Kernel-based Virtual Machine (KVM) support"
+ depends on HAVE_KVM
+ depends on HIGH_RES_TIMERS
+ depends on X86_LOCAL_APIC
+ select PREEMPT_NOTIFIERS
+ select MMU_NOTIFIER
+ select HAVE_KVM_IRQCHIP
+ select HAVE_KVM_PFNCACHE
+ select HAVE_KVM_IRQFD
+ select HAVE_KVM_DIRTY_RING_TSO
+ select HAVE_KVM_DIRTY_RING_ACQ_REL
+ select IRQ_BYPASS_MANAGER
+ select HAVE_KVM_IRQ_BYPASS
+ select HAVE_KVM_IRQ_ROUTING
+ select HAVE_KVM_EVENTFD
+ select KVM_ASYNC_PF
+ select USER_RETURN_NOTIFIER
+ select KVM_MMIO
+ select SCHED_INFO
+ select PERF_EVENTS
+ select GUEST_PERF_EVENTS
+ select HAVE_KVM_MSI
+ select HAVE_KVM_CPU_RELAX_INTERCEPT
+ select HAVE_KVM_NO_POLL
+ select KVM_XFER_TO_GUEST_WORK
+ select KVM_GENERIC_DIRTYLOG_READ_PROTECT
+ select KVM_VFIO
+ select INTERVAL_TREE
+ select HAVE_KVM_PM_NOTIFIER if PM
+ select KVM_GENERIC_HARDWARE_ENABLING
+ help
+ Support hosting fully virtualized guest machines using hardware
+ virtualization extensions. You will need a fairly recent
+ processor equipped with virtualization extensions. You will also
+ need to select one or more of the processor modules below.
+
+ This module provides access to the hardware capabilities through
+ a character device node named /dev/kvm.
+
+ To compile this as a module, choose M here: the module
+ will be called kvm.
+
+ If unsure, say N.
+
+config KVM_WERROR
+ bool "Compile KVM with -Werror"
+ # KASAN may cause the build to fail due to larger frames
+ default y if X86_64 && !KASAN
+	# The dependency on !COMPILE_TEST keeps this from being enabled
+	# blindly in allmodconfig or allyesconfig configurations
+ depends on KVM
+ depends on (X86_64 && !KASAN) || !COMPILE_TEST
+ depends on EXPERT
+ help
+ Add -Werror to the build flags for KVM.
+
+ If in doubt, say "N".
+
+config KVM_INTEL
+ tristate "KVM for Intel (and compatible) processors support"
+ depends on KVM && IA32_FEAT_CTL
+ help
+ Provides support for KVM on processors equipped with Intel's VT
+ extensions, a.k.a. Virtual Machine Extensions (VMX).
+
+ To compile this as a module, choose M here: the module
+ will be called kvm-intel.
+
+config X86_SGX_KVM
+ bool "Software Guard eXtensions (SGX) Virtualization"
+ depends on X86_SGX && KVM_INTEL
+ help
+
+ Enables KVM guests to create SGX enclaves.
+
+ This includes support to expose "raw" unreclaimable enclave memory to
+ guests via a device node, e.g. /dev/sgx_vepc.
+
+ If unsure, say N.
+
+config KVM_AMD
+ tristate "KVM for AMD processors support"
+ depends on KVM && (CPU_SUP_AMD || CPU_SUP_HYGON)
+ help
+ Provides support for KVM on AMD processors equipped with the AMD-V
+ (SVM) extensions.
+
+ To compile this as a module, choose M here: the module
+ will be called kvm-amd.
+
+config KVM_AMD_SEV
+ def_bool y
+ bool "AMD Secure Encrypted Virtualization (SEV) support"
+ depends on KVM_AMD && X86_64
+ depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
+ help
+ Provides support for launching Encrypted VMs (SEV) and Encrypted VMs
+ with Encrypted State (SEV-ES) on AMD processors.
+
+config KVM_SMM
+ bool "System Management Mode emulation"
+ default y
+ depends on KVM
+ help
+ Provides support for KVM to emulate System Management Mode (SMM)
+ in virtual machines. This can be used by the virtual machine
+ firmware to implement UEFI secure boot.
+
+ If unsure, say Y.
+
+config KVM_XEN
+ bool "Support for Xen hypercall interface"
+ depends on KVM
+ help
+	  Provides KVM support for hosting Xen HVM guests and for
+ passing Xen hypercalls to userspace.
+
+ If in doubt, say "N".
+
+config KVM_PROVE_MMU
+ bool "Prove KVM MMU correctness"
+ depends on DEBUG_KERNEL
+ depends on KVM
+ depends on EXPERT
+ help
+ Enables runtime assertions in KVM's MMU that are too costly to enable
+ in anything remotely resembling a production environment, e.g. this
+ gates code that verifies a to-be-freed page table doesn't have any
+ present SPTEs.
+
+ If in doubt, say "N".
+
+config KVM_EXTERNAL_WRITE_TRACKING
+ bool
+
+endif # VIRTUALIZATION
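
With CONFIG_KVM enabled, everything above is reached through the /dev/kvm
character device mentioned in the KVM help text. A minimal user-space probe,
sketched here purely as an illustration (it assumes a Linux host with the
linux/kvm.h UAPI header; it is not part of this patch):

	#include <fcntl.h>
	#include <linux/kvm.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	int main(void)
	{
		int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);

		if (kvm < 0) {
			perror("open /dev/kvm");  /* KVM off or module not loaded */
			return 1;
		}

		/* The stable KVM API reports version 12 on all modern kernels. */
		printf("KVM API version: %d\n", ioctl(kvm, KVM_GET_API_VERSION, 0));

		close(kvm);
		return 0;
	}
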
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
new file mode 100644
index 000000000..80e3fe184
--- /dev/null
+++ b/arch/x86/kvm/Makefile
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: GPL-2.0
+
+ccflags-y += -I $(srctree)/arch/x86/kvm
+ccflags-$(CONFIG_KVM_WERROR) += -Werror
+
+ifeq ($(CONFIG_FRAME_POINTER),y)
+OBJECT_FILES_NON_STANDARD_vmenter.o := y
+endif
+
+include $(srctree)/virt/kvm/Makefile.kvm
+
+kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
+ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
+ hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \
+ mmu/spte.o
+
+ifdef CONFIG_HYPERV
+kvm-y += kvm_onhyperv.o
+endif
+
+kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
+kvm-$(CONFIG_KVM_XEN) += xen.o
+kvm-$(CONFIG_KVM_SMM) += smm.o
+
+kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
+ vmx/hyperv.o vmx/nested.o vmx/posted_intr.o
+kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o
+
+kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o \
+ svm/sev.o svm/hyperv.o
+
+ifdef CONFIG_HYPERV
+kvm-amd-y += svm/svm_onhyperv.o
+endif
+
+obj-$(CONFIG_KVM) += kvm.o
+obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
+obj-$(CONFIG_KVM_AMD) += kvm-amd.o
+
+AFLAGS_svm/vmenter.o := -iquote $(obj)
+$(obj)/svm/vmenter.o: $(obj)/kvm-asm-offsets.h
+
+AFLAGS_vmx/vmenter.o := -iquote $(obj)
+$(obj)/vmx/vmenter.o: $(obj)/kvm-asm-offsets.h
+
+$(obj)/kvm-asm-offsets.h: $(obj)/kvm-asm-offsets.s FORCE
+ $(call filechk,offsets,__KVM_ASM_OFFSETS_H__)
+
+targets += kvm-asm-offsets.s
+clean-files += kvm-asm-offsets.h
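
The last four rules reuse the kernel's asm-offsets technique:
kvm-asm-offsets.c (added by this patch, see the diffstat) is compiled to
assembly, and filechk,offsets scrapes the OFFSET()/BLANK() markers out of the
.s file to generate kvm-asm-offsets.h for svm/vmenter.S and vmx/vmenter.S. A
sketch of the pattern, with an illustrative struct and symbol names rather
than the ones the real file emits:

	#include <linux/kbuild.h>

	struct example_vcpu {
		unsigned long regs[16];
		unsigned long spec_ctrl;
	};

	static void __used common(void)
	{
		/* OFFSET() emits an "->SYM <offset>" marker into the generated
		 * assembly; filechk,offsets rewrites each marker as a #define
		 * that the vmenter.S files can use. */
		OFFSET(EXAMPLE_vcpu_regs, example_vcpu, regs);
		BLANK();
		OFFSET(EXAMPLE_vcpu_spec_ctrl, example_vcpu, spec_ctrl);
	}
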
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
new file mode 100644
index 000000000..773132c3b
--- /dev/null
+++ b/arch/x86/kvm/cpuid.c
@@ -0,0 +1,1573 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ * cpuid support routines
+ *
+ * derived from arch/x86/kvm/x86.c
+ *
+ * Copyright 2011 Red Hat, Inc. and/or its affiliates.
+ * Copyright IBM Corporation, 2008
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include "linux/lockdep.h"
+#include <linux/export.h>
+#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
+#include <linux/sched/stat.h>
+
+#include <asm/processor.h>
+#include <asm/user.h>
+#include <asm/fpu/xstate.h>
+#include <asm/sgx.h>
+#include <asm/cpuid.h>
+#include "cpuid.h"
+#include "lapic.h"
+#include "mmu.h"
+#include "trace.h"
+#include "pmu.h"
+#include "xen.h"
+
+/*
+ * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be
+ * aligned to sizeof(unsigned long) because it's not accessed via bitops.
+ */
+u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
+EXPORT_SYMBOL_GPL(kvm_cpu_caps);
+
+u32 xstate_required_size(u64 xstate_bv, bool compacted)
+{
+ int feature_bit = 0;
+ u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
+
+ xstate_bv &= XFEATURE_MASK_EXTEND;
+ while (xstate_bv) {
+ if (xstate_bv & 0x1) {
+ u32 eax, ebx, ecx, edx, offset;
+ cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx);
+ /* ECX[1]: 64B alignment in compacted form */
+ if (compacted)
+ offset = (ecx & 0x2) ? ALIGN(ret, 64) : ret;
+ else
+ offset = ebx;
+ ret = max(ret, offset + eax);
+ }
+
+ xstate_bv >>= 1;
+ feature_bit++;
+ }
+
+ return ret;
+}
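
For the standard (non-compacted) format, xstate_required_size() reproduces a
number the hardware also reports directly: CPUID.0xD.0:EBX is the XSAVE size
for whatever is currently enabled in XCR0. A user-space sketch of that
shortcut, assuming a GCC/Clang toolchain with <cpuid.h> (illustrative only):

	#include <cpuid.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		/* CPUID.0xD, subleaf 0: EBX = standard-format XSAVE area size
		 * for the features currently enabled in XCR0. */
		__cpuid_count(0xD, 0, eax, ebx, ecx, edx);
		printf("XSAVE area size: %u bytes\n", ebx);
		return 0;
	}
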
+
+#define F feature_bit
+
+/* Scattered Flag - For features that are scattered by cpufeatures.h. */
+#define SF(name) \
+({ \
+ BUILD_BUG_ON(X86_FEATURE_##name >= MAX_CPU_FEATURES); \
+ (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0); \
+})
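
F() reduces to the feature's bit position within its 32-bit CPUID register,
which is why masks built from F() can be ANDed directly against raw CPUID
output. Roughly (the real definitions live in reverse_cpuid.h in this series):

	#define __feature_bit(x86_feature)	BIT((x86_feature) & 31)
	#define feature_bit(name)		__feature_bit(X86_FEATURE_##name)

For example F(XSAVE), with X86_FEATURE_XSAVE defined as word 4, bit 26,
becomes BIT(26), i.e. exactly CPUID.1:ECX[26].
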
+
+/*
+ * Magic value used by KVM when querying userspace-provided CPUID entries and
+ * doesn't care about the CPUID index because the index of the function in
+ * question is not significant. Note, this magic value must have at least one
+ * bit set in bits[63:32] and must be consumed as a u64 by cpuid_entry2_find()
+ * to avoid false positives when processing guest CPUID input.
+ */
+#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull
+
+static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
+ struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index)
+{
+ struct kvm_cpuid_entry2 *e;
+ int i;
+
+ /*
+ * KVM has a semi-arbitrary rule that querying the guest's CPUID model
+ * with IRQs disabled is disallowed. The CPUID model can legitimately
+ * have over one hundred entries, i.e. the lookup is slow, and IRQs are
+ * typically disabled in KVM only when KVM is in a performance critical
+ * path, e.g. the core VM-Enter/VM-Exit run loop. Nothing will break
+	 * if this rule is violated; the assertion is purely to flag potential
+ * performance issues. If this fires, consider moving the lookup out
+ * of the hotpath, e.g. by caching information during CPUID updates.
+ */
+ lockdep_assert_irqs_enabled();
+
+ for (i = 0; i < nent; i++) {
+ e = &entries[i];
+
+ if (e->function != function)
+ continue;
+
+ /*
+ * If the index isn't significant, use the first entry with a
+		 * matching function. It's userspace's responsibility to not
+ * provide "duplicate" entries in all cases.
+ */
+ if (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index)
+ return e;
+
+ /*
+ * Similarly, use the first matching entry if KVM is doing a
+ * lookup (as opposed to emulating CPUID) for a function that's
+ * architecturally defined as not having a significant index.
+ */
+ if (index == KVM_CPUID_INDEX_NOT_SIGNIFICANT) {
+ /*
+ * Direct lookups from KVM should not diverge from what
+ * KVM defines internally (the architectural behavior).
+ */
+ WARN_ON_ONCE(cpuid_function_is_indexed(function));
+ return e;
+ }
+ }
+
+ return NULL;
+}
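
The u64 trick described above is easy to see in isolation: a guest-supplied
ECX is only 32 bits wide, so after zero-extension it can never equal a magic
value that has bits[63:32] set. A standalone sketch (illustrative only):

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t magic = -1ull;		 /* KVM_CPUID_INDEX_NOT_SIGNIFICANT */
		uint32_t guest_ecx = 0xffffffff; /* worst-case guest index */

		/* guest_ecx zero-extends to 0x00000000ffffffff, so even an
		 * all-ones guest index cannot collide with the magic value. */
		assert((uint64_t)guest_ecx != magic);
		return 0;
	}
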
+
+static int kvm_check_cpuid(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid_entry2 *entries,
+ int nent)
+{
+ struct kvm_cpuid_entry2 *best;
+ u64 xfeatures;
+
+ /*
+ * The existing code assumes virtual address is 48-bit or 57-bit in the
+ * canonical address checks; exit if it is ever changed.
+ */
+ best = cpuid_entry2_find(entries, nent, 0x80000008,
+ KVM_CPUID_INDEX_NOT_SIGNIFICANT);
+ if (best) {
+ int vaddr_bits = (best->eax & 0xff00) >> 8;
+
+ if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0)
+ return -EINVAL;
+ }
+
+ /*
+ * Exposing dynamic xfeatures to the guest requires additional
+ * enabling in the FPU, e.g. to expand the guest XSAVE state size.
+ */
+ best = cpuid_entry2_find(entries, nent, 0xd, 0);
+ if (!best)
+ return 0;
+
+ xfeatures = best->eax | ((u64)best->edx << 32);
+ xfeatures &= XFEATURE_MASK_USER_DYNAMIC;
+ if (!xfeatures)
+ return 0;
+
+ return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures);
+}
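
The accepted values correspond to 4-level paging (48 virtual address bits)
and 5-level LA57 paging (57 bits), both read from CPUID.0x80000008:EAX. The
same decoding works from user space; a sketch with GCC/Clang's <cpuid.h>
(illustrative only):

	#include <cpuid.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		__cpuid(0x80000008, eax, ebx, ecx, edx);
		printf("physical address bits: %u\n", eax & 0xff);        /* EAX[7:0]  */
		printf("virtual address bits:  %u\n", (eax >> 8) & 0xff); /* EAX[15:8] */
		return 0;
	}
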
+
+/* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */
+static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
+ int nent)
+{
+ struct kvm_cpuid_entry2 *orig;
+ int i;
+
+ if (nent != vcpu->arch.cpuid_nent)
+ return -EINVAL;
+
+ for (i = 0; i < nent; i++) {
+ orig = &vcpu->arch.cpuid_entries[i];
+ if (e2[i].function != orig->function ||
+ e2[i].index != orig->index ||
+ e2[i].flags != orig->flags ||
+ e2[i].eax != orig->eax || e2[i].ebx != orig->ebx ||
+ e2[i].ecx != orig->ecx || e2[i].edx != orig->edx)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu,
+ const char *sig)
+{
+ struct kvm_hypervisor_cpuid cpuid = {};
+ struct kvm_cpuid_entry2 *entry;
+ u32 base;
+
+ for_each_possible_hypervisor_cpuid_base(base) {
+ entry = kvm_find_cpuid_entry(vcpu, base);
+
+ if (entry) {
+ u32 signature[3];
+
+ signature[0] = entry->ebx;
+ signature[1] = entry->ecx;
+ signature[2] = entry->edx;
+
+ if (!memcmp(signature, sig, sizeof(signature))) {
+ cpuid.base = base;
+ cpuid.limit = entry->eax;
+ break;
+ }
+ }
+ }
+
+ return cpuid;
+}
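
kvm_get_hypervisor_cpuid() is the host-side mirror of the standard guest
detection sequence: a hypervisor advertises a 12-byte signature in
EBX/ECX/EDX of its base leaf, and KVM's is "KVMKVMKVM\0\0\0" at 0x40000000.
A guest-side sketch (illustrative; a careful guest first checks the
hypervisor-present bit, CPUID.1:ECX[31]):

	#include <cpuid.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		unsigned int eax, sig[3];

		__cpuid(0x40000000, eax, sig[0], sig[1], sig[2]);
		if (!memcmp(sig, "KVMKVMKVM\0\0\0", 12))
			printf("KVM detected, max hypervisor leaf 0x%x\n", eax);
		else
			printf("not KVM (or no hypervisor present)\n");
		return 0;
	}
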
+
+static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid_entry2 *entries, int nent)
+{
+ u32 base = vcpu->arch.kvm_cpuid.base;
+
+ if (!base)
+ return NULL;
+
+ return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES,
+ KVM_CPUID_INDEX_NOT_SIGNIFICANT);
+}
+
+static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
+{
+ return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries,
+ vcpu->arch.cpuid_nent);
+}
+
+void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best = kvm_find_kvm_cpuid_features(vcpu);
+
+ /*
+ * save the feature bitmap to avoid cpuid lookup for every PV
+ * operation
+ */
+ if (best)
+ vcpu->arch.pv_cpuid.features = best->eax;
+}
+
+/*
+ * Calculate guest's supported XCR0 taking into account guest CPUID data and
+ * KVM's supported XCR0 (comprised of host's XCR0 and KVM_SUPPORTED_XCR0).
+ */
+static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = cpuid_entry2_find(entries, nent, 0xd, 0);
+ if (!best)
+ return 0;
+
+ return (best->eax | ((u64)best->edx << 32)) & kvm_caps.supported_xcr0;
+}
+
+static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries,
+ int nent)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
+ if (best) {
+ /* Update OSXSAVE bit */
+ if (boot_cpu_has(X86_FEATURE_XSAVE))
+ cpuid_entry_change(best, X86_FEATURE_OSXSAVE,
+ kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE));
+
+ cpuid_entry_change(best, X86_FEATURE_APIC,
+ vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
+ }
+
+ best = cpuid_entry2_find(entries, nent, 7, 0);
+ if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
+ cpuid_entry_change(best, X86_FEATURE_OSPKE,
+ kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE));
+
+ best = cpuid_entry2_find(entries, nent, 0xD, 0);
+ if (best)
+ best->ebx = xstate_required_size(vcpu->arch.xcr0, false);
+
+ best = cpuid_entry2_find(entries, nent, 0xD, 1);
+ if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
+ cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
+ best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
+
+ best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent);
+ if (kvm_hlt_in_guest(vcpu->kvm) && best &&
+ (best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
+ best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
+
+ if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
+ best = cpuid_entry2_find(entries, nent, 0x1, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
+ if (best)
+ cpuid_entry_change(best, X86_FEATURE_MWAIT,
+ vcpu->arch.ia32_misc_enable_msr &
+ MSR_IA32_MISC_ENABLE_MWAIT);
+ }
+}
+
+void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
+{
+ __kvm_update_cpuid_runtime(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
+}
+EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);
+
+static bool kvm_cpuid_has_hyperv(struct kvm_cpuid_entry2 *entries, int nent)
+{
+ struct kvm_cpuid_entry2 *entry;
+
+ entry = cpuid_entry2_find(entries, nent, HYPERV_CPUID_INTERFACE,
+ KVM_CPUID_INDEX_NOT_SIGNIFICANT);
+ return entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX;
+}
+
+static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ struct kvm_cpuid_entry2 *best;
+ bool allow_gbpages;
+
+ BUILD_BUG_ON(KVM_NR_GOVERNED_FEATURES > KVM_MAX_NR_GOVERNED_FEATURES);
+ bitmap_zero(vcpu->arch.governed_features.enabled,
+ KVM_MAX_NR_GOVERNED_FEATURES);
+
+ /*
+ * If TDP is enabled, let the guest use GBPAGES if they're supported in
+ * hardware. The hardware page walker doesn't let KVM disable GBPAGES,
+ * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
+ * walk for performance and complexity reasons. Not to mention KVM
+ * _can't_ solve the problem because GVA->GPA walks aren't visible to
+ * KVM once a TDP translation is installed. Mimic hardware behavior so
+ * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
+ * If TDP is disabled, honor *only* guest CPUID as KVM has full control
+ * and can install smaller shadow pages if the host lacks 1GiB support.
+ */
+ allow_gbpages = tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
+ guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
+ if (allow_gbpages)
+ kvm_governed_feature_set(vcpu, X86_FEATURE_GBPAGES);
+
+ best = kvm_find_cpuid_entry(vcpu, 1);
+ if (best && apic) {
+ if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER))
+ apic->lapic_timer.timer_mode_mask = 3 << 17;
+ else
+ apic->lapic_timer.timer_mode_mask = 1 << 17;
+
+ kvm_apic_set_version(vcpu);
+ }
+
+ vcpu->arch.guest_supported_xcr0 =
+ cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
+
+ kvm_update_pv_runtime(vcpu);
+
+ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+ vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
+
+ kvm_pmu_refresh(vcpu);
+ vcpu->arch.cr4_guest_rsvd_bits =
+ __cr4_reserved_bits(guest_cpuid_has, vcpu);
+
+ kvm_hv_set_cpuid(vcpu, kvm_cpuid_has_hyperv(vcpu->arch.cpuid_entries,
+ vcpu->arch.cpuid_nent));
+
+ /* Invoke the vendor callback only after the above state is updated. */
+ static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu);
+
+ /*
+	 * Except for the MMU, which needs to do its thing after any vendor
+	 * specific adjustments to the reserved GPA bits.
+ */
+ kvm_mmu_after_set_cpuid(vcpu);
+}
+
+int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x80000000);
+ if (!best || best->eax < 0x80000008)
+ goto not_found;
+ best = kvm_find_cpuid_entry(vcpu, 0x80000008);
+ if (best)
+ return best->eax & 0xff;
+not_found:
+ return 36;
+}
+
+/*
+ * This "raw" version returns the reserved GPA bits without any adjustments for
+ * encryption technologies that usurp bits. The raw mask should be used if and
+ * only if hardware does _not_ strip the usurped bits, e.g. in virtual MTRRs.
+ */
+u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu)
+{
+ return rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
+}
+
+static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
+ int nent)
+{
+ int r;
+
+ __kvm_update_cpuid_runtime(vcpu, e2, nent);
+
+ /*
+ * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
+ * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
+ * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
+ * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
+ * the core vCPU model on the fly. It would've been better to forbid any
+ * KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately
+ * some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do
+ * KVM_SET_CPUID{,2} again. To support this legacy behavior, check
+ * whether the supplied CPUID data is equal to what's already set.
+ */
+ if (kvm_vcpu_has_run(vcpu)) {
+ r = kvm_cpuid_check_equal(vcpu, e2, nent);
+ if (r)
+ return r;
+
+ kvfree(e2);
+ return 0;
+ }
+
+ if (kvm_cpuid_has_hyperv(e2, nent)) {
+ r = kvm_hv_vcpu_init(vcpu);
+ if (r)
+ return r;
+ }
+
+ r = kvm_check_cpuid(vcpu, e2, nent);
+ if (r)
+ return r;
+
+ kvfree(vcpu->arch.cpuid_entries);
+ vcpu->arch.cpuid_entries = e2;
+ vcpu->arch.cpuid_nent = nent;
+
+ vcpu->arch.kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE);
+ vcpu->arch.xen.cpuid = kvm_get_hypervisor_cpuid(vcpu, XEN_SIGNATURE);
+ kvm_vcpu_after_set_cpuid(vcpu);
+
+ return 0;
+}
+
+/*
+ * Legacy path: an old userspace process feeding the pre-kvm_cpuid_entry2
+ * format to a new kernel module.
+ */
+int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid *cpuid,
+ struct kvm_cpuid_entry __user *entries)
+{
+ int r, i;
+ struct kvm_cpuid_entry *e = NULL;
+ struct kvm_cpuid_entry2 *e2 = NULL;
+
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ return -E2BIG;
+
+ if (cpuid->nent) {
+ e = vmemdup_user(entries, array_size(sizeof(*e), cpuid->nent));
+ if (IS_ERR(e))
+ return PTR_ERR(e);
+
+ e2 = kvmalloc_array(cpuid->nent, sizeof(*e2), GFP_KERNEL_ACCOUNT);
+ if (!e2) {
+ r = -ENOMEM;
+ goto out_free_cpuid;
+ }
+ }
+ for (i = 0; i < cpuid->nent; i++) {
+ e2[i].function = e[i].function;
+ e2[i].eax = e[i].eax;
+ e2[i].ebx = e[i].ebx;
+ e2[i].ecx = e[i].ecx;
+ e2[i].edx = e[i].edx;
+ e2[i].index = 0;
+ e2[i].flags = 0;
+ e2[i].padding[0] = 0;
+ e2[i].padding[1] = 0;
+ e2[i].padding[2] = 0;
+ }
+
+ r = kvm_set_cpuid(vcpu, e2, cpuid->nent);
+ if (r)
+ kvfree(e2);
+
+out_free_cpuid:
+ kvfree(e);
+
+ return r;
+}
+
+int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
+{
+ struct kvm_cpuid_entry2 *e2 = NULL;
+ int r;
+
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ return -E2BIG;
+
+ if (cpuid->nent) {
+ e2 = vmemdup_user(entries, array_size(sizeof(*e2), cpuid->nent));
+ if (IS_ERR(e2))
+ return PTR_ERR(e2);
+ }
+
+ r = kvm_set_cpuid(vcpu, e2, cpuid->nent);
+ if (r)
+ kvfree(e2);
+
+ return r;
+}
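
From the VMM side the usual sequence is: query KVM's supported CPUID on the
/dev/kvm fd, edit the entries, then program each vCPU before its first
KVM_RUN (per the check-equal rule in kvm_set_cpuid() above). A sketch with
error handling elided (illustrative; not the only valid flow):

	#include <linux/kvm.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>

	#define MAX_ENT 256	/* generous bound for this sketch */

	static void program_vcpu_cpuid(int kvm_fd, int vcpu_fd)
	{
		struct kvm_cpuid2 *cpuid;

		cpuid = calloc(1, sizeof(*cpuid) +
				  MAX_ENT * sizeof(struct kvm_cpuid_entry2));
		cpuid->nent = MAX_ENT;

		/* System-scoped ioctl; KVM trims nent to what it filled in. */
		ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid);

		/* A real VMM edits entries here, e.g. APIC IDs in leaf 0xb. */

		/* vCPU-scoped ioctl; must precede the first KVM_RUN. */
		ioctl(vcpu_fd, KVM_SET_CPUID2, cpuid);
		free(cpuid);
	}
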
+
+int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
+{
+ if (cpuid->nent < vcpu->arch.cpuid_nent)
+ return -E2BIG;
+
+ if (copy_to_user(entries, vcpu->arch.cpuid_entries,
+ vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
+ return -EFAULT;
+
+ cpuid->nent = vcpu->arch.cpuid_nent;
+ return 0;
+}
+
+/* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */
+static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf)
+{
+ const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32);
+ struct kvm_cpuid_entry2 entry;
+
+ reverse_cpuid_check(leaf);
+
+ cpuid_count(cpuid.function, cpuid.index,
+ &entry.eax, &entry.ebx, &entry.ecx, &entry.edx);
+
+ kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg);
+}
+
+static __always_inline
+void kvm_cpu_cap_init_kvm_defined(enum kvm_only_cpuid_leafs leaf, u32 mask)
+{
+ /* Use kvm_cpu_cap_mask for leafs that aren't KVM-only. */
+ BUILD_BUG_ON(leaf < NCAPINTS);
+
+ kvm_cpu_caps[leaf] = mask;
+
+ __kvm_cpu_cap_mask(leaf);
+}
+
+static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
+{
+ /* Use kvm_cpu_cap_init_kvm_defined for KVM-only leafs. */
+ BUILD_BUG_ON(leaf >= NCAPINTS);
+
+ kvm_cpu_caps[leaf] &= mask;
+
+ __kvm_cpu_cap_mask(leaf);
+}
+
+void kvm_set_cpu_caps(void)
+{
+#ifdef CONFIG_X86_64
+ unsigned int f_gbpages = F(GBPAGES);
+ unsigned int f_lm = F(LM);
+ unsigned int f_xfd = F(XFD);
+#else
+ unsigned int f_gbpages = 0;
+ unsigned int f_lm = 0;
+ unsigned int f_xfd = 0;
+#endif
+ memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps));
+
+ BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) >
+ sizeof(boot_cpu_data.x86_capability));
+
+ memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability,
+ sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)));
+
+ kvm_cpu_cap_mask(CPUID_1_ECX,
+ /*
+ * NOTE: MONITOR (and MWAIT) are emulated as NOP, but *not*
+ * advertised to guests via CPUID!
+ */
+ F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
+ 0 /* DS-CPL, VMX, SMX, EST */ |
+ 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
+ F(FMA) | F(CX16) | 0 /* xTPR Update */ | F(PDCM) |
+ F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
+ F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
+ 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
+ F(F16C) | F(RDRAND)
+ );
+ /* KVM emulates x2apic in software irrespective of host support. */
+ kvm_cpu_cap_set(X86_FEATURE_X2APIC);
+
+ kvm_cpu_cap_mask(CPUID_1_EDX,
+ F(FPU) | F(VME) | F(DE) | F(PSE) |
+ F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+ F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
+ F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+ F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) |
+ 0 /* Reserved, DS, ACPI */ | F(MMX) |
+ F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
+ 0 /* HTT, TM, Reserved, PBE */
+ );
+
+ kvm_cpu_cap_mask(CPUID_7_0_EBX,
+ F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) |
+ F(FDP_EXCPTN_ONLY) | F(SMEP) | F(BMI2) | F(ERMS) | F(INVPCID) |
+ F(RTM) | F(ZERO_FCS_FDS) | 0 /*MPX*/ | F(AVX512F) |
+ F(AVX512DQ) | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) |
+ F(CLFLUSHOPT) | F(CLWB) | 0 /*INTEL_PT*/ | F(AVX512PF) |
+ F(AVX512ER) | F(AVX512CD) | F(SHA_NI) | F(AVX512BW) |
+ F(AVX512VL));
+
+ kvm_cpu_cap_mask(CPUID_7_ECX,
+ F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
+ F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
+ F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
+ F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
+ F(SGX_LC) | F(BUS_LOCK_DETECT)
+ );
+ /* Set LA57 based on hardware capability. */
+ if (cpuid_ecx(7) & F(LA57))
+ kvm_cpu_cap_set(X86_FEATURE_LA57);
+
+ /*
+ * PKU not yet implemented for shadow paging and requires OSPKE
+ * to be set on the host. Clear it if that is not the case
+ */
+ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
+ kvm_cpu_cap_clear(X86_FEATURE_PKU);
+
+ kvm_cpu_cap_mask(CPUID_7_EDX,
+ F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
+ F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
+ F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
+ F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) |
+ F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D)
+ );
+
+ /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
+ kvm_cpu_cap_set(X86_FEATURE_TSC_ADJUST);
+ kvm_cpu_cap_set(X86_FEATURE_ARCH_CAPABILITIES);
+
+ if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS))
+ kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL);
+ if (boot_cpu_has(X86_FEATURE_STIBP))
+ kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP);
+ if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
+
+ kvm_cpu_cap_mask(CPUID_7_1_EAX,
+ F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) |
+ F(FZRM) | F(FSRS) | F(FSRC) |
+ F(AMX_FP16) | F(AVX_IFMA)
+ );
+
+ kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX,
+ F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) |
+ F(AMX_COMPLEX)
+ );
+
+ kvm_cpu_cap_mask(CPUID_D_1_EAX,
+ F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd
+ );
+
+ kvm_cpu_cap_init_kvm_defined(CPUID_12_EAX,
+ SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA)
+ );
+
+ kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
+ F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
+ F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
+ F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
+ 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) |
+ F(TOPOEXT) | 0 /* PERFCTR_CORE */
+ );
+
+ kvm_cpu_cap_mask(CPUID_8000_0001_EDX,
+ F(FPU) | F(VME) | F(DE) | F(PSE) |
+ F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+ F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
+ F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+ F(PAT) | F(PSE36) | 0 /* Reserved */ |
+ F(NX) | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
+ F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) |
+ 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW)
+ );
+
+ if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64))
+ kvm_cpu_cap_set(X86_FEATURE_GBPAGES);
+
+ kvm_cpu_cap_init_kvm_defined(CPUID_8000_0007_EDX,
+ SF(CONSTANT_TSC)
+ );
+
+ kvm_cpu_cap_mask(CPUID_8000_0008_EBX,
+ F(CLZERO) | F(XSAVEERPTR) |
+ F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
+ F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) |
+ F(AMD_PSFD)
+ );
+
+ /*
+ * AMD has separate bits for each SPEC_CTRL bit.
+ * arch/x86/kernel/cpu/bugs.c is kind enough to
+ * record that in cpufeatures so use them.
+ */
+ if (boot_cpu_has(X86_FEATURE_IBPB))
+ kvm_cpu_cap_set(X86_FEATURE_AMD_IBPB);
+ if (boot_cpu_has(X86_FEATURE_IBRS))
+ kvm_cpu_cap_set(X86_FEATURE_AMD_IBRS);
+ if (boot_cpu_has(X86_FEATURE_STIBP))
+ kvm_cpu_cap_set(X86_FEATURE_AMD_STIBP);
+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
+ kvm_cpu_cap_set(X86_FEATURE_AMD_SSBD);
+ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ kvm_cpu_cap_set(X86_FEATURE_AMD_SSB_NO);
+ /*
+ * The preference is to use SPEC CTRL MSR instead of the
+ * VIRT_SPEC MSR.
+ */
+ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
+ !boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
+
+ /*
+ * Hide all SVM features by default, SVM will set the cap bits for
+ * features it emulates and/or exposes for L1.
+ */
+ kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0);
+
+ kvm_cpu_cap_mask(CPUID_8000_001F_EAX,
+ 0 /* SME */ | F(SEV) | 0 /* VM_PAGE_FLUSH */ | F(SEV_ES) |
+ F(SME_COHERENT));
+
+ kvm_cpu_cap_mask(CPUID_8000_0021_EAX,
+ F(NO_NESTED_DATA_BP) | F(LFENCE_RDTSC) | 0 /* SmmPgCfgLock */ |
+ F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */
+ );
+
+ if (cpu_feature_enabled(X86_FEATURE_SRSO_NO))
+ kvm_cpu_cap_set(X86_FEATURE_SRSO_NO);
+
+ kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX,
+ F(PERFMON_V2)
+ );
+
+ /*
+ * Synthesize "LFENCE is serializing" into the AMD-defined entry in
+ * KVM's supported CPUID if the feature is reported as supported by the
+ * kernel. LFENCE_RDTSC was a Linux-defined synthetic feature long
+ * before AMD joined the bandwagon, e.g. LFENCE is serializing on most
+ * CPUs that support SSE2. On CPUs that don't support AMD's leaf,
+ * kvm_cpu_cap_mask() will unfortunately drop the flag due to ANDing
+ * the mask with the raw host CPUID, and reporting support in AMD's
+ * leaf can make it easier for userspace to detect the feature.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
+ kvm_cpu_cap_set(X86_FEATURE_LFENCE_RDTSC);
+ if (!static_cpu_has_bug(X86_BUG_NULL_SEG))
+ kvm_cpu_cap_set(X86_FEATURE_NULL_SEL_CLR_BASE);
+ kvm_cpu_cap_set(X86_FEATURE_NO_SMM_CTL_MSR);
+
+ kvm_cpu_cap_mask(CPUID_C000_0001_EDX,
+ F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
+ F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
+ F(PMM) | F(PMM_EN)
+ );
+
+ /*
+ * Hide RDTSCP and RDPID if either feature is reported as supported but
+ * probing MSR_TSC_AUX failed. This is purely a sanity check and
+ * should never happen, but the guest will likely crash if RDTSCP or
+ * RDPID is misreported, and KVM has botched MSR_TSC_AUX emulation in
+ * the past. For example, the sanity check may fire if this instance of
+ * KVM is running as L1 on top of an older, broken KVM.
+ */
+ if (WARN_ON((kvm_cpu_cap_has(X86_FEATURE_RDTSCP) ||
+ kvm_cpu_cap_has(X86_FEATURE_RDPID)) &&
+ !kvm_is_supported_user_return_msr(MSR_TSC_AUX))) {
+ kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
+ kvm_cpu_cap_clear(X86_FEATURE_RDPID);
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_set_cpu_caps);
+
+struct kvm_cpuid_array {
+ struct kvm_cpuid_entry2 *entries;
+ int maxnent;
+ int nent;
+};
+
+static struct kvm_cpuid_entry2 *get_next_cpuid(struct kvm_cpuid_array *array)
+{
+ if (array->nent >= array->maxnent)
+ return NULL;
+
+ return &array->entries[array->nent++];
+}
+
+static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
+ u32 function, u32 index)
+{
+ struct kvm_cpuid_entry2 *entry = get_next_cpuid(array);
+
+ if (!entry)
+ return NULL;
+
+ memset(entry, 0, sizeof(*entry));
+ entry->function = function;
+ entry->index = index;
+ switch (function & 0xC0000000) {
+ case 0x40000000:
+ /* Hypervisor leaves are always synthesized by __do_cpuid_func. */
+ return entry;
+
+ case 0x80000000:
+ /*
+ * 0x80000021 is sometimes synthesized by __do_cpuid_func, which
+ * would result in out-of-bounds calls to do_host_cpuid.
+ */
+ {
+ static int max_cpuid_80000000;
+ if (!READ_ONCE(max_cpuid_80000000))
+ WRITE_ONCE(max_cpuid_80000000, cpuid_eax(0x80000000));
+ if (function > READ_ONCE(max_cpuid_80000000))
+ return entry;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ cpuid_count(entry->function, entry->index,
+ &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
+
+ if (cpuid_function_is_indexed(function))
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+
+ return entry;
+}
+
+static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func)
+{
+ struct kvm_cpuid_entry2 *entry;
+
+ if (array->nent >= array->maxnent)
+ return -E2BIG;
+
+ entry = &array->entries[array->nent];
+ entry->function = func;
+ entry->index = 0;
+ entry->flags = 0;
+
+ switch (func) {
+ case 0:
+ entry->eax = 7;
+ ++array->nent;
+ break;
+ case 1:
+ entry->ecx = F(MOVBE);
+ ++array->nent;
+ break;
+ case 7:
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ entry->eax = 0;
+ if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
+ entry->ecx = F(RDPID);
+ ++array->nent;
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
+{
+ struct kvm_cpuid_entry2 *entry;
+ int r, i, max_idx;
+
+ /* all calls to cpuid_count() should be made on the same cpu */
+ get_cpu();
+
+ r = -E2BIG;
+
+ entry = do_host_cpuid(array, function, 0);
+ if (!entry)
+ goto out;
+
+ switch (function) {
+ case 0:
+ /* Limited to the highest leaf implemented in KVM. */
+ entry->eax = min(entry->eax, 0x1fU);
+ break;
+ case 1:
+ cpuid_entry_override(entry, CPUID_1_EDX);
+ cpuid_entry_override(entry, CPUID_1_ECX);
+ break;
+ case 2:
+ /*
+ * On ancient CPUs, function 2 entries are STATEFUL. That is,
+ * CPUID(function=2, index=0) may return different results each
+ * time, with the least-significant byte in EAX enumerating the
+ * number of times software should do CPUID(2, 0).
+ *
+ * Modern CPUs, i.e. every CPU KVM has *ever* run on are less
+ * idiotic. Intel's SDM states that EAX & 0xff "will always
+ * return 01H. Software should ignore this value and not
+ * interpret it as an informational descriptor", while AMD's
+ * APM states that CPUID(2) is reserved.
+ *
+ * WARN if a frankenstein CPU that supports virtualization and
+ * a stateful CPUID.0x2 is encountered.
+ */
+ WARN_ON_ONCE((entry->eax & 0xff) > 1);
+ break;
+ /* functions 4 and 0x8000001d have additional index. */
+ case 4:
+ case 0x8000001d:
+ /*
+ * Read entries until the cache type in the previous entry is
+ * zero, i.e. indicates an invalid entry.
+ */
+ for (i = 1; entry->eax & 0x1f; ++i) {
+ entry = do_host_cpuid(array, function, i);
+ if (!entry)
+ goto out;
+ }
+ break;
+ case 6: /* Thermal management */
+ entry->eax = 0x4; /* allow ARAT */
+ entry->ebx = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ break;
+ /* function 7 has additional index. */
+ case 7:
+ entry->eax = min(entry->eax, 1u);
+ cpuid_entry_override(entry, CPUID_7_0_EBX);
+ cpuid_entry_override(entry, CPUID_7_ECX);
+ cpuid_entry_override(entry, CPUID_7_EDX);
+
+ /* KVM only supports 0x7.0 and 0x7.1, capped above via min(). */
+ if (entry->eax == 1) {
+ entry = do_host_cpuid(array, function, 1);
+ if (!entry)
+ goto out;
+
+ cpuid_entry_override(entry, CPUID_7_1_EAX);
+ cpuid_entry_override(entry, CPUID_7_1_EDX);
+ entry->ebx = 0;
+ entry->ecx = 0;
+ }
+ break;
+ case 0xa: { /* Architectural Performance Monitoring */
+ union cpuid10_eax eax;
+ union cpuid10_edx edx;
+
+ if (!enable_pmu || !static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ eax.split.version_id = kvm_pmu_cap.version;
+ eax.split.num_counters = kvm_pmu_cap.num_counters_gp;
+ eax.split.bit_width = kvm_pmu_cap.bit_width_gp;
+ eax.split.mask_length = kvm_pmu_cap.events_mask_len;
+ edx.split.num_counters_fixed = kvm_pmu_cap.num_counters_fixed;
+ edx.split.bit_width_fixed = kvm_pmu_cap.bit_width_fixed;
+
+ if (kvm_pmu_cap.version)
+ edx.split.anythread_deprecated = 1;
+ edx.split.reserved1 = 0;
+ edx.split.reserved2 = 0;
+
+ entry->eax = eax.full;
+ entry->ebx = kvm_pmu_cap.events_mask;
+ entry->ecx = 0;
+ entry->edx = edx.full;
+ break;
+ }
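	/*
	 * For reference, the eax.split/edx.split stores above fill in the
	 * architectural layout of CPUID.0xA, declared in
	 * arch/x86/include/asm/perf_event.h as:
	 *
	 *	union cpuid10_eax {
	 *		struct {
	 *			unsigned int version_id:8;	EAX[7:0]   PMU version
	 *			unsigned int num_counters:8;	EAX[15:8]  GP counters
	 *			unsigned int bit_width:8;	EAX[23:16] counter width
	 *			unsigned int mask_length:8;	EAX[31:24] event mask length
	 *		} split;
	 *		unsigned int full;
	 *	};
	 *
	 * (union cpuid10_edx is analogous for the fixed counters.)
	 */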
+ case 0x1f:
+ case 0xb:
+ /*
+ * No topology; a valid topology is indicated by the presence
+ * of subleaf 1.
+ */
+ entry->eax = entry->ebx = entry->ecx = 0;
+ break;
+ case 0xd: {
+ u64 permitted_xcr0 = kvm_get_filtered_xcr0();
+ u64 permitted_xss = kvm_caps.supported_xss;
+
+ entry->eax &= permitted_xcr0;
+ entry->ebx = xstate_required_size(permitted_xcr0, false);
+ entry->ecx = entry->ebx;
+ entry->edx &= permitted_xcr0 >> 32;
+ if (!permitted_xcr0)
+ break;
+
+ entry = do_host_cpuid(array, function, 1);
+ if (!entry)
+ goto out;
+
+ cpuid_entry_override(entry, CPUID_D_1_EAX);
+ if (entry->eax & (F(XSAVES)|F(XSAVEC)))
+ entry->ebx = xstate_required_size(permitted_xcr0 | permitted_xss,
+ true);
+ else {
+ WARN_ON_ONCE(permitted_xss != 0);
+ entry->ebx = 0;
+ }
+ entry->ecx &= permitted_xss;
+ entry->edx &= permitted_xss >> 32;
+
+ for (i = 2; i < 64; ++i) {
+ bool s_state;
+ if (permitted_xcr0 & BIT_ULL(i))
+ s_state = false;
+ else if (permitted_xss & BIT_ULL(i))
+ s_state = true;
+ else
+ continue;
+
+ entry = do_host_cpuid(array, function, i);
+ if (!entry)
+ goto out;
+
+ /*
+ * The supported check above should have filtered out
+ * invalid sub-leafs. Only valid sub-leafs should
+ * reach this point, and they should have a non-zero
+ * save state size. Furthermore, check whether the
+ * processor agrees with permitted_xcr0/permitted_xss
+ * on whether this is an XCR0- or IA32_XSS-managed area.
+ */
+ if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) {
+ --array->nent;
+ continue;
+ }
+
+ if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
+ entry->ecx &= ~BIT_ULL(2);
+ entry->edx = 0;
+ }
+ break;
+ }
+ case 0x12:
+ /* Intel SGX */
+ if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ /*
+ * Index 0: Sub-features, MISCSELECT (a.k.a extended features)
+ * and max enclave sizes. The SGX sub-features and MISCSELECT
+ * are restricted by kernel and KVM capabilities (like most
+ * feature flags), while enclave size is unrestricted.
+ */
+ cpuid_entry_override(entry, CPUID_12_EAX);
+ entry->ebx &= SGX_MISC_EXINFO;
+
+ entry = do_host_cpuid(array, function, 1);
+ if (!entry)
+ goto out;
+
+ /*
+ * Index 1: SECS.ATTRIBUTES. ATTRIBUTES are restricted a la
+ * feature flags. Advertise all supported flags, including
+ * privileged attributes that require explicit opt-in from
+ * userspace. ATTRIBUTES.XFRM is not adjusted as userspace is
+ * expected to derive it from supported XCR0.
+ */
+ entry->eax &= SGX_ATTR_PRIV_MASK | SGX_ATTR_UNPRIV_MASK;
+ entry->ebx &= 0;
+ break;
+ /* Intel PT */
+ case 0x14:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) {
+ if (!do_host_cpuid(array, function, i))
+ goto out;
+ }
+ break;
+ /* Intel AMX TILE */
+ case 0x1d:
+ if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) {
+ if (!do_host_cpuid(array, function, i))
+ goto out;
+ }
+ break;
+ case 0x1e: /* TMUL information */
+ if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+ break;
+ case KVM_CPUID_SIGNATURE: {
+ const u32 *sigptr = (const u32 *)KVM_SIGNATURE;
+ entry->eax = KVM_CPUID_FEATURES;
+ entry->ebx = sigptr[0];
+ entry->ecx = sigptr[1];
+ entry->edx = sigptr[2];
+ break;
+ }
+ case KVM_CPUID_FEATURES:
+ entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
+ (1 << KVM_FEATURE_NOP_IO_DELAY) |
+ (1 << KVM_FEATURE_CLOCKSOURCE2) |
+ (1 << KVM_FEATURE_ASYNC_PF) |
+ (1 << KVM_FEATURE_PV_EOI) |
+ (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
+ (1 << KVM_FEATURE_PV_UNHALT) |
+ (1 << KVM_FEATURE_PV_TLB_FLUSH) |
+ (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) |
+ (1 << KVM_FEATURE_PV_SEND_IPI) |
+ (1 << KVM_FEATURE_POLL_CONTROL) |
+ (1 << KVM_FEATURE_PV_SCHED_YIELD) |
+ (1 << KVM_FEATURE_ASYNC_PF_INT);
+
+ if (sched_info_on())
+ entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
+
+ entry->ebx = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ break;
+ case 0x80000000:
+ entry->eax = min(entry->eax, 0x80000022);
+ /*
+ * Serializing LFENCE is reported in a multitude of ways, and
+ * NullSegClearsBase is not reported in CPUID on Zen2; help
+ * userspace by providing the CPUID leaf ourselves.
+ *
+ * However, only do it if the host has CPUID leaf 0x8000001d.
+ * QEMU thinks that it can query the host blindly for that
+ * CPUID leaf if KVM reports that it supports 0x8000001d or
+ * above. The processor merrily returns values from the
+ * highest Intel leaf which QEMU tries to use as the guest's
+ * 0x8000001d. Even worse, this can result in an infinite
+ * loop if said highest leaf has no subleaves indexed by ECX.
+ */
+ if (entry->eax >= 0x8000001d &&
+ (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)
+ || !static_cpu_has_bug(X86_BUG_NULL_SEG)))
+ entry->eax = max(entry->eax, 0x80000021);
+ break;
+ case 0x80000001:
+ entry->ebx &= ~GENMASK(27, 16);
+ cpuid_entry_override(entry, CPUID_8000_0001_EDX);
+ cpuid_entry_override(entry, CPUID_8000_0001_ECX);
+ break;
+ case 0x80000005:
+ /* Pass host L1 cache and TLB info. */
+ break;
+ case 0x80000006:
+ /* Drop reserved bits, pass host L2 cache and TLB info. */
+ entry->edx &= ~GENMASK(17, 16);
+ break;
+ case 0x80000007: /* Advanced power management */
+ cpuid_entry_override(entry, CPUID_8000_0007_EDX);
+
+ /* mask against host */
+ entry->edx &= boot_cpu_data.x86_power;
+ entry->eax = entry->ebx = entry->ecx = 0;
+ break;
+ case 0x80000008: {
+ unsigned g_phys_as = (entry->eax >> 16) & 0xff;
+ unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
+ unsigned phys_as = entry->eax & 0xff;
+
+ /*
+ * If TDP (NPT) is disabled use the adjusted host MAXPHYADDR as
+ * the guest operates in the same PA space as the host, i.e.
+ * reductions in MAXPHYADDR for memory encryption affect shadow
+ * paging, too.
+ *
+ * If TDP is enabled but an explicit guest MAXPHYADDR is not
+ * provided, use the raw bare metal MAXPHYADDR as reductions to
+ * the HPAs do not affect GPAs.
+ */
+ if (!tdp_enabled)
+ g_phys_as = boot_cpu_data.x86_phys_bits;
+ else if (!g_phys_as)
+ g_phys_as = phys_as;
+
+ entry->eax = g_phys_as | (virt_as << 8);
+ entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8));
+ entry->edx = 0;
+ cpuid_entry_override(entry, CPUID_8000_0008_EBX);
+ break;
+ }
+ case 0x8000000A:
+ if (!kvm_cpu_cap_has(X86_FEATURE_SVM)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+ entry->eax = 1; /* SVM revision 1 */
+		entry->ebx = 8; /* Let's support 8 ASIDs in case we add proper
+ ASID emulation to nested SVM */
+ entry->ecx = 0; /* Reserved */
+ cpuid_entry_override(entry, CPUID_8000_000A_EDX);
+ break;
+ case 0x80000019:
+ entry->ecx = entry->edx = 0;
+ break;
+ case 0x8000001a:
+ entry->eax &= GENMASK(2, 0);
+ entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ case 0x8000001e:
+ /* Do not return host topology information. */
+ entry->eax = entry->ebx = entry->ecx = 0;
+ entry->edx = 0; /* reserved */
+ break;
+ case 0x8000001F:
+ if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ } else {
+ cpuid_entry_override(entry, CPUID_8000_001F_EAX);
+ /* Clear NumVMPL since KVM does not support VMPL. */
+ entry->ebx &= ~GENMASK(31, 12);
+ /*
+ * Enumerate '0' for "PA bits reduction", the adjusted
+ * MAXPHYADDR is enumerated directly (see 0x80000008).
+ */
+ entry->ebx &= ~GENMASK(11, 6);
+ }
+ break;
+ case 0x80000020:
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ case 0x80000021:
+ entry->ebx = entry->ecx = entry->edx = 0;
+ cpuid_entry_override(entry, CPUID_8000_0021_EAX);
+ break;
+ /* AMD Extended Performance Monitoring and Debug */
+ case 0x80000022: {
+ union cpuid_0x80000022_ebx ebx;
+
+ entry->ecx = entry->edx = 0;
+ if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) {
+			entry->eax = entry->ebx = 0;
+ break;
+ }
+
+ cpuid_entry_override(entry, CPUID_8000_0022_EAX);
+
+ if (kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2))
+ ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp;
+ else if (kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE))
+ ebx.split.num_core_pmc = AMD64_NUM_COUNTERS_CORE;
+ else
+ ebx.split.num_core_pmc = AMD64_NUM_COUNTERS;
+
+ entry->ebx = ebx.full;
+ break;
+ }
+	/* Add support for Centaur's CPUID instruction. */
+	case 0xC0000000:
+		/* Just support up to 0xC0000004 now. */
+		entry->eax = min(entry->eax, 0xC0000004);
+ break;
+ case 0xC0000001:
+ cpuid_entry_override(entry, CPUID_C000_0001_EDX);
+ break;
+ case 3: /* Processor serial number */
+ case 5: /* MONITOR/MWAIT */
+ case 0xC0000002:
+ case 0xC0000003:
+ case 0xC0000004:
+ default:
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ r = 0;
+
+out:
+ put_cpu();
+
+ return r;
+}
+
+static int do_cpuid_func(struct kvm_cpuid_array *array, u32 func,
+ unsigned int type)
+{
+ if (type == KVM_GET_EMULATED_CPUID)
+ return __do_cpuid_func_emulated(array, func);
+
+ return __do_cpuid_func(array, func);
+}
+
+#define CENTAUR_CPUID_SIGNATURE 0xC0000000
+
+static int get_cpuid_func(struct kvm_cpuid_array *array, u32 func,
+ unsigned int type)
+{
+ u32 limit;
+ int r;
+
+ if (func == CENTAUR_CPUID_SIGNATURE &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR)
+ return 0;
+
+ r = do_cpuid_func(array, func, type);
+ if (r)
+ return r;
+
+ limit = array->entries[array->nent - 1].eax;
+ for (func = func + 1; func <= limit; ++func) {
+ r = do_cpuid_func(array, func, type);
+ if (r)
+ break;
+ }
+
+ return r;
+}
+
+static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries,
+ __u32 num_entries, unsigned int ioctl_type)
+{
+ int i;
+ __u32 pad[3];
+
+ if (ioctl_type != KVM_GET_EMULATED_CPUID)
+ return false;
+
+ /*
+ * We want to make sure that ->padding is being passed clean from
+ * userspace in case we want to use it for something in the future.
+ *
+ * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID and so we
+ * have to give ourselves satisfied only with the emulated side. /me
+ * sheds a tear.
+ */
+ for (i = 0; i < num_entries; i++) {
+ if (copy_from_user(pad, entries[i].padding, sizeof(pad)))
+ return true;
+
+ if (pad[0] || pad[1] || pad[2])
+ return true;
+ }
+ return false;
+}
+
+int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries,
+ unsigned int type)
+{
+ static const u32 funcs[] = {
+ 0, 0x80000000, CENTAUR_CPUID_SIGNATURE, KVM_CPUID_SIGNATURE,
+ };
+
+ struct kvm_cpuid_array array = {
+ .nent = 0,
+ };
+ int r, i;
+
+ if (cpuid->nent < 1)
+ return -E2BIG;
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ cpuid->nent = KVM_MAX_CPUID_ENTRIES;
+
+ if (sanity_check_entries(entries, cpuid->nent, type))
+ return -EINVAL;
+
+ array.entries = kvcalloc(cpuid->nent, sizeof(struct kvm_cpuid_entry2), GFP_KERNEL);
+ if (!array.entries)
+ return -ENOMEM;
+
+ array.maxnent = cpuid->nent;
+
+ for (i = 0; i < ARRAY_SIZE(funcs); i++) {
+ r = get_cpuid_func(&array, funcs[i], type);
+ if (r)
+ goto out_free;
+ }
+ cpuid->nent = array.nent;
+
+ if (copy_to_user(entries, array.entries,
+ array.nent * sizeof(struct kvm_cpuid_entry2)))
+ r = -EFAULT;
+
+out_free:
+ kvfree(array.entries);
+ return r;
+}
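
Callers see -E2BIG both when nent is zero and when the array fills up during
enumeration, so the canonical user-space pattern (QEMU uses the same loop) is
to grow the buffer and retry. A sketch (illustrative only):

	#include <errno.h>
	#include <linux/kvm.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>

	static struct kvm_cpuid2 *get_supported_cpuid(int kvm_fd)
	{
		struct kvm_cpuid2 *cpuid;
		int nent = 64;

		for (;;) {
			cpuid = calloc(1, sizeof(*cpuid) +
					  nent * sizeof(struct kvm_cpuid_entry2));
			cpuid->nent = nent;
			if (ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid) == 0)
				return cpuid;	/* nent now holds the real count */
			free(cpuid);
			if (errno != E2BIG)
				return NULL;	/* unrelated failure */
			nent *= 2;		/* too small; retry larger */
		}
	}
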
+
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
+ u32 function, u32 index)
+{
+ return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
+ function, index);
+}
+EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry_index);
+
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+ u32 function)
+{
+ return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
+ function, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
+}
+EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
+
+/*
+ * Intel CPUID semantics treats any query for an out-of-range leaf as if the
+ * highest basic leaf (i.e. CPUID.0H:EAX) were requested. AMD CPUID semantics
+ * returns all zeroes for any undefined leaf, whether or not the leaf is in
+ * range. Centaur/VIA follows Intel semantics.
+ *
+ * A leaf is considered out-of-range if its function is higher than the maximum
+ * supported leaf of its associated class or if its associated class does not
+ * exist.
+ *
+ * There are four primary classes to be considered, with their respective
+ * ranges described as "<base> - <top>[,<base2> - <top2>]" inclusive. A primary
+ * class exists if a guest CPUID entry for its <base> leaf exists. For a given
+ * class, CPUID.<base>.EAX contains the max supported leaf for the class.
+ *
+ * - Basic: 0x00000000 - 0x3fffffff, 0x50000000 - 0x7fffffff
+ * - Hypervisor: 0x40000000 - 0x4fffffff
+ * - Extended: 0x80000000 - 0xbfffffff
+ * - Centaur: 0xc0000000 - 0xcfffffff
+ *
+ * The Hypervisor class is further subdivided into sub-classes that each act as
+ * their own independent class associated with a 0x100 byte range. E.g. if Qemu
+ * is advertising support for both HyperV and KVM, the resulting Hypervisor
+ * CPUID sub-classes are:
+ *
+ * - HyperV: 0x40000000 - 0x400000ff
+ * - KVM: 0x40000100 - 0x400001ff
+ */
+static struct kvm_cpuid_entry2 *
+get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index)
+{
+ struct kvm_cpuid_entry2 *basic, *class;
+ u32 function = *fn_ptr;
+
+ basic = kvm_find_cpuid_entry(vcpu, 0);
+ if (!basic)
+ return NULL;
+
+ if (is_guest_vendor_amd(basic->ebx, basic->ecx, basic->edx) ||
+ is_guest_vendor_hygon(basic->ebx, basic->ecx, basic->edx))
+ return NULL;
+
+ if (function >= 0x40000000 && function <= 0x4fffffff)
+ class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00);
+ else if (function >= 0xc0000000)
+ class = kvm_find_cpuid_entry(vcpu, 0xc0000000);
+ else
+ class = kvm_find_cpuid_entry(vcpu, function & 0x80000000);
+
+ if (class && function <= class->eax)
+ return NULL;
+
+ /*
+ * Leaf specific adjustments are also applied when redirecting to the
+ * max basic entry, e.g. if the max basic leaf is 0xb but there is no
+ * entry for CPUID.0xb.index (see below), then the output value for EDX
+ * needs to be pulled from CPUID.0xb.1.
+ */
+ *fn_ptr = basic->eax;
+
+ /*
+ * The class does not exist or the requested function is out of range;
+ * the effective CPUID entry is the max basic leaf. Note, the index of
+ * the original requested leaf is observed!
+ */
+ return kvm_find_cpuid_entry_index(vcpu, basic->eax, index);
+}
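
The redirect is observable from inside a guest (or on bare-metal Intel
hardware): a query above CPUID.0:EAX returns the max basic leaf's output. A
guest-side sketch, assuming an Intel-flavored CPUID model (illustrative only):

	#include <cpuid.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned int max_basic, a, b, c, d, a2, b2, c2, d2;

		max_basic = __get_cpuid_max(0, 0);		/* CPUID.0:EAX */
		__cpuid_count(max_basic, 0, a, b, c, d);
		__cpuid_count(0x12345678, 0, a2, b2, c2, d2);	/* out of range */

		printf("%s\n", (a == a2 && b == b2 && c == c2 && d == d2) ?
		       "redirected to max basic leaf (Intel semantics)" :
		       "zeroed/distinct (AMD semantics)");
		return 0;
	}
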
+
+bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx, bool exact_only)
+{
+ u32 orig_function = *eax, function = *eax, index = *ecx;
+ struct kvm_cpuid_entry2 *entry;
+ bool exact, used_max_basic = false;
+
+ entry = kvm_find_cpuid_entry_index(vcpu, function, index);
+ exact = !!entry;
+
+ if (!entry && !exact_only) {
+ entry = get_out_of_range_cpuid_entry(vcpu, &function, index);
+ used_max_basic = !!entry;
+ }
+
+ if (entry) {
+ *eax = entry->eax;
+ *ebx = entry->ebx;
+ *ecx = entry->ecx;
+ *edx = entry->edx;
+ if (function == 7 && index == 0) {
+ u64 data;
+ if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) &&
+ (data & TSX_CTRL_CPUID_CLEAR))
+ *ebx &= ~(F(RTM) | F(HLE));
+ } else if (function == 0x80000007) {
+ if (kvm_hv_invtsc_suppressed(vcpu))
+ *edx &= ~SF(CONSTANT_TSC);
+ }
+ } else {
+ *eax = *ebx = *ecx = *edx = 0;
+ /*
+ * When leaf 0BH or 1FH is defined, CL is pass-through
+ * and EDX is always the x2APIC ID, even for undefined
+ * subleaves. Index 1 will exist iff the leaf is
+ * implemented, so we pass through CL iff leaf 1
+ * exists. EDX can be copied from any existing index.
+ */
+ if (function == 0xb || function == 0x1f) {
+ entry = kvm_find_cpuid_entry_index(vcpu, function, 1);
+ if (entry) {
+ *ecx = index & 0xff;
+ *edx = entry->edx;
+ }
+ }
+ }
+ trace_kvm_cpuid(orig_function, index, *eax, *ebx, *ecx, *edx, exact,
+ used_max_basic);
+ return exact;
+}
+EXPORT_SYMBOL_GPL(kvm_cpuid);
+
+int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+{
+ u32 eax, ebx, ecx, edx;
+
+ if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0))
+ return 1;
+
+ eax = kvm_rax_read(vcpu);
+ ecx = kvm_rcx_read(vcpu);
+ kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
+ kvm_rax_write(vcpu, eax);
+ kvm_rbx_write(vcpu, ebx);
+ kvm_rcx_write(vcpu, ecx);
+ kvm_rdx_write(vcpu, edx);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
new file mode 100644
index 000000000..284fa4704
--- /dev/null
+++ b/arch/x86/kvm/cpuid.h
@@ -0,0 +1,281 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_KVM_CPUID_H
+#define ARCH_X86_KVM_CPUID_H
+
+#include "x86.h"
+#include "reverse_cpuid.h"
+#include <asm/cpu.h>
+#include <asm/processor.h>
+#include <uapi/asm/kvm_para.h>
+
+extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
+void kvm_set_cpu_caps(void);
+
+void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
+void kvm_update_pv_runtime(struct kvm_vcpu *vcpu);
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
+ u32 function, u32 index);
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+ u32 function);
+int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries,
+ unsigned int type);
+int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid *cpuid,
+ struct kvm_cpuid_entry __user *entries);
+int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries);
+int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries);
+bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx, bool exact_only);
+
+u32 xstate_required_size(u64 xstate_bv, bool compacted);
+
+int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
+u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu);
+
+static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.maxphyaddr;
+}
+
+static inline bool kvm_vcpu_is_legal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ return !(gpa & vcpu->arch.reserved_gpa_bits);
+}
+
+static inline bool kvm_vcpu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ return !kvm_vcpu_is_legal_gpa(vcpu, gpa);
+}
+
+static inline bool kvm_vcpu_is_legal_aligned_gpa(struct kvm_vcpu *vcpu,
+ gpa_t gpa, gpa_t alignment)
+{
+ return IS_ALIGNED(gpa, alignment) && kvm_vcpu_is_legal_gpa(vcpu, gpa);
+}
+
+static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ return kvm_vcpu_is_legal_aligned_gpa(vcpu, gpa, PAGE_SIZE);
+}
+
+static __always_inline void cpuid_entry_override(struct kvm_cpuid_entry2 *entry,
+ unsigned int leaf)
+{
+ u32 *reg = cpuid_entry_get_reg(entry, leaf * 32);
+
+ BUILD_BUG_ON(leaf >= ARRAY_SIZE(kvm_cpu_caps));
+ *reg = kvm_cpu_caps[leaf];
+}
+
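+/*
+ * Feature lookups go through the reverse-CPUID table, which BUILD_BUG_ON()s
+ * unless @x86_feature is a compile-time constant; hence __always_inline.
+ */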
+static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
+{
+ const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature);
+ struct kvm_cpuid_entry2 *entry;
+
+ entry = kvm_find_cpuid_entry_index(vcpu, cpuid.function, cpuid.index);
+ if (!entry)
+ return NULL;
+
+ return __cpuid_entry_get_reg(entry, cpuid.reg);
+}
+
+static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
+{
+ u32 *reg;
+
+ reg = guest_cpuid_get_register(vcpu, x86_feature);
+ if (!reg)
+ return false;
+
+ return *reg & __feature_bit(x86_feature);
+}
+
+static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
+{
+ u32 *reg;
+
+ reg = guest_cpuid_get_register(vcpu, x86_feature);
+ if (reg)
+ *reg &= ~__feature_bit(x86_feature);
+}
+
+static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0);
+ return best &&
+ (is_guest_vendor_amd(best->ebx, best->ecx, best->edx) ||
+ is_guest_vendor_hygon(best->ebx, best->ecx, best->edx));
+}
+
+static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0);
+ return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx);
+}
+
+static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x1);
+ if (!best)
+ return -1;
+
+ return x86_family(best->eax);
+}
+
+static inline int guest_cpuid_model(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x1);
+ if (!best)
+ return -1;
+
+ return x86_model(best->eax);
+}
+
+static inline bool cpuid_model_is_consistent(struct kvm_vcpu *vcpu)
+{
+ return boot_cpu_data.x86_model == guest_cpuid_model(vcpu);
+}
+
+static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x1);
+ if (!best)
+ return -1;
+
+ return x86_stepping(best->eax);
+}
+
+static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu)
+{
+ return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD));
+}
+
+static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu)
+{
+ return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB));
+}
+
+static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT;
+}
+
+static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.msr_misc_features_enables &
+ MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
+}
+
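+/*
+ * kvm_cpu_caps is KVM's view of supported CPUID-based features: host
+ * capabilities adjusted by kvm_set_cpu_caps(), indexed by reverse-CPUID leaf.
+ */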
+static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
+{
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
+
+ reverse_cpuid_check(x86_leaf);
+ kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature);
+}
+
+static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
+{
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
+
+ reverse_cpuid_check(x86_leaf);
+ kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature);
+}
+
+static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature)
+{
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
+
+ reverse_cpuid_check(x86_leaf);
+ return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature);
+}
+
+static __always_inline bool kvm_cpu_cap_has(unsigned int x86_feature)
+{
+ return !!kvm_cpu_cap_get(x86_feature);
+}
+
+static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature)
+{
+ if (boot_cpu_has(x86_feature))
+ kvm_cpu_cap_set(x86_feature);
+}
+
+static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu,
+ unsigned int kvm_feature)
+{
+ if (!vcpu->arch.pv_cpuid.enforce)
+ return true;
+
+ return vcpu->arch.pv_cpuid.features & (1u << kvm_feature);
+}
+
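+/*
+ * Governed features are a per-vCPU cache of "can the guest use X", typically
+ * meaning the feature is both supported by KVM and enumerated in guest CPUID
+ * (see kvm_governed_feature_check_and_set()), so hot paths can test a bitmap
+ * instead of re-querying CPUID.
+ */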
+enum kvm_governed_features {
+#define KVM_GOVERNED_FEATURE(x) KVM_GOVERNED_##x,
+#include "governed_features.h"
+ KVM_NR_GOVERNED_FEATURES
+};
+
+static __always_inline int kvm_governed_feature_index(unsigned int x86_feature)
+{
+ switch (x86_feature) {
+#define KVM_GOVERNED_FEATURE(x) case x: return KVM_GOVERNED_##x;
+#include "governed_features.h"
+ default:
+ return -1;
+ }
+}
+
+static __always_inline bool kvm_is_governed_feature(unsigned int x86_feature)
+{
+ return kvm_governed_feature_index(x86_feature) >= 0;
+}
+
+static __always_inline void kvm_governed_feature_set(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
+{
+ BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature));
+
+ __set_bit(kvm_governed_feature_index(x86_feature),
+ vcpu->arch.governed_features.enabled);
+}
+
+static __always_inline void kvm_governed_feature_check_and_set(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
+{
+ if (kvm_cpu_cap_has(x86_feature) && guest_cpuid_has(vcpu, x86_feature))
+ kvm_governed_feature_set(vcpu, x86_feature);
+}
+
+static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
+{
+ BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature));
+
+ return test_bit(kvm_governed_feature_index(x86_feature),
+ vcpu->arch.governed_features.enabled);
+}
+
+#endif
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
new file mode 100644
index 000000000..ee8c4c349
--- /dev/null
+++ b/arch/x86/kvm/debugfs.c
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * Copyright 2016 Red Hat, Inc. and/or its affiliates.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include <linux/debugfs.h>
+#include "lapic.h"
+#include "mmu.h"
+#include "mmu/mmu_internal.h"
+
+static int vcpu_get_timer_advance_ns(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->arch.apic->lapic_timer.timer_advance_ns;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_timer_advance_ns_fops, vcpu_get_timer_advance_ns, NULL, "%llu\n");
+
+static int vcpu_get_guest_mode(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->stat.guest_mode;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_guest_mode_fops, vcpu_get_guest_mode, NULL, "%lld\n");
+
+static int vcpu_get_tsc_offset(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->arch.tsc_offset;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_offset_fops, vcpu_get_tsc_offset, NULL, "%lld\n");
+
+static int vcpu_get_tsc_scaling_ratio(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->arch.tsc_scaling_ratio;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, "%llu\n");
+
+static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val)
+{
+ *val = kvm_caps.tsc_scaling_ratio_frac_bits;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n");
+
+void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry)
+{
+ debugfs_create_file("guest_mode", 0444, debugfs_dentry, vcpu,
+ &vcpu_guest_mode_fops);
+ debugfs_create_file("tsc-offset", 0444, debugfs_dentry, vcpu,
+ &vcpu_tsc_offset_fops);
+
+ if (lapic_in_kernel(vcpu))
+ debugfs_create_file("lapic_timer_advance_ns", 0444,
+ debugfs_dentry, vcpu,
+ &vcpu_timer_advance_ns_fops);
+
+ if (kvm_caps.has_tsc_control) {
+ debugfs_create_file("tsc-scaling-ratio", 0444,
+ debugfs_dentry, vcpu,
+ &vcpu_tsc_scaling_fops);
+ debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444,
+ debugfs_dentry, vcpu,
+ &vcpu_tsc_scaling_frac_fops);
+ }
+}
+
+/*
+ * This covers statistics <1024 (11 = log2(1024) + 1), which should be enough to
+ * cover RMAP_RECYCLE_THRESHOLD.
+ */
+#define RMAP_LOG_SIZE 11
+
+static const char *kvm_lpage_str[KVM_NR_PAGE_SIZES] = { "4K", "2M", "1G" };
+
+static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v)
+{
+ struct kvm_rmap_head *rmap;
+ struct kvm *kvm = m->private;
+ struct kvm_memory_slot *slot;
+ struct kvm_memslots *slots;
+ unsigned int lpage_size, index;
+ /* Still small enough to be on the stack */
+ unsigned int *log[KVM_NR_PAGE_SIZES], *cur;
+ int i, j, k, l, ret;
+
+ if (!kvm_memslots_have_rmaps(kvm))
+ return 0;
+
+ ret = -ENOMEM;
+ memset(log, 0, sizeof(log));
+ for (i = 0; i < KVM_NR_PAGE_SIZES; i++) {
+ log[i] = kcalloc(RMAP_LOG_SIZE, sizeof(unsigned int), GFP_KERNEL);
+ if (!log[i])
+ goto out;
+ }
+
+ mutex_lock(&kvm->slots_lock);
+ write_lock(&kvm->mmu_lock);
+
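+	/* Walk every memslot in every address space and histogram rmap list sizes. */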
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ int bkt;
+
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(slot, bkt, slots)
+ for (k = 0; k < KVM_NR_PAGE_SIZES; k++) {
+ rmap = slot->arch.rmap[k];
+ lpage_size = kvm_mmu_slot_lpages(slot, k + 1);
+ cur = log[k];
+ for (l = 0; l < lpage_size; l++) {
+ index = ffs(pte_list_count(&rmap[l]));
+ if (WARN_ON_ONCE(index >= RMAP_LOG_SIZE))
+ index = RMAP_LOG_SIZE - 1;
+ cur[index]++;
+ }
+ }
+ }
+
+ write_unlock(&kvm->mmu_lock);
+ mutex_unlock(&kvm->slots_lock);
+
+ /* index=0 counts no rmap; index=1 counts 1 rmap */
+ seq_printf(m, "Rmap_Count:\t0\t1\t");
+ for (i = 2; i < RMAP_LOG_SIZE; i++) {
+ j = 1 << (i - 1);
+ k = (1 << i) - 1;
+ seq_printf(m, "%d-%d\t", j, k);
+ }
+ seq_printf(m, "\n");
+
+ for (i = 0; i < KVM_NR_PAGE_SIZES; i++) {
+ seq_printf(m, "Level=%s:\t", kvm_lpage_str[i]);
+ cur = log[i];
+ for (j = 0; j < RMAP_LOG_SIZE; j++)
+ seq_printf(m, "%d\t", cur[j]);
+ seq_printf(m, "\n");
+ }
+
+ ret = 0;
+out:
+ for (i = 0; i < KVM_NR_PAGE_SIZES; i++)
+ kfree(log[i]);
+
+ return ret;
+}
+
+static int kvm_mmu_rmaps_stat_open(struct inode *inode, struct file *file)
+{
+ struct kvm *kvm = inode->i_private;
+ int r;
+
+ if (!kvm_get_kvm_safe(kvm))
+ return -ENOENT;
+
+ r = single_open(file, kvm_mmu_rmaps_stat_show, kvm);
+ if (r < 0)
+ kvm_put_kvm(kvm);
+
+ return r;
+}
+
+static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file)
+{
+ struct kvm *kvm = inode->i_private;
+
+ kvm_put_kvm(kvm);
+
+ return single_release(inode, file);
+}
+
+static const struct file_operations mmu_rmaps_stat_fops = {
+ .open = kvm_mmu_rmaps_stat_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = kvm_mmu_rmaps_stat_release,
+};
+
+int kvm_arch_create_vm_debugfs(struct kvm *kvm)
+{
+ debugfs_create_file("mmu_rmaps_stat", 0644, kvm->debugfs_dentry, kvm,
+ &mmu_rmaps_stat_fops);
+ return 0;
+}
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
new file mode 100644
index 000000000..2673cd5c4
--- /dev/null
+++ b/arch/x86/kvm/emulate.c
@@ -0,0 +1,5508 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/******************************************************************************
+ * emulate.c
+ *
+ * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
+ *
+ * Copyright (c) 2005 Keir Fraser
+ *
+ * Linux coding style, mod r/m decoder, segment base fixes, real-mode
+ * privileged instructions:
+ *
+ * Copyright (C) 2006 Qumranet
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ *
+ * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
+#include <linux/stringify.h>
+#include <asm/debugreg.h>
+#include <asm/nospec-branch.h>
+#include <asm/ibt.h>
+
+#include "x86.h"
+#include "tss.h"
+#include "mmu.h"
+#include "pmu.h"
+
+/*
+ * Operand types
+ */
+#define OpNone 0ull
+#define OpImplicit 1ull /* No generic decode */
+#define OpReg 2ull /* Register */
+#define OpMem 3ull /* Memory */
+#define OpAcc 4ull /* Accumulator: AL/AX/EAX/RAX */
+#define OpDI 5ull /* ES:DI/EDI/RDI */
+#define OpMem64 6ull /* Memory, 64-bit */
+#define OpImmUByte 7ull /* Zero-extended 8-bit immediate */
+#define OpDX 8ull /* DX register */
+#define OpCL 9ull /* CL register (for shifts) */
+#define OpImmByte 10ull /* 8-bit sign extended immediate */
+#define OpOne 11ull /* Implied 1 */
+#define OpImm 12ull /* Sign extended up to 32-bit immediate */
+#define OpMem16 13ull /* Memory operand (16-bit). */
+#define OpMem32 14ull /* Memory operand (32-bit). */
+#define OpImmU 15ull /* Immediate operand, zero extended */
+#define OpSI 16ull /* SI/ESI/RSI */
+#define OpImmFAddr 17ull /* Immediate far address */
+#define OpMemFAddr 18ull /* Far address in memory */
+#define OpImmU16 19ull /* Immediate operand, 16 bits, zero extended */
+#define OpES 20ull /* ES */
+#define OpCS 21ull /* CS */
+#define OpSS 22ull /* SS */
+#define OpDS 23ull /* DS */
+#define OpFS 24ull /* FS */
+#define OpGS 25ull /* GS */
+#define OpMem8 26ull /* 8-bit zero extended memory operand */
+#define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */
+#define OpXLat 28ull /* memory at BX/EBX/RBX + zero-extended AL */
+#define OpAccLo 29ull /* Low part of extended acc (AX/AX/EAX/RAX) */
+#define OpAccHi 30ull /* High part of extended acc (-/DX/EDX/RDX) */
+
+#define OpBits 5 /* Width of operand field */
+#define OpMask ((1ull << OpBits) - 1)
+
+/*
+ * Opcode effective-address decode tables.
+ * Note that we only emulate instructions that have at least one memory
+ * operand (excluding implicit stack references). We assume that stack
+ * references and instruction fetches will never occur in special memory
+ * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
+ * not be handled.
+ */
+
+/* Operand sizes: 8-bit operands or specified/overridden size. */
+#define ByteOp (1<<0) /* 8-bit operands. */
+/* Destination operand type. */
+#define DstShift 1
+#define ImplicitOps (OpImplicit << DstShift)
+#define DstReg (OpReg << DstShift)
+#define DstMem (OpMem << DstShift)
+#define DstAcc (OpAcc << DstShift)
+#define DstDI (OpDI << DstShift)
+#define DstMem64 (OpMem64 << DstShift)
+#define DstMem16 (OpMem16 << DstShift)
+#define DstImmUByte (OpImmUByte << DstShift)
+#define DstDX (OpDX << DstShift)
+#define DstAccLo (OpAccLo << DstShift)
+#define DstMask (OpMask << DstShift)
+/* Source operand type. */
+#define SrcShift 6
+#define SrcNone (OpNone << SrcShift)
+#define SrcReg (OpReg << SrcShift)
+#define SrcMem (OpMem << SrcShift)
+#define SrcMem16 (OpMem16 << SrcShift)
+#define SrcMem32 (OpMem32 << SrcShift)
+#define SrcImm (OpImm << SrcShift)
+#define SrcImmByte (OpImmByte << SrcShift)
+#define SrcOne (OpOne << SrcShift)
+#define SrcImmUByte (OpImmUByte << SrcShift)
+#define SrcImmU (OpImmU << SrcShift)
+#define SrcSI (OpSI << SrcShift)
+#define SrcXLat (OpXLat << SrcShift)
+#define SrcImmFAddr (OpImmFAddr << SrcShift)
+#define SrcMemFAddr (OpMemFAddr << SrcShift)
+#define SrcAcc (OpAcc << SrcShift)
+#define SrcImmU16 (OpImmU16 << SrcShift)
+#define SrcImm64 (OpImm64 << SrcShift)
+#define SrcDX (OpDX << SrcShift)
+#define SrcMem8 (OpMem8 << SrcShift)
+#define SrcAccHi (OpAccHi << SrcShift)
+#define SrcMask (OpMask << SrcShift)
+#define BitOp (1<<11)
+#define MemAbs (1<<12) /* Memory operand is absolute displacement */
+#define String (1<<13) /* String instruction (rep capable) */
+#define Stack (1<<14) /* Stack instruction (push/pop) */
+#define GroupMask (7<<15) /* Opcode uses one of the group mechanisms */
+#define Group (1<<15) /* Bits 3:5 of modrm byte extend opcode */
+#define GroupDual (2<<15) /* Alternate decoding of mod == 3 */
+#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */
+#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
+#define Escape (5<<15) /* Escape to coprocessor instruction */
+#define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */
+#define ModeDual (7<<15) /* Different instruction for 32/64 bit */
+#define Sse (1<<18) /* SSE Vector instruction */
+/* Generic ModRM decode. */
+#define ModRM (1<<19)
+/* Destination is only written; never read. */
+#define Mov (1<<20)
+/* Misc flags */
+#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */
+#define EmulateOnUD (1<<22) /* Emulate if unsupported by the host */
+#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
+#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
+#define Undefined (1<<25) /* No Such Instruction */
+#define Lock (1<<26) /* lock prefix is allowed for the instruction */
+#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
+#define No64 (1<<28)
+#define PageTable (1 << 29) /* instruction used to write page table */
+#define NotImpl (1 << 30) /* instruction is not implemented */
+/* Source 2 operand type */
+#define Src2Shift (31)
+#define Src2None (OpNone << Src2Shift)
+#define Src2Mem (OpMem << Src2Shift)
+#define Src2CL (OpCL << Src2Shift)
+#define Src2ImmByte (OpImmByte << Src2Shift)
+#define Src2One (OpOne << Src2Shift)
+#define Src2Imm (OpImm << Src2Shift)
+#define Src2ES (OpES << Src2Shift)
+#define Src2CS (OpCS << Src2Shift)
+#define Src2SS (OpSS << Src2Shift)
+#define Src2DS (OpDS << Src2Shift)
+#define Src2FS (OpFS << Src2Shift)
+#define Src2GS (OpGS << Src2Shift)
+#define Src2Mask (OpMask << Src2Shift)
+#define Mmx ((u64)1 << 40) /* MMX Vector instruction */
+#define AlignMask ((u64)7 << 41)
+#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */
+#define Unaligned ((u64)2 << 41) /* Explicitly unaligned (e.g. MOVDQU) */
+#define Avx ((u64)3 << 41) /* Advanced Vector Extensions */
+#define Aligned16 ((u64)4 << 41) /* Aligned to 16 byte boundary (e.g. FXSAVE) */
+#define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */
+#define NoWrite ((u64)1 << 45) /* No writeback */
+#define SrcWrite ((u64)1 << 46) /* Write back src operand */
+#define NoMod ((u64)1 << 47) /* Mod field is ignored */
+#define Intercept ((u64)1 << 48) /* Has valid intercept field */
+#define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */
+#define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */
+#define NearBranch ((u64)1 << 52) /* Near branches */
+#define No16 ((u64)1 << 53) /* No 16 bit operand */
+#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */
+#define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operands */
+#define IsBranch ((u64)1 << 56) /* Instruction is considered a branch. */
+
+#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
+
+#define X2(x...) x, x
+#define X3(x...) X2(x), x
+#define X4(x...) X2(x), X2(x)
+#define X5(x...) X4(x), x
+#define X6(x...) X4(x), X2(x)
+#define X7(x...) X4(x), X3(x)
+#define X8(x...) X4(x), X4(x)
+#define X16(x...) X8(x), X8(x)
+
+struct opcode {
+ u64 flags;
+ u8 intercept;
+ u8 pad[7];
+ union {
+ int (*execute)(struct x86_emulate_ctxt *ctxt);
+ const struct opcode *group;
+ const struct group_dual *gdual;
+ const struct gprefix *gprefix;
+ const struct escape *esc;
+ const struct instr_dual *idual;
+ const struct mode_dual *mdual;
+ void (*fastop)(struct fastop *fake);
+ } u;
+ int (*check_perm)(struct x86_emulate_ctxt *ctxt);
+};
+
+struct group_dual {
+ struct opcode mod012[8];
+ struct opcode mod3[8];
+};
+
+struct gprefix {
+ struct opcode pfx_no;
+ struct opcode pfx_66;
+ struct opcode pfx_f2;
+ struct opcode pfx_f3;
+};
+
+struct escape {
+ struct opcode op[8];
+ struct opcode high[64];
+};
+
+struct instr_dual {
+ struct opcode mod012;
+ struct opcode mod3;
+};
+
+struct mode_dual {
+ struct opcode mode32;
+ struct opcode mode64;
+};
+
+#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
+
+enum x86_transfer_type {
+ X86_TRANSFER_NONE,
+ X86_TRANSFER_CALL_JMP,
+ X86_TRANSFER_RET,
+ X86_TRANSFER_TASK_SWITCH,
+};
+
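+/* Flush GPRs dirtied during emulation back to the vCPU. */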
+static void writeback_registers(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long dirty = ctxt->regs_dirty;
+ unsigned reg;
+
+ for_each_set_bit(reg, &dirty, NR_EMULATOR_GPRS)
+ ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]);
+}
+
+static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->regs_dirty = 0;
+ ctxt->regs_valid = 0;
+}
+
+/*
+ * These EFLAGS bits are restored from saved value during emulation, and
+ * any changes are written back to the saved value after emulation.
+ */
+#define EFLAGS_MASK (X86_EFLAGS_OF|X86_EFLAGS_SF|X86_EFLAGS_ZF|X86_EFLAGS_AF|\
+ X86_EFLAGS_PF|X86_EFLAGS_CF)
+
+#ifdef CONFIG_X86_64
+#define ON64(x) x
+#else
+#define ON64(x)
+#endif
+
+/*
+ * fastop functions have a special calling convention:
+ *
+ * dst: rax (in/out)
+ * src: rdx (in/out)
+ * src2: rcx (in)
+ * flags: rflags (in/out)
+ * ex: rsi (in:fastop pointer, out:zero if exception)
+ *
+ * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
+ * different operand sizes can be reached by calculation, rather than a jump
+ * table (which would be bigger than the code).
+ *
+ * The 16 byte alignment, considering 5 bytes for the RET thunk, 3 for ENDBR
+ * and 1 for the straight line speculation INT3, leaves 7 bytes for the
+ * body of the function. Currently none is larger than 4.
+ */
+static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
+
+#define FASTOP_SIZE 16
+
+#define __FOP_FUNC(name) \
+ ".align " __stringify(FASTOP_SIZE) " \n\t" \
+ ".type " name ", @function \n\t" \
+ name ":\n\t" \
+ ASM_ENDBR \
+ IBT_NOSEAL(name)
+
+#define FOP_FUNC(name) \
+ __FOP_FUNC(#name)
+
+#define __FOP_RET(name) \
+ "11: " ASM_RET \
+ ".size " name ", .-" name "\n\t"
+
+#define FOP_RET(name) \
+ __FOP_RET(#name)
+
+#define __FOP_START(op, align) \
+ extern void em_##op(struct fastop *fake); \
+ asm(".pushsection .text, \"ax\" \n\t" \
+ ".global em_" #op " \n\t" \
+ ".align " __stringify(align) " \n\t" \
+ "em_" #op ":\n\t"
+
+#define FOP_START(op) __FOP_START(op, FASTOP_SIZE)
+
+#define FOP_END \
+ ".popsection")
+
+#define __FOPNOP(name) \
+ __FOP_FUNC(name) \
+ __FOP_RET(name)
+
+#define FOPNOP() \
+ __FOPNOP(__stringify(__UNIQUE_ID(nop)))
+
+#define FOP1E(op, dst) \
+ __FOP_FUNC(#op "_" #dst) \
+ "10: " #op " %" #dst " \n\t" \
+ __FOP_RET(#op "_" #dst)
+
+#define FOP1EEX(op, dst) \
+ FOP1E(op, dst) _ASM_EXTABLE_TYPE_REG(10b, 11b, EX_TYPE_ZERO_REG, %%esi)
+
+#define FASTOP1(op) \
+ FOP_START(op) \
+ FOP1E(op##b, al) \
+ FOP1E(op##w, ax) \
+ FOP1E(op##l, eax) \
+ ON64(FOP1E(op##q, rax)) \
+ FOP_END
+
+/* 1-operand, using src2 (for MUL/DIV r/m) */
+#define FASTOP1SRC2(op, name) \
+ FOP_START(name) \
+ FOP1E(op, cl) \
+ FOP1E(op, cx) \
+ FOP1E(op, ecx) \
+ ON64(FOP1E(op, rcx)) \
+ FOP_END
+
+/* 1-operand, using src2 (for MUL/DIV r/m), with exceptions */
+#define FASTOP1SRC2EX(op, name) \
+ FOP_START(name) \
+ FOP1EEX(op, cl) \
+ FOP1EEX(op, cx) \
+ FOP1EEX(op, ecx) \
+ ON64(FOP1EEX(op, rcx)) \
+ FOP_END
+
+#define FOP2E(op, dst, src) \
+ __FOP_FUNC(#op "_" #dst "_" #src) \
+ #op " %" #src ", %" #dst " \n\t" \
+ __FOP_RET(#op "_" #dst "_" #src)
+
+#define FASTOP2(op) \
+ FOP_START(op) \
+ FOP2E(op##b, al, dl) \
+ FOP2E(op##w, ax, dx) \
+ FOP2E(op##l, eax, edx) \
+ ON64(FOP2E(op##q, rax, rdx)) \
+ FOP_END
+
+/* 2 operand, word only */
+#define FASTOP2W(op) \
+ FOP_START(op) \
+ FOPNOP() \
+ FOP2E(op##w, ax, dx) \
+ FOP2E(op##l, eax, edx) \
+ ON64(FOP2E(op##q, rax, rdx)) \
+ FOP_END
+
+/* 2 operand, src is CL */
+#define FASTOP2CL(op) \
+ FOP_START(op) \
+ FOP2E(op##b, al, cl) \
+ FOP2E(op##w, ax, cl) \
+ FOP2E(op##l, eax, cl) \
+ ON64(FOP2E(op##q, rax, cl)) \
+ FOP_END
+
+/* 2 operand, src and dest are reversed */
+#define FASTOP2R(op, name) \
+ FOP_START(name) \
+ FOP2E(op##b, dl, al) \
+ FOP2E(op##w, dx, ax) \
+ FOP2E(op##l, edx, eax) \
+ ON64(FOP2E(op##q, rdx, rax)) \
+ FOP_END
+
+#define FOP3E(op, dst, src, src2) \
+ __FOP_FUNC(#op "_" #dst "_" #src "_" #src2) \
+ #op " %" #src2 ", %" #src ", %" #dst " \n\t"\
+ __FOP_RET(#op "_" #dst "_" #src "_" #src2)
+
+/* 3-operand, word-only, src2=cl */
+#define FASTOP3WCL(op) \
+ FOP_START(op) \
+ FOPNOP() \
+ FOP3E(op##w, ax, dx, cl) \
+ FOP3E(op##l, eax, edx, cl) \
+ ON64(FOP3E(op##q, rax, rdx, cl)) \
+ FOP_END
+
+/* Special case for SETcc - 1 instruction per cc */
+#define FOP_SETCC(op) \
+ FOP_FUNC(op) \
+ #op " %al \n\t" \
+ FOP_RET(op)
+
+FOP_START(setcc)
+FOP_SETCC(seto)
+FOP_SETCC(setno)
+FOP_SETCC(setc)
+FOP_SETCC(setnc)
+FOP_SETCC(setz)
+FOP_SETCC(setnz)
+FOP_SETCC(setbe)
+FOP_SETCC(setnbe)
+FOP_SETCC(sets)
+FOP_SETCC(setns)
+FOP_SETCC(setp)
+FOP_SETCC(setnp)
+FOP_SETCC(setl)
+FOP_SETCC(setnl)
+FOP_SETCC(setle)
+FOP_SETCC(setnle)
+FOP_END;
+
+FOP_START(salc)
+FOP_FUNC(salc)
+"pushf; sbb %al, %al; popf \n\t"
+FOP_RET(salc)
+FOP_END;
+
+/*
+ * XXX: inoutclob user must know where the argument is being expanded.
+ * Using asm goto would allow us to remove _fault.
+ */
+#define asm_safe(insn, inoutclob...) \
+({ \
+ int _fault = 0; \
+ \
+ asm volatile("1:" insn "\n" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %[_fault]) \
+ : [_fault] "+r"(_fault) inoutclob ); \
+ \
+ _fault ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; \
+})
+
+static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
+ enum x86_intercept intercept,
+ enum x86_intercept_stage stage)
+{
+ struct x86_instruction_info info = {
+ .intercept = intercept,
+ .rep_prefix = ctxt->rep_prefix,
+ .modrm_mod = ctxt->modrm_mod,
+ .modrm_reg = ctxt->modrm_reg,
+ .modrm_rm = ctxt->modrm_rm,
+ .src_val = ctxt->src.val64,
+ .dst_val = ctxt->dst.val64,
+ .src_bytes = ctxt->src.bytes,
+ .dst_bytes = ctxt->dst.bytes,
+ .ad_bytes = ctxt->ad_bytes,
+ .next_rip = ctxt->eip,
+ };
+
+ return ctxt->ops->intercept(ctxt, &info, stage);
+}
+
+static void assign_masked(ulong *dest, ulong src, ulong mask)
+{
+ *dest = (*dest & ~mask) | (src & mask);
+}
+
+static void assign_register(unsigned long *reg, u64 val, int bytes)
+{
+ /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
+ switch (bytes) {
+ case 1:
+ *(u8 *)reg = (u8)val;
+ break;
+ case 2:
+ *(u16 *)reg = (u16)val;
+ break;
+ case 4:
+ *reg = (u32)val;
+ break; /* 64b: zero-extend */
+ case 8:
+ *reg = val;
+ break;
+ }
+}
+
+static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
+{
+ return (1UL << (ctxt->ad_bytes << 3)) - 1;
+}
+
+static ulong stack_mask(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel;
+ struct desc_struct ss;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return ~0UL;
+ ctxt->ops->get_segment(ctxt, &sel, &ss, NULL, VCPU_SREG_SS);
+ return ~0U >> ((ss.d ^ 1) * 16); /* d=0: 0xffff; d=1: 0xffffffff */
+}
+
+static int stack_size(struct x86_emulate_ctxt *ctxt)
+{
+ return (__fls(stack_mask(ctxt)) + 1) >> 3;
+}
+
+/* Access/update address held in a register, based on addressing mode. */
+static inline unsigned long
+address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
+{
+ if (ctxt->ad_bytes == sizeof(unsigned long))
+ return reg;
+ else
+ return reg & ad_mask(ctxt);
+}
+
+static inline unsigned long
+register_address(struct x86_emulate_ctxt *ctxt, int reg)
+{
+ return address_mask(ctxt, reg_read(ctxt, reg));
+}
+
+static void masked_increment(ulong *reg, ulong mask, int inc)
+{
+ assign_masked(reg, *reg + inc, mask);
+}
+
+static inline void
+register_address_increment(struct x86_emulate_ctxt *ctxt, int reg, int inc)
+{
+ ulong *preg = reg_rmw(ctxt, reg);
+
+ assign_register(preg, *preg + inc, ctxt->ad_bytes);
+}
+
+static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
+{
+ masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc);
+}
+
+static u32 desc_limit_scaled(struct desc_struct *desc)
+{
+ u32 limit = get_desc_limit(desc);
+
+ return desc->g ? (limit << 12) | 0xfff : limit;
+}
+
+static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
+{
+ if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
+ return 0;
+
+ return ctxt->ops->get_cached_segment_base(ctxt, seg);
+}
+
+static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
+ u32 error, bool valid)
+{
+ if (KVM_EMULATOR_BUG_ON(vec > 0x1f, ctxt))
+ return X86EMUL_UNHANDLEABLE;
+
+ ctxt->exception.vector = vec;
+ ctxt->exception.error_code = error;
+ ctxt->exception.error_code_valid = valid;
+ return X86EMUL_PROPAGATE_FAULT;
+}
+
+static int emulate_db(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, DB_VECTOR, 0, false);
+}
+
+static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
+{
+ return emulate_exception(ctxt, GP_VECTOR, err, true);
+}
+
+static int emulate_ss(struct x86_emulate_ctxt *ctxt, int err)
+{
+ return emulate_exception(ctxt, SS_VECTOR, err, true);
+}
+
+static int emulate_ud(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, UD_VECTOR, 0, false);
+}
+
+static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
+{
+ return emulate_exception(ctxt, TS_VECTOR, err, true);
+}
+
+static int emulate_de(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, DE_VECTOR, 0, false);
+}
+
+static int emulate_nm(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, NM_VECTOR, 0, false);
+}
+
+static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg)
+{
+ u16 selector;
+ struct desc_struct desc;
+
+ ctxt->ops->get_segment(ctxt, &selector, &desc, NULL, seg);
+ return selector;
+}
+
+static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector,
+ unsigned seg)
+{
+ u16 dummy;
+ u32 base3;
+ struct desc_struct desc;
+
+ ctxt->ops->get_segment(ctxt, &dummy, &desc, &base3, seg);
+ ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg);
+}
+
+static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt)
+{
+ return (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_LA57) ? 57 : 48;
+}
+
+static inline bool emul_is_noncanonical_address(u64 la,
+ struct x86_emulate_ctxt *ctxt)
+{
+ return !__is_canonical_address(la, ctxt_virt_addr_bits(ctxt));
+}
+
+/*
+ * x86 defines three classes of vector instructions: explicitly
+ * aligned, explicitly unaligned, and the rest, which change behaviour
+ * depending on whether they're AVX encoded or not.
+ *
+ * Also included is CMPXCHG16B which is not a vector instruction, yet it is
+ * subject to the same check. FXSAVE and FXRSTOR are checked here too as their
+ * 512 bytes of data must be aligned to a 16 byte boundary.
+ */
+static unsigned insn_alignment(struct x86_emulate_ctxt *ctxt, unsigned size)
+{
+ u64 alignment = ctxt->d & AlignMask;
+
+ if (likely(size < 16))
+ return 1;
+
+ switch (alignment) {
+ case Unaligned:
+ case Avx:
+ return 1;
+ case Aligned16:
+ return 16;
+ case Aligned:
+ default:
+ return size;
+ }
+}
+
+static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ unsigned *max_size, unsigned size,
+ bool write, bool fetch,
+ enum x86emul_mode mode, ulong *linear)
+{
+ struct desc_struct desc;
+ bool usable;
+ ulong la;
+ u32 lim;
+ u16 sel;
+ u8 va_bits;
+
+ la = seg_base(ctxt, addr.seg) + addr.ea;
+ *max_size = 0;
+ switch (mode) {
+ case X86EMUL_MODE_PROT64:
+ *linear = la;
+ va_bits = ctxt_virt_addr_bits(ctxt);
+ if (!__is_canonical_address(la, va_bits))
+ goto bad;
+
+ *max_size = min_t(u64, ~0u, (1ull << va_bits) - la);
+ if (size > *max_size)
+ goto bad;
+ break;
+ default:
+ *linear = la = (u32)la;
+ usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL,
+ addr.seg);
+ if (!usable)
+ goto bad;
+ /* code segment in protected mode or read-only data segment */
+ if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8))
+ || !(desc.type & 2)) && write)
+ goto bad;
+ /* unreadable code segment */
+ if (!fetch && (desc.type & 8) && !(desc.type & 2))
+ goto bad;
+ lim = desc_limit_scaled(&desc);
+ if (!(desc.type & 8) && (desc.type & 4)) {
+ /* expand-down segment */
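+			/*
+			 * Valid offsets lie strictly above the limit, up to
+			 * 0xffff or 0xffffffff depending on the D/B bit.
+			 */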
+ if (addr.ea <= lim)
+ goto bad;
+ lim = desc.d ? 0xffffffff : 0xffff;
+ }
+ if (addr.ea > lim)
+ goto bad;
+ if (lim == 0xffffffff)
+ *max_size = ~0u;
+ else {
+ *max_size = (u64)lim + 1 - addr.ea;
+ if (size > *max_size)
+ goto bad;
+ }
+ break;
+ }
+ if (la & (insn_alignment(ctxt, size) - 1))
+ return emulate_gp(ctxt, 0);
+ return X86EMUL_CONTINUE;
+bad:
+ if (addr.seg == VCPU_SREG_SS)
+ return emulate_ss(ctxt, 0);
+ else
+ return emulate_gp(ctxt, 0);
+}
+
+static int linearize(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ unsigned size, bool write,
+ ulong *linear)
+{
+ unsigned max_size;
+ return __linearize(ctxt, addr, &max_size, size, write, false,
+ ctxt->mode, linear);
+}
+
+static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst)
+{
+ ulong linear;
+ int rc;
+ unsigned max_size;
+ struct segmented_address addr = { .seg = VCPU_SREG_CS,
+ .ea = dst };
+
+ if (ctxt->op_bytes != sizeof(unsigned long))
+ addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1);
+ rc = __linearize(ctxt, addr, &max_size, 1, false, true, ctxt->mode, &linear);
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->_eip = addr.ea;
+ return rc;
+}
+
+static inline int emulator_recalc_and_set_mode(struct x86_emulate_ctxt *ctxt)
+{
+ u64 efer;
+ struct desc_struct cs;
+ u16 selector;
+ u32 base3;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+
+ if (!(ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PE)) {
+ /* Real mode. cpu must not have long mode active */
+ if (efer & EFER_LMA)
+ return X86EMUL_UNHANDLEABLE;
+ ctxt->mode = X86EMUL_MODE_REAL;
+ return X86EMUL_CONTINUE;
+ }
+
+ if (ctxt->eflags & X86_EFLAGS_VM) {
+ /* Protected/VM86 mode. cpu must not have long mode active */
+ if (efer & EFER_LMA)
+ return X86EMUL_UNHANDLEABLE;
+ ctxt->mode = X86EMUL_MODE_VM86;
+ return X86EMUL_CONTINUE;
+ }
+
+ if (!ctxt->ops->get_segment(ctxt, &selector, &cs, &base3, VCPU_SREG_CS))
+ return X86EMUL_UNHANDLEABLE;
+
+ if (efer & EFER_LMA) {
+ if (cs.l) {
+ /* Proper long mode */
+ ctxt->mode = X86EMUL_MODE_PROT64;
+ } else if (cs.d) {
+			/* 32 bit compatibility mode */
+ ctxt->mode = X86EMUL_MODE_PROT32;
+ } else {
+ ctxt->mode = X86EMUL_MODE_PROT16;
+ }
+ } else {
+ /* Legacy 32 bit / 16 bit mode */
+ ctxt->mode = cs.d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst)
+{
+ return assign_eip(ctxt, dst);
+}
+
+static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst)
+{
+ int rc = emulator_recalc_and_set_mode(ctxt);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return assign_eip(ctxt, dst);
+}
+
+static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
+{
+ return assign_eip_near(ctxt, ctxt->_eip + rel);
+}
+
+static int linear_read_system(struct x86_emulate_ctxt *ctxt, ulong linear,
+ void *data, unsigned size)
+{
+ return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception, true);
+}
+
+static int linear_write_system(struct x86_emulate_ctxt *ctxt,
+ ulong linear, void *data,
+ unsigned int size)
+{
+ return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception, true);
+}
+
+static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, false, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception, false);
+}
+
+static int segmented_write_std(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ void *data,
+ unsigned int size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, true, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception, false);
+}
+
+/*
+ * Prefetch the remaining bytes of the instruction without crossing page
+ * boundary if they are not in fetch_cache yet.
+ */
+static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size)
+{
+ int rc;
+ unsigned size, max_size;
+ unsigned long linear;
+ int cur_size = ctxt->fetch.end - ctxt->fetch.data;
+ struct segmented_address addr = { .seg = VCPU_SREG_CS,
+ .ea = ctxt->eip + cur_size };
+
+ /*
+ * We do not know exactly how many bytes will be needed, and
+ * __linearize is expensive, so fetch as much as possible. We
+ * just have to avoid going beyond the 15 byte limit, the end
+ * of the segment, or the end of the page.
+ *
+ * __linearize is called with size 0 so that it does not do any
+ * boundary check itself. Instead, we use max_size to check
+ * against op_size.
+ */
+ rc = __linearize(ctxt, addr, &max_size, 0, false, true, ctxt->mode,
+ &linear);
+ if (unlikely(rc != X86EMUL_CONTINUE))
+ return rc;
+
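+	/* cur_size is at most 15, so "15UL ^ cur_size" equals 15 - cur_size. */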
+ size = min_t(unsigned, 15UL ^ cur_size, max_size);
+ size = min_t(unsigned, size, PAGE_SIZE - offset_in_page(linear));
+
+ /*
+	 * One instruction can only straddle two pages, and one page has
+	 * already been loaded at the beginning of x86_decode_insn().  So,
+	 * if there still aren't enough bytes, we must have hit the 15-byte
+	 * instruction length limit.
+ */
+ if (unlikely(size < op_size))
+ return emulate_gp(ctxt, 0);
+
+ rc = ctxt->ops->fetch(ctxt, linear, ctxt->fetch.end,
+ size, &ctxt->exception);
+ if (unlikely(rc != X86EMUL_CONTINUE))
+ return rc;
+ ctxt->fetch.end += size;
+ return X86EMUL_CONTINUE;
+}
+
+static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt,
+ unsigned size)
+{
+ unsigned done_size = ctxt->fetch.end - ctxt->fetch.ptr;
+
+ if (unlikely(done_size < size))
+ return __do_insn_fetch_bytes(ctxt, size - done_size);
+ else
+ return X86EMUL_CONTINUE;
+}
+
+/* Fetch next part of the instruction being emulated. */
+#define insn_fetch(_type, _ctxt) \
+({ _type _x; \
+ \
+ rc = do_insn_fetch_bytes(_ctxt, sizeof(_type)); \
+ if (rc != X86EMUL_CONTINUE) \
+ goto done; \
+ ctxt->_eip += sizeof(_type); \
+ memcpy(&_x, ctxt->fetch.ptr, sizeof(_type)); \
+ ctxt->fetch.ptr += sizeof(_type); \
+ _x; \
+})
+
+#define insn_fetch_arr(_arr, _size, _ctxt) \
+({ \
+ rc = do_insn_fetch_bytes(_ctxt, _size); \
+ if (rc != X86EMUL_CONTINUE) \
+ goto done; \
+ ctxt->_eip += (_size); \
+ memcpy(_arr, ctxt->fetch.ptr, _size); \
+ ctxt->fetch.ptr += (_size); \
+})
+
+/*
+ * Given the 'reg' portion of a ModRM byte, and a register block, return a
+ * pointer into the block that addresses the relevant register.
+ * @byteop selects byte operands; AH, CH, DH and BH are decoded only for
+ * byte operands without a REX prefix.
+ */
+static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg,
+ int byteop)
+{
+ void *p;
+ int highbyte_regs = (ctxt->rex_prefix == 0) && byteop;
+
+ if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
+ p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1;
+ else
+ p = reg_rmw(ctxt, modrm_reg);
+ return p;
+}
+
+static int read_descriptor(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ u16 *size, unsigned long *address, int op_bytes)
+{
+ int rc;
+
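+	/* With a 16-bit operand size, LGDT/LIDT load only a 24-bit base. */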
+ if (op_bytes == 2)
+ op_bytes = 3;
+ *address = 0;
+ rc = segmented_read_std(ctxt, addr, size, 2);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ addr.ea += 2;
+ rc = segmented_read_std(ctxt, addr, address, op_bytes);
+ return rc;
+}
+
+FASTOP2(add);
+FASTOP2(or);
+FASTOP2(adc);
+FASTOP2(sbb);
+FASTOP2(and);
+FASTOP2(sub);
+FASTOP2(xor);
+FASTOP2(cmp);
+FASTOP2(test);
+
+FASTOP1SRC2(mul, mul_ex);
+FASTOP1SRC2(imul, imul_ex);
+FASTOP1SRC2EX(div, div_ex);
+FASTOP1SRC2EX(idiv, idiv_ex);
+
+FASTOP3WCL(shld);
+FASTOP3WCL(shrd);
+
+FASTOP2W(imul);
+
+FASTOP1(not);
+FASTOP1(neg);
+FASTOP1(inc);
+FASTOP1(dec);
+
+FASTOP2CL(rol);
+FASTOP2CL(ror);
+FASTOP2CL(rcl);
+FASTOP2CL(rcr);
+FASTOP2CL(shl);
+FASTOP2CL(shr);
+FASTOP2CL(sar);
+
+FASTOP2W(bsf);
+FASTOP2W(bsr);
+FASTOP2W(bt);
+FASTOP2W(bts);
+FASTOP2W(btr);
+FASTOP2W(btc);
+
+FASTOP2(xadd);
+
+FASTOP2R(cmp, cmp_r);
+
+static int em_bsf_c(struct x86_emulate_ctxt *ctxt)
+{
+ /* If src is zero, do not writeback, but update flags */
+ if (ctxt->src.val == 0)
+ ctxt->dst.type = OP_NONE;
+ return fastop(ctxt, em_bsf);
+}
+
+static int em_bsr_c(struct x86_emulate_ctxt *ctxt)
+{
+ /* If src is zero, do not writeback, but update flags */
+ if (ctxt->src.val == 0)
+ ctxt->dst.type = OP_NONE;
+ return fastop(ctxt, em_bsr);
+}
+
+static __always_inline u8 test_cc(unsigned int condition, unsigned long flags)
+{
+ u8 rc;
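+	/* The 16 SETcc thunks start at em_setcc, FASTOP_SIZE bytes apart. */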
+ void (*fop)(void) = (void *)em_setcc + FASTOP_SIZE * (condition & 0xf);
+
+ flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;
+ asm("push %[flags]; popf; " CALL_NOSPEC
+ : "=a"(rc) : [thunk_target]"r"(fop), [flags]"r"(flags));
+ return rc;
+}
+
+static void fetch_register_operand(struct operand *op)
+{
+ switch (op->bytes) {
+ case 1:
+ op->val = *(u8 *)op->addr.reg;
+ break;
+ case 2:
+ op->val = *(u16 *)op->addr.reg;
+ break;
+ case 4:
+ op->val = *(u32 *)op->addr.reg;
+ break;
+ case 8:
+ op->val = *(u64 *)op->addr.reg;
+ break;
+ }
+}
+
+static int em_fninit(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ kvm_fpu_get();
+ asm volatile("fninit");
+ kvm_fpu_put();
+ return X86EMUL_CONTINUE;
+}
+
+static int em_fnstcw(struct x86_emulate_ctxt *ctxt)
+{
+ u16 fcw;
+
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ kvm_fpu_get();
+ asm volatile("fnstcw %0": "+m"(fcw));
+ kvm_fpu_put();
+
+ ctxt->dst.val = fcw;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
+{
+ u16 fsw;
+
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ kvm_fpu_get();
+ asm volatile("fnstsw %0": "+m"(fsw));
+ kvm_fpu_put();
+
+ ctxt->dst.val = fsw;
+
+ return X86EMUL_CONTINUE;
+}
+
+static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
+ struct operand *op)
+{
+ unsigned int reg;
+
+ if (ctxt->d & ModRM)
+ reg = ctxt->modrm_reg;
+ else
+ reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
+
+ if (ctxt->d & Sse) {
+ op->type = OP_XMM;
+ op->bytes = 16;
+ op->addr.xmm = reg;
+ kvm_read_sse_reg(reg, &op->vec_val);
+ return;
+ }
+ if (ctxt->d & Mmx) {
+ reg &= 7;
+ op->type = OP_MM;
+ op->bytes = 8;
+ op->addr.mm = reg;
+ return;
+ }
+
+ op->type = OP_REG;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.reg = decode_register(ctxt, reg, ctxt->d & ByteOp);
+
+ fetch_register_operand(op);
+ op->orig_val = op->val;
+}
+
+static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg)
+{
+ if (base_reg == VCPU_REGS_RSP || base_reg == VCPU_REGS_RBP)
+ ctxt->modrm_seg = VCPU_SREG_SS;
+}
+
+static int decode_modrm(struct x86_emulate_ctxt *ctxt,
+ struct operand *op)
+{
+ u8 sib;
+ int index_reg, base_reg, scale;
+ int rc = X86EMUL_CONTINUE;
+ ulong modrm_ea = 0;
+
+ ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */
+ index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */
+ base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */
+
+ ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6;
+ ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
+ ctxt->modrm_rm = base_reg | (ctxt->modrm & 0x07);
+ ctxt->modrm_seg = VCPU_SREG_DS;
+
+ if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) {
+ op->type = OP_REG;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.reg = decode_register(ctxt, ctxt->modrm_rm,
+ ctxt->d & ByteOp);
+ if (ctxt->d & Sse) {
+ op->type = OP_XMM;
+ op->bytes = 16;
+ op->addr.xmm = ctxt->modrm_rm;
+ kvm_read_sse_reg(ctxt->modrm_rm, &op->vec_val);
+ return rc;
+ }
+ if (ctxt->d & Mmx) {
+ op->type = OP_MM;
+ op->bytes = 8;
+ op->addr.mm = ctxt->modrm_rm & 7;
+ return rc;
+ }
+ fetch_register_operand(op);
+ return rc;
+ }
+
+ op->type = OP_MEM;
+
+ if (ctxt->ad_bytes == 2) {
+ unsigned bx = reg_read(ctxt, VCPU_REGS_RBX);
+ unsigned bp = reg_read(ctxt, VCPU_REGS_RBP);
+ unsigned si = reg_read(ctxt, VCPU_REGS_RSI);
+ unsigned di = reg_read(ctxt, VCPU_REGS_RDI);
+
+ /* 16-bit ModR/M decode. */
+ switch (ctxt->modrm_mod) {
+ case 0:
+ if (ctxt->modrm_rm == 6)
+ modrm_ea += insn_fetch(u16, ctxt);
+ break;
+ case 1:
+ modrm_ea += insn_fetch(s8, ctxt);
+ break;
+ case 2:
+ modrm_ea += insn_fetch(u16, ctxt);
+ break;
+ }
+ switch (ctxt->modrm_rm) {
+ case 0:
+ modrm_ea += bx + si;
+ break;
+ case 1:
+ modrm_ea += bx + di;
+ break;
+ case 2:
+ modrm_ea += bp + si;
+ break;
+ case 3:
+ modrm_ea += bp + di;
+ break;
+ case 4:
+ modrm_ea += si;
+ break;
+ case 5:
+ modrm_ea += di;
+ break;
+ case 6:
+ if (ctxt->modrm_mod != 0)
+ modrm_ea += bp;
+ break;
+ case 7:
+ modrm_ea += bx;
+ break;
+ }
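+		/* BP-based forms (BP+SI, BP+DI, BP+disp) default to the SS segment. */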
+ if (ctxt->modrm_rm == 2 || ctxt->modrm_rm == 3 ||
+ (ctxt->modrm_rm == 6 && ctxt->modrm_mod != 0))
+ ctxt->modrm_seg = VCPU_SREG_SS;
+ modrm_ea = (u16)modrm_ea;
+ } else {
+ /* 32/64-bit ModR/M decode. */
+ if ((ctxt->modrm_rm & 7) == 4) {
+ sib = insn_fetch(u8, ctxt);
+ index_reg |= (sib >> 3) & 7;
+ base_reg |= sib & 7;
+ scale = sib >> 6;
+
+ if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
+ modrm_ea += insn_fetch(s32, ctxt);
+ else {
+ modrm_ea += reg_read(ctxt, base_reg);
+ adjust_modrm_seg(ctxt, base_reg);
+ /* Increment ESP on POP [ESP] */
+ if ((ctxt->d & IncSP) &&
+ base_reg == VCPU_REGS_RSP)
+ modrm_ea += ctxt->op_bytes;
+ }
+ if (index_reg != 4)
+ modrm_ea += reg_read(ctxt, index_reg) << scale;
+ } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
+ modrm_ea += insn_fetch(s32, ctxt);
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ ctxt->rip_relative = 1;
+ } else {
+ base_reg = ctxt->modrm_rm;
+ modrm_ea += reg_read(ctxt, base_reg);
+ adjust_modrm_seg(ctxt, base_reg);
+ }
+ switch (ctxt->modrm_mod) {
+ case 1:
+ modrm_ea += insn_fetch(s8, ctxt);
+ break;
+ case 2:
+ modrm_ea += insn_fetch(s32, ctxt);
+ break;
+ }
+ }
+ op->addr.mem.ea = modrm_ea;
+ if (ctxt->ad_bytes != 8)
+ ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea;
+
+done:
+ return rc;
+}
+
+static int decode_abs(struct x86_emulate_ctxt *ctxt,
+ struct operand *op)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ op->type = OP_MEM;
+ switch (ctxt->ad_bytes) {
+ case 2:
+ op->addr.mem.ea = insn_fetch(u16, ctxt);
+ break;
+ case 4:
+ op->addr.mem.ea = insn_fetch(u32, ctxt);
+ break;
+ case 8:
+ op->addr.mem.ea = insn_fetch(u64, ctxt);
+ break;
+ }
+done:
+ return rc;
+}
+
+static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt)
+{
+ long sv = 0, mask;
+
+ if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) {
+ mask = ~((long)ctxt->dst.bytes * 8 - 1);
+
+ if (ctxt->src.bytes == 2)
+ sv = (s16)ctxt->src.val & (s16)mask;
+ else if (ctxt->src.bytes == 4)
+ sv = (s32)ctxt->src.val & (s32)mask;
+ else
+ sv = (s64)ctxt->src.val & (s64)mask;
+
+ ctxt->dst.addr.mem.ea = address_mask(ctxt,
+ ctxt->dst.addr.mem.ea + (sv >> 3));
+ }
+
+ /* only subword offset */
+ ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
+}
+
+static int read_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *dest, unsigned size)
+{
+ int rc;
+ struct read_cache *mc = &ctxt->mem_read;
+
+ if (mc->pos < mc->end)
+ goto read_cached;
+
+ if (KVM_EMULATOR_BUG_ON((mc->end + size) >= sizeof(mc->data), ctxt))
+ return X86EMUL_UNHANDLEABLE;
+
+ rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size,
+ &ctxt->exception);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ mc->end += size;
+
+read_cached:
+ memcpy(dest, mc->data + mc->pos, size);
+ mc->pos += size;
+ return X86EMUL_CONTINUE;
+}
+
+static int segmented_read(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, false, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return read_emulated(ctxt, linear, data, size);
+}
+
+static int segmented_write(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ const void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, true, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->write_emulated(ctxt, linear, data, size,
+ &ctxt->exception);
+}
+
+static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ const void *orig_data, const void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, true, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->cmpxchg_emulated(ctxt, linear, orig_data, data,
+ size, &ctxt->exception);
+}
+
+static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned int size, unsigned short port,
+ void *dest)
+{
+ struct read_cache *rc = &ctxt->io_read;
+
+ if (rc->pos == rc->end) { /* refill pio read ahead */
+ unsigned int in_page, n;
+ unsigned int count = ctxt->rep_prefix ?
+ address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1;
+ in_page = (ctxt->eflags & X86_EFLAGS_DF) ?
+ offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) :
+ PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI));
+ n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count);
+ if (n == 0)
+ n = 1;
+ rc->pos = rc->end = 0;
+ if (!ctxt->ops->pio_in_emulated(ctxt, size, port, rc->data, n))
+ return 0;
+ rc->end = n * size;
+ }
+
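+	/*
+	 * Forward rep string reads hand the whole read-ahead buffer to the
+	 * string-op writeback; everything else consumes a single element.
+	 */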
+ if (ctxt->rep_prefix && (ctxt->d & String) &&
+ !(ctxt->eflags & X86_EFLAGS_DF)) {
+ ctxt->dst.data = rc->data + rc->pos;
+ ctxt->dst.type = OP_MEM_STR;
+ ctxt->dst.count = (rc->end - rc->pos) / size;
+ rc->pos = rc->end;
+ } else {
+ memcpy(dest, rc->data + rc->pos, size);
+ rc->pos += size;
+ }
+ return 1;
+}
+
+static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 index, struct desc_struct *desc)
+{
+ struct desc_ptr dt;
+ ulong addr;
+
+ ctxt->ops->get_idt(ctxt, &dt);
+
+ if (dt.size < index * 8 + 7)
+ return emulate_gp(ctxt, index << 3 | 0x2);
+
+ addr = dt.address + index * 8;
+ return linear_read_system(ctxt, addr, desc, sizeof(*desc));
+}
+
+static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
+ u16 selector, struct desc_ptr *dt)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ u32 base3 = 0;
+
+ if (selector & 1 << 2) {
+ struct desc_struct desc;
+ u16 sel;
+
+ memset(dt, 0, sizeof(*dt));
+ if (!ops->get_segment(ctxt, &sel, &desc, &base3,
+ VCPU_SREG_LDTR))
+ return;
+
+ dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
+ dt->address = get_desc_base(&desc) | ((u64)base3 << 32);
+ } else
+ ops->get_gdt(ctxt, dt);
+}
+
+static int get_descriptor_ptr(struct x86_emulate_ctxt *ctxt,
+ u16 selector, ulong *desc_addr_p)
+{
+ struct desc_ptr dt;
+ u16 index = selector >> 3;
+ ulong addr;
+
+ get_descriptor_table_ptr(ctxt, selector, &dt);
+
+ if (dt.size < index * 8 + 7)
+ return emulate_gp(ctxt, selector & 0xfffc);
+
+ addr = dt.address + index * 8;
+
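+	/* If not in long mode, truncate the descriptor address to 32 bits. */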
+#ifdef CONFIG_X86_64
+ if (addr >> 32 != 0) {
+ u64 efer = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (!(efer & EFER_LMA))
+ addr &= (u32)-1;
+ }
+#endif
+
+ *desc_addr_p = addr;
+ return X86EMUL_CONTINUE;
+}
+
+/* allowed just for 8 bytes segments */
+static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, struct desc_struct *desc,
+ ulong *desc_addr_p)
+{
+ int rc;
+
+ rc = get_descriptor_ptr(ctxt, selector, desc_addr_p);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return linear_read_system(ctxt, *desc_addr_p, desc, sizeof(*desc));
+}
+
+/* allowed just for 8 bytes segments */
+static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, struct desc_struct *desc)
+{
+ int rc;
+ ulong addr;
+
+ rc = get_descriptor_ptr(ctxt, selector, &addr);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return linear_write_system(ctxt, addr, desc, sizeof(*desc));
+}
+
+static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, int seg, u8 cpl,
+ enum x86_transfer_type transfer,
+ struct desc_struct *desc)
+{
+ struct desc_struct seg_desc, old_desc;
+ u8 dpl, rpl;
+ unsigned err_vec = GP_VECTOR;
+ u32 err_code = 0;
+ bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
+ ulong desc_addr;
+ int ret;
+ u16 dummy;
+ u32 base3 = 0;
+
+ memset(&seg_desc, 0, sizeof(seg_desc));
+
+ if (ctxt->mode == X86EMUL_MODE_REAL) {
+ /* set real mode segment descriptor (keep limit etc. for
+ * unreal mode) */
+ ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg);
+ set_desc_base(&seg_desc, selector << 4);
+ goto load;
+ } else if (seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) {
+ /* VM86 needs a clean new segment descriptor */
+ set_desc_base(&seg_desc, selector << 4);
+ set_desc_limit(&seg_desc, 0xffff);
+ seg_desc.type = 3;
+ seg_desc.p = 1;
+ seg_desc.s = 1;
+ seg_desc.dpl = 3;
+ goto load;
+ }
+
+ rpl = selector & 3;
+
+ /* TR should be in GDT only */
+ if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
+ goto exception;
+
+ /* NULL selector is not valid for TR, CS and (except for long mode) SS */
+ if (null_selector) {
+ if (seg == VCPU_SREG_CS || seg == VCPU_SREG_TR)
+ goto exception;
+
+ if (seg == VCPU_SREG_SS) {
+ if (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)
+ goto exception;
+
+ /*
+ * ctxt->ops->set_segment expects the CPL to be in
+ * SS.DPL, so fake an expand-up 32-bit data segment.
+ */
+ seg_desc.type = 3;
+ seg_desc.p = 1;
+ seg_desc.s = 1;
+ seg_desc.dpl = cpl;
+ seg_desc.d = 1;
+ seg_desc.g = 1;
+ }
+
+ /* Skip all following checks */
+ goto load;
+ }
+
+ ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ err_code = selector & 0xfffc;
+ err_vec = (transfer == X86_TRANSFER_TASK_SWITCH) ? TS_VECTOR :
+ GP_VECTOR;
+
+ /* can't load system descriptor into segment selector */
+ if (seg <= VCPU_SREG_GS && !seg_desc.s) {
+ if (transfer == X86_TRANSFER_CALL_JMP)
+ return X86EMUL_UNHANDLEABLE;
+ goto exception;
+ }
+
+ dpl = seg_desc.dpl;
+
+ switch (seg) {
+ case VCPU_SREG_SS:
+ /*
+ * segment is not a writable data segment or segment
+ * selector's RPL != CPL or DPL != CPL
+ */
+ if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl)
+ goto exception;
+ break;
+ case VCPU_SREG_CS:
+ /*
+ * KVM uses "none" when loading CS as part of emulating Real
+ * Mode exceptions and IRET (handled above). In all other
+ * cases, loading CS without a control transfer is a KVM bug.
+ */
+ if (WARN_ON_ONCE(transfer == X86_TRANSFER_NONE))
+ goto exception;
+
+ if (!(seg_desc.type & 8))
+ goto exception;
+
+ if (transfer == X86_TRANSFER_RET) {
+ /* RET can never return to an inner privilege level. */
+ if (rpl < cpl)
+ goto exception;
+ /* Outer-privilege level return is not implemented */
+ if (rpl > cpl)
+ return X86EMUL_UNHANDLEABLE;
+ }
+ if (transfer == X86_TRANSFER_RET || transfer == X86_TRANSFER_TASK_SWITCH) {
+ if (seg_desc.type & 4) {
+ /* conforming */
+ if (dpl > rpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (dpl != rpl)
+ goto exception;
+ }
+ } else { /* X86_TRANSFER_CALL_JMP */
+ if (seg_desc.type & 4) {
+ /* conforming */
+ if (dpl > cpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (rpl > cpl || dpl != cpl)
+ goto exception;
+ }
+ }
+ /* in long-mode d/b must be clear if l is set */
+ if (seg_desc.d && seg_desc.l) {
+ u64 efer = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (efer & EFER_LMA)
+ goto exception;
+ }
+
+ /* CS(RPL) <- CPL */
+ selector = (selector & 0xfffc) | cpl;
+ break;
+ case VCPU_SREG_TR:
+ if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
+ goto exception;
+ break;
+ case VCPU_SREG_LDTR:
+ if (seg_desc.s || seg_desc.type != 2)
+ goto exception;
+ break;
+ default: /* DS, ES, FS, or GS */
+ /*
+ * segment is not a data or readable code segment or
+ * ((segment is a data or nonconforming code segment)
+ * and ((RPL > DPL) or (CPL > DPL)))
+ */
+ if ((seg_desc.type & 0xa) == 0x8 ||
+ (((seg_desc.type & 0xc) != 0xc) &&
+ (rpl > dpl || cpl > dpl)))
+ goto exception;
+ break;
+ }
+
+ if (!seg_desc.p) {
+ err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+ goto exception;
+ }
+
+ if (seg_desc.s) {
+ /* mark segment as accessed */
+ if (!(seg_desc.type & 1)) {
+ seg_desc.type |= 1;
+ ret = write_segment_descriptor(ctxt, selector,
+ &seg_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ }
+ } else if (ctxt->mode == X86EMUL_MODE_PROT64) {
+ ret = linear_read_system(ctxt, desc_addr+8, &base3, sizeof(base3));
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ if (emul_is_noncanonical_address(get_desc_base(&seg_desc) |
+ ((u64)base3 << 32), ctxt))
+ return emulate_gp(ctxt, err_code);
+ }
+
+ if (seg == VCPU_SREG_TR) {
+ old_desc = seg_desc;
+ seg_desc.type |= 2; /* busy */
+ ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
+ sizeof(seg_desc), &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ }
+load:
+ ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg);
+ if (desc)
+ *desc = seg_desc;
+ return X86EMUL_CONTINUE;
+exception:
+ return emulate_exception(ctxt, err_vec, err_code, true);
+}
+
+static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, int seg)
+{
+ u8 cpl = ctxt->ops->cpl(ctxt);
+
+ /*
+ * None of MOV, POP and LSS can load a NULL selector in CPL=3, but
+ * they can load it at CPL<3 (Intel's manual says only LSS can,
+ * but it's wrong).
+ *
+ * However, the Intel manual says that putting IST=1/DPL=3 in
+ * an interrupt gate will result in SS=3 (the AMD manual instead
+ * says it doesn't), so allow SS=3 in __load_segment_descriptor
+ * and only forbid it here.
+ */
+ if (seg == VCPU_SREG_SS && selector == 3 &&
+ ctxt->mode == X86EMUL_MODE_PROT64)
+ return emulate_exception(ctxt, GP_VECTOR, 0, true);
+
+ return __load_segment_descriptor(ctxt, selector, seg, cpl,
+ X86_TRANSFER_NONE, NULL);
+}
+
+static void write_register_operand(struct operand *op)
+{
+ return assign_register(op->addr.reg, op->val, op->bytes);
+}
+
+static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
+{
+ switch (op->type) {
+ case OP_REG:
+ write_register_operand(op);
+ break;
+ case OP_MEM:
+ if (ctxt->lock_prefix)
+ return segmented_cmpxchg(ctxt,
+ op->addr.mem,
+ &op->orig_val,
+ &op->val,
+ op->bytes);
+ else
+ return segmented_write(ctxt,
+ op->addr.mem,
+ &op->val,
+ op->bytes);
+ case OP_MEM_STR:
+ return segmented_write(ctxt,
+ op->addr.mem,
+ op->data,
+ op->bytes * op->count);
+ case OP_XMM:
+ kvm_write_sse_reg(op->addr.xmm, &op->vec_val);
+ break;
+ case OP_MM:
+ kvm_write_mmx_reg(op->addr.mm, &op->mm_val);
+ break;
+ case OP_NONE:
+ /* no writeback */
+ break;
+ default:
+ break;
+ }
+ return X86EMUL_CONTINUE;
+}
+
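+/*
+ * Push @bytes of @data onto the emulated stack: RSP is decremented by
+ * @bytes and masked to the current stack-address width, then the data is
+ * written through SS.
+ */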
+static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)
+{
+ struct segmented_address addr;
+
+ rsp_increment(ctxt, -bytes);
+ addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
+ addr.seg = VCPU_SREG_SS;
+
+ return segmented_write(ctxt, addr, data, bytes);
+}
+
+static int em_push(struct x86_emulate_ctxt *ctxt)
+{
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return push(ctxt, &ctxt->src.val, ctxt->op_bytes);
+}
+
+static int emulate_pop(struct x86_emulate_ctxt *ctxt,
+ void *dest, int len)
+{
+ int rc;
+ struct segmented_address addr;
+
+ addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
+ addr.seg = VCPU_SREG_SS;
+ rc = segmented_read(ctxt, addr, dest, len);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rsp_increment(ctxt, len);
+ return rc;
+}
+
+static int em_pop(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
+}
+
+static int emulate_popf(struct x86_emulate_ctxt *ctxt,
+ void *dest, int len)
+{
+ int rc;
+ unsigned long val, change_mask;
+ int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
+ int cpl = ctxt->ops->cpl(ctxt);
+
+ rc = emulate_pop(ctxt, &val, len);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ change_mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF |
+ X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_NT |
+ X86_EFLAGS_AC | X86_EFLAGS_ID;
+
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_PROT64:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT16:
+ if (cpl == 0)
+ change_mask |= X86_EFLAGS_IOPL;
+ if (cpl <= iopl)
+ change_mask |= X86_EFLAGS_IF;
+ break;
+ case X86EMUL_MODE_VM86:
+ if (iopl < 3)
+ return emulate_gp(ctxt, 0);
+ change_mask |= X86_EFLAGS_IF;
+ break;
+ default: /* real mode */
+ change_mask |= (X86_EFLAGS_IOPL | X86_EFLAGS_IF);
+ break;
+ }
+
+ *(unsigned long *)dest =
+ (ctxt->eflags & ~change_mask) | (val & change_mask);
+
+ return rc;
+}
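+
+/*
+ * Example of the POPF masking above (hypothetical values, not from a real
+ * trace): at CPL=3 with IOPL=0 in protected mode, change_mask includes
+ * neither X86_EFLAGS_IOPL nor X86_EFLAGS_IF, so popping 0xffffffff leaves
+ * IF and IOPL untouched while still updating the arithmetic flags, TF,
+ * DF, NT, AC and ID.
+ */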
+
+static int em_popf(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.type = OP_REG;
+ ctxt->dst.addr.reg = &ctxt->eflags;
+ ctxt->dst.bytes = ctxt->op_bytes;
+ return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
+}
+
+static int em_enter(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned frame_size = ctxt->src.val;
+ unsigned nesting_level = ctxt->src2.val & 31;
+ ulong rbp;
+
+ if (nesting_level)
+ return X86EMUL_UNHANDLEABLE;
+
+ rbp = reg_read(ctxt, VCPU_REGS_RBP);
+ rc = push(ctxt, &rbp, stack_size(ctxt));
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ assign_masked(reg_rmw(ctxt, VCPU_REGS_RBP), reg_read(ctxt, VCPU_REGS_RSP),
+ stack_mask(ctxt));
+ assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP),
+ reg_read(ctxt, VCPU_REGS_RSP) - frame_size,
+ stack_mask(ctxt));
+ return X86EMUL_CONTINUE;
+}
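+
+/*
+ * For the supported nesting level of 0, ENTER above is equivalent to this
+ * illustrative pseudocode:
+ *
+ *	push	rBP
+ *	rBP = rSP
+ *	rSP -= frame_size
+ *
+ * Non-zero nesting levels are rare and simply fail emulation.
+ */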
+
+static int em_leave(struct x86_emulate_ctxt *ctxt)
+{
+ assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP), reg_read(ctxt, VCPU_REGS_RBP),
+ stack_mask(ctxt));
+ return emulate_pop(ctxt, reg_rmw(ctxt, VCPU_REGS_RBP), ctxt->op_bytes);
+}
+
+static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
+{
+ int seg = ctxt->src2.val;
+
+ ctxt->src.val = get_segment_selector(ctxt, seg);
+ if (ctxt->op_bytes == 4) {
+ rsp_increment(ctxt, -2);
+ ctxt->op_bytes = 2;
+ }
+
+ return em_push(ctxt);
+}
+
+static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
+{
+ int seg = ctxt->src2.val;
+ unsigned long selector;
+ int rc;
+
+ rc = emulate_pop(ctxt, &selector, 2);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (seg == VCPU_SREG_SS)
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+ if (ctxt->op_bytes > 2)
+ rsp_increment(ctxt, ctxt->op_bytes - 2);
+
+ rc = load_segment_descriptor(ctxt, (u16)selector, seg);
+ return rc;
+}
+
+static int em_pusha(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long old_esp = reg_read(ctxt, VCPU_REGS_RSP);
+ int rc = X86EMUL_CONTINUE;
+ int reg = VCPU_REGS_RAX;
+
+ while (reg <= VCPU_REGS_RDI) {
+		if (reg == VCPU_REGS_RSP)
+			ctxt->src.val = old_esp;
+		else
+			ctxt->src.val = reg_read(ctxt, reg);
+
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ++reg;
+ }
+
+ return rc;
+}
+
+static int em_pushf(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->src.val = (unsigned long)ctxt->eflags & ~X86_EFLAGS_VM;
+ return em_push(ctxt);
+}
+
+static int em_popa(struct x86_emulate_ctxt *ctxt)
+{
+ int rc = X86EMUL_CONTINUE;
+ int reg = VCPU_REGS_RDI;
+ u32 val;
+
+ while (reg >= VCPU_REGS_RAX) {
+ if (reg == VCPU_REGS_RSP) {
+ rsp_increment(ctxt, ctxt->op_bytes);
+ --reg;
+ }
+
+ rc = emulate_pop(ctxt, &val, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ break;
+ assign_register(reg_rmw(ctxt, reg), val, ctxt->op_bytes);
+ --reg;
+ }
+ return rc;
+}
+
+static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ int rc;
+ struct desc_ptr dt;
+ gva_t cs_addr;
+ gva_t eip_addr;
+ u16 cs, eip;
+
+ /* TODO: Add limit checks */
+ ctxt->src.val = ctxt->eflags;
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->eflags &= ~(X86_EFLAGS_IF | X86_EFLAGS_TF | X86_EFLAGS_AC);
+
+ ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->src.val = ctxt->_eip;
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ops->get_idt(ctxt, &dt);
+
+ eip_addr = dt.address + (irq << 2);
+ cs_addr = dt.address + (irq << 2) + 2;
+
+ rc = linear_read_system(ctxt, cs_addr, &cs, 2);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = linear_read_system(ctxt, eip_addr, &eip, 2);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = load_segment_descriptor(ctxt, cs, VCPU_SREG_CS);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->_eip = eip;
+
+ return rc;
+}
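+
+/*
+ * Real-mode IVT layout used above: entry @irq is four bytes at IDT base +
+ * irq * 4, with the new IP in the low word and the new CS in the high
+ * word.  As an illustration, INT 0x10 with the usual IDT base of 0
+ * fetches IP from linear address 0x40 and CS from 0x42.
+ */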
+
+int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
+{
+ int rc;
+
+ invalidate_registers(ctxt);
+ rc = __emulate_int_real(ctxt, irq);
+ if (rc == X86EMUL_CONTINUE)
+ writeback_registers(ctxt);
+ return rc;
+}
+
+static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq)
+{
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_REAL:
+ return __emulate_int_real(ctxt, irq);
+ case X86EMUL_MODE_VM86:
+ case X86EMUL_MODE_PROT16:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT64:
+ default:
+		/* Protected mode interrupts are not yet implemented */
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
+{
+ int rc = X86EMUL_CONTINUE;
+ unsigned long temp_eip = 0;
+ unsigned long temp_eflags = 0;
+ unsigned long cs = 0;
+ unsigned long mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_TF |
+ X86_EFLAGS_IF | X86_EFLAGS_DF | X86_EFLAGS_OF |
+ X86_EFLAGS_IOPL | X86_EFLAGS_NT | X86_EFLAGS_RF |
+ X86_EFLAGS_AC | X86_EFLAGS_ID |
+ X86_EFLAGS_FIXED;
+ unsigned long vm86_mask = X86_EFLAGS_VM | X86_EFLAGS_VIF |
+ X86_EFLAGS_VIP;
+
+ /* TODO: Add stack limit check */
+
+ rc = emulate_pop(ctxt, &temp_eip, ctxt->op_bytes);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (temp_eip & ~0xffff)
+ return emulate_gp(ctxt, 0);
+
+ rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = emulate_pop(ctxt, &temp_eflags, ctxt->op_bytes);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->_eip = temp_eip;
+
+ if (ctxt->op_bytes == 4)
+ ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
+ else if (ctxt->op_bytes == 2) {
+ ctxt->eflags &= ~0xffff;
+ ctxt->eflags |= temp_eflags;
+ }
+
+ ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
+ ctxt->eflags |= X86_EFLAGS_FIXED;
+ ctxt->ops->set_nmi_mask(ctxt, false);
+
+ return rc;
+}
+
+static int em_iret(struct x86_emulate_ctxt *ctxt)
+{
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_REAL:
+ return emulate_iret_real(ctxt);
+ case X86EMUL_MODE_VM86:
+ case X86EMUL_MODE_PROT16:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT64:
+ default:
+		/* IRET from protected mode is not yet implemented */
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned short sel;
+ struct desc_struct new_desc;
+ u8 cpl = ctxt->ops->cpl(ctxt);
+
+ memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
+
+ rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_CALL_JMP,
+ &new_desc);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = assign_eip_far(ctxt, ctxt->src.val);
+ /* Error handling is not implemented. */
+ if (rc != X86EMUL_CONTINUE)
+ return X86EMUL_UNHANDLEABLE;
+
+ return rc;
+}
+
+static int em_jmp_abs(struct x86_emulate_ctxt *ctxt)
+{
+ return assign_eip_near(ctxt, ctxt->src.val);
+}
+
+static int em_call_near_abs(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ long int old_eip;
+
+ old_eip = ctxt->_eip;
+ rc = assign_eip_near(ctxt, ctxt->src.val);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ ctxt->src.val = old_eip;
+ rc = em_push(ctxt);
+ return rc;
+}
+
+static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
+{
+ u64 old = ctxt->dst.orig_val64;
+
+ if (ctxt->dst.bytes == 16)
+ return X86EMUL_UNHANDLEABLE;
+
+ if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
+ ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
+ *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32);
+ ctxt->eflags &= ~X86_EFLAGS_ZF;
+ } else {
+ ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) |
+ (u32) reg_read(ctxt, VCPU_REGS_RBX);
+
+ ctxt->eflags |= X86_EFLAGS_ZF;
+ }
+ return X86EMUL_CONTINUE;
+}
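+
+/*
+ * CMPXCHG8B semantics, as implemented above: compare EDX:EAX with the
+ * 64-bit destination; on a match, set ZF and store ECX:EBX, otherwise
+ * clear ZF and copy the old value into EDX:EAX.  The 16-byte CMPXCHG16B
+ * variant is not handled here.
+ */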
+
+static int em_ret(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned long eip;
+
+ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return assign_eip_near(ctxt, eip);
+}
+
+static int em_ret_far(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned long eip, cs;
+ int cpl = ctxt->ops->cpl(ctxt);
+ struct desc_struct new_desc;
+
+ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_RET,
+ &new_desc);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = assign_eip_far(ctxt, eip);
+ /* Error handling is not implemented. */
+ if (rc != X86EMUL_CONTINUE)
+ return X86EMUL_UNHANDLEABLE;
+
+ return rc;
+}
+
+static int em_ret_far_imm(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+
+ rc = em_ret_far(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rsp_increment(ctxt, ctxt->src.val);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
+{
+ /* Save real source value, then compare EAX against destination. */
+ ctxt->dst.orig_val = ctxt->dst.val;
+ ctxt->dst.val = reg_read(ctxt, VCPU_REGS_RAX);
+ ctxt->src.orig_val = ctxt->src.val;
+ ctxt->src.val = ctxt->dst.orig_val;
+ fastop(ctxt, em_cmp);
+
+ if (ctxt->eflags & X86_EFLAGS_ZF) {
+ /* Success: write back to memory; no update of EAX */
+ ctxt->src.type = OP_NONE;
+ ctxt->dst.val = ctxt->src.orig_val;
+ } else {
+ /* Failure: write the value we saw to EAX. */
+ ctxt->src.type = OP_REG;
+ ctxt->src.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ ctxt->src.val = ctxt->dst.orig_val;
+ /* Create write-cycle to dest by writing the same value */
+ ctxt->dst.val = ctxt->dst.orig_val;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lseg(struct x86_emulate_ctxt *ctxt)
+{
+ int seg = ctxt->src2.val;
+ unsigned short sel;
+ int rc;
+
+ memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
+
+ rc = load_segment_descriptor(ctxt, sel, seg);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->dst.val = ctxt->src.val;
+ return rc;
+}
+
+static int em_rsm(struct x86_emulate_ctxt *ctxt)
+{
+ if (!ctxt->ops->is_smm(ctxt))
+ return emulate_ud(ctxt);
+
+ if (ctxt->ops->leave_smm(ctxt))
+ ctxt->ops->triple_fault(ctxt);
+
+ return emulator_recalc_and_set_mode(ctxt);
+}
+
+static void
+setup_syscalls_segments(struct desc_struct *cs, struct desc_struct *ss)
+{
+ cs->l = 0; /* will be adjusted later */
+ set_desc_base(cs, 0); /* flat segment */
+ cs->g = 1; /* 4kb granularity */
+ set_desc_limit(cs, 0xfffff); /* 4GB limit */
+ cs->type = 0x0b; /* Read, Execute, Accessed */
+ cs->s = 1;
+ cs->dpl = 0; /* will be adjusted later */
+ cs->p = 1;
+ cs->d = 1;
+ cs->avl = 0;
+
+ set_desc_base(ss, 0); /* flat segment */
+ set_desc_limit(ss, 0xfffff); /* 4GB limit */
+ ss->g = 1; /* 4kb granularity */
+ ss->s = 1;
+ ss->type = 0x03; /* Read/Write, Accessed */
+ ss->d = 1; /* 32bit stack segment */
+ ss->dpl = 0;
+ ss->p = 1;
+ ss->l = 0;
+ ss->avl = 0;
+}
+
+static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
+{
+ u32 eax, ebx, ecx, edx;
+
+ eax = ecx = 0;
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true);
+ return is_guest_vendor_intel(ebx, ecx, edx);
+}
+
+static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ u32 eax, ebx, ecx, edx;
+
+ /*
+	 * SYSCALL should always be enabled in long mode, so the behaviour
+	 * only becomes vendor specific (via CPUID) when other modes are
+	 * active...
+ */
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return true;
+
+ eax = 0x00000000;
+ ecx = 0x00000000;
+ ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true);
+ /*
+	 * Remark: Intel CPUs only support SYSCALL in 64-bit long mode, so a
+	 * 32-bit compat application in a 64-bit guest will #UD.  Intel's
+	 * behaviour could be emulated to match AMD's response, but AMD CPUs
+	 * can't be made to behave like Intel's, so follow the guest vendor.
+ */
+ if (is_guest_vendor_intel(ebx, ecx, edx))
+ return false;
+
+ if (is_guest_vendor_amd(ebx, ecx, edx) ||
+ is_guest_vendor_hygon(ebx, ecx, edx))
+ return true;
+
+ /*
+	 * Default (neither Intel, AMD nor Hygon): apply Intel's stricter
+	 * rules...
+ */
+ return false;
+}
+
+static int em_syscall(struct x86_emulate_ctxt *ctxt)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct cs, ss;
+ u64 msr_data;
+ u16 cs_sel, ss_sel;
+ u64 efer = 0;
+
+ /* syscall is not available in real mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL ||
+ ctxt->mode == X86EMUL_MODE_VM86)
+ return emulate_ud(ctxt);
+
+ if (!(em_syscall_is_enabled(ctxt)))
+ return emulate_ud(ctxt);
+
+ ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (!(efer & EFER_SCE))
+ return emulate_ud(ctxt);
+
+ setup_syscalls_segments(&cs, &ss);
+ ops->get_msr(ctxt, MSR_STAR, &msr_data);
+ msr_data >>= 32;
+ cs_sel = (u16)(msr_data & 0xfffc);
+ ss_sel = (u16)(msr_data + 8);
+
+ if (efer & EFER_LMA) {
+ cs.d = 0;
+ cs.l = 1;
+ }
+ ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
+ ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
+
+ *reg_write(ctxt, VCPU_REGS_RCX) = ctxt->_eip;
+ if (efer & EFER_LMA) {
+#ifdef CONFIG_X86_64
+ *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags;
+
+ ops->get_msr(ctxt,
+ ctxt->mode == X86EMUL_MODE_PROT64 ?
+ MSR_LSTAR : MSR_CSTAR, &msr_data);
+ ctxt->_eip = msr_data;
+
+ ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
+ ctxt->eflags &= ~msr_data;
+ ctxt->eflags |= X86_EFLAGS_FIXED;
+#endif
+ } else {
+ /* legacy mode */
+ ops->get_msr(ctxt, MSR_STAR, &msr_data);
+ ctxt->_eip = (u32)msr_data;
+
+ ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
+ }
+
+ ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
+ return X86EMUL_CONTINUE;
+}
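+
+/*
+ * MSR_STAR layout relied on above: bits 47:32 hold the SYSCALL CS
+ * selector (SS is that value + 8) and bits 63:48 the SYSRET selectors.
+ * The target RIP comes from MSR_LSTAR (64-bit callers) or MSR_CSTAR
+ * (compat callers), and MSR_SYSCALL_MASK gives the RFLAGS bits to clear
+ * on entry.
+ */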
+
+static int em_sysenter(struct x86_emulate_ctxt *ctxt)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct cs, ss;
+ u64 msr_data;
+ u16 cs_sel, ss_sel;
+ u64 efer = 0;
+
+ ops->get_msr(ctxt, MSR_EFER, &efer);
+ /* inject #GP if in real mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL)
+ return emulate_gp(ctxt, 0);
+
+ /*
+ * Not recognized on AMD in compat mode (but is recognized in legacy
+ * mode).
+ */
+ if ((ctxt->mode != X86EMUL_MODE_PROT64) && (efer & EFER_LMA)
+ && !vendor_intel(ctxt))
+ return emulate_ud(ctxt);
+
+ /* sysenter/sysexit have not been tested in 64bit mode. */
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return X86EMUL_UNHANDLEABLE;
+
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
+ if ((msr_data & 0xfffc) == 0x0)
+ return emulate_gp(ctxt, 0);
+
+ setup_syscalls_segments(&cs, &ss);
+ ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
+ cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK;
+ ss_sel = cs_sel + 8;
+ if (efer & EFER_LMA) {
+ cs.d = 0;
+ cs.l = 1;
+ }
+
+ ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
+ ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
+
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data);
+ ctxt->_eip = (efer & EFER_LMA) ? msr_data : (u32)msr_data;
+
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
+ *reg_write(ctxt, VCPU_REGS_RSP) = (efer & EFER_LMA) ? msr_data :
+ (u32)msr_data;
+ if (efer & EFER_LMA)
+ ctxt->mode = X86EMUL_MODE_PROT64;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_sysexit(struct x86_emulate_ctxt *ctxt)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct cs, ss;
+ u64 msr_data, rcx, rdx;
+ int usermode;
+ u16 cs_sel = 0, ss_sel = 0;
+
+ /* inject #GP if in real mode or Virtual 8086 mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL ||
+ ctxt->mode == X86EMUL_MODE_VM86)
+ return emulate_gp(ctxt, 0);
+
+ setup_syscalls_segments(&cs, &ss);
+
+ if ((ctxt->rex_prefix & 0x8) != 0x0)
+ usermode = X86EMUL_MODE_PROT64;
+ else
+ usermode = X86EMUL_MODE_PROT32;
+
+ rcx = reg_read(ctxt, VCPU_REGS_RCX);
+ rdx = reg_read(ctxt, VCPU_REGS_RDX);
+
+ cs.dpl = 3;
+ ss.dpl = 3;
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
+ switch (usermode) {
+ case X86EMUL_MODE_PROT32:
+ cs_sel = (u16)(msr_data + 16);
+ if ((msr_data & 0xfffc) == 0x0)
+ return emulate_gp(ctxt, 0);
+ ss_sel = (u16)(msr_data + 24);
+ rcx = (u32)rcx;
+ rdx = (u32)rdx;
+ break;
+ case X86EMUL_MODE_PROT64:
+ cs_sel = (u16)(msr_data + 32);
+ if (msr_data == 0x0)
+ return emulate_gp(ctxt, 0);
+ ss_sel = cs_sel + 8;
+ cs.d = 0;
+ cs.l = 1;
+ if (emul_is_noncanonical_address(rcx, ctxt) ||
+ emul_is_noncanonical_address(rdx, ctxt))
+ return emulate_gp(ctxt, 0);
+ break;
+ }
+ cs_sel |= SEGMENT_RPL_MASK;
+ ss_sel |= SEGMENT_RPL_MASK;
+
+ ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
+ ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
+
+ ctxt->_eip = rdx;
+ ctxt->mode = usermode;
+ *reg_write(ctxt, VCPU_REGS_RSP) = rcx;
+
+ return X86EMUL_CONTINUE;
+}
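+
+/*
+ * SYSEXIT selector math used above, per the SDM: relative to SYSENTER_CS,
+ * the 32-bit return CS is +16 with SS at +24, and the 64-bit return CS is
+ * +32 with SS at CS + 8; both selectors have RPL forced to 3.
+ */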
+
+static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
+{
+ int iopl;
+ if (ctxt->mode == X86EMUL_MODE_REAL)
+ return false;
+ if (ctxt->mode == X86EMUL_MODE_VM86)
+ return true;
+ iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
+ return ctxt->ops->cpl(ctxt) > iopl;
+}
+
+#define VMWARE_PORT_VMPORT (0x5658)
+#define VMWARE_PORT_VMRPC (0x5659)
+
+static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
+ u16 port, u16 len)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct tr_seg;
+ u32 base3;
+ int r;
+ u16 tr, io_bitmap_ptr, perm, bit_idx = port & 0x7;
+ unsigned mask = (1 << len) - 1;
+ unsigned long base;
+
+ /*
+	 * VMware allows access to these ports even if denied by the TSS
+	 * I/O permission bitmap.  Mimic that behavior.
+ */
+ if (enable_vmware_backdoor &&
+ ((port == VMWARE_PORT_VMPORT) || (port == VMWARE_PORT_VMRPC)))
+ return true;
+
+ ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR);
+ if (!tr_seg.p)
+ return false;
+ if (desc_limit_scaled(&tr_seg) < 103)
+ return false;
+ base = get_desc_base(&tr_seg);
+#ifdef CONFIG_X86_64
+ base |= ((u64)base3) << 32;
+#endif
+ r = ops->read_std(ctxt, base + 102, &io_bitmap_ptr, 2, NULL, true);
+ if (r != X86EMUL_CONTINUE)
+ return false;
+ if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
+ return false;
+ r = ops->read_std(ctxt, base + io_bitmap_ptr + port/8, &perm, 2, NULL, true);
+ if (r != X86EMUL_CONTINUE)
+ return false;
+ if ((perm >> bit_idx) & mask)
+ return false;
+ return true;
+}
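+
+/*
+ * TSS I/O bitmap layout checked above: the u16 at TSS offset 102 holds
+ * the bitmap's offset within the TSS, and each port owns one bit.  As a
+ * hypothetical example, port 0x3f8 with len=1 tests bit 0 of the byte at
+ * base + io_bitmap_ptr + 0x7f; any set bit in the covered range denies
+ * access.
+ */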
+
+static bool emulator_io_permitted(struct x86_emulate_ctxt *ctxt,
+ u16 port, u16 len)
+{
+ if (ctxt->perm_ok)
+ return true;
+
+ if (emulator_bad_iopl(ctxt))
+ if (!emulator_io_port_access_allowed(ctxt, port, len))
+ return false;
+
+ ctxt->perm_ok = true;
+
+ return true;
+}
+
+static void string_registers_quirk(struct x86_emulate_ctxt *ctxt)
+{
+ /*
+	 * Intel CPUs mask the counter and pointers in a rather strange
+	 * manner when ECX is zero, due to REP-string optimizations.
+ */
+#ifdef CONFIG_X86_64
+ if (ctxt->ad_bytes != 4 || !vendor_intel(ctxt))
+ return;
+
+ *reg_write(ctxt, VCPU_REGS_RCX) = 0;
+
+ switch (ctxt->b) {
+ case 0xa4: /* movsb */
+ case 0xa5: /* movsd/w */
+ *reg_rmw(ctxt, VCPU_REGS_RSI) &= (u32)-1;
+ fallthrough;
+ case 0xaa: /* stosb */
+ case 0xab: /* stosd/w */
+ *reg_rmw(ctxt, VCPU_REGS_RDI) &= (u32)-1;
+ }
+#endif
+}
+
+static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
+ struct tss_segment_16 *tss)
+{
+ tss->ip = ctxt->_eip;
+ tss->flag = ctxt->eflags;
+ tss->ax = reg_read(ctxt, VCPU_REGS_RAX);
+ tss->cx = reg_read(ctxt, VCPU_REGS_RCX);
+ tss->dx = reg_read(ctxt, VCPU_REGS_RDX);
+ tss->bx = reg_read(ctxt, VCPU_REGS_RBX);
+ tss->sp = reg_read(ctxt, VCPU_REGS_RSP);
+ tss->bp = reg_read(ctxt, VCPU_REGS_RBP);
+ tss->si = reg_read(ctxt, VCPU_REGS_RSI);
+ tss->di = reg_read(ctxt, VCPU_REGS_RDI);
+
+ tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
+ tss->ldt = get_segment_selector(ctxt, VCPU_SREG_LDTR);
+}
+
+static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
+ struct tss_segment_16 *tss)
+{
+ int ret;
+ u8 cpl;
+
+ ctxt->_eip = tss->ip;
+ ctxt->eflags = tss->flag | 2;
+ *reg_write(ctxt, VCPU_REGS_RAX) = tss->ax;
+ *reg_write(ctxt, VCPU_REGS_RCX) = tss->cx;
+ *reg_write(ctxt, VCPU_REGS_RDX) = tss->dx;
+ *reg_write(ctxt, VCPU_REGS_RBX) = tss->bx;
+ *reg_write(ctxt, VCPU_REGS_RSP) = tss->sp;
+ *reg_write(ctxt, VCPU_REGS_RBP) = tss->bp;
+ *reg_write(ctxt, VCPU_REGS_RSI) = tss->si;
+ *reg_write(ctxt, VCPU_REGS_RDI) = tss->di;
+
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ set_segment_selector(ctxt, tss->ldt, VCPU_SREG_LDTR);
+ set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
+ set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
+ set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
+ set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
+
+ cpl = tss->cs & 3;
+
+ /*
+	 * Now load the segment descriptors.  If a fault happens at this
+	 * stage, it is handled in the context of the new task.
+ */
+ ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int task_switch_16(struct x86_emulate_ctxt *ctxt, u16 old_tss_sel,
+ ulong old_tss_base, struct desc_struct *new_desc)
+{
+ struct tss_segment_16 tss_seg;
+ int ret;
+ u32 new_tss_base = get_desc_base(new_desc);
+
+ ret = linear_read_system(ctxt, old_tss_base, &tss_seg, sizeof(tss_seg));
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ save_state_to_tss16(ctxt, &tss_seg);
+
+ ret = linear_write_system(ctxt, old_tss_base, &tss_seg, sizeof(tss_seg));
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ ret = linear_read_system(ctxt, new_tss_base, &tss_seg, sizeof(tss_seg));
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ if (old_tss_sel != 0xffff) {
+ tss_seg.prev_task_link = old_tss_sel;
+
+ ret = linear_write_system(ctxt, new_tss_base,
+ &tss_seg.prev_task_link,
+ sizeof(tss_seg.prev_task_link));
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ }
+
+ return load_state_from_tss16(ctxt, &tss_seg);
+}
+
+static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
+ struct tss_segment_32 *tss)
+{
+	/* CR3 and the LDT selector are intentionally not saved */
+ tss->eip = ctxt->_eip;
+ tss->eflags = ctxt->eflags;
+ tss->eax = reg_read(ctxt, VCPU_REGS_RAX);
+ tss->ecx = reg_read(ctxt, VCPU_REGS_RCX);
+ tss->edx = reg_read(ctxt, VCPU_REGS_RDX);
+ tss->ebx = reg_read(ctxt, VCPU_REGS_RBX);
+ tss->esp = reg_read(ctxt, VCPU_REGS_RSP);
+ tss->ebp = reg_read(ctxt, VCPU_REGS_RBP);
+ tss->esi = reg_read(ctxt, VCPU_REGS_RSI);
+ tss->edi = reg_read(ctxt, VCPU_REGS_RDI);
+
+ tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
+ tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS);
+ tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS);
+}
+
+static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
+ struct tss_segment_32 *tss)
+{
+ int ret;
+ u8 cpl;
+
+ if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
+ return emulate_gp(ctxt, 0);
+ ctxt->_eip = tss->eip;
+ ctxt->eflags = tss->eflags | 2;
+
+ /* General purpose registers */
+ *reg_write(ctxt, VCPU_REGS_RAX) = tss->eax;
+ *reg_write(ctxt, VCPU_REGS_RCX) = tss->ecx;
+ *reg_write(ctxt, VCPU_REGS_RDX) = tss->edx;
+ *reg_write(ctxt, VCPU_REGS_RBX) = tss->ebx;
+ *reg_write(ctxt, VCPU_REGS_RSP) = tss->esp;
+ *reg_write(ctxt, VCPU_REGS_RBP) = tss->ebp;
+ *reg_write(ctxt, VCPU_REGS_RSI) = tss->esi;
+ *reg_write(ctxt, VCPU_REGS_RDI) = tss->edi;
+
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors. This is important because CPL checks will
+ * use CS.RPL.
+ */
+ set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
+ set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
+ set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
+ set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
+ set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
+ set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS);
+ set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS);
+
+ /*
+ * If we're switching between Protected Mode and VM86, we need to make
+ * sure to update the mode before loading the segment descriptors so
+ * that the selectors are interpreted correctly.
+ */
+ if (ctxt->eflags & X86_EFLAGS_VM) {
+ ctxt->mode = X86EMUL_MODE_VM86;
+ cpl = 3;
+ } else {
+ ctxt->mode = X86EMUL_MODE_PROT32;
+ cpl = tss->cs & 3;
+ }
+
+ /*
+	 * Now load the segment descriptors.  If a fault happens at this
+	 * stage, it is handled in the context of the new task.
+ */
+ ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR,
+ cpl, X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+
+ return ret;
+}
+
+static int task_switch_32(struct x86_emulate_ctxt *ctxt, u16 old_tss_sel,
+ ulong old_tss_base, struct desc_struct *new_desc)
+{
+ struct tss_segment_32 tss_seg;
+ int ret;
+ u32 new_tss_base = get_desc_base(new_desc);
+ u32 eip_offset = offsetof(struct tss_segment_32, eip);
+ u32 ldt_sel_offset = offsetof(struct tss_segment_32, ldt_selector);
+
+ ret = linear_read_system(ctxt, old_tss_base, &tss_seg, sizeof(tss_seg));
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ save_state_to_tss32(ctxt, &tss_seg);
+
+ /* Only GP registers and segment selectors are saved */
+ ret = linear_write_system(ctxt, old_tss_base + eip_offset, &tss_seg.eip,
+ ldt_sel_offset - eip_offset);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ ret = linear_read_system(ctxt, new_tss_base, &tss_seg, sizeof(tss_seg));
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ if (old_tss_sel != 0xffff) {
+ tss_seg.prev_task_link = old_tss_sel;
+
+ ret = linear_write_system(ctxt, new_tss_base,
+ &tss_seg.prev_task_link,
+ sizeof(tss_seg.prev_task_link));
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ }
+
+ return load_state_from_tss32(ctxt, &tss_seg);
+}
+
+static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
+ u16 tss_selector, int idt_index, int reason,
+ bool has_error_code, u32 error_code)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct curr_tss_desc, next_tss_desc;
+ int ret;
+ u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR);
+ ulong old_tss_base =
+ ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
+ u32 desc_limit;
+ ulong desc_addr, dr7;
+
+ /* FIXME: old_tss_base == ~0 ? */
+
+ ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc, &desc_addr);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc, &desc_addr);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ /* FIXME: check that next_tss_desc is tss */
+
+ /*
+ * Check privileges. The three cases are task switch caused by...
+ *
+ * 1. jmp/call/int to task gate: Check against DPL of the task gate
+ * 2. Exception/IRQ/iret: No check is performed
+ * 3. jmp/call to TSS/task-gate: No check is performed since the
+ * hardware checks it before exiting.
+ */
+ if (reason == TASK_SWITCH_GATE) {
+ if (idt_index != -1) {
+ /* Software interrupts */
+ struct desc_struct task_gate_desc;
+ int dpl;
+
+ ret = read_interrupt_descriptor(ctxt, idt_index,
+ &task_gate_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ dpl = task_gate_desc.dpl;
+ if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
+ return emulate_gp(ctxt, (idt_index << 3) | 0x2);
+ }
+ }
+
+ desc_limit = desc_limit_scaled(&next_tss_desc);
+ if (!next_tss_desc.p ||
+ ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
+ desc_limit < 0x2b)) {
+ return emulate_ts(ctxt, tss_selector & 0xfffc);
+ }
+
+ if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
+ curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
+ write_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
+ }
+
+ if (reason == TASK_SWITCH_IRET)
+ ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
+
+	/*
+	 * Set the back link to the previous task only if the NT bit is set
+	 * in EFLAGS; note that old_tss_sel is not used after this point.
+	 */
+ if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+ old_tss_sel = 0xffff;
+
+ if (next_tss_desc.type & 8)
+ ret = task_switch_32(ctxt, old_tss_sel, old_tss_base, &next_tss_desc);
+ else
+ ret = task_switch_16(ctxt, old_tss_sel,
+ old_tss_base, &next_tss_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE)
+ ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT;
+
+ if (reason != TASK_SWITCH_IRET) {
+ next_tss_desc.type |= (1 << 1); /* set busy flag */
+ write_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
+ }
+
+ ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS);
+ ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR);
+
+ if (has_error_code) {
+ ctxt->op_bytes = ctxt->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
+ ctxt->lock_prefix = 0;
+ ctxt->src.val = (unsigned long) error_code;
+ ret = em_push(ctxt);
+ }
+
+ ops->get_dr(ctxt, 7, &dr7);
+ ops->set_dr(ctxt, 7, dr7 & ~(DR_LOCAL_ENABLE_MASK | DR_LOCAL_SLOWDOWN));
+
+ return ret;
+}
+
+int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
+ u16 tss_selector, int idt_index, int reason,
+ bool has_error_code, u32 error_code)
+{
+ int rc;
+
+ invalidate_registers(ctxt);
+ ctxt->_eip = ctxt->eip;
+ ctxt->dst.type = OP_NONE;
+
+ rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason,
+ has_error_code, error_code);
+
+ if (rc == X86EMUL_CONTINUE) {
+ ctxt->eip = ctxt->_eip;
+ writeback_registers(ctxt);
+ }
+
+ return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
+}
+
+static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
+ struct operand *op)
+{
+ int df = (ctxt->eflags & X86_EFLAGS_DF) ? -op->count : op->count;
+
+ register_address_increment(ctxt, reg, df * op->bytes);
+ op->addr.mem.ea = register_address(ctxt, reg);
+}
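+
+/*
+ * For string instructions the index register is advanced by count *
+ * op->bytes in one go; EFLAGS.DF selects the direction, hence the count
+ * is negated above when DF is set.
+ */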
+
+static int em_das(struct x86_emulate_ctxt *ctxt)
+{
+ u8 al, old_al;
+ bool af, cf, old_cf;
+
+ cf = ctxt->eflags & X86_EFLAGS_CF;
+ al = ctxt->dst.val;
+
+ old_al = al;
+ old_cf = cf;
+ cf = false;
+ af = ctxt->eflags & X86_EFLAGS_AF;
+ if ((al & 0x0f) > 9 || af) {
+ al -= 6;
+ cf = old_cf | (al >= 250);
+ af = true;
+ } else {
+ af = false;
+ }
+ if (old_al > 0x99 || old_cf) {
+ al -= 0x60;
+ cf = true;
+ }
+
+ ctxt->dst.val = al;
+ /* Set PF, ZF, SF */
+ ctxt->src.type = OP_IMM;
+ ctxt->src.val = 0;
+ ctxt->src.bytes = 1;
+ fastop(ctxt, em_or);
+ ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
+ if (cf)
+ ctxt->eflags |= X86_EFLAGS_CF;
+ if (af)
+ ctxt->eflags |= X86_EFLAGS_AF;
+ return X86EMUL_CONTINUE;
+}
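+
+/*
+ * Worked DAS example (hypothetical values): AL=0x9a with CF=AF=0.  The
+ * low nibble 0xa exceeds 9, so AL -= 6 (now 0x94, AF=1); the original AL
+ * exceeds 0x99, so AL -= 0x60, leaving AL=0x34 with CF=1.
+ */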
+
+static int em_aam(struct x86_emulate_ctxt *ctxt)
+{
+ u8 al, ah;
+
+ if (ctxt->src.val == 0)
+ return emulate_de(ctxt);
+
+ al = ctxt->dst.val & 0xff;
+ ah = al / ctxt->src.val;
+ al %= ctxt->src.val;
+
+ ctxt->dst.val = (ctxt->dst.val & 0xffff0000) | al | (ah << 8);
+
+ /* Set PF, ZF, SF */
+ ctxt->src.type = OP_IMM;
+ ctxt->src.val = 0;
+ ctxt->src.bytes = 1;
+ fastop(ctxt, em_or);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_aad(struct x86_emulate_ctxt *ctxt)
+{
+ u8 al = ctxt->dst.val & 0xff;
+ u8 ah = (ctxt->dst.val >> 8) & 0xff;
+
+ al = (al + (ah * ctxt->src.val)) & 0xff;
+
+ ctxt->dst.val = (ctxt->dst.val & 0xffff0000) | al;
+
+ /* Set PF, ZF, SF */
+ ctxt->src.type = OP_IMM;
+ ctxt->src.val = 0;
+ ctxt->src.bytes = 1;
+ fastop(ctxt, em_or);
+
+ return X86EMUL_CONTINUE;
+}
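+
+/*
+ * Hypothetical AAM/AAD round trip with the default base 10: AAM on AL=47
+ * yields AH=4, AL=7; AAD on AX=0x0407 recombines them to AL=0x2f (47)
+ * and clears AH.
+ */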
+
+static int em_call(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ long rel = ctxt->src.val;
+
+ ctxt->src.val = (unsigned long)ctxt->_eip;
+ rc = jmp_rel(ctxt, rel);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return em_push(ctxt);
+}
+
+static int em_call_far(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel, old_cs;
+ ulong old_eip;
+ int rc;
+ struct desc_struct old_desc, new_desc;
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ int cpl = ctxt->ops->cpl(ctxt);
+ enum x86emul_mode prev_mode = ctxt->mode;
+
+ old_eip = ctxt->_eip;
+ ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS);
+
+ memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
+ rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_CALL_JMP, &new_desc);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = assign_eip_far(ctxt, ctxt->src.val);
+ if (rc != X86EMUL_CONTINUE)
+ goto fail;
+
+ ctxt->src.val = old_cs;
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto fail;
+
+ ctxt->src.val = old_eip;
+ rc = em_push(ctxt);
+	/*
+	 * If we failed, we tainted the memory, but at the very least we
+	 * should restore CS.
+	 */
+ if (rc != X86EMUL_CONTINUE) {
+ pr_warn_once("faulting far call emulation tainted memory\n");
+ goto fail;
+ }
+ return rc;
+fail:
+ ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS);
+ ctxt->mode = prev_mode;
+ return rc;
+
+}
+
+static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned long eip;
+
+ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = assign_eip_near(ctxt, eip);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rsp_increment(ctxt, ctxt->src.val);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_xchg(struct x86_emulate_ctxt *ctxt)
+{
+ /* Write back the register source. */
+ ctxt->src.val = ctxt->dst.val;
+ write_register_operand(&ctxt->src);
+
+ /* Write back the memory destination with implicit LOCK prefix. */
+ ctxt->dst.val = ctxt->src.orig_val;
+ ctxt->lock_prefix = 1;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.val = ctxt->src2.val;
+ return fastop(ctxt, em_imul);
+}
+
+static int em_cwd(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.type = OP_REG;
+ ctxt->dst.bytes = ctxt->src.bytes;
+ ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
+ ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_rdpid(struct x86_emulate_ctxt *ctxt)
+{
+ u64 tsc_aux = 0;
+
+ if (!ctxt->ops->guest_has_rdpid(ctxt))
+ return emulate_ud(ctxt);
+
+ ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux);
+ ctxt->dst.val = tsc_aux;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 tsc = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc);
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32)tsc;
+ *reg_write(ctxt, VCPU_REGS_RDX) = tsc >> 32;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 pmc;
+
+ if (ctxt->ops->read_pmc(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &pmc))
+ return emulate_gp(ctxt, 0);
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32)pmc;
+ *reg_write(ctxt, VCPU_REGS_RDX) = pmc >> 32;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_mov(struct x86_emulate_ctxt *ctxt)
+{
+ memcpy(ctxt->dst.valptr, ctxt->src.valptr, sizeof(ctxt->src.valptr));
+ return X86EMUL_CONTINUE;
+}
+
+static int em_movbe(struct x86_emulate_ctxt *ctxt)
+{
+ u16 tmp;
+
+ if (!ctxt->ops->guest_has_movbe(ctxt))
+ return emulate_ud(ctxt);
+
+ switch (ctxt->op_bytes) {
+ case 2:
+ /*
+ * From MOVBE definition: "...When the operand size is 16 bits,
+ * the upper word of the destination register remains unchanged
+ * ..."
+ *
+		 * Casting either ->valptr or ->val to u16 would break strict
+		 * aliasing rules, so we do the operation almost by hand.
+ */
+ tmp = (u16)ctxt->src.val;
+ ctxt->dst.val &= ~0xffffUL;
+ ctxt->dst.val |= (unsigned long)swab16(tmp);
+ break;
+ case 4:
+ ctxt->dst.val = swab32((u32)ctxt->src.val);
+ break;
+ case 8:
+ ctxt->dst.val = swab64(ctxt->src.val);
+ break;
+ default:
+ BUG();
+ }
+ return X86EMUL_CONTINUE;
+}
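+
+/*
+ * MOVBE 16-bit example for the case above (illustrative): src=0x1234
+ * stores 0x3412 in the destination's low word while its upper bytes are
+ * preserved, matching the SDM note quoted in the code.
+ */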
+
+static int em_cr_write(struct x86_emulate_ctxt *ctxt)
+{
+ int cr_num = ctxt->modrm_reg;
+ int r;
+
+ if (ctxt->ops->set_cr(ctxt, cr_num, ctxt->src.val))
+ return emulate_gp(ctxt, 0);
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+
+ if (cr_num == 0) {
+ /*
+ * CR0 write might have updated CR0.PE and/or CR0.PG
+ * which can affect the cpu's execution mode.
+ */
+ r = emulator_recalc_and_set_mode(ctxt);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_dr_write(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long val;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ val = ctxt->src.val & ~0ULL;
+ else
+ val = ctxt->src.val & ~0U;
+
+ /* #UD condition is already handled. */
+ if (ctxt->ops->set_dr(ctxt, ctxt->modrm_reg, val) < 0)
+ return emulate_gp(ctxt, 0);
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
+{
+ u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX);
+ u64 msr_data;
+ int r;
+
+ msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX)
+ | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
+ r = ctxt->ops->set_msr_with_filter(ctxt, msr_index, msr_data);
+
+ if (r == X86EMUL_PROPAGATE_FAULT)
+ return emulate_gp(ctxt, 0);
+
+ return r;
+}
+
+static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
+{
+ u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX);
+ u64 msr_data;
+ int r;
+
+ r = ctxt->ops->get_msr_with_filter(ctxt, msr_index, &msr_data);
+
+ if (r == X86EMUL_PROPAGATE_FAULT)
+ return emulate_gp(ctxt, 0);
+
+ if (r == X86EMUL_CONTINUE) {
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data;
+ *reg_write(ctxt, VCPU_REGS_RDX) = msr_data >> 32;
+ }
+ return r;
+}
+
+static int em_store_sreg(struct x86_emulate_ctxt *ctxt, int segment)
+{
+ if (segment > VCPU_SREG_GS &&
+ (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+ ctxt->ops->cpl(ctxt) > 0)
+ return emulate_gp(ctxt, 0);
+
+ ctxt->dst.val = get_segment_selector(ctxt, segment);
+ if (ctxt->dst.bytes == 4 && ctxt->dst.type == OP_MEM)
+ ctxt->dst.bytes = 2;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->modrm_reg > VCPU_SREG_GS)
+ return emulate_ud(ctxt);
+
+ return em_store_sreg(ctxt, ctxt->modrm_reg);
+}
+
+static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel = ctxt->src.val;
+
+ if (ctxt->modrm_reg == VCPU_SREG_CS || ctxt->modrm_reg > VCPU_SREG_GS)
+ return emulate_ud(ctxt);
+
+ if (ctxt->modrm_reg == VCPU_SREG_SS)
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
+}
+
+static int em_sldt(struct x86_emulate_ctxt *ctxt)
+{
+ return em_store_sreg(ctxt, VCPU_SREG_LDTR);
+}
+
+static int em_lldt(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel = ctxt->src.val;
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR);
+}
+
+static int em_str(struct x86_emulate_ctxt *ctxt)
+{
+ return em_store_sreg(ctxt, VCPU_SREG_TR);
+}
+
+static int em_ltr(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel = ctxt->src.val;
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return load_segment_descriptor(ctxt, sel, VCPU_SREG_TR);
+}
+
+static int em_invlpg(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, ctxt->src.addr.mem, 1, false, &linear);
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->ops->invlpg(ctxt, linear);
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_clts(struct x86_emulate_ctxt *ctxt)
+{
+ ulong cr0;
+
+ cr0 = ctxt->ops->get_cr(ctxt, 0);
+ cr0 &= ~X86_CR0_TS;
+ ctxt->ops->set_cr(ctxt, 0, cr0);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_hypercall(struct x86_emulate_ctxt *ctxt)
+{
+ int rc = ctxt->ops->fix_hypercall(ctxt);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ /* Let the processor re-execute the fixed hypercall */
+ ctxt->_eip = ctxt->eip;
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt,
+ void (*get)(struct x86_emulate_ctxt *ctxt,
+ struct desc_ptr *ptr))
+{
+ struct desc_ptr desc_ptr;
+
+ if ((ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+ ctxt->ops->cpl(ctxt) > 0)
+ return emulate_gp(ctxt, 0);
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
+ get(ctxt, &desc_ptr);
+ if (ctxt->op_bytes == 2) {
+ ctxt->op_bytes = 4;
+ desc_ptr.address &= 0x00ffffff;
+ }
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return segmented_write_std(ctxt, ctxt->dst.addr.mem,
+ &desc_ptr, 2 + ctxt->op_bytes);
+}
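+
+/*
+ * SGDT/SIDT with a 16-bit operand size store only 24 bits of the base,
+ * hence the 0x00ffffff mask above; the top byte of the written base is
+ * zeroed here.
+ */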
+
+static int em_sgdt(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_store_desc_ptr(ctxt, ctxt->ops->get_gdt);
+}
+
+static int em_sidt(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt);
+}
+
+static int em_lgdt_lidt(struct x86_emulate_ctxt *ctxt, bool lgdt)
+{
+ struct desc_ptr desc_ptr;
+ int rc;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
+ rc = read_descriptor(ctxt, ctxt->src.addr.mem,
+ &desc_ptr.size, &desc_ptr.address,
+ ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ if (ctxt->mode == X86EMUL_MODE_PROT64 &&
+ emul_is_noncanonical_address(desc_ptr.address, ctxt))
+ return emulate_gp(ctxt, 0);
+ if (lgdt)
+ ctxt->ops->set_gdt(ctxt, &desc_ptr);
+ else
+ ctxt->ops->set_idt(ctxt, &desc_ptr);
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lgdt(struct x86_emulate_ctxt *ctxt)
+{
+ return em_lgdt_lidt(ctxt, true);
+}
+
+static int em_lidt(struct x86_emulate_ctxt *ctxt)
+{
+ return em_lgdt_lidt(ctxt, false);
+}
+
+static int em_smsw(struct x86_emulate_ctxt *ctxt)
+{
+ if ((ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+ ctxt->ops->cpl(ctxt) > 0)
+ return emulate_gp(ctxt, 0);
+
+ if (ctxt->dst.type == OP_MEM)
+ ctxt->dst.bytes = 2;
+ ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lmsw(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul)
+ | (ctxt->src.val & 0x0f));
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_loop(struct x86_emulate_ctxt *ctxt)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ register_address_increment(ctxt, VCPU_REGS_RCX, -1);
+ if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) &&
+ (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags)))
+ rc = jmp_rel(ctxt, ctxt->src.val);
+
+ return rc;
+}
+
+static int em_jcxz(struct x86_emulate_ctxt *ctxt)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0)
+ rc = jmp_rel(ctxt, ctxt->src.val);
+
+ return rc;
+}
+
+static int em_in(struct x86_emulate_ctxt *ctxt)
+{
+ if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val,
+ &ctxt->dst.val))
+ return X86EMUL_IO_NEEDED;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_out(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val,
+ &ctxt->src.val, 1);
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_cli(struct x86_emulate_ctxt *ctxt)
+{
+ if (emulator_bad_iopl(ctxt))
+ return emulate_gp(ctxt, 0);
+
+ ctxt->eflags &= ~X86_EFLAGS_IF;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_sti(struct x86_emulate_ctxt *ctxt)
+{
+ if (emulator_bad_iopl(ctxt))
+ return emulate_gp(ctxt, 0);
+
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
+ ctxt->eflags |= X86_EFLAGS_IF;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_cpuid(struct x86_emulate_ctxt *ctxt)
+{
+ u32 eax, ebx, ecx, edx;
+ u64 msr = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_MISC_FEATURES_ENABLES, &msr);
+ if (msr & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
+ ctxt->ops->cpl(ctxt)) {
+ return emulate_gp(ctxt, 0);
+ }
+
+ eax = reg_read(ctxt, VCPU_REGS_RAX);
+ ecx = reg_read(ctxt, VCPU_REGS_RCX);
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
+ *reg_write(ctxt, VCPU_REGS_RAX) = eax;
+ *reg_write(ctxt, VCPU_REGS_RBX) = ebx;
+ *reg_write(ctxt, VCPU_REGS_RCX) = ecx;
+ *reg_write(ctxt, VCPU_REGS_RDX) = edx;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_sahf(struct x86_emulate_ctxt *ctxt)
+{
+ u32 flags;
+
+ flags = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
+ X86_EFLAGS_SF;
+ flags &= *reg_rmw(ctxt, VCPU_REGS_RAX) >> 8;
+
+ ctxt->eflags &= ~0xffUL;
+ ctxt->eflags |= flags | X86_EFLAGS_FIXED;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lahf(struct x86_emulate_ctxt *ctxt)
+{
+ *reg_rmw(ctxt, VCPU_REGS_RAX) &= ~0xff00UL;
+ *reg_rmw(ctxt, VCPU_REGS_RAX) |= (ctxt->eflags & 0xff) << 8;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_bswap(struct x86_emulate_ctxt *ctxt)
+{
+ switch (ctxt->op_bytes) {
+#ifdef CONFIG_X86_64
+ case 8:
+ asm("bswap %0" : "+r"(ctxt->dst.val));
+ break;
+#endif
+ default:
+ asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val));
+ break;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static int em_clflush(struct x86_emulate_ctxt *ctxt)
+{
+ /* emulating clflush regardless of cpuid */
+ return X86EMUL_CONTINUE;
+}
+
+static int em_clflushopt(struct x86_emulate_ctxt *ctxt)
+{
+ /* emulating clflushopt regardless of cpuid */
+ return X86EMUL_CONTINUE;
+}
+
+static int em_movsxd(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.val = (s32) ctxt->src.val;
+ return X86EMUL_CONTINUE;
+}
+
+static int check_fxsr(struct x86_emulate_ctxt *ctxt)
+{
+ if (!ctxt->ops->guest_has_fxsr(ctxt))
+ return emulate_ud(ctxt);
+
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ /*
+ * Don't emulate a case that should never be hit, instead of working
+ * around a lack of fxsave64/fxrstor64 on old compilers.
+ */
+ if (ctxt->mode >= X86EMUL_MODE_PROT64)
+ return X86EMUL_UNHANDLEABLE;
+
+ return X86EMUL_CONTINUE;
+}
+
+/*
+ * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but does save
+ * and restore MXCSR.
+ */
+static size_t __fxstate_size(int nregs)
+{
+ return offsetof(struct fxregs_state, xmm_space[0]) + nregs * 16;
+}
+
+static inline size_t fxstate_size(struct x86_emulate_ctxt *ctxt)
+{
+ bool cr4_osfxsr;
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return __fxstate_size(16);
+
+ cr4_osfxsr = ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR;
+ return __fxstate_size(cr4_osfxsr ? 8 : 0);
+}
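+
+/*
+ * Resulting sizes: 160 bytes cover the legacy header and x87 state, 288
+ * bytes add XMM0-7 (legacy mode with CR4.OSFXSR set), and 416 bytes add
+ * XMM8-15 for 64-bit guests; the rest of the 512-byte FXSAVE area is
+ * ignored.
+ */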
+
+/*
+ * FXSAVE and FXRSTOR have 4 different formats depending on execution mode,
+ * 1) 16 bit mode
+ * 2) 32 bit mode
+ *     - like (1), but FIP and FDP are only 16 bit.  At least Intel CPUs
+ * preserve whole 32 bit values, though, so (1) and (2) are the same wrt.
+ * save and restore
+ * 3) 64-bit mode without REX.W prefix
+ *    - like (2), but XMM 8-15 are being saved and restored
+ * 4) 64-bit mode with REX.W prefix
+ *    - like (3), but FIP and FDP are 64 bit
+ *
+ * Emulation uses (3) for (1) and (2) and preserves XMM 8-15 to reach the
+ * desired result. (4) is not emulated.
+ *
+ * Note: Guest and host CPUID.(EAX=07H,ECX=0H):EBX[bit 13] (deprecate FPU CS
+ * and FPU DS) should match.
+ */
+static int em_fxsave(struct x86_emulate_ctxt *ctxt)
+{
+ struct fxregs_state fx_state;
+ int rc;
+
+ rc = check_fxsr(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ kvm_fpu_get();
+
+ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state));
+
+ kvm_fpu_put();
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state,
+ fxstate_size(ctxt));
+}
+
+/*
+ * FXRSTOR might restore XMM registers not provided by the guest. Fill
+ * in the host registers (via FXSAVE) instead, so they won't be modified.
+ * (preemption has to stay disabled until FXRSTOR).
+ *
+ * Use noinline to keep the stack for other functions called by callers small.
+ */
+static noinline int fxregs_fixup(struct fxregs_state *fx_state,
+ const size_t used_size)
+{
+ struct fxregs_state fx_tmp;
+ int rc;
+
+ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_tmp));
+ memcpy((void *)fx_state + used_size, (void *)&fx_tmp + used_size,
+ __fxstate_size(16) - used_size);
+
+ return rc;
+}
+
+static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
+{
+ struct fxregs_state fx_state;
+ int rc;
+ size_t size;
+
+ rc = check_fxsr(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ size = fxstate_size(ctxt);
+ rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ kvm_fpu_get();
+
+ if (size < __fxstate_size(16)) {
+ rc = fxregs_fixup(&fx_state, size);
+ if (rc != X86EMUL_CONTINUE)
+ goto out;
+ }
+
+ if (fx_state.mxcsr >> 16) {
+ rc = emulate_gp(ctxt, 0);
+ goto out;
+ }
+
+ if (rc == X86EMUL_CONTINUE)
+ rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));
+
+out:
+ kvm_fpu_put();
+
+ return rc;
+}
+
+static int em_xsetbv(struct x86_emulate_ctxt *ctxt)
+{
+ u32 eax, ecx, edx;
+
+ if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSXSAVE))
+ return emulate_ud(ctxt);
+
+ eax = reg_read(ctxt, VCPU_REGS_RAX);
+ edx = reg_read(ctxt, VCPU_REGS_RDX);
+ ecx = reg_read(ctxt, VCPU_REGS_RCX);
+
+ if (ctxt->ops->set_xcr(ctxt, ecx, ((u64)edx << 32) | eax))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+static bool valid_cr(int nr)
+{
+ switch (nr) {
+ case 0:
+ case 2 ... 4:
+ case 8:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int check_cr_access(struct x86_emulate_ctxt *ctxt)
+{
+ if (!valid_cr(ctxt->modrm_reg))
+ return emulate_ud(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long dr7;
+
+ ctxt->ops->get_dr(ctxt, 7, &dr7);
+
+ return dr7 & DR7_GD;
+}
+
+static int check_dr_read(struct x86_emulate_ctxt *ctxt)
+{
+ int dr = ctxt->modrm_reg;
+ u64 cr4;
+
+ if (dr > 7)
+ return emulate_ud(ctxt);
+
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5))
+ return emulate_ud(ctxt);
+
+ if (check_dr7_gd(ctxt)) {
+ ulong dr6;
+
+ ctxt->ops->get_dr(ctxt, 6, &dr6);
+ dr6 &= ~DR_TRAP_BITS;
+ dr6 |= DR6_BD | DR6_ACTIVE_LOW;
+ ctxt->ops->set_dr(ctxt, 6, dr6);
+ return emulate_db(ctxt);
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_dr_write(struct x86_emulate_ctxt *ctxt)
+{
+ u64 new_val = ctxt->src.val64;
+ int dr = ctxt->modrm_reg;
+
+ if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL))
+ return emulate_gp(ctxt, 0);
+
+ return check_dr_read(ctxt);
+}
+
+static int check_svme(struct x86_emulate_ctxt *ctxt)
+{
+ u64 efer = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+
+ if (!(efer & EFER_SVME))
+ return emulate_ud(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
+{
+ u64 rax = reg_read(ctxt, VCPU_REGS_RAX);
+
+ /* Valid physical address? */
+ if (rax & 0xffff000000000000ULL)
+ return emulate_gp(ctxt, 0);
+
+ return check_svme(ctxt);
+}
+
+static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
+
+ if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
+ u64 rcx = reg_read(ctxt, VCPU_REGS_RCX);
+
+ /*
+	 * VMware allows access to these pseudo-PMCs even when read via RDPMC
+	 * in ring 3 with CR4.PCE=0.
+ */
+ if (enable_vmware_backdoor && is_vmware_backdoor_pmc(rcx))
+ return X86EMUL_CONTINUE;
+
+ /*
+	 * If CR4.PCE is clear, the SDM requires CPL=0 or CR0.PE=0.  The CR0.PE
+ * check however is unnecessary because CPL is always 0 outside
+ * protected mode.
+ */
+ if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
+ ctxt->ops->check_pmc(ctxt, rcx))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_perm_in(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.bytes = min(ctxt->dst.bytes, 4u);
+ if (!emulator_io_permitted(ctxt, ctxt->src.val, ctxt->dst.bytes))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_perm_out(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->src.bytes = min(ctxt->src.bytes, 4u);
+ if (!emulator_io_permitted(ctxt, ctxt->dst.val, ctxt->src.bytes))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+#define D(_y) { .flags = (_y) }
+#define DI(_y, _i) { .flags = (_y)|Intercept, .intercept = x86_intercept_##_i }
+#define DIP(_y, _i, _p) { .flags = (_y)|Intercept|CheckPerm, \
+ .intercept = x86_intercept_##_i, .check_perm = (_p) }
+#define N D(NotImpl)
+#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
+#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
+#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
+#define ID(_f, _i) { .flags = ((_f) | InstrDual | ModRM), .u.idual = (_i) }
+#define MD(_f, _m) { .flags = ((_f) | ModeDual), .u.mdual = (_m) }
+#define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) }
+#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
+#define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) }
+#define II(_f, _e, _i) \
+ { .flags = (_f)|Intercept, .u.execute = (_e), .intercept = x86_intercept_##_i }
+#define IIP(_f, _e, _i, _p) \
+ { .flags = (_f)|Intercept|CheckPerm, .u.execute = (_e), \
+ .intercept = x86_intercept_##_i, .check_perm = (_p) }
+#define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) }
+
+#define D2bv(_f) D((_f) | ByteOp), D(_f)
+#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
+#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
+#define F2bv(_f, _e) F((_f) | ByteOp, _e), F(_f, _e)
+#define I2bvIP(_f, _e, _i, _p) \
+ IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p)
+
+#define F6ALU(_f, _e) F2bv((_f) | DstMem | SrcReg | ModRM, _e), \
+ F2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
+ F2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
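+
+/*
+ * For readers of the tables below: F6ALU(Lock, em_add) expands to the six
+ * entries backing opcodes 0x00 - 0x05, i.e. ADD r/m8,r8; ADD r/m,r;
+ * ADD r8,r/m8; ADD r,r/m; ADD AL,imm8; ADD rAX,imm. Lock is kept only for
+ * the memory-destination forms, the only ones that can take a LOCK prefix.
+ */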
+
+static const struct opcode group7_rm0[] = {
+ N,
+ I(SrcNone | Priv | EmulateOnUD, em_hypercall),
+ N, N, N, N, N, N,
+};
+
+static const struct opcode group7_rm1[] = {
+ DI(SrcNone | Priv, monitor),
+ DI(SrcNone | Priv, mwait),
+ N, N, N, N, N, N,
+};
+
+static const struct opcode group7_rm2[] = {
+ N,
+ II(ImplicitOps | Priv, em_xsetbv, xsetbv),
+ N, N, N, N, N, N,
+};
+
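+/* group7_rm3 covers the SVM encodings 0F 01 D8-DF, VMRUN through INVLPGA. */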
+static const struct opcode group7_rm3[] = {
+ DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa),
+ II(SrcNone | Prot | EmulateOnUD, em_hypercall, vmmcall),
+ DIP(SrcNone | Prot | Priv, vmload, check_svme_pa),
+ DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa),
+ DIP(SrcNone | Prot | Priv, stgi, check_svme),
+ DIP(SrcNone | Prot | Priv, clgi, check_svme),
+ DIP(SrcNone | Prot | Priv, skinit, check_svme),
+ DIP(SrcNone | Prot | Priv, invlpga, check_svme),
+};
+
+static const struct opcode group7_rm7[] = {
+ N,
+ DIP(SrcNone, rdtscp, check_rdtsc),
+ N, N, N, N, N, N,
+};
+
+static const struct opcode group1[] = {
+ F(Lock, em_add),
+ F(Lock | PageTable, em_or),
+ F(Lock, em_adc),
+ F(Lock, em_sbb),
+ F(Lock | PageTable, em_and),
+ F(Lock, em_sub),
+ F(Lock, em_xor),
+ F(NoWrite, em_cmp),
+};
+
+static const struct opcode group1A[] = {
+ I(DstMem | SrcNone | Mov | Stack | IncSP | TwoMemOp, em_pop), N, N, N, N, N, N, N,
+};
+
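+/*
+ * The duplicate em_shl below is intentional: the undocumented /6 encoding
+ * of the shift group behaves as SHL (/4) on real hardware.
+ */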
+static const struct opcode group2[] = {
+ F(DstMem | ModRM, em_rol),
+ F(DstMem | ModRM, em_ror),
+ F(DstMem | ModRM, em_rcl),
+ F(DstMem | ModRM, em_rcr),
+ F(DstMem | ModRM, em_shl),
+ F(DstMem | ModRM, em_shr),
+ F(DstMem | ModRM, em_shl),
+ F(DstMem | ModRM, em_sar),
+};
+
+static const struct opcode group3[] = {
+ F(DstMem | SrcImm | NoWrite, em_test),
+ F(DstMem | SrcImm | NoWrite, em_test),
+ F(DstMem | SrcNone | Lock, em_not),
+ F(DstMem | SrcNone | Lock, em_neg),
+ F(DstXacc | Src2Mem, em_mul_ex),
+ F(DstXacc | Src2Mem, em_imul_ex),
+ F(DstXacc | Src2Mem, em_div_ex),
+ F(DstXacc | Src2Mem, em_idiv_ex),
+};
+
+static const struct opcode group4[] = {
+ F(ByteOp | DstMem | SrcNone | Lock, em_inc),
+ F(ByteOp | DstMem | SrcNone | Lock, em_dec),
+ N, N, N, N, N, N,
+};
+
+static const struct opcode group5[] = {
+ F(DstMem | SrcNone | Lock, em_inc),
+ F(DstMem | SrcNone | Lock, em_dec),
+ I(SrcMem | NearBranch | IsBranch, em_call_near_abs),
+ I(SrcMemFAddr | ImplicitOps | IsBranch, em_call_far),
+ I(SrcMem | NearBranch | IsBranch, em_jmp_abs),
+ I(SrcMemFAddr | ImplicitOps | IsBranch, em_jmp_far),
+ I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined),
+};
+
+static const struct opcode group6[] = {
+ II(Prot | DstMem, em_sldt, sldt),
+ II(Prot | DstMem, em_str, str),
+ II(Prot | Priv | SrcMem16, em_lldt, lldt),
+ II(Prot | Priv | SrcMem16, em_ltr, ltr),
+ N, N, N, N,
+};
+
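+/*
+ * For a group_dual, the first table handles ModRM.mod != 3 (memory forms),
+ * the second ModRM.mod == 3 (register forms); see the GroupDual case in
+ * x86_decode_insn() below.
+ */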
+static const struct group_dual group7 = { {
+ II(Mov | DstMem, em_sgdt, sgdt),
+ II(Mov | DstMem, em_sidt, sidt),
+ II(SrcMem | Priv, em_lgdt, lgdt),
+ II(SrcMem | Priv, em_lidt, lidt),
+ II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
+ II(SrcMem16 | Mov | Priv, em_lmsw, lmsw),
+ II(SrcMem | ByteOp | Priv | NoAccess, em_invlpg, invlpg),
+}, {
+ EXT(0, group7_rm0),
+ EXT(0, group7_rm1),
+ EXT(0, group7_rm2),
+ EXT(0, group7_rm3),
+ II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
+ II(SrcMem16 | Mov | Priv, em_lmsw, lmsw),
+ EXT(0, group7_rm7),
+} };
+
+static const struct opcode group8[] = {
+ N, N, N, N,
+ F(DstMem | SrcImmByte | NoWrite, em_bt),
+ F(DstMem | SrcImmByte | Lock | PageTable, em_bts),
+ F(DstMem | SrcImmByte | Lock, em_btr),
+ F(DstMem | SrcImmByte | Lock | PageTable, em_btc),
+};
+
+/*
+ * The "memory" destination is actually always a register, since we come
+ * from the register case of group9.
+ */
+static const struct gprefix pfx_0f_c7_7 = {
+ N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdpid),
+};
+
+static const struct group_dual group9 = { {
+ N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
+}, {
+ N, N, N, N, N, N, N,
+ GP(0, &pfx_0f_c7_7),
+} };
+
+static const struct opcode group11[] = {
+ I(DstMem | SrcImm | Mov | PageTable, em_mov),
+ X7(D(Undefined)),
+};
+
+static const struct gprefix pfx_0f_ae_7 = {
+ I(SrcMem | ByteOp, em_clflush), I(SrcMem | ByteOp, em_clflushopt), N, N,
+};
+
+static const struct group_dual group15 = { {
+ I(ModRM | Aligned16, em_fxsave),
+ I(ModRM | Aligned16, em_fxrstor),
+ N, N, N, N, N, GP(0, &pfx_0f_ae_7),
+}, {
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct gprefix pfx_0f_6f_0f_7f = {
+ I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
+};
+
+static const struct instr_dual instr_dual_0f_2b = {
+ I(0, em_mov), N
+};
+
+static const struct gprefix pfx_0f_2b = {
+ ID(0, &instr_dual_0f_2b), ID(0, &instr_dual_0f_2b), N, N,
+};
+
+static const struct gprefix pfx_0f_10_0f_11 = {
+ I(Unaligned, em_mov), I(Unaligned, em_mov), N, N,
+};
+
+static const struct gprefix pfx_0f_28_0f_29 = {
+ I(Aligned, em_mov), I(Aligned, em_mov), N, N,
+};
+
+static const struct gprefix pfx_0f_e7 = {
+ N, I(Sse, em_mov), N, N,
+};
+
+static const struct escape escape_d9 = { {
+ N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstcw),
+}, {
+ /* 0xC0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xC8 - 0xCF */
+ N, N, N, N, N, N, N, N,
+	/* 0xD0 - 0xD7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xD8 - 0xDF */
+ N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xE7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xE8 - 0xEF */
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xF7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xF8 - 0xFF */
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct escape escape_db = { {
+ N, N, N, N, N, N, N, N,
+}, {
+ /* 0xC0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xC8 - 0xCF */
+ N, N, N, N, N, N, N, N,
+ /* 0xD0 - 0xC7 */
+	/* 0xD0 - 0xD7 */
+ /* 0xD8 - 0xDF */
+ N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xE7 */
+ N, N, N, I(ImplicitOps, em_fninit), N, N, N, N,
+ /* 0xE8 - 0xEF */
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xF7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xF8 - 0xFF */
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct escape escape_dd = { {
+ N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstsw),
+}, {
+ /* 0xC0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xC8 - 0xCF */
+ N, N, N, N, N, N, N, N,
+	/* 0xD0 - 0xD7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xD8 - 0xDF */
+ N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xE7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xE8 - 0xEF */
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xF7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xF8 - 0xFF */
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct instr_dual instr_dual_0f_c3 = {
+ I(DstMem | SrcReg | ModRM | No16 | Mov, em_mov), N
+};
+
+static const struct mode_dual mode_dual_63 = {
+ N, I(DstReg | SrcMem32 | ModRM | Mov, em_movsxd)
+};
+
+static const struct instr_dual instr_dual_8d = {
+ D(DstReg | SrcMem | ModRM | NoAccess), N
+};
+
+static const struct opcode opcode_table[256] = {
+ /* 0x00 - 0x07 */
+ F6ALU(Lock, em_add),
+ I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
+ I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg),
+ /* 0x08 - 0x0F */
+ F6ALU(Lock | PageTable, em_or),
+ I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg),
+ N,
+ /* 0x10 - 0x17 */
+ F6ALU(Lock, em_adc),
+ I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg),
+ I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg),
+ /* 0x18 - 0x1F */
+ F6ALU(Lock, em_sbb),
+ I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg),
+ I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg),
+ /* 0x20 - 0x27 */
+ F6ALU(Lock | PageTable, em_and), N, N,
+ /* 0x28 - 0x2F */
+ F6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
+ /* 0x30 - 0x37 */
+ F6ALU(Lock, em_xor), N, N,
+ /* 0x38 - 0x3F */
+ F6ALU(NoWrite, em_cmp), N, N,
+ /* 0x40 - 0x4F */
+ X8(F(DstReg, em_inc)), X8(F(DstReg, em_dec)),
+ /* 0x50 - 0x57 */
+ X8(I(SrcReg | Stack, em_push)),
+ /* 0x58 - 0x5F */
+ X8(I(DstReg | Stack, em_pop)),
+ /* 0x60 - 0x67 */
+ I(ImplicitOps | Stack | No64, em_pusha),
+ I(ImplicitOps | Stack | No64, em_popa),
+ N, MD(ModRM, &mode_dual_63),
+ N, N, N, N,
+ /* 0x68 - 0x6F */
+ I(SrcImm | Mov | Stack, em_push),
+ I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
+ I(SrcImmByte | Mov | Stack, em_push),
+ I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
+ I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */
+ I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
+ /* 0x70 - 0x7F */
+ X16(D(SrcImmByte | NearBranch | IsBranch)),
+ /* 0x80 - 0x87 */
+ G(ByteOp | DstMem | SrcImm, group1),
+ G(DstMem | SrcImm, group1),
+ G(ByteOp | DstMem | SrcImm | No64, group1),
+ G(DstMem | SrcImmByte, group1),
+ F2bv(DstMem | SrcReg | ModRM | NoWrite, em_test),
+ I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
+ /* 0x88 - 0x8F */
+ I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov),
+ I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
+ I(DstMem | SrcNone | ModRM | Mov | PageTable, em_mov_rm_sreg),
+ ID(0, &instr_dual_8d),
+ I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm),
+ G(0, group1A),
+ /* 0x90 - 0x97 */
+ DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)),
+ /* 0x98 - 0x9F */
+ D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
+ I(SrcImmFAddr | No64 | IsBranch, em_call_far), N,
+ II(ImplicitOps | Stack, em_pushf, pushf),
+ II(ImplicitOps | Stack, em_popf, popf),
+ I(ImplicitOps, em_sahf), I(ImplicitOps, em_lahf),
+ /* 0xA0 - 0xA7 */
+ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
+ I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
+ I2bv(SrcSI | DstDI | Mov | String | TwoMemOp, em_mov),
+ F2bv(SrcSI | DstDI | String | NoWrite | TwoMemOp, em_cmp_r),
+ /* 0xA8 - 0xAF */
+ F2bv(DstAcc | SrcImm | NoWrite, em_test),
+ I2bv(SrcAcc | DstDI | Mov | String, em_mov),
+ I2bv(SrcSI | DstAcc | Mov | String, em_mov),
+ F2bv(SrcAcc | DstDI | String | NoWrite, em_cmp_r),
+ /* 0xB0 - 0xB7 */
+ X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
+ /* 0xB8 - 0xBF */
+ X8(I(DstReg | SrcImm64 | Mov, em_mov)),
+ /* 0xC0 - 0xC7 */
+ G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2),
+ I(ImplicitOps | NearBranch | SrcImmU16 | IsBranch, em_ret_near_imm),
+ I(ImplicitOps | NearBranch | IsBranch, em_ret),
+ I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg),
+ I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),
+ G(ByteOp, group11), G(0, group11),
+ /* 0xC8 - 0xCF */
+ I(Stack | SrcImmU16 | Src2ImmByte | IsBranch, em_enter),
+ I(Stack | IsBranch, em_leave),
+ I(ImplicitOps | SrcImmU16 | IsBranch, em_ret_far_imm),
+ I(ImplicitOps | IsBranch, em_ret_far),
+ D(ImplicitOps | IsBranch), DI(SrcImmByte | IsBranch, intn),
+ D(ImplicitOps | No64 | IsBranch),
+ II(ImplicitOps | IsBranch, em_iret, iret),
+ /* 0xD0 - 0xD7 */
+ G(Src2One | ByteOp, group2), G(Src2One, group2),
+ G(Src2CL | ByteOp, group2), G(Src2CL, group2),
+ I(DstAcc | SrcImmUByte | No64, em_aam),
+ I(DstAcc | SrcImmUByte | No64, em_aad),
+ F(DstAcc | ByteOp | No64, em_salc),
+ I(DstAcc | SrcXLat | ByteOp, em_mov),
+ /* 0xD8 - 0xDF */
+ N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N,
+ /* 0xE0 - 0xE7 */
+ X3(I(SrcImmByte | NearBranch | IsBranch, em_loop)),
+ I(SrcImmByte | NearBranch | IsBranch, em_jcxz),
+ I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in),
+ I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out),
+ /* 0xE8 - 0xEF */
+ I(SrcImm | NearBranch | IsBranch, em_call),
+ D(SrcImm | ImplicitOps | NearBranch | IsBranch),
+ I(SrcImmFAddr | No64 | IsBranch, em_jmp_far),
+ D(SrcImmByte | ImplicitOps | NearBranch | IsBranch),
+ I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in),
+ I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out),
+ /* 0xF0 - 0xF7 */
+ N, DI(ImplicitOps, icebp), N, N,
+ DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
+ G(ByteOp, group3), G(0, group3),
+ /* 0xF8 - 0xFF */
+ D(ImplicitOps), D(ImplicitOps),
+ I(ImplicitOps, em_cli), I(ImplicitOps, em_sti),
+ D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
+};
+
+static const struct opcode twobyte_table[256] = {
+ /* 0x00 - 0x0F */
+ G(0, group6), GD(0, &group7), N, N,
+ N, I(ImplicitOps | EmulateOnUD | IsBranch, em_syscall),
+ II(ImplicitOps | Priv, em_clts, clts), N,
+ DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
+ N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N,
+ /* 0x10 - 0x1F */
+ GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_10_0f_11),
+ GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_10_0f_11),
+ N, N, N, N, N, N,
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 4 * prefetch + 4 * reserved NOP */
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N,
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 8 * reserved NOP */
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 8 * reserved NOP */
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 8 * reserved NOP */
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* NOP + 7 * reserved NOP */
+ /* 0x20 - 0x2F */
+ DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_access),
+ DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read),
+ IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_cr_write, cr_write,
+ check_cr_access),
+ IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write,
+ check_dr_write),
+ N, N, N, N,
+ GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29),
+ GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29),
+ N, GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_2b),
+ N, N, N, N,
+ /* 0x30 - 0x3F */
+ II(ImplicitOps | Priv, em_wrmsr, wrmsr),
+ IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
+ II(ImplicitOps | Priv, em_rdmsr, rdmsr),
+ IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc),
+ I(ImplicitOps | EmulateOnUD | IsBranch, em_sysenter),
+ I(ImplicitOps | Priv | EmulateOnUD | IsBranch, em_sysexit),
+ N, N,
+ N, N, N, N, N, N, N, N,
+ /* 0x40 - 0x4F */
+ X16(D(DstReg | SrcMem | ModRM)),
+ /* 0x50 - 0x5F */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ /* 0x60 - 0x6F */
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, GP(SrcMem | DstReg | ModRM | Mov, &pfx_0f_6f_0f_7f),
+ /* 0x70 - 0x7F */
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f),
+ /* 0x80 - 0x8F */
+ X16(D(SrcImm | NearBranch | IsBranch)),
+ /* 0x90 - 0x9F */
+ X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
+ /* 0xA0 - 0xA7 */
+ I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
+ II(ImplicitOps, em_cpuid, cpuid),
+ F(DstMem | SrcReg | ModRM | BitOp | NoWrite, em_bt),
+ F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shld),
+ F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
+ /* 0xA8 - 0xAF */
+ I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
+ II(EmulateOnUD | ImplicitOps, em_rsm, rsm),
+ F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
+ F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
+ F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
+ GD(0, &group15), F(DstReg | SrcMem | ModRM, em_imul),
+ /* 0xB0 - 0xB7 */
+ I2bv(DstMem | SrcReg | ModRM | Lock | PageTable | SrcWrite, em_cmpxchg),
+ I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
+ F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
+ I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
+ I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
+ D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+ /* 0xB8 - 0xBF */
+ N, N,
+ G(BitOp, group8),
+ F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
+ I(DstReg | SrcMem | ModRM, em_bsf_c),
+ I(DstReg | SrcMem | ModRM, em_bsr_c),
+ D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+ /* 0xC0 - 0xC7 */
+ F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd),
+ N, ID(0, &instr_dual_0f_c3),
+ N, N, N, GD(0, &group9),
+ /* 0xC8 - 0xCF */
+ X8(I(DstReg, em_bswap)),
+ /* 0xD0 - 0xDF */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xEF */
+ N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_e7),
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xFF */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
+};
+
+static const struct instr_dual instr_dual_0f_38_f0 = {
+ I(DstReg | SrcMem | Mov, em_movbe), N
+};
+
+static const struct instr_dual instr_dual_0f_38_f1 = {
+ I(DstMem | SrcReg | Mov, em_movbe), N
+};
+
+static const struct gprefix three_byte_0f_38_f0 = {
+ ID(0, &instr_dual_0f_38_f0), N, N, N
+};
+
+static const struct gprefix three_byte_0f_38_f1 = {
+ ID(0, &instr_dual_0f_38_f1), N, N, N
+};
+
+/*
+ * The insns below are indexed by the third opcode byte and then selected by
+ * the mandatory SIMD prefix.
+ */
+static const struct opcode opcode_map_0f_38[256] = {
+ /* 0x00 - 0x7f */
+ X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
+ /* 0x80 - 0xef */
+ X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
+ /* 0xf0 - 0xf1 */
+ GP(EmulateOnUD | ModRM, &three_byte_0f_38_f0),
+ GP(EmulateOnUD | ModRM, &three_byte_0f_38_f1),
+ /* 0xf2 - 0xff */
+ N, N, X4(N), X8(N)
+};
+
+#undef D
+#undef N
+#undef G
+#undef GD
+#undef I
+#undef GP
+#undef EXT
+#undef MD
+#undef ID
+
+#undef D2bv
+#undef D2bvIP
+#undef I2bv
+#undef I2bvIP
+#undef F6ALU
+
+static unsigned imm_size(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned size;
+
+ size = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ if (size == 8)
+ size = 4;
+ return size;
+}
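+
+/*
+ * Example: with REX.W, op_bytes is 8 but imm_size() still returns 4, since
+ * x86-64 encodes at most a 4-byte immediate for these forms and sign-extends
+ * it to 64 bits in decode_imm(); only OpImm64 (e.g. MOV r64, imm64) passes
+ * op_bytes through and fetches a full 8-byte immediate.
+ */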
+
+static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
+ unsigned size, bool sign_extension)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ op->type = OP_IMM;
+ op->bytes = size;
+ op->addr.mem.ea = ctxt->_eip;
+ /* NB. Immediates are sign-extended as necessary. */
+ switch (op->bytes) {
+ case 1:
+ op->val = insn_fetch(s8, ctxt);
+ break;
+ case 2:
+ op->val = insn_fetch(s16, ctxt);
+ break;
+ case 4:
+ op->val = insn_fetch(s32, ctxt);
+ break;
+ case 8:
+ op->val = insn_fetch(s64, ctxt);
+ break;
+ }
+ if (!sign_extension) {
+ switch (op->bytes) {
+ case 1:
+ op->val &= 0xff;
+ break;
+ case 2:
+ op->val &= 0xffff;
+ break;
+ case 4:
+ op->val &= 0xffffffff;
+ break;
+ }
+ }
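+	/* insn_fetch() contains a hidden 'goto done' on fetch failure. */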
+done:
+ return rc;
+}
+
+static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
+ unsigned d)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ switch (d) {
+ case OpReg:
+ decode_register_operand(ctxt, op);
+ break;
+ case OpImmUByte:
+ rc = decode_imm(ctxt, op, 1, false);
+ break;
+ case OpMem:
+ ctxt->memop.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ mem_common:
+ *op = ctxt->memop;
+ ctxt->memopp = op;
+ if (ctxt->d & BitOp)
+ fetch_bit_operand(ctxt);
+ op->orig_val = op->val;
+ break;
+ case OpMem64:
+ ctxt->memop.bytes = (ctxt->op_bytes == 8) ? 16 : 8;
+ goto mem_common;
+ case OpAcc:
+ op->type = OP_REG;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ fetch_register_operand(op);
+ op->orig_val = op->val;
+ break;
+ case OpAccLo:
+ op->type = OP_REG;
+ op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ fetch_register_operand(op);
+ op->orig_val = op->val;
+ break;
+ case OpAccHi:
+ if (ctxt->d & ByteOp) {
+ op->type = OP_NONE;
+ break;
+ }
+ op->type = OP_REG;
+ op->bytes = ctxt->op_bytes;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
+ fetch_register_operand(op);
+ op->orig_val = op->val;
+ break;
+ case OpDI:
+ op->type = OP_MEM;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.mem.ea =
+ register_address(ctxt, VCPU_REGS_RDI);
+ op->addr.mem.seg = VCPU_SREG_ES;
+ op->val = 0;
+ op->count = 1;
+ break;
+ case OpDX:
+ op->type = OP_REG;
+ op->bytes = 2;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
+ fetch_register_operand(op);
+ break;
+ case OpCL:
+ op->type = OP_IMM;
+ op->bytes = 1;
+ op->val = reg_read(ctxt, VCPU_REGS_RCX) & 0xff;
+ break;
+ case OpImmByte:
+ rc = decode_imm(ctxt, op, 1, true);
+ break;
+ case OpOne:
+ op->type = OP_IMM;
+ op->bytes = 1;
+ op->val = 1;
+ break;
+ case OpImm:
+ rc = decode_imm(ctxt, op, imm_size(ctxt), true);
+ break;
+ case OpImm64:
+ rc = decode_imm(ctxt, op, ctxt->op_bytes, true);
+ break;
+ case OpMem8:
+ ctxt->memop.bytes = 1;
+ if (ctxt->memop.type == OP_REG) {
+ ctxt->memop.addr.reg = decode_register(ctxt,
+ ctxt->modrm_rm, true);
+ fetch_register_operand(&ctxt->memop);
+ }
+ goto mem_common;
+ case OpMem16:
+ ctxt->memop.bytes = 2;
+ goto mem_common;
+ case OpMem32:
+ ctxt->memop.bytes = 4;
+ goto mem_common;
+ case OpImmU16:
+ rc = decode_imm(ctxt, op, 2, false);
+ break;
+ case OpImmU:
+ rc = decode_imm(ctxt, op, imm_size(ctxt), false);
+ break;
+ case OpSI:
+ op->type = OP_MEM;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.mem.ea =
+ register_address(ctxt, VCPU_REGS_RSI);
+ op->addr.mem.seg = ctxt->seg_override;
+ op->val = 0;
+ op->count = 1;
+ break;
+ case OpXLat:
+ op->type = OP_MEM;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.mem.ea =
+ address_mask(ctxt,
+ reg_read(ctxt, VCPU_REGS_RBX) +
+ (reg_read(ctxt, VCPU_REGS_RAX) & 0xff));
+ op->addr.mem.seg = ctxt->seg_override;
+ op->val = 0;
+ break;
+ case OpImmFAddr:
+ op->type = OP_IMM;
+ op->addr.mem.ea = ctxt->_eip;
+ op->bytes = ctxt->op_bytes + 2;
+ insn_fetch_arr(op->valptr, op->bytes, ctxt);
+ break;
+ case OpMemFAddr:
+ ctxt->memop.bytes = ctxt->op_bytes + 2;
+ goto mem_common;
+ case OpES:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_ES;
+ break;
+ case OpCS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_CS;
+ break;
+ case OpSS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_SS;
+ break;
+ case OpDS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_DS;
+ break;
+ case OpFS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_FS;
+ break;
+ case OpGS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_GS;
+ break;
+ case OpImplicit:
+ /* Special instructions do their own operand decoding. */
+ default:
+ op->type = OP_NONE; /* Disable writeback. */
+ break;
+ }
+
+done:
+ return rc;
+}
+
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type)
+{
+ int rc = X86EMUL_CONTINUE;
+ int mode = ctxt->mode;
+ int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
+ bool op_prefix = false;
+ bool has_seg_override = false;
+ struct opcode opcode;
+ u16 dummy;
+ struct desc_struct desc;
+
+ ctxt->memop.type = OP_NONE;
+ ctxt->memopp = NULL;
+ ctxt->_eip = ctxt->eip;
+ ctxt->fetch.ptr = ctxt->fetch.data;
+ ctxt->fetch.end = ctxt->fetch.data + insn_len;
+ ctxt->opcode_len = 1;
+ ctxt->intercept = x86_intercept_none;
+ if (insn_len > 0)
+ memcpy(ctxt->fetch.data, insn, insn_len);
+ else {
+ rc = __do_insn_fetch_bytes(ctxt, 1);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ switch (mode) {
+ case X86EMUL_MODE_REAL:
+ case X86EMUL_MODE_VM86:
+ def_op_bytes = def_ad_bytes = 2;
+ ctxt->ops->get_segment(ctxt, &dummy, &desc, NULL, VCPU_SREG_CS);
+ if (desc.d)
+ def_op_bytes = def_ad_bytes = 4;
+ break;
+ case X86EMUL_MODE_PROT16:
+ def_op_bytes = def_ad_bytes = 2;
+ break;
+ case X86EMUL_MODE_PROT32:
+ def_op_bytes = def_ad_bytes = 4;
+ break;
+#ifdef CONFIG_X86_64
+ case X86EMUL_MODE_PROT64:
+ def_op_bytes = 4;
+ def_ad_bytes = 8;
+ break;
+#endif
+ default:
+ return EMULATION_FAILED;
+ }
+
+ ctxt->op_bytes = def_op_bytes;
+ ctxt->ad_bytes = def_ad_bytes;
+
+ /* Legacy prefixes. */
+ for (;;) {
+ switch (ctxt->b = insn_fetch(u8, ctxt)) {
+ case 0x66: /* operand-size override */
+ op_prefix = true;
+ /* switch between 2/4 bytes */
+ ctxt->op_bytes = def_op_bytes ^ 6;
+ break;
+ case 0x67: /* address-size override */
+ if (mode == X86EMUL_MODE_PROT64)
+ /* switch between 4/8 bytes */
+ ctxt->ad_bytes = def_ad_bytes ^ 12;
+ else
+ /* switch between 2/4 bytes */
+ ctxt->ad_bytes = def_ad_bytes ^ 6;
+ break;
+ case 0x26: /* ES override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_ES;
+ break;
+ case 0x2e: /* CS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_CS;
+ break;
+ case 0x36: /* SS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_SS;
+ break;
+ case 0x3e: /* DS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_DS;
+ break;
+ case 0x64: /* FS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_FS;
+ break;
+ case 0x65: /* GS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_GS;
+ break;
+ case 0x40 ... 0x4f: /* REX */
+ if (mode != X86EMUL_MODE_PROT64)
+ goto done_prefixes;
+ ctxt->rex_prefix = ctxt->b;
+ continue;
+ case 0xf0: /* LOCK */
+ ctxt->lock_prefix = 1;
+ break;
+ case 0xf2: /* REPNE/REPNZ */
+ case 0xf3: /* REP/REPE/REPZ */
+ ctxt->rep_prefix = ctxt->b;
+ break;
+ default:
+ goto done_prefixes;
+ }
+
+ /* Any legacy prefix after a REX prefix nullifies its effect. */
+
+ ctxt->rex_prefix = 0;
+ }
+
+done_prefixes:
+
+ /* REX prefix. */
+ if (ctxt->rex_prefix & 8)
+ ctxt->op_bytes = 8; /* REX.W */
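+	/*
+	 * e.g. prefix 0x48 sets REX.W and widens the operand size to 64 bits;
+	 * the remaining REX bits (R/X/B) extend register encodings and are
+	 * consumed during ModRM and register-operand decode.
+	 */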
+
+ /* Opcode byte(s). */
+ opcode = opcode_table[ctxt->b];
+ /* Two-byte opcode? */
+ if (ctxt->b == 0x0f) {
+ ctxt->opcode_len = 2;
+ ctxt->b = insn_fetch(u8, ctxt);
+ opcode = twobyte_table[ctxt->b];
+
+ /* 0F_38 opcode map */
+ if (ctxt->b == 0x38) {
+ ctxt->opcode_len = 3;
+ ctxt->b = insn_fetch(u8, ctxt);
+ opcode = opcode_map_0f_38[ctxt->b];
+ }
+ }
+ ctxt->d = opcode.flags;
+
+ if (ctxt->d & ModRM)
+ ctxt->modrm = insn_fetch(u8, ctxt);
+
+ /* vex-prefix instructions are not implemented */
+ if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) &&
+ (mode == X86EMUL_MODE_PROT64 || (ctxt->modrm & 0xc0) == 0xc0)) {
+ ctxt->d = NotImpl;
+ }
+
+ while (ctxt->d & GroupMask) {
+ switch (ctxt->d & GroupMask) {
+ case Group:
+ goffset = (ctxt->modrm >> 3) & 7;
+ opcode = opcode.u.group[goffset];
+ break;
+ case GroupDual:
+ goffset = (ctxt->modrm >> 3) & 7;
+ if ((ctxt->modrm >> 6) == 3)
+ opcode = opcode.u.gdual->mod3[goffset];
+ else
+ opcode = opcode.u.gdual->mod012[goffset];
+ break;
+ case RMExt:
+ goffset = ctxt->modrm & 7;
+ opcode = opcode.u.group[goffset];
+ break;
+ case Prefix:
+ if (ctxt->rep_prefix && op_prefix)
+ return EMULATION_FAILED;
+ simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix;
+ switch (simd_prefix) {
+ case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
+ case 0x66: opcode = opcode.u.gprefix->pfx_66; break;
+ case 0xf2: opcode = opcode.u.gprefix->pfx_f2; break;
+ case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break;
+ }
+ break;
+ case Escape:
+ if (ctxt->modrm > 0xbf) {
+ size_t size = ARRAY_SIZE(opcode.u.esc->high);
+ u32 index = array_index_nospec(
+ ctxt->modrm - 0xc0, size);
+
+ opcode = opcode.u.esc->high[index];
+ } else {
+ opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7];
+ }
+ break;
+ case InstrDual:
+ if ((ctxt->modrm >> 6) == 3)
+ opcode = opcode.u.idual->mod3;
+ else
+ opcode = opcode.u.idual->mod012;
+ break;
+ case ModeDual:
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ opcode = opcode.u.mdual->mode64;
+ else
+ opcode = opcode.u.mdual->mode32;
+ break;
+ default:
+ return EMULATION_FAILED;
+ }
+
+ ctxt->d &= ~(u64)GroupMask;
+ ctxt->d |= opcode.flags;
+ }
+
+ ctxt->is_branch = opcode.flags & IsBranch;
+
+ /* Unrecognised? */
+ if (ctxt->d == 0)
+ return EMULATION_FAILED;
+
+ ctxt->execute = opcode.u.execute;
+
+ if (unlikely(emulation_type & EMULTYPE_TRAP_UD) &&
+ likely(!(ctxt->d & EmulateOnUD)))
+ return EMULATION_FAILED;
+
+ if (unlikely(ctxt->d &
+ (NotImpl|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm|NearBranch|
+ No16))) {
+ /*
+ * These are copied unconditionally here, and checked unconditionally
+ * in x86_emulate_insn.
+ */
+ ctxt->check_perm = opcode.check_perm;
+ ctxt->intercept = opcode.intercept;
+
+ if (ctxt->d & NotImpl)
+ return EMULATION_FAILED;
+
+ if (mode == X86EMUL_MODE_PROT64) {
+ if (ctxt->op_bytes == 4 && (ctxt->d & Stack))
+ ctxt->op_bytes = 8;
+ else if (ctxt->d & NearBranch)
+ ctxt->op_bytes = 8;
+ }
+
+ if (ctxt->d & Op3264) {
+ if (mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
+ else
+ ctxt->op_bytes = 4;
+ }
+
+ if ((ctxt->d & No16) && ctxt->op_bytes == 2)
+ ctxt->op_bytes = 4;
+
+ if (ctxt->d & Sse)
+ ctxt->op_bytes = 16;
+ else if (ctxt->d & Mmx)
+ ctxt->op_bytes = 8;
+ }
+
+ /* ModRM and SIB bytes. */
+ if (ctxt->d & ModRM) {
+ rc = decode_modrm(ctxt, &ctxt->memop);
+ if (!has_seg_override) {
+ has_seg_override = true;
+ ctxt->seg_override = ctxt->modrm_seg;
+ }
+ } else if (ctxt->d & MemAbs)
+ rc = decode_abs(ctxt, &ctxt->memop);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ if (!has_seg_override)
+ ctxt->seg_override = VCPU_SREG_DS;
+
+ ctxt->memop.addr.mem.seg = ctxt->seg_override;
+
+ /*
+ * Decode and fetch the source operand: register, memory
+ * or immediate.
+ */
+ rc = decode_operand(ctxt, &ctxt->src, (ctxt->d >> SrcShift) & OpMask);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ /*
+ * Decode and fetch the second source operand: register, memory
+ * or immediate.
+ */
+ rc = decode_operand(ctxt, &ctxt->src2, (ctxt->d >> Src2Shift) & OpMask);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ /* Decode and fetch the destination operand: register or memory. */
+ rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask);
+
+ if (ctxt->rip_relative && likely(ctxt->memopp))
+ ctxt->memopp->addr.mem.ea = address_mask(ctxt,
+ ctxt->memopp->addr.mem.ea + ctxt->_eip);
+
+done:
+ if (rc == X86EMUL_PROPAGATE_FAULT)
+ ctxt->have_exception = true;
+ return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
+}
+
+bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt)
+{
+ return ctxt->d & PageTable;
+}
+
+static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
+{
+	/*
+	 * The second termination condition applies only to REPE and REPNE.
+	 * If the repeat string operation prefix is REPE/REPZ or REPNE/REPNZ,
+	 * test the corresponding termination condition:
+	 * - if REPE/REPZ and ZF = 0 then done
+	 * - if REPNE/REPNZ and ZF = 1 then done
+	 */
+ if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) ||
+ (ctxt->b == 0xae) || (ctxt->b == 0xaf))
+ && (((ctxt->rep_prefix == REPE_PREFIX) &&
+ ((ctxt->eflags & X86_EFLAGS_ZF) == 0))
+ || ((ctxt->rep_prefix == REPNE_PREFIX) &&
+ ((ctxt->eflags & X86_EFLAGS_ZF) == X86_EFLAGS_ZF))))
+ return true;
+
+ return false;
+}
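+
+/*
+ * Example: REPE CMPSB (F3 A6) keeps iterating while the compared bytes are
+ * equal (ZF=1) and terminates once ZF clears; REPNE SCASB (F2 AE) is the
+ * mirror case, terminating once ZF becomes set.
+ */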
+
+static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+
+ kvm_fpu_get();
+ rc = asm_safe("fwait");
+ kvm_fpu_put();
+
+ if (unlikely(rc != X86EMUL_CONTINUE))
+ return emulate_exception(ctxt, MF_VECTOR, 0, false);
+
+ return X86EMUL_CONTINUE;
+}
+
+static void fetch_possible_mmx_operand(struct operand *op)
+{
+ if (op->type == OP_MM)
+ kvm_read_mmx_reg(op->addr.mm, &op->mm_val);
+}
+
+static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop)
+{
+ ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
+
+ if (!(ctxt->d & ByteOp))
+ fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
+
+ asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n"
+ : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
+ [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT
+ : "c"(ctxt->src2.val));
+
+ ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
+ if (!fop) /* exception is returned in fop variable */
+ return emulate_de(ctxt);
+ return X86EMUL_CONTINUE;
+}
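+
+/*
+ * Sketch of the fastop ABI implied by the asm constraints above: dst.val in
+ * RAX, src.val in RDX, src2.val in RCX, the sanitized flags in RDI and the
+ * handler address in the thunk register. Handlers for 1/2/4/8-byte operands
+ * are laid out FASTOP_SIZE bytes apart, so the operand size selects one via
+ * __ffs(bytes) * FASTOP_SIZE; a handler clears the thunk register to signal
+ * a divide-error exception.
+ */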
+
+void init_decode_cache(struct x86_emulate_ctxt *ctxt)
+{
+ /* Clear fields that are set conditionally but read without a guard. */
+ ctxt->rip_relative = false;
+ ctxt->rex_prefix = 0;
+ ctxt->lock_prefix = 0;
+ ctxt->rep_prefix = 0;
+ ctxt->regs_valid = 0;
+ ctxt->regs_dirty = 0;
+
+ ctxt->io_read.pos = 0;
+ ctxt->io_read.end = 0;
+ ctxt->mem_read.end = 0;
+}
+
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ int rc = X86EMUL_CONTINUE;
+ int saved_dst_type = ctxt->dst.type;
+ bool is_guest_mode = ctxt->ops->is_guest_mode(ctxt);
+
+ ctxt->mem_read.pos = 0;
+
+ /* LOCK prefix is allowed only with some instructions */
+ if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ if ((ctxt->d & SrcMask) == SrcMemFAddr && ctxt->src.type != OP_MEM) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ if (unlikely(ctxt->d &
+ (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) {
+ if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
+ (ctxt->d & Undefined)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)))
+ || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
+ rc = emulate_nm(ctxt);
+ goto done;
+ }
+
+ if (ctxt->d & Mmx) {
+ rc = flush_pending_x87_faults(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ /*
+ * Now that we know the fpu is exception safe, we can fetch
+ * operands from it.
+ */
+ fetch_possible_mmx_operand(&ctxt->src);
+ fetch_possible_mmx_operand(&ctxt->src2);
+ if (!(ctxt->d & Mov))
+ fetch_possible_mmx_operand(&ctxt->dst);
+ }
+
+ if (unlikely(is_guest_mode) && ctxt->intercept) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_PRE_EXCEPT);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ /* Instruction can only be executed in protected mode */
+ if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ /* Privileged instruction can be executed only in CPL=0 */
+ if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
+ if (ctxt->d & PrivUD)
+ rc = emulate_ud(ctxt);
+ else
+ rc = emulate_gp(ctxt, 0);
+ goto done;
+ }
+
+ /* Do instruction specific permission checks */
+ if (ctxt->d & CheckPerm) {
+ rc = ctxt->check_perm(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_POST_EXCEPT);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (ctxt->rep_prefix && (ctxt->d & String)) {
+ /* All REP prefixes have the same first termination condition */
+ if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
+ string_registers_quirk(ctxt);
+ ctxt->eip = ctxt->_eip;
+ ctxt->eflags &= ~X86_EFLAGS_RF;
+ goto done;
+ }
+ }
+ }
+
+ if ((ctxt->src.type == OP_MEM) && !(ctxt->d & NoAccess)) {
+ rc = segmented_read(ctxt, ctxt->src.addr.mem,
+ ctxt->src.valptr, ctxt->src.bytes);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ ctxt->src.orig_val64 = ctxt->src.val64;
+ }
+
+ if (ctxt->src2.type == OP_MEM) {
+ rc = segmented_read(ctxt, ctxt->src2.addr.mem,
+ &ctxt->src2.val, ctxt->src2.bytes);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if ((ctxt->d & DstMask) == ImplicitOps)
+ goto special_insn;
+
+ if ((ctxt->dst.type == OP_MEM) && !(ctxt->d & Mov)) {
+ /* optimisation - avoid slow emulated read if Mov */
+ rc = segmented_read(ctxt, ctxt->dst.addr.mem,
+ &ctxt->dst.val, ctxt->dst.bytes);
+ if (rc != X86EMUL_CONTINUE) {
+ if (!(ctxt->d & NoWrite) &&
+ rc == X86EMUL_PROPAGATE_FAULT &&
+ ctxt->exception.vector == PF_VECTOR)
+ ctxt->exception.error_code |= PFERR_WRITE_MASK;
+ goto done;
+ }
+ }
+ /* Copy full 64-bit value for CMPXCHG8B. */
+ ctxt->dst.orig_val64 = ctxt->dst.val64;
+
+special_insn:
+
+ if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_POST_MEMACCESS);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (ctxt->rep_prefix && (ctxt->d & String))
+ ctxt->eflags |= X86_EFLAGS_RF;
+ else
+ ctxt->eflags &= ~X86_EFLAGS_RF;
+
+ if (ctxt->execute) {
+ if (ctxt->d & Fastop)
+ rc = fastop(ctxt, ctxt->fop);
+ else
+ rc = ctxt->execute(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ goto writeback;
+ }
+
+ if (ctxt->opcode_len == 2)
+ goto twobyte_insn;
+ else if (ctxt->opcode_len == 3)
+ goto threebyte_insn;
+
+ switch (ctxt->b) {
+ case 0x70 ... 0x7f: /* jcc (short) */
+ if (test_cc(ctxt->b, ctxt->eflags))
+ rc = jmp_rel(ctxt, ctxt->src.val);
+ break;
+ case 0x8d: /* lea r16/r32, m */
+ ctxt->dst.val = ctxt->src.addr.mem.ea;
+ break;
+ case 0x90 ... 0x97: /* nop / xchg reg, rax */
+ if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX))
+ ctxt->dst.type = OP_NONE;
+ else
+ rc = em_xchg(ctxt);
+ break;
+ case 0x98: /* cbw/cwde/cdqe */
+ switch (ctxt->op_bytes) {
+ case 2: ctxt->dst.val = (s8)ctxt->dst.val; break;
+ case 4: ctxt->dst.val = (s16)ctxt->dst.val; break;
+ case 8: ctxt->dst.val = (s32)ctxt->dst.val; break;
+ }
+ break;
+ case 0xcc: /* int3 */
+ rc = emulate_int(ctxt, 3);
+ break;
+ case 0xcd: /* int n */
+ rc = emulate_int(ctxt, ctxt->src.val);
+ break;
+ case 0xce: /* into */
+ if (ctxt->eflags & X86_EFLAGS_OF)
+ rc = emulate_int(ctxt, 4);
+ break;
+ case 0xe9: /* jmp rel */
+ case 0xeb: /* jmp rel short */
+ rc = jmp_rel(ctxt, ctxt->src.val);
+ ctxt->dst.type = OP_NONE; /* Disable writeback. */
+ break;
+ case 0xf4: /* hlt */
+ ctxt->ops->halt(ctxt);
+ break;
+ case 0xf5: /* cmc */
+ /* complement carry flag from eflags reg */
+ ctxt->eflags ^= X86_EFLAGS_CF;
+ break;
+ case 0xf8: /* clc */
+ ctxt->eflags &= ~X86_EFLAGS_CF;
+ break;
+ case 0xf9: /* stc */
+ ctxt->eflags |= X86_EFLAGS_CF;
+ break;
+ case 0xfc: /* cld */
+ ctxt->eflags &= ~X86_EFLAGS_DF;
+ break;
+ case 0xfd: /* std */
+ ctxt->eflags |= X86_EFLAGS_DF;
+ break;
+ default:
+ goto cannot_emulate;
+ }
+
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+writeback:
+ if (ctxt->d & SrcWrite) {
+ BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR);
+ rc = writeback(ctxt, &ctxt->src);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+ if (!(ctxt->d & NoWrite)) {
+ rc = writeback(ctxt, &ctxt->dst);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ /*
+	 * Restore dst type in case the decoding will be reused
+	 * (happens for string instructions).
+ */
+ ctxt->dst.type = saved_dst_type;
+
+ if ((ctxt->d & SrcMask) == SrcSI)
+ string_addr_inc(ctxt, VCPU_REGS_RSI, &ctxt->src);
+
+ if ((ctxt->d & DstMask) == DstDI)
+ string_addr_inc(ctxt, VCPU_REGS_RDI, &ctxt->dst);
+
+ if (ctxt->rep_prefix && (ctxt->d & String)) {
+ unsigned int count;
+ struct read_cache *r = &ctxt->io_read;
+ if ((ctxt->d & SrcMask) == SrcSI)
+ count = ctxt->src.count;
+ else
+ count = ctxt->dst.count;
+ register_address_increment(ctxt, VCPU_REGS_RCX, -count);
+
+ if (!string_insn_completed(ctxt)) {
+ /*
+			 * Re-enter the guest when the pio read ahead buffer
+			 * is empty or, if it is not used, after every 1024
+			 * iterations.
+ */
+ if ((r->end != 0 || reg_read(ctxt, VCPU_REGS_RCX) & 0x3ff) &&
+ (r->end == 0 || r->end != r->pos)) {
+ /*
+				 * Reset the read cache. This usually happens
+				 * before decode, but since the instruction is
+				 * being restarted we have to do it here.
+ */
+ ctxt->mem_read.end = 0;
+ writeback_registers(ctxt);
+ return EMULATION_RESTART;
+ }
+ goto done; /* skip rip writeback */
+ }
+ ctxt->eflags &= ~X86_EFLAGS_RF;
+ }
+
+ ctxt->eip = ctxt->_eip;
+ if (ctxt->mode != X86EMUL_MODE_PROT64)
+ ctxt->eip = (u32)ctxt->_eip;
+
+done:
+ if (rc == X86EMUL_PROPAGATE_FAULT) {
+ if (KVM_EMULATOR_BUG_ON(ctxt->exception.vector > 0x1f, ctxt))
+ return EMULATION_FAILED;
+ ctxt->have_exception = true;
+ }
+ if (rc == X86EMUL_INTERCEPTED)
+ return EMULATION_INTERCEPTED;
+
+ if (rc == X86EMUL_CONTINUE)
+ writeback_registers(ctxt);
+
+ return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
+
+twobyte_insn:
+ switch (ctxt->b) {
+ case 0x09: /* wbinvd */
+ (ctxt->ops->wbinvd)(ctxt);
+ break;
+ case 0x08: /* invd */
+ case 0x0d: /* GrpP (prefetch) */
+ case 0x18: /* Grp16 (prefetch/nop) */
+ case 0x1f: /* nop */
+ break;
+ case 0x20: /* mov cr, reg */
+ ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg);
+ break;
+ case 0x21: /* mov from dr to reg */
+ ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
+ break;
+ case 0x40 ... 0x4f: /* cmov */
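+		/*
+		 * A 32-bit CMOV zero-extends its destination even when the
+		 * condition is false, so writeback is suppressed only for
+		 * the 16- and 64-bit operand sizes.
+		 */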
+ if (test_cc(ctxt->b, ctxt->eflags))
+ ctxt->dst.val = ctxt->src.val;
+ else if (ctxt->op_bytes != 4)
+ ctxt->dst.type = OP_NONE; /* no writeback */
+ break;
+ case 0x80 ... 0x8f: /* jnz rel, etc*/
+ if (test_cc(ctxt->b, ctxt->eflags))
+ rc = jmp_rel(ctxt, ctxt->src.val);
+ break;
+ case 0x90 ... 0x9f: /* setcc r/m8 */
+ ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
+ break;
+ case 0xb6 ... 0xb7: /* movzx */
+ ctxt->dst.bytes = ctxt->op_bytes;
+ ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val
+ : (u16) ctxt->src.val;
+ break;
+ case 0xbe ... 0xbf: /* movsx */
+ ctxt->dst.bytes = ctxt->op_bytes;
+ ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val :
+ (s16) ctxt->src.val;
+ break;
+ default:
+ goto cannot_emulate;
+ }
+
+threebyte_insn:
+
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ goto writeback;
+
+cannot_emulate:
+ return EMULATION_FAILED;
+}
+
+void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt)
+{
+ invalidate_registers(ctxt);
+}
+
+void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt)
+{
+ writeback_registers(ctxt);
+}
+
+bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->rep_prefix && (ctxt->d & String))
+ return false;
+
+ if (ctxt->d & TwoMemOp)
+ return false;
+
+ return true;
+}
diff --git a/arch/x86/kvm/fpu.h b/arch/x86/kvm/fpu.h
new file mode 100644
index 000000000..3ba12888b
--- /dev/null
+++ b/arch/x86/kvm/fpu.h
@@ -0,0 +1,140 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __KVM_FPU_H_
+#define __KVM_FPU_H_
+
+#include <asm/fpu/api.h>
+
+typedef u32 __attribute__((vector_size(16))) sse128_t;
+#define __sse128_u union { sse128_t vec; u64 as_u64[2]; u32 as_u32[4]; }
+#define sse128_lo(x) ({ __sse128_u t; t.vec = x; t.as_u64[0]; })
+#define sse128_hi(x) ({ __sse128_u t; t.vec = x; t.as_u64[1]; })
+#define sse128_l0(x) ({ __sse128_u t; t.vec = x; t.as_u32[0]; })
+#define sse128_l1(x) ({ __sse128_u t; t.vec = x; t.as_u32[1]; })
+#define sse128_l2(x) ({ __sse128_u t; t.vec = x; t.as_u32[2]; })
+#define sse128_l3(x) ({ __sse128_u t; t.vec = x; t.as_u32[3]; })
+#define sse128(lo, hi) ({ __sse128_u t; t.as_u64[0] = lo; t.as_u64[1] = hi; t.vec; })
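+
+/*
+ * Usage sketch: sse128(lo, hi) assembles a 128-bit value from two u64
+ * halves, and sse128_lo()/sse128_hi() split it apart again, so for any
+ * u64 values a and b: sse128_lo(sse128(a, b)) == a and
+ * sse128_hi(sse128(a, b)) == b.
+ */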
+
+static inline void _kvm_read_sse_reg(int reg, sse128_t *data)
+{
+ switch (reg) {
+ case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break;
+ case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break;
+ case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break;
+ case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break;
+ case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break;
+ case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break;
+ case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break;
+ case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break;
+ case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break;
+ case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break;
+ case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break;
+ case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break;
+ case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break;
+ case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break;
+ case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break;
+#endif
+ default: BUG();
+ }
+}
+
+static inline void _kvm_write_sse_reg(int reg, const sse128_t *data)
+{
+ switch (reg) {
+ case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break;
+ case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break;
+ case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break;
+ case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break;
+ case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break;
+ case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break;
+ case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break;
+ case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break;
+ case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break;
+ case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break;
+ case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break;
+ case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break;
+ case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break;
+ case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break;
+ case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break;
+#endif
+ default: BUG();
+ }
+}
+
+static inline void _kvm_read_mmx_reg(int reg, u64 *data)
+{
+ switch (reg) {
+ case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;
+ case 1: asm("movq %%mm1, %0" : "=m"(*data)); break;
+ case 2: asm("movq %%mm2, %0" : "=m"(*data)); break;
+ case 3: asm("movq %%mm3, %0" : "=m"(*data)); break;
+ case 4: asm("movq %%mm4, %0" : "=m"(*data)); break;
+ case 5: asm("movq %%mm5, %0" : "=m"(*data)); break;
+ case 6: asm("movq %%mm6, %0" : "=m"(*data)); break;
+ case 7: asm("movq %%mm7, %0" : "=m"(*data)); break;
+ default: BUG();
+ }
+}
+
+static inline void _kvm_write_mmx_reg(int reg, const u64 *data)
+{
+ switch (reg) {
+ case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;
+ case 1: asm("movq %0, %%mm1" : : "m"(*data)); break;
+ case 2: asm("movq %0, %%mm2" : : "m"(*data)); break;
+ case 3: asm("movq %0, %%mm3" : : "m"(*data)); break;
+ case 4: asm("movq %0, %%mm4" : : "m"(*data)); break;
+ case 5: asm("movq %0, %%mm5" : : "m"(*data)); break;
+ case 6: asm("movq %0, %%mm6" : : "m"(*data)); break;
+ case 7: asm("movq %0, %%mm7" : : "m"(*data)); break;
+ default: BUG();
+ }
+}
+
+static inline void kvm_fpu_get(void)
+{
+ fpregs_lock();
+
+ fpregs_assert_state_consistent();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ switch_fpu_return();
+}
+
+static inline void kvm_fpu_put(void)
+{
+ fpregs_unlock();
+}
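+
+/*
+ * Any direct access to guest FPU registers must be bracketed by
+ * kvm_fpu_get()/kvm_fpu_put() so that the guest's FPU state is resident in
+ * hardware, as the kvm_read/write_*_reg() wrappers below demonstrate.
+ */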
+
+static inline void kvm_read_sse_reg(int reg, sse128_t *data)
+{
+ kvm_fpu_get();
+ _kvm_read_sse_reg(reg, data);
+ kvm_fpu_put();
+}
+
+static inline void kvm_write_sse_reg(int reg, const sse128_t *data)
+{
+ kvm_fpu_get();
+ _kvm_write_sse_reg(reg, data);
+ kvm_fpu_put();
+}
+
+static inline void kvm_read_mmx_reg(int reg, u64 *data)
+{
+ kvm_fpu_get();
+ _kvm_read_mmx_reg(reg, data);
+ kvm_fpu_put();
+}
+
+static inline void kvm_write_mmx_reg(int reg, const u64 *data)
+{
+ kvm_fpu_get();
+ _kvm_write_mmx_reg(reg, data);
+ kvm_fpu_put();
+}
+
+#endif
diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h
new file mode 100644
index 000000000..423a73395
--- /dev/null
+++ b/arch/x86/kvm/governed_features.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(KVM_GOVERNED_FEATURE) || defined(KVM_GOVERNED_X86_FEATURE)
+BUILD_BUG()
+#endif
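+
+/*
+ * Usage sketch (consumer side): define KVM_GOVERNED_FEATURE before including
+ * this header and it is expanded once per governed feature, X-macro style,
+ * e.g. to build an enum:
+ *
+ *	#define KVM_GOVERNED_FEATURE(x) KVM_GOVERNED_##x,
+ *	#include "governed_features.h"
+ */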
+
+#define KVM_GOVERNED_X86_FEATURE(x) KVM_GOVERNED_FEATURE(X86_FEATURE_##x)
+
+KVM_GOVERNED_X86_FEATURE(GBPAGES)
+KVM_GOVERNED_X86_FEATURE(XSAVES)
+KVM_GOVERNED_X86_FEATURE(VMX)
+KVM_GOVERNED_X86_FEATURE(NRIPS)
+KVM_GOVERNED_X86_FEATURE(TSCRATEMSR)
+KVM_GOVERNED_X86_FEATURE(V_VMSAVE_VMLOAD)
+KVM_GOVERNED_X86_FEATURE(LBRV)
+KVM_GOVERNED_X86_FEATURE(PAUSEFILTER)
+KVM_GOVERNED_X86_FEATURE(PFTHRESHOLD)
+KVM_GOVERNED_X86_FEATURE(VGIF)
+KVM_GOVERNED_X86_FEATURE(VNMI)
+
+#undef KVM_GOVERNED_X86_FEATURE
+#undef KVM_GOVERNED_FEATURE
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
new file mode 100644
index 000000000..238afd733
--- /dev/null
+++ b/arch/x86/kvm/hyperv.c
@@ -0,0 +1,2866 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM Microsoft Hyper-V emulation
+ *
+ * derived from arch/x86/kvm/x86.c
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com>
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Amit Shah <amit.shah@qumranet.com>
+ * Ben-Ami Yassour <benami@il.ibm.com>
+ * Andrey Smetanin <asmetanin@virtuozzo.com>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "x86.h"
+#include "lapic.h"
+#include "ioapic.h"
+#include "cpuid.h"
+#include "hyperv.h"
+#include "mmu.h"
+#include "xen.h"
+
+#include <linux/cpu.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/sched/cputime.h>
+#include <linux/spinlock.h>
+#include <linux/eventfd.h>
+
+#include <asm/apicdef.h>
+#include <asm/mshyperv.h>
+#include <trace/events/kvm.h>
+
+#include "trace.h"
+#include "irq.h"
+#include "fpu.h"
+
+#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, HV_VCPUS_PER_SPARSE_BANK)
+
+/*
+ * As per Hyper-V TLFS, extended hypercalls start from 0x8001
+ * (HvExtCallQueryCapabilities). The response to this hypercall is a 64-bit
+ * value where each bit indicates which extended hypercall is available
+ * besides HvExtCallQueryCapabilities.
+ *
+ * 0x8001 - First extended hypercall, HvExtCallQueryCapabilities, no bit
+ * assigned.
+ *
+ * 0x8002 - Bit 0
+ * 0x8003 - Bit 1
+ * ..
+ * 0x8041 - Bit 63
+ *
+ * Therefore, HV_EXT_CALL_MAX = 0x8001 + 64
+ */
+#define HV_EXT_CALL_MAX (HV_EXT_CALL_QUERY_CAPABILITIES + 64)
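+/*
+ * i.e. an extended hypercall code 'ec' maps to capability bit
+ * (ec - HV_EXT_CALL_QUERY_CAPABILITIES - 1).
+ */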
+
+static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
+ bool vcpu_kick);
+
+static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint)
+{
+ return atomic64_read(&synic->sint[sint]);
+}
+
+static inline int synic_get_sint_vector(u64 sint_value)
+{
+ if (sint_value & HV_SYNIC_SINT_MASKED)
+ return -1;
+ return sint_value & HV_SYNIC_SINT_VECTOR_MASK;
+}
+
+static bool synic_has_vector_connected(struct kvm_vcpu_hv_synic *synic,
+ int vector)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+ if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector)
+ return true;
+ }
+ return false;
+}
+
+static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic,
+ int vector)
+{
+ int i;
+ u64 sint_value;
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+ sint_value = synic_read_sint(synic, i);
+ if (synic_get_sint_vector(sint_value) == vector &&
+ sint_value & HV_SYNIC_SINT_AUTO_EOI)
+ return true;
+ }
+ return false;
+}
+
+static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
+ int vector)
+{
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+ bool auto_eoi_old, auto_eoi_new;
+
+ if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
+ return;
+
+ if (synic_has_vector_connected(synic, vector))
+ __set_bit(vector, synic->vec_bitmap);
+ else
+ __clear_bit(vector, synic->vec_bitmap);
+
+ auto_eoi_old = !bitmap_empty(synic->auto_eoi_bitmap, 256);
+
+ if (synic_has_vector_auto_eoi(synic, vector))
+ __set_bit(vector, synic->auto_eoi_bitmap);
+ else
+ __clear_bit(vector, synic->auto_eoi_bitmap);
+
+ auto_eoi_new = !bitmap_empty(synic->auto_eoi_bitmap, 256);
+
+ if (auto_eoi_old == auto_eoi_new)
+ return;
+
+ if (!enable_apicv)
+ return;
+
+ down_write(&vcpu->kvm->arch.apicv_update_lock);
+
+ if (auto_eoi_new)
+ hv->synic_auto_eoi_used++;
+ else
+ hv->synic_auto_eoi_used--;
+
+ /*
+ * Inhibit APICv if any vCPU is using SynIC's AutoEOI, which relies on
+ * the hypervisor to manually inject IRQs.
+ */
+ __kvm_set_or_clear_apicv_inhibit(vcpu->kvm,
+ APICV_INHIBIT_REASON_HYPERV,
+ !!hv->synic_auto_eoi_used);
+
+ up_write(&vcpu->kvm->arch.apicv_update_lock);
+}
+
+static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
+ u64 data, bool host)
+{
+ int vector, old_vector;
+ bool masked;
+
+ vector = data & HV_SYNIC_SINT_VECTOR_MASK;
+ masked = data & HV_SYNIC_SINT_MASKED;
+
+ /*
+	 * Valid vectors are 16-255; however, nested Hyper-V attempts to write
+	 * the default '0x10000' value on boot, and this should not #GP. We
+	 * also need to allow zero-initializing the register from the host.
+ */
+ if (vector < HV_SYNIC_FIRST_VALID_VECTOR && !host && !masked)
+ return 1;
+ /*
+	 * The guest may configure multiple SINTs to use the same vector, so
+	 * we maintain a bitmap of vectors handled by the SynIC and a bitmap
+	 * of vectors with auto-EOI behavior. The bitmaps are updated here and
+	 * atomically queried on the fast paths.
+ */
+ old_vector = synic_read_sint(synic, sint) & HV_SYNIC_SINT_VECTOR_MASK;
+
+ atomic64_set(&synic->sint[sint], data);
+
+ synic_update_vector(synic, old_vector);
+
+ synic_update_vector(synic, vector);
+
+ /* Load SynIC vectors into EOI exit bitmap */
+ kvm_make_request(KVM_REQ_SCAN_IOAPIC, hv_synic_to_vcpu(synic));
+ return 0;
+}
+
+static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx)
+{
+ struct kvm_vcpu *vcpu = NULL;
+ unsigned long i;
+
+ if (vpidx >= KVM_MAX_VCPUS)
+ return NULL;
+
+ vcpu = kvm_get_vcpu(kvm, vpidx);
+ if (vcpu && kvm_hv_get_vpindex(vcpu) == vpidx)
+ return vcpu;
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ if (kvm_hv_get_vpindex(vcpu) == vpidx)
+ return vcpu;
+ return NULL;
+}
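+
+/*
+ * Note: a vCPU's Hyper-V VP index usually matches its vCPU index, so
+ * get_vcpu_by_vpidx() tries the direct lookup first and only falls back to
+ * a linear search when the two have diverged.
+ */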
+
+static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vcpu_hv_synic *synic;
+
+ vcpu = get_vcpu_by_vpidx(kvm, vpidx);
+ if (!vcpu || !to_hv_vcpu(vcpu))
+ return NULL;
+ synic = to_hv_synic(vcpu);
+ return (synic->active) ? synic : NULL;
+}
+
+static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv_stimer *stimer;
+ int gsi, idx;
+
+ trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint);
+
+	/* Try to deliver pending Hyper-V SynIC timer messages */
+ for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) {
+ stimer = &hv_vcpu->stimer[idx];
+ if (stimer->msg_pending && stimer->config.enable &&
+ !stimer->config.direct_mode &&
+ stimer->config.sintx == sint)
+ stimer_mark_pending(stimer, false);
+ }
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ gsi = atomic_read(&synic->sint_to_gsi[sint]);
+ if (gsi != -1)
+ kvm_notify_acked_gsi(kvm, gsi);
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr)
+{
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC;
+ hv_vcpu->exit.u.synic.msr = msr;
+ hv_vcpu->exit.u.synic.control = synic->control;
+ hv_vcpu->exit.u.synic.evt_page = synic->evt_page;
+ hv_vcpu->exit.u.synic.msg_page = synic->msg_page;
+
+ kvm_make_request(KVM_REQ_HV_EXIT, vcpu);
+}
+
+static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
+ u32 msr, u64 data, bool host)
+{
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+ int ret;
+
+ if (!synic->active && (!host || data))
+ return 1;
+
+ trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host);
+
+ ret = 0;
+ switch (msr) {
+ case HV_X64_MSR_SCONTROL:
+ synic->control = data;
+ if (!host)
+ synic_exit(synic, msr);
+ break;
+ case HV_X64_MSR_SVERSION:
+ if (!host) {
+ ret = 1;
+ break;
+ }
+ synic->version = data;
+ break;
+ case HV_X64_MSR_SIEFP:
+ if ((data & HV_SYNIC_SIEFP_ENABLE) && !host &&
+ !synic->dont_zero_synic_pages)
+ if (kvm_clear_guest(vcpu->kvm,
+ data & PAGE_MASK, PAGE_SIZE)) {
+ ret = 1;
+ break;
+ }
+ synic->evt_page = data;
+ if (!host)
+ synic_exit(synic, msr);
+ break;
+ case HV_X64_MSR_SIMP:
+ if ((data & HV_SYNIC_SIMP_ENABLE) && !host &&
+ !synic->dont_zero_synic_pages)
+ if (kvm_clear_guest(vcpu->kvm,
+ data & PAGE_MASK, PAGE_SIZE)) {
+ ret = 1;
+ break;
+ }
+ synic->msg_page = data;
+ if (!host)
+ synic_exit(synic, msr);
+ break;
+ case HV_X64_MSR_EOM: {
+ int i;
+
+ if (!synic->active)
+ break;
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
+ kvm_hv_notify_acked_sint(vcpu, i);
+ break;
+ }
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ ret = synic_set_sint(synic, msr - HV_X64_MSR_SINT0, data, host);
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+ return ret;
+}
+
+static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ return hv_vcpu->cpuid_cache.syndbg_cap_eax &
+ HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING;
+}
+
+static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu)
+{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+
+ if (vcpu->run->hyperv.u.syndbg.msr == HV_X64_MSR_SYNDBG_CONTROL)
+ hv->hv_syndbg.control.status =
+ vcpu->run->hyperv.u.syndbg.status;
+ return 1;
+}
+
+static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNDBG;
+ hv_vcpu->exit.u.syndbg.msr = msr;
+ hv_vcpu->exit.u.syndbg.control = syndbg->control.control;
+ hv_vcpu->exit.u.syndbg.send_page = syndbg->control.send_page;
+ hv_vcpu->exit.u.syndbg.recv_page = syndbg->control.recv_page;
+ hv_vcpu->exit.u.syndbg.pending_page = syndbg->control.pending_page;
+ vcpu->arch.complete_userspace_io =
+ kvm_hv_syndbg_complete_userspace;
+
+ kvm_make_request(KVM_REQ_HV_EXIT, vcpu);
+}
+
+static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
+{
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
+
+ if (!kvm_hv_is_syndbg_enabled(vcpu) && !host)
+ return 1;
+
+ trace_kvm_hv_syndbg_set_msr(vcpu->vcpu_id,
+ to_hv_vcpu(vcpu)->vp_index, msr, data);
+ switch (msr) {
+ case HV_X64_MSR_SYNDBG_CONTROL:
+ syndbg->control.control = data;
+ if (!host)
+ syndbg_exit(vcpu, msr);
+ break;
+ case HV_X64_MSR_SYNDBG_STATUS:
+ syndbg->control.status = data;
+ break;
+ case HV_X64_MSR_SYNDBG_SEND_BUFFER:
+ syndbg->control.send_page = data;
+ break;
+ case HV_X64_MSR_SYNDBG_RECV_BUFFER:
+ syndbg->control.recv_page = data;
+ break;
+ case HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ syndbg->control.pending_page = data;
+ if (!host)
+ syndbg_exit(vcpu, msr);
+ break;
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ syndbg->options = data;
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
+{
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
+
+ if (!kvm_hv_is_syndbg_enabled(vcpu) && !host)
+ return 1;
+
+ switch (msr) {
+ case HV_X64_MSR_SYNDBG_CONTROL:
+ *pdata = syndbg->control.control;
+ break;
+ case HV_X64_MSR_SYNDBG_STATUS:
+ *pdata = syndbg->control.status;
+ break;
+ case HV_X64_MSR_SYNDBG_SEND_BUFFER:
+ *pdata = syndbg->control.send_page;
+ break;
+ case HV_X64_MSR_SYNDBG_RECV_BUFFER:
+ *pdata = syndbg->control.recv_page;
+ break;
+ case HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ *pdata = syndbg->control.pending_page;
+ break;
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ *pdata = syndbg->options;
+ break;
+ default:
+ break;
+ }
+
+ trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id, kvm_hv_get_vpindex(vcpu), msr, *pdata);
+
+ return 0;
+}
+
+static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata,
+ bool host)
+{
+ int ret;
+
+ if (!synic->active && !host)
+ return 1;
+
+ ret = 0;
+ switch (msr) {
+ case HV_X64_MSR_SCONTROL:
+ *pdata = synic->control;
+ break;
+ case HV_X64_MSR_SVERSION:
+ *pdata = synic->version;
+ break;
+ case HV_X64_MSR_SIEFP:
+ *pdata = synic->evt_page;
+ break;
+ case HV_X64_MSR_SIMP:
+ *pdata = synic->msg_page;
+ break;
+ case HV_X64_MSR_EOM:
+ *pdata = 0;
+ break;
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ *pdata = atomic64_read(&synic->sint[msr - HV_X64_MSR_SINT0]);
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+ return ret;
+}
+
+static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
+{
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+ struct kvm_lapic_irq irq;
+ int ret, vector;
+
+ if (KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm))
+ return -EINVAL;
+
+ if (sint >= ARRAY_SIZE(synic->sint))
+ return -EINVAL;
+
+ vector = synic_get_sint_vector(synic_read_sint(synic, sint));
+ if (vector < 0)
+ return -ENOENT;
+
+ memset(&irq, 0, sizeof(irq));
+ irq.shorthand = APIC_DEST_SELF;
+ irq.dest_mode = APIC_DEST_PHYSICAL;
+ irq.delivery_mode = APIC_DM_FIXED;
+ irq.vector = vector;
+ irq.level = 1;
+
+ ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq, NULL);
+ trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret);
+ return ret;
+}
+
+int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint)
+{
+ struct kvm_vcpu_hv_synic *synic;
+
+ synic = synic_get(kvm, vpidx);
+ if (!synic)
+ return -EINVAL;
+
+ return synic_set_irq(synic, sint);
+}
+
+void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector)
+{
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
+ int i;
+
+ trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector);
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
+ if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector)
+ kvm_hv_notify_acked_sint(vcpu, i);
+}
+
+static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vpidx, u32 sint, int gsi)
+{
+ struct kvm_vcpu_hv_synic *synic;
+
+ synic = synic_get(kvm, vpidx);
+ if (!synic)
+ return -EINVAL;
+
+ if (sint >= ARRAY_SIZE(synic->sint_to_gsi))
+ return -EINVAL;
+
+ atomic_set(&synic->sint_to_gsi[sint], gsi);
+ return 0;
+}
+
+void kvm_hv_irq_routing_update(struct kvm *kvm)
+{
+ struct kvm_irq_routing_table *irq_rt;
+ struct kvm_kernel_irq_routing_entry *e;
+ u32 gsi;
+
+ irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu,
+ lockdep_is_held(&kvm->irq_lock));
+
+ for (gsi = 0; gsi < irq_rt->nr_rt_entries; gsi++) {
+ hlist_for_each_entry(e, &irq_rt->map[gsi], link) {
+ if (e->type == KVM_IRQ_ROUTING_HV_SINT)
+ kvm_hv_set_sint_gsi(kvm, e->hv_sint.vcpu,
+ e->hv_sint.sint, gsi);
+ }
+ }
+}
+
+static void synic_init(struct kvm_vcpu_hv_synic *synic)
+{
+ int i;
+
+ memset(synic, 0, sizeof(*synic));
+ synic->version = HV_SYNIC_VERSION_1;
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+ atomic64_set(&synic->sint[i], HV_SYNIC_SINT_MASKED);
+ atomic_set(&synic->sint_to_gsi[i], -1);
+ }
+}
+
+static u64 get_time_ref_counter(struct kvm *kvm)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ struct kvm_vcpu *vcpu;
+ u64 tsc;
+
+ /*
+ * Fall back to get_kvmclock_ns() when the TSC page hasn't been set up,
+ * or is broken, disabled or being updated.
+ */
+ if (hv->hv_tsc_page_status != HV_TSC_PAGE_SET)
+ return div_u64(get_kvmclock_ns(kvm), 100);
+
+ vcpu = kvm_get_vcpu(kvm, 0);
+ tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64)
+ + hv->tsc_ref.tsc_offset;
+}
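+
+/*
+ * The TLFS defines the reference counter to tick in 100ns units (10 MHz),
+ * hence the div_u64(..., 100) on the nanosecond-granularity kvmclock
+ * fallback above; on the TSC page path the 100ns scaling is already baked
+ * into tsc_scale/tsc_offset (see compute_tsc_page_parameters() below).
+ */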
+
+static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
+ bool vcpu_kick)
+{
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+
+ set_bit(stimer->index,
+ to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+ kvm_make_request(KVM_REQ_HV_STIMER, vcpu);
+ if (vcpu_kick)
+ kvm_vcpu_kick(vcpu);
+}
+
+static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+
+ trace_kvm_hv_stimer_cleanup(hv_stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index);
+
+ hrtimer_cancel(&stimer->timer);
+ clear_bit(stimer->index,
+ to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+ stimer->msg_pending = false;
+ stimer->exp_time = 0;
+}
+
+static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer)
+{
+ struct kvm_vcpu_hv_stimer *stimer;
+
+ stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer);
+ trace_kvm_hv_stimer_callback(hv_stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index);
+ stimer_mark_pending(stimer, true);
+
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * stimer_start() assumptions:
+ * a) stimer->count is not equal to 0
+ * b) stimer->config has HV_STIMER_ENABLE flag
+ */
+static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
+{
+ u64 time_now;
+ ktime_t ktime_now;
+
+ time_now = get_time_ref_counter(hv_stimer_to_vcpu(stimer)->kvm);
+ ktime_now = ktime_get();
+
+ if (stimer->config.periodic) {
+ if (stimer->exp_time) {
+ if (time_now >= stimer->exp_time) {
+ u64 remainder;
+
+ div64_u64_rem(time_now - stimer->exp_time,
+ stimer->count, &remainder);
+ stimer->exp_time =
+ time_now + (stimer->count - remainder);
+ }
+ } else
+ stimer->exp_time = time_now + stimer->count;
+
+ trace_kvm_hv_stimer_start_periodic(
+ hv_stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index,
+ time_now, stimer->exp_time);
+
+ hrtimer_start(&stimer->timer,
+ ktime_add_ns(ktime_now,
+ 100 * (stimer->exp_time - time_now)),
+ HRTIMER_MODE_ABS);
+ return 0;
+ }
+ stimer->exp_time = stimer->count;
+ if (time_now >= stimer->count) {
+ /*
+ * Expire timer according to Hypervisor Top-Level Functional
+ * specification v4 (15.3.1):
+ * "If a one shot is enabled and the specified count is in
+ * the past, it will expire immediately."
+ */
+ stimer_mark_pending(stimer, false);
+ return 0;
+ }
+
+ trace_kvm_hv_stimer_start_one_shot(hv_stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index,
+ time_now, stimer->count);
+
+ hrtimer_start(&stimer->timer,
+ ktime_add_ns(ktime_now, 100 * (stimer->count - time_now)),
+ HRTIMER_MODE_ABS);
+ return 0;
+}
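+
+/*
+ * stimer->count and stimer->exp_time are in 100ns reference-counter units,
+ * while hrtimer expects nanoseconds, hence the "100 *" factors above. As an
+ * illustrative example (numbers not from the original source): a one-shot
+ * timer armed with count = time_now + 10,000,000 expires one second from
+ * now, so hrtimer is started at ktime_now + 100 * (count - time_now) =
+ * ktime_now + 1,000,000,000 ns. For a periodic timer, count is instead the
+ * period in the same 100ns units.
+ */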
+
+static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
+ bool host)
+{
+ union hv_stimer_config new_config = {.as_uint64 = config},
+ old_config = {.as_uint64 = stimer->config.as_uint64};
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
+
+ if (!synic->active && (!host || config))
+ return 1;
+
+ if (unlikely(!host && hv_vcpu->enforce_cpuid && new_config.direct_mode &&
+ !(hv_vcpu->cpuid_cache.features_edx &
+ HV_STIMER_DIRECT_MODE_AVAILABLE)))
+ return 1;
+
+ trace_kvm_hv_stimer_set_config(hv_stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index, config, host);
+
+ stimer_cleanup(stimer);
+ if (old_config.enable &&
+ !new_config.direct_mode && new_config.sintx == 0)
+ new_config.enable = 0;
+ stimer->config.as_uint64 = new_config.as_uint64;
+
+ if (stimer->config.enable)
+ stimer_mark_pending(stimer, false);
+
+ return 0;
+}
+
+static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
+ bool host)
+{
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
+
+ if (!synic->active && (!host || count))
+ return 1;
+
+ trace_kvm_hv_stimer_set_count(hv_stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index, count, host);
+
+ stimer_cleanup(stimer);
+ stimer->count = count;
+ if (!host) {
+ if (stimer->count == 0)
+ stimer->config.enable = 0;
+ else if (stimer->config.auto_enable)
+ stimer->config.enable = 1;
+ }
+
+ if (stimer->config.enable)
+ stimer_mark_pending(stimer, false);
+
+ return 0;
+}
+
+static int stimer_get_config(struct kvm_vcpu_hv_stimer *stimer, u64 *pconfig)
+{
+ *pconfig = stimer->config.as_uint64;
+ return 0;
+}
+
+static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount)
+{
+ *pcount = stimer->count;
+ return 0;
+}
+
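+/*
+ * Write a message into the guest's SynIC message page slot for 'sint'. If
+ * the slot is still busy (message_type != HVMSG_NONE), set the msg_pending
+ * flag in the pending message's header and return -EAGAIN so the caller can
+ * retry once the guest acknowledges the slot via EOM (see
+ * kvm_hv_notify_acked_sint()), unless 'no_retry' is set, in which case the
+ * message is silently dropped.
+ */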
+static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint,
+ struct hv_message *src_msg, bool no_retry)
+{
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+ int msg_off = offsetof(struct hv_message_page, sint_message[sint]);
+ gfn_t msg_page_gfn;
+ struct hv_message_header hv_hdr;
+ int r;
+
+ if (!(synic->msg_page & HV_SYNIC_SIMP_ENABLE))
+ return -ENOENT;
+
+ msg_page_gfn = synic->msg_page >> PAGE_SHIFT;
+
+ /*
+ * Strictly following the spec-mandated ordering would assume setting
+ * .msg_pending before checking .message_type. However, this function
+ * is only called in vcpu context so the entire update is atomic from
+ * guest POV and thus the exact order here doesn't matter.
+ */
+ r = kvm_vcpu_read_guest_page(vcpu, msg_page_gfn, &hv_hdr.message_type,
+ msg_off + offsetof(struct hv_message,
+ header.message_type),
+ sizeof(hv_hdr.message_type));
+ if (r < 0)
+ return r;
+
+ if (hv_hdr.message_type != HVMSG_NONE) {
+ if (no_retry)
+ return 0;
+
+ hv_hdr.message_flags.msg_pending = 1;
+ r = kvm_vcpu_write_guest_page(vcpu, msg_page_gfn,
+ &hv_hdr.message_flags,
+ msg_off +
+ offsetof(struct hv_message,
+ header.message_flags),
+ sizeof(hv_hdr.message_flags));
+ if (r < 0)
+ return r;
+ return -EAGAIN;
+ }
+
+ r = kvm_vcpu_write_guest_page(vcpu, msg_page_gfn, src_msg, msg_off,
+ sizeof(src_msg->header) +
+ src_msg->header.payload_size);
+ if (r < 0)
+ return r;
+
+ r = synic_set_irq(synic, sint);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -EFAULT;
+ return 0;
+}
+
+static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+ struct hv_message *msg = &stimer->msg;
+ struct hv_timer_message_payload *payload =
+ (struct hv_timer_message_payload *)&msg->u.payload;
+
+ /*
+ * To avoid piling up periodic ticks, don't retry message
+ * delivery for them (within the "lazy" lost-ticks policy).
+ */
+ bool no_retry = stimer->config.periodic;
+
+ payload->expiration_time = stimer->exp_time;
+ payload->delivery_time = get_time_ref_counter(vcpu->kvm);
+ return synic_deliver_msg(to_hv_synic(vcpu),
+ stimer->config.sintx, msg,
+ no_retry);
+}
+
+static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+ struct kvm_lapic_irq irq = {
+ .delivery_mode = APIC_DM_FIXED,
+ .vector = stimer->config.apic_vector
+ };
+
+ if (lapic_in_kernel(vcpu))
+ return !kvm_apic_set_irq(vcpu, &irq, NULL);
+ return 0;
+}
+
+static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
+{
+ int r, direct = stimer->config.direct_mode;
+
+ stimer->msg_pending = true;
+ if (!direct)
+ r = stimer_send_msg(stimer);
+ else
+ r = stimer_notify_direct(stimer);
+ trace_kvm_hv_stimer_expiration(hv_stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index, direct, r);
+ if (!r) {
+ stimer->msg_pending = false;
+ if (!(stimer->config.periodic))
+ stimer->config.enable = 0;
+ }
+}
+
+void kvm_hv_process_stimers(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv_stimer *stimer;
+ u64 time_now, exp_time;
+ int i;
+
+ if (!hv_vcpu)
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+ if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) {
+ stimer = &hv_vcpu->stimer[i];
+ if (stimer->config.enable) {
+ exp_time = stimer->exp_time;
+
+ if (exp_time) {
+ time_now =
+ get_time_ref_counter(vcpu->kvm);
+ if (time_now >= exp_time)
+ stimer_expiration(stimer);
+ }
+
+ if ((stimer->config.enable) &&
+ stimer->count) {
+ if (!stimer->msg_pending)
+ stimer_start(stimer);
+ } else
+ stimer_cleanup(stimer);
+ }
+ }
+}
+
+void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ int i;
+
+ if (!hv_vcpu)
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+ stimer_cleanup(&hv_vcpu->stimer[i]);
+
+ kfree(hv_vcpu);
+ vcpu->arch.hyperv = NULL;
+}
+
+bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu)
+ return false;
+
+ if (!(hv_vcpu->hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE))
+ return false;
+ return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
+}
+EXPORT_SYMBOL_GPL(kvm_hv_assist_page_enabled);
+
+int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu || !kvm_hv_assist_page_enabled(vcpu))
+ return -EFAULT;
+
+ return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data,
+ &hv_vcpu->vp_assist_page, sizeof(struct hv_vp_assist_page));
+}
+EXPORT_SYMBOL_GPL(kvm_hv_get_assist_page);
+
+static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct hv_message *msg = &stimer->msg;
+ struct hv_timer_message_payload *payload =
+ (struct hv_timer_message_payload *)&msg->u.payload;
+
+ memset(&msg->header, 0, sizeof(msg->header));
+ msg->header.message_type = HVMSG_TIMER_EXPIRED;
+ msg->header.payload_size = sizeof(*payload);
+
+ payload->timer_index = stimer->index;
+ payload->expiration_time = 0;
+ payload->delivery_time = 0;
+}
+
+static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index)
+{
+ memset(stimer, 0, sizeof(*stimer));
+ stimer->index = timer_index;
+ hrtimer_init(&stimer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ stimer->timer.function = stimer_timer_callback;
+ stimer_prepare_msg(stimer);
+}
+
+int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ int i;
+
+ if (hv_vcpu)
+ return 0;
+
+ hv_vcpu = kzalloc(sizeof(struct kvm_vcpu_hv), GFP_KERNEL_ACCOUNT);
+ if (!hv_vcpu)
+ return -ENOMEM;
+
+ vcpu->arch.hyperv = hv_vcpu;
+ hv_vcpu->vcpu = vcpu;
+
+ synic_init(&hv_vcpu->synic);
+
+ bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
+ for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+ stimer_init(&hv_vcpu->stimer[i], i);
+
+ hv_vcpu->vp_index = vcpu->vcpu_idx;
+
+ for (i = 0; i < HV_NR_TLB_FLUSH_FIFOS; i++) {
+ INIT_KFIFO(hv_vcpu->tlb_flush_fifo[i].entries);
+ spin_lock_init(&hv_vcpu->tlb_flush_fifo[i].write_lock);
+ }
+
+ return 0;
+}
+
+int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
+{
+ struct kvm_vcpu_hv_synic *synic;
+ int r;
+
+ r = kvm_hv_vcpu_init(vcpu);
+ if (r)
+ return r;
+
+ synic = to_hv_synic(vcpu);
+
+ synic->active = true;
+ synic->dont_zero_synic_pages = dont_zero_synic_pages;
+ synic->control = HV_SYNIC_CONTROL_ENABLE;
+ return 0;
+}
+
+static bool kvm_hv_msr_partition_wide(u32 msr)
+{
+ bool r = false;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ case HV_X64_MSR_HYPERCALL:
+ case HV_X64_MSR_REFERENCE_TSC:
+ case HV_X64_MSR_TIME_REF_COUNT:
+ case HV_X64_MSR_CRASH_CTL:
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_RESET:
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ r = true;
+ break;
+ }
+
+ return r;
+}
+
+static int kvm_hv_msr_get_crash_data(struct kvm *kvm, u32 index, u64 *pdata)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ size_t size = ARRAY_SIZE(hv->hv_crash_param);
+
+ if (WARN_ON_ONCE(index >= size))
+ return -EINVAL;
+
+ *pdata = hv->hv_crash_param[array_index_nospec(index, size)];
+ return 0;
+}
+
+static int kvm_hv_msr_get_crash_ctl(struct kvm *kvm, u64 *pdata)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ *pdata = hv->hv_crash_ctl;
+ return 0;
+}
+
+static int kvm_hv_msr_set_crash_ctl(struct kvm *kvm, u64 data)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY;
+
+ return 0;
+}
+
+static int kvm_hv_msr_set_crash_data(struct kvm *kvm, u32 index, u64 data)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ size_t size = ARRAY_SIZE(hv->hv_crash_param);
+
+ if (WARN_ON_ONCE(index >= size))
+ return -EINVAL;
+
+ hv->hv_crash_param[array_index_nospec(index, size)] = data;
+ return 0;
+}
+
+/*
+ * The kvmclock and Hyper-V TSC page use similar formulas, and converting
+ * between them is possible:
+ *
+ * kvmclock formula:
+ * nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32)
+ * + system_time
+ *
+ * Hyper-V formula:
+ * nsec/100 = ticks * scale / 2^64 + offset
+ *
+ * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula.
+ * By dividing the kvmclock formula by 100 and equating what's left we get:
+ * ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * scale / 2^64 = tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * scale = tsc_to_system_mul * 2^(32+tsc_shift) / 100
+ *
+ * Now expand the kvmclock formula and divide by 100:
+ * nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32)
+ * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32)
+ * + system_time
+ * nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * + system_time / 100
+ *
+ * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64:
+ * nsec/100 = ticks * scale / 2^64
+ * - tsc_timestamp * scale / 2^64
+ * + system_time / 100
+ *
+ * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out:
+ * offset = system_time / 100 - tsc_timestamp * scale / 2^64
+ *
+ * These two equivalencies are implemented in this function.
+ */
+static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
+ struct ms_hyperv_tsc_page *tsc_ref)
+{
+ u64 max_mul;
+
+ if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT))
+ return false;
+
+ /*
+ * Check whether the scale would overflow; if so, fall back to the time
+ * ref counter:
+ * tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64
+ * tsc_to_system_mul / 100 >= 2^(32-tsc_shift)
+ * tsc_to_system_mul >= 100 * 2^(32-tsc_shift)
+ */
+ max_mul = 100ull << (32 - hv_clock->tsc_shift);
+ if (hv_clock->tsc_to_system_mul >= max_mul)
+ return false;
+
+ /*
+ * Otherwise compute the scale and offset according to the formulas
+ * derived above.
+ */
+ tsc_ref->tsc_scale =
+ mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift),
+ hv_clock->tsc_to_system_mul,
+ 100);
+
+ tsc_ref->tsc_offset = hv_clock->system_time;
+ do_div(tsc_ref->tsc_offset, 100);
+ tsc_ref->tsc_offset -=
+ mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64);
+ return true;
+}
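+
+/*
+ * A worked instance of the derivation above, with illustrative numbers that
+ * are not part of the original source: a 1 GHz guest TSC (1 tick = 1 ns)
+ * can be encoded by kvmclock as tsc_to_system_mul = 2^31, tsc_shift = 1,
+ * since 2^31 * 2^(1-32) = 1. The overflow check passes, as
+ * 2^31 < max_mul = 100 * 2^(32-1), and
+ *
+ *   scale = 2^31 * 2^(32+1) / 100 = 2^64 / 100
+ *
+ * so ticks * scale / 2^64 = ticks / 100, i.e. nanoseconds expressed in the
+ * 100ns units of the reference counter.
+ */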
+
+/*
+ * Don't touch TSC page values if the guest has opted for TSC emulation after
+ * migration. KVM doesn't fully support reenlightenment notifications or TSC
+ * access emulation, and Hyper-V is known to expect the values in the TSC page
+ * to stay constant until TSC access emulation is disabled from the guest side
+ * (HV_X64_MSR_TSC_EMULATION_STATUS). KVM userspace is expected to preserve
+ * the TSC frequency and the guest-visible TSC value across migration (and to
+ * prevent migration when TSC scaling is unsupported).
+ */
+static inline bool tsc_page_update_unsafe(struct kvm_hv *hv)
+{
+ return (hv->hv_tsc_page_status != HV_TSC_PAGE_GUEST_CHANGED) &&
+ hv->hv_tsc_emulation_control;
+}
+
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+ struct pvclock_vcpu_time_info *hv_clock)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ u32 tsc_seq;
+ u64 gfn;
+
+ BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence));
+ BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0);
+
+ mutex_lock(&hv->hv_lock);
+
+ if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN ||
+ hv->hv_tsc_page_status == HV_TSC_PAGE_SET ||
+ hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET)
+ goto out_unlock;
+
+ if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+ goto out_unlock;
+
+ gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+ /*
+ * Because the TSC parameters only vary when there is a
+ * change in the master clock, do not bother with caching.
+ */
+ if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
+ &tsc_seq, sizeof(tsc_seq))))
+ goto out_err;
+
+ if (tsc_seq && tsc_page_update_unsafe(hv)) {
+ if (kvm_read_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
+ goto out_err;
+
+ hv->hv_tsc_page_status = HV_TSC_PAGE_SET;
+ goto out_unlock;
+ }
+
+ /*
+ * While we're computing and writing the parameters, force the
+ * guest to use the time reference count MSR.
+ */
+ hv->tsc_ref.tsc_sequence = 0;
+ if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+ &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+ goto out_err;
+
+ if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref))
+ goto out_err;
+
+ /* Ensure sequence is zero before writing the rest of the struct. */
+ smp_wmb();
+ if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
+ goto out_err;
+
+ /*
+ * Now switch to the TSC page mechanism by writing the sequence.
+ */
+ tsc_seq++;
+ if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0)
+ tsc_seq = 1;
+
+ /* Write the struct entirely before the non-zero sequence. */
+ smp_wmb();
+
+ hv->tsc_ref.tsc_sequence = tsc_seq;
+ if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+ &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+ goto out_err;
+
+ hv->hv_tsc_page_status = HV_TSC_PAGE_SET;
+ goto out_unlock;
+
+out_err:
+ hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN;
+out_unlock:
+ mutex_unlock(&hv->hv_lock);
+}
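+
+/*
+ * For reference, a minimal sketch of the guest-side counterpart of the
+ * sequence protocol above, per the TLFS (helper names are illustrative, not
+ * part of KVM, and memory barriers are elided for brevity):
+ *
+ *	do {
+ *		seq = READ_ONCE(tsc_page->tsc_sequence);
+ *		if (!seq)
+ *			return read_time_ref_count_msr();
+ *		scale = READ_ONCE(tsc_page->tsc_scale);
+ *		offset = READ_ONCE(tsc_page->tsc_offset);
+ *		ref = mul_u64_u64_shr(rdtsc(), scale, 64) + offset;
+ *	} while (READ_ONCE(tsc_page->tsc_sequence) != seq);
+ *
+ * KVM zeroes the sequence before rewriting scale/offset and bumps it to a
+ * value other than 0 and 0xFFFFFFFF afterwards, so a reader either sees a
+ * consistent triple or falls back to the reference counter MSR.
+ */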
+
+void kvm_hv_request_tsc_page_update(struct kvm *kvm)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ mutex_lock(&hv->hv_lock);
+
+ if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET &&
+ !tsc_page_update_unsafe(hv))
+ hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED;
+
+ mutex_unlock(&hv->hv_lock);
+}
+
+static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr)
+{
+ if (!hv_vcpu->enforce_cpuid)
+ return true;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ case HV_X64_MSR_HYPERCALL:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_HYPERCALL_AVAILABLE;
+ case HV_X64_MSR_VP_RUNTIME:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_VP_RUNTIME_AVAILABLE;
+ case HV_X64_MSR_TIME_REF_COUNT:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_TIME_REF_COUNT_AVAILABLE;
+ case HV_X64_MSR_VP_INDEX:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_VP_INDEX_AVAILABLE;
+ case HV_X64_MSR_RESET:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_RESET_AVAILABLE;
+ case HV_X64_MSR_REFERENCE_TSC:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_REFERENCE_TSC_AVAILABLE;
+ case HV_X64_MSR_SCONTROL:
+ case HV_X64_MSR_SVERSION:
+ case HV_X64_MSR_SIEFP:
+ case HV_X64_MSR_SIMP:
+ case HV_X64_MSR_EOM:
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_SYNIC_AVAILABLE;
+ case HV_X64_MSR_STIMER0_CONFIG:
+ case HV_X64_MSR_STIMER1_CONFIG:
+ case HV_X64_MSR_STIMER2_CONFIG:
+ case HV_X64_MSR_STIMER3_CONFIG:
+ case HV_X64_MSR_STIMER0_COUNT:
+ case HV_X64_MSR_STIMER1_COUNT:
+ case HV_X64_MSR_STIMER2_COUNT:
+ case HV_X64_MSR_STIMER3_COUNT:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_SYNTIMER_AVAILABLE;
+ case HV_X64_MSR_EOI:
+ case HV_X64_MSR_ICR:
+ case HV_X64_MSR_TPR:
+ case HV_X64_MSR_VP_ASSIST_PAGE:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_APIC_ACCESS_AVAILABLE;
+ case HV_X64_MSR_TSC_FREQUENCY:
+ case HV_X64_MSR_APIC_FREQUENCY:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_ACCESS_FREQUENCY_MSRS;
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_ACCESS_REENLIGHTENMENT;
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_ACCESS_TSC_INVARIANT;
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_CRASH_CTL:
+ return hv_vcpu->cpuid_cache.features_edx &
+ HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ return hv_vcpu->cpuid_cache.features_edx &
+ HV_FEATURE_DEBUG_MSRS_AVAILABLE;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
+ bool host)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr)))
+ return 1;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ hv->hv_guest_os_id = data;
+ /* setting the guest OS ID to zero disables the hypercall page */
+ if (!hv->hv_guest_os_id)
+ hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
+ break;
+ case HV_X64_MSR_HYPERCALL: {
+ u8 instructions[9];
+ int i = 0;
+ u64 addr;
+
+ /* if the guest OS ID is not set, the hypercall page must remain disabled */
+ if (!hv->hv_guest_os_id)
+ break;
+ if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
+ hv->hv_hypercall = data;
+ break;
+ }
+
+ /*
+ * If Xen and Hyper-V hypercalls are both enabled, disambiguate
+ * the same way Xen itself does, by setting bit 31 of EAX
+ * which is RsvdZ in the 32-bit Hyper-V hypercall ABI and just
+ * going to be clobbered on 64-bit.
+ */
+ if (kvm_xen_hypercall_enabled(kvm)) {
+ /* orl $0x80000000, %eax */
+ instructions[i++] = 0x0d;
+ instructions[i++] = 0x00;
+ instructions[i++] = 0x00;
+ instructions[i++] = 0x00;
+ instructions[i++] = 0x80;
+ }
+
+ /* vmcall/vmmcall */
+ static_call(kvm_x86_patch_hypercall)(vcpu, instructions + i);
+ i += 3;
+
+ /* ret */
+ ((unsigned char *)instructions)[i++] = 0xc3;
+
+ addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK;
+ if (kvm_vcpu_write_guest(vcpu, addr, instructions, i))
+ return 1;
+ hv->hv_hypercall = data;
+ break;
+ }
+ case HV_X64_MSR_REFERENCE_TSC:
+ hv->hv_tsc_page = data;
+ if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) {
+ if (!host)
+ hv->hv_tsc_page_status = HV_TSC_PAGE_GUEST_CHANGED;
+ else
+ hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED;
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+ } else {
+ hv->hv_tsc_page_status = HV_TSC_PAGE_UNSET;
+ }
+ break;
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ return kvm_hv_msr_set_crash_data(kvm,
+ msr - HV_X64_MSR_CRASH_P0,
+ data);
+ case HV_X64_MSR_CRASH_CTL:
+ if (host)
+ return kvm_hv_msr_set_crash_ctl(kvm, data);
+
+ if (data & HV_CRASH_CTL_CRASH_NOTIFY) {
+ vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n",
+ hv->hv_crash_param[0],
+ hv->hv_crash_param[1],
+ hv->hv_crash_param[2],
+ hv->hv_crash_param[3],
+ hv->hv_crash_param[4]);
+
+ /* Send notification about crash to user space */
+ kvm_make_request(KVM_REQ_HV_CRASH, vcpu);
+ }
+ break;
+ case HV_X64_MSR_RESET:
+ if (data == 1) {
+ vcpu_debug(vcpu, "hyper-v reset requested\n");
+ kvm_make_request(KVM_REQ_HV_RESET, vcpu);
+ }
+ break;
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ hv->hv_reenlightenment_control = data;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ hv->hv_tsc_emulation_control = data;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ if (data && !host)
+ return 1;
+
+ hv->hv_tsc_emulation_status = data;
+ break;
+ case HV_X64_MSR_TIME_REF_COUNT:
+ /* read-only, but still ignore it if host-initiated */
+ if (!host)
+ return 1;
+ break;
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ /* Only bit 0 is supported */
+ if (data & ~HV_EXPOSE_INVARIANT_TSC)
+ return 1;
+
+ /* The feature can't be disabled from the guest */
+ if (!host && hv->hv_invtsc_control && !data)
+ return 1;
+
+ hv->hv_invtsc_control = data;
+ break;
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ return syndbg_set_msr(vcpu, msr, data, host);
+ default:
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+ return 1;
+ }
+ return 0;
+}
+
+/* Calculate CPU time spent by the current task, in 100ns units */
+static u64 current_task_runtime_100ns(void)
+{
+ u64 utime, stime;
+
+ task_cputime_adjusted(current, &utime, &stime);
+
+ return div_u64(utime + stime, 100);
+}
+
+static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr)))
+ return 1;
+
+ switch (msr) {
+ case HV_X64_MSR_VP_INDEX: {
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+ u32 new_vp_index = (u32)data;
+
+ if (!host || new_vp_index >= KVM_MAX_VCPUS)
+ return 1;
+
+ if (new_vp_index == hv_vcpu->vp_index)
+ return 0;
+
+ /*
+ * The VP index is initialized to vcpu_idx by
+ * kvm_hv_vcpu_init(), so they initially match. Now that the
+ * VP index is changing, adjust num_mismatched_vp_indexes if
+ * it now matches or no longer matches vcpu_idx.
+ */
+ if (hv_vcpu->vp_index == vcpu->vcpu_idx)
+ atomic_inc(&hv->num_mismatched_vp_indexes);
+ else if (new_vp_index == vcpu->vcpu_idx)
+ atomic_dec(&hv->num_mismatched_vp_indexes);
+
+ hv_vcpu->vp_index = new_vp_index;
+ break;
+ }
+ case HV_X64_MSR_VP_ASSIST_PAGE: {
+ u64 gfn;
+ unsigned long addr;
+
+ if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) {
+ hv_vcpu->hv_vapic = data;
+ if (kvm_lapic_set_pv_eoi(vcpu, 0, 0))
+ return 1;
+ break;
+ }
+ gfn = data >> HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT;
+ addr = kvm_vcpu_gfn_to_hva(vcpu, gfn);
+ if (kvm_is_error_hva(addr))
+ return 1;
+
+ /*
+ * Clear only the apic_assist portion of struct
+ * hv_vp_assist_page; the rest may hold valuable data that
+ * needs to be preserved, e.g. across migration.
+ */
+ if (__put_user(0, (u32 __user *)addr))
+ return 1;
+ hv_vcpu->hv_vapic = data;
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ if (kvm_lapic_set_pv_eoi(vcpu,
+ gfn_to_gpa(gfn) | KVM_MSR_ENABLED,
+ sizeof(struct hv_vp_assist_page)))
+ return 1;
+ break;
+ }
+ case HV_X64_MSR_EOI:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
+ case HV_X64_MSR_ICR:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
+ case HV_X64_MSR_TPR:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
+ case HV_X64_MSR_VP_RUNTIME:
+ if (!host)
+ return 1;
+ hv_vcpu->runtime_offset = data - current_task_runtime_100ns();
+ break;
+ case HV_X64_MSR_SCONTROL:
+ case HV_X64_MSR_SVERSION:
+ case HV_X64_MSR_SIEFP:
+ case HV_X64_MSR_SIMP:
+ case HV_X64_MSR_EOM:
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ return synic_set_msr(to_hv_synic(vcpu), msr, data, host);
+ case HV_X64_MSR_STIMER0_CONFIG:
+ case HV_X64_MSR_STIMER1_CONFIG:
+ case HV_X64_MSR_STIMER2_CONFIG:
+ case HV_X64_MSR_STIMER3_CONFIG: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
+
+ return stimer_set_config(to_hv_stimer(vcpu, timer_index),
+ data, host);
+ }
+ case HV_X64_MSR_STIMER0_COUNT:
+ case HV_X64_MSR_STIMER1_COUNT:
+ case HV_X64_MSR_STIMER2_COUNT:
+ case HV_X64_MSR_STIMER3_COUNT: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
+
+ return stimer_set_count(to_hv_stimer(vcpu, timer_index),
+ data, host);
+ }
+ case HV_X64_MSR_TSC_FREQUENCY:
+ case HV_X64_MSR_APIC_FREQUENCY:
+ /* read-only, but still ignore it if host-initiated */
+ if (!host)
+ return 1;
+ break;
+ default:
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
+ bool host)
+{
+ u64 data = 0;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr)))
+ return 1;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ data = hv->hv_guest_os_id;
+ break;
+ case HV_X64_MSR_HYPERCALL:
+ data = hv->hv_hypercall;
+ break;
+ case HV_X64_MSR_TIME_REF_COUNT:
+ data = get_time_ref_counter(kvm);
+ break;
+ case HV_X64_MSR_REFERENCE_TSC:
+ data = hv->hv_tsc_page;
+ break;
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ return kvm_hv_msr_get_crash_data(kvm,
+ msr - HV_X64_MSR_CRASH_P0,
+ pdata);
+ case HV_X64_MSR_CRASH_CTL:
+ return kvm_hv_msr_get_crash_ctl(kvm, pdata);
+ case HV_X64_MSR_RESET:
+ data = 0;
+ break;
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ data = hv->hv_reenlightenment_control;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ data = hv->hv_tsc_emulation_control;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ data = hv->hv_tsc_emulation_status;
+ break;
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ data = hv->hv_invtsc_control;
+ break;
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ return syndbg_get_msr(vcpu, msr, pdata, host);
+ default:
+ kvm_pr_unimpl_rdmsr(vcpu, msr);
+ return 1;
+ }
+
+ *pdata = data;
+ return 0;
+}
+
+static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
+ bool host)
+{
+ u64 data = 0;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr)))
+ return 1;
+
+ switch (msr) {
+ case HV_X64_MSR_VP_INDEX:
+ data = hv_vcpu->vp_index;
+ break;
+ case HV_X64_MSR_EOI:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
+ case HV_X64_MSR_ICR:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
+ case HV_X64_MSR_TPR:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
+ case HV_X64_MSR_VP_ASSIST_PAGE:
+ data = hv_vcpu->hv_vapic;
+ break;
+ case HV_X64_MSR_VP_RUNTIME:
+ data = current_task_runtime_100ns() + hv_vcpu->runtime_offset;
+ break;
+ case HV_X64_MSR_SCONTROL:
+ case HV_X64_MSR_SVERSION:
+ case HV_X64_MSR_SIEFP:
+ case HV_X64_MSR_SIMP:
+ case HV_X64_MSR_EOM:
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ return synic_get_msr(to_hv_synic(vcpu), msr, pdata, host);
+ case HV_X64_MSR_STIMER0_CONFIG:
+ case HV_X64_MSR_STIMER1_CONFIG:
+ case HV_X64_MSR_STIMER2_CONFIG:
+ case HV_X64_MSR_STIMER3_CONFIG: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
+
+ return stimer_get_config(to_hv_stimer(vcpu, timer_index),
+ pdata);
+ }
+ case HV_X64_MSR_STIMER0_COUNT:
+ case HV_X64_MSR_STIMER1_COUNT:
+ case HV_X64_MSR_STIMER2_COUNT:
+ case HV_X64_MSR_STIMER3_COUNT: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
+
+ return stimer_get_count(to_hv_stimer(vcpu, timer_index),
+ pdata);
+ }
+ case HV_X64_MSR_TSC_FREQUENCY:
+ data = (u64)vcpu->arch.virtual_tsc_khz * 1000;
+ break;
+ case HV_X64_MSR_APIC_FREQUENCY:
+ data = APIC_BUS_FREQUENCY;
+ break;
+ default:
+ kvm_pr_unimpl_rdmsr(vcpu, msr);
+ return 1;
+ }
+ *pdata = data;
+ return 0;
+}
+
+int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
+{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+
+ if (!host && !vcpu->arch.hyperv_enabled)
+ return 1;
+
+ if (kvm_hv_vcpu_init(vcpu))
+ return 1;
+
+ if (kvm_hv_msr_partition_wide(msr)) {
+ int r;
+
+ mutex_lock(&hv->hv_lock);
+ r = kvm_hv_set_msr_pw(vcpu, msr, data, host);
+ mutex_unlock(&hv->hv_lock);
+ return r;
+ } else
+ return kvm_hv_set_msr(vcpu, msr, data, host);
+}
+
+int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
+{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+
+ if (!host && !vcpu->arch.hyperv_enabled)
+ return 1;
+
+ if (kvm_hv_vcpu_init(vcpu))
+ return 1;
+
+ if (kvm_hv_msr_partition_wide(msr)) {
+ int r;
+
+ mutex_lock(&hv->hv_lock);
+ r = kvm_hv_get_msr_pw(vcpu, msr, pdata, host);
+ mutex_unlock(&hv->hv_lock);
+ return r;
+ } else
+ return kvm_hv_get_msr(vcpu, msr, pdata, host);
+}
+
+static void sparse_set_to_vcpu_mask(struct kvm *kvm, u64 *sparse_banks,
+ u64 valid_bank_mask, unsigned long *vcpu_mask)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ bool has_mismatch = atomic_read(&hv->num_mismatched_vp_indexes);
+ u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
+ struct kvm_vcpu *vcpu;
+ int bank, sbank = 0;
+ unsigned long i;
+ u64 *bitmap;
+
+ BUILD_BUG_ON(sizeof(vp_bitmap) >
+ sizeof(*vcpu_mask) * BITS_TO_LONGS(KVM_MAX_VCPUS));
+
+ /*
+ * If vp_index == vcpu_idx for all vCPUs, fill vcpu_mask directly, else
+ * fill a temporary buffer and manually test each vCPU's VP index.
+ */
+ if (likely(!has_mismatch))
+ bitmap = (u64 *)vcpu_mask;
+ else
+ bitmap = vp_bitmap;
+
+ /*
+ * Each set of 64 VPs is packed into sparse_banks, with valid_bank_mask
+ * having a '1' for each bank that exists in sparse_banks. Sets must
+ * be in ascending order, i.e. bank0..bankN.
+ */
+ memset(bitmap, 0, sizeof(vp_bitmap));
+ for_each_set_bit(bank, (unsigned long *)&valid_bank_mask,
+ KVM_HV_MAX_SPARSE_VCPU_SET_BITS)
+ bitmap[bank] = sparse_banks[sbank++];
+
+ if (likely(!has_mismatch))
+ return;
+
+ bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (test_bit(kvm_hv_get_vpindex(vcpu), (unsigned long *)vp_bitmap))
+ __set_bit(i, vcpu_mask);
+ }
+}
+
+static bool hv_is_vp_in_sparse_set(u32 vp_id, u64 valid_bank_mask, u64 sparse_banks[])
+{
+ int valid_bit_nr = vp_id / HV_VCPUS_PER_SPARSE_BANK;
+ unsigned long sbank;
+
+ if (!test_bit(valid_bit_nr, (unsigned long *)&valid_bank_mask))
+ return false;
+
+ /*
+ * The index into the sparse bank is the number of preceding bits in
+ * the valid mask. Optimize for VMs with <64 vCPUs by skipping the
+ * fancy math if there can't possibly be preceding bits.
+ */
+ if (valid_bit_nr)
+ sbank = hweight64(valid_bank_mask & GENMASK_ULL(valid_bit_nr - 1, 0));
+ else
+ sbank = 0;
+
+ return test_bit(vp_id % HV_VCPUS_PER_SPARSE_BANK,
+ (unsigned long *)&sparse_banks[sbank]);
+}
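+
+/*
+ * Worked example of the sparse-set indexing above (numbers illustrative, not
+ * from the original source): with valid_bank_mask = 0b101 (banks 0 and 2
+ * present, so sparse_banks[] holds two entries) and vp_id = 130:
+ * valid_bit_nr = 130 / 64 = 2 and is set in the mask; one valid bank
+ * precedes it, so sbank = hweight64(0b001) = 1; the final test is of bit
+ * 130 % 64 = 2 in sparse_banks[1].
+ */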
+
+struct kvm_hv_hcall {
+ /* Hypercall input data */
+ u64 param;
+ u64 ingpa;
+ u64 outgpa;
+ u16 code;
+ u16 var_cnt;
+ u16 rep_cnt;
+ u16 rep_idx;
+ bool fast;
+ bool rep;
+ sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS];
+
+ /*
+ * Current read offset when KVM reads hypercall input data gradually:
+ * either the offset in bytes from 'ingpa' for regular hypercalls, or
+ * the number of already consumed 'XMM halves' for 'fast' hypercalls.
+ */
+ union {
+ gpa_t data_offset;
+ int consumed_xmm_halves;
+ };
+};
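+
+/*
+ * 'param' above is the TLFS hypercall input value as decoded by
+ * kvm_hv_hypercall(): call code in bits 15:0, the 'fast' flag in bit 16,
+ * variable header size ('var_cnt') in bits 17-26, rep count in bits 32-43
+ * and rep start index in bits 48-59; all other bits are reserved and are
+ * checked against HV_HYPERCALL_RSVD_MASK.
+ */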
+
+
+static int kvm_hv_get_hc_data(struct kvm *kvm, struct kvm_hv_hcall *hc,
+ u16 orig_cnt, u16 cnt_cap, u64 *data)
+{
+ /*
+ * Preserve the original count when ignoring entries via a "cap": KVM
+ * still needs to validate the guest input (though the non-XMM path
+ * punts on the checks).
+ */
+ u16 cnt = min(orig_cnt, cnt_cap);
+ int i, j;
+
+ if (hc->fast) {
+ /*
+ * Each XMM holds two sparse banks, but do not count halves that
+ * have already been consumed for hypercall parameters.
+ */
+ if (orig_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - hc->consumed_xmm_halves)
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ for (i = 0; i < cnt; i++) {
+ j = i + hc->consumed_xmm_halves;
+ if (j % 2)
+ data[i] = sse128_hi(hc->xmm[j / 2]);
+ else
+ data[i] = sse128_lo(hc->xmm[j / 2]);
+ }
+ return 0;
+ }
+
+ return kvm_read_guest(kvm, hc->ingpa + hc->data_offset, data,
+ cnt * sizeof(*data));
+}
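+
+/*
+ * Example of the XMM-half bookkeeping above (illustrative, not from the
+ * original source): for a 'fast' hypercall that has already consumed
+ * consumed_xmm_halves = 2, i.e. all of xmm[0], data[0] is read from
+ * sse128_lo(hc->xmm[1]) (j = 2), data[1] from sse128_hi(hc->xmm[1])
+ * (j = 3), and so on.
+ */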
+
+static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc,
+ u64 *sparse_banks)
+{
+ if (hc->var_cnt > HV_MAX_SPARSE_VCPU_BANKS)
+ return -EINVAL;
+
+ /* Cap var_cnt to ignore banks that cannot contain a legal VP index. */
+ return kvm_hv_get_hc_data(kvm, hc, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS,
+ sparse_banks);
+}
+
+static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 entries[])
+{
+ return kvm_hv_get_hc_data(kvm, hc, hc->rep_cnt, hc->rep_cnt, entries);
+}
+
+static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo,
+ u64 *entries, int count)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u64 flush_all_entry = KVM_HV_TLB_FLUSHALL_ENTRY;
+
+ if (!hv_vcpu)
+ return;
+
+ spin_lock(&tlb_flush_fifo->write_lock);
+
+ /*
+ * All entries should fit in the fifo while leaving one slot free for
+ * the 'flush all' entry, in case another request comes in. If there is
+ * not enough space, just put the 'flush all' entry there instead.
+ */
+ if (count && entries && count < kfifo_avail(&tlb_flush_fifo->entries)) {
+ WARN_ON(kfifo_in(&tlb_flush_fifo->entries, entries, count) != count);
+ goto out_unlock;
+ }
+
+ /*
+ * Note: a full fifo always contains the 'flush all' entry, so there is
+ * no need to check the return value.
+ */
+ kfifo_in(&tlb_flush_fifo->entries, &flush_all_entry, 1);
+
+out_unlock:
+ spin_unlock(&tlb_flush_fifo->write_lock);
+}
+
+int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u64 entries[KVM_HV_TLB_FLUSH_FIFO_SIZE];
+ int i, j, count;
+ gva_t gva;
+
+ if (!tdp_enabled || !hv_vcpu)
+ return -EINVAL;
+
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu));
+
+ count = kfifo_out(&tlb_flush_fifo->entries, entries, KVM_HV_TLB_FLUSH_FIFO_SIZE);
+
+ for (i = 0; i < count; i++) {
+ if (entries[i] == KVM_HV_TLB_FLUSHALL_ENTRY)
+ goto out_flush_all;
+
+ /*
+ * Lower 12 bits of 'address' encode the number of additional
+ * pages to flush.
+ */
+ gva = entries[i] & PAGE_MASK;
+ for (j = 0; j < (entries[i] & ~PAGE_MASK) + 1; j++)
+ static_call(kvm_x86_flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE);
+
+ ++vcpu->stat.tlb_flush;
+ }
+ return 0;
+
+out_flush_all:
+ kfifo_reset_out(&tlb_flush_fifo->entries);
+
+ /* Fall back to full flush. */
+ return -ENOSPC;
+}
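+
+/*
+ * Illustrative decoding of a fifo entry (example value not from the original
+ * source): entry = 0x7f1234003 yields gva = 0x7f1234000 (entry & PAGE_MASK)
+ * and 3 additional pages (entry & ~PAGE_MASK), so the loop above flushes the
+ * four pages at 0x7f1234000, 0x7f1235000, 0x7f1236000 and 0x7f1237000.
+ */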
+
+static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u64 *sparse_banks = hv_vcpu->sparse_banks;
+ struct kvm *kvm = vcpu->kvm;
+ struct hv_tlb_flush_ex flush_ex;
+ struct hv_tlb_flush flush;
+ DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
+ /*
+ * Normally, there can be no more than 'KVM_HV_TLB_FLUSH_FIFO_SIZE'
+ * entries on the TLB flush fifo. The last slot, however, always needs
+ * to be left free for the 'flush all' entry, which gets placed there
+ * when there is not enough space for all the requested entries.
+ */
+ u64 __tlb_flush_entries[KVM_HV_TLB_FLUSH_FIFO_SIZE - 1];
+ u64 *tlb_flush_entries;
+ u64 valid_bank_mask;
+ struct kvm_vcpu *v;
+ unsigned long i;
+ bool all_cpus;
+
+ /*
+ * The Hyper-V TLFS doesn't allow more than HV_MAX_SPARSE_VCPU_BANKS
+ * sparse banks. Fail the build if KVM's max allowed number of
+ * vCPUs (>4096) exceeds this limit.
+ */
+ BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > HV_MAX_SPARSE_VCPU_BANKS);
+
+ /*
+ * 'Slow' hypercall's first parameter is the address in guest's memory
+ * where hypercall parameters are placed. This is either a GPA or a
+ * nested GPA when KVM is handling the call from L2 ('direct' TLB
+ * flush). Translate the address here so the memory can be uniformly
+ * read with kvm_read_guest().
+ */
+ if (!hc->fast && is_guest_mode(vcpu)) {
+ hc->ingpa = translate_nested_gpa(vcpu, hc->ingpa, 0, NULL);
+ if (unlikely(hc->ingpa == INVALID_GPA))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
+
+ if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST ||
+ hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE) {
+ if (hc->fast) {
+ flush.address_space = hc->ingpa;
+ flush.flags = hc->outgpa;
+ flush.processor_mask = sse128_lo(hc->xmm[0]);
+ hc->consumed_xmm_halves = 1;
+ } else {
+ if (unlikely(kvm_read_guest(kvm, hc->ingpa,
+ &flush, sizeof(flush))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ hc->data_offset = sizeof(flush);
+ }
+
+ trace_kvm_hv_flush_tlb(flush.processor_mask,
+ flush.address_space, flush.flags,
+ is_guest_mode(vcpu));
+
+ valid_bank_mask = BIT_ULL(0);
+ sparse_banks[0] = flush.processor_mask;
+
+ /*
+ * Work around possible WS2012 bug: it sends hypercalls
+ * with processor_mask = 0x0 and HV_FLUSH_ALL_PROCESSORS clear,
+ * while also expecting us to flush something and crashing if
+ * we don't. Let's treat processor_mask == 0 the same as
+ * HV_FLUSH_ALL_PROCESSORS.
+ */
+ all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) ||
+ flush.processor_mask == 0;
+ } else {
+ if (hc->fast) {
+ flush_ex.address_space = hc->ingpa;
+ flush_ex.flags = hc->outgpa;
+ memcpy(&flush_ex.hv_vp_set,
+ &hc->xmm[0], sizeof(hc->xmm[0]));
+ hc->consumed_xmm_halves = 2;
+ } else {
+ if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex,
+ sizeof(flush_ex))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ hc->data_offset = sizeof(flush_ex);
+ }
+
+ trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask,
+ flush_ex.hv_vp_set.format,
+ flush_ex.address_space,
+ flush_ex.flags, is_guest_mode(vcpu));
+
+ valid_bank_mask = flush_ex.hv_vp_set.valid_bank_mask;
+ all_cpus = flush_ex.hv_vp_set.format !=
+ HV_GENERIC_SET_SPARSE_4K;
+
+ if (hc->var_cnt != hweight64(valid_bank_mask))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ if (!all_cpus) {
+ if (!hc->var_cnt)
+ goto ret_success;
+
+ if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
+
+ /*
+ * Hyper-V TLFS doesn't explicitly forbid non-empty sparse vCPU
+ * banks (and, thus, non-zero 'var_cnt') for the 'all vCPUs'
+ * case (HV_GENERIC_SET_ALL). Always adjust data_offset and
+ * consumed_xmm_halves to make sure TLB flush entries are read
+ * from the correct offset.
+ */
+ if (hc->fast)
+ hc->consumed_xmm_halves += hc->var_cnt;
+ else
+ hc->data_offset += hc->var_cnt * sizeof(sparse_banks[0]);
+ }
+
+ if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE ||
+ hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX ||
+ hc->rep_cnt > ARRAY_SIZE(__tlb_flush_entries)) {
+ tlb_flush_entries = NULL;
+ } else {
+ if (kvm_hv_get_tlb_flush_entries(kvm, hc, __tlb_flush_entries))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ tlb_flush_entries = __tlb_flush_entries;
+ }
+
+ /*
+ * vcpu->arch.cr3 may not be up-to-date for running vCPUs, so we can't
+ * analyze it here; flush the TLB regardless of the specified address space.
+ */
+ if (all_cpus && !is_guest_mode(vcpu)) {
+ kvm_for_each_vcpu(i, v, kvm) {
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false);
+ hv_tlb_flush_enqueue(v, tlb_flush_fifo,
+ tlb_flush_entries, hc->rep_cnt);
+ }
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH);
+ } else if (!is_guest_mode(vcpu)) {
+ sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);
+
+ for_each_set_bit(i, vcpu_mask, KVM_MAX_VCPUS) {
+ v = kvm_get_vcpu(kvm, i);
+ if (!v)
+ continue;
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false);
+ hv_tlb_flush_enqueue(v, tlb_flush_fifo,
+ tlb_flush_entries, hc->rep_cnt);
+ }
+
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);
+ } else {
+ struct kvm_vcpu_hv *hv_v;
+
+ bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);
+
+ kvm_for_each_vcpu(i, v, kvm) {
+ hv_v = to_hv_vcpu(v);
+
+ /*
+ * The following check races with nested vCPUs entering/exiting
+ * and/or migrating between L1's vCPUs, however the only case when
+ * KVM *must* flush the TLB is when the target L2 vCPU keeps
+ * running on the same L1 vCPU from the moment of the request until
+ * kvm_hv_flush_tlb() returns. TLB is fully flushed in all other
+ * cases, e.g. when the target L2 vCPU migrates to a different L1
+ * vCPU or when the corresponding L1 vCPU temporarily switches to a
+ * different L2 vCPU while the request is being processed.
+ */
+ if (!hv_v || hv_v->nested.vm_id != hv_vcpu->nested.vm_id)
+ continue;
+
+ if (!all_cpus &&
+ !hv_is_vp_in_sparse_set(hv_v->nested.vp_id, valid_bank_mask,
+ sparse_banks))
+ continue;
+
+ __set_bit(i, vcpu_mask);
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, true);
+ hv_tlb_flush_enqueue(v, tlb_flush_fifo,
+ tlb_flush_entries, hc->rep_cnt);
+ }
+
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);
+ }
+
+ret_success:
+ /* We always do a full TLB flush, so set 'Reps completed' = 'Rep Count' */
+ return (u64)HV_STATUS_SUCCESS |
+ ((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
+}
+
+static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector,
+ u64 *sparse_banks, u64 valid_bank_mask)
+{
+ struct kvm_lapic_irq irq = {
+ .delivery_mode = APIC_DM_FIXED,
+ .vector = vector
+ };
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (sparse_banks &&
+ !hv_is_vp_in_sparse_set(kvm_hv_get_vpindex(vcpu),
+ valid_bank_mask, sparse_banks))
+ continue;
+
+ /* We fail only when APIC is disabled */
+ kvm_apic_set_irq(vcpu, &irq, NULL);
+ }
+}
+
+static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u64 *sparse_banks = hv_vcpu->sparse_banks;
+ struct kvm *kvm = vcpu->kvm;
+ struct hv_send_ipi_ex send_ipi_ex;
+ struct hv_send_ipi send_ipi;
+ u64 valid_bank_mask;
+ u32 vector;
+ bool all_cpus;
+
+ if (hc->code == HVCALL_SEND_IPI) {
+ if (!hc->fast) {
+ if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi,
+ sizeof(send_ipi))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ sparse_banks[0] = send_ipi.cpu_mask;
+ vector = send_ipi.vector;
+ } else {
+ /* 'reserved' part of hv_send_ipi should be 0 */
+ if (unlikely(hc->ingpa >> 32 != 0))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ sparse_banks[0] = hc->outgpa;
+ vector = (u32)hc->ingpa;
+ }
+ all_cpus = false;
+ valid_bank_mask = BIT_ULL(0);
+
+ trace_kvm_hv_send_ipi(vector, sparse_banks[0]);
+ } else {
+ if (!hc->fast) {
+ if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi_ex,
+ sizeof(send_ipi_ex))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ } else {
+ send_ipi_ex.vector = (u32)hc->ingpa;
+ send_ipi_ex.vp_set.format = hc->outgpa;
+ send_ipi_ex.vp_set.valid_bank_mask = sse128_lo(hc->xmm[0]);
+ }
+
+ trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector,
+ send_ipi_ex.vp_set.format,
+ send_ipi_ex.vp_set.valid_bank_mask);
+
+ vector = send_ipi_ex.vector;
+ valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask;
+ all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
+
+ if (hc->var_cnt != hweight64(valid_bank_mask))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ if (all_cpus)
+ goto check_and_send_ipi;
+
+ if (!hc->var_cnt)
+ goto ret_success;
+
+ if (!hc->fast)
+ hc->data_offset = offsetof(struct hv_send_ipi_ex,
+ vp_set.bank_contents);
+ else
+ hc->consumed_xmm_halves = 1;
+
+ if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
+
+check_and_send_ipi:
+ if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ if (all_cpus)
+ kvm_hv_send_ipi_to_many(kvm, vector, NULL, 0);
+ else
+ kvm_hv_send_ipi_to_many(kvm, vector, sparse_banks, valid_bank_mask);
+
+ret_success:
+ return HV_STATUS_SUCCESS;
+}
+
+void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu, bool hyperv_enabled)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct kvm_cpuid_entry2 *entry;
+
+ vcpu->arch.hyperv_enabled = hyperv_enabled;
+
+ if (!hv_vcpu) {
+ /*
+ * KVM should have already allocated kvm_vcpu_hv if Hyper-V is
+ * enabled in CPUID.
+ */
+ WARN_ON_ONCE(vcpu->arch.hyperv_enabled);
+ return;
+ }
+
+ memset(&hv_vcpu->cpuid_cache, 0, sizeof(hv_vcpu->cpuid_cache));
+
+ if (!vcpu->arch.hyperv_enabled)
+ return;
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES);
+ if (entry) {
+ hv_vcpu->cpuid_cache.features_eax = entry->eax;
+ hv_vcpu->cpuid_cache.features_ebx = entry->ebx;
+ hv_vcpu->cpuid_cache.features_edx = entry->edx;
+ }
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO);
+ if (entry) {
+ hv_vcpu->cpuid_cache.enlightenments_eax = entry->eax;
+ hv_vcpu->cpuid_cache.enlightenments_ebx = entry->ebx;
+ }
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES);
+ if (entry)
+ hv_vcpu->cpuid_cache.syndbg_cap_eax = entry->eax;
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_NESTED_FEATURES);
+ if (entry) {
+ hv_vcpu->cpuid_cache.nested_eax = entry->eax;
+ hv_vcpu->cpuid_cache.nested_ebx = entry->ebx;
+ }
+}
+
+int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce)
+{
+ struct kvm_vcpu_hv *hv_vcpu;
+ int ret = 0;
+
+ if (!to_hv_vcpu(vcpu)) {
+ if (enforce) {
+ ret = kvm_hv_vcpu_init(vcpu);
+ if (ret)
+ return ret;
+ } else {
+ return 0;
+ }
+ }
+
+ hv_vcpu = to_hv_vcpu(vcpu);
+ hv_vcpu->enforce_cpuid = enforce;
+
+ return ret;
+}
+
+static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
+{
+ bool longmode;
+
+ longmode = is_64_bit_hypercall(vcpu);
+ if (longmode)
+ kvm_rax_write(vcpu, result);
+ else {
+ kvm_rdx_write(vcpu, result >> 32);
+ kvm_rax_write(vcpu, result & 0xffffffff);
+ }
+}
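+
+/*
+ * Per the convention mirrored above, the 64-bit hypercall result is returned
+ * in RAX for 64-bit callers and split across EDX:EAX for 32-bit callers.
+ * Illustrative example (value not from the original source): result
+ * 0x0000000500000000, i.e. 'reps completed' = 5 with HV_STATUS_SUCCESS, is
+ * written back as EDX = 0x5, EAX = 0x0 in 32-bit mode.
+ */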
+
+static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result)
+{
+ u32 tlb_lock_count = 0;
+ int ret;
+
+ if (hv_result_success(result) && is_guest_mode(vcpu) &&
+ kvm_hv_is_tlb_flush_hcall(vcpu) &&
+ kvm_read_guest(vcpu->kvm, to_hv_vcpu(vcpu)->nested.pa_page_gpa,
+ &tlb_lock_count, sizeof(tlb_lock_count)))
+ result = HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ trace_kvm_hv_hypercall_done(result);
+ kvm_hv_hypercall_set_result(vcpu, result);
+ ++vcpu->stat.hypercalls;
+
+ ret = kvm_skip_emulated_instruction(vcpu);
+
+ if (tlb_lock_count)
+ kvm_x86_ops.nested_ops->hv_inject_synthetic_vmexit_post_tlb_flush(vcpu);
+
+ return ret;
+}
+
+static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
+{
+ return kvm_hv_hypercall_complete(vcpu, vcpu->run->hyperv.u.hcall.result);
+}
+
+static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
+{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+ struct eventfd_ctx *eventfd;
+
+ if (unlikely(!hc->fast)) {
+ int ret;
+ gpa_t gpa = hc->ingpa;
+
+ if ((gpa & (__alignof__(hc->ingpa) - 1)) ||
+ offset_in_page(gpa) + sizeof(hc->ingpa) > PAGE_SIZE)
+ return HV_STATUS_INVALID_ALIGNMENT;
+
+ ret = kvm_vcpu_read_guest(vcpu, gpa,
+ &hc->ingpa, sizeof(hc->ingpa));
+ if (ret < 0)
+ return HV_STATUS_INVALID_ALIGNMENT;
+ }
+
+ /*
+ * Per spec, bits 32-47 contain the extra "flag number". However, we
+ * have no use for it, and in all known use cases it is zero, so just
+ * report lookup failure if it isn't.
+ */
+ if (hc->ingpa & 0xffff00000000ULL)
+ return HV_STATUS_INVALID_PORT_ID;
+ /* remaining bits are reserved-zero */
+ if (hc->ingpa & ~KVM_HYPERV_CONN_ID_MASK)
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ /* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */
+ rcu_read_lock();
+ eventfd = idr_find(&hv->conn_to_evt, hc->ingpa);
+ rcu_read_unlock();
+ if (!eventfd)
+ return HV_STATUS_INVALID_PORT_ID;
+
+ eventfd_signal(eventfd, 1);
+ return HV_STATUS_SUCCESS;
+}
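+
+/*
+ * Layout of the HVCALL_SIGNAL_EVENT parameter validated above: the low bits
+ * select the connection ID (KVM_HYPERV_CONN_ID_MASK), bits 32-47 carry the
+ * "flag number" (unused here and required to be zero), and the remaining
+ * bits are reserved-zero. The connection ID indexes the conn_to_evt IDR
+ * that userspace populates via the KVM_HYPERV_EVENTFD ioctl.
+ */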
+
+static bool is_xmm_fast_hypercall(struct kvm_hv_hcall *hc)
+{
+ switch (hc->code) {
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
+ case HVCALL_SEND_IPI_EX:
+ return true;
+ }
+
+ return false;
+}
+
+static void kvm_hv_hypercall_read_xmm(struct kvm_hv_hcall *hc)
+{
+ int reg;
+
+ kvm_fpu_get();
+ for (reg = 0; reg < HV_HYPERCALL_MAX_XMM_REGISTERS; reg++)
+ _kvm_read_sse_reg(reg, &hc->xmm[reg]);
+ kvm_fpu_put();
+}
+
+static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code)
+{
+ if (!hv_vcpu->enforce_cpuid)
+ return true;
+
+ switch (code) {
+ case HVCALL_NOTIFY_LONG_SPIN_WAIT:
+ return hv_vcpu->cpuid_cache.enlightenments_ebx &&
+ hv_vcpu->cpuid_cache.enlightenments_ebx != U32_MAX;
+ case HVCALL_POST_MESSAGE:
+ return hv_vcpu->cpuid_cache.features_ebx & HV_POST_MESSAGES;
+ case HVCALL_SIGNAL_EVENT:
+ return hv_vcpu->cpuid_cache.features_ebx & HV_SIGNAL_EVENTS;
+ case HVCALL_POST_DEBUG_DATA:
+ case HVCALL_RETRIEVE_DEBUG_DATA:
+ case HVCALL_RESET_DEBUG_SESSION:
+ /*
+ * Return 'true' when SynDBG is disabled so the resulting code
+ * will be HV_STATUS_INVALID_HYPERCALL_CODE.
+ */
+ return !kvm_hv_is_syndbg_enabled(hv_vcpu->vcpu) ||
+ hv_vcpu->cpuid_cache.features_ebx & HV_DEBUGGING;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
+ if (!(hv_vcpu->cpuid_cache.enlightenments_eax &
+ HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
+ return false;
+ fallthrough;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
+ return hv_vcpu->cpuid_cache.enlightenments_eax &
+ HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
+ case HVCALL_SEND_IPI_EX:
+ if (!(hv_vcpu->cpuid_cache.enlightenments_eax &
+ HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
+ return false;
+ fallthrough;
+ case HVCALL_SEND_IPI:
+ return hv_vcpu->cpuid_cache.enlightenments_eax &
+ HV_X64_CLUSTER_IPI_RECOMMENDED;
+ case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX:
+ return hv_vcpu->cpuid_cache.features_ebx &
+ HV_ENABLE_EXTENDED_HYPERCALLS;
+ default:
+ break;
+ }
+
+ return true;
+}
+
+int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct kvm_hv_hcall hc;
+ u64 ret = HV_STATUS_SUCCESS;
+
+ /*
+	 * Per the Hyper-V spec, a hypercall generates #UD when issued from
+	 * non-zero CPL or from real mode.
+ */
+ if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+#ifdef CONFIG_X86_64
+ if (is_64_bit_hypercall(vcpu)) {
+ hc.param = kvm_rcx_read(vcpu);
+ hc.ingpa = kvm_rdx_read(vcpu);
+ hc.outgpa = kvm_r8_read(vcpu);
+ } else
+#endif
+ {
+ hc.param = ((u64)kvm_rdx_read(vcpu) << 32) |
+ (kvm_rax_read(vcpu) & 0xffffffff);
+ hc.ingpa = ((u64)kvm_rbx_read(vcpu) << 32) |
+ (kvm_rcx_read(vcpu) & 0xffffffff);
+ hc.outgpa = ((u64)kvm_rdi_read(vcpu) << 32) |
+ (kvm_rsi_read(vcpu) & 0xffffffff);
+ }
+
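+	/*
+	 * Decode the hypercall input value (per the TLFS hypercall
+	 * interface): bits 0..15 hold the call code, bit 16 the "fast"
+	 * flag, bits 17..26 the variable header size, bits 32..43 the
+	 * rep count and bits 48..59 the rep start index; all other bits
+	 * are reserved.
+	 */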
+ hc.code = hc.param & 0xffff;
+ hc.var_cnt = (hc.param & HV_HYPERCALL_VARHEAD_MASK) >> HV_HYPERCALL_VARHEAD_OFFSET;
+ hc.fast = !!(hc.param & HV_HYPERCALL_FAST_BIT);
+ hc.rep_cnt = (hc.param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff;
+ hc.rep_idx = (hc.param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff;
+ hc.rep = !!(hc.rep_cnt || hc.rep_idx);
+
+ trace_kvm_hv_hypercall(hc.code, hc.fast, hc.var_cnt, hc.rep_cnt,
+ hc.rep_idx, hc.ingpa, hc.outgpa);
+
+ if (unlikely(!hv_check_hypercall_access(hv_vcpu, hc.code))) {
+ ret = HV_STATUS_ACCESS_DENIED;
+ goto hypercall_complete;
+ }
+
+ if (unlikely(hc.param & HV_HYPERCALL_RSVD_MASK)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ goto hypercall_complete;
+ }
+
+ if (hc.fast && is_xmm_fast_hypercall(&hc)) {
+ if (unlikely(hv_vcpu->enforce_cpuid &&
+ !(hv_vcpu->cpuid_cache.features_edx &
+ HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE))) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ kvm_hv_hypercall_read_xmm(&hc);
+ }
+
+ switch (hc.code) {
+ case HVCALL_NOTIFY_LONG_SPIN_WAIT:
+ if (unlikely(hc.rep || hc.var_cnt)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ kvm_vcpu_on_spin(vcpu, true);
+ break;
+ case HVCALL_SIGNAL_EVENT:
+ if (unlikely(hc.rep || hc.var_cnt)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hvcall_signal_event(vcpu, &hc);
+ if (ret != HV_STATUS_INVALID_PORT_ID)
+ break;
+ fallthrough; /* maybe userspace knows this conn_id */
+ case HVCALL_POST_MESSAGE:
+ /* don't bother userspace if it has no way to handle it */
+ if (unlikely(hc.rep || hc.var_cnt || !to_hv_synic(vcpu)->active)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ goto hypercall_userspace_exit;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
+ if (unlikely(hc.var_cnt)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ fallthrough;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
+ if (unlikely(!hc.rep_cnt || hc.rep_idx)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_flush_tlb(vcpu, &hc);
+ break;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
+ if (unlikely(hc.var_cnt)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ fallthrough;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
+ if (unlikely(hc.rep)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_flush_tlb(vcpu, &hc);
+ break;
+ case HVCALL_SEND_IPI:
+ if (unlikely(hc.var_cnt)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ fallthrough;
+ case HVCALL_SEND_IPI_EX:
+ if (unlikely(hc.rep)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_send_ipi(vcpu, &hc);
+ break;
+ case HVCALL_POST_DEBUG_DATA:
+ case HVCALL_RETRIEVE_DEBUG_DATA:
+ if (unlikely(hc.fast)) {
+ ret = HV_STATUS_INVALID_PARAMETER;
+ break;
+ }
+ fallthrough;
+ case HVCALL_RESET_DEBUG_SESSION: {
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
+
+ if (!kvm_hv_is_syndbg_enabled(vcpu)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_CODE;
+ break;
+ }
+
+ if (!(syndbg->options & HV_X64_SYNDBG_OPTION_USE_HCALLS)) {
+ ret = HV_STATUS_OPERATION_DENIED;
+ break;
+ }
+ goto hypercall_userspace_exit;
+ }
+ case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX:
+ if (unlikely(hc.fast)) {
+ ret = HV_STATUS_INVALID_PARAMETER;
+ break;
+ }
+ goto hypercall_userspace_exit;
+ default:
+ ret = HV_STATUS_INVALID_HYPERCALL_CODE;
+ break;
+ }
+
+hypercall_complete:
+ return kvm_hv_hypercall_complete(vcpu, ret);
+
+hypercall_userspace_exit:
+ vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+ vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
+ vcpu->run->hyperv.u.hcall.input = hc.param;
+ vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa;
+ vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa;
+ vcpu->arch.complete_userspace_io = kvm_hv_hypercall_complete_userspace;
+ return 0;
+}
+
+void kvm_hv_init_vm(struct kvm *kvm)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ mutex_init(&hv->hv_lock);
+ idr_init(&hv->conn_to_evt);
+}
+
+void kvm_hv_destroy_vm(struct kvm *kvm)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ struct eventfd_ctx *eventfd;
+ int i;
+
+ idr_for_each_entry(&hv->conn_to_evt, eventfd, i)
+ eventfd_ctx_put(eventfd);
+ idr_destroy(&hv->conn_to_evt);
+}
+
+static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ struct eventfd_ctx *eventfd;
+ int ret;
+
+ eventfd = eventfd_ctx_fdget(fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+
+ mutex_lock(&hv->hv_lock);
+ ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1,
+ GFP_KERNEL_ACCOUNT);
+ mutex_unlock(&hv->hv_lock);
+
+ if (ret >= 0)
+ return 0;
+
+ if (ret == -ENOSPC)
+ ret = -EEXIST;
+ eventfd_ctx_put(eventfd);
+ return ret;
+}
+
+static int kvm_hv_eventfd_deassign(struct kvm *kvm, u32 conn_id)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ struct eventfd_ctx *eventfd;
+
+ mutex_lock(&hv->hv_lock);
+ eventfd = idr_remove(&hv->conn_to_evt, conn_id);
+ mutex_unlock(&hv->hv_lock);
+
+ if (!eventfd)
+ return -ENOENT;
+
+ synchronize_srcu(&kvm->srcu);
+ eventfd_ctx_put(eventfd);
+ return 0;
+}
+
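+/*
+ * Illustrative userspace sketch (not kernel code): the KVM_HYPERV_EVENTFD
+ * vm ioctl routes HVCALL_SIGNAL_EVENT for a connection id to an eventfd,
+ * e.g.:
+ *
+ *	struct kvm_hyperv_eventfd hvevfd = {
+ *		.conn_id = 1,		// connection id to bind
+ *		.fd	 = evfd,	// an eventfd() descriptor
+ *		.flags	 = 0,		// or KVM_HYPERV_EVENTFD_DEASSIGN
+ *	};
+ *	ioctl(vm_fd, KVM_HYPERV_EVENTFD, &hvevfd);
+ */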
+int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args)
+{
+ if ((args->flags & ~KVM_HYPERV_EVENTFD_DEASSIGN) ||
+ (args->conn_id & ~KVM_HYPERV_CONN_ID_MASK))
+ return -EINVAL;
+
+ if (args->flags == KVM_HYPERV_EVENTFD_DEASSIGN)
+ return kvm_hv_eventfd_deassign(kvm, args->conn_id);
+ return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd);
+}
+
+int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
+{
+ uint16_t evmcs_ver = 0;
+ struct kvm_cpuid_entry2 cpuid_entries[] = {
+ { .function = HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS },
+ { .function = HYPERV_CPUID_INTERFACE },
+ { .function = HYPERV_CPUID_VERSION },
+ { .function = HYPERV_CPUID_FEATURES },
+ { .function = HYPERV_CPUID_ENLIGHTMENT_INFO },
+ { .function = HYPERV_CPUID_IMPLEMENT_LIMITS },
+ { .function = HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS },
+ { .function = HYPERV_CPUID_SYNDBG_INTERFACE },
+ { .function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES },
+ { .function = HYPERV_CPUID_NESTED_FEATURES },
+ };
+ int i, nent = ARRAY_SIZE(cpuid_entries);
+
+ if (kvm_x86_ops.nested_ops->get_evmcs_version)
+ evmcs_ver = kvm_x86_ops.nested_ops->get_evmcs_version(vcpu);
+
+ if (cpuid->nent < nent)
+ return -E2BIG;
+
+ if (cpuid->nent > nent)
+ cpuid->nent = nent;
+
+ for (i = 0; i < nent; i++) {
+ struct kvm_cpuid_entry2 *ent = &cpuid_entries[i];
+ u32 signature[3];
+
+ switch (ent->function) {
+ case HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS:
+ memcpy(signature, "Linux KVM Hv", 12);
+
+ ent->eax = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES;
+ ent->ebx = signature[0];
+ ent->ecx = signature[1];
+ ent->edx = signature[2];
+ break;
+
+ case HYPERV_CPUID_INTERFACE:
+ ent->eax = HYPERV_CPUID_SIGNATURE_EAX;
+ break;
+
+ case HYPERV_CPUID_VERSION:
+ /*
+			 * We implement some Hyper-V 2016 functions, so report
+			 * that version.
+ */
+ ent->eax = 0x00003839;
+ ent->ebx = 0x000A0000;
+ break;
+
+ case HYPERV_CPUID_FEATURES:
+ ent->eax |= HV_MSR_VP_RUNTIME_AVAILABLE;
+ ent->eax |= HV_MSR_TIME_REF_COUNT_AVAILABLE;
+ ent->eax |= HV_MSR_SYNIC_AVAILABLE;
+ ent->eax |= HV_MSR_SYNTIMER_AVAILABLE;
+ ent->eax |= HV_MSR_APIC_ACCESS_AVAILABLE;
+ ent->eax |= HV_MSR_HYPERCALL_AVAILABLE;
+ ent->eax |= HV_MSR_VP_INDEX_AVAILABLE;
+ ent->eax |= HV_MSR_RESET_AVAILABLE;
+ ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE;
+ ent->eax |= HV_ACCESS_FREQUENCY_MSRS;
+ ent->eax |= HV_ACCESS_REENLIGHTENMENT;
+ ent->eax |= HV_ACCESS_TSC_INVARIANT;
+
+ ent->ebx |= HV_POST_MESSAGES;
+ ent->ebx |= HV_SIGNAL_EVENTS;
+ ent->ebx |= HV_ENABLE_EXTENDED_HYPERCALLS;
+
+ ent->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE;
+ ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE;
+ ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
+
+ ent->ebx |= HV_DEBUGGING;
+ ent->edx |= HV_X64_GUEST_DEBUGGING_AVAILABLE;
+ ent->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE;
+ ent->edx |= HV_FEATURE_EXT_GVA_RANGES_FLUSH;
+
+ /*
+			 * Direct synthetic timers only make sense with an
+			 * in-kernel LAPIC.
+ */
+ if (!vcpu || lapic_in_kernel(vcpu))
+ ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE;
+
+ break;
+
+ case HYPERV_CPUID_ENLIGHTMENT_INFO:
+ ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
+ ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED;
+ ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED;
+ ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED;
+ ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED;
+ if (evmcs_ver)
+ ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
+ if (!cpu_smt_possible())
+ ent->eax |= HV_X64_NO_NONARCH_CORESHARING;
+
+ ent->eax |= HV_DEPRECATING_AEOI_RECOMMENDED;
+ /*
+ * Default number of spinlock retry attempts, matches
+			 * Hyper-V 2016.
+ */
+ ent->ebx = 0x00000FFF;
+
+ break;
+
+ case HYPERV_CPUID_IMPLEMENT_LIMITS:
+ /* Maximum number of virtual processors */
+ ent->eax = KVM_MAX_VCPUS;
+ /*
+ * Maximum number of logical processors, matches
+			 * Hyper-V 2016.
+ */
+ ent->ebx = 64;
+
+ break;
+
+ case HYPERV_CPUID_NESTED_FEATURES:
+ ent->eax = evmcs_ver;
+ ent->eax |= HV_X64_NESTED_DIRECT_FLUSH;
+ ent->eax |= HV_X64_NESTED_MSR_BITMAP;
+ ent->ebx |= HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL;
+ break;
+
+ case HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS:
+ memcpy(signature, "Linux KVM Hv", 12);
+
+ ent->eax = 0;
+ ent->ebx = signature[0];
+ ent->ecx = signature[1];
+ ent->edx = signature[2];
+ break;
+
+ case HYPERV_CPUID_SYNDBG_INTERFACE:
+ memcpy(signature, "VS#1\0\0\0\0\0\0\0\0", 12);
+ ent->eax = signature[0];
+ break;
+
+ case HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES:
+ ent->eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING;
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ if (copy_to_user(entries, cpuid_entries,
+ nent * sizeof(struct kvm_cpuid_entry2)))
+ return -EFAULT;
+
+ return 0;
+}
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
new file mode 100644
index 000000000..f83b8db72
--- /dev/null
+++ b/arch/x86/kvm/hyperv.h
@@ -0,0 +1,241 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * KVM Microsoft Hyper-V emulation
+ *
+ * derived from arch/x86/kvm/x86.c
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com>
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Amit Shah <amit.shah@qumranet.com>
+ * Ben-Ami Yassour <benami@il.ibm.com>
+ * Andrey Smetanin <asmetanin@virtuozzo.com>
+ */
+
+#ifndef __ARCH_X86_KVM_HYPERV_H__
+#define __ARCH_X86_KVM_HYPERV_H__
+
+#include <linux/kvm_host.h>
+#include "x86.h"
+
+/* "Hv#1" signature */
+#define HYPERV_CPUID_SIGNATURE_EAX 0x31237648
+
+/*
+ * The #defines related to the synthetic debugger are required by KDNet, but
+ * they are not documented in the Hyper-V TLFS because the synthetic debugger
+ * functionality has been deprecated and is subject to removal in future
+ * versions of Windows.
+ */
+#define HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS 0x40000080
+#define HYPERV_CPUID_SYNDBG_INTERFACE 0x40000081
+#define HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES 0x40000082
+
+/*
+ * Hyper-V synthetic debugger platform capabilities
+ * These are HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX bits.
+ */
+#define HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING BIT(1)
+
+/* Hyper-V Synthetic debug options MSR */
+#define HV_X64_MSR_SYNDBG_CONTROL 0x400000F1
+#define HV_X64_MSR_SYNDBG_STATUS 0x400000F2
+#define HV_X64_MSR_SYNDBG_SEND_BUFFER 0x400000F3
+#define HV_X64_MSR_SYNDBG_RECV_BUFFER 0x400000F4
+#define HV_X64_MSR_SYNDBG_PENDING_BUFFER 0x400000F5
+#define HV_X64_MSR_SYNDBG_OPTIONS 0x400000FF
+
+/* Hyper-V HV_X64_MSR_SYNDBG_OPTIONS bits */
+#define HV_X64_SYNDBG_OPTION_USE_HCALLS BIT(2)
+
+static inline struct kvm_hv *to_kvm_hv(struct kvm *kvm)
+{
+ return &kvm->arch.hyperv;
+}
+
+static inline struct kvm_vcpu_hv *to_hv_vcpu(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hyperv;
+}
+
+static inline struct kvm_vcpu_hv_synic *to_hv_synic(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ return &hv_vcpu->synic;
+}
+
+static inline struct kvm_vcpu *hv_synic_to_vcpu(struct kvm_vcpu_hv_synic *synic)
+{
+ struct kvm_vcpu_hv *hv_vcpu = container_of(synic, struct kvm_vcpu_hv, synic);
+
+ return hv_vcpu->vcpu;
+}
+
+static inline struct kvm_hv_syndbg *to_hv_syndbg(struct kvm_vcpu *vcpu)
+{
+ return &vcpu->kvm->arch.hyperv.hv_syndbg;
+}
+
+static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ return hv_vcpu ? hv_vcpu->vp_index : vcpu->vcpu_idx;
+}
+
+int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
+int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host);
+
+static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id;
+}
+
+int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
+
+void kvm_hv_irq_routing_update(struct kvm *kvm);
+int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint);
+void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector);
+int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages);
+
+void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu);
+
+bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu);
+int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu);
+
+static inline struct kvm_vcpu_hv_stimer *to_hv_stimer(struct kvm_vcpu *vcpu,
+ int timer_index)
+{
+ return &to_hv_vcpu(vcpu)->stimer[timer_index];
+}
+
+static inline struct kvm_vcpu *hv_stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu_hv *hv_vcpu;
+
+ hv_vcpu = container_of(stimer - stimer->index, struct kvm_vcpu_hv,
+ stimer[0]);
+ return hv_vcpu->vcpu;
+}
+
+static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu)
+ return false;
+
+ return !bitmap_empty(hv_vcpu->stimer_pending_bitmap,
+ HV_SYNIC_STIMER_COUNT);
+}
+
+/*
+ * With the HV_ACCESS_TSC_INVARIANT feature, the invariant TSC bit
+ * (CPUID.80000007H:EDX[8]) is only observed after
+ * HV_X64_MSR_TSC_INVARIANT_CONTROL has been written to.
+ */
+static inline bool kvm_hv_invtsc_suppressed(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ /*
+	 * If Hyper-V's invariant TSC control is not exposed to the guest,
+	 * the invariant TSC CPUID flag is not suppressed; Windows guests were
+	 * observed to handle it correctly. Going forward, VMMs are encouraged
+	 * to enable Hyper-V's invariant TSC control when the invariant TSC
+	 * CPUID flag is set, to make KVM's behavior match genuine Hyper-V.
+ */
+ if (!hv_vcpu ||
+ !(hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_TSC_INVARIANT))
+ return false;
+
+ /*
+ * If Hyper-V's invariant TSC control is exposed to the guest, KVM is
+ * responsible for suppressing the invariant TSC CPUID flag if the
+ * Hyper-V control is not enabled.
+ */
+ return !(to_kvm_hv(vcpu->kvm)->hv_invtsc_control & HV_EXPOSE_INVARIANT_TSC);
+}
+
+void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
+
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+ struct pvclock_vcpu_time_info *hv_clock);
+void kvm_hv_request_tsc_page_update(struct kvm *kvm);
+
+void kvm_hv_init_vm(struct kvm *kvm);
+void kvm_hv_destroy_vm(struct kvm *kvm);
+int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu);
+void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu, bool hyperv_enabled);
+int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce);
+int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
+int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries);
+
+static inline struct kvm_vcpu_hv_tlb_flush_fifo *kvm_hv_get_tlb_flush_fifo(struct kvm_vcpu *vcpu,
+ bool is_guest_mode)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ int i = is_guest_mode ? HV_L2_TLB_FLUSH_FIFO :
+ HV_L1_TLB_FLUSH_FIFO;
+
+ return &hv_vcpu->tlb_flush_fifo[i];
+}
+
+static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
+
+ if (!to_hv_vcpu(vcpu) || !kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
+ return;
+
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu));
+
+ kfifo_reset_out(&tlb_flush_fifo->entries);
+}
+
+static inline bool guest_hv_cpuid_has_l2_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ return hv_vcpu &&
+ (hv_vcpu->cpuid_cache.nested_eax & HV_X64_NESTED_DIRECT_FLUSH);
+}
+
+static inline bool kvm_hv_is_tlb_flush_hcall(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u16 code;
+
+ if (!hv_vcpu)
+ return false;
+
+ code = is_64_bit_hypercall(vcpu) ? kvm_rcx_read(vcpu) :
+ kvm_rax_read(vcpu);
+
+ return (code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE ||
+ code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST ||
+ code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX ||
+ code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX);
+}
+
+static inline int kvm_hv_verify_vp_assist(struct kvm_vcpu *vcpu)
+{
+ if (!to_hv_vcpu(vcpu))
+ return 0;
+
+ if (!kvm_hv_assist_page_enabled(vcpu))
+ return 0;
+
+ return kvm_hv_get_assist_page(vcpu);
+}
+
+int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu);
+
+#endif
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
new file mode 100644
index 000000000..cd57a517d
--- /dev/null
+++ b/arch/x86/kvm/i8254.c
@@ -0,0 +1,751 @@
+/*
+ * 8253/8254 interval timer emulation
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ * Copyright (c) 2006 Intel Corporation
+ * Copyright (c) 2007 Keir Fraser, XenSource Inc
+ * Copyright (c) 2008 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * Authors:
+ * Sheng Yang <sheng.yang@intel.com>
+ * Based on QEMU and Xen.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+
+#include "ioapic.h"
+#include "irq.h"
+#include "i8254.h"
+#include "x86.h"
+
+#ifndef CONFIG_X86_64
+#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
+#else
+#define mod_64(x, y) ((x) % (y))
+#endif
+
+#define RW_STATE_LSB 1
+#define RW_STATE_MSB 2
+#define RW_STATE_WORD0 3
+#define RW_STATE_WORD1 4
+
+static void pit_set_gate(struct kvm_pit *pit, int channel, u32 val)
+{
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
+
+ switch (c->mode) {
+ default:
+ case 0:
+ case 4:
+ /* XXX: just disable/enable counting */
+ break;
+ case 1:
+ case 2:
+ case 3:
+ case 5:
+ /* Restart counting on rising edge. */
+ if (c->gate < val)
+ c->count_load_time = ktime_get();
+ break;
+ }
+
+ c->gate = val;
+}
+
+static int pit_get_gate(struct kvm_pit *pit, int channel)
+{
+ return pit->pit_state.channels[channel].gate;
+}
+
+static s64 __kpit_elapsed(struct kvm_pit *pit)
+{
+ s64 elapsed;
+ ktime_t remaining;
+ struct kvm_kpit_state *ps = &pit->pit_state;
+
+ if (!ps->period)
+ return 0;
+
+ /*
+ * The Counter does not stop when it reaches zero. In
+ * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to
+ * the highest count, either FFFF hex for binary counting
+ * or 9999 for BCD counting, and continues counting.
+ * Modes 2 and 3 are periodic; the Counter reloads
+ * itself with the initial count and continues counting
+ * from there.
+ */
+ remaining = hrtimer_get_remaining(&ps->timer);
+ elapsed = ps->period - ktime_to_ns(remaining);
+
+ return elapsed;
+}
+
+static s64 kpit_elapsed(struct kvm_pit *pit, struct kvm_kpit_channel_state *c,
+ int channel)
+{
+ if (channel == 0)
+ return __kpit_elapsed(pit);
+
+ return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+}
+
+static int pit_get_count(struct kvm_pit *pit, int channel)
+{
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
+ s64 d, t;
+ int counter;
+
+ t = kpit_elapsed(pit, c, channel);
+ d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+
+ switch (c->mode) {
+ case 0:
+ case 1:
+ case 4:
+ case 5:
+ counter = (c->count - d) & 0xffff;
+ break;
+ case 3:
+ /* XXX: may be incorrect for odd counts */
+ counter = c->count - (mod_64((2 * d), c->count));
+ break;
+ default:
+ counter = c->count - mod_64(d, c->count);
+ break;
+ }
+ return counter;
+}
+
+static int pit_get_out(struct kvm_pit *pit, int channel)
+{
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
+ s64 d, t;
+ int out;
+
+ t = kpit_elapsed(pit, c, channel);
+ d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+
+ switch (c->mode) {
+ default:
+ case 0:
+ out = (d >= c->count);
+ break;
+ case 1:
+ out = (d < c->count);
+ break;
+ case 2:
+ out = ((mod_64(d, c->count) == 0) && (d != 0));
+ break;
+ case 3:
+ out = (mod_64(d, c->count) < ((c->count + 1) >> 1));
+ break;
+ case 4:
+ case 5:
+ out = (d == c->count);
+ break;
+ }
+
+ return out;
+}
+
+static void pit_latch_count(struct kvm_pit *pit, int channel)
+{
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
+
+ if (!c->count_latched) {
+ c->latched_count = pit_get_count(pit, channel);
+ c->count_latched = c->rw_mode;
+ }
+}
+
+static void pit_latch_status(struct kvm_pit *pit, int channel)
+{
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
+
+ if (!c->status_latched) {
+ /* TODO: Return NULL COUNT (bit 6). */
+ c->status = ((pit_get_out(pit, channel) << 7) |
+ (c->rw_mode << 4) |
+ (c->mode << 1) |
+ c->bcd);
+ c->status_latched = 1;
+ }
+}
+
+static inline struct kvm_pit *pit_state_to_pit(struct kvm_kpit_state *ps)
+{
+ return container_of(ps, struct kvm_pit, pit_state);
+}
+
+static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+ struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
+ irq_ack_notifier);
+ struct kvm_pit *pit = pit_state_to_pit(ps);
+
+ atomic_set(&ps->irq_ack, 1);
+ /* irq_ack should be set before pending is read. Order accesses with
+ * inc(pending) in pit_timer_fn and xchg(irq_ack, 0) in pit_do_work.
+ */
+ smp_mb();
+ if (atomic_dec_if_positive(&ps->pending) > 0)
+ kthread_queue_work(pit->worker, &pit->expired);
+}
+
+void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pit *pit = vcpu->kvm->arch.vpit;
+ struct hrtimer *timer;
+
+ /* Somewhat arbitrarily make vcpu0 the owner of the PIT. */
+ if (vcpu->vcpu_id || !pit)
+ return;
+
+ timer = &pit->pit_state.timer;
+ mutex_lock(&pit->pit_state.lock);
+ if (hrtimer_cancel(timer))
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+ mutex_unlock(&pit->pit_state.lock);
+}
+
+static void destroy_pit_timer(struct kvm_pit *pit)
+{
+ hrtimer_cancel(&pit->pit_state.timer);
+ kthread_flush_work(&pit->expired);
+}
+
+static void pit_do_work(struct kthread_work *work)
+{
+ struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
+ struct kvm *kvm = pit->kvm;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ struct kvm_kpit_state *ps = &pit->pit_state;
+
+ if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0))
+ return;
+
+ kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false);
+ kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false);
+
+ /*
+ * Provides NMI watchdog support via Virtual Wire mode.
+ * The route is: PIT -> LVT0 in NMI mode.
+ *
+ * Note: Our Virtual Wire implementation does not follow
+ * the MP specification. We propagate a PIT interrupt to all
+ * VCPUs and only when LVT0 is in NMI mode. The interrupt can
+ * also be simultaneously delivered through PIC and IOAPIC.
+ */
+ if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_apic_nmi_wd_deliver(vcpu);
+}
+
+static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
+{
+ struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer);
+ struct kvm_pit *pt = pit_state_to_pit(ps);
+
+ if (atomic_read(&ps->reinject))
+ atomic_inc(&ps->pending);
+
+ kthread_queue_work(pt->worker, &pt->expired);
+
+ if (ps->is_periodic) {
+ hrtimer_add_expires_ns(&ps->timer, ps->period);
+ return HRTIMER_RESTART;
+ } else
+ return HRTIMER_NORESTART;
+}
+
+static inline void kvm_pit_reset_reinject(struct kvm_pit *pit)
+{
+ atomic_set(&pit->pit_state.pending, 0);
+ atomic_set(&pit->pit_state.irq_ack, 1);
+}
+
+void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
+{
+ struct kvm_kpit_state *ps = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+
+ if (atomic_read(&ps->reinject) == reinject)
+ return;
+
+ /*
+	 * AMD SVM AVIC accelerates EOI writes and does not trap them.
+	 * This causes in-kernel PIT re-inject mode to fail, since it
+	 * checks ps->irq_ack before kvm_set_irq() and relies on the ack
+	 * notifier to queue the pt->worker work item in time and reinject
+	 * the missed tick. So, deactivate APICv when the PIT is in
+	 * reinject mode.
+ */
+ if (reinject) {
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PIT_REINJ);
+ /* The initial state is preserved while ps->reinject == 0. */
+ kvm_pit_reset_reinject(pit);
+ kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
+ kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ } else {
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PIT_REINJ);
+ kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
+ kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ }
+
+ atomic_set(&ps->reinject, reinject);
+}
+
+static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period)
+{
+ struct kvm_kpit_state *ps = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+ s64 interval;
+
+ if (!ioapic_in_kernel(kvm) ||
+ ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
+ return;
+
+ interval = mul_u64_u32_div(val, NSEC_PER_SEC, KVM_PIT_FREQ);
+
+ pr_debug("create pit timer, interval is %llu nsec\n", interval);
+
+	/* TODO: The new value only takes effect after the timer is retriggered. */
+ hrtimer_cancel(&ps->timer);
+ kthread_flush_work(&pit->expired);
+ ps->period = interval;
+ ps->is_periodic = is_period;
+
+ kvm_pit_reset_reinject(pit);
+
+ /*
+ * Do not allow the guest to program periodic timers with small
+ * interval, since the hrtimers are not throttled by the host
+ * scheduler.
+ */
+ if (ps->is_periodic) {
+ s64 min_period = min_timer_period_us * 1000LL;
+
+ if (ps->period < min_period) {
+ pr_info_ratelimited(
+ "requested %lld ns "
+ "i8254 timer period limited to %lld ns\n",
+ ps->period, min_period);
+ ps->period = min_period;
+ }
+ }
+
+ hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval),
+ HRTIMER_MODE_ABS);
+}
+
+static void pit_load_count(struct kvm_pit *pit, int channel, u32 val)
+{
+ struct kvm_kpit_state *ps = &pit->pit_state;
+
+ pr_debug("load_count val is %u, channel is %d\n", val, channel);
+
+ /*
+	 * The largest possible initial count is 0; this is equivalent
+	 * to 2^16 for binary counting and 10^4 for BCD counting.
+ */
+ if (val == 0)
+ val = 0x10000;
+
+ ps->channels[channel].count = val;
+
+ if (channel != 0) {
+ ps->channels[channel].count_load_time = ktime_get();
+ return;
+ }
+
+	/*
+	 * Two types of timer: mode 1 is one-shot and mode 2 is periodic;
+	 * otherwise delete the timer.
+	 */
+ switch (ps->channels[0].mode) {
+ case 0:
+ case 1:
+ /* FIXME: enhance mode 4 precision */
+ case 4:
+ create_pit_timer(pit, val, 0);
+ break;
+ case 2:
+ case 3:
+ create_pit_timer(pit, val, 1);
+ break;
+ default:
+ destroy_pit_timer(pit);
+ }
+}
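+
+/*
+ * For reference: the PIT counts down at KVM_PIT_FREQ (nominally
+ * 1.193182 MHz), so a reload value of N gives a period of roughly
+ * N / 1193182 seconds; the maximum (a programmed count of 0, i.e.
+ * 0x10000) yields ~54.9 ms, the classic 18.2 Hz tick.
+ */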
+
+void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
+ int hpet_legacy_start)
+{
+ u8 saved_mode;
+
+ WARN_ON_ONCE(!mutex_is_locked(&pit->pit_state.lock));
+
+ if (hpet_legacy_start) {
+		/* save the existing mode for later re-enablement */
+ WARN_ON(channel != 0);
+ saved_mode = pit->pit_state.channels[0].mode;
+ pit->pit_state.channels[0].mode = 0xff; /* disable timer */
+ pit_load_count(pit, channel, val);
+ pit->pit_state.channels[0].mode = saved_mode;
+ } else {
+ pit_load_count(pit, channel, val);
+ }
+}
+
+static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_pit, dev);
+}
+
+static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_pit, speaker_dev);
+}
+
+static inline int pit_in_range(gpa_t addr)
+{
+ return ((addr >= KVM_PIT_BASE_ADDRESS) &&
+ (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
+}
+
+static int pit_ioport_write(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
+ gpa_t addr, int len, const void *data)
+{
+ struct kvm_pit *pit = dev_to_pit(this);
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ int channel, access;
+ struct kvm_kpit_channel_state *s;
+ u32 val = *(u32 *) data;
+ if (!pit_in_range(addr))
+ return -EOPNOTSUPP;
+
+ val &= 0xff;
+ addr &= KVM_PIT_CHANNEL_MASK;
+
+ mutex_lock(&pit_state->lock);
+
+ if (val != 0)
+ pr_debug("write addr is 0x%x, len is %d, val is 0x%x\n",
+ (unsigned int)addr, len, val);
+
+ if (addr == 3) {
+ channel = val >> 6;
+ if (channel == 3) {
+ /* Read-Back Command. */
+ for (channel = 0; channel < 3; channel++) {
+ if (val & (2 << channel)) {
+ if (!(val & 0x20))
+ pit_latch_count(pit, channel);
+ if (!(val & 0x10))
+ pit_latch_status(pit, channel);
+ }
+ }
+ } else {
+ /* Select Counter <channel>. */
+ s = &pit_state->channels[channel];
+ access = (val >> 4) & KVM_PIT_CHANNEL_MASK;
+ if (access == 0) {
+ pit_latch_count(pit, channel);
+ } else {
+ s->rw_mode = access;
+ s->read_state = access;
+ s->write_state = access;
+ s->mode = (val >> 1) & 7;
+ if (s->mode > 5)
+ s->mode -= 4;
+ s->bcd = val & 1;
+ }
+ }
+ } else {
+ /* Write Count. */
+ s = &pit_state->channels[addr];
+ switch (s->write_state) {
+ default:
+ case RW_STATE_LSB:
+ pit_load_count(pit, addr, val);
+ break;
+ case RW_STATE_MSB:
+ pit_load_count(pit, addr, val << 8);
+ break;
+ case RW_STATE_WORD0:
+ s->write_latch = val;
+ s->write_state = RW_STATE_WORD1;
+ break;
+ case RW_STATE_WORD1:
+ pit_load_count(pit, addr, s->write_latch | (val << 8));
+ s->write_state = RW_STATE_WORD0;
+ break;
+ }
+ }
+
+ mutex_unlock(&pit_state->lock);
+ return 0;
+}
+
+static int pit_ioport_read(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
+ gpa_t addr, int len, void *data)
+{
+ struct kvm_pit *pit = dev_to_pit(this);
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ int ret, count;
+ struct kvm_kpit_channel_state *s;
+ if (!pit_in_range(addr))
+ return -EOPNOTSUPP;
+
+ addr &= KVM_PIT_CHANNEL_MASK;
+ if (addr == 3)
+ return 0;
+
+ s = &pit_state->channels[addr];
+
+ mutex_lock(&pit_state->lock);
+
+ if (s->status_latched) {
+ s->status_latched = 0;
+ ret = s->status;
+ } else if (s->count_latched) {
+ switch (s->count_latched) {
+ default:
+ case RW_STATE_LSB:
+ ret = s->latched_count & 0xff;
+ s->count_latched = 0;
+ break;
+ case RW_STATE_MSB:
+ ret = s->latched_count >> 8;
+ s->count_latched = 0;
+ break;
+ case RW_STATE_WORD0:
+ ret = s->latched_count & 0xff;
+ s->count_latched = RW_STATE_MSB;
+ break;
+ }
+ } else {
+ switch (s->read_state) {
+ default:
+ case RW_STATE_LSB:
+ count = pit_get_count(pit, addr);
+ ret = count & 0xff;
+ break;
+ case RW_STATE_MSB:
+ count = pit_get_count(pit, addr);
+ ret = (count >> 8) & 0xff;
+ break;
+ case RW_STATE_WORD0:
+ count = pit_get_count(pit, addr);
+ ret = count & 0xff;
+ s->read_state = RW_STATE_WORD1;
+ break;
+ case RW_STATE_WORD1:
+ count = pit_get_count(pit, addr);
+ ret = (count >> 8) & 0xff;
+ s->read_state = RW_STATE_WORD0;
+ break;
+ }
+ }
+
+ if (len > sizeof(ret))
+ len = sizeof(ret);
+ memcpy(data, (char *)&ret, len);
+
+ mutex_unlock(&pit_state->lock);
+ return 0;
+}
+
+static int speaker_ioport_write(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
+ gpa_t addr, int len, const void *data)
+{
+ struct kvm_pit *pit = speaker_to_pit(this);
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ u32 val = *(u32 *) data;
+ if (addr != KVM_SPEAKER_BASE_ADDRESS)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&pit_state->lock);
+ if (val & (1 << 1))
+ pit_state->flags |= KVM_PIT_FLAGS_SPEAKER_DATA_ON;
+ else
+ pit_state->flags &= ~KVM_PIT_FLAGS_SPEAKER_DATA_ON;
+ pit_set_gate(pit, 2, val & 1);
+ mutex_unlock(&pit_state->lock);
+ return 0;
+}
+
+static int speaker_ioport_read(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
+ gpa_t addr, int len, void *data)
+{
+ struct kvm_pit *pit = speaker_to_pit(this);
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ unsigned int refresh_clock;
+ int ret;
+ if (addr != KVM_SPEAKER_BASE_ADDRESS)
+ return -EOPNOTSUPP;
+
+ /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */
+ refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1;
+
+ mutex_lock(&pit_state->lock);
+ ret = (!!(pit_state->flags & KVM_PIT_FLAGS_SPEAKER_DATA_ON) << 1) |
+ pit_get_gate(pit, 2) | (pit_get_out(pit, 2) << 5) |
+ (refresh_clock << 4);
+ if (len > sizeof(ret))
+ len = sizeof(ret);
+ memcpy(data, (char *)&ret, len);
+ mutex_unlock(&pit_state->lock);
+ return 0;
+}
+
+static void kvm_pit_reset(struct kvm_pit *pit)
+{
+ int i;
+ struct kvm_kpit_channel_state *c;
+
+ pit->pit_state.flags = 0;
+ for (i = 0; i < 3; i++) {
+ c = &pit->pit_state.channels[i];
+ c->mode = 0xff;
+ c->gate = (i != 2);
+ pit_load_count(pit, i, 0);
+ }
+
+ kvm_pit_reset_reinject(pit);
+}
+
+static void pit_mask_notifier(struct kvm_irq_mask_notifier *kimn, bool mask)
+{
+ struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
+
+ if (!mask)
+ kvm_pit_reset_reinject(pit);
+}
+
+static const struct kvm_io_device_ops pit_dev_ops = {
+ .read = pit_ioport_read,
+ .write = pit_ioport_write,
+};
+
+static const struct kvm_io_device_ops speaker_dev_ops = {
+ .read = speaker_ioport_read,
+ .write = speaker_ioport_write,
+};
+
+struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
+{
+ struct kvm_pit *pit;
+ struct kvm_kpit_state *pit_state;
+ struct pid *pid;
+ pid_t pid_nr;
+ int ret;
+
+ pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL_ACCOUNT);
+ if (!pit)
+ return NULL;
+
+ pit->irq_source_id = kvm_request_irq_source_id(kvm);
+ if (pit->irq_source_id < 0)
+ goto fail_request;
+
+ mutex_init(&pit->pit_state.lock);
+
+ pid = get_pid(task_tgid(current));
+ pid_nr = pid_vnr(pid);
+ put_pid(pid);
+
+ pit->worker = kthread_create_worker(0, "kvm-pit/%d", pid_nr);
+ if (IS_ERR(pit->worker))
+ goto fail_kthread;
+
+ kthread_init_work(&pit->expired, pit_do_work);
+
+ pit->kvm = kvm;
+
+ pit_state = &pit->pit_state;
+ hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ pit_state->timer.function = pit_timer_fn;
+
+ pit_state->irq_ack_notifier.gsi = 0;
+ pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
+	pit->mask_notifier.func = pit_mask_notifier;
+
+ kvm_pit_reset(pit);
+
+ kvm_pit_set_reinject(pit, true);
+
+ mutex_lock(&kvm->slots_lock);
+ kvm_iodevice_init(&pit->dev, &pit_dev_ops);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS,
+ KVM_PIT_MEM_LENGTH, &pit->dev);
+ if (ret < 0)
+ goto fail_register_pit;
+
+ if (flags & KVM_PIT_SPEAKER_DUMMY) {
+ kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,
+ KVM_SPEAKER_BASE_ADDRESS, 4,
+ &pit->speaker_dev);
+ if (ret < 0)
+ goto fail_register_speaker;
+ }
+ mutex_unlock(&kvm->slots_lock);
+
+ return pit;
+
+fail_register_speaker:
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
+fail_register_pit:
+ mutex_unlock(&kvm->slots_lock);
+ kvm_pit_set_reinject(pit, false);
+ kthread_destroy_worker(pit->worker);
+fail_kthread:
+ kvm_free_irq_source_id(kvm, pit->irq_source_id);
+fail_request:
+ kfree(pit);
+ return NULL;
+}
+
+void kvm_free_pit(struct kvm *kvm)
+{
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ if (pit) {
+ mutex_lock(&kvm->slots_lock);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev);
+ mutex_unlock(&kvm->slots_lock);
+ kvm_pit_set_reinject(pit, false);
+ hrtimer_cancel(&pit->pit_state.timer);
+ kthread_destroy_worker(pit->worker);
+ kvm_free_irq_source_id(kvm, pit->irq_source_id);
+ kfree(pit);
+ }
+}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
new file mode 100644
index 000000000..a768212ba
--- /dev/null
+++ b/arch/x86/kvm/i8254.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __I8254_H
+#define __I8254_H
+
+#include <linux/kthread.h>
+
+#include <kvm/iodev.h>
+
+struct kvm_kpit_channel_state {
+ u32 count; /* can be 65536 */
+ u16 latched_count;
+ u8 count_latched;
+ u8 status_latched;
+ u8 status;
+ u8 read_state;
+ u8 write_state;
+ u8 write_latch;
+ u8 rw_mode;
+ u8 mode;
+ u8 bcd; /* not supported */
+ u8 gate; /* timer start */
+ ktime_t count_load_time;
+};
+
+struct kvm_kpit_state {
+ /* All members before "struct mutex lock" are protected by the lock. */
+ struct kvm_kpit_channel_state channels[3];
+ u32 flags;
+ bool is_periodic;
+ s64 period; /* unit: ns */
+ struct hrtimer timer;
+
+ struct mutex lock;
+ atomic_t reinject;
+ atomic_t pending; /* accumulated triggered timers */
+ atomic_t irq_ack;
+ struct kvm_irq_ack_notifier irq_ack_notifier;
+};
+
+struct kvm_pit {
+ struct kvm_io_device dev;
+ struct kvm_io_device speaker_dev;
+ struct kvm *kvm;
+ struct kvm_kpit_state pit_state;
+ int irq_source_id;
+ struct kvm_irq_mask_notifier mask_notifier;
+ struct kthread_worker *worker;
+ struct kthread_work expired;
+};
+
+#define KVM_PIT_BASE_ADDRESS 0x40
+#define KVM_SPEAKER_BASE_ADDRESS 0x61
+#define KVM_PIT_MEM_LENGTH 4
+#define KVM_PIT_FREQ 1193181
+#define KVM_MAX_PIT_INTR_INTERVAL (HZ / 100)
+#define KVM_PIT_CHANNEL_MASK 0x3
+
+struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
+void kvm_free_pit(struct kvm *kvm);
+
+void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
+ int hpet_legacy_start);
+void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject);
+
+#endif
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
new file mode 100644
index 000000000..8dec646e7
--- /dev/null
+++ b/arch/x86/kvm/i8259.c
@@ -0,0 +1,660 @@
+/*
+ * 8259 interrupt controller emulation
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ * Copyright (c) 2007 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ * Authors:
+ * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ * Port from Qemu.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include "irq.h"
+
+#include <linux/kvm_host.h>
+#include "trace.h"
+
+#define pr_pic_unimpl(fmt, ...) \
+ pr_err_ratelimited("pic: " fmt, ## __VA_ARGS__)
+
+static void pic_irq_request(struct kvm *kvm, int level);
+
+static void pic_lock(struct kvm_pic *s)
+ __acquires(&s->lock)
+{
+ spin_lock(&s->lock);
+}
+
+static void pic_unlock(struct kvm_pic *s)
+ __releases(&s->lock)
+{
+ bool wakeup = s->wakeup_needed;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ s->wakeup_needed = false;
+
+ spin_unlock(&s->lock);
+
+ if (wakeup) {
+ kvm_for_each_vcpu(i, vcpu, s->kvm) {
+ if (kvm_apic_accept_pic_intr(vcpu)) {
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ return;
+ }
+ }
+ }
+}
+
+static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
+{
+ s->isr &= ~(1 << irq);
+ if (s != &s->pics_state->pics[0])
+ irq += 8;
+ /*
+	 * We drop the lock while calling the ack notifiers, since ack
+	 * notifier callbacks for assigned devices call back into the PIC
+	 * recursively. Other interrupts may be delivered to the PIC while
+	 * the lock is dropped, but that should be safe since the PIC state
+	 * has already been updated at this stage.
+ */
+ pic_unlock(s->pics_state);
+ kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
+ pic_lock(s->pics_state);
+}
+
+/*
+ * Set the IRQ level. If an edge is detected, the IRR bit is set to 1.
+ * Returns 0 if the interrupt was already pending (coalesced), -1 if
+ * the line is masked, and 1 otherwise.
+ */
+static inline int pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
+{
+ int mask, ret = 1;
+ mask = 1 << irq;
+ if (s->elcr & mask) /* level triggered */
+ if (level) {
+ ret = !(s->irr & mask);
+ s->irr |= mask;
+ s->last_irr |= mask;
+ } else {
+ s->irr &= ~mask;
+ s->last_irr &= ~mask;
+ }
+ else /* edge triggered */
+ if (level) {
+ if ((s->last_irr & mask) == 0) {
+ ret = !(s->irr & mask);
+ s->irr |= mask;
+ }
+ s->last_irr |= mask;
+ } else
+ s->last_irr &= ~mask;
+
+ return (s->imr & mask) ? -1 : ret;
+}
+
+/*
+ * Return the highest priority found in mask (highest = smallest
+ * number), or 8 if no IRQ is pending.
+ */
+static inline int get_priority(struct kvm_kpic_state *s, int mask)
+{
+ int priority;
+ if (mask == 0)
+ return 8;
+ priority = 0;
+ while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0)
+ priority++;
+ return priority;
+}
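+
+/*
+ * Note: priority_add implements the 8259's rotating priority; the scan
+ * above starts at line (priority_add & 7), and rotate-on-EOI sets
+ * priority_add just past the serviced IRQ so that line becomes the
+ * lowest priority.
+ */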
+
+/*
+ * Return the interrupt the PIC wants to deliver, or -1 if none.
+ */
+static int pic_get_irq(struct kvm_kpic_state *s)
+{
+ int mask, cur_priority, priority;
+
+ mask = s->irr & ~s->imr;
+ priority = get_priority(s, mask);
+ if (priority == 8)
+ return -1;
+ /*
+ * compute current priority. If special fully nested mode on the
+ * master, the IRQ coming from the slave is not taken into account
+ * for the priority computation.
+ */
+ mask = s->isr;
+ if (s->special_fully_nested_mode && s == &s->pics_state->pics[0])
+ mask &= ~(1 << 2);
+ cur_priority = get_priority(s, mask);
+ if (priority < cur_priority)
+ /*
+ * higher priority found: an irq should be generated
+ */
+ return (priority + s->priority_add) & 7;
+ else
+ return -1;
+}
+
+/*
+ * Raise the IRQ to the CPU if necessary. Must be called every time
+ * the active IRQ may change.
+ */
+static void pic_update_irq(struct kvm_pic *s)
+{
+ int irq2, irq;
+
+ irq2 = pic_get_irq(&s->pics[1]);
+ if (irq2 >= 0) {
+ /*
+		 * If an IRQ is requested by the slave PIC, signal the master PIC.
+ */
+ pic_set_irq1(&s->pics[0], 2, 1);
+ pic_set_irq1(&s->pics[0], 2, 0);
+ }
+ irq = pic_get_irq(&s->pics[0]);
+ pic_irq_request(s->kvm, irq >= 0);
+}
+
+void kvm_pic_update_irq(struct kvm_pic *s)
+{
+ pic_lock(s);
+ pic_update_irq(s);
+ pic_unlock(s);
+}
+
+int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
+{
+ int ret, irq_level;
+
+ BUG_ON(irq < 0 || irq >= PIC_NUM_PINS);
+
+ pic_lock(s);
+ irq_level = __kvm_irq_line_state(&s->irq_states[irq],
+ irq_source_id, level);
+ ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level);
+ pic_update_irq(s);
+ trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
+ s->pics[irq >> 3].imr, ret == 0);
+ pic_unlock(s);
+
+ return ret;
+}
+
+void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id)
+{
+ int i;
+
+ pic_lock(s);
+ for (i = 0; i < PIC_NUM_PINS; i++)
+ __clear_bit(irq_source_id, &s->irq_states[i]);
+ pic_unlock(s);
+}
+
+/*
+ * acknowledge interrupt 'irq'
+ */
+static inline void pic_intack(struct kvm_kpic_state *s, int irq)
+{
+ s->isr |= 1 << irq;
+ /*
+ * We don't clear a level sensitive interrupt here
+ */
+ if (!(s->elcr & (1 << irq)))
+ s->irr &= ~(1 << irq);
+
+ if (s->auto_eoi) {
+ if (s->rotate_on_auto_eoi)
+ s->priority_add = (irq + 1) & 7;
+ pic_clear_isr(s, irq);
+ }
+
+}
+
+int kvm_pic_read_irq(struct kvm *kvm)
+{
+ int irq, irq2, intno;
+ struct kvm_pic *s = kvm->arch.vpic;
+
+ s->output = 0;
+
+ pic_lock(s);
+ irq = pic_get_irq(&s->pics[0]);
+ if (irq >= 0) {
+ pic_intack(&s->pics[0], irq);
+ if (irq == 2) {
+ irq2 = pic_get_irq(&s->pics[1]);
+ if (irq2 >= 0)
+ pic_intack(&s->pics[1], irq2);
+ else
+ /*
+ * spurious IRQ on slave controller
+ */
+ irq2 = 7;
+ intno = s->pics[1].irq_base + irq2;
+ } else
+ intno = s->pics[0].irq_base + irq;
+ } else {
+ /*
+ * spurious IRQ on host controller
+ */
+ irq = 7;
+ intno = s->pics[0].irq_base + irq;
+ }
+ pic_update_irq(s);
+ pic_unlock(s);
+
+ return intno;
+}
+
+static void kvm_pic_reset(struct kvm_kpic_state *s)
+{
+ int irq;
+ unsigned long i;
+ struct kvm_vcpu *vcpu;
+ u8 edge_irr = s->irr & ~s->elcr;
+ bool found = false;
+
+ s->last_irr = 0;
+ s->irr &= s->elcr;
+ s->imr = 0;
+ s->priority_add = 0;
+ s->special_mask = 0;
+ s->read_reg_select = 0;
+ if (!s->init4) {
+ s->special_fully_nested_mode = 0;
+ s->auto_eoi = 0;
+ }
+ s->init_state = 1;
+
+ kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm)
+ if (kvm_apic_accept_pic_intr(vcpu)) {
+ found = true;
+ break;
+ }
+
+
+ if (!found)
+ return;
+
+ for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
+ if (edge_irr & (1 << irq))
+ pic_clear_isr(s, irq);
+}
+
+static void pic_ioport_write(void *opaque, u32 addr, u32 val)
+{
+ struct kvm_kpic_state *s = opaque;
+ int priority, cmd, irq;
+
+ addr &= 1;
+ if (addr == 0) {
+ if (val & 0x10) {
+ s->init4 = val & 1;
+ if (val & 0x02)
+ pr_pic_unimpl("single mode not supported");
+ if (val & 0x08)
+ pr_pic_unimpl(
+ "level sensitive irq not supported");
+ kvm_pic_reset(s);
+ } else if (val & 0x08) {
+ if (val & 0x04)
+ s->poll = 1;
+ if (val & 0x02)
+ s->read_reg_select = val & 1;
+ if (val & 0x40)
+ s->special_mask = (val >> 5) & 1;
+ } else {
+ cmd = val >> 5;
+ switch (cmd) {
+ case 0:
+ case 4:
+ s->rotate_on_auto_eoi = cmd >> 2;
+ break;
+ case 1: /* end of interrupt */
+ case 5:
+ priority = get_priority(s, s->isr);
+ if (priority != 8) {
+ irq = (priority + s->priority_add) & 7;
+ if (cmd == 5)
+ s->priority_add = (irq + 1) & 7;
+ pic_clear_isr(s, irq);
+ pic_update_irq(s->pics_state);
+ }
+ break;
+ case 3:
+ irq = val & 7;
+ pic_clear_isr(s, irq);
+ pic_update_irq(s->pics_state);
+ break;
+ case 6:
+ s->priority_add = (val + 1) & 7;
+ pic_update_irq(s->pics_state);
+ break;
+ case 7:
+ irq = val & 7;
+ s->priority_add = (irq + 1) & 7;
+ pic_clear_isr(s, irq);
+ pic_update_irq(s->pics_state);
+ break;
+ default:
+ break; /* no operation */
+ }
+ }
+ } else
+ switch (s->init_state) {
+ case 0: { /* normal mode */
+ u8 imr_diff = s->imr ^ val,
+ off = (s == &s->pics_state->pics[0]) ? 0 : 8;
+ s->imr = val;
+ for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
+ if (imr_diff & (1 << irq))
+ kvm_fire_mask_notifiers(
+ s->pics_state->kvm,
+ SELECT_PIC(irq + off),
+ irq + off,
+ !!(s->imr & (1 << irq)));
+ pic_update_irq(s->pics_state);
+ break;
+ }
+ case 1:
+ s->irq_base = val & 0xf8;
+ s->init_state = 2;
+ break;
+ case 2:
+ if (s->init4)
+ s->init_state = 3;
+ else
+ s->init_state = 0;
+ break;
+ case 3:
+ s->special_fully_nested_mode = (val >> 4) & 1;
+ s->auto_eoi = (val >> 1) & 1;
+ s->init_state = 0;
+ break;
+ }
+}
+
+static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
+{
+ int ret;
+
+ ret = pic_get_irq(s);
+ if (ret >= 0) {
+ if (addr1 >> 7) {
+ s->pics_state->pics[0].isr &= ~(1 << 2);
+ s->pics_state->pics[0].irr &= ~(1 << 2);
+ }
+ s->irr &= ~(1 << ret);
+ pic_clear_isr(s, ret);
+ if (addr1 >> 7 || ret != 2)
+ pic_update_irq(s->pics_state);
+		/* Bit 7 set means there's an interrupt */
+ ret |= 0x80;
+ } else {
+		/* Bit 7 clear means there's no interrupt */
+ ret = 0x07;
+ pic_update_irq(s->pics_state);
+ }
+
+ return ret;
+}
+
+static u32 pic_ioport_read(void *opaque, u32 addr)
+{
+ struct kvm_kpic_state *s = opaque;
+ int ret;
+
+ if (s->poll) {
+ ret = pic_poll_read(s, addr);
+ s->poll = 0;
+ } else
+ if ((addr & 1) == 0)
+ if (s->read_reg_select)
+ ret = s->isr;
+ else
+ ret = s->irr;
+ else
+ ret = s->imr;
+ return ret;
+}
+
+static void elcr_ioport_write(void *opaque, u32 val)
+{
+ struct kvm_kpic_state *s = opaque;
+ s->elcr = val & s->elcr_mask;
+}
+
+static u32 elcr_ioport_read(void *opaque)
+{
+ struct kvm_kpic_state *s = opaque;
+ return s->elcr;
+}
+
+static int picdev_write(struct kvm_pic *s,
+ gpa_t addr, int len, const void *val)
+{
+ unsigned char data = *(unsigned char *)val;
+
+ if (len != 1) {
+ pr_pic_unimpl("non byte write\n");
+ return 0;
+ }
+ switch (addr) {
+ case 0x20:
+ case 0x21:
+ pic_lock(s);
+ pic_ioport_write(&s->pics[0], addr, data);
+ pic_unlock(s);
+ break;
+ case 0xa0:
+ case 0xa1:
+ pic_lock(s);
+ pic_ioport_write(&s->pics[1], addr, data);
+ pic_unlock(s);
+ break;
+ case 0x4d0:
+ case 0x4d1:
+ pic_lock(s);
+ elcr_ioport_write(&s->pics[addr & 1], data);
+ pic_unlock(s);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ return 0;
+}
+
+static int picdev_read(struct kvm_pic *s,
+ gpa_t addr, int len, void *val)
+{
+ unsigned char *data = (unsigned char *)val;
+
+ if (len != 1) {
+ memset(val, 0, len);
+ pr_pic_unimpl("non byte read\n");
+ return 0;
+ }
+ switch (addr) {
+ case 0x20:
+ case 0x21:
+ case 0xa0:
+ case 0xa1:
+ pic_lock(s);
+ *data = pic_ioport_read(&s->pics[addr >> 7], addr);
+ pic_unlock(s);
+ break;
+ case 0x4d0:
+ case 0x4d1:
+ pic_lock(s);
+ *data = elcr_ioport_read(&s->pics[addr & 1]);
+ pic_unlock(s);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ return 0;
+}
+
+static int picdev_master_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, const void *val)
+{
+ return picdev_write(container_of(dev, struct kvm_pic, dev_master),
+ addr, len, val);
+}
+
+static int picdev_master_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, void *val)
+{
+ return picdev_read(container_of(dev, struct kvm_pic, dev_master),
+ addr, len, val);
+}
+
+static int picdev_slave_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, const void *val)
+{
+ return picdev_write(container_of(dev, struct kvm_pic, dev_slave),
+ addr, len, val);
+}
+
+static int picdev_slave_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, void *val)
+{
+ return picdev_read(container_of(dev, struct kvm_pic, dev_slave),
+ addr, len, val);
+}
+
+static int picdev_elcr_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, const void *val)
+{
+ return picdev_write(container_of(dev, struct kvm_pic, dev_elcr),
+ addr, len, val);
+}
+
+static int picdev_elcr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, void *val)
+{
+ return picdev_read(container_of(dev, struct kvm_pic, dev_elcr),
+ addr, len, val);
+}
+
+/*
+ * Callback invoked when PIC0's IRQ output status changes.
+ */
+static void pic_irq_request(struct kvm *kvm, int level)
+{
+ struct kvm_pic *s = kvm->arch.vpic;
+
+ if (!s->output)
+ s->wakeup_needed = true;
+ s->output = level;
+}
+
+static const struct kvm_io_device_ops picdev_master_ops = {
+ .read = picdev_master_read,
+ .write = picdev_master_write,
+};
+
+static const struct kvm_io_device_ops picdev_slave_ops = {
+ .read = picdev_slave_read,
+ .write = picdev_slave_write,
+};
+
+static const struct kvm_io_device_ops picdev_elcr_ops = {
+ .read = picdev_elcr_read,
+ .write = picdev_elcr_write,
+};
+
+int kvm_pic_init(struct kvm *kvm)
+{
+ struct kvm_pic *s;
+ int ret;
+
+ s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL_ACCOUNT);
+ if (!s)
+ return -ENOMEM;
+ spin_lock_init(&s->lock);
+ s->kvm = kvm;
+ s->pics[0].elcr_mask = 0xf8;
+ s->pics[1].elcr_mask = 0xde;
+ s->pics[0].pics_state = s;
+ s->pics[1].pics_state = s;
+
+ /*
+ * Initialize PIO device
+ */
+ kvm_iodevice_init(&s->dev_master, &picdev_master_ops);
+ kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops);
+ kvm_iodevice_init(&s->dev_elcr, &picdev_elcr_ops);
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2,
+ &s->dev_master);
+ if (ret < 0)
+ goto fail_unlock;
+
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0xa0, 2, &s->dev_slave);
+ if (ret < 0)
+ goto fail_unreg_2;
+
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_elcr);
+ if (ret < 0)
+ goto fail_unreg_1;
+
+ mutex_unlock(&kvm->slots_lock);
+
+ kvm->arch.vpic = s;
+
+ return 0;
+
+fail_unreg_1:
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave);
+
+fail_unreg_2:
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_master);
+
+fail_unlock:
+ mutex_unlock(&kvm->slots_lock);
+
+ kfree(s);
+
+ return ret;
+}
+
+void kvm_pic_destroy(struct kvm *kvm)
+{
+ struct kvm_pic *vpic = kvm->arch.vpic;
+
+ if (!vpic)
+ return;
+
+ mutex_lock(&kvm->slots_lock);
+ kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master);
+ kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave);
+ kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_elcr);
+ mutex_unlock(&kvm->slots_lock);
+
+ kvm->arch.vpic = NULL;
+ kfree(vpic);
+}
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
new file mode 100644
index 000000000..995eb5054
--- /dev/null
+++ b/arch/x86/kvm/ioapic.c
@@ -0,0 +1,776 @@
+/*
+ * Copyright (C) 2001 MandrakeSoft S.A.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * MandrakeSoft S.A.
+ * 43, rue d'Aboukir
+ * 75002 Paris - France
+ * http://www.linux-mandrake.com/
+ * http://www.mandrakesoft.com/
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Yunhong Jiang <yunhong.jiang@intel.com>
+ * Yaozu (Eddie) Dong <eddie.dong@intel.com>
+ * Based on Xen 3.1 code.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+#include <linux/hrtimer.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/nospec.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <trace/events/kvm.h>
+
+#include "ioapic.h"
+#include "lapic.h"
+#include "irq.h"
+
+static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
+ bool line_status);
+
+static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu,
+ struct kvm_ioapic *ioapic,
+ int trigger_mode,
+ int pin);
+
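+/*
+ * Indirect register reads go through the IOREGSEL/IOWIN pair: selectors
+ * 0x10 and up map onto the 64-bit redirection table, two 32-bit windows
+ * per entry, with an odd selector reading the high dword.
+ */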
+static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic)
+{
+ unsigned long result = 0;
+
+ switch (ioapic->ioregsel) {
+ case IOAPIC_REG_VERSION:
+ result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16)
+ | (IOAPIC_VERSION_ID & 0xff));
+ break;
+
+ case IOAPIC_REG_APIC_ID:
+ case IOAPIC_REG_ARB_ID:
+ result = ((ioapic->id & 0xf) << 24);
+ break;
+
+ default:
+ {
+ u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
+ u64 redir_content = ~0ULL;
+
+ if (redir_index < IOAPIC_NUM_PINS) {
+ u32 index = array_index_nospec(
+ redir_index, IOAPIC_NUM_PINS);
+
+ redir_content = ioapic->redirtbl[index].bits;
+ }
+
+ result = (ioapic->ioregsel & 0x1) ?
+ (redir_content >> 32) & 0xffffffff :
+ redir_content & 0xffffffff;
+ break;
+ }
+ }
+
+ return result;
+}
+
+static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic)
+{
+ ioapic->rtc_status.pending_eoi = 0;
+ bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_IDS);
+}
+
+static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic);
+
+static void rtc_status_pending_eoi_check_valid(struct kvm_ioapic *ioapic)
+{
+ if (WARN_ON(ioapic->rtc_status.pending_eoi < 0))
+ kvm_rtc_eoi_tracking_restore_all(ioapic);
+}
+
+static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
+{
+ bool new_val, old_val;
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+ struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
+ union kvm_ioapic_redirect_entry *e;
+
+ e = &ioapic->redirtbl[RTC_GSI];
+ if (!kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
+ e->fields.dest_id,
+ kvm_lapic_irq_dest_mode(!!e->fields.dest_mode)))
+ return;
+
+ new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector);
+ old_val = test_bit(vcpu->vcpu_id, dest_map->map);
+
+ if (new_val == old_val)
+ return;
+
+ if (new_val) {
+ __set_bit(vcpu->vcpu_id, dest_map->map);
+ dest_map->vectors[vcpu->vcpu_id] = e->fields.vector;
+ ioapic->rtc_status.pending_eoi++;
+ } else {
+ __clear_bit(vcpu->vcpu_id, dest_map->map);
+ ioapic->rtc_status.pending_eoi--;
+ rtc_status_pending_eoi_check_valid(ioapic);
+ }
+}
+
+void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
+{
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ __rtc_irq_eoi_tracking_restore_one(vcpu);
+ spin_unlock(&ioapic->lock);
+}
+
+static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ if (RTC_GSI >= IOAPIC_NUM_PINS)
+ return;
+
+ rtc_irq_eoi_tracking_reset(ioapic);
+ kvm_for_each_vcpu(i, vcpu, ioapic->kvm)
+ __rtc_irq_eoi_tracking_restore_one(vcpu);
+}
+
+static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu,
+ int vector)
+{
+ struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
+
+ /* RTC special handling */
+ if (test_bit(vcpu->vcpu_id, dest_map->map) &&
+ (vector == dest_map->vectors[vcpu->vcpu_id]) &&
+ (test_and_clear_bit(vcpu->vcpu_id,
+ ioapic->rtc_status.dest_map.map))) {
+ --ioapic->rtc_status.pending_eoi;
+ rtc_status_pending_eoi_check_valid(ioapic);
+ }
+}
+
+static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic)
+{
+ if (ioapic->rtc_status.pending_eoi > 0)
+ return true; /* coalesced */
+
+ return false;
+}
+
+static void ioapic_lazy_update_eoi(struct kvm_ioapic *ioapic, int irq)
+{
+ unsigned long i;
+ struct kvm_vcpu *vcpu;
+ union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
+
+ kvm_for_each_vcpu(i, vcpu, ioapic->kvm) {
+ if (!kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
+ entry->fields.dest_id,
+ entry->fields.dest_mode) ||
+ kvm_apic_pending_eoi(vcpu, entry->fields.vector))
+ continue;
+
+ /*
+		 * If the vector no longer has a pending EOI in the LAPIC,
+		 * record the EOI for this vector now.
+ */
+ rtc_irq_eoi(ioapic, vcpu, entry->fields.vector);
+ break;
+ }
+}
+
+static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
+ int irq_level, bool line_status)
+{
+ union kvm_ioapic_redirect_entry entry;
+ u32 mask = 1 << irq;
+ u32 old_irr;
+ int edge, ret;
+
+ entry = ioapic->redirtbl[irq];
+ edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
+
+ if (!irq_level) {
+ ioapic->irr &= ~mask;
+ ret = 1;
+ goto out;
+ }
+
+ /*
+	 * AMD SVM AVIC accelerates the EOI write iff the interrupt is
+	 * edge-triggered, in which case the in-kernel IOAPIC never sees
+	 * the EOI. In that case, do a lazy update of the pending EOI
+	 * state when setting the IOAPIC irq.
+ */
+ if (edge && kvm_apicv_activated(ioapic->kvm))
+ ioapic_lazy_update_eoi(ioapic, irq);
+
+ /*
+ * Return 0 for coalesced interrupts; for edge-triggered interrupts,
+ * this only happens if a previous edge has not been delivered due
+ * to masking. For level interrupts, the remote_irr field tells
+ * us if the interrupt is waiting for an EOI.
+ *
+ * RTC is special: it is edge-triggered, but userspace likes to know
+ * if it has been already ack-ed via EOI because coalesced RTC
+ * interrupts lead to time drift in Windows guests. So we track
+ * EOI manually for the RTC interrupt.
+ */
+ if (irq == RTC_GSI && line_status &&
+ rtc_irq_check_coalesced(ioapic)) {
+ ret = 0;
+ goto out;
+ }
+
+ old_irr = ioapic->irr;
+ ioapic->irr |= mask;
+ if (edge) {
+ ioapic->irr_delivered &= ~mask;
+ if (old_irr == ioapic->irr) {
+ ret = 0;
+ goto out;
+ }
+ }
+
+ ret = ioapic_service(ioapic, irq, line_status);
+
+out:
+ trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
+ return ret;
+}
+
+static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
+{
+ u32 idx;
+
+ rtc_irq_eoi_tracking_reset(ioapic);
+ for_each_set_bit(idx, &irr, IOAPIC_NUM_PINS)
+ ioapic_set_irq(ioapic, idx, 1, true);
+
+ kvm_rtc_eoi_tracking_restore_all(ioapic);
+}
+
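+/*
+ * Compute the set of vectors for which this vCPU must intercept EOIs:
+ * level-triggered entries, pins with ack notifiers and the RTC pin all
+ * need the in-kernel IOAPIC to observe the EOI broadcast.
+ */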
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors)
+{
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+ struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
+ union kvm_ioapic_redirect_entry *e;
+ int index;
+
+ spin_lock(&ioapic->lock);
+
+ /* Make sure we see any missing RTC EOI */
+ if (test_bit(vcpu->vcpu_id, dest_map->map))
+ __set_bit(dest_map->vectors[vcpu->vcpu_id],
+ ioapic_handled_vectors);
+
+ for (index = 0; index < IOAPIC_NUM_PINS; index++) {
+ e = &ioapic->redirtbl[index];
+ if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
+ kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) ||
+ index == RTC_GSI) {
+ u16 dm = kvm_lapic_irq_dest_mode(!!e->fields.dest_mode);
+
+ if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
+ e->fields.dest_id, dm) ||
+ kvm_apic_pending_eoi(vcpu, e->fields.vector))
+ __set_bit(e->fields.vector,
+ ioapic_handled_vectors);
+ }
+ }
+ spin_unlock(&ioapic->lock);
+}
+
+void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm)
+{
+ if (!ioapic_in_kernel(kvm))
+ return;
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
+{
+ unsigned index;
+ bool mask_before, mask_after;
+ union kvm_ioapic_redirect_entry *e;
+ int old_remote_irr, old_delivery_status, old_dest_id, old_dest_mode;
+ DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
+
+ switch (ioapic->ioregsel) {
+ case IOAPIC_REG_VERSION:
+ /* Writes are ignored. */
+ break;
+
+ case IOAPIC_REG_APIC_ID:
+ ioapic->id = (val >> 24) & 0xf;
+ break;
+
+ case IOAPIC_REG_ARB_ID:
+ break;
+
+ default:
+ index = (ioapic->ioregsel - 0x10) >> 1;
+
+ if (index >= IOAPIC_NUM_PINS)
+ return;
+ index = array_index_nospec(index, IOAPIC_NUM_PINS);
+ e = &ioapic->redirtbl[index];
+ mask_before = e->fields.mask;
+ /* Preserve read-only fields */
+ old_remote_irr = e->fields.remote_irr;
+ old_delivery_status = e->fields.delivery_status;
+ old_dest_id = e->fields.dest_id;
+ old_dest_mode = e->fields.dest_mode;
+ if (ioapic->ioregsel & 1) {
+ e->bits &= 0xffffffff;
+ e->bits |= (u64) val << 32;
+ } else {
+ e->bits &= ~0xffffffffULL;
+ e->bits |= (u32) val;
+ }
+ e->fields.remote_irr = old_remote_irr;
+ e->fields.delivery_status = old_delivery_status;
+
+ /*
+ * Some OSes (Linux, Xen) assume that Remote IRR bit will
+ * be cleared by IOAPIC hardware when the entry is configured
+ * as edge-triggered. This behavior is used to simulate an
+ * explicit EOI on IOAPICs that don't have the EOI register.
+ */
+ if (e->fields.trig_mode == IOAPIC_EDGE_TRIG)
+ e->fields.remote_irr = 0;
+
+ mask_after = e->fields.mask;
+ if (mask_before != mask_after)
+ kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
+ if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG &&
+ ioapic->irr & (1 << index) && !e->fields.mask && !e->fields.remote_irr) {
+ /*
+ * Pending status in irr may be outdated: the IRQ line may have
+ * already been deasserted by a device while the IRQ was masked.
+ * This occurs, for instance, if the interrupt is handled in a
+ * Linux guest as a oneshot interrupt (IRQF_ONESHOT). In this
+ * case the guest acknowledges the interrupt to the device in
+ * its threaded irq handler, i.e. after the EOI but before
+ * unmasking, so at the time of unmasking the IRQ line is
+ * already down but our pending irr bit is still set. In such
+ * cases, injecting this pending interrupt to the guest is
+ * buggy: the guest will receive an extra unwanted interrupt.
+ *
+ * So we need to check here if the IRQ is actually still pending.
+ * As we are generally not able to probe the IRQ line status
+ * directly, we do it through irqfd resampler. Namely, we clear
+ * the pending status and notify the resampler that this interrupt
+ * is done, without actually injecting it into the guest. If the
+ * IRQ line is actually already deasserted, we are done. If it is
+ * still asserted, a new interrupt will be shortly triggered
+ * through irqfd and injected into the guest.
+ *
+ * If, however, it's not possible to resample (no irqfd resampler
+ * registered for this irq), then unconditionally inject this
+ * pending interrupt into the guest, so the guest will not miss
+ * an interrupt, although may get an extra unwanted interrupt.
+ */
+ if (kvm_notify_irqfd_resampler(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index))
+ ioapic->irr &= ~(1 << index);
+ else
+ ioapic_service(ioapic, index, false);
+ }
+ if (e->fields.delivery_mode == APIC_DM_FIXED) {
+ struct kvm_lapic_irq irq;
+
+ irq.vector = e->fields.vector;
+ irq.delivery_mode = e->fields.delivery_mode << 8;
+ irq.dest_mode =
+ kvm_lapic_irq_dest_mode(!!e->fields.dest_mode);
+ irq.level = false;
+ irq.trig_mode = e->fields.trig_mode;
+ irq.shorthand = APIC_DEST_NOSHORT;
+ irq.dest_id = e->fields.dest_id;
+ irq.msi_redir_hint = false;
+ bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
+ kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq,
+ vcpu_bitmap);
+ if (old_dest_mode != e->fields.dest_mode ||
+ old_dest_id != e->fields.dest_id) {
+ /*
+ * Update vcpu_bitmap with vcpus specified in
+ * the previous request as well. This is done to
+ * keep ioapic_handled_vectors synchronized.
+ */
+ irq.dest_id = old_dest_id;
+ irq.dest_mode =
+ kvm_lapic_irq_dest_mode(
+ !!e->fields.dest_mode);
+ kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq,
+ vcpu_bitmap);
+ }
+ kvm_make_scan_ioapic_request_mask(ioapic->kvm,
+ vcpu_bitmap);
+ } else {
+ kvm_make_scan_ioapic_request(ioapic->kvm);
+ }
+ break;
+ }
+}
+
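+/*
+ * Deliver the interrupt programmed into redirection entry @irq to the
+ * local APIC(s). Returns < 0 if the entry is masked or a level-triggered
+ * interrupt is still awaiting its EOI (remote_irr set).
+ */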
+static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
+{
+ union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
+ struct kvm_lapic_irq irqe;
+ int ret;
+
+ if (entry->fields.mask ||
+ (entry->fields.trig_mode == IOAPIC_LEVEL_TRIG &&
+ entry->fields.remote_irr))
+ return -1;
+
+ irqe.dest_id = entry->fields.dest_id;
+ irqe.vector = entry->fields.vector;
+ irqe.dest_mode = kvm_lapic_irq_dest_mode(!!entry->fields.dest_mode);
+ irqe.trig_mode = entry->fields.trig_mode;
+ irqe.delivery_mode = entry->fields.delivery_mode << 8;
+ irqe.level = 1;
+ irqe.shorthand = APIC_DEST_NOSHORT;
+ irqe.msi_redir_hint = false;
+
+ if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
+ ioapic->irr_delivered |= 1 << irq;
+
+ if (irq == RTC_GSI && line_status) {
+ /*
+ * pending_eoi cannot ever become negative (see
+ * rtc_status_pending_eoi_check_valid) and the caller
+		 * ensures this path is only taken when it is zero (i.e.
+		 * when rtc_irq_check_coalesced() returns false).
+ */
+ BUG_ON(ioapic->rtc_status.pending_eoi != 0);
+ ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe,
+ &ioapic->rtc_status.dest_map);
+ ioapic->rtc_status.pending_eoi = (ret < 0 ? 0 : ret);
+ } else
+ ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL);
+
+ if (ret && irqe.trig_mode == IOAPIC_LEVEL_TRIG)
+ entry->fields.remote_irr = 1;
+
+ return ret;
+}
+
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
+ int level, bool line_status)
+{
+ int ret, irq_level;
+
+ BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
+
+ spin_lock(&ioapic->lock);
+ irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
+ irq_source_id, level);
+ ret = ioapic_set_irq(ioapic, irq, irq_level, line_status);
+
+ spin_unlock(&ioapic->lock);
+
+ return ret;
+}
+
+void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
+{
+ int i;
+
+ spin_lock(&ioapic->lock);
+ for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
+ __clear_bit(irq_source_id, &ioapic->irq_states[i]);
+ spin_unlock(&ioapic->lock);
+}
+
+static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
+{
+ int i;
+ struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic,
+ eoi_inject.work);
+ spin_lock(&ioapic->lock);
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+ if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG)
+ continue;
+
+ if (ioapic->irr & (1 << i) && !ent->fields.remote_irr)
+ ioapic_service(ioapic, i, false);
+ }
+ spin_unlock(&ioapic->lock);
+}
+
+#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000
+static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu,
+ struct kvm_ioapic *ioapic,
+ int trigger_mode,
+ int pin)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[pin];
+
+ /*
+	 * Drop the lock while calling the ack notifiers, because ack
+	 * notifier callbacks for assigned devices call back into the
+	 * IOAPIC recursively. Since remote_irr is cleared only after the
+	 * notifiers have run, if the same vector is delivered while the
+	 * lock is dropped it is latched in irr and delivered after the
+	 * ack notifier returns.
+ */
+ spin_unlock(&ioapic->lock);
+ kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, pin);
+ spin_lock(&ioapic->lock);
+
+ if (trigger_mode != IOAPIC_LEVEL_TRIG ||
+ kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)
+ return;
+
+ ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
+ ent->fields.remote_irr = 0;
+ if (!ent->fields.mask && (ioapic->irr & (1 << pin))) {
+ ++ioapic->irq_eoi[pin];
+ if (ioapic->irq_eoi[pin] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {
+ /*
+ * Real hardware does not deliver the interrupt
+ * immediately during eoi broadcast, and this
+ * lets a buggy guest make slow progress
+ * even if it does not correctly handle a
+ * level-triggered interrupt. Emulate this
+ * behavior if we detect an interrupt storm.
+ */
+ schedule_delayed_work(&ioapic->eoi_inject, HZ / 100);
+ ioapic->irq_eoi[pin] = 0;
+ trace_kvm_ioapic_delayed_eoi_inj(ent->bits);
+ } else {
+ ioapic_service(ioapic, pin, false);
+ }
+ } else {
+ ioapic->irq_eoi[pin] = 0;
+ }
+}
+
+void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode)
+{
+ int i;
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ rtc_irq_eoi(ioapic, vcpu, vector);
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+ if (ent->fields.vector != vector)
+ continue;
+ kvm_ioapic_update_eoi_one(vcpu, ioapic, trigger_mode, i);
+ }
+ spin_unlock(&ioapic->lock);
+}
+
+static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_ioapic, dev);
+}
+
+static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr)
+{
+	return addr >= ioapic->base_address &&
+	       addr < ioapic->base_address + IOAPIC_MEM_LENGTH;
+}
+
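+/*
+ * MMIO handlers for the IOAPIC window at base_address (0xfec00000 by
+ * default): only IOREGSEL (offset 0x00) and IOWIN (offset 0x10) are
+ * implemented; other offsets read as zero and ignore writes.
+ */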
+static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+ gpa_t addr, int len, void *val)
+{
+ struct kvm_ioapic *ioapic = to_ioapic(this);
+ u32 result;
+ if (!ioapic_in_range(ioapic, addr))
+ return -EOPNOTSUPP;
+
+ ASSERT(!(addr & 0xf)); /* check alignment */
+
+ addr &= 0xff;
+ spin_lock(&ioapic->lock);
+ switch (addr) {
+ case IOAPIC_REG_SELECT:
+ result = ioapic->ioregsel;
+ break;
+
+ case IOAPIC_REG_WINDOW:
+ result = ioapic_read_indirect(ioapic);
+ break;
+
+ default:
+ result = 0;
+ break;
+ }
+ spin_unlock(&ioapic->lock);
+
+ switch (len) {
+ case 8:
+ *(u64 *) val = result;
+ break;
+ case 1:
+ case 2:
+ case 4:
+ memcpy(val, (char *)&result, len);
+ break;
+ default:
+ printk(KERN_WARNING "ioapic: wrong length %d\n", len);
+ }
+ return 0;
+}
+
+static int ioapic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+ gpa_t addr, int len, const void *val)
+{
+ struct kvm_ioapic *ioapic = to_ioapic(this);
+ u32 data;
+ if (!ioapic_in_range(ioapic, addr))
+ return -EOPNOTSUPP;
+
+ ASSERT(!(addr & 0xf)); /* check alignment */
+
+ switch (len) {
+ case 8:
+ case 4:
+ data = *(u32 *) val;
+ break;
+ case 2:
+ data = *(u16 *) val;
+ break;
+ case 1:
+ data = *(u8 *) val;
+ break;
+ default:
+ printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
+ return 0;
+ }
+
+ addr &= 0xff;
+ spin_lock(&ioapic->lock);
+ switch (addr) {
+ case IOAPIC_REG_SELECT:
+ ioapic->ioregsel = data & 0xFF; /* 8-bit register */
+ break;
+
+ case IOAPIC_REG_WINDOW:
+ ioapic_write_indirect(ioapic, data);
+ break;
+
+ default:
+ break;
+ }
+ spin_unlock(&ioapic->lock);
+ return 0;
+}
+
+static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
+{
+ int i;
+
+ cancel_delayed_work_sync(&ioapic->eoi_inject);
+ for (i = 0; i < IOAPIC_NUM_PINS; i++)
+ ioapic->redirtbl[i].fields.mask = 1;
+ ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
+ ioapic->ioregsel = 0;
+ ioapic->irr = 0;
+ ioapic->irr_delivered = 0;
+ ioapic->id = 0;
+ memset(ioapic->irq_eoi, 0x00, sizeof(ioapic->irq_eoi));
+ rtc_irq_eoi_tracking_reset(ioapic);
+}
+
+static const struct kvm_io_device_ops ioapic_mmio_ops = {
+ .read = ioapic_mmio_read,
+ .write = ioapic_mmio_write,
+};
+
+int kvm_ioapic_init(struct kvm *kvm)
+{
+ struct kvm_ioapic *ioapic;
+ int ret;
+
+ ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL_ACCOUNT);
+ if (!ioapic)
+ return -ENOMEM;
+ spin_lock_init(&ioapic->lock);
+ INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
+ kvm->arch.vioapic = ioapic;
+ kvm_ioapic_reset(ioapic);
+ kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
+ ioapic->kvm = kvm;
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, ioapic->base_address,
+ IOAPIC_MEM_LENGTH, &ioapic->dev);
+ mutex_unlock(&kvm->slots_lock);
+ if (ret < 0) {
+ kvm->arch.vioapic = NULL;
+ kfree(ioapic);
+ }
+
+ return ret;
+}
+
+void kvm_ioapic_destroy(struct kvm *kvm)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ if (!ioapic)
+ return;
+
+ cancel_delayed_work_sync(&ioapic->eoi_inject);
+ mutex_lock(&kvm->slots_lock);
+ kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+ mutex_unlock(&kvm->slots_lock);
+ kvm->arch.vioapic = NULL;
+ kfree(ioapic);
+}
+
+void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
+ state->irr &= ~ioapic->irr_delivered;
+ spin_unlock(&ioapic->lock);
+}
+
+void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
+ ioapic->irr = 0;
+ ioapic->irr_delivered = 0;
+ kvm_make_scan_ioapic_request(kvm);
+ kvm_ioapic_inject_all(ioapic, state->irr);
+ spin_unlock(&ioapic->lock);
+}
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
new file mode 100644
index 000000000..539333ac4
--- /dev/null
+++ b/arch/x86/kvm/ioapic.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_IO_APIC_H
+#define __KVM_IO_APIC_H
+
+#include <linux/kvm_host.h>
+#include <kvm/iodev.h>
+#include "irq.h"
+
+struct kvm;
+struct kvm_vcpu;
+
+#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
+#define MAX_NR_RESERVED_IOAPIC_PINS KVM_MAX_IRQ_ROUTES
+#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
+#define IOAPIC_EDGE_TRIG 0
+#define IOAPIC_LEVEL_TRIG 1
+
+#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
+#define IOAPIC_MEM_LENGTH 0x100
+
+/* Direct registers. */
+#define IOAPIC_REG_SELECT 0x00
+#define IOAPIC_REG_WINDOW 0x10
+
+/* Indirect registers. */
+#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
+#define IOAPIC_REG_VERSION 0x01
+#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
+
+/* ioapic delivery mode */
+#define IOAPIC_FIXED 0x0
+#define IOAPIC_LOWEST_PRIORITY 0x1
+#define IOAPIC_PMI 0x2
+#define IOAPIC_NMI 0x4
+#define IOAPIC_INIT 0x5
+#define IOAPIC_EXTINT 0x7
+
+#define RTC_GSI 8
+
+struct dest_map {
+ /* vcpu bitmap where IRQ has been sent */
+ DECLARE_BITMAP(map, KVM_MAX_VCPU_IDS);
+
+ /*
+ * Vector sent to a given vcpu, only valid when
+ * the vcpu's bit in map is set
+ */
+ u8 vectors[KVM_MAX_VCPU_IDS];
+};
+
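+/*
+ * Bookkeeping for the RTC interrupt (GSI 8): how many delivered RTC
+ * interrupts still await an EOI and which vCPUs they went to, so that
+ * coalesced RTC interrupts can be reported to userspace.
+ */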
+struct rtc_status {
+ int pending_eoi;
+ struct dest_map dest_map;
+};
+
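+/*
+ * One 64-bit redirection table entry, laid out to match the format
+ * programmed through the indirect register window.
+ */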
+union kvm_ioapic_redirect_entry {
+ u64 bits;
+ struct {
+ u8 vector;
+ u8 delivery_mode:3;
+ u8 dest_mode:1;
+ u8 delivery_status:1;
+ u8 polarity:1;
+ u8 remote_irr:1;
+ u8 trig_mode:1;
+ u8 mask:1;
+ u8 reserve:7;
+ u8 reserved[4];
+ u8 dest_id;
+ } fields;
+};
+
+struct kvm_ioapic {
+ u64 base_address;
+ u32 ioregsel;
+ u32 id;
+ u32 irr;
+ u32 pad;
+ union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS];
+ unsigned long irq_states[IOAPIC_NUM_PINS];
+ struct kvm_io_device dev;
+ struct kvm *kvm;
+ spinlock_t lock;
+ struct rtc_status rtc_status;
+ struct delayed_work eoi_inject;
+ u32 irq_eoi[IOAPIC_NUM_PINS];
+ u32 irr_delivered;
+};
+
+#ifdef DEBUG
+#define ASSERT(x) \
+do { \
+ if (!(x)) { \
+ printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
+ __FILE__, __LINE__, #x); \
+ BUG(); \
+ } \
+} while (0)
+#else
+#define ASSERT(x) do { } while (0)
+#endif
+
+static inline int ioapic_in_kernel(struct kvm *kvm)
+{
+ return irqchip_kernel(kvm);
+}
+
+void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
+void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
+ int trigger_mode);
+int kvm_ioapic_init(struct kvm *kvm);
+void kvm_ioapic_destroy(struct kvm *kvm);
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
+ int level, bool line_status);
+void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
+void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors);
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors);
+#endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
new file mode 100644
index 000000000..b2c397dd2
--- /dev/null
+++ b/arch/x86/kvm/irq.c
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * irq.c: API for in kernel interrupt controller
+ * Copyright (c) 2007, Intel Corporation.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/export.h>
+#include <linux/kvm_host.h>
+
+#include "irq.h"
+#include "i8254.h"
+#include "x86.h"
+#include "xen.h"
+
+/*
+ * check if there are pending timer events
+ * to be processed.
+ */
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ int r = 0;
+
+ if (lapic_in_kernel(vcpu))
+ r = apic_has_pending_timer(vcpu);
+ if (kvm_xen_timer_enabled(vcpu))
+ r += kvm_xen_has_pending_timer(vcpu);
+
+ return r;
+}
+
+/*
+ * check if there is a pending userspace external interrupt
+ */
+static int pending_userspace_extint(struct kvm_vcpu *v)
+{
+ return v->arch.pending_external_vector != -1;
+}
+
+/*
+ * check if there is a pending interrupt from a
+ * non-APIC source, without intack.
+ */
+int kvm_cpu_has_extint(struct kvm_vcpu *v)
+{
+ /*
+ * FIXME: interrupt.injected represents an interrupt whose
+ * side-effects have already been applied (e.g. bit from IRR
+ * already moved to ISR). Therefore, it is incorrect to rely
+ * on interrupt.injected to know if there is a pending
+ * interrupt in the user-mode LAPIC.
+	 * This leads to nVMX/nSVM not being able to distinguish
+	 * whether it should exit from L2 to L1 on EXTERNAL_INTERRUPT
+	 * because of a pending interrupt or re-inject an already
+	 * injected interrupt.
+ */
+ if (!lapic_in_kernel(v))
+ return v->arch.interrupt.injected;
+
+ if (kvm_xen_has_interrupt(v))
+ return 1;
+
+ if (!kvm_apic_accept_pic_intr(v))
+ return 0;
+
+ if (irqchip_split(v->kvm))
+ return pending_userspace_extint(v);
+ else
+ return v->kvm->arch.vpic->output;
+}
+
+/*
+ * check if there is an injectable interrupt:
+ * when virtual interrupt delivery is enabled,
+ * interrupts from the apic are handled by hardware;
+ * we don't need to check them here.
+ */
+int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
+{
+ if (kvm_cpu_has_extint(v))
+ return 1;
+
+ if (!is_guest_mode(v) && kvm_vcpu_apicv_active(v))
+ return 0;
+
+ return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
+}
+EXPORT_SYMBOL_GPL(kvm_cpu_has_injectable_intr);
+
+/*
+ * check if there is a pending interrupt, without
+ * intack.
+ */
+int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
+{
+ if (kvm_cpu_has_extint(v))
+ return 1;
+
+ return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
+}
+EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
+
+/*
+ * Read the pending interrupt vector (from a non-APIC
+ * source) and intack.
+ */
+static int kvm_cpu_get_extint(struct kvm_vcpu *v)
+{
+ if (!kvm_cpu_has_extint(v)) {
+ WARN_ON(!lapic_in_kernel(v));
+ return -1;
+ }
+
+ if (!lapic_in_kernel(v))
+ return v->arch.interrupt.nr;
+
+ if (kvm_xen_has_interrupt(v))
+ return v->kvm->arch.xen.upcall_vector;
+
+ if (irqchip_split(v->kvm)) {
+ int vector = v->arch.pending_external_vector;
+
+ v->arch.pending_external_vector = -1;
+ return vector;
+ } else
+ return kvm_pic_read_irq(v->kvm); /* PIC */
+}
+
+/*
+ * Read pending interrupt vector and intack.
+ */
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
+{
+ int vector = kvm_cpu_get_extint(v);
+ if (vector != -1)
+ return vector; /* PIC */
+
+ return kvm_get_apic_interrupt(v); /* APIC */
+}
+EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
+
+void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
+{
+ if (lapic_in_kernel(vcpu))
+ kvm_inject_apic_timer_irqs(vcpu);
+ if (kvm_xen_timer_enabled(vcpu))
+ kvm_xen_inject_timer_irqs(vcpu);
+}
+
+void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
+{
+ __kvm_migrate_apic_timer(vcpu);
+ __kvm_migrate_pit_timer(vcpu);
+ static_call_cond(kvm_x86_migrate_timers)(vcpu);
+}
+
+bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
+{
+ bool resample = args->flags & KVM_IRQFD_FLAG_RESAMPLE;
+
+ return resample ? irqchip_kernel(kvm) : irqchip_in_kernel(kvm);
+}
+
+bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm);
+}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
new file mode 100644
index 000000000..c2d7cfe82
--- /dev/null
+++ b/arch/x86/kvm/irq.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * irq.h: in kernel interrupt controller related definitions
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * Authors:
+ * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ */
+
+#ifndef __IRQ_H
+#define __IRQ_H
+
+#include <linux/mm_types.h>
+#include <linux/hrtimer.h>
+#include <linux/kvm_host.h>
+#include <linux/spinlock.h>
+
+#include <kvm/iodev.h>
+#include "lapic.h"
+
+#define PIC_NUM_PINS 16
+#define SELECT_PIC(irq) \
+ ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE)
+
+struct kvm;
+struct kvm_vcpu;
+
+struct kvm_kpic_state {
+ u8 last_irr; /* edge detection */
+ u8 irr; /* interrupt request register */
+ u8 imr; /* interrupt mask register */
+ u8 isr; /* interrupt service register */
+ u8 priority_add; /* highest irq priority */
+ u8 irq_base;
+ u8 read_reg_select;
+ u8 poll;
+ u8 special_mask;
+ u8 init_state;
+ u8 auto_eoi;
+ u8 rotate_on_auto_eoi;
+ u8 special_fully_nested_mode;
+ u8 init4; /* true if 4 byte init */
+ u8 elcr; /* PIIX edge/trigger selection */
+ u8 elcr_mask;
+ u8 isr_ack; /* interrupt ack detection */
+ struct kvm_pic *pics_state;
+};
+
+struct kvm_pic {
+ spinlock_t lock;
+ bool wakeup_needed;
+ unsigned pending_acks;
+ struct kvm *kvm;
+ struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
+ int output; /* intr from master PIC */
+ struct kvm_io_device dev_master;
+ struct kvm_io_device dev_slave;
+ struct kvm_io_device dev_elcr;
+ unsigned long irq_states[PIC_NUM_PINS];
+};
+
+int kvm_pic_init(struct kvm *kvm);
+void kvm_pic_destroy(struct kvm *kvm);
+int kvm_pic_read_irq(struct kvm *kvm);
+void kvm_pic_update_irq(struct kvm_pic *s);
+
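+/*
+ * irqchip_mode transitions away from KVM_IRQCHIP_NONE at most once; the
+ * smp_rmb() in the helpers below pairs with the smp_wmb() at that point
+ * so that a reader observing the mode also observes the irqchip state
+ * it implies.
+ */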
+static inline int irqchip_split(struct kvm *kvm)
+{
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode == KVM_IRQCHIP_SPLIT;
+}
+
+static inline int irqchip_kernel(struct kvm *kvm)
+{
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode == KVM_IRQCHIP_KERNEL;
+}
+
+static inline int pic_in_kernel(struct kvm *kvm)
+{
+ return irqchip_kernel(kvm);
+}
+
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode != KVM_IRQCHIP_NONE;
+}
+
+void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
+void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
+void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
+void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
+
+int apic_has_pending_timer(struct kvm_vcpu *vcpu);
+
+int kvm_setup_default_irq_routing(struct kvm *kvm);
+int kvm_setup_empty_irq_routing(struct kvm *kvm);
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq,
+ struct dest_map *dest_map);
+
+#endif
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
new file mode 100644
index 000000000..16d076a1b
--- /dev/null
+++ b/arch/x86/kvm/irq_comm.c
@@ -0,0 +1,442 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * irq_comm.c: Common API for in kernel interrupt controller
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * Authors:
+ * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/rculist.h>
+
+#include <trace/events/kvm.h>
+
+#include "irq.h"
+
+#include "ioapic.h"
+
+#include "lapic.h"
+
+#include "hyperv.h"
+#include "x86.h"
+#include "xen.h"
+
+static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
+}
+
+static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level,
+ line_status);
+}
+
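+/*
+ * Slow-path delivery: walk every vCPU and deliver to each matching
+ * destination. Lowest-priority interrupts go to a single vCPU, chosen
+ * either by APIC priority or, if vector hashing is enabled, by hashing
+ * the vector over the set of matching destinations.
+ */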
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, struct dest_map *dest_map)
+{
+ int r = -1;
+ struct kvm_vcpu *vcpu, *lowest = NULL;
+ unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+ unsigned int dest_vcpus = 0;
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
+ return r;
+
+ if (irq->dest_mode == APIC_DEST_PHYSICAL &&
+ irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) {
+ pr_info("apic: phys broadcast and lowest prio\n");
+ irq->delivery_mode = APIC_DM_FIXED;
+ }
+
+ memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (!kvm_lowest_prio_delivery(irq)) {
+ if (r < 0)
+ r = 0;
+ r += kvm_apic_set_irq(vcpu, irq, dest_map);
+ } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) {
+ if (!kvm_vector_hashing_enabled()) {
+ if (!lowest)
+ lowest = vcpu;
+ else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
+ lowest = vcpu;
+ } else {
+ __set_bit(i, dest_vcpu_bitmap);
+ dest_vcpus++;
+ }
+ }
+ }
+
+ if (dest_vcpus != 0) {
+ int idx = kvm_vector_to_index(irq->vector, dest_vcpus,
+ dest_vcpu_bitmap, KVM_MAX_VCPUS);
+
+ lowest = kvm_get_vcpu(kvm, idx);
+ }
+
+ if (lowest)
+ r = kvm_apic_set_irq(lowest, irq, dest_map);
+
+ return r;
+}
+
+void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
+ struct kvm_lapic_irq *irq)
+{
+ struct msi_msg msg = { .address_lo = e->msi.address_lo,
+ .address_hi = e->msi.address_hi,
+ .data = e->msi.data };
+
+ trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ?
+ (u64)msg.address_hi << 32 : 0), msg.data);
+
+ irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format);
+ irq->vector = msg.arch_data.vector;
+ irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical);
+ irq->trig_mode = msg.arch_data.is_level;
+ irq->delivery_mode = msg.arch_data.delivery_mode << 8;
+ irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint;
+ irq->level = 1;
+ irq->shorthand = APIC_DEST_NOSHORT;
+}
+EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
+
+static inline bool kvm_msi_route_invalid(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e)
+{
+ return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level, bool line_status)
+{
+ struct kvm_lapic_irq irq;
+
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
+ if (!level)
+ return -1;
+
+ kvm_set_msi_irq(kvm, e, &irq);
+
+ return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
+}
+
+
+static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ if (!level)
+ return -1;
+
+ return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint);
+}
+
+int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ struct kvm_lapic_irq irq;
+ int r;
+
+ switch (e->type) {
+ case KVM_IRQ_ROUTING_HV_SINT:
+ return kvm_hv_set_sint(e, kvm, irq_source_id, level,
+ line_status);
+
+ case KVM_IRQ_ROUTING_MSI:
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
+ kvm_set_msi_irq(kvm, e, &irq);
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
+ return r;
+ break;
+
+#ifdef CONFIG_KVM_XEN
+ case KVM_IRQ_ROUTING_XEN_EVTCHN:
+ if (!level)
+ return -1;
+
+ return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm);
+#endif
+ default:
+ break;
+ }
+
+ return -EWOULDBLOCK;
+}
+
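+/*
+ * IRQ source IDs let several independent sources (userspace, irqfd
+ * resamplers, ...) drive the same shared level-triggered line: a pin
+ * stays asserted while any source holds it high (see the per-pin
+ * irq_states bitmaps).
+ */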
+int kvm_request_irq_source_id(struct kvm *kvm)
+{
+ unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
+ int irq_source_id;
+
+ mutex_lock(&kvm->irq_lock);
+ irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
+
+ if (irq_source_id >= BITS_PER_LONG) {
+ pr_warn("exhausted allocatable IRQ sources!\n");
+ irq_source_id = -EFAULT;
+ goto unlock;
+ }
+
+ ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+ ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+ set_bit(irq_source_id, bitmap);
+unlock:
+ mutex_unlock(&kvm->irq_lock);
+
+ return irq_source_id;
+}
+
+void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
+{
+ ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+ ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+
+ mutex_lock(&kvm->irq_lock);
+ if (irq_source_id < 0 ||
+ irq_source_id >= BITS_PER_LONG) {
+ pr_err("IRQ source ID out of range!\n");
+ goto unlock;
+ }
+ clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
+ if (!irqchip_kernel(kvm))
+ goto unlock;
+
+ kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
+ kvm_pic_clear_all(kvm->arch.vpic, irq_source_id);
+unlock:
+ mutex_unlock(&kvm->irq_lock);
+}
+
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ mutex_lock(&kvm->irq_lock);
+ kimn->irq = irq;
+ hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list);
+ mutex_unlock(&kvm->irq_lock);
+}
+
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ mutex_lock(&kvm->irq_lock);
+ hlist_del_rcu(&kimn->link);
+ mutex_unlock(&kvm->irq_lock);
+ synchronize_srcu(&kvm->irq_srcu);
+}
+
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask)
+{
+ struct kvm_irq_mask_notifier *kimn;
+ int idx, gsi;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
+ if (gsi != -1)
+ hlist_for_each_entry_rcu(kimn, &kvm->arch.mask_notifier_list, link)
+ if (kimn->irq == gsi)
+ kimn->func(kimn, mask);
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm);
+}
+
+int kvm_set_routing_entry(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+{
+ /* We can't check irqchip_in_kernel() here as some callers are
+ * currently initializing the irqchip. Other callers should therefore
+ * check kvm_arch_can_set_irq_routing() before calling this function.
+ */
+ switch (ue->type) {
+ case KVM_IRQ_ROUTING_IRQCHIP:
+ if (irqchip_split(kvm))
+ return -EINVAL;
+ e->irqchip.pin = ue->u.irqchip.pin;
+ switch (ue->u.irqchip.irqchip) {
+ case KVM_IRQCHIP_PIC_SLAVE:
+ e->irqchip.pin += PIC_NUM_PINS / 2;
+ fallthrough;
+ case KVM_IRQCHIP_PIC_MASTER:
+ if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
+ return -EINVAL;
+ e->set = kvm_set_pic_irq;
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
+ return -EINVAL;
+ e->set = kvm_set_ioapic_irq;
+ break;
+ default:
+ return -EINVAL;
+ }
+ e->irqchip.irqchip = ue->u.irqchip.irqchip;
+ break;
+ case KVM_IRQ_ROUTING_MSI:
+ e->set = kvm_set_msi;
+ e->msi.address_lo = ue->u.msi.address_lo;
+ e->msi.address_hi = ue->u.msi.address_hi;
+ e->msi.data = ue->u.msi.data;
+
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+ break;
+ case KVM_IRQ_ROUTING_HV_SINT:
+ e->set = kvm_hv_set_sint;
+ e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
+ e->hv_sint.sint = ue->u.hv_sint.sint;
+ break;
+#ifdef CONFIG_KVM_XEN
+ case KVM_IRQ_ROUTING_XEN_EVTCHN:
+ return kvm_xen_setup_evtchn(kvm, e, ue);
+#endif
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu)
+{
+ int r = 0;
+ unsigned long i;
+ struct kvm_vcpu *vcpu;
+
+ if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
+ return true;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (++r == 2)
+ return false;
+
+ *dest_vcpu = vcpu;
+ }
+
+ return r == 1;
+}
+EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
+
+#define IOAPIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
+#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
+
+#define PIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
+#define ROUTING_ENTRY2(irq) \
+ IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
+
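+/*
+ * Default GSI routing: ISA IRQs 0-15 are routed to both the PIC (pin
+ * irq % 8 on the selected chip) and the IOAPIC, while GSIs 16-23 exist
+ * only on the IOAPIC.
+ */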
+static const struct kvm_irq_routing_entry default_routing[] = {
+ ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
+ ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
+ ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
+ ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
+ ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
+ ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
+ ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
+ ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
+ ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
+ ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
+ ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
+ ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
+};
+
+int kvm_setup_default_irq_routing(struct kvm *kvm)
+{
+ return kvm_set_irq_routing(kvm, default_routing,
+ ARRAY_SIZE(default_routing), 0);
+}
+
+static const struct kvm_irq_routing_entry empty_routing[] = {};
+
+int kvm_setup_empty_irq_routing(struct kvm *kvm)
+{
+ return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
+}
+
+void kvm_arch_post_irq_routing_update(struct kvm *kvm)
+{
+ if (!irqchip_split(kvm))
+ return;
+ kvm_make_scan_ioapic_request(kvm);
+}
+
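+/*
+ * Split-irqchip counterpart of kvm_ioapic_scan_entry(): with the IOAPIC
+ * in userspace, scan the MSI routes reserved for IOAPIC pins and collect
+ * the level-triggered vectors whose EOIs this vCPU must intercept.
+ */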
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_kernel_irq_routing_entry *entry;
+ struct kvm_irq_routing_table *table;
+ u32 i, nr_ioapic_pins;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
+ kvm->arch.nr_reserved_ioapic_pins);
+ for (i = 0; i < nr_ioapic_pins; ++i) {
+ hlist_for_each_entry(entry, &table->map[i], link) {
+ struct kvm_lapic_irq irq;
+
+ if (entry->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+
+ kvm_set_msi_irq(vcpu->kvm, entry, &irq);
+
+ if (irq.trig_mode &&
+ (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
+ irq.dest_id, irq.dest_mode) ||
+ kvm_apic_pending_eoi(vcpu, irq.vector)))
+ __set_bit(irq.vector, ioapic_handled_vectors);
+ }
+ }
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+void kvm_arch_irq_routing_update(struct kvm *kvm)
+{
+ kvm_hv_irq_routing_update(kvm);
+}
diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c
new file mode 100644
index 000000000..24a710d37
--- /dev/null
+++ b/arch/x86/kvm/kvm-asm-offsets.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed to extract
+ * and format the required data.
+ */
+#define COMPILE_OFFSETS
+
+#include <linux/kbuild.h>
+#include "vmx/vmx.h"
+#include "svm/svm.h"
+
+static void __used common(void)
+{
+ if (IS_ENABLED(CONFIG_KVM_AMD)) {
+ BLANK();
+ OFFSET(SVM_vcpu_arch_regs, vcpu_svm, vcpu.arch.regs);
+ OFFSET(SVM_current_vmcb, vcpu_svm, current_vmcb);
+ OFFSET(SVM_spec_ctrl, vcpu_svm, spec_ctrl);
+ OFFSET(SVM_vmcb01, vcpu_svm, vmcb01);
+ OFFSET(KVM_VMCB_pa, kvm_vmcb_info, pa);
+ OFFSET(SD_save_area_pa, svm_cpu_data, save_area_pa);
+ }
+
+ if (IS_ENABLED(CONFIG_KVM_INTEL)) {
+ BLANK();
+ OFFSET(VMX_spec_ctrl, vcpu_vmx, spec_ctrl);
+ }
+}
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
new file mode 100644
index 000000000..75eae9c49
--- /dev/null
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -0,0 +1,231 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_KVM_CACHE_REGS_H
+#define ASM_KVM_CACHE_REGS_H
+
+#include <linux/kvm_host.h>
+
+#define KVM_POSSIBLE_CR0_GUEST_BITS (X86_CR0_TS | X86_CR0_WP)
+#define KVM_POSSIBLE_CR4_GUEST_BITS \
+ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE)
+
+#define X86_CR0_PDPTR_BITS (X86_CR0_CD | X86_CR0_NW | X86_CR0_PG)
+#define X86_CR4_TLBFLUSH_BITS (X86_CR4_PGE | X86_CR4_PCIDE | X86_CR4_PAE | X86_CR4_SMEP)
+#define X86_CR4_PDPTR_BITS (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_SMEP)
+
+static_assert(!(KVM_POSSIBLE_CR0_GUEST_BITS & X86_CR0_PDPTR_BITS));
+
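+/*
+ * Generate trivial kvm_<reg>_read()/kvm_<reg>_write() accessors for the
+ * general purpose registers, e.g. kvm_rax_read(vcpu).
+ */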
+#define BUILD_KVM_GPR_ACCESSORS(lname, uname) \
+static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\
+{ \
+ return vcpu->arch.regs[VCPU_REGS_##uname]; \
+} \
+static __always_inline void kvm_##lname##_write(struct kvm_vcpu *vcpu, \
+ unsigned long val) \
+{ \
+ vcpu->arch.regs[VCPU_REGS_##uname] = val; \
+}
+BUILD_KVM_GPR_ACCESSORS(rax, RAX)
+BUILD_KVM_GPR_ACCESSORS(rbx, RBX)
+BUILD_KVM_GPR_ACCESSORS(rcx, RCX)
+BUILD_KVM_GPR_ACCESSORS(rdx, RDX)
+BUILD_KVM_GPR_ACCESSORS(rbp, RBP)
+BUILD_KVM_GPR_ACCESSORS(rsi, RSI)
+BUILD_KVM_GPR_ACCESSORS(rdi, RDI)
+#ifdef CONFIG_X86_64
+BUILD_KVM_GPR_ACCESSORS(r8, R8)
+BUILD_KVM_GPR_ACCESSORS(r9, R9)
+BUILD_KVM_GPR_ACCESSORS(r10, R10)
+BUILD_KVM_GPR_ACCESSORS(r11, R11)
+BUILD_KVM_GPR_ACCESSORS(r12, R12)
+BUILD_KVM_GPR_ACCESSORS(r13, R13)
+BUILD_KVM_GPR_ACCESSORS(r14, R14)
+BUILD_KVM_GPR_ACCESSORS(r15, R15)
+#endif
+
+/*
+ * avail dirty
+ * 0 0 register in VMCS/VMCB
+ * 0 1 *INVALID*
+ * 1 0 register in vcpu->arch
+ * 1 1 register in vcpu->arch, needs to be stored back
+ */
+static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+}
+
+static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+}
+
+/*
+ * kvm_register_test_and_mark_available() is a special snowflake that uses an
+ * arch bitop directly to avoid the explicit instrumentation that comes with
+ * the generic bitops. This allows code that cannot be instrumented (noinstr
+ * functions), e.g. the low level VM-Enter/VM-Exit paths, to cache registers.
+ */
+static __always_inline bool kvm_register_test_and_mark_available(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ return arch___test_and_set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+/*
+ * The "raw" register helpers are only for cases where the full 64 bits of a
+ * register are read/written irrespective of current vCPU mode. In other words,
+ * odds are good you shouldn't be using the raw variants.
+ */
+static inline unsigned long kvm_register_read_raw(struct kvm_vcpu *vcpu, int reg)
+{
+ if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
+ return 0;
+
+ if (!kvm_register_is_available(vcpu, reg))
+ static_call(kvm_x86_cache_reg)(vcpu, reg);
+
+ return vcpu->arch.regs[reg];
+}
+
+static inline void kvm_register_write_raw(struct kvm_vcpu *vcpu, int reg,
+ unsigned long val)
+{
+ if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
+ return;
+
+ vcpu->arch.regs[reg] = val;
+ kvm_register_mark_dirty(vcpu, reg);
+}
+
+static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
+{
+ return kvm_register_read_raw(vcpu, VCPU_REGS_RIP);
+}
+
+static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ kvm_register_write_raw(vcpu, VCPU_REGS_RIP, val);
+}
+
+static inline unsigned long kvm_rsp_read(struct kvm_vcpu *vcpu)
+{
+ return kvm_register_read_raw(vcpu, VCPU_REGS_RSP);
+}
+
+static inline void kvm_rsp_write(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ kvm_register_write_raw(vcpu, VCPU_REGS_RSP, val);
+}
+
+static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
+{
+ might_sleep(); /* on svm */
+
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
+ static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_PDPTR);
+
+ return vcpu->arch.walk_mmu->pdptrs[index];
+}
+
+static inline void kvm_pdptr_write(struct kvm_vcpu *vcpu, int index, u64 value)
+{
+ vcpu->arch.walk_mmu->pdptrs[index] = value;
+}
+
+static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
+{
+ ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
+ if ((tmask & vcpu->arch.cr0_guest_owned_bits) &&
+ !kvm_register_is_available(vcpu, VCPU_EXREG_CR0))
+ static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR0);
+ return vcpu->arch.cr0 & mask;
+}
+
+static __always_inline bool kvm_is_cr0_bit_set(struct kvm_vcpu *vcpu,
+ unsigned long cr0_bit)
+{
+ BUILD_BUG_ON(!is_power_of_2(cr0_bit));
+
+ return !!kvm_read_cr0_bits(vcpu, cr0_bit);
+}
+
+static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, ~0UL);
+}
+
+static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
+{
+ ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
+ if ((tmask & vcpu->arch.cr4_guest_owned_bits) &&
+ !kvm_register_is_available(vcpu, VCPU_EXREG_CR4))
+ static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR4);
+ return vcpu->arch.cr4 & mask;
+}
+
+static __always_inline bool kvm_is_cr4_bit_set(struct kvm_vcpu *vcpu,
+ unsigned long cr4_bit)
+{
+ BUILD_BUG_ON(!is_power_of_2(cr4_bit));
+
+ return !!kvm_read_cr4_bits(vcpu, cr4_bit);
+}
+
+static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
+ static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR3);
+ return vcpu->arch.cr3;
+}
+
+static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, ~0UL);
+}
+
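+/*
+ * Assemble the 64-bit EDX:EAX pair, e.g. for WRMSR emulation.
+ */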
+static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
+{
+ return (kvm_rax_read(vcpu) & -1u)
+ | ((u64)(kvm_rdx_read(vcpu) & -1u) << 32);
+}
+
+static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.hflags |= HF_GUEST_MASK;
+ vcpu->stat.guest_mode = 1;
+}
+
+static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.hflags &= ~HF_GUEST_MASK;
+
+ if (vcpu->arch.load_eoi_exitmap_pending) {
+ vcpu->arch.load_eoi_exitmap_pending = false;
+ kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
+ }
+
+ vcpu->stat.guest_mode = 0;
+}
+
+static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hflags & HF_GUEST_MASK;
+}
+
+#endif
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
new file mode 100644
index 000000000..be7aeb9b8
--- /dev/null
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -0,0 +1,541 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/******************************************************************************
+ * x86_emulate.h
+ *
+ * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
+ *
+ * Copyright (c) 2005 Keir Fraser
+ *
+ * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
+ */
+
+#ifndef _ASM_X86_KVM_X86_EMULATE_H
+#define _ASM_X86_KVM_X86_EMULATE_H
+
+#include <asm/desc_defs.h>
+#include "fpu.h"
+
+struct x86_emulate_ctxt;
+enum x86_intercept;
+enum x86_intercept_stage;
+
+struct x86_exception {
+ u8 vector;
+ bool error_code_valid;
+ u16 error_code;
+ bool nested_page_fault;
+ u64 address; /* cr2 or nested page fault gpa */
+ u8 async_page_fault;
+};
+
+/*
+ * This struct carries enough information from the instruction decoder
+ * to the main KVM code for it to decide whether the instruction needs
+ * to be intercepted.
+ */
+struct x86_instruction_info {
+ u8 intercept; /* which intercept */
+ u8 rep_prefix; /* rep prefix? */
+ u8 modrm_mod; /* mod part of modrm */
+ u8 modrm_reg; /* index of register used */
+ u8 modrm_rm; /* rm part of modrm */
+ u64 src_val; /* value of source operand */
+ u64 dst_val; /* value of destination operand */
+ u8 src_bytes; /* size of source operand */
+ u8 dst_bytes; /* size of destination operand */
+ u8 ad_bytes; /* size of src/dst address */
+ u64 next_rip; /* rip following the instruction */
+};
+
+/*
+ * x86_emulate_ops:
+ *
+ * These operations represent the instruction emulator's interface to memory.
+ * There are two categories of operation: those that act on ordinary memory
+ * regions (*_std), and those that act on memory regions known to require
+ * special treatment or emulation (*_emulated).
+ *
+ * The emulator assumes that an instruction accesses only one 'emulated memory'
+ * location, that this location is the given linear faulting address (cr2), and
+ * that this is one of the instruction's data operands. Instruction fetches and
+ * stack operations are assumed never to access emulated memory. The emulator
+ * automatically deduces which operand of a string-move operation is accessing
+ * emulated memory, and assumes that the other operand accesses normal memory.
+ *
+ * NOTES:
+ * 1. The emulator isn't very smart about emulated vs. standard memory.
+ * 'Emulated memory' access addresses should be checked for sanity.
+ * 'Normal memory' accesses may fault, and the caller must arrange to
+ * detect and handle reentrancy into the emulator via recursive faults.
+ * Accesses may be unaligned and may cross page boundaries.
+ * 2. If the access fails (cannot emulate, or a standard access faults) then
+ * it is up to the memop to propagate the fault to the guest VM via
+ * some out-of-band mechanism, unknown to the emulator. The memop signals
+ * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will
+ * then immediately bail.
+ * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only
+ * cmpxchg8b_emulated need support 8-byte accesses.
+ * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
+ */
+/* Access completed successfully: continue emulation as normal. */
+#define X86EMUL_CONTINUE 0
+/* Access is unhandleable: bail from emulation and return error to caller. */
+#define X86EMUL_UNHANDLEABLE 1
+/* Terminate emulation but return success to the caller. */
+#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
+#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */
+#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */
+#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */
+#define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */
+
+struct x86_emulate_ops {
+ void (*vm_bugged)(struct x86_emulate_ctxt *ctxt);
+ /*
+ * read_gpr: read a general purpose register (rax - r15)
+ *
+ * @reg: gpr number.
+ */
+ ulong (*read_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg);
+ /*
+ * write_gpr: write a general purpose register (rax - r15)
+ *
+ * @reg: gpr number.
+ * @val: value to write.
+ */
+ void (*write_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val);
+ /*
+ * read_std: Read bytes of standard (non-emulated/special) memory.
+ * Used for descriptor reading.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ * @system:[IN ] Whether the access is forced to be at CPL0.
+ */
+ int (*read_std)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val,
+ unsigned int bytes,
+ struct x86_exception *fault, bool system);
+
+ /*
+ * write_std: Write bytes of standard (non-emulated/special) memory.
+ * Used for descriptor writing.
+ * @addr: [IN ] Linear address to which to write.
+	 * @val:   [IN ] Value to write to memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to write to memory.
+ * @system:[IN ] Whether the access is forced to be at CPL0.
+ */
+ int (*write_std)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val, unsigned int bytes,
+ struct x86_exception *fault, bool system);
+ /*
+ * fetch: Read bytes of standard (non-emulated/special) memory.
+ * Used for instruction fetch.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*fetch)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val, unsigned int bytes,
+ struct x86_exception *fault);
+
+ /*
+ * read_emulated: Read bytes from emulated/special memory area.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*read_emulated)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val, unsigned int bytes,
+ struct x86_exception *fault);
+
+ /*
+ * write_emulated: Write bytes to emulated/special memory area.
+ * @addr: [IN ] Linear address to which to write.
+ * @val: [IN ] Value to write to memory (low-order bytes used as
+ * required).
+ * @bytes: [IN ] Number of bytes to write to memory.
+ */
+ int (*write_emulated)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, const void *val,
+ unsigned int bytes,
+ struct x86_exception *fault);
+
+ /*
+ * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
+ * emulated/special memory area.
+ * @addr: [IN ] Linear address to access.
+ * @old: [IN ] Value expected to be current at @addr.
+ * @new: [IN ] Value to write to @addr.
+ * @bytes: [IN ] Number of bytes to access using CMPXCHG.
+ */
+ int (*cmpxchg_emulated)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
+ const void *old,
+ const void *new,
+ unsigned int bytes,
+ struct x86_exception *fault);
+ void (*invlpg)(struct x86_emulate_ctxt *ctxt, ulong addr);
+
+ int (*pio_in_emulated)(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port, void *val,
+ unsigned int count);
+
+ int (*pio_out_emulated)(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port, const void *val,
+ unsigned int count);
+
+ bool (*get_segment)(struct x86_emulate_ctxt *ctxt, u16 *selector,
+ struct desc_struct *desc, u32 *base3, int seg);
+ void (*set_segment)(struct x86_emulate_ctxt *ctxt, u16 selector,
+ struct desc_struct *desc, u32 base3, int seg);
+ unsigned long (*get_cached_segment_base)(struct x86_emulate_ctxt *ctxt,
+ int seg);
+ void (*get_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ void (*get_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ void (*set_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
+ int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
+ int (*cpl)(struct x86_emulate_ctxt *ctxt);
+ void (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
+ int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
+ int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
+ int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
+ int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
+ int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc);
+ int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata);
+ void (*halt)(struct x86_emulate_ctxt *ctxt);
+ void (*wbinvd)(struct x86_emulate_ctxt *ctxt);
+ int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt);
+ int (*intercept)(struct x86_emulate_ctxt *ctxt,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage);
+
+ bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx, bool exact_only);
+ bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt);
+ bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt);
+ bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt);
+
+ void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
+
+ bool (*is_smm)(struct x86_emulate_ctxt *ctxt);
+ bool (*is_guest_mode)(struct x86_emulate_ctxt *ctxt);
+ int (*leave_smm)(struct x86_emulate_ctxt *ctxt);
+ void (*triple_fault)(struct x86_emulate_ctxt *ctxt);
+ int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr);
+};
+
+/* Type, address-of, and value of an instruction's operand. */
+struct operand {
+ enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type;
+ unsigned int bytes;
+ unsigned int count;
+ union {
+ unsigned long orig_val;
+ u64 orig_val64;
+ };
+ union {
+ unsigned long *reg;
+ struct segmented_address {
+ ulong ea;
+ unsigned seg;
+ } mem;
+ unsigned xmm;
+ unsigned mm;
+ } addr;
+ union {
+ unsigned long val;
+ u64 val64;
+ char valptr[sizeof(sse128_t)];
+ sse128_t vec_val;
+ u64 mm_val;
+ void *data;
+ };
+};
+
+struct fetch_cache {
+ u8 data[15];
+ u8 *ptr;
+ u8 *end;
+};
+
+struct read_cache {
+ u8 data[1024];
+ unsigned long pos;
+ unsigned long end;
+};
+
+/* Execution mode, passed to the emulator. */
+enum x86emul_mode {
+ X86EMUL_MODE_REAL, /* Real mode. */
+ X86EMUL_MODE_VM86, /* Virtual 8086 mode. */
+ X86EMUL_MODE_PROT16, /* 16-bit protected mode. */
+ X86EMUL_MODE_PROT32, /* 32-bit protected mode. */
+ X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */
+};
+
+/*
+ * fastop functions are declared as taking a never-defined fastop parameter,
+ * so they can't be called from C directly.
+ */
+struct fastop;
+
+typedef void (*fastop_t)(struct fastop *);
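+
+/*
+ * For reference: fastop routines are invoked from inline assembly in
+ * emulate.c, which loads the operands and the guest's RFLAGS into fixed
+ * registers before calling the thunk; the never-defined parameter type
+ * merely prevents accidental direct calls from C.
+ */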
+
+/*
+ * The emulator's _regs array tracks only the GPRs, i.e. excludes RIP. RIP is
+ * tracked/accessed via _eip, and except for RIP relative addressing, which
+ * also uses _eip, RIP cannot be a register operand nor can it be an operand in
+ * a ModRM or SIB byte.
+ */
+#ifdef CONFIG_X86_64
+#define NR_EMULATOR_GPRS 16
+#else
+#define NR_EMULATOR_GPRS 8
+#endif
+
+struct x86_emulate_ctxt {
+ void *vcpu;
+ const struct x86_emulate_ops *ops;
+
+ /* Register state before/after emulation. */
+ unsigned long eflags;
+ unsigned long eip; /* eip before instruction emulation */
+ /* Emulated execution mode, represented by an X86EMUL_MODE value. */
+ enum x86emul_mode mode;
+
+ /* interruptibility state, as a result of execution of STI or MOV SS */
+ int interruptibility;
+
+ bool perm_ok; /* do not check permissions if true */
+ bool tf; /* TF value before instruction (after for syscall/sysret) */
+
+ bool have_exception;
+ struct x86_exception exception;
+
+ /* GPA available */
+ bool gpa_available;
+ gpa_t gpa_val;
+
+ /*
+ * decode cache
+ */
+
+ /* current opcode length in bytes */
+ u8 opcode_len;
+ u8 b;
+ u8 intercept;
+ u8 op_bytes;
+ u8 ad_bytes;
+ union {
+ int (*execute)(struct x86_emulate_ctxt *ctxt);
+ fastop_t fop;
+ };
+ int (*check_perm)(struct x86_emulate_ctxt *ctxt);
+
+ bool rip_relative;
+ u8 rex_prefix;
+ u8 lock_prefix;
+ u8 rep_prefix;
+ /* bitmaps of registers in _regs[] that can be read */
+ u16 regs_valid;
+ /* bitmaps of registers in _regs[] that have been written */
+ u16 regs_dirty;
+ /* modrm */
+ u8 modrm;
+ u8 modrm_mod;
+ u8 modrm_reg;
+ u8 modrm_rm;
+ u8 modrm_seg;
+ u8 seg_override;
+ u64 d;
+ unsigned long _eip;
+
+ /* Here begins the usercopy section. */
+ struct operand src;
+ struct operand src2;
+ struct operand dst;
+ struct operand memop;
+ unsigned long _regs[NR_EMULATOR_GPRS];
+ struct operand *memopp;
+ struct fetch_cache fetch;
+ struct read_cache io_read;
+ struct read_cache mem_read;
+ bool is_branch;
+};
+
+#define KVM_EMULATOR_BUG_ON(cond, ctxt) \
+({ \
+ int __ret = (cond); \
+ \
+ if (WARN_ON_ONCE(__ret)) \
+ ctxt->ops->vm_bugged(ctxt); \
+ unlikely(__ret); \
+})
+
+/* Repeat String Operation Prefix */
+#define REPE_PREFIX 0xf3
+#define REPNE_PREFIX 0xf2
+
+/* CPUID vendors */
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65
+
+#define X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx 0x69444d41
+#define X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx 0x21726574
+#define X86EMUL_CPUID_VENDOR_AMDisbetterI_edx 0x74656273
+
+#define X86EMUL_CPUID_VENDOR_HygonGenuine_ebx 0x6f677948
+#define X86EMUL_CPUID_VENDOR_HygonGenuine_ecx 0x656e6975
+#define X86EMUL_CPUID_VENDOR_HygonGenuine_edx 0x6e65476e
+
+#define X86EMUL_CPUID_VENDOR_GenuineIntel_ebx 0x756e6547
+#define X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 0x6c65746e
+#define X86EMUL_CPUID_VENDOR_GenuineIntel_edx 0x49656e69
+
+#define X86EMUL_CPUID_VENDOR_CentaurHauls_ebx 0x746e6543
+#define X86EMUL_CPUID_VENDOR_CentaurHauls_ecx 0x736c7561
+#define X86EMUL_CPUID_VENDOR_CentaurHauls_edx 0x48727561
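+
+/*
+ * For reference: these constants are simply the 12-byte CPUID vendor strings
+ * split across EBX/EDX/ECX as little-endian ASCII.  For "GenuineIntel":
+ *
+ *	ebx = "Genu" = 0x756e6547
+ *	edx = "ineI" = 0x49656e69
+ *	ecx = "ntel" = 0x6c65746e
+ */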
+
+static inline bool is_guest_vendor_intel(u32 ebx, u32 ecx, u32 edx)
+{
+ return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
+}
+
+static inline bool is_guest_vendor_amd(u32 ebx, u32 ecx, u32 edx)
+{
+ return (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) ||
+ (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx);
+}
+
+static inline bool is_guest_vendor_hygon(u32 ebx, u32 ecx, u32 edx)
+{
+ return ebx == X86EMUL_CPUID_VENDOR_HygonGenuine_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_HygonGenuine_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_HygonGenuine_edx;
+}
+
+enum x86_intercept_stage {
+ X86_ICTP_NONE = 0, /* Allow zero-init to not match anything */
+ X86_ICPT_PRE_EXCEPT,
+ X86_ICPT_POST_EXCEPT,
+ X86_ICPT_POST_MEMACCESS,
+};
+
+enum x86_intercept {
+ x86_intercept_none,
+ x86_intercept_cr_read,
+ x86_intercept_cr_write,
+ x86_intercept_clts,
+ x86_intercept_lmsw,
+ x86_intercept_smsw,
+ x86_intercept_dr_read,
+ x86_intercept_dr_write,
+ x86_intercept_lidt,
+ x86_intercept_sidt,
+ x86_intercept_lgdt,
+ x86_intercept_sgdt,
+ x86_intercept_lldt,
+ x86_intercept_sldt,
+ x86_intercept_ltr,
+ x86_intercept_str,
+ x86_intercept_rdtsc,
+ x86_intercept_rdpmc,
+ x86_intercept_pushf,
+ x86_intercept_popf,
+ x86_intercept_cpuid,
+ x86_intercept_rsm,
+ x86_intercept_iret,
+ x86_intercept_intn,
+ x86_intercept_invd,
+ x86_intercept_pause,
+ x86_intercept_hlt,
+ x86_intercept_invlpg,
+ x86_intercept_invlpga,
+ x86_intercept_vmrun,
+ x86_intercept_vmload,
+ x86_intercept_vmsave,
+ x86_intercept_vmmcall,
+ x86_intercept_stgi,
+ x86_intercept_clgi,
+ x86_intercept_skinit,
+ x86_intercept_rdtscp,
+ x86_intercept_rdpid,
+ x86_intercept_icebp,
+ x86_intercept_wbinvd,
+ x86_intercept_monitor,
+ x86_intercept_mwait,
+ x86_intercept_rdmsr,
+ x86_intercept_wrmsr,
+ x86_intercept_in,
+ x86_intercept_ins,
+ x86_intercept_out,
+ x86_intercept_outs,
+ x86_intercept_xsetbv,
+
+ nr_x86_intercepts
+};
+
+/* Host execution mode. */
+#if defined(CONFIG_X86_32)
+#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
+#elif defined(CONFIG_X86_64)
+#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
+#endif
+
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type);
+bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt);
+#define EMULATION_FAILED -1
+#define EMULATION_OK 0
+#define EMULATION_RESTART 1
+#define EMULATION_INTERCEPTED 2
+void init_decode_cache(struct x86_emulate_ctxt *ctxt);
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
+int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
+ u16 tss_selector, int idt_index, int reason,
+ bool has_error_code, u32 error_code);
+int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
+void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt);
+void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt);
+bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt);
+
+static inline ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
+ nr &= NR_EMULATOR_GPRS - 1;
+
+ if (!(ctxt->regs_valid & (1 << nr))) {
+ ctxt->regs_valid |= 1 << nr;
+ ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr);
+ }
+ return ctxt->_regs[nr];
+}
+
+static inline ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
+ nr &= NR_EMULATOR_GPRS - 1;
+
+ BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
+ BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
+
+ ctxt->regs_valid |= 1 << nr;
+ ctxt->regs_dirty |= 1 << nr;
+ return &ctxt->_regs[nr];
+}
+
+static inline ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ reg_read(ctxt, nr);
+ return reg_write(ctxt, nr);
+}
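+
+/*
+ * For illustration only (a sketch, not upstream code): reads go through
+ * reg_read() so a GPR is lazily pulled from the vCPU at most once, while
+ * writes go through reg_write()/reg_rmw() so the dirty bitmap marks the
+ * GPR for writeback.  Emulating "inc %rax" thus boils down to:
+ *
+ *	*reg_rmw(ctxt, VCPU_REGS_RAX) += 1;
+ *
+ * after which the writeback logic pushes _regs[VCPU_REGS_RAX] back to the
+ * vCPU via ->write_gpr().
+ */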
+
+#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c
new file mode 100644
index 000000000..ded0bd688
--- /dev/null
+++ b/arch/x86/kvm/kvm_onhyperv.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM L1 hypervisor optimizations on Hyper-V.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include <asm/mshyperv.h>
+
+#include "hyperv.h"
+#include "kvm_onhyperv.h"
+
+struct kvm_hv_tlb_range {
+ u64 start_gfn;
+ u64 pages;
+};
+
+static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
+ void *data)
+{
+ struct kvm_hv_tlb_range *range = data;
+
+ return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
+ range->pages);
+}
+
+static inline int hv_remote_flush_root_tdp(hpa_t root_tdp,
+ struct kvm_hv_tlb_range *range)
+{
+ if (range)
+ return hyperv_flush_guest_mapping_range(root_tdp,
+ kvm_fill_hv_flush_list_func, (void *)range);
+ else
+ return hyperv_flush_guest_mapping(root_tdp);
+}
+
+static int __hv_flush_remote_tlbs_range(struct kvm *kvm,
+ struct kvm_hv_tlb_range *range)
+{
+ struct kvm_arch *kvm_arch = &kvm->arch;
+ struct kvm_vcpu *vcpu;
+ int ret = 0, nr_unique_valid_roots;
+ unsigned long i;
+ hpa_t root;
+
+ spin_lock(&kvm_arch->hv_root_tdp_lock);
+
+ if (!VALID_PAGE(kvm_arch->hv_root_tdp)) {
+ nr_unique_valid_roots = 0;
+
+ /*
+ * Flush all valid roots, and see if all vCPUs have converged
+ * on a common root, in which case future flushes can skip the
+ * loop and flush the common root.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ root = vcpu->arch.hv_root_tdp;
+ if (!VALID_PAGE(root) || root == kvm_arch->hv_root_tdp)
+ continue;
+
+ /*
+ * Set the tracked root to the first valid root. Keep
+ * this root for the entirety of the loop even if more
+ * roots are encountered as a low effort optimization
+ * to avoid flushing the same (first) root again.
+ */
+ if (++nr_unique_valid_roots == 1)
+ kvm_arch->hv_root_tdp = root;
+
+ if (!ret)
+ ret = hv_remote_flush_root_tdp(root, range);
+
+ /*
+ * Stop processing roots if a failure occurred and
+ * multiple valid roots have already been detected.
+ */
+ if (ret && nr_unique_valid_roots > 1)
+ break;
+ }
+
+ /*
+ * The optimized flush of a single root can't be used if there
+ * are multiple valid roots (obviously).
+ */
+ if (nr_unique_valid_roots > 1)
+ kvm_arch->hv_root_tdp = INVALID_PAGE;
+ } else {
+ ret = hv_remote_flush_root_tdp(kvm_arch->hv_root_tdp, range);
+ }
+
+ spin_unlock(&kvm_arch->hv_root_tdp_lock);
+ return ret;
+}
+
+int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn, gfn_t nr_pages)
+{
+ struct kvm_hv_tlb_range range = {
+ .start_gfn = start_gfn,
+ .pages = nr_pages,
+ };
+
+ return __hv_flush_remote_tlbs_range(kvm, &range);
+}
+EXPORT_SYMBOL_GPL(hv_flush_remote_tlbs_range);
+
+int hv_flush_remote_tlbs(struct kvm *kvm)
+{
+ return __hv_flush_remote_tlbs_range(kvm, NULL);
+}
+EXPORT_SYMBOL_GPL(hv_flush_remote_tlbs);
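+
+/*
+ * For reference: the two exports above are meant to be wired into
+ * kvm_x86_ops.flush_remote_tlbs{,_range} by the VMX/SVM modules when
+ * running on Hyper-V, which is what the function-pointer comparison in
+ * hv_track_root_tdp() below relies on.
+ */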
+
+void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
+{
+ struct kvm_arch *kvm_arch = &vcpu->kvm->arch;
+
+ if (kvm_x86_ops.flush_remote_tlbs == hv_flush_remote_tlbs) {
+ spin_lock(&kvm_arch->hv_root_tdp_lock);
+ vcpu->arch.hv_root_tdp = root_tdp;
+ if (root_tdp != kvm_arch->hv_root_tdp)
+ kvm_arch->hv_root_tdp = INVALID_PAGE;
+ spin_unlock(&kvm_arch->hv_root_tdp_lock);
+ }
+}
+EXPORT_SYMBOL_GPL(hv_track_root_tdp);
diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h
new file mode 100644
index 000000000..f9ca3e743
--- /dev/null
+++ b/arch/x86/kvm/kvm_onhyperv.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * KVM L1 hypervisor optimizations on Hyper-V.
+ */
+
+#ifndef __ARCH_X86_KVM_KVM_ONHYPERV_H__
+#define __ARCH_X86_KVM_KVM_ONHYPERV_H__
+
+#if IS_ENABLED(CONFIG_HYPERV)
+int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, gfn_t nr_pages);
+int hv_flush_remote_tlbs(struct kvm *kvm);
+void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp);
+#else /* !CONFIG_HYPERV */
+static inline int hv_flush_remote_tlbs(struct kvm *kvm)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
+{
+}
+#endif /* !CONFIG_HYPERV */
+
+#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
new file mode 100644
index 000000000..245b20973
--- /dev/null
+++ b/arch/x86/kvm/lapic.c
@@ -0,0 +1,3316 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Local APIC virtualization
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2007 Novell
+ * Copyright (C) 2007 Intel
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Dor Laor <dor.laor@qumranet.com>
+ * Gregory Haskins <ghaskins@novell.com>
+ * Yaozu (Eddie) Dong <eddie.dong@intel.com>
+ *
+ * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+#include <linux/hrtimer.h>
+#include <linux/io.h>
+#include <linux/export.h>
+#include <linux/math64.h>
+#include <linux/slab.h>
+#include <asm/processor.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/apicdef.h>
+#include <asm/delay.h>
+#include <linux/atomic.h>
+#include <linux/jump_label.h>
+#include "kvm_cache_regs.h"
+#include "irq.h"
+#include "ioapic.h"
+#include "trace.h"
+#include "x86.h"
+#include "cpuid.h"
+#include "hyperv.h"
+#include "smm.h"
+
+#ifndef CONFIG_X86_64
+#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
+#else
+#define mod_64(x, y) ((x) % (y))
+#endif
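+
+/*
+ * For reference: on 32-bit builds a native 64-bit '%' would expand to a
+ * libgcc helper the kernel does not provide, so the remainder is derived
+ * from div64_u64() instead:  x mod y == x - y * (x / y).
+ */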
+
+/* 0x14 is the APIC version for Xeon and Pentium; see Intel SDM section 8.4.8. */
+#define APIC_VERSION 0x14UL
+#define LAPIC_MMIO_LENGTH (1 << 12)
+/* The following defines are not in apicdef.h. */
+#define MAX_APIC_VECTOR 256
+#define APIC_VECTORS_PER_REG 32
+
+static bool lapic_timer_advance_dynamic __read_mostly;
+#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */
+#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */
+#define LAPIC_TIMER_ADVANCE_NS_INIT 1000
+#define LAPIC_TIMER_ADVANCE_NS_MAX 5000
+/* step-by-step approximation to mitigate fluctuation */
+#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
+static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data);
+static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data);
+
+static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val)
+{
+ *((u32 *) (regs + reg_off)) = val;
+}
+
+static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
+{
+ __kvm_lapic_set_reg(apic->regs, reg_off, val);
+}
+
+static __always_inline u64 __kvm_lapic_get_reg64(char *regs, int reg)
+{
+ BUILD_BUG_ON(reg != APIC_ICR);
+ return *((u64 *) (regs + reg));
+}
+
+static __always_inline u64 kvm_lapic_get_reg64(struct kvm_lapic *apic, int reg)
+{
+ return __kvm_lapic_get_reg64(apic->regs, reg);
+}
+
+static __always_inline void __kvm_lapic_set_reg64(char *regs, int reg, u64 val)
+{
+ BUILD_BUG_ON(reg != APIC_ICR);
+ *((u64 *) (regs + reg)) = val;
+}
+
+static __always_inline void kvm_lapic_set_reg64(struct kvm_lapic *apic,
+ int reg, u64 val)
+{
+ __kvm_lapic_set_reg64(apic->regs, reg, val);
+}
+
+static inline int apic_test_vector(int vec, void *bitmap)
+{
+ return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
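+
+/*
+ * For reference: IRR/ISR/TMR are banks of eight 32-bit registers spaced 16
+ * bytes apart, so (per lapic.h) VEC_POS(v) = v & 31 selects the bit and
+ * REG_POS(v) = (v >> 5) << 4 selects the byte offset of the register that
+ * holds vector v; e.g. vector 0x31 is bit 17 of the register at +0x10.
+ */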
+
+bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ return apic_test_vector(vector, apic->regs + APIC_ISR) ||
+ apic_test_vector(vector, apic->regs + APIC_IRR);
+}
+
+static inline int __apic_test_and_set_vector(int vec, void *bitmap)
+{
+ return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
+{
+ return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_hw_disabled, HZ);
+__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_sw_disabled, HZ);
+
+static inline int apic_enabled(struct kvm_lapic *apic)
+{
+ return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic);
+}
+
+#define LVT_MASK \
+ (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
+
+#define LINT_MASK \
+ (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
+ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
+
+static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
+{
+ return apic->vcpu->vcpu_id;
+}
+
+static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
+{
+ return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
+ (kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm));
+}
+
+bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
+{
+ return kvm_x86_ops.set_hv_timer
+ && !(kvm_mwait_in_guest(vcpu->kvm) ||
+ kvm_can_post_timer_interrupt(vcpu));
+}
+
+static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu)
+{
+ return kvm_can_post_timer_interrupt(vcpu) && vcpu->mode == IN_GUEST_MODE;
+}
+
+static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
+{
+ return ((id >> 4) << 16) | (1 << (id & 0xf));
+}
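+
+/*
+ * For reference: the x2APIC LDR packs the cluster (ID >> 4) into bits 31:16
+ * and a one-hot logical ID (1 << (ID & 0xf)) into bits 15:0.  E.g. x2APIC
+ * ID 0x23 gives cluster 2, member bit 3, i.e. LDR = 0x00020008.
+ */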
+
+static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
+						 u32 dest_id, struct kvm_lapic ***cluster, u16 *mask)
+{
+ switch (map->logical_mode) {
+ case KVM_APIC_MODE_SW_DISABLED:
+ /* Arbitrarily use the flat map so that @cluster isn't NULL. */
+ *cluster = map->xapic_flat_map;
+ *mask = 0;
+ return true;
+ case KVM_APIC_MODE_X2APIC: {
+ u32 offset = (dest_id >> 16) * 16;
+ u32 max_apic_id = map->max_apic_id;
+
+ if (offset <= max_apic_id) {
+ u8 cluster_size = min(max_apic_id - offset + 1, 16U);
+
+ offset = array_index_nospec(offset, map->max_apic_id + 1);
+ *cluster = &map->phys_map[offset];
+ *mask = dest_id & (0xffff >> (16 - cluster_size));
+ } else {
+ *mask = 0;
+ }
+
+ return true;
+ }
+ case KVM_APIC_MODE_XAPIC_FLAT:
+ *cluster = map->xapic_flat_map;
+ *mask = dest_id & 0xff;
+ return true;
+ case KVM_APIC_MODE_XAPIC_CLUSTER:
+ *cluster = map->xapic_cluster_map[(dest_id >> 4) & 0xf];
+ *mask = dest_id & 0xf;
+ return true;
+ case KVM_APIC_MODE_MAP_DISABLED:
+ return false;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+}
+
+static void kvm_apic_map_free(struct rcu_head *rcu)
+{
+ struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu);
+
+ kvfree(map);
+}
+
+static int kvm_recalculate_phys_map(struct kvm_apic_map *new,
+ struct kvm_vcpu *vcpu,
+ bool *xapic_id_mismatch)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 x2apic_id = kvm_x2apic_id(apic);
+ u32 xapic_id = kvm_xapic_id(apic);
+ u32 physical_id;
+
+ /*
+ * For simplicity, KVM always allocates enough space for all possible
+ * xAPIC IDs. Yell, but don't kill the VM, as KVM can continue on
+ * without the optimized map.
+ */
+ if (WARN_ON_ONCE(xapic_id > new->max_apic_id))
+ return -EINVAL;
+
+ /*
+ * Bail if a vCPU was added and/or enabled its APIC between allocating
+ * the map and doing the actual calculations for the map. Note, KVM
+ * hardcodes the x2APIC ID to vcpu_id, i.e. there's no TOCTOU bug if
+ * the compiler decides to reload x2apic_id after this check.
+ */
+ if (x2apic_id > new->max_apic_id)
+ return -E2BIG;
+
+ /*
+ * Deliberately truncate the vCPU ID when detecting a mismatched APIC
+ * ID to avoid false positives if the vCPU ID, i.e. x2APIC ID, is a
+	 * 32-bit value.  Any unwanted aliasing due to truncation will be
+	 * detected below.
+ */
+ if (!apic_x2apic_mode(apic) && xapic_id != (u8)vcpu->vcpu_id)
+ *xapic_id_mismatch = true;
+
+ /*
+	 * Apply KVM's hotplug hack if userspace has enabled 32-bit APIC IDs.
+ * Allow sending events to vCPUs by their x2APIC ID even if the target
+ * vCPU is in legacy xAPIC mode, and silently ignore aliased xAPIC IDs
+ * (the x2APIC ID is truncated to 8 bits, causing IDs > 0xff to wrap
+ * and collide).
+ *
+ * Honor the architectural (and KVM's non-optimized) behavior if
+ * userspace has not enabled 32-bit x2APIC IDs. Each APIC is supposed
+ * to process messages independently. If multiple vCPUs have the same
+ * effective APIC ID, e.g. due to the x2APIC wrap or because the guest
+ * manually modified its xAPIC IDs, events targeting that ID are
+ * supposed to be recognized by all vCPUs with said ID.
+ */
+ if (vcpu->kvm->arch.x2apic_format) {
+ /* See also kvm_apic_match_physical_addr(). */
+ if (apic_x2apic_mode(apic) || x2apic_id > 0xff)
+ new->phys_map[x2apic_id] = apic;
+
+ if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
+ new->phys_map[xapic_id] = apic;
+ } else {
+ /*
+ * Disable the optimized map if the physical APIC ID is already
+ * mapped, i.e. is aliased to multiple vCPUs. The optimized
+ * map requires a strict 1:1 mapping between IDs and vCPUs.
+ */
+ if (apic_x2apic_mode(apic))
+ physical_id = x2apic_id;
+ else
+ physical_id = xapic_id;
+
+ if (new->phys_map[physical_id])
+ return -EINVAL;
+
+ new->phys_map[physical_id] = apic;
+ }
+
+ return 0;
+}
+
+static void kvm_recalculate_logical_map(struct kvm_apic_map *new,
+ struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ enum kvm_apic_logical_mode logical_mode;
+ struct kvm_lapic **cluster;
+ u16 mask;
+ u32 ldr;
+
+ if (new->logical_mode == KVM_APIC_MODE_MAP_DISABLED)
+ return;
+
+ if (!kvm_apic_sw_enabled(apic))
+ return;
+
+ ldr = kvm_lapic_get_reg(apic, APIC_LDR);
+ if (!ldr)
+ return;
+
+ if (apic_x2apic_mode(apic)) {
+ logical_mode = KVM_APIC_MODE_X2APIC;
+ } else {
+ ldr = GET_APIC_LOGICAL_ID(ldr);
+ if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT)
+ logical_mode = KVM_APIC_MODE_XAPIC_FLAT;
+ else
+ logical_mode = KVM_APIC_MODE_XAPIC_CLUSTER;
+ }
+
+ /*
+ * To optimize logical mode delivery, all software-enabled APICs must
+ * be configured for the same mode.
+ */
+ if (new->logical_mode == KVM_APIC_MODE_SW_DISABLED) {
+ new->logical_mode = logical_mode;
+ } else if (new->logical_mode != logical_mode) {
+ new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+ return;
+ }
+
+ /*
+ * In x2APIC mode, the LDR is read-only and derived directly from the
+ * x2APIC ID, thus is guaranteed to be addressable. KVM reuses
+ * kvm_apic_map.phys_map to optimize logical mode x2APIC interrupts by
+ * reversing the LDR calculation to get cluster of APICs, i.e. no
+ * additional work is required.
+ */
+ if (apic_x2apic_mode(apic)) {
+ WARN_ON_ONCE(ldr != kvm_apic_calc_x2apic_ldr(kvm_x2apic_id(apic)));
+ return;
+ }
+
+ if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr,
+ &cluster, &mask))) {
+ new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+ return;
+ }
+
+ if (!mask)
+ return;
+
+ ldr = ffs(mask) - 1;
+ if (!is_power_of_2(mask) || cluster[ldr])
+ new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+ else
+ cluster[ldr] = apic;
+}
+
+/*
+ * CLEAN -> DIRTY and UPDATE_IN_PROGRESS -> DIRTY changes happen without a lock.
+ *
+ * DIRTY -> UPDATE_IN_PROGRESS and UPDATE_IN_PROGRESS -> CLEAN happen with
+ * apic_map_lock held.
+ */
+enum {
+ CLEAN,
+ UPDATE_IN_PROGRESS,
+ DIRTY
+};
+
+void kvm_recalculate_apic_map(struct kvm *kvm)
+{
+ struct kvm_apic_map *new, *old = NULL;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ u32 max_id = 255; /* enough space for any xAPIC ID */
+ bool xapic_id_mismatch;
+ int r;
+
+ /* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */
+ if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN)
+ return;
+
+ WARN_ONCE(!irqchip_in_kernel(kvm),
+ "Dirty APIC map without an in-kernel local APIC");
+
+ mutex_lock(&kvm->arch.apic_map_lock);
+
+retry:
+ /*
+ * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map (if clean)
+ * or the APIC registers (if dirty). Note, on retry the map may have
+ * not yet been marked dirty by whatever task changed a vCPU's x2APIC
+ * ID, i.e. the map may still show up as in-progress. In that case
+ * this task still needs to retry and complete its calculation.
+ */
+ if (atomic_cmpxchg_acquire(&kvm->arch.apic_map_dirty,
+ DIRTY, UPDATE_IN_PROGRESS) == CLEAN) {
+ /* Someone else has updated the map. */
+ mutex_unlock(&kvm->arch.apic_map_lock);
+ return;
+ }
+
+ /*
+ * Reset the mismatch flag between attempts so that KVM does the right
+ * thing if a vCPU changes its xAPIC ID, but do NOT reset max_id, i.e.
+ * keep max_id strictly increasing. Disallowing max_id from shrinking
+	 * keep max_id monotonically non-decreasing.  Disallowing max_id from shrinking
+ * with the highest x2APIC ID is toggling its APIC on and off.
+ */
+ xapic_id_mismatch = false;
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ if (kvm_apic_present(vcpu))
+ max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
+
+ new = kvzalloc(sizeof(struct kvm_apic_map) +
+ sizeof(struct kvm_lapic *) * ((u64)max_id + 1),
+ GFP_KERNEL_ACCOUNT);
+
+ if (!new)
+ goto out;
+
+ new->max_apic_id = max_id;
+ new->logical_mode = KVM_APIC_MODE_SW_DISABLED;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ r = kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch);
+ if (r) {
+ kvfree(new);
+ new = NULL;
+ if (r == -E2BIG) {
+ cond_resched();
+ goto retry;
+ }
+
+ goto out;
+ }
+
+ kvm_recalculate_logical_map(new, vcpu);
+ }
+out:
+ /*
+ * The optimized map is effectively KVM's internal version of APICv,
+ * and all unwanted aliasing that results in disabling the optimized
+ * map also applies to APICv.
+ */
+ if (!new)
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED);
+ else
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED);
+
+ if (!new || new->logical_mode == KVM_APIC_MODE_MAP_DISABLED)
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED);
+ else
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED);
+
+ if (xapic_id_mismatch)
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
+ else
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
+
+ old = rcu_dereference_protected(kvm->arch.apic_map,
+ lockdep_is_held(&kvm->arch.apic_map_lock));
+ rcu_assign_pointer(kvm->arch.apic_map, new);
+ /*
+ * Write kvm->arch.apic_map before clearing apic->apic_map_dirty.
+ * If another update has come in, leave it DIRTY.
+ */
+ atomic_cmpxchg_release(&kvm->arch.apic_map_dirty,
+ UPDATE_IN_PROGRESS, CLEAN);
+ mutex_unlock(&kvm->arch.apic_map_lock);
+
+ if (old)
+ call_rcu(&old->rcu, kvm_apic_map_free);
+
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
+{
+ bool enabled = val & APIC_SPIV_APIC_ENABLED;
+
+ kvm_lapic_set_reg(apic, APIC_SPIV, val);
+
+ if (enabled != apic->sw_enabled) {
+ apic->sw_enabled = enabled;
+ if (enabled)
+ static_branch_slow_dec_deferred(&apic_sw_disabled);
+ else
+ static_branch_inc(&apic_sw_disabled.key);
+
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+ }
+
+ /* Check if there are APF page ready requests pending */
+ if (enabled)
+ kvm_make_request(KVM_REQ_APF_READY, apic->vcpu);
+}
+
+static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
+{
+ kvm_lapic_set_reg(apic, APIC_ID, id << 24);
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+}
+
+static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
+{
+ kvm_lapic_set_reg(apic, APIC_LDR, id);
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+}
+
+static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val)
+{
+ kvm_lapic_set_reg(apic, APIC_DFR, val);
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+}
+
+static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
+{
+ u32 ldr = kvm_apic_calc_x2apic_ldr(id);
+
+ WARN_ON_ONCE(id != apic->vcpu->vcpu_id);
+
+ kvm_lapic_set_reg(apic, APIC_ID, id);
+ kvm_lapic_set_reg(apic, APIC_LDR, ldr);
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+}
+
+static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
+{
+ return !(kvm_lapic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
+}
+
+static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
+{
+ return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_ONESHOT;
+}
+
+static inline int apic_lvtt_period(struct kvm_lapic *apic)
+{
+ return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_PERIODIC;
+}
+
+static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
+{
+ return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_TSCDEADLINE;
+}
+
+static inline int apic_lvt_nmi_mode(u32 lvt_val)
+{
+ return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
+}
+
+static inline bool kvm_lapic_lvt_supported(struct kvm_lapic *apic, int lvt_index)
+{
+ return apic->nr_lvt_entries > lvt_index;
+}
+
+static inline int kvm_apic_calc_nr_lvt_entries(struct kvm_vcpu *vcpu)
+{
+ return KVM_APIC_MAX_NR_LVT_ENTRIES - !(vcpu->arch.mcg_cap & MCG_CMCI_P);
+}
+
+void kvm_apic_set_version(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 v = 0;
+
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ v = APIC_VERSION | ((apic->nr_lvt_entries - 1) << 16);
+
+ /*
+	 * KVM emulates the 82093AA datasheet (with the in-kernel IOAPIC
+	 * implementation), which doesn't have an EOI register.  Some buggy
+	 * OSes (e.g. Windows with the Hyper-V role) disable EOI broadcast in
+	 * the LAPIC without first checking the IOAPIC version, so
+	 * level-triggered interrupts would never get EOIed in the IOAPIC.
+ if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) &&
+ !ioapic_in_kernel(vcpu->kvm))
+ v |= APIC_LVR_DIRECTED_EOI;
+ kvm_lapic_set_reg(apic, APIC_LVR, v);
+}
+
+void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu)
+{
+ int nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu);
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ int i;
+
+ if (!lapic_in_kernel(vcpu) || nr_lvt_entries == apic->nr_lvt_entries)
+ return;
+
+ /* Initialize/mask any "new" LVT entries. */
+ for (i = apic->nr_lvt_entries; i < nr_lvt_entries; i++)
+ kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED);
+
+ apic->nr_lvt_entries = nr_lvt_entries;
+
+ /* The number of LVT entries is reflected in the version register. */
+ kvm_apic_set_version(vcpu);
+}
+
+static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = {
+ [LVT_TIMER] = LVT_MASK, /* timer mode mask added at runtime */
+ [LVT_THERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK,
+ [LVT_PERFORMANCE_COUNTER] = LVT_MASK | APIC_MODE_MASK,
+ [LVT_LINT0] = LINT_MASK,
+ [LVT_LINT1] = LINT_MASK,
+ [LVT_ERROR] = LVT_MASK,
+ [LVT_CMCI] = LVT_MASK | APIC_MODE_MASK
+};
+
+static int find_highest_vector(void *bitmap)
+{
+ int vec;
+ u32 *reg;
+
+ for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG;
+ vec >= 0; vec -= APIC_VECTORS_PER_REG) {
+ reg = bitmap + REG_POS(vec);
+ if (*reg)
+ return __fls(*reg) + vec;
+ }
+
+ return -1;
+}
+
+static u8 count_vectors(void *bitmap)
+{
+ int vec;
+ u32 *reg;
+ u8 count = 0;
+
+ for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) {
+ reg = bitmap + REG_POS(vec);
+ count += hweight32(*reg);
+ }
+
+ return count;
+}
+
+bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr)
+{
+ u32 i, vec;
+ u32 pir_val, irr_val, prev_irr_val;
+ int max_updated_irr;
+
+ max_updated_irr = -1;
+ *max_irr = -1;
+
+ for (i = vec = 0; i <= 7; i++, vec += 32) {
+ u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10);
+
+ irr_val = *p_irr;
+ pir_val = READ_ONCE(pir[i]);
+
+ if (pir_val) {
+ pir_val = xchg(&pir[i], 0);
+
+ prev_irr_val = irr_val;
+ do {
+ irr_val = prev_irr_val | pir_val;
+ } while (prev_irr_val != irr_val &&
+ !try_cmpxchg(p_irr, &prev_irr_val, irr_val));
+
+ if (prev_irr_val != irr_val)
+ max_updated_irr = __fls(irr_val ^ prev_irr_val) + vec;
+ }
+ if (irr_val)
+ *max_irr = __fls(irr_val) + vec;
+ }
+
+ return ((max_updated_irr != -1) &&
+ (max_updated_irr == *max_irr));
+}
+EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
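+
+/*
+ * A worked example of the lock-free merge above (illustrative values):
+ * suppose pir[1] = 0x5 and the IRR word for vectors 32-63 is 0x1.  xchg()
+ * claims the posted bits, the cmpxchg loop publishes irr = 0x1 | 0x5 = 0x5,
+ * and max_updated_irr = __fls(0x1 ^ 0x5) + 32 = 34 == *max_irr, so the
+ * function returns true: vector 34 is newly pending and is the highest.
+ */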
+
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr);
+
+ if (unlikely(!apic->apicv_active && irr_updated))
+ apic->irr_pending = true;
+ return irr_updated;
+}
+EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
+
+static inline int apic_search_irr(struct kvm_lapic *apic)
+{
+ return find_highest_vector(apic->regs + APIC_IRR);
+}
+
+static inline int apic_find_highest_irr(struct kvm_lapic *apic)
+{
+ int result;
+
+ /*
+	 * Note that irr_pending is just a hint. It will always be true with
+	 * virtual interrupt delivery enabled.
+ */
+ if (!apic->irr_pending)
+ return -1;
+
+ result = apic_search_irr(apic);
+ ASSERT(result == -1 || result >= 16);
+
+ return result;
+}
+
+static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
+{
+ if (unlikely(apic->apicv_active)) {
+ /* need to update RVI */
+ kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
+ static_call_cond(kvm_x86_hwapic_irr_update)(apic->vcpu,
+ apic_find_highest_irr(apic));
+ } else {
+ apic->irr_pending = false;
+ kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
+ if (apic_search_irr(apic) != -1)
+ apic->irr_pending = true;
+ }
+}
+
+void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec)
+{
+ apic_clear_irr(vec, vcpu->arch.apic);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_clear_irr);
+
+static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
+{
+ if (__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
+ return;
+
+ /*
+ * With APIC virtualization enabled, all caching is disabled
+ * because the processor can modify ISR under the hood. Instead
+ * just set SVI.
+ */
+ if (unlikely(apic->apicv_active))
+ static_call_cond(kvm_x86_hwapic_isr_update)(vec);
+ else {
+ ++apic->isr_count;
+ BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
+ /*
+		 * The ISR (in-service register) bit is set when an interrupt
+		 * is injected, and only the highest-priority vector is ever
+		 * injected, so the most recently set bit is also the highest
+		 * bit in the ISR.
+ */
+ apic->highest_isr_cache = vec;
+ }
+}
+
+static inline int apic_find_highest_isr(struct kvm_lapic *apic)
+{
+ int result;
+
+ /*
+ * Note that isr_count is always 1, and highest_isr_cache
+ * is always -1, with APIC virtualization enabled.
+ */
+ if (!apic->isr_count)
+ return -1;
+ if (likely(apic->highest_isr_cache != -1))
+ return apic->highest_isr_cache;
+
+ result = find_highest_vector(apic->regs + APIC_ISR);
+ ASSERT(result == -1 || result >= 16);
+
+ return result;
+}
+
+static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
+{
+ if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
+ return;
+
+ /*
+	 * We do get here with APIC virtualization enabled if the guest
+ * uses the Hyper-V APIC enlightenment. In this case we may need
+ * to trigger a new interrupt delivery by writing the SVI field;
+ * on the other hand isr_count and highest_isr_cache are unused
+ * and must be left alone.
+ */
+ if (unlikely(apic->apicv_active))
+ static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic));
+ else {
+ --apic->isr_count;
+ BUG_ON(apic->isr_count < 0);
+ apic->highest_isr_cache = -1;
+ }
+}
+
+int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * This may race with the setting of IRR in __apic_accept_irq(), and
+	 * the value returned may be stale, but kvm_vcpu_kick() in
+	 * __apic_accept_irq() will force an immediate vmexit, and the value
+	 * will be recalculated on the next vmentry.
+ */
+ return apic_find_highest_irr(vcpu->arch.apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
+
+static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
+ int vector, int level, int trig_mode,
+ struct dest_map *dest_map);
+
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
+ struct dest_map *dest_map)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
+ irq->level, irq->trig_mode, dest_map);
+}
+
+static int __pv_send_ipi(unsigned long *ipi_bitmap, struct kvm_apic_map *map,
+ struct kvm_lapic_irq *irq, u32 min)
+{
+ int i, count = 0;
+ struct kvm_vcpu *vcpu;
+
+ if (min > map->max_apic_id)
+ return 0;
+
+ for_each_set_bit(i, ipi_bitmap,
+ min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) {
+ if (map->phys_map[min + i]) {
+ vcpu = map->phys_map[min + i]->vcpu;
+ count += kvm_apic_set_irq(vcpu, irq, NULL);
+ }
+ }
+
+ return count;
+}
+
+int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
+ unsigned long ipi_bitmap_high, u32 min,
+ unsigned long icr, int op_64_bit)
+{
+ struct kvm_apic_map *map;
+ struct kvm_lapic_irq irq = {0};
+ int cluster_size = op_64_bit ? 64 : 32;
+ int count;
+
+ if (icr & (APIC_DEST_MASK | APIC_SHORT_MASK))
+ return -KVM_EINVAL;
+
+ irq.vector = icr & APIC_VECTOR_MASK;
+ irq.delivery_mode = icr & APIC_MODE_MASK;
+ irq.level = (icr & APIC_INT_ASSERT) != 0;
+ irq.trig_mode = icr & APIC_INT_LEVELTRIG;
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ count = -EOPNOTSUPP;
+ if (likely(map)) {
+ count = __pv_send_ipi(&ipi_bitmap_low, map, &irq, min);
+ min += cluster_size;
+ count += __pv_send_ipi(&ipi_bitmap_high, map, &irq, min);
+ }
+
+ rcu_read_unlock();
+ return count;
+}
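+
+/*
+ * A worked example (illustrative values): for a 64-bit guest (op_64_bit=1)
+ * with min = 8, bit 3 of ipi_bitmap_low targets APIC ID 8 + 3 = 11, while
+ * ipi_bitmap_high covers IDs 72..135 once min has been advanced by the
+ * 64-bit cluster size.
+ */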
+
+static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
+{
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val,
+ sizeof(val));
+}
+
+static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
+{
+ return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val,
+ sizeof(*val));
+}
+
+static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
+}
+
+static void pv_eoi_set_pending(struct kvm_vcpu *vcpu)
+{
+ if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0)
+ return;
+
+ __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
+}
+
+static bool pv_eoi_test_and_clr_pending(struct kvm_vcpu *vcpu)
+{
+ u8 val;
+
+ if (pv_eoi_get_user(vcpu, &val) < 0)
+ return false;
+
+ val &= KVM_PV_EOI_ENABLED;
+
+ if (val && pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0)
+ return false;
+
+ /*
+ * Clear pending bit in any case: it will be set again on vmentry.
+	 * While this might not be ideal from a performance point of view,
+ * this makes sure pv eoi is only enabled when we know it's safe.
+ */
+ __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
+
+ return val;
+}
+
+static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
+{
+ int highest_irr;
+ if (kvm_x86_ops.sync_pir_to_irr)
+ highest_irr = static_call(kvm_x86_sync_pir_to_irr)(apic->vcpu);
+ else
+ highest_irr = apic_find_highest_irr(apic);
+ if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr)
+ return -1;
+ return highest_irr;
+}
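+
+/*
+ * For reference: delivery is gated on the priority class (vector bits 7:4).
+ * E.g. with PPR = 0x30, vector 0x3f is held back (0x30 <= 0x30) while
+ * vector 0x40 is deliverable (0x40 > 0x30).
+ */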
+
+static bool __apic_update_ppr(struct kvm_lapic *apic, u32 *new_ppr)
+{
+ u32 tpr, isrv, ppr, old_ppr;
+ int isr;
+
+ old_ppr = kvm_lapic_get_reg(apic, APIC_PROCPRI);
+ tpr = kvm_lapic_get_reg(apic, APIC_TASKPRI);
+ isr = apic_find_highest_isr(apic);
+ isrv = (isr != -1) ? isr : 0;
+
+ if ((tpr & 0xf0) >= (isrv & 0xf0))
+ ppr = tpr & 0xff;
+ else
+ ppr = isrv & 0xf0;
+
+ *new_ppr = ppr;
+ if (old_ppr != ppr)
+ kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr);
+
+ return ppr < old_ppr;
+}
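+
+/*
+ * A worked example (illustrative values): with TPR = 0x20 and vector 0x31
+ * in service (ISRV = 0x31), the TPR class 0x20 is below the ISRV class
+ * 0x30, so PPR = 0x30.  With TPR = 0x45 the TPR class wins and PPR = 0x45,
+ * i.e. the full TPR including bits 3:0.
+ */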
+
+static void apic_update_ppr(struct kvm_lapic *apic)
+{
+ u32 ppr;
+
+ if (__apic_update_ppr(apic, &ppr) &&
+ apic_has_interrupt_for_ppr(apic, ppr) != -1)
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+}
+
+void kvm_apic_update_ppr(struct kvm_vcpu *vcpu)
+{
+ apic_update_ppr(vcpu->arch.apic);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_update_ppr);
+
+static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
+{
+ kvm_lapic_set_reg(apic, APIC_TASKPRI, tpr);
+ apic_update_ppr(apic);
+}
+
+static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda)
+{
+ return mda == (apic_x2apic_mode(apic) ?
+ X2APIC_BROADCAST : APIC_BROADCAST);
+}
+
+static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
+{
+ if (kvm_apic_broadcast(apic, mda))
+ return true;
+
+ /*
+ * Hotplug hack: Accept interrupts for vCPUs in xAPIC mode as if they
+ * were in x2APIC mode if the target APIC ID can't be encoded as an
+ * xAPIC ID. This allows unique addressing of hotplugged vCPUs (which
+ * start in xAPIC mode) with an APIC ID that is unaddressable in xAPIC
+ * mode. Match the x2APIC ID if and only if the target APIC ID can't
+ * be encoded in xAPIC to avoid spurious matches against a vCPU that
+ * changed its (addressable) xAPIC ID (which is writable).
+ */
+ if (apic_x2apic_mode(apic) || mda > 0xff)
+ return mda == kvm_x2apic_id(apic);
+
+ return mda == kvm_xapic_id(apic);
+}
+
+static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
+{
+ u32 logical_id;
+
+ if (kvm_apic_broadcast(apic, mda))
+ return true;
+
+ logical_id = kvm_lapic_get_reg(apic, APIC_LDR);
+
+ if (apic_x2apic_mode(apic))
+ return ((logical_id >> 16) == (mda >> 16))
+ && (logical_id & mda & 0xffff) != 0;
+
+ logical_id = GET_APIC_LOGICAL_ID(logical_id);
+
+ switch (kvm_lapic_get_reg(apic, APIC_DFR)) {
+ case APIC_DFR_FLAT:
+ return (logical_id & mda) != 0;
+ case APIC_DFR_CLUSTER:
+ return ((logical_id >> 4) == (mda >> 4))
+ && (logical_id & mda & 0xf) != 0;
+ default:
+ return false;
+ }
+}
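+
+/*
+ * A worked xAPIC cluster-mode example (illustrative values): with a logical
+ * ID of 0x21 (cluster 2, member bit 0) and MDA 0x23 (cluster 2, members 0
+ * and 1), the clusters match and 0x1 & 0x3 != 0, so the APIC accepts the
+ * interrupt; MDA 0x22 (member 1 only) would not match.
+ */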
+
+/*
+ * The KVM local APIC implementation has two quirks:
+ *
+ * - Real hardware delivers interrupts destined to x2APIC ID > 0xff to LAPICs
+ * in xAPIC mode if the "destination & 0xff" matches its xAPIC ID.
+ * KVM doesn't do that aliasing.
+ *
+ * - in-kernel IOAPIC messages have to be delivered directly to
+ * x2APIC, because the kernel does not support interrupt remapping.
+ * In order to support broadcast without interrupt remapping, x2APIC
+ * rewrites the destination of non-IPI messages from APIC_BROADCAST
+ * to X2APIC_BROADCAST.
+ *
+ * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API. This is
+ * important when userspace wants to use x2APIC-format MSIs, because
+ * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7".
+ */
+static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id,
+ struct kvm_lapic *source, struct kvm_lapic *target)
+{
+ bool ipi = source != NULL;
+
+ if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled &&
+ !ipi && dest_id == APIC_BROADCAST && apic_x2apic_mode(target))
+ return X2APIC_BROADCAST;
+
+ return dest_id;
+}
+
+bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+ int shorthand, unsigned int dest, int dest_mode)
+{
+ struct kvm_lapic *target = vcpu->arch.apic;
+ u32 mda = kvm_apic_mda(vcpu, dest, source, target);
+
+ ASSERT(target);
+ switch (shorthand) {
+ case APIC_DEST_NOSHORT:
+ if (dest_mode == APIC_DEST_PHYSICAL)
+ return kvm_apic_match_physical_addr(target, mda);
+ else
+ return kvm_apic_match_logical_addr(target, mda);
+ case APIC_DEST_SELF:
+ return target == source;
+ case APIC_DEST_ALLINC:
+ return true;
+ case APIC_DEST_ALLBUT:
+ return target != source;
+ default:
+ return false;
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_apic_match_dest);
+
+int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
+ const unsigned long *bitmap, u32 bitmap_size)
+{
+ u32 mod;
+ int i, idx = -1;
+
+ mod = vector % dest_vcpus;
+
+ for (i = 0; i <= mod; i++) {
+ idx = find_next_bit(bitmap, bitmap_size, idx + 1);
+ BUG_ON(idx == bitmap_size);
+ }
+
+ return idx;
+}
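+
+/*
+ * A worked example (illustrative values): with vector 0x26 (38) and three
+ * destination vCPUs, mod = 38 % 3 = 2, so the loop runs three times and
+ * returns the index of the third set bit in @bitmap; the hash thus spreads
+ * vectors deterministically across the destination set.
+ */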
+
+static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
+{
+ if (!kvm->arch.disabled_lapic_found) {
+ kvm->arch.disabled_lapic_found = true;
+ pr_info("Disabled LAPIC found during irq injection\n");
+ }
+}
+
+static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src,
+ struct kvm_lapic_irq *irq, struct kvm_apic_map *map)
+{
+ if (kvm->arch.x2apic_broadcast_quirk_disabled) {
+ if ((irq->dest_id == APIC_BROADCAST &&
+ map->logical_mode != KVM_APIC_MODE_X2APIC))
+ return true;
+ if (irq->dest_id == X2APIC_BROADCAST)
+ return true;
+ } else {
+ bool x2apic_ipi = src && *src && apic_x2apic_mode(*src);
+ if (irq->dest_id == (x2apic_ipi ?
+ X2APIC_BROADCAST : APIC_BROADCAST))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Return true if the interrupt can be handled by using *bitmap as index mask
+ * for valid destinations in *dst array.
+ * Return false if kvm_apic_map_get_dest_lapic did nothing useful.
+ * Note: we may have zero kvm_lapic destinations when we return true, which
+ * means that the interrupt should be dropped. In this case, *bitmap would be
+ * zero and *dst undefined.
+ */
+static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm,
+ struct kvm_lapic **src, struct kvm_lapic_irq *irq,
+ struct kvm_apic_map *map, struct kvm_lapic ***dst,
+ unsigned long *bitmap)
+{
+ int i, lowest;
+
+ if (irq->shorthand == APIC_DEST_SELF && src) {
+ *dst = src;
+ *bitmap = 1;
+ return true;
+ } else if (irq->shorthand)
+ return false;
+
+ if (!map || kvm_apic_is_broadcast_dest(kvm, src, irq, map))
+ return false;
+
+ if (irq->dest_mode == APIC_DEST_PHYSICAL) {
+ if (irq->dest_id > map->max_apic_id) {
+ *bitmap = 0;
+ } else {
+ u32 dest_id = array_index_nospec(irq->dest_id, map->max_apic_id + 1);
+ *dst = &map->phys_map[dest_id];
+ *bitmap = 1;
+ }
+ return true;
+ }
+
+ *bitmap = 0;
+ if (!kvm_apic_map_get_logical_dest(map, irq->dest_id, dst,
+ (u16 *)bitmap))
+ return false;
+
+ if (!kvm_lowest_prio_delivery(irq))
+ return true;
+
+ if (!kvm_vector_hashing_enabled()) {
+ lowest = -1;
+ for_each_set_bit(i, bitmap, 16) {
+ if (!(*dst)[i])
+ continue;
+ if (lowest < 0)
+ lowest = i;
+ else if (kvm_apic_compare_prio((*dst)[i]->vcpu,
+ (*dst)[lowest]->vcpu) < 0)
+ lowest = i;
+ }
+ } else {
+ if (!*bitmap)
+ return true;
+
+ lowest = kvm_vector_to_index(irq->vector, hweight16(*bitmap),
+ bitmap, 16);
+
+ if (!(*dst)[lowest]) {
+ kvm_apic_disabled_lapic_found(kvm);
+ *bitmap = 0;
+ return true;
+ }
+ }
+
+ *bitmap = (lowest >= 0) ? 1 << lowest : 0;
+
+ return true;
+}
+
+bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
+{
+ struct kvm_apic_map *map;
+ unsigned long bitmap;
+ struct kvm_lapic **dst = NULL;
+ int i;
+ bool ret;
+
+ *r = -1;
+
+ if (irq->shorthand == APIC_DEST_SELF) {
+ if (KVM_BUG_ON(!src, kvm)) {
+ *r = 0;
+ return true;
+ }
+ *r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
+ return true;
+ }
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap);
+ if (ret) {
+ *r = 0;
+ for_each_set_bit(i, &bitmap, 16) {
+ if (!dst[i])
+ continue;
+ *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
+ }
+ }
+
+ rcu_read_unlock();
+ return ret;
+}
+
+/*
+ * This routine tries to handle interrupts in posted mode; it deals with
+ * the different cases as follows:
+ * - For single-destination interrupts, handle it in posted mode
+ * - Else if vector hashing is enabled and it is a lowest-priority
+ * interrupt, handle it in posted mode and use the following mechanism
+ * to find the destination vCPU.
+ * 1. For lowest-priority interrupts, store all the possible
+ * destination vCPUs in an array.
+ * 2. Use "guest vector % max number of destination vCPUs" to find
+ * the right destination vCPU in the array for the lowest-priority
+ * interrupt.
+ * - Otherwise, use remapped mode to inject the interrupt.
+ */
+bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu)
+{
+ struct kvm_apic_map *map;
+ unsigned long bitmap;
+ struct kvm_lapic **dst = NULL;
+ bool ret = false;
+
+ if (irq->shorthand)
+ return false;
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) &&
+ hweight16(bitmap) == 1) {
+ unsigned long i = find_first_bit(&bitmap, 16);
+
+ if (dst[i]) {
+ *dest_vcpu = dst[i]->vcpu;
+ ret = true;
+ }
+ }
+
+ rcu_read_unlock();
+ return ret;
+}
+
+/*
+ * Add a pending IRQ into lapic.
+ * Return 1 if successfully added and 0 if discarded.
+ */
+static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
+ int vector, int level, int trig_mode,
+ struct dest_map *dest_map)
+{
+ int result = 0;
+ struct kvm_vcpu *vcpu = apic->vcpu;
+
+ trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
+ trig_mode, vector);
+ switch (delivery_mode) {
+ case APIC_DM_LOWEST:
+ vcpu->arch.apic_arb_prio++;
+ fallthrough;
+ case APIC_DM_FIXED:
+ if (unlikely(trig_mode && !level))
+ break;
+
+ /* FIXME add logic for vcpu on reset */
+ if (unlikely(!apic_enabled(apic)))
+ break;
+
+ result = 1;
+
+ if (dest_map) {
+ __set_bit(vcpu->vcpu_id, dest_map->map);
+ dest_map->vectors[vcpu->vcpu_id] = vector;
+ }
+
+ if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
+ if (trig_mode)
+ kvm_lapic_set_vector(vector,
+ apic->regs + APIC_TMR);
+ else
+ kvm_lapic_clear_vector(vector,
+ apic->regs + APIC_TMR);
+ }
+
+ static_call(kvm_x86_deliver_interrupt)(apic, delivery_mode,
+ trig_mode, vector);
+ break;
+
+ case APIC_DM_REMRD:
+ result = 1;
+ vcpu->arch.pv.pv_unhalted = 1;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ break;
+
+ case APIC_DM_SMI:
+ if (!kvm_inject_smi(vcpu)) {
+ kvm_vcpu_kick(vcpu);
+ result = 1;
+ }
+ break;
+
+ case APIC_DM_NMI:
+ result = 1;
+ kvm_inject_nmi(vcpu);
+ kvm_vcpu_kick(vcpu);
+ break;
+
+ case APIC_DM_INIT:
+ if (!trig_mode || level) {
+ result = 1;
+ /* assumes that there are only KVM_APIC_INIT/SIPI */
+ apic->pending_events = (1UL << KVM_APIC_INIT);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
+ break;
+
+ case APIC_DM_STARTUP:
+ result = 1;
+ apic->sipi_vector = vector;
+	/* make sure sipi_vector is visible to the receiver */
+ smp_wmb();
+ set_bit(KVM_APIC_SIPI, &apic->pending_events);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ break;
+
+ case APIC_DM_EXTINT:
+ /*
+ * Should only be called by kvm_apic_local_deliver() with LVT0,
+ * before NMI watchdog was enabled. Already handled by
+ * kvm_apic_accept_pic_intr().
+ */
+ break;
+
+ default:
+ printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
+ delivery_mode);
+ break;
+ }
+ return result;
+}
+
+/*
+ * This routine identifies the mask of destination vCPUs meant to receive an
+ * IOAPIC interrupt. It either uses kvm_apic_map_get_dest_lapic() to find the
+ * destination vCPU array and set the bitmap, or it walks each available vCPU
+ * and matches the destination directly.
+ */
+void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ unsigned long *vcpu_bitmap)
+{
+ struct kvm_lapic **dest_vcpu = NULL;
+ struct kvm_lapic *src = NULL;
+ struct kvm_apic_map *map;
+ struct kvm_vcpu *vcpu;
+ unsigned long bitmap, i;
+ int vcpu_idx;
+ bool ret;
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dest_vcpu,
+ &bitmap);
+ if (ret) {
+ for_each_set_bit(i, &bitmap, 16) {
+ if (!dest_vcpu[i])
+ continue;
+ vcpu_idx = dest_vcpu[i]->vcpu->vcpu_idx;
+ __set_bit(vcpu_idx, vcpu_bitmap);
+ }
+ } else {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+ if (!kvm_apic_match_dest(vcpu, NULL,
+ irq->shorthand,
+ irq->dest_id,
+ irq->dest_mode))
+ continue;
+ __set_bit(i, vcpu_bitmap);
+ }
+ }
+ rcu_read_unlock();
+}
+
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
+{
+ return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
+}
+
+static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
+{
+ return test_bit(vector, apic->vcpu->arch.ioapic_handled_vectors);
+}
+
+static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
+{
+ int trigger_mode;
+
+ /* Eoi the ioapic only if the ioapic doesn't own the vector. */
+ if (!kvm_ioapic_handles_vector(apic, vector))
+ return;
+
+ /* Request a KVM exit to inform the userspace IOAPIC. */
+ if (irqchip_split(apic->vcpu->kvm)) {
+ apic->vcpu->arch.pending_ioapic_eoi = vector;
+ kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu);
+ return;
+ }
+
+ if (apic_test_vector(vector, apic->regs + APIC_TMR))
+ trigger_mode = IOAPIC_LEVEL_TRIG;
+ else
+ trigger_mode = IOAPIC_EDGE_TRIG;
+
+ kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
+}
+
+static int apic_set_eoi(struct kvm_lapic *apic)
+{
+ int vector = apic_find_highest_isr(apic);
+
+ trace_kvm_eoi(apic, vector);
+
+ /*
+	 * Not every EOI write has a corresponding ISR bit; one example is
+	 * when the kernel checks the timer during setup_IO_APIC.
+ */
+ if (vector == -1)
+ return vector;
+
+ apic_clear_isr(vector, apic);
+ apic_update_ppr(apic);
+
+ if (to_hv_vcpu(apic->vcpu) &&
+ test_bit(vector, to_hv_synic(apic->vcpu)->vec_bitmap))
+ kvm_hv_synic_send_eoi(apic->vcpu, vector);
+
+ kvm_ioapic_send_eoi(apic, vector);
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+ return vector;
+}
+
+/*
+ * this interface assumes a trap-like exit, which has already finished
+ * desired side effect including vISR and vPPR update.
+ */
+void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ trace_kvm_eoi(apic, vector);
+
+ kvm_ioapic_send_eoi(apic, vector);
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated);
+
+void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high)
+{
+ struct kvm_lapic_irq irq;
+
+ /* KVM has no delay and should always clear the BUSY/PENDING flag. */
+ WARN_ON_ONCE(icr_low & APIC_ICR_BUSY);
+
+ irq.vector = icr_low & APIC_VECTOR_MASK;
+ irq.delivery_mode = icr_low & APIC_MODE_MASK;
+ irq.dest_mode = icr_low & APIC_DEST_MASK;
+ irq.level = (icr_low & APIC_INT_ASSERT) != 0;
+ irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
+ irq.shorthand = icr_low & APIC_SHORT_MASK;
+ irq.msi_redir_hint = false;
+ if (apic_x2apic_mode(apic))
+ irq.dest_id = icr_high;
+ else
+ irq.dest_id = GET_XAPIC_DEST_FIELD(icr_high);
+
+ trace_kvm_apic_ipi(icr_low, irq.dest_id);
+
+ kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_send_ipi);
+
+static u32 apic_get_tmcct(struct kvm_lapic *apic)
+{
+ ktime_t remaining, now;
+ s64 ns;
+
+ ASSERT(apic != NULL);
+
+ /* if initial count is 0, current count should also be 0 */
+ if (kvm_lapic_get_reg(apic, APIC_TMICT) == 0 ||
+ apic->lapic_timer.period == 0)
+ return 0;
+
+ now = ktime_get();
+ remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
+ if (ktime_to_ns(remaining) < 0)
+ remaining = 0;
+
+ ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
+ return div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->divide_count));
+}
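+
+/*
+ * Illustrative example for apic_get_tmcct() above (assumed values): with
+ * the 1 ns bus cycle (APIC_BUS_CYCLE_NS) and divide_count == 16, 1600 ns
+ * of remaining time reads back as a current count of 100 timer ticks.
+ */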
+
+static void __report_tpr_access(struct kvm_lapic *apic, bool write)
+{
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ struct kvm_run *run = vcpu->run;
+
+ kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);
+ run->tpr_access.rip = kvm_rip_read(vcpu);
+ run->tpr_access.is_write = write;
+}
+
+static inline void report_tpr_access(struct kvm_lapic *apic, bool write)
+{
+ if (apic->vcpu->arch.tpr_access_reporting)
+ __report_tpr_access(apic, write);
+}
+
+static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
+{
+ u32 val = 0;
+
+ if (offset >= LAPIC_MMIO_LENGTH)
+ return 0;
+
+ switch (offset) {
+ case APIC_ARBPRI:
+ break;
+
+ case APIC_TMCCT: /* Timer CCR */
+ if (apic_lvtt_tscdeadline(apic))
+ return 0;
+
+ val = apic_get_tmcct(apic);
+ break;
+ case APIC_PROCPRI:
+ apic_update_ppr(apic);
+ val = kvm_lapic_get_reg(apic, offset);
+ break;
+ case APIC_TASKPRI:
+ report_tpr_access(apic, false);
+ fallthrough;
+ default:
+ val = kvm_lapic_get_reg(apic, offset);
+ break;
+ }
+
+ return val;
+}
+
+static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_lapic, dev);
+}
+
+#define APIC_REG_MASK(reg) (1ull << ((reg) >> 4))
+#define APIC_REGS_MASK(first, count) \
+ (APIC_REG_MASK(first) * ((1ull << (count)) - 1))
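+/*
+ * Each APIC register occupies a 16-byte slot, so bit (reg >> 4) of the
+ * mask identifies the register. E.g. APIC_TASKPRI (offset 0x80) maps to
+ * bit 8, and APIC_REGS_MASK(APIC_ISR, APIC_ISR_NR) covers the eight
+ * consecutive 32-bit ISR words at offsets 0x100-0x170.
+ */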
+
+u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic)
+{
+ /* Leave bits '0' for reserved and write-only registers. */
+ u64 valid_reg_mask =
+ APIC_REG_MASK(APIC_ID) |
+ APIC_REG_MASK(APIC_LVR) |
+ APIC_REG_MASK(APIC_TASKPRI) |
+ APIC_REG_MASK(APIC_PROCPRI) |
+ APIC_REG_MASK(APIC_LDR) |
+ APIC_REG_MASK(APIC_SPIV) |
+ APIC_REGS_MASK(APIC_ISR, APIC_ISR_NR) |
+ APIC_REGS_MASK(APIC_TMR, APIC_ISR_NR) |
+ APIC_REGS_MASK(APIC_IRR, APIC_ISR_NR) |
+ APIC_REG_MASK(APIC_ESR) |
+ APIC_REG_MASK(APIC_ICR) |
+ APIC_REG_MASK(APIC_LVTT) |
+ APIC_REG_MASK(APIC_LVTTHMR) |
+ APIC_REG_MASK(APIC_LVTPC) |
+ APIC_REG_MASK(APIC_LVT0) |
+ APIC_REG_MASK(APIC_LVT1) |
+ APIC_REG_MASK(APIC_LVTERR) |
+ APIC_REG_MASK(APIC_TMICT) |
+ APIC_REG_MASK(APIC_TMCCT) |
+ APIC_REG_MASK(APIC_TDCR);
+
+ if (kvm_lapic_lvt_supported(apic, LVT_CMCI))
+ valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI);
+
+ /* ARBPRI, DFR, and ICR2 are not valid in x2APIC mode. */
+ if (!apic_x2apic_mode(apic))
+ valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI) |
+ APIC_REG_MASK(APIC_DFR) |
+ APIC_REG_MASK(APIC_ICR2);
+
+ return valid_reg_mask;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_readable_reg_mask);
+
+static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
+ void *data)
+{
+ unsigned char alignment = offset & 0xf;
+ u32 result;
+
+ /*
+ * WARN if KVM reads ICR in x2APIC mode, as it's an 8-byte register in
+ * x2APIC and needs to be manually handled by the caller.
+ */
+ WARN_ON_ONCE(apic_x2apic_mode(apic) && offset == APIC_ICR);
+
+ if (alignment + len > 4)
+ return 1;
+
+ if (offset > 0x3f0 ||
+ !(kvm_lapic_readable_reg_mask(apic) & APIC_REG_MASK(offset)))
+ return 1;
+
+ result = __apic_read(apic, offset & ~0xf);
+
+ trace_kvm_apic_read(offset, result);
+
+ switch (len) {
+ case 1:
+ case 2:
+ case 4:
+ memcpy(data, (char *)&result + alignment, len);
+ break;
+ default:
+ printk(KERN_ERR "Local APIC read with len = %x, "
+ "should be 1,2, or 4 instead\n", len);
+ break;
+ }
+ return 0;
+}
+
+static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
+{
+ return addr >= apic->base_address &&
+ addr < apic->base_address + LAPIC_MMIO_LENGTH;
+}
+
+static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+ gpa_t address, int len, void *data)
+{
+ struct kvm_lapic *apic = to_lapic(this);
+ u32 offset = address - apic->base_address;
+
+ if (!apic_mmio_in_range(apic, address))
+ return -EOPNOTSUPP;
+
+ if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) {
+ if (!kvm_check_has_quirk(vcpu->kvm,
+ KVM_X86_QUIRK_LAPIC_MMIO_HOLE))
+ return -EOPNOTSUPP;
+
+ memset(data, 0xff, len);
+ return 0;
+ }
+
+ kvm_lapic_reg_read(apic, offset, len, data);
+
+ return 0;
+}
+
+static void update_divide_count(struct kvm_lapic *apic)
+{
+ u32 tmp1, tmp2, tdcr;
+
+ tdcr = kvm_lapic_get_reg(apic, APIC_TDCR);
+ tmp1 = tdcr & 0xf;
+ tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
+ apic->divide_count = 0x1 << (tmp2 & 0x7);
+}
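+
+/*
+ * Illustrative TDCR decodings for update_divide_count() above: 0x0 yields
+ * divide-by-2, 0x8 yields divide-by-32, and 0xb (all ones across bits 0-1
+ * and bit 3) wraps around to divide-by-1.
+ */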
+
+static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
+{
+ /*
+	 * Do not allow the guest to program periodic timers with a small
+	 * interval, since the hrtimers are not throttled by the host
+ * scheduler.
+ */
+ if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
+ s64 min_period = min_timer_period_us * 1000LL;
+
+ if (apic->lapic_timer.period < min_period) {
+ pr_info_ratelimited(
+ "vcpu %i: requested %lld ns "
+ "lapic timer period limited to %lld ns\n",
+ apic->vcpu->vcpu_id,
+ apic->lapic_timer.period, min_period);
+ apic->lapic_timer.period = min_period;
+ }
+ }
+}
+
+static void cancel_hv_timer(struct kvm_lapic *apic);
+
+static void cancel_apic_timer(struct kvm_lapic *apic)
+{
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ preempt_disable();
+ if (apic->lapic_timer.hv_timer_in_use)
+ cancel_hv_timer(apic);
+ preempt_enable();
+ atomic_set(&apic->lapic_timer.pending, 0);
+}
+
+static void apic_update_lvtt(struct kvm_lapic *apic)
+{
+ u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) &
+ apic->lapic_timer.timer_mode_mask;
+
+ if (apic->lapic_timer.timer_mode != timer_mode) {
+ if (apic_lvtt_tscdeadline(apic) != (timer_mode ==
+ APIC_LVT_TIMER_TSCDEADLINE)) {
+ cancel_apic_timer(apic);
+ kvm_lapic_set_reg(apic, APIC_TMICT, 0);
+ apic->lapic_timer.period = 0;
+ apic->lapic_timer.tscdeadline = 0;
+ }
+ apic->lapic_timer.timer_mode = timer_mode;
+ limit_periodic_timer_frequency(apic);
+ }
+}
+
+/*
+ * On APICv, this test can cause a busy wait while a higher-priority
+ * task is running.
+ */
+
+static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = kvm_lapic_get_reg(apic, APIC_LVTT);
+
+ if (kvm_apic_hw_enabled(apic)) {
+ int vec = reg & APIC_VECTOR_MASK;
+ void *bitmap = apic->regs + APIC_ISR;
+
+ if (apic->apicv_active)
+ bitmap = apic->regs + APIC_IRR;
+
+ if (apic_test_vector(vec, bitmap))
+ return true;
+ }
+ return false;
+}
+
+static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles)
+{
+ u64 timer_advance_ns = vcpu->arch.apic->lapic_timer.timer_advance_ns;
+
+ /*
+ * If the guest TSC is running at a different ratio than the host, then
+ * convert the delay to nanoseconds to achieve an accurate delay. Note
+ * that __delay() uses delay_tsc whenever the hardware has TSC, thus
+ * always for VMX enabled hardware.
+ */
+ if (vcpu->arch.tsc_scaling_ratio == kvm_caps.default_tsc_scaling_ratio) {
+ __delay(min(guest_cycles,
+ nsec_to_cycles(vcpu, timer_advance_ns)));
+ } else {
+ u64 delay_ns = guest_cycles * 1000000ULL;
+ do_div(delay_ns, vcpu->arch.virtual_tsc_khz);
+ ndelay(min_t(u32, delay_ns, timer_advance_ns));
+ }
+}
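+
+/*
+ * Worked example of the cycles-to-ns conversion above (illustrative
+ * numbers): 2,000,000 guest cycles with virtual_tsc_khz == 2,000,000
+ * (a 2 GHz guest TSC) give 2,000,000 * 1,000,000 / 2,000,000 =
+ * 1,000,000 ns, i.e. a 1 ms delay.
+ */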
+
+static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
+ s64 advance_expire_delta)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
+ u64 ns;
+
+ /* Do not adjust for tiny fluctuations or large random spikes. */
+ if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_ADJUST_MAX ||
+ abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_MIN)
+ return;
+
+ /* too early */
+ if (advance_expire_delta < 0) {
+ ns = -advance_expire_delta * 1000000ULL;
+ do_div(ns, vcpu->arch.virtual_tsc_khz);
+ timer_advance_ns -= ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
+ } else {
+ /* too late */
+ ns = advance_expire_delta * 1000000ULL;
+ do_div(ns, vcpu->arch.virtual_tsc_khz);
+ timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
+ }
+
+ if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_NS_MAX))
+ timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT;
+ apic->lapic_timer.timer_advance_ns = timer_advance_ns;
+}
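+
+/*
+ * Note the adjustment above is deliberately damped: only the error
+ * divided by LAPIC_TIMER_ADVANCE_ADJUST_STEP is applied per sample, so a
+ * single outlier cannot swing timer_advance_ns far from its steady-state
+ * value.
+ */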
+
+static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u64 guest_tsc, tsc_deadline;
+
+ tsc_deadline = apic->lapic_timer.expired_tscdeadline;
+ apic->lapic_timer.expired_tscdeadline = 0;
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
+
+ if (lapic_timer_advance_dynamic) {
+ adjust_lapic_timer_advance(vcpu, guest_tsc - tsc_deadline);
+ /*
+ * If the timer fired early, reread the TSC to account for the
+ * overhead of the above adjustment to avoid waiting longer
+ * than is necessary.
+ */
+ if (guest_tsc < tsc_deadline)
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ }
+
+ if (guest_tsc < tsc_deadline)
+ __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
+}
+
+void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+ if (lapic_in_kernel(vcpu) &&
+ vcpu->arch.apic->lapic_timer.expired_tscdeadline &&
+ vcpu->arch.apic->lapic_timer.timer_advance_ns &&
+ lapic_timer_int_injected(vcpu))
+ __kvm_wait_lapic_expire(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire);
+
+static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic)
+{
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+
+ kvm_apic_local_deliver(apic, APIC_LVTT);
+ if (apic_lvtt_tscdeadline(apic)) {
+ ktimer->tscdeadline = 0;
+ } else if (apic_lvtt_oneshot(apic)) {
+ ktimer->tscdeadline = 0;
+ ktimer->target_expiration = 0;
+ }
+}
+
+static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn)
+{
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+
+ if (atomic_read(&apic->lapic_timer.pending))
+ return;
+
+ if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use)
+ ktimer->expired_tscdeadline = ktimer->tscdeadline;
+
+ if (!from_timer_fn && apic->apicv_active) {
+ WARN_ON(kvm_get_running_vcpu() != vcpu);
+ kvm_apic_inject_pending_timer_irqs(apic);
+ return;
+ }
+
+ if (kvm_use_posted_timer_interrupt(apic->vcpu)) {
+ /*
+ * Ensure the guest's timer has truly expired before posting an
+ * interrupt. Open code the relevant checks to avoid querying
+ * lapic_timer_int_injected(), which will be false since the
+ * interrupt isn't yet injected. Waiting until after injecting
+ * is not an option since that won't help a posted interrupt.
+ */
+ if (vcpu->arch.apic->lapic_timer.expired_tscdeadline &&
+ vcpu->arch.apic->lapic_timer.timer_advance_ns)
+ __kvm_wait_lapic_expire(vcpu);
+ kvm_apic_inject_pending_timer_irqs(apic);
+ return;
+ }
+
+ atomic_inc(&apic->lapic_timer.pending);
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+ if (from_timer_fn)
+ kvm_vcpu_kick(vcpu);
+}
+
+static void start_sw_tscdeadline(struct kvm_lapic *apic)
+{
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+ u64 guest_tsc, tscdeadline = ktimer->tscdeadline;
+ u64 ns = 0;
+ ktime_t expire;
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
+ unsigned long flags;
+ ktime_t now;
+
+ if (unlikely(!tscdeadline || !this_tsc_khz))
+ return;
+
+ local_irq_save(flags);
+
+ now = ktime_get();
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+
+ ns = (tscdeadline - guest_tsc) * 1000000ULL;
+ do_div(ns, this_tsc_khz);
+
+ if (likely(tscdeadline > guest_tsc) &&
+ likely(ns > apic->lapic_timer.timer_advance_ns)) {
+ expire = ktime_add_ns(now, ns);
+ expire = ktime_sub_ns(expire, ktimer->timer_advance_ns);
+ hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_HARD);
+ } else
+ apic_timer_expired(apic, false);
+
+ local_irq_restore(flags);
+}
+
+static inline u64 tmict_to_ns(struct kvm_lapic *apic, u32 tmict)
+{
+ return (u64)tmict * APIC_BUS_CYCLE_NS * (u64)apic->divide_count;
+}
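+
+/*
+ * E.g. (illustrative values): TMICT == 100,000 with the 1 ns bus cycle
+ * and divide_count == 16 gives tmict_to_ns() == 1,600,000 ns, i.e. a
+ * 1.6 ms timer period.
+ */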
+
+static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
+{
+ ktime_t now, remaining;
+ u64 ns_remaining_old, ns_remaining_new;
+
+ apic->lapic_timer.period =
+ tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT));
+ limit_periodic_timer_frequency(apic);
+
+ now = ktime_get();
+ remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
+ if (ktime_to_ns(remaining) < 0)
+ remaining = 0;
+
+ ns_remaining_old = ktime_to_ns(remaining);
+ ns_remaining_new = mul_u64_u32_div(ns_remaining_old,
+ apic->divide_count, old_divisor);
+
+ apic->lapic_timer.tscdeadline +=
+ nsec_to_cycles(apic->vcpu, ns_remaining_new) -
+ nsec_to_cycles(apic->vcpu, ns_remaining_old);
+ apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new);
+}
+
+static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg)
+{
+ ktime_t now;
+ u64 tscl = rdtsc();
+ s64 deadline;
+
+ now = ktime_get();
+ apic->lapic_timer.period =
+ tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT));
+
+ if (!apic->lapic_timer.period) {
+ apic->lapic_timer.tscdeadline = 0;
+ return false;
+ }
+
+ limit_periodic_timer_frequency(apic);
+ deadline = apic->lapic_timer.period;
+
+ if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
+ if (unlikely(count_reg != APIC_TMICT)) {
+ deadline = tmict_to_ns(apic,
+ kvm_lapic_get_reg(apic, count_reg));
+ if (unlikely(deadline <= 0)) {
+ if (apic_lvtt_period(apic))
+ deadline = apic->lapic_timer.period;
+ else
+ deadline = 0;
+ }
+ else if (unlikely(deadline > apic->lapic_timer.period)) {
+ pr_info_ratelimited(
+ "vcpu %i: requested lapic timer restore with "
+ "starting count register %#x=%u (%lld ns) > initial count (%lld ns). "
+ "Using initial count to start timer.\n",
+ apic->vcpu->vcpu_id,
+ count_reg,
+ kvm_lapic_get_reg(apic, count_reg),
+ deadline, apic->lapic_timer.period);
+ kvm_lapic_set_reg(apic, count_reg, 0);
+ deadline = apic->lapic_timer.period;
+ }
+ }
+ }
+
+ apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
+ nsec_to_cycles(apic->vcpu, deadline);
+ apic->lapic_timer.target_expiration = ktime_add_ns(now, deadline);
+
+ return true;
+}
+
+static void advance_periodic_target_expiration(struct kvm_lapic *apic)
+{
+ ktime_t now = ktime_get();
+ u64 tscl = rdtsc();
+ ktime_t delta;
+
+ /*
+ * Synchronize both deadlines to the same time source or
+ * differences in the periods (caused by differences in the
+ * underlying clocks or numerical approximation errors) will
+ * cause the two to drift apart over time as the errors
+ * accumulate.
+ */
+ apic->lapic_timer.target_expiration =
+ ktime_add_ns(apic->lapic_timer.target_expiration,
+ apic->lapic_timer.period);
+ delta = ktime_sub(apic->lapic_timer.target_expiration, now);
+ apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
+ nsec_to_cycles(apic->vcpu, delta);
+}
+
+static void start_sw_period(struct kvm_lapic *apic)
+{
+ if (!apic->lapic_timer.period)
+ return;
+
+ if (ktime_after(ktime_get(),
+ apic->lapic_timer.target_expiration)) {
+ apic_timer_expired(apic, false);
+
+ if (apic_lvtt_oneshot(apic))
+ return;
+
+ advance_periodic_target_expiration(apic);
+ }
+
+ hrtimer_start(&apic->lapic_timer.timer,
+ apic->lapic_timer.target_expiration,
+ HRTIMER_MODE_ABS_HARD);
+}
+
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
+{
+ if (!lapic_in_kernel(vcpu))
+ return false;
+
+ return vcpu->arch.apic->lapic_timer.hv_timer_in_use;
+}
+
+static void cancel_hv_timer(struct kvm_lapic *apic)
+{
+ WARN_ON(preemptible());
+ WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+ static_call(kvm_x86_cancel_hv_timer)(apic->vcpu);
+ apic->lapic_timer.hv_timer_in_use = false;
+}
+
+static bool start_hv_timer(struct kvm_lapic *apic)
+{
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ bool expired;
+
+ WARN_ON(preemptible());
+ if (!kvm_can_use_hv_timer(vcpu))
+ return false;
+
+ if (!ktimer->tscdeadline)
+ return false;
+
+ if (static_call(kvm_x86_set_hv_timer)(vcpu, ktimer->tscdeadline, &expired))
+ return false;
+
+ ktimer->hv_timer_in_use = true;
+ hrtimer_cancel(&ktimer->timer);
+
+ /*
+ * To simplify handling the periodic timer, leave the hv timer running
+ * even if the deadline timer has expired, i.e. rely on the resulting
+ * VM-Exit to recompute the periodic timer's target expiration.
+ */
+ if (!apic_lvtt_period(apic)) {
+ /*
+ * Cancel the hv timer if the sw timer fired while the hv timer
+ * was being programmed, or if the hv timer itself expired.
+ */
+ if (atomic_read(&ktimer->pending)) {
+ cancel_hv_timer(apic);
+ } else if (expired) {
+ apic_timer_expired(apic, false);
+ cancel_hv_timer(apic);
+ }
+ }
+
+ trace_kvm_hv_timer_state(vcpu->vcpu_id, ktimer->hv_timer_in_use);
+
+ return true;
+}
+
+static void start_sw_timer(struct kvm_lapic *apic)
+{
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+
+ WARN_ON(preemptible());
+ if (apic->lapic_timer.hv_timer_in_use)
+ cancel_hv_timer(apic);
+ if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
+ return;
+
+ if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
+ start_sw_period(apic);
+ else if (apic_lvtt_tscdeadline(apic))
+ start_sw_tscdeadline(apic);
+ trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, false);
+}
+
+static void restart_apic_timer(struct kvm_lapic *apic)
+{
+ preempt_disable();
+
+ if (!apic_lvtt_period(apic) && atomic_read(&apic->lapic_timer.pending))
+ goto out;
+
+ if (!start_hv_timer(apic))
+ start_sw_timer(apic);
+out:
+ preempt_enable();
+}
+
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ preempt_disable();
+ /* If the preempt notifier has already run, it also called apic_timer_expired */
+ if (!apic->lapic_timer.hv_timer_in_use)
+ goto out;
+ WARN_ON(kvm_vcpu_is_blocking(vcpu));
+ apic_timer_expired(apic, false);
+ cancel_hv_timer(apic);
+
+ if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
+ advance_periodic_target_expiration(apic);
+ restart_apic_timer(apic);
+ }
+out:
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
+
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
+{
+ restart_apic_timer(vcpu->arch.apic);
+}
+
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ preempt_disable();
+ /* Possibly the TSC deadline timer is not enabled yet */
+ if (apic->lapic_timer.hv_timer_in_use)
+ start_sw_timer(apic);
+ preempt_enable();
+}
+
+void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+ restart_apic_timer(apic);
+}
+
+static void __start_apic_timer(struct kvm_lapic *apic, u32 count_reg)
+{
+ atomic_set(&apic->lapic_timer.pending, 0);
+
+ if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
+ && !set_target_expiration(apic, count_reg))
+ return;
+
+ restart_apic_timer(apic);
+}
+
+static void start_apic_timer(struct kvm_lapic *apic)
+{
+ __start_apic_timer(apic, APIC_TMICT);
+}
+
+static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
+{
+ bool lvt0_in_nmi_mode = apic_lvt_nmi_mode(lvt0_val);
+
+ if (apic->lvt0_in_nmi_mode != lvt0_in_nmi_mode) {
+ apic->lvt0_in_nmi_mode = lvt0_in_nmi_mode;
+ if (lvt0_in_nmi_mode) {
+ atomic_inc(&apic->vcpu->kvm->arch.vapics_in_nmi_mode);
+ } else
+ atomic_dec(&apic->vcpu->kvm->arch.vapics_in_nmi_mode);
+ }
+}
+
+static int get_lvt_index(u32 reg)
+{
+ if (reg == APIC_LVTCMCI)
+ return LVT_CMCI;
+ if (reg < APIC_LVTT || reg > APIC_LVTERR)
+ return -1;
+ return array_index_nospec(
+ (reg - APIC_LVTT) >> 4, KVM_APIC_MAX_NR_LVT_ENTRIES);
+}
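+
+/*
+ * E.g. APIC_LVT0 yields (APIC_LVT0 - APIC_LVTT) >> 4 == 3, i.e. LVT_LINT0;
+ * APIC_LVTCMCI sits outside the contiguous LVTT..LVTERR range and is
+ * special-cased above.
+ */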
+
+static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
+{
+ int ret = 0;
+
+ trace_kvm_apic_write(reg, val);
+
+ switch (reg) {
+ case APIC_ID: /* Local APIC ID */
+ if (!apic_x2apic_mode(apic)) {
+ kvm_apic_set_xapic_id(apic, val >> 24);
+ } else {
+ ret = 1;
+ }
+ break;
+
+ case APIC_TASKPRI:
+ report_tpr_access(apic, true);
+ apic_set_tpr(apic, val & 0xff);
+ break;
+
+ case APIC_EOI:
+ apic_set_eoi(apic);
+ break;
+
+ case APIC_LDR:
+ if (!apic_x2apic_mode(apic))
+ kvm_apic_set_ldr(apic, val & APIC_LDR_MASK);
+ else
+ ret = 1;
+ break;
+
+ case APIC_DFR:
+ if (!apic_x2apic_mode(apic))
+ kvm_apic_set_dfr(apic, val | 0x0FFFFFFF);
+ else
+ ret = 1;
+ break;
+
+ case APIC_SPIV: {
+ u32 mask = 0x3ff;
+ if (kvm_lapic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
+ mask |= APIC_SPIV_DIRECTED_EOI;
+ apic_set_spiv(apic, val & mask);
+ if (!(val & APIC_SPIV_APIC_ENABLED)) {
+ int i;
+
+ for (i = 0; i < apic->nr_lvt_entries; i++) {
+ kvm_lapic_set_reg(apic, APIC_LVTx(i),
+ kvm_lapic_get_reg(apic, APIC_LVTx(i)) | APIC_LVT_MASKED);
+ }
+ apic_update_lvtt(apic);
+ atomic_set(&apic->lapic_timer.pending, 0);
+
+ }
+ break;
+ }
+ case APIC_ICR:
+ WARN_ON_ONCE(apic_x2apic_mode(apic));
+
+ /* No delay here, so we always clear the pending bit */
+ val &= ~APIC_ICR_BUSY;
+ kvm_apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2));
+ kvm_lapic_set_reg(apic, APIC_ICR, val);
+ break;
+ case APIC_ICR2:
+ if (apic_x2apic_mode(apic))
+ ret = 1;
+ else
+ kvm_lapic_set_reg(apic, APIC_ICR2, val & 0xff000000);
+ break;
+
+ case APIC_LVT0:
+ apic_manage_nmi_watchdog(apic, val);
+ fallthrough;
+ case APIC_LVTTHMR:
+ case APIC_LVTPC:
+ case APIC_LVT1:
+ case APIC_LVTERR:
+ case APIC_LVTCMCI: {
+ u32 index = get_lvt_index(reg);
+ if (!kvm_lapic_lvt_supported(apic, index)) {
+ ret = 1;
+ break;
+ }
+ if (!kvm_apic_sw_enabled(apic))
+ val |= APIC_LVT_MASKED;
+ val &= apic_lvt_mask[index];
+ kvm_lapic_set_reg(apic, reg, val);
+ break;
+ }
+
+ case APIC_LVTT:
+ if (!kvm_apic_sw_enabled(apic))
+ val |= APIC_LVT_MASKED;
+ val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
+ kvm_lapic_set_reg(apic, APIC_LVTT, val);
+ apic_update_lvtt(apic);
+ break;
+
+ case APIC_TMICT:
+ if (apic_lvtt_tscdeadline(apic))
+ break;
+
+ cancel_apic_timer(apic);
+ kvm_lapic_set_reg(apic, APIC_TMICT, val);
+ start_apic_timer(apic);
+ break;
+
+ case APIC_TDCR: {
+ uint32_t old_divisor = apic->divide_count;
+
+ kvm_lapic_set_reg(apic, APIC_TDCR, val & 0xb);
+ update_divide_count(apic);
+ if (apic->divide_count != old_divisor &&
+ apic->lapic_timer.period) {
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ update_target_expiration(apic, old_divisor);
+ restart_apic_timer(apic);
+ }
+ break;
+ }
+ case APIC_ESR:
+ if (apic_x2apic_mode(apic) && val != 0)
+ ret = 1;
+ break;
+
+ case APIC_SELF_IPI:
+ /*
+ * Self-IPI exists only when x2APIC is enabled. Bits 7:0 hold
+ * the vector, everything else is reserved.
+ */
+ if (!apic_x2apic_mode(apic) || (val & ~APIC_VECTOR_MASK))
+ ret = 1;
+ else
+ kvm_apic_send_ipi(apic, APIC_DEST_SELF | val, 0);
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+
+ /*
+ * Recalculate APIC maps if necessary, e.g. if the software enable bit
+ * was toggled, the APIC ID changed, etc... The maps are marked dirty
+ * on relevant changes, i.e. this is a nop for most writes.
+ */
+ kvm_recalculate_apic_map(apic->vcpu->kvm);
+
+ return ret;
+}
+
+static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+ gpa_t address, int len, const void *data)
+{
+ struct kvm_lapic *apic = to_lapic(this);
+ unsigned int offset = address - apic->base_address;
+ u32 val;
+
+ if (!apic_mmio_in_range(apic, address))
+ return -EOPNOTSUPP;
+
+ if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) {
+ if (!kvm_check_has_quirk(vcpu->kvm,
+ KVM_X86_QUIRK_LAPIC_MMIO_HOLE))
+ return -EOPNOTSUPP;
+
+ return 0;
+ }
+
+	/*
+	 * APIC registers must be aligned on a 128-bit boundary, and
+	 * 32/64/128-bit registers must be accessed via 32-bit reads and
+	 * writes. Refer to SDM 8.4.1.
+	 */
+ if (len != 4 || (offset & 0xf))
+ return 0;
+
+ val = *(u32*)data;
+
+ kvm_lapic_reg_write(apic, offset & 0xff0, val);
+
+ return 0;
+}
+
+void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
+{
+ kvm_lapic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
+
+/* emulate APIC access in a trap manner */
+void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ /*
+	 * ICR is a single 64-bit register when x2APIC is enabled; all other
+	 * registers hold 32-bit values.  For legacy xAPIC, ICR writes need to
+ * go down the common path to get the upper half from ICR2.
+ *
+ * Note, using the write helpers may incur an unnecessary write to the
+ * virtual APIC state, but KVM needs to conditionally modify the value
+ * in certain cases, e.g. to clear the ICR busy bit. The cost of extra
+ * conditional branches is likely a wash relative to the cost of the
+	 * maybe-unnecessary write, and both are in the noise anyway.
+ */
+ if (apic_x2apic_mode(apic) && offset == APIC_ICR)
+ kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR));
+ else
+ kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
+}
+EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode);
+
+void kvm_free_lapic(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!vcpu->arch.apic)
+ return;
+
+ hrtimer_cancel(&apic->lapic_timer.timer);
+
+ if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE))
+ static_branch_slow_dec_deferred(&apic_hw_disabled);
+
+ if (!apic->sw_enabled)
+ static_branch_slow_dec_deferred(&apic_sw_disabled);
+
+ if (apic->regs)
+ free_page((unsigned long)apic->regs);
+
+ kfree(apic);
+}
+
+/*
+ *----------------------------------------------------------------------
+ * LAPIC interface
+ *----------------------------------------------------------------------
+ */
+u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic))
+ return 0;
+
+ return apic->lapic_timer.tscdeadline;
+}
+
+void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic))
+ return;
+
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ apic->lapic_timer.tscdeadline = data;
+ start_apic_timer(apic);
+}
+
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+ apic_set_tpr(vcpu->arch.apic, (cr8 & 0x0f) << 4);
+}
+
+u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
+{
+ u64 tpr;
+
+ tpr = (u64) kvm_lapic_get_reg(vcpu->arch.apic, APIC_TASKPRI);
+
+ return (tpr & 0xf0) >> 4;
+}
+
+void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
+{
+ u64 old_value = vcpu->arch.apic_base;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ vcpu->arch.apic_base = value;
+
+ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE)
+ kvm_update_cpuid_runtime(vcpu);
+
+ if (!apic)
+ return;
+
+ /* update jump label if enable bit changes */
+ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
+ if (value & MSR_IA32_APICBASE_ENABLE) {
+ kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+ static_branch_slow_dec_deferred(&apic_hw_disabled);
+ /* Check if there are APF page ready requests pending */
+ kvm_make_request(KVM_REQ_APF_READY, vcpu);
+ } else {
+ static_branch_inc(&apic_hw_disabled.key);
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+ }
+ }
+
+ if ((old_value ^ value) & X2APIC_ENABLE) {
+ if (value & X2APIC_ENABLE)
+ kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
+ else if (value & MSR_IA32_APICBASE_ENABLE)
+ kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+ }
+
+ if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) {
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu);
+ }
+
+ apic->base_address = apic->vcpu->arch.apic_base &
+ MSR_IA32_APICBASE_BASE;
+
+ if ((value & MSR_IA32_APICBASE_ENABLE) &&
+ apic->base_address != APIC_DEFAULT_PHYS_BASE) {
+ kvm_set_apicv_inhibit(apic->vcpu->kvm,
+ APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
+ }
+}
+
+void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (apic->apicv_active) {
+ /* irr_pending is always true when apicv is activated. */
+ apic->irr_pending = true;
+ apic->isr_count = 1;
+ } else {
+ /*
+ * Don't clear irr_pending, searching the IRR can race with
+ * updates from the CPU as APICv is still active from hardware's
+ * perspective. The flag will be cleared as appropriate when
+ * KVM injects the interrupt.
+ */
+ apic->isr_count = count_vectors(apic->regs + APIC_ISR);
+ }
+ apic->highest_isr_cache = -1;
+}
+
+int kvm_alloc_apic_access_page(struct kvm *kvm)
+{
+ struct page *page;
+ void __user *hva;
+ int ret = 0;
+
+ mutex_lock(&kvm->slots_lock);
+ if (kvm->arch.apic_access_memslot_enabled ||
+ kvm->arch.apic_access_memslot_inhibited)
+ goto out;
+
+ hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
+ if (IS_ERR(hva)) {
+ ret = PTR_ERR(hva);
+ goto out;
+ }
+
+ page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+ if (is_error_page(page)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /*
+ * Do not pin the page in memory, so that memory hot-unplug
+ * is able to migrate it.
+ */
+ put_page(page);
+ kvm->arch.apic_access_memslot_enabled = true;
+out:
+ mutex_unlock(&kvm->slots_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_alloc_apic_access_page);
+
+void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ if (!kvm->arch.apic_access_memslot_enabled)
+ return;
+
+ kvm_vcpu_srcu_read_unlock(vcpu);
+
+ mutex_lock(&kvm->slots_lock);
+
+ if (kvm->arch.apic_access_memslot_enabled) {
+ __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0);
+ /*
+ * Clear "enabled" after the memslot is deleted so that a
+ * different vCPU doesn't get a false negative when checking
+ * the flag out of slots_lock. No additional memory barrier is
+		 * needed as modifying memslots requires waiting for other vCPUs to
+ * drop SRCU (see above), and false positives are ok as the
+ * flag is rechecked after acquiring slots_lock.
+ */
+ kvm->arch.apic_access_memslot_enabled = false;
+
+ /*
+ * Mark the memslot as inhibited to prevent reallocating the
+ * memslot during vCPU creation, e.g. if a vCPU is hotplugged.
+ */
+ kvm->arch.apic_access_memslot_inhibited = true;
+ }
+
+ mutex_unlock(&kvm->slots_lock);
+
+ kvm_vcpu_srcu_read_lock(vcpu);
+}
+
+void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u64 msr_val;
+ int i;
+
+ static_call_cond(kvm_x86_apicv_pre_state_restore)(vcpu);
+
+ if (!init_event) {
+ msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
+ if (kvm_vcpu_is_reset_bsp(vcpu))
+ msr_val |= MSR_IA32_APICBASE_BSP;
+ kvm_lapic_set_base(vcpu, msr_val);
+ }
+
+ if (!apic)
+ return;
+
+ /* Stop the timer in case it's a reset to an active apic */
+ hrtimer_cancel(&apic->lapic_timer.timer);
+
+ /* The xAPIC ID is set at RESET even if the APIC was already enabled. */
+ if (!init_event)
+ kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+ kvm_apic_set_version(apic->vcpu);
+
+ for (i = 0; i < apic->nr_lvt_entries; i++)
+ kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED);
+ apic_update_lvtt(apic);
+ if (kvm_vcpu_is_reset_bsp(vcpu) &&
+ kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
+ kvm_lapic_set_reg(apic, APIC_LVT0,
+ SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
+ apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
+
+ kvm_apic_set_dfr(apic, 0xffffffffU);
+ apic_set_spiv(apic, 0xff);
+ kvm_lapic_set_reg(apic, APIC_TASKPRI, 0);
+ if (!apic_x2apic_mode(apic))
+ kvm_apic_set_ldr(apic, 0);
+ kvm_lapic_set_reg(apic, APIC_ESR, 0);
+ if (!apic_x2apic_mode(apic)) {
+ kvm_lapic_set_reg(apic, APIC_ICR, 0);
+ kvm_lapic_set_reg(apic, APIC_ICR2, 0);
+ } else {
+ kvm_lapic_set_reg64(apic, APIC_ICR, 0);
+ }
+ kvm_lapic_set_reg(apic, APIC_TDCR, 0);
+ kvm_lapic_set_reg(apic, APIC_TMICT, 0);
+ for (i = 0; i < 8; i++) {
+ kvm_lapic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
+ kvm_lapic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
+ kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
+ }
+ kvm_apic_update_apicv(vcpu);
+ update_divide_count(apic);
+ atomic_set(&apic->lapic_timer.pending, 0);
+
+ vcpu->arch.pv_eoi.msr_val = 0;
+ apic_update_ppr(apic);
+ if (apic->apicv_active) {
+ static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
+ static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1);
+ static_call_cond(kvm_x86_hwapic_isr_update)(-1);
+ }
+
+ vcpu->arch.apic_arb_prio = 0;
+ vcpu->arch.apic_attention = 0;
+
+ kvm_recalculate_apic_map(vcpu->kvm);
+}
+
+/*
+ *----------------------------------------------------------------------
+ * timer interface
+ *----------------------------------------------------------------------
+ */
+
+static bool lapic_is_periodic(struct kvm_lapic *apic)
+{
+ return apic_lvtt_period(apic);
+}
+
+int apic_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (apic_enabled(apic) && apic_lvt_enabled(apic, APIC_LVTT))
+ return atomic_read(&apic->lapic_timer.pending);
+
+ return 0;
+}
+
+int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
+{
+ u32 reg = kvm_lapic_get_reg(apic, lvt_type);
+ int vector, mode, trig_mode;
+ int r;
+
+ if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
+ vector = reg & APIC_VECTOR_MASK;
+ mode = reg & APIC_MODE_MASK;
+ trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
+
+ r = __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL);
+ if (r && lvt_type == APIC_LVTPC)
+ kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED);
+ return r;
+ }
+ return 0;
+}
+
+void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (apic)
+ kvm_apic_local_deliver(apic, APIC_LVT0);
+}
+
+static const struct kvm_io_device_ops apic_mmio_ops = {
+ .read = apic_mmio_read,
+ .write = apic_mmio_write,
+};
+
+static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
+{
+ struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+ struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer);
+
+ apic_timer_expired(apic, true);
+
+ if (lapic_is_periodic(apic)) {
+ advance_periodic_target_expiration(apic);
+ hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+ return HRTIMER_RESTART;
+ } else
+ return HRTIMER_NORESTART;
+}
+
+int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
+{
+ struct kvm_lapic *apic;
+
+ ASSERT(vcpu != NULL);
+
+ apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT);
+ if (!apic)
+ goto nomem;
+
+ vcpu->arch.apic = apic;
+
+ apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!apic->regs) {
+ printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
+ vcpu->vcpu_id);
+ goto nomem_free_apic;
+ }
+ apic->vcpu = vcpu;
+
+ apic->nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu);
+
+ hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_HARD);
+ apic->lapic_timer.timer.function = apic_timer_fn;
+ if (timer_advance_ns == -1) {
+ apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT;
+ lapic_timer_advance_dynamic = true;
+ } else {
+ apic->lapic_timer.timer_advance_ns = timer_advance_ns;
+ lapic_timer_advance_dynamic = false;
+ }
+
+ /*
+ * Stuff the APIC ENABLE bit in lieu of temporarily incrementing
+ * apic_hw_disabled; the full RESET value is set by kvm_lapic_reset().
+ */
+ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
+ static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */
+ kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
+
+ return 0;
+nomem_free_apic:
+ kfree(apic);
+ vcpu->arch.apic = NULL;
+nomem:
+ return -ENOMEM;
+}
+
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 ppr;
+
+ if (!kvm_apic_present(vcpu))
+ return -1;
+
+ __apic_update_ppr(apic, &ppr);
+ return apic_has_interrupt_for_ppr(apic, ppr);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_has_interrupt);
+
+int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
+{
+ u32 lvt0 = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVT0);
+
+ if (!kvm_apic_hw_enabled(vcpu->arch.apic))
+ return 1;
+ if ((lvt0 & APIC_LVT_MASKED) == 0 &&
+ GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
+ return 1;
+ return 0;
+}
+
+void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (atomic_read(&apic->lapic_timer.pending) > 0) {
+ kvm_apic_inject_pending_timer_irqs(apic);
+ atomic_set(&apic->lapic_timer.pending, 0);
+ }
+}
+
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
+{
+ int vector = kvm_apic_has_interrupt(vcpu);
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 ppr;
+
+ if (vector == -1)
+ return -1;
+
+ /*
+ * We get here even with APIC virtualization enabled, if doing
+ * nested virtualization and L1 runs with the "acknowledge interrupt
+ * on exit" mode. Then we cannot inject the interrupt via RVI,
+ * because the process would deliver it through the IDT.
+ */
+
+ apic_clear_irr(vector, apic);
+ if (to_hv_vcpu(vcpu) && test_bit(vector, to_hv_synic(vcpu)->auto_eoi_bitmap)) {
+ /*
+ * For auto-EOI interrupts, there might be another pending
+ * interrupt above PPR, so check whether to raise another
+ * KVM_REQ_EVENT.
+ */
+ apic_update_ppr(apic);
+ } else {
+ /*
+ * For normal interrupts, PPR has been raised and there cannot
+ * be a higher-priority pending interrupt---except if there was
+ * a concurrent interrupt injection, but that would have
+ * triggered KVM_REQ_EVENT already.
+ */
+ apic_set_isr(vector, apic);
+ __apic_update_ppr(apic, &ppr);
+ }
+
+ return vector;
+}
+
+static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s, bool set)
+{
+ if (apic_x2apic_mode(vcpu->arch.apic)) {
+ u32 *id = (u32 *)(s->regs + APIC_ID);
+ u32 *ldr = (u32 *)(s->regs + APIC_LDR);
+ u64 icr;
+
+ if (vcpu->kvm->arch.x2apic_format) {
+ if (*id != vcpu->vcpu_id)
+ return -EINVAL;
+ } else {
+ if (set)
+ *id >>= 24;
+ else
+ *id <<= 24;
+ }
+
+ /*
+		 * In x2APIC mode, the LDR is fixed and derived from the id, and
+		 * ICR is internally a single 64-bit register, but needs to be
+		 * split into ICR+ICR2 in userspace for backwards compatibility.
+ */
+ if (set) {
+ *ldr = kvm_apic_calc_x2apic_ldr(*id);
+
+ icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
+ (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
+ __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
+ } else {
+ icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
+ __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
+ }
+ }
+
+ return 0;
+}
+
+int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
+{
+ memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s));
+
+ /*
+ * Get calculated timer current count for remaining timer period (if
+ * any) and store it in the returned register set.
+ */
+ __kvm_lapic_set_reg(s->regs, APIC_TMCCT,
+ __apic_read(vcpu->arch.apic, APIC_TMCCT));
+
+ return kvm_apic_state_fixup(vcpu, s, false);
+}
+
+int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ int r;
+
+ static_call_cond(kvm_x86_apicv_pre_state_restore)(vcpu);
+
+ kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
+ /* set SPIV separately to get count of SW disabled APICs right */
+ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
+
+ r = kvm_apic_state_fixup(vcpu, s, true);
+ if (r) {
+ kvm_recalculate_apic_map(vcpu->kvm);
+ return r;
+ }
+ memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s));
+
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+ kvm_recalculate_apic_map(vcpu->kvm);
+ kvm_apic_set_version(vcpu);
+
+ apic_update_ppr(apic);
+ cancel_apic_timer(apic);
+ apic->lapic_timer.expired_tscdeadline = 0;
+ apic_update_lvtt(apic);
+ apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
+ update_divide_count(apic);
+ __start_apic_timer(apic, APIC_TMCCT);
+ kvm_lapic_set_reg(apic, APIC_TMCCT, 0);
+ kvm_apic_update_apicv(vcpu);
+ if (apic->apicv_active) {
+ static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
+ static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
+ static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic));
+ }
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ if (ioapic_in_kernel(vcpu->kvm))
+ kvm_rtc_eoi_tracking_restore_one(vcpu);
+
+ vcpu->arch.apic_arb_prio = 0;
+
+ return 0;
+}
+
+void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
+{
+ struct hrtimer *timer;
+
+ if (!lapic_in_kernel(vcpu) ||
+ kvm_can_post_timer_interrupt(vcpu))
+ return;
+
+ timer = &vcpu->arch.apic->lapic_timer.timer;
+ if (hrtimer_cancel(timer))
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_HARD);
+}
+
+/*
+ * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt
+ *
+ * Detect whether the guest triggered PV EOI since the last entry.
+ * If so, perform the EOI on the guest's behalf. Clear PV EOI in
+ * guest memory in any case.
+ */
+static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu,
+ struct kvm_lapic *apic)
+{
+ int vector;
+ /*
+ * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host
+ * and KVM_PV_EOI_ENABLED in guest memory as follows:
+ *
+ * KVM_APIC_PV_EOI_PENDING is unset:
+ * -> host disabled PV EOI.
+ * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set:
+ * -> host enabled PV EOI, guest did not execute EOI yet.
+ * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset:
+ * -> host enabled PV EOI, guest executed EOI.
+ */
+ BUG_ON(!pv_eoi_enabled(vcpu));
+
+ if (pv_eoi_test_and_clr_pending(vcpu))
+ return;
+ vector = apic_set_eoi(apic);
+ trace_kvm_pv_eoi(apic, vector);
+}
+
+void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
+{
+ u32 data;
+
+ if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention))
+ apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic);
+
+ if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
+ return;
+
+ if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
+ sizeof(u32)))
+ return;
+
+ apic_set_tpr(vcpu->arch.apic, data & 0xff);
+}
+
+/*
+ * apic_sync_pv_eoi_to_guest - called before vmentry
+ *
+ * Detect whether it's safe to enable PV EOI and
+ * if yes do so.
+ */
+static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
+ struct kvm_lapic *apic)
+{
+ if (!pv_eoi_enabled(vcpu) ||
+ /* IRR set or many bits in ISR: could be nested. */
+ apic->irr_pending ||
+ /* Cache not set: could be safe but we don't bother. */
+ apic->highest_isr_cache == -1 ||
+ /* Need EOI to update ioapic. */
+ kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) {
+ /*
+ * PV EOI was disabled by apic_sync_pv_eoi_from_guest
+ * so we need not do anything here.
+ */
+ return;
+ }
+
+ pv_eoi_set_pending(apic->vcpu);
+}
+
+void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
+{
+ u32 data, tpr;
+ int max_irr, max_isr;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ apic_sync_pv_eoi_to_guest(vcpu, apic);
+
+ if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
+ return;
+
+ tpr = kvm_lapic_get_reg(apic, APIC_TASKPRI) & 0xff;
+ max_irr = apic_find_highest_irr(apic);
+ if (max_irr < 0)
+ max_irr = 0;
+ max_isr = apic_find_highest_isr(apic);
+ if (max_isr < 0)
+ max_isr = 0;
+ data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
+
+ kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
+ sizeof(u32));
+}
+
+int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
+{
+ if (vapic_addr) {
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ &vcpu->arch.apic->vapic_cache,
+ vapic_addr, sizeof(u32)))
+ return -EINVAL;
+ __set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
+ } else {
+ __clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
+ }
+
+ vcpu->arch.apic->vapic_addr = vapic_addr;
+ return 0;
+}
+
+int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
+{
+ data &= ~APIC_ICR_BUSY;
+
+ kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
+ kvm_lapic_set_reg64(apic, APIC_ICR, data);
+ trace_kvm_apic_write(APIC_ICR, data);
+ return 0;
+}
+
+static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data)
+{
+ u32 low;
+
+ if (reg == APIC_ICR) {
+ *data = kvm_lapic_get_reg64(apic, APIC_ICR);
+ return 0;
+ }
+
+ if (kvm_lapic_reg_read(apic, reg, 4, &low))
+ return 1;
+
+ *data = low;
+
+ return 0;
+}
+
+static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data)
+{
+ /*
+ * ICR is a 64-bit register in x2APIC mode (and Hyper-V PV vAPIC) and
+ * can be written as such, all other registers remain accessible only
+ * through 32-bit reads/writes.
+ */
+ if (reg == APIC_ICR)
+ return kvm_x2apic_icr_write(apic, data);
+
+ /* Bits 63:32 are reserved in all other registers. */
+ if (data >> 32)
+ return 1;
+
+ return kvm_lapic_reg_write(apic, reg, (u32)data);
+}
+
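+/*
+ * x2APIC MSRs map 1:1 onto the xAPIC MMIO layout via
+ * reg = (msr - APIC_BASE_MSR) << 4, e.g. MSR 0x830 resolves to offset
+ * 0x300, i.e. APIC_ICR.
+ */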
+int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = (msr - APIC_BASE_MSR) << 4;
+
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
+ return 1;
+
+ return kvm_lapic_msr_write(apic, reg, data);
+}
+
+int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = (msr - APIC_BASE_MSR) << 4;
+
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
+ return 1;
+
+ return kvm_lapic_msr_read(apic, reg, data);
+}
+
+int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
+{
+ if (!lapic_in_kernel(vcpu))
+ return 1;
+
+ return kvm_lapic_msr_write(vcpu->arch.apic, reg, data);
+}
+
+int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
+{
+ if (!lapic_in_kernel(vcpu))
+ return 1;
+
+ return kvm_lapic_msr_read(vcpu->arch.apic, reg, data);
+}
+
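+/*
+ * The PV EOI MSR value carries an enable flag (KVM_MSR_ENABLED) plus the
+ * 4-byte-aligned GPA of the guest's EOI flag word, hence the alignment
+ * check below.
+ */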
+int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len)
+{
+ u64 addr = data & ~KVM_MSR_ENABLED;
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data;
+ unsigned long new_len;
+ int ret;
+
+ if (!IS_ALIGNED(addr, 4))
+ return 1;
+
+ if (data & KVM_MSR_ENABLED) {
+ if (addr == ghc->gpa && len <= ghc->len)
+ new_len = ghc->len;
+ else
+ new_len = len;
+
+ ret = kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len);
+ if (ret)
+ return ret;
+ }
+
+ vcpu->arch.pv_eoi.msr_val = data;
+
+ return 0;
+}
+
+int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u8 sipi_vector;
+ int r;
+
+ if (!kvm_apic_has_pending_init_or_sipi(vcpu))
+ return 0;
+
+ if (is_guest_mode(vcpu)) {
+ r = kvm_check_nested_events(vcpu);
+ if (r < 0)
+ return r == -EBUSY ? 0 : r;
+ /*
+ * Continue processing INIT/SIPI even if a nested VM-Exit
+ * occurred, e.g. pending SIPIs should be dropped if INIT+SIPI
+ * are blocked as a result of transitioning to VMX root mode.
+ */
+ }
+
+ /*
+ * INITs are blocked while CPU is in specific states (SMM, VMX root
+ * mode, SVM with GIF=0), while SIPIs are dropped if the CPU isn't in
+ * wait-for-SIPI (WFS).
+ */
+ if (!kvm_apic_init_sipi_allowed(vcpu)) {
+ WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED);
+ clear_bit(KVM_APIC_SIPI, &apic->pending_events);
+ return 0;
+ }
+
+ if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) {
+ kvm_vcpu_reset(vcpu, true);
+ if (kvm_vcpu_is_bsp(apic->vcpu))
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ else
+ vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+ }
+ if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events)) {
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+ /* evaluate pending_events before reading the vector */
+ smp_rmb();
+ sipi_vector = apic->sipi_vector;
+ static_call(kvm_x86_vcpu_deliver_sipi_vector)(vcpu, sipi_vector);
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ }
+ }
+ return 0;
+}
+
+void kvm_lapic_exit(void)
+{
+ static_key_deferred_flush(&apic_hw_disabled);
+ WARN_ON(static_branch_unlikely(&apic_hw_disabled.key));
+ static_key_deferred_flush(&apic_sw_disabled);
+ WARN_ON(static_branch_unlikely(&apic_sw_disabled.key));
+}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
new file mode 100644
index 000000000..0a0ea4b5d
--- /dev/null
+++ b/arch/x86/kvm/lapic.h
@@ -0,0 +1,281 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_LAPIC_H
+#define __KVM_X86_LAPIC_H
+
+#include <kvm/iodev.h>
+
+#include <linux/kvm_host.h>
+
+#include "hyperv.h"
+#include "smm.h"
+
+#define KVM_APIC_INIT 0
+#define KVM_APIC_SIPI 1
+
+#define APIC_SHORT_MASK 0xc0000
+#define APIC_DEST_NOSHORT 0x0
+#define APIC_DEST_MASK 0x800
+
+#define APIC_BUS_CYCLE_NS 1
+#define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS)
+
+#define APIC_BROADCAST 0xFF
+#define X2APIC_BROADCAST 0xFFFFFFFFul
+
+enum lapic_mode {
+ LAPIC_MODE_DISABLED = 0,
+ LAPIC_MODE_INVALID = X2APIC_ENABLE,
+ LAPIC_MODE_XAPIC = MSR_IA32_APICBASE_ENABLE,
+ LAPIC_MODE_X2APIC = MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE,
+};
+
+enum lapic_lvt_entry {
+ LVT_TIMER,
+ LVT_THERMAL_MONITOR,
+ LVT_PERFORMANCE_COUNTER,
+ LVT_LINT0,
+ LVT_LINT1,
+ LVT_ERROR,
+ LVT_CMCI,
+
+ KVM_APIC_MAX_NR_LVT_ENTRIES,
+};
+
+#define APIC_LVTx(x) ((x) == LVT_CMCI ? APIC_LVTCMCI : APIC_LVTT + 0x10 * (x))
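+/* E.g. APIC_LVTx(LVT_TIMER) == APIC_LVTT and APIC_LVTx(LVT_LINT0) == APIC_LVT0. */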
+
+struct kvm_timer {
+ struct hrtimer timer;
+ s64 period; /* unit: ns */
+ ktime_t target_expiration;
+ u32 timer_mode;
+ u32 timer_mode_mask;
+ u64 tscdeadline;
+ u64 expired_tscdeadline;
+ u32 timer_advance_ns;
+ atomic_t pending; /* accumulated triggered timers */
+ bool hv_timer_in_use;
+};
+
+struct kvm_lapic {
+ unsigned long base_address;
+ struct kvm_io_device dev;
+ struct kvm_timer lapic_timer;
+ u32 divide_count;
+ struct kvm_vcpu *vcpu;
+ bool apicv_active;
+ bool sw_enabled;
+ bool irr_pending;
+ bool lvt0_in_nmi_mode;
+ /* Number of bits set in ISR. */
+ s16 isr_count;
+	/* The highest vector set in ISR; -1 means invalid, must scan ISR. */
+ int highest_isr_cache;
+ /**
+ * APIC register page. The layout matches the register layout seen by
+ * the guest 1:1, because it is accessed by the vmx microcode.
+ * Note: Only one register, the TPR, is used by the microcode.
+ */
+ void *regs;
+ gpa_t vapic_addr;
+ struct gfn_to_hva_cache vapic_cache;
+ unsigned long pending_events;
+ unsigned int sipi_vector;
+ int nr_lvt_entries;
+};
+
+struct dest_map;
+
+int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns);
+void kvm_free_lapic(struct kvm_vcpu *vcpu);
+
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
+int kvm_apic_accept_events(struct kvm_vcpu *vcpu);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event);
+u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
+void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
+void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
+u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
+void kvm_recalculate_apic_map(struct kvm *kvm);
+void kvm_apic_set_version(struct kvm_vcpu *vcpu);
+void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu);
+bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+ int shorthand, unsigned int dest, int dest_mode);
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
+void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec);
+bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr);
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr);
+void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
+ struct dest_map *dest_map);
+int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
+void kvm_apic_update_apicv(struct kvm_vcpu *vcpu);
+int kvm_alloc_apic_access_page(struct kvm *kvm);
+void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu);
+
+bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map);
+void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high);
+
+u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
+int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu);
+int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
+
+u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
+void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
+
+void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset);
+void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector);
+
+int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
+void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
+void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
+
+int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data);
+int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+
+int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+
+int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len);
+void kvm_lapic_exit(void);
+
+u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic);
+
+#define VEC_POS(v) ((v) & (32 - 1))
+#define REG_POS(v) (((v) >> 5) << 4)
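+/* E.g. vector 49: REG_POS == 0x10 (the second 32-bit word), VEC_POS == 17. */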
+
+static inline void kvm_lapic_clear_vector(int vec, void *bitmap)
+{
+ clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline void kvm_lapic_set_vector(int vec, void *bitmap)
+{
+ set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic)
+{
+ kvm_lapic_set_vector(vec, apic->regs + APIC_IRR);
+ /*
+ * irr_pending must be true if any interrupt is pending; set it after
+	 * APIC_IRR to avoid a race with apic_clear_irr()
+ */
+ apic->irr_pending = true;
+}
+
+static inline u32 __kvm_lapic_get_reg(char *regs, int reg_off)
+{
+ return *((u32 *) (regs + reg_off));
+}
+
+static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off)
+{
+ return __kvm_lapic_get_reg(apic->regs, reg_off);
+}
+
+DECLARE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
+
+static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu)
+{
+ if (static_branch_unlikely(&kvm_has_noapic_vcpu))
+ return vcpu->arch.apic;
+ return true;
+}
+
+extern struct static_key_false_deferred apic_hw_disabled;
+
+static inline bool kvm_apic_hw_enabled(struct kvm_lapic *apic)
+{
+ if (static_branch_unlikely(&apic_hw_disabled.key))
+ return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
+ return true;
+}
+
+extern struct static_key_false_deferred apic_sw_disabled;
+
+static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic)
+{
+ if (static_branch_unlikely(&apic_sw_disabled.key))
+ return apic->sw_enabled;
+ return true;
+}
+
+static inline bool kvm_apic_present(struct kvm_vcpu *vcpu)
+{
+ return lapic_in_kernel(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic);
+}
+
+static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+{
+ return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
+}
+
+static inline int apic_x2apic_mode(struct kvm_lapic *apic)
+{
+ return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
+}
+
+static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu)
+{
+ return lapic_in_kernel(vcpu) && vcpu->arch.apic->apicv_active;
+}
+
+static inline bool kvm_apic_has_pending_init_or_sipi(struct kvm_vcpu *vcpu)
+{
+ return lapic_in_kernel(vcpu) && vcpu->arch.apic->pending_events;
+}
+
+static inline bool kvm_apic_init_sipi_allowed(struct kvm_vcpu *vcpu)
+{
+ return !is_smm(vcpu) &&
+ !static_call(kvm_x86_apic_init_signal_blocked)(vcpu);
+}
+
+static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
+{
+ return (irq->delivery_mode == APIC_DM_LOWEST ||
+ irq->msi_redir_hint);
+}
+
+static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
+{
+ return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+}
+
+bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
+
+void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu);
+
+void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ unsigned long *vcpu_bitmap);
+
+bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu);
+int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
+ const unsigned long *bitmap, u32 bitmap_size);
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
+void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu);
+bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu);
+
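+/*
+ * The hardware enable and x2apic enable bits together select the mode:
+ * neither set is DISABLED, MSR_IA32_APICBASE_ENABLE alone is XAPIC, and
+ * both set is X2APIC.
+ */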
+static inline enum lapic_mode kvm_apic_mode(u64 apic_base)
+{
+ return apic_base & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
+}
+
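+/* In xAPIC mode the 8-bit APIC ID occupies bits 31:24 of the APIC_ID register. */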
+static inline u8 kvm_xapic_id(struct kvm_lapic *apic)
+{
+ return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
+}
+
+#endif
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
new file mode 100644
index 000000000..253fb2093
--- /dev/null
+++ b/arch/x86/kvm/mmu.h
@@ -0,0 +1,307 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_MMU_H
+#define __KVM_X86_MMU_H
+
+#include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
+#include "cpuid.h"
+
+extern bool __read_mostly enable_mmio_caching;
+
+#define PT_WRITABLE_SHIFT 1
+#define PT_USER_SHIFT 2
+
+#define PT_PRESENT_MASK (1ULL << 0)
+#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
+#define PT_USER_MASK (1ULL << PT_USER_SHIFT)
+#define PT_PWT_MASK (1ULL << 3)
+#define PT_PCD_MASK (1ULL << 4)
+#define PT_ACCESSED_SHIFT 5
+#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
+#define PT_DIRTY_SHIFT 6
+#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT)
+#define PT_PAGE_SIZE_SHIFT 7
+#define PT_PAGE_SIZE_MASK (1ULL << PT_PAGE_SIZE_SHIFT)
+#define PT_PAT_MASK (1ULL << 7)
+#define PT_GLOBAL_MASK (1ULL << 8)
+#define PT64_NX_SHIFT 63
+#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
+
+#define PT_PAT_SHIFT 7
+#define PT_DIR_PAT_SHIFT 12
+#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
+
+#define PT64_ROOT_5LEVEL 5
+#define PT64_ROOT_4LEVEL 4
+#define PT32_ROOT_LEVEL 2
+#define PT32E_ROOT_LEVEL 3
+
+#define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PSE | X86_CR4_PAE | X86_CR4_LA57 | \
+ X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE)
+
+#define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP)
+#define KVM_MMU_EFER_ROLE_BITS (EFER_LME | EFER_NX)
+
+static __always_inline u64 rsvd_bits(int s, int e)
+{
+ BUILD_BUG_ON(__builtin_constant_p(e) && __builtin_constant_p(s) && e < s);
+
+ if (__builtin_constant_p(e))
+ BUILD_BUG_ON(e > 63);
+ else
+ e &= 63;
+
+ if (e < s)
+ return 0;
+
+ return ((2ULL << (e - s)) - 1) << s;
+}
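+/* e.g. rsvd_bits(52, 62) returns a mask with bits 62:52 set. */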
+
+/*
+ * The number of non-reserved physical address bits irrespective of features
+ * that repurpose legal bits, e.g. MKTME.
+ */
+extern u8 __read_mostly shadow_phys_bits;
+
+static inline gfn_t kvm_mmu_max_gfn(void)
+{
+ /*
+ * Note that this uses the host MAXPHYADDR, not the guest's.
+ * EPT/NPT cannot support GPAs that would exceed host.MAXPHYADDR;
+ * assuming KVM is running on bare metal, guest accesses beyond
+ * host.MAXPHYADDR will hit a #PF(RSVD) and never cause a vmexit
+ * (either EPT Violation/Misconfig or #NPF), and so KVM will never
+ * install a SPTE for such addresses. If KVM is running as a VM
+ * itself, on the other hand, it might see a MAXPHYADDR that is less
+ * than hardware's real MAXPHYADDR. Using the host MAXPHYADDR
+ * disallows such SPTEs entirely and simplifies the TDP MMU.
+ */
+ int max_gpa_bits = likely(tdp_enabled) ? shadow_phys_bits : 52;
+
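+	/* e.g. a 52-bit MAXPHYADDR with 4KiB pages yields a max gfn of 2^40 - 1. */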
+ return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1;
+}
+
+static inline u8 kvm_get_shadow_phys_bits(void)
+{
+ /*
+ * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
+ * in CPU detection code, but the processor treats those reduced bits as
+ * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
+ * the physical address bits reported by CPUID.
+ */
+ if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
+ return cpuid_eax(0x80000008) & 0xff;
+
+ /*
+ * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
+ * custom CPUID. Proceed with whatever the kernel found since these features
+ * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
+ */
+ return boot_cpu_data.x86_phys_bits;
+}
+
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
+void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask);
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
+
+void kvm_init_mmu(struct kvm_vcpu *vcpu);
+void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
+ unsigned long cr4, u64 efer, gpa_t nested_cr3);
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
+ int huge_page_level, bool accessed_dirty,
+ gpa_t new_eptp);
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
+int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
+ u64 fault_address, char *insn, int insn_len);
+void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu);
+
+int kvm_mmu_load(struct kvm_vcpu *vcpu);
+void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+ int bytes);
+
+static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
+{
+ if (likely(vcpu->arch.mmu->root.hpa != INVALID_PAGE))
+ return 0;
+
+ return kvm_mmu_load(vcpu);
+}
+
+static inline unsigned long kvm_get_pcid(struct kvm_vcpu *vcpu, gpa_t cr3)
+{
+ BUILD_BUG_ON((X86_CR3_PCID_MASK & PAGE_MASK) != 0);
+
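+	/* With CR4.PCIDE=1, the PCID is CR3 bits 11:0 (X86_CR3_PCID_MASK). */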
+ return kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)
+ ? cr3 & X86_CR3_PCID_MASK
+ : 0;
+}
+
+static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu)
+{
+ return kvm_get_pcid(vcpu, kvm_read_cr3(vcpu));
+}
+
+static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
+{
+ u64 root_hpa = vcpu->arch.mmu->root.hpa;
+
+ if (!VALID_PAGE(root_hpa))
+ return;
+
+ static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa,
+ vcpu->arch.mmu->root_role.level);
+}
+
+static inline void kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
+{
+ /*
+ * When EPT is enabled, KVM may passthrough CR0.WP to the guest, i.e.
+ * @mmu's snapshot of CR0.WP and thus all related paging metadata may
+ * be stale. Refresh CR0.WP and the metadata on-demand when checking
+ * for permission faults. Exempt nested MMUs, i.e. MMUs for shadowing
+ * nEPT and nNPT, as CR0.WP is ignored in both cases. Note, KVM does
+ * need to refresh nested_mmu, a.k.a. the walker used to translate L2
+ * GVAs to GPAs, as that "MMU" needs to honor L2's CR0.WP.
+ */
+ if (!tdp_enabled || mmu == &vcpu->arch.guest_mmu)
+ return;
+
+ __kvm_mmu_refresh_passthrough_bits(vcpu, mmu);
+}
+
+/*
+ * Check if a given access (described through the I/D, W/R and U/S bits of a
+ * page fault error code pfec) causes a permission fault with the given PTE
+ * access rights (in ACC_* format).
+ *
+ * Return zero if the access does not fault; return the page fault error code
+ * if the access faults.
+ */
+static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ unsigned pte_access, unsigned pte_pkey,
+ u64 access)
+{
+ /* strip nested paging fault error codes */
+ unsigned int pfec = access;
+ unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
+
+ /*
+ * For explicit supervisor accesses, SMAP is disabled if EFLAGS.AC = 1.
+ * For implicit supervisor accesses, SMAP cannot be overridden.
+ *
+	 * SMAP applies only to supervisor accesses; for user accesses,
+	 * not_smap may be set or clear, with neither having any bearing
+	 * on the result.
+ *
+ * We put the SMAP checking bit in place of the PFERR_RSVD_MASK bit;
+ * this bit will always be zero in pfec, but it will be one in index
+ * if SMAP checks are being disabled.
+ */
+ u64 implicit_access = access & PFERR_IMPLICIT_ACCESS;
+ bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC;
+ int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1;
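+	/*
+	 * e.g. a supervisor write (pfec = PFERR_WRITE_MASK) yields index 1
+	 * when SMAP is enforced, and index 5 when EFLAGS.AC disables SMAP
+	 * (not_smap shifts in via the reused PFERR_RSVD bit position).
+	 */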
+ u32 errcode = PFERR_PRESENT_MASK;
+ bool fault;
+
+ kvm_mmu_refresh_passthrough_bits(vcpu, mmu);
+
+ fault = (mmu->permissions[index] >> pte_access) & 1;
+
+ WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK));
+ if (unlikely(mmu->pkru_mask)) {
+ u32 pkru_bits, offset;
+
+ /*
+ * PKRU defines 32 bits, there are 16 domains and 2
+		 * attribute bits per domain in pkru.  pte_pkey is the
+		 * index of the protection domain, so pte_pkey * 2 is
+		 * the index of the first bit for the domain.
+ */
+ pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3;
+
+ /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
+ offset = (pfec & ~1) +
+ ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));
+
+ pkru_bits &= mmu->pkru_mask >> offset;
+ errcode |= -pkru_bits & PFERR_PK_MASK;
+ fault |= (pkru_bits != 0);
+ }
+
+ return -(u32)fault & errcode;
+}
+
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
+
+int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
+
+int kvm_mmu_post_init_vm(struct kvm *kvm);
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
+
+static inline bool kvm_shadow_root_allocated(struct kvm *kvm)
+{
+ /*
+ * Read shadow_root_allocated before related pointers. Hence, threads
+ * reading shadow_root_allocated in any lock context are guaranteed to
+ * see the pointers. Pairs with smp_store_release in
+ * mmu_first_shadow_root_alloc.
+ */
+ return smp_load_acquire(&kvm->arch.shadow_root_allocated);
+}
+
+#ifdef CONFIG_X86_64
+extern bool tdp_mmu_enabled;
+#else
+#define tdp_mmu_enabled false
+#endif
+
+static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
+{
+ return !tdp_mmu_enabled || kvm_shadow_root_allocated(kvm);
+}
+
+static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
+{
+ /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. */
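+	/* e.g. KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) is 9: 512 4KiB pages per 2MiB page. */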
+ return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+ (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
+}
+
+static inline unsigned long
+__kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, unsigned long npages,
+ int level)
+{
+ return gfn_to_index(slot->base_gfn + npages - 1,
+ slot->base_gfn, level) + 1;
+}
+
+static inline unsigned long
+kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, int level)
+{
+ return __kvm_mmu_slot_lpages(slot, slot->npages, level);
+}
+
+static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count)
+{
+ atomic64_add(count, &kvm->stat.pages[level - 1]);
+}
+
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access,
+ struct x86_exception *exception);
+
+static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu,
+ gpa_t gpa, u64 access,
+ struct x86_exception *exception)
+{
+ if (mmu != &vcpu->arch.nested_mmu)
+ return gpa;
+ return translate_nested_gpa(vcpu, gpa, access, exception);
+}
+#endif
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
new file mode 100644
index 000000000..f7901cb4d
--- /dev/null
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -0,0 +1,7161 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "irq.h"
+#include "ioapic.h"
+#include "mmu.h"
+#include "mmu_internal.h"
+#include "tdp_mmu.h"
+#include "x86.h"
+#include "kvm_cache_regs.h"
+#include "smm.h"
+#include "kvm_emulate.h"
+#include "page_track.h"
+#include "cpuid.h"
+#include "spte.h"
+
+#include <linux/kvm_host.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/moduleparam.h>
+#include <linux/export.h>
+#include <linux/swap.h>
+#include <linux/hugetlb.h>
+#include <linux/compiler.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/sched/signal.h>
+#include <linux/uaccess.h>
+#include <linux/hash.h>
+#include <linux/kern_levels.h>
+#include <linux/kstrtox.h>
+#include <linux/kthread.h>
+
+#include <asm/page.h>
+#include <asm/memtype.h>
+#include <asm/cmpxchg.h>
+#include <asm/io.h>
+#include <asm/set_memory.h>
+#include <asm/vmx.h>
+
+#include "trace.h"
+
+extern bool itlb_multihit_kvm_mitigation;
+
+static bool nx_hugepage_mitigation_hard_disabled;
+
+int __read_mostly nx_huge_pages = -1;
+static uint __read_mostly nx_huge_pages_recovery_period_ms;
+#ifdef CONFIG_PREEMPT_RT
+/* Recovery can cause latency spikes, disable it for PREEMPT_RT. */
+static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
+#else
+static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
+#endif
+
+static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp);
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
+static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);
+
+static const struct kernel_param_ops nx_huge_pages_ops = {
+ .set = set_nx_huge_pages,
+ .get = get_nx_huge_pages,
+};
+
+static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
+ .set = set_nx_huge_pages_recovery_param,
+ .get = param_get_uint,
+};
+
+module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages, "bool");
+module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops,
+ &nx_huge_pages_recovery_ratio, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
+module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops,
+ &nx_huge_pages_recovery_period_ms, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint");
+
+static bool __read_mostly force_flush_and_sync_on_reuse;
+module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
+
+/*
+ * When this variable is set to true, it enables Two-Dimensional Paging,
+ * where the hardware walks two page tables:
+ * 1. the guest-virtual to guest-physical translation
+ * 2. while doing 1, the guest-physical to host-physical translation
+ * If the hardware supports this, shadow paging is not needed.
+ */
+bool tdp_enabled = false;
+
+static bool __ro_after_init tdp_mmu_allowed;
+
+#ifdef CONFIG_X86_64
+bool __read_mostly tdp_mmu_enabled = true;
+module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444);
+#endif
+
+static int max_huge_page_level __read_mostly;
+static int tdp_root_level __read_mostly;
+static int max_tdp_level __read_mostly;
+
+#define PTE_PREFETCH_NUM 8
+
+#include <trace/events/kvm.h>
+
+/* make pte_list_desc fit well in cache lines */
+#define PTE_LIST_EXT 14
+
+/*
+ * struct pte_list_desc is the core data structure used to implement a custom
+ * list for tracking a set of related SPTEs, e.g. all the SPTEs that map a
+ * given GFN when used in the context of rmaps. Using a custom list allows KVM
+ * to optimize for the common case where many GFNs will have at most a handful
+ * of SPTEs pointing at them, i.e. allows packing multiple SPTEs into a small
+ * memory footprint, which in turn improves runtime performance by exploiting
+ * cache locality.
+ *
+ * A list is comprised of one or more pte_list_desc objects (descriptors).
+ * Each individual descriptor stores up to PTE_LIST_EXT SPTEs. If a descriptor
+ * is full and a new SPTE needs to be added, a new descriptor is allocated and
+ * becomes the head of the list.  This means that, by definition, all tail
+ * descriptors are full.
+ *
+ * Note, the metadata fields are deliberately placed at the start of the
+ * structure to optimize the cacheline layout; accessing the descriptor will
+ * touch only a single cacheline so long as @spte_count <= 6 (or if only the
+ * descriptor's metadata is accessed).
+ */
+struct pte_list_desc {
+ struct pte_list_desc *more;
+ /* The number of PTEs stored in _this_ descriptor. */
+ u32 spte_count;
+ /* The number of PTEs stored in all tails of this descriptor. */
+ u32 tail_count;
+ u64 *sptes[PTE_LIST_EXT];
+};
+
+struct kvm_shadow_walk_iterator {
+ u64 addr;
+ hpa_t shadow_addr;
+ u64 *sptep;
+ int level;
+ unsigned index;
+};
+
+#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker) \
+ for (shadow_walk_init_using_root(&(_walker), (_vcpu), \
+ (_root), (_addr)); \
+ shadow_walk_okay(&(_walker)); \
+ shadow_walk_next(&(_walker)))
+
+#define for_each_shadow_entry(_vcpu, _addr, _walker) \
+ for (shadow_walk_init(&(_walker), _vcpu, _addr); \
+ shadow_walk_okay(&(_walker)); \
+ shadow_walk_next(&(_walker)))
+
+#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \
+ for (shadow_walk_init(&(_walker), _vcpu, _addr); \
+ shadow_walk_okay(&(_walker)) && \
+ ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \
+ __shadow_walk_next(&(_walker), spte))
+
+static struct kmem_cache *pte_list_desc_cache;
+struct kmem_cache *mmu_page_header_cache;
+static struct percpu_counter kvm_total_used_mmu_pages;
+
+static void mmu_spte_set(u64 *sptep, u64 spte);
+
+struct kvm_mmu_role_regs {
+ const unsigned long cr0;
+ const unsigned long cr4;
+ const u64 efer;
+};
+
+#define CREATE_TRACE_POINTS
+#include "mmutrace.h"
+
+/*
+ * Yes, lots of underscores.  They're a hint that you probably shouldn't be
+ * reading from the role_regs. Once the root_role is constructed, it becomes
+ * the single source of truth for the MMU's state.
+ */
+#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \
+static inline bool __maybe_unused \
+____is_##reg##_##name(const struct kvm_mmu_role_regs *regs) \
+{ \
+ return !!(regs->reg & flag); \
+}
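+/* e.g. the (cr0, pg) invocation below defines ____is_cr0_pg(regs). */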
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
+BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
+BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
+
+/*
+ * The MMU itself (with a valid role) is the single source of truth for the
+ * MMU. Do not use the regs used to build the MMU/role, nor the vCPU. The
+ * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
+ * and the vCPU may be incorrect/irrelevant.
+ */
+#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \
+static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \
+{ \
+ return !!(mmu->cpu_role. base_or_ext . reg##_##name); \
+}
+BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57);
+BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
+BUILD_MMU_ROLE_ACCESSOR(ext, efer, lma);
+
+static inline bool is_cr0_pg(struct kvm_mmu *mmu)
+{
+ return mmu->cpu_role.base.level > 0;
+}
+
+static inline bool is_cr4_pae(struct kvm_mmu *mmu)
+{
+ return !mmu->cpu_role.base.has_4_byte_gpte;
+}
+
+static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_role_regs regs = {
+ .cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
+ .cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
+ .efer = vcpu->arch.efer,
+ };
+
+ return regs;
+}
+
+static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr3(vcpu);
+}
+
+static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
+{
+ if (IS_ENABLED(CONFIG_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
+ return kvm_read_cr3(vcpu);
+
+ return mmu->get_guest_pgd(vcpu);
+}
+
+static inline bool kvm_available_flush_remote_tlbs_range(void)
+{
+ return kvm_x86_ops.flush_remote_tlbs_range;
+}
+
+int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
+{
+ if (!kvm_x86_ops.flush_remote_tlbs_range)
+ return -EOPNOTSUPP;
+
+ return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages);
+}
+
+static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
+
+/* Flush the range of guest memory mapped by the given SPTE. */
+static void kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
+ gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep));
+
+ kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level);
+}
+
+static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
+ unsigned int access)
+{
+ u64 spte = make_mmio_spte(vcpu, gfn, access);
+
+ trace_mark_mmio_spte(sptep, gfn, spte);
+ mmu_spte_set(sptep, spte);
+}
+
+static gfn_t get_mmio_spte_gfn(u64 spte)
+{
+ u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
+
+ gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
+ & shadow_nonpresent_or_rsvd_mask;
+
+ return gpa >> PAGE_SHIFT;
+}
+
+static unsigned get_mmio_spte_access(u64 spte)
+{
+ return spte & shadow_mmio_access_mask;
+}
+
+static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
+{
+ u64 kvm_gen, spte_gen, gen;
+
+ gen = kvm_vcpu_memslots(vcpu)->generation;
+ if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
+ return false;
+
+ kvm_gen = gen & MMIO_SPTE_GEN_MASK;
+ spte_gen = get_mmio_spte_generation(spte);
+
+ trace_check_mmio_spte(spte, kvm_gen, spte_gen);
+ return likely(kvm_gen == spte_gen);
+}
+
+static int is_cpuid_PSE36(void)
+{
+ return 1;
+}
+
+#ifdef CONFIG_X86_64
+static void __set_spte(u64 *sptep, u64 spte)
+{
+ WRITE_ONCE(*sptep, spte);
+}
+
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
+{
+ WRITE_ONCE(*sptep, spte);
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+ return xchg(sptep, spte);
+}
+
+static u64 __get_spte_lockless(u64 *sptep)
+{
+ return READ_ONCE(*sptep);
+}
+#else
+union split_spte {
+ struct {
+ u32 spte_low;
+ u32 spte_high;
+ };
+ u64 spte;
+};
+
+static void count_spte_clear(u64 *sptep, u64 spte)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
+
+ if (is_shadow_present_pte(spte))
+ return;
+
+ /* Ensure the spte is completely set before we increase the count */
+ smp_wmb();
+ sp->clear_spte_count++;
+}
+
+static void __set_spte(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte;
+
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ ssptep->spte_high = sspte.spte_high;
+
+ /*
+	 * If we map the spte from nonpresent to present, we should store
+	 * the high bits first, then set the present bit, so the CPU cannot
+	 * fetch the spte while we are setting it.
+ */
+ smp_wmb();
+
+ WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
+}
+
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte;
+
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
+
+ /*
+	 * If we map the spte from present to nonpresent, we should clear
+	 * the present bit first to avoid the vCPU fetching the old high bits.
+ */
+ smp_wmb();
+
+ ssptep->spte_high = sspte.spte_high;
+ count_spte_clear(sptep, spte);
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte, orig;
+
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ /* xchg acts as a barrier before the setting of the high bits */
+ orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
+ orig.spte_high = ssptep->spte_high;
+ ssptep->spte_high = sspte.spte_high;
+ count_spte_clear(sptep, spte);
+
+ return orig.spte;
+}
+
+/*
+ * The idea of using this lightweight way to get the spte on 32-bit hosts
+ * comes from gup_get_pte (mm/gup.c).
+ *
+ * An spte tlb flush may be pending, because kvm_set_pte_rmap
+ * coalesces them and we may be running outside of the MMU lock.  Therefore
+ * we need to protect against in-progress updates of the spte.
+ *
+ * Reading the spte while an update is in progress may get the old value
+ * for the high part of the spte. The race is fine for a present->non-present
+ * change (because the high part of the spte is ignored for non-present spte),
+ * but for a present->present change we must reread the spte.
+ *
+ * All such changes are done in two steps (present->non-present and
+ * non-present->present), hence it is enough to count the number of
+ * present->non-present updates: if it changed while reading the spte,
+ * we might have hit the race. This is done using clear_spte_count.
+ */
+static u64 __get_spte_lockless(u64 *sptep)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
+ union split_spte spte, *orig = (union split_spte *)sptep;
+ int count;
+
+retry:
+ count = sp->clear_spte_count;
+ smp_rmb();
+
+ spte.spte_low = orig->spte_low;
+ smp_rmb();
+
+ spte.spte_high = orig->spte_high;
+ smp_rmb();
+
+ if (unlikely(spte.spte_low != orig->spte_low ||
+ count != sp->clear_spte_count))
+ goto retry;
+
+ return spte.spte;
+}
+#endif
+
+/* Rules for using mmu_spte_set:
+ * Set the sptep from nonpresent to present.
+ * Note: the sptep being assigned *must* be either not present
+ * or in a state where the hardware will not attempt to update
+ * the spte.
+ */
+static void mmu_spte_set(u64 *sptep, u64 new_spte)
+{
+ WARN_ON_ONCE(is_shadow_present_pte(*sptep));
+ __set_spte(sptep, new_spte);
+}
+
+/*
+ * Update the SPTE (excluding the PFN), but do not track changes in its
+ * accessed/dirty status.
+ */
+static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
+{
+ u64 old_spte = *sptep;
+
+ WARN_ON_ONCE(!is_shadow_present_pte(new_spte));
+ check_spte_writable_invariants(new_spte);
+
+ if (!is_shadow_present_pte(old_spte)) {
+ mmu_spte_set(sptep, new_spte);
+ return old_spte;
+ }
+
+ if (!spte_has_volatile_bits(old_spte))
+ __update_clear_spte_fast(sptep, new_spte);
+ else
+ old_spte = __update_clear_spte_slow(sptep, new_spte);
+
+ WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
+
+ return old_spte;
+}
+
+/* Rules for using mmu_spte_update:
+ * Update the state bits; the mapped pfn is not changed.
+ *
+ * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
+ * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
+ * spte, even though the writable spte might be cached on a CPU's TLB.
+ *
+ * Returns true if the TLB needs to be flushed
+ */
+static bool mmu_spte_update(u64 *sptep, u64 new_spte)
+{
+ bool flush = false;
+ u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
+
+ if (!is_shadow_present_pte(old_spte))
+ return false;
+
+ /*
+	 * Updating the spte outside of mmu_lock is safe because the update
+	 * is always done atomically; see the comments in
+	 * spte_has_volatile_bits().
+ */
+ if (is_mmu_writable_spte(old_spte) &&
+ !is_writable_pte(new_spte))
+ flush = true;
+
+ /*
+ * Flush TLB when accessed/dirty states are changed in the page tables,
+ * to guarantee consistency between TLB and page tables.
+ */
+
+ if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
+ flush = true;
+ kvm_set_pfn_accessed(spte_to_pfn(old_spte));
+ }
+
+ if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
+ flush = true;
+ kvm_set_pfn_dirty(spte_to_pfn(old_spte));
+ }
+
+ return flush;
+}
+
+/*
+ * Rules for using mmu_spte_clear_track_bits:
+ * It sets the sptep from present to nonpresent, and tracks the
+ * state bits; it is used to clear a last level sptep.
+ * Returns the old PTE.
+ */
+static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
+{
+ kvm_pfn_t pfn;
+ u64 old_spte = *sptep;
+ int level = sptep_to_sp(sptep)->role.level;
+ struct page *page;
+
+ if (!is_shadow_present_pte(old_spte) ||
+ !spte_has_volatile_bits(old_spte))
+ __update_clear_spte_fast(sptep, 0ull);
+ else
+ old_spte = __update_clear_spte_slow(sptep, 0ull);
+
+ if (!is_shadow_present_pte(old_spte))
+ return old_spte;
+
+ kvm_update_page_stats(kvm, level, -1);
+
+ pfn = spte_to_pfn(old_spte);
+
+ /*
+ * KVM doesn't hold a reference to any pages mapped into the guest, and
+ * instead uses the mmu_notifier to ensure that KVM unmaps any pages
+ * before they are reclaimed. Sanity check that, if the pfn is backed
+ * by a refcounted page, the refcount is elevated.
+ */
+ page = kvm_pfn_to_refcounted_page(pfn);
+ WARN_ON_ONCE(page && !page_count(page));
+
+ if (is_accessed_spte(old_spte))
+ kvm_set_pfn_accessed(pfn);
+
+ if (is_dirty_spte(old_spte))
+ kvm_set_pfn_dirty(pfn);
+
+ return old_spte;
+}
+
+/*
+ * Rules for using mmu_spte_clear_no_track:
+ * Directly clear the spte without tracking the state bits of the sptep;
+ * it is used when clearing an upper level spte.
+ */
+static void mmu_spte_clear_no_track(u64 *sptep)
+{
+ __update_clear_spte_fast(sptep, 0ull);
+}
+
+static u64 mmu_spte_get_lockless(u64 *sptep)
+{
+ return __get_spte_lockless(sptep);
+}
+
+/* Returns the Accessed status of the PTE and resets it at the same time. */
+static bool mmu_spte_age(u64 *sptep)
+{
+ u64 spte = mmu_spte_get_lockless(sptep);
+
+ if (!is_accessed_spte(spte))
+ return false;
+
+ if (spte_ad_enabled(spte)) {
+ clear_bit((ffs(shadow_accessed_mask) - 1),
+ (unsigned long *)sptep);
+ } else {
+ /*
+ * Capture the dirty status of the page, so that it doesn't get
+ * lost when the SPTE is marked for access tracking.
+ */
+ if (is_writable_pte(spte))
+ kvm_set_pfn_dirty(spte_to_pfn(spte));
+
+ spte = mark_spte_for_access_track(spte);
+ mmu_spte_update_no_track(sptep, spte);
+ }
+
+ return true;
+}
+
+static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu)
+{
+ return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct;
+}
+
+static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
+{
+ if (is_tdp_mmu_active(vcpu)) {
+ kvm_tdp_mmu_walk_lockless_begin();
+ } else {
+ /*
+ * Prevent page table teardown by making any free-er wait during
+ * kvm_flush_remote_tlbs() IPI to all active vcpus.
+ */
+ local_irq_disable();
+
+ /*
+ * Make sure a following spte read is not reordered ahead of the write
+ * to vcpu->mode.
+ */
+ smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
+ }
+}
+
+static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
+{
+ if (is_tdp_mmu_active(vcpu)) {
+ kvm_tdp_mmu_walk_lockless_end();
+ } else {
+ /*
+ * Make sure the write to vcpu->mode is not reordered in front of
+		 * reads to sptes.  If it is, kvm_mmu_commit_zap_page() can see us
+ * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
+ */
+ smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
+ local_irq_enable();
+ }
+}
+
+static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
+{
+ int r;
+
+ /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+ 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
+ if (r)
+ return r;
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
+ PT64_ROOT_MAX_LEVEL);
+ if (r)
+ return r;
+ if (maybe_indirect) {
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache,
+ PT64_ROOT_MAX_LEVEL);
+ if (r)
+ return r;
+ }
+ return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
+ PT64_ROOT_MAX_LEVEL);
+}
+
+static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+{
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
+}
+
+static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
+{
+ kmem_cache_free(pte_list_desc_cache, pte_list_desc);
+}
+
+static bool sp_has_gptes(struct kvm_mmu_page *sp);
+
+static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
+{
+ if (sp->role.passthrough)
+ return sp->gfn;
+
+ if (!sp->role.direct)
+ return sp->shadowed_translation[index] >> PAGE_SHIFT;
+
+ return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS));
+}
+
+/*
+ * For leaf SPTEs, fetch the *guest* access permissions being shadowed. Note
+ * that the SPTE itself may have more constrained access permissions than
+ * what the guest enforces.  For example, a guest may create an executable
+ * huge PTE but KVM may disallow execution to mitigate iTLB multihit.
+ */
+static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
+{
+ if (sp_has_gptes(sp))
+ return sp->shadowed_translation[index] & ACC_ALL;
+
+ /*
+ * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs,
+ * KVM is not shadowing any guest page tables, so the "guest access
+ * permissions" are just ACC_ALL.
+ *
+ * For direct SPs in indirect MMUs (shadow paging), i.e. when KVM
+ * is shadowing a guest huge page with small pages, the guest access
+ * permissions being shadowed are the access permissions of the huge
+ * page.
+ *
+ * In both cases, sp->role.access contains the correct access bits.
+ */
+ return sp->role.access;
+}
+
+static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index,
+ gfn_t gfn, unsigned int access)
+{
+ if (sp_has_gptes(sp)) {
+ sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
+ return;
+ }
+
+ WARN_ONCE(access != kvm_mmu_page_get_access(sp, index),
+ "access mismatch under %s page %llx (expected %u, got %u)\n",
+ sp->role.passthrough ? "passthrough" : "direct",
+ sp->gfn, kvm_mmu_page_get_access(sp, index), access);
+
+ WARN_ONCE(gfn != kvm_mmu_page_get_gfn(sp, index),
+ "gfn mismatch under %s page %llx (expected %llx, got %llx)\n",
+ sp->role.passthrough ? "passthrough" : "direct",
+ sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn);
+}
+
+static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index,
+ unsigned int access)
+{
+ gfn_t gfn = kvm_mmu_page_get_gfn(sp, index);
+
+ kvm_mmu_page_set_translation(sp, index, gfn, access);
+}
+
+/*
+ * Return the pointer to the large page information for a given gfn,
+ * handling slots that are not large page aligned.
+ */
+static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
+ const struct kvm_memory_slot *slot, int level)
+{
+ unsigned long idx;
+
+ idx = gfn_to_index(gfn, slot->base_gfn, level);
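+	/* lpage_info[] is indexed from PG_LEVEL_2M, i.e. index [0] holds the 2M info. */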
+ return &slot->arch.lpage_info[level - 2][idx];
+}
+
+static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
+ gfn_t gfn, int count)
+{
+ struct kvm_lpage_info *linfo;
+ int i;
+
+ for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
+ linfo = lpage_info_slot(gfn, slot, i);
+ linfo->disallow_lpage += count;
+ WARN_ON_ONCE(linfo->disallow_lpage < 0);
+ }
+}
+
+void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ update_gfn_disallow_lpage_count(slot, gfn, 1);
+}
+
+void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ update_gfn_disallow_lpage_count(slot, gfn, -1);
+}
+
+static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ gfn_t gfn;
+
+ kvm->arch.indirect_shadow_pages++;
+ gfn = sp->gfn;
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, gfn);
+
+	/* Non-leaf shadow pages are kept write-protected. */
+ if (sp->role.level > PG_LEVEL_4K)
+ return __kvm_write_track_add_gfn(kvm, slot, gfn);
+
+ kvm_mmu_gfn_disallow_lpage(slot, gfn);
+
+ if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
+ kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K);
+}
+
+void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ /*
+ * If it's possible to replace the shadow page with an NX huge page,
+ * i.e. if the shadow page is the only thing currently preventing KVM
+ * from using a huge page, add the shadow page to the list of "to be
+ * zapped for NX recovery" pages. Note, the shadow page can already be
+ * on the list if KVM is reusing an existing shadow page, i.e. if KVM
+ * links a shadow page at multiple points.
+ */
+ if (!list_empty(&sp->possible_nx_huge_page_link))
+ return;
+
+ ++kvm->stat.nx_lpage_splits;
+ list_add_tail(&sp->possible_nx_huge_page_link,
+ &kvm->arch.possible_nx_huge_pages);
+}
+
+static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ bool nx_huge_page_possible)
+{
+ sp->nx_huge_page_disallowed = true;
+
+ if (nx_huge_page_possible)
+ track_possible_nx_huge_page(kvm, sp);
+}
+
+static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ gfn_t gfn;
+
+ kvm->arch.indirect_shadow_pages--;
+ gfn = sp->gfn;
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, gfn);
+ if (sp->role.level > PG_LEVEL_4K)
+ return __kvm_write_track_remove_gfn(kvm, slot, gfn);
+
+ kvm_mmu_gfn_allow_lpage(slot, gfn);
+}
+
+void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ if (list_empty(&sp->possible_nx_huge_page_link))
+ return;
+
+ --kvm->stat.nx_lpage_splits;
+ list_del_init(&sp->possible_nx_huge_page_link);
+}
+
+static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ sp->nx_huge_page_disallowed = false;
+
+ untrack_possible_nx_huge_page(kvm, sp);
+}
+
+static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu,
+ gfn_t gfn,
+ bool no_dirty_log)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
+ return NULL;
+ if (no_dirty_log && kvm_slot_dirty_track_enabled(slot))
+ return NULL;
+
+ return slot;
+}
+
+/*
+ * About rmap_head encoding:
+ *
+ * If the bit zero of rmap_head->val is clear, then it points to the only spte
+ * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
+ * pte_list_desc containing more mappings.
+ */
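+/*
+ * For example, after pte_list_add() stores a single spte, bit zero is clear:
+ *	rmap_head->val == (unsigned long)sptep;
+ * and once a second spte is added, bit zero tags the descriptor pointer:
+ *	rmap_head->val == (unsigned long)desc | 1;
+ */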
+
+/*
+ * Returns the number of pointers in the rmap chain, not counting the new one.
+ */
+static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
+ struct kvm_rmap_head *rmap_head)
+{
+ struct pte_list_desc *desc;
+ int count = 0;
+
+ if (!rmap_head->val) {
+ rmap_head->val = (unsigned long)spte;
+ } else if (!(rmap_head->val & 1)) {
+ desc = kvm_mmu_memory_cache_alloc(cache);
+ desc->sptes[0] = (u64 *)rmap_head->val;
+ desc->sptes[1] = spte;
+ desc->spte_count = 2;
+ desc->tail_count = 0;
+ rmap_head->val = (unsigned long)desc | 1;
+ ++count;
+ } else {
+ desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ count = desc->tail_count + desc->spte_count;
+
+ /*
+ * If the previous head is full, allocate a new head descriptor
+ * as tail descriptors are always kept full.
+ */
+ if (desc->spte_count == PTE_LIST_EXT) {
+ desc = kvm_mmu_memory_cache_alloc(cache);
+ desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc->spte_count = 0;
+ desc->tail_count = count;
+ rmap_head->val = (unsigned long)desc | 1;
+ }
+ desc->sptes[desc->spte_count++] = spte;
+ }
+ return count;
+}
+
+static void pte_list_desc_remove_entry(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ struct pte_list_desc *desc, int i)
+{
+ struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ int j = head_desc->spte_count - 1;
+
+ /*
+ * The head descriptor should never be empty. A new head is added only
+ * when adding an entry and the previous head is full, and heads are
+ * removed (this flow) when they become empty.
+ */
+ KVM_BUG_ON_DATA_CORRUPTION(j < 0, kvm);
+
+ /*
+ * Replace the to-be-freed SPTE with the last valid entry from the head
+ * descriptor to ensure that tail descriptors are full at all times.
+ * Note, this also means that tail_count is stable for each descriptor.
+ */
+ desc->sptes[i] = head_desc->sptes[j];
+ head_desc->sptes[j] = NULL;
+ head_desc->spte_count--;
+ if (head_desc->spte_count)
+ return;
+
+ /*
+ * The head descriptor is empty. If there are no tail descriptors,
+	 * nullify the rmap head to mark the list as empty, else point the rmap
+ * head at the next descriptor, i.e. the new head.
+ */
+ if (!head_desc->more)
+ rmap_head->val = 0;
+ else
+ rmap_head->val = (unsigned long)head_desc->more | 1;
+ mmu_free_pte_list_desc(head_desc);
+}
+
+static void pte_list_remove(struct kvm *kvm, u64 *spte,
+ struct kvm_rmap_head *rmap_head)
+{
+ struct pte_list_desc *desc;
+ int i;
+
+ if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm))
+ return;
+
+ if (!(rmap_head->val & 1)) {
+ if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
+ return;
+
+ rmap_head->val = 0;
+ } else {
+ desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ while (desc) {
+ for (i = 0; i < desc->spte_count; ++i) {
+ if (desc->sptes[i] == spte) {
+ pte_list_desc_remove_entry(kvm, rmap_head,
+ desc, i);
+ return;
+ }
+ }
+ desc = desc->more;
+ }
+
+ KVM_BUG_ON_DATA_CORRUPTION(true, kvm);
+ }
+}
+
+static void kvm_zap_one_rmap_spte(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head, u64 *sptep)
+{
+ mmu_spte_clear_track_bits(kvm, sptep);
+ pte_list_remove(kvm, sptep, rmap_head);
+}
+
+/* Return true if at least one SPTE was zapped, false otherwise */
+static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head)
+{
+ struct pte_list_desc *desc, *next;
+ int i;
+
+ if (!rmap_head->val)
+ return false;
+
+ if (!(rmap_head->val & 1)) {
+ mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
+ goto out;
+ }
+
+ desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+
+ for (; desc; desc = next) {
+ for (i = 0; i < desc->spte_count; i++)
+ mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
+ next = desc->more;
+ mmu_free_pte_list_desc(desc);
+ }
+out:
+ /* rmap_head is meaningless now, remember to reset it */
+ rmap_head->val = 0;
+ return true;
+}
+
+unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
+{
+ struct pte_list_desc *desc;
+
+ if (!rmap_head->val)
+ return 0;
+ else if (!(rmap_head->val & 1))
+ return 1;
+
+ desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ return desc->tail_count + desc->spte_count;
+}
+
+static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
+ const struct kvm_memory_slot *slot)
+{
+ unsigned long idx;
+
+ idx = gfn_to_index(gfn, slot->base_gfn, level);
+ return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
+}
+
+static void rmap_remove(struct kvm *kvm, u64 *spte)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ struct kvm_mmu_page *sp;
+ gfn_t gfn;
+ struct kvm_rmap_head *rmap_head;
+
+ sp = sptep_to_sp(spte);
+ gfn = kvm_mmu_page_get_gfn(sp, spte_index(spte));
+
+ /*
+ * Unlike rmap_add, rmap_remove does not run in the context of a vCPU
+ * so we have to determine which memslots to use based on context
+ * information in sp->role.
+ */
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+
+ slot = __gfn_to_memslot(slots, gfn);
+ rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
+
+ pte_list_remove(kvm, spte, rmap_head);
+}
+
+/*
+ * Used by the following functions to iterate through the sptes linked by a
+ * rmap. All fields are private and not assumed to be used outside.
+ */
+struct rmap_iterator {
+ /* private fields */
+ struct pte_list_desc *desc; /* holds the sptep if not NULL */
+ int pos; /* index of the sptep */
+};
+
+/*
+ * Iteration must be started by this function. This should also be used after
+ * removing/dropping sptes from the rmap link because in such cases the
+ * information in the iterator may not be valid.
+ *
+ * Returns sptep if found, NULL otherwise.
+ */
+static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
+ struct rmap_iterator *iter)
+{
+ u64 *sptep;
+
+ if (!rmap_head->val)
+ return NULL;
+
+ if (!(rmap_head->val & 1)) {
+ iter->desc = NULL;
+ sptep = (u64 *)rmap_head->val;
+ goto out;
+ }
+
+ iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ iter->pos = 0;
+ sptep = iter->desc->sptes[iter->pos];
+out:
+ BUG_ON(!is_shadow_present_pte(*sptep));
+ return sptep;
+}
+
+/*
+ * Must be used with a valid iterator: e.g. after rmap_get_first().
+ *
+ * Returns sptep if found, NULL otherwise.
+ */
+static u64 *rmap_get_next(struct rmap_iterator *iter)
+{
+ u64 *sptep;
+
+ if (iter->desc) {
+ if (iter->pos < PTE_LIST_EXT - 1) {
+ ++iter->pos;
+ sptep = iter->desc->sptes[iter->pos];
+ if (sptep)
+ goto out;
+ }
+
+ iter->desc = iter->desc->more;
+
+ if (iter->desc) {
+ iter->pos = 0;
+ /* desc->sptes[0] cannot be NULL */
+ sptep = iter->desc->sptes[iter->pos];
+ goto out;
+ }
+ }
+
+ return NULL;
+out:
+ BUG_ON(!is_shadow_present_pte(*sptep));
+ return sptep;
+}
+
+#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \
+ for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \
+ _spte_; _spte_ = rmap_get_next(_iter_))
+
+static void drop_spte(struct kvm *kvm, u64 *sptep)
+{
+ u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);
+
+ if (is_shadow_present_pte(old_spte))
+ rmap_remove(kvm, sptep);
+}
+
+static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = sptep_to_sp(sptep);
+ WARN_ON_ONCE(sp->role.level == PG_LEVEL_4K);
+
+ drop_spte(kvm, sptep);
+
+ if (flush)
+ kvm_flush_remote_tlbs_sptep(kvm, sptep);
+}
+
+/*
+ * Write-protect the specified @sptep.  @pt_protect indicates whether the
+ * spte write-protection is caused by protecting the shadow page table.
+ *
+ * Note: write protection differs between dirty logging and spte
+ * protection:
+ * - for dirty logging, the spte can be made writable at any time if
+ *   its dirty bitmap is properly set.
+ * - for spte protection, the spte can be made writable only after
+ *   unsync-ing the shadow page.
+ *
+ * Return true if the tlb needs to be flushed.
+ */
+static bool spte_write_protect(u64 *sptep, bool pt_protect)
+{
+ u64 spte = *sptep;
+
+ if (!is_writable_pte(spte) &&
+ !(pt_protect && is_mmu_writable_spte(spte)))
+ return false;
+
+ if (pt_protect)
+ spte &= ~shadow_mmu_writable_mask;
+ spte = spte & ~PT_WRITABLE_MASK;
+
+ return mmu_spte_update(sptep, spte);
+}
+
+static bool rmap_write_protect(struct kvm_rmap_head *rmap_head,
+ bool pt_protect)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ bool flush = false;
+
+ for_each_rmap_spte(rmap_head, &iter, sptep)
+ flush |= spte_write_protect(sptep, pt_protect);
+
+ return flush;
+}
+
+static bool spte_clear_dirty(u64 *sptep)
+{
+ u64 spte = *sptep;
+
+ KVM_MMU_WARN_ON(!spte_ad_enabled(spte));
+ spte &= ~shadow_dirty_mask;
+ return mmu_spte_update(sptep, spte);
+}
+
+static bool spte_wrprot_for_clear_dirty(u64 *sptep)
+{
+ bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
+ (unsigned long *)sptep);
+ if (was_writable && !spte_ad_enabled(*sptep))
+ kvm_set_pfn_dirty(spte_to_pfn(*sptep));
+
+ return was_writable;
+}
+
+/*
+ * Gets the GFN ready for another round of dirty logging by clearing the
+ * - D bit on ad-enabled SPTEs, and
+ * - W bit on ad-disabled SPTEs.
+ * Returns true iff any D or W bits were cleared.
+ */
+static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ bool flush = false;
+
+ for_each_rmap_spte(rmap_head, &iter, sptep)
+ if (spte_ad_need_write_protect(*sptep))
+ flush |= spte_wrprot_for_clear_dirty(sptep);
+ else
+ flush |= spte_clear_dirty(sptep);
+
+ return flush;
+}
+
+/**
+ * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
+ * @kvm: kvm instance
+ * @slot: slot to protect
+ * @gfn_offset: start of the BITS_PER_LONG pages we care about
+ * @mask: indicates which pages we should protect
+ *
+ * Used when we do not need to care about huge page mappings.
+ */
+static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ struct kvm_rmap_head *rmap_head;
+
+ if (tdp_mmu_enabled)
+ kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
+ slot->base_gfn + gfn_offset, mask, true);
+
+ if (!kvm_memslots_have_rmaps(kvm))
+ return;
+
+ while (mask) {
+ rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PG_LEVEL_4K, slot);
+ rmap_write_protect(rmap_head, false);
+
+ /* clear the first set bit */
+ mask &= mask - 1;
+ }
+}
+
+/**
+ * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
+ * protect the page if the D-bit isn't supported.
+ * @kvm: kvm instance
+ * @slot: slot to clear D-bit
+ * @gfn_offset: start of the BITS_PER_LONG pages we care about
+ * @mask: indicates which pages we should clear the D-bit for
+ *
+ * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
+ */
+static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ struct kvm_rmap_head *rmap_head;
+
+ if (tdp_mmu_enabled)
+ kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
+ slot->base_gfn + gfn_offset, mask, false);
+
+ if (!kvm_memslots_have_rmaps(kvm))
+ return;
+
+ while (mask) {
+ rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PG_LEVEL_4K, slot);
+ __rmap_clear_dirty(kvm, rmap_head, slot);
+
+ /* clear the first set bit */
+ mask &= mask - 1;
+ }
+}
+
+/**
+ * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
+ * PT level pages.
+ *
+ * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
+ * enable dirty logging for them.
+ *
+ * We need to care about huge page mappings: e.g. during dirty logging we may
+ * have such mappings.
+ */
+void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ /*
+ * Huge pages are NOT write protected when we start dirty logging in
+ * initially-all-set mode; must write protect them here so that they
+ * are split to 4K on the first write.
+ *
+ * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
+ * of memslot has no such restriction, so the range can cross two large
+ * pages.
+ */
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
+ gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
+ gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
+
+ if (READ_ONCE(eager_page_split))
+ kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
+
+ kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
+
+ /* Cross two large pages? */
+ if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
+ ALIGN(end << PAGE_SHIFT, PMD_SIZE))
+ kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
+ PG_LEVEL_2M);
+ }
+
+ /* Now handle 4K PTEs. */
+ if (kvm_x86_ops.cpu_dirty_log_size)
+ kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
+ else
+ kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
+}
+
+int kvm_cpu_dirty_log_size(void)
+{
+ return kvm_x86_ops.cpu_dirty_log_size;
+}
+
+bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+ struct kvm_memory_slot *slot, u64 gfn,
+ int min_level)
+{
+ struct kvm_rmap_head *rmap_head;
+ int i;
+ bool write_protected = false;
+
+ if (kvm_memslots_have_rmaps(kvm)) {
+ for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
+ rmap_head = gfn_to_rmap(gfn, i, slot);
+ write_protected |= rmap_write_protect(rmap_head, true);
+ }
+ }
+
+ if (tdp_mmu_enabled)
+ write_protected |=
+ kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);
+
+ return write_protected;
+}
+
+static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
+}
+
+static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot)
+{
+ return kvm_zap_all_rmap_sptes(kvm, rmap_head);
+}
+
+static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ pte_t unused)
+{
+ return __kvm_zap_rmap(kvm, rmap_head, slot);
+}
+
+static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ pte_t pte)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ bool need_flush = false;
+ u64 new_spte;
+ kvm_pfn_t new_pfn;
+
+ WARN_ON_ONCE(pte_huge(pte));
+ new_pfn = pte_pfn(pte);
+
+restart:
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
+ need_flush = true;
+
+ if (pte_write(pte)) {
+ kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
+ goto restart;
+ } else {
+ new_spte = kvm_mmu_changed_pte_notifier_make_spte(
+ *sptep, new_pfn);
+
+ mmu_spte_clear_track_bits(kvm, sptep);
+ mmu_spte_set(sptep, new_spte);
+ }
+ }
+
+ if (need_flush && kvm_available_flush_remote_tlbs_range()) {
+ kvm_flush_remote_tlbs_gfn(kvm, gfn, level);
+ return false;
+ }
+
+ return need_flush;
+}
+
+struct slot_rmap_walk_iterator {
+ /* input fields. */
+ const struct kvm_memory_slot *slot;
+ gfn_t start_gfn;
+ gfn_t end_gfn;
+ int start_level;
+ int end_level;
+
+ /* output fields. */
+ gfn_t gfn;
+ struct kvm_rmap_head *rmap;
+ int level;
+
+ /* private field. */
+ struct kvm_rmap_head *end_rmap;
+};
+
+static void rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator,
+ int level)
+{
+ iterator->level = level;
+ iterator->gfn = iterator->start_gfn;
+ iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot);
+ iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
+}
+
+static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
+ const struct kvm_memory_slot *slot,
+ int start_level, int end_level,
+ gfn_t start_gfn, gfn_t end_gfn)
+{
+ iterator->slot = slot;
+ iterator->start_level = start_level;
+ iterator->end_level = end_level;
+ iterator->start_gfn = start_gfn;
+ iterator->end_gfn = end_gfn;
+
+ rmap_walk_init_level(iterator, iterator->start_level);
+}
+
+static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
+{
+ return !!iterator->rmap;
+}
+
+static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
+{
+ while (++iterator->rmap <= iterator->end_rmap) {
+ iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
+
+ if (iterator->rmap->val)
+ return;
+ }
+
+ if (++iterator->level > iterator->end_level) {
+ iterator->rmap = NULL;
+ return;
+ }
+
+ rmap_walk_init_level(iterator, iterator->level);
+}
+
+#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \
+ _start_gfn, _end_gfn, _iter_) \
+ for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \
+ _end_level_, _start_gfn, _end_gfn); \
+ slot_rmap_walk_okay(_iter_); \
+ slot_rmap_walk_next(_iter_))
+
+typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int level, pte_t pte);
+
+static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
+ struct kvm_gfn_range *range,
+ rmap_handler_t handler)
+{
+ struct slot_rmap_walk_iterator iterator;
+ bool ret = false;
+
+ for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+ range->start, range->end - 1, &iterator)
+ ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
+ iterator.level, range->arg.pte);
+
+ return ret;
+}
+
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ bool flush = false;
+
+ if (kvm_memslots_have_rmaps(kvm))
+ flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap);
+
+ if (tdp_mmu_enabled)
+ flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
+
+ if (kvm_x86_ops.set_apic_access_page_addr &&
+ range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT)
+ kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
+
+ return flush;
+}
+
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ bool flush = false;
+
+ if (kvm_memslots_have_rmaps(kvm))
+ flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap);
+
+ if (tdp_mmu_enabled)
+ flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
+
+ return flush;
+}
+
+static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ pte_t unused)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ int young = 0;
+
+ for_each_rmap_spte(rmap_head, &iter, sptep)
+ young |= mmu_spte_age(sptep);
+
+ return young;
+}
+
+static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int level, pte_t unused)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+
+ for_each_rmap_spte(rmap_head, &iter, sptep)
+ if (is_accessed_spte(*sptep))
+ return true;
+ return false;
+}
+
+#define RMAP_RECYCLE_THRESHOLD 1000
+
+static void __rmap_add(struct kvm *kvm,
+ struct kvm_mmu_memory_cache *cache,
+ const struct kvm_memory_slot *slot,
+ u64 *spte, gfn_t gfn, unsigned int access)
+{
+ struct kvm_mmu_page *sp;
+ struct kvm_rmap_head *rmap_head;
+ int rmap_count;
+
+ sp = sptep_to_sp(spte);
+ kvm_mmu_page_set_translation(sp, spte_index(spte), gfn, access);
+ kvm_update_page_stats(kvm, sp->role.level, 1);
+
+ rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
+ rmap_count = pte_list_add(cache, spte, rmap_head);
+
+ if (rmap_count > kvm->stat.max_mmu_rmap_size)
+ kvm->stat.max_mmu_rmap_size = rmap_count;
+ if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
+ kvm_zap_all_rmap_sptes(kvm, rmap_head);
+ kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level);
+ }
+}
+
+static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
+ u64 *spte, gfn_t gfn, unsigned int access)
+{
+ struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache;
+
+ __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access);
+}
+
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ bool young = false;
+
+ if (kvm_memslots_have_rmaps(kvm))
+ young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap);
+
+ if (tdp_mmu_enabled)
+ young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
+
+ return young;
+}
+
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ bool young = false;
+
+ if (kvm_memslots_have_rmaps(kvm))
+ young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap);
+
+ if (tdp_mmu_enabled)
+ young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
+
+ return young;
+}
+
+static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp)
+{
+#ifdef CONFIG_KVM_PROVE_MMU
+ int i;
+
+ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
+ if (KVM_MMU_WARN_ON(is_shadow_present_pte(sp->spt[i])))
+ pr_err_ratelimited("SPTE %llx (@ %p) for gfn %llx shadow-present at free",
+ sp->spt[i], &sp->spt[i],
+ kvm_mmu_page_get_gfn(sp, i));
+ }
+#endif
+}
+
+/*
+ * This value is the sum of all of the kvm instances'
+ * kvm->arch.n_used_mmu_pages values. We need a global,
+ * aggregate version in order to make the slab shrinker
+ * faster.
+ */
+static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
+{
+ kvm->arch.n_used_mmu_pages += nr;
+ percpu_counter_add(&kvm_total_used_mmu_pages, nr);
+}
+
+static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ kvm_mod_used_mmu_pages(kvm, +1);
+ kvm_account_pgtable_pages((void *)sp->spt, +1);
+}
+
+static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ kvm_mod_used_mmu_pages(kvm, -1);
+ kvm_account_pgtable_pages((void *)sp->spt, -1);
+}
+
+static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
+{
+ kvm_mmu_check_sptes_at_free(sp);
+
+ hlist_del(&sp->hash_link);
+ list_del(&sp->link);
+ free_page((unsigned long)sp->spt);
+ if (!sp->role.direct)
+ free_page((unsigned long)sp->shadowed_translation);
+ kmem_cache_free(mmu_page_header_cache, sp);
+}
+
+static unsigned kvm_page_table_hashfn(gfn_t gfn)
+{
+ return hash_64(gfn, KVM_MMU_HASH_SHIFT);
+}
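+
+/*
+ * Illustrative note (assumption: KVM_MMU_HASH_SHIFT is currently 12):
+ * hash_64() folds the gfn into one of 2^12 = 4096 hlist buckets in
+ * kvm->arch.mmu_page_hash; colliding gfns simply share a bucket, and
+ * collisions are counted via stat.max_mmu_page_hash_collisions at lookup.
+ */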
+
+static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
+ struct kvm_mmu_page *sp, u64 *parent_pte)
+{
+ if (!parent_pte)
+ return;
+
+ pte_list_add(cache, parent_pte, &sp->parent_ptes);
+}
+
+static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+ u64 *parent_pte)
+{
+ pte_list_remove(kvm, parent_pte, &sp->parent_ptes);
+}
+
+static void drop_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+ u64 *parent_pte)
+{
+ mmu_page_remove_parent_pte(kvm, sp, parent_pte);
+ mmu_spte_clear_no_track(parent_pte);
+}
+
+static void mark_unsync(u64 *spte);
+static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+
+ for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
+ mark_unsync(sptep);
+ }
+}
+
+static void mark_unsync(u64 *spte)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = sptep_to_sp(spte);
+ if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap))
+ return;
+ if (sp->unsync_children++)
+ return;
+ kvm_mmu_mark_parents_unsync(sp);
+}
+
+#define KVM_PAGE_ARRAY_NR 16
+
+struct kvm_mmu_pages {
+ struct mmu_page_and_offset {
+ struct kvm_mmu_page *sp;
+ unsigned int idx;
+ } page[KVM_PAGE_ARRAY_NR];
+ unsigned int nr;
+};
+
+static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
+ int idx)
+{
+ int i;
+
+ if (sp->unsync)
+ for (i = 0; i < pvec->nr; i++)
+ if (pvec->page[i].sp == sp)
+ return 0;
+
+ pvec->page[pvec->nr].sp = sp;
+ pvec->page[pvec->nr].idx = idx;
+ pvec->nr++;
+ return (pvec->nr == KVM_PAGE_ARRAY_NR);
+}
+
+static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
+{
+ --sp->unsync_children;
+ WARN_ON_ONCE((int)sp->unsync_children < 0);
+ __clear_bit(idx, sp->unsync_child_bitmap);
+}
+
+static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
+ struct kvm_mmu_pages *pvec)
+{
+ int i, ret, nr_unsync_leaf = 0;
+
+ for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
+ struct kvm_mmu_page *child;
+ u64 ent = sp->spt[i];
+
+ if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
+ clear_unsync_child_bit(sp, i);
+ continue;
+ }
+
+ child = spte_to_child_sp(ent);
+
+ if (child->unsync_children) {
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+
+ ret = __mmu_unsync_walk(child, pvec);
+ if (!ret) {
+ clear_unsync_child_bit(sp, i);
+ continue;
+ } else if (ret > 0) {
+ nr_unsync_leaf += ret;
+ } else
+ return ret;
+ } else if (child->unsync) {
+ nr_unsync_leaf++;
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+ } else
+ clear_unsync_child_bit(sp, i);
+ }
+
+ return nr_unsync_leaf;
+}
+
+#define INVALID_INDEX (-1)
+
+static int mmu_unsync_walk(struct kvm_mmu_page *sp,
+ struct kvm_mmu_pages *pvec)
+{
+ pvec->nr = 0;
+ if (!sp->unsync_children)
+ return 0;
+
+ mmu_pages_add(pvec, sp, INVALID_INDEX);
+ return __mmu_unsync_walk(sp, pvec);
+}
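+
+/*
+ * Illustrative contract for the walk above (a summary, not upstream
+ * documentation): on success, @pvec holds @sp at INVALID_INDEX followed by
+ * every reachable page that is unsync or has unsync children, and the
+ * return value is the number of unsync leaves; 0 means there is nothing
+ * to sync, while -ENOSPC means the KVM_PAGE_ARRAY_NR-entry vector filled
+ * up and the caller should process what was gathered and walk again.
+ */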
+
+static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ WARN_ON_ONCE(!sp->unsync);
+ trace_kvm_mmu_sync_page(sp);
+ sp->unsync = 0;
+ --kvm->stat.mmu_unsync;
+}
+
+static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list);
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+ struct list_head *invalid_list);
+
+static bool sp_has_gptes(struct kvm_mmu_page *sp)
+{
+ if (sp->role.direct)
+ return false;
+
+ if (sp->role.passthrough)
+ return false;
+
+ return true;
+}
+
+#define for_each_valid_sp(_kvm, _sp, _list) \
+ hlist_for_each_entry(_sp, _list, hash_link) \
+ if (is_obsolete_sp((_kvm), (_sp))) { \
+ } else
+
+#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \
+ for_each_valid_sp(_kvm, _sp, \
+ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
+ if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
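+
+/*
+ * Illustrative use of the lookup macros above (hypothetical snippet, shown
+ * only to clarify the iteration; kvm_mmu_unprotect_page() below is a real
+ * user):
+ *
+ *	struct kvm_mmu_page *sp;
+ *
+ *	for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn)
+ *		pr_info("sp %p shadows gfn %llx\n", sp, sp->gfn);
+ */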
+
+static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
+
+ /*
+ * Ignore various flags when verifying that it's safe to sync a shadow
+ * page using the current MMU context.
+ *
+ * - level: not part of the overall MMU role and will never match as the MMU's
+ * level tracks the root level
+ * - access: updated based on the new guest PTE
+ * - quadrant: not part of the overall MMU role (similar to level)
+ */
+ const union kvm_mmu_page_role sync_role_ign = {
+ .level = 0xf,
+ .access = 0x7,
+ .quadrant = 0x3,
+ .passthrough = 0x1,
+ };
+
+ /*
+ * Direct pages can never be unsync, and KVM should never attempt to
+ * sync a shadow page for a different MMU context, e.g. if the role
+ * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
+ * reserved bits checks will be wrong, etc...
+ */
+ if (WARN_ON_ONCE(sp->role.direct || !vcpu->arch.mmu->sync_spte ||
+ (sp->role.word ^ root_role.word) & ~sync_role_ign.word))
+ return false;
+
+ return true;
+}
+
+static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
+{
+ if (!sp->spt[i])
+ return 0;
+
+ return vcpu->arch.mmu->sync_spte(vcpu, sp, i);
+}
+
+static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ int flush = 0;
+ int i;
+
+ if (!kvm_sync_page_check(vcpu, sp))
+ return -1;
+
+ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
+ int ret = kvm_sync_spte(vcpu, sp, i);
+
+ if (ret < 0)
+ return -1;
+ flush |= ret;
+ }
+
+ /*
+ * Note, any flush is purely for KVM's correctness, e.g. when dropping
+ * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
+ * unmap or dirty logging event doesn't fail to flush. The guest is
+ * responsible for flushing the TLB to ensure any changes in protection
+ * bits are recognized, i.e. until the guest flushes or page faults on
+ * a relevant address, KVM is architecturally allowed to let vCPUs use
+ * cached translations with the old protection bits.
+ */
+ return flush;
+}
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+ int ret = __kvm_sync_page(vcpu, sp);
+
+ if (ret < 0)
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
+ return ret;
+}
+
+static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
+ struct list_head *invalid_list,
+ bool remote_flush)
+{
+ if (!remote_flush && list_empty(invalid_list))
+ return false;
+
+ if (!list_empty(invalid_list))
+ kvm_mmu_commit_zap_page(kvm, invalid_list);
+ else
+ kvm_flush_remote_tlbs(kvm);
+ return true;
+}
+
+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ if (sp->role.invalid)
+ return true;
+
+ /* TDP MMU pages do not use the MMU generation. */
+ return !is_tdp_mmu_page(sp) &&
+ unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
+struct mmu_page_path {
+ struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
+ unsigned int idx[PT64_ROOT_MAX_LEVEL];
+};
+
+#define for_each_sp(pvec, sp, parents, i) \
+ for (i = mmu_pages_first(&pvec, &parents); \
+ i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
+ i = mmu_pages_next(&pvec, &parents, i))
+
+static int mmu_pages_next(struct kvm_mmu_pages *pvec,
+ struct mmu_page_path *parents,
+ int i)
+{
+ int n;
+
+ for (n = i+1; n < pvec->nr; n++) {
+ struct kvm_mmu_page *sp = pvec->page[n].sp;
+ unsigned idx = pvec->page[n].idx;
+ int level = sp->role.level;
+
+ parents->idx[level-1] = idx;
+ if (level == PG_LEVEL_4K)
+ break;
+
+ parents->parent[level-2] = sp;
+ }
+
+ return n;
+}
+
+static int mmu_pages_first(struct kvm_mmu_pages *pvec,
+ struct mmu_page_path *parents)
+{
+ struct kvm_mmu_page *sp;
+ int level;
+
+ if (pvec->nr == 0)
+ return 0;
+
+ WARN_ON_ONCE(pvec->page[0].idx != INVALID_INDEX);
+
+ sp = pvec->page[0].sp;
+ level = sp->role.level;
+ WARN_ON_ONCE(level == PG_LEVEL_4K);
+
+ parents->parent[level-2] = sp;
+
+ /*
+ * Also set up a sentinel. Further entries in pvec are all
+ * children of sp, so this element is never overwritten.
+ */
+ parents->parent[level-1] = NULL;
+ return mmu_pages_next(pvec, parents, 0);
+}
+
+static void mmu_pages_clear_parents(struct mmu_page_path *parents)
+{
+ struct kvm_mmu_page *sp;
+ unsigned int level = 0;
+
+ do {
+ unsigned int idx = parents->idx[level];
+ sp = parents->parent[level];
+ if (!sp)
+ return;
+
+ WARN_ON_ONCE(idx == INVALID_INDEX);
+ clear_unsync_child_bit(sp, idx);
+ level++;
+ } while (!sp->unsync_children);
+}
+
+static int mmu_sync_children(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *parent, bool can_yield)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+ struct mmu_page_path parents;
+ struct kvm_mmu_pages pages;
+ LIST_HEAD(invalid_list);
+ bool flush = false;
+
+ while (mmu_unsync_walk(parent, &pages)) {
+ bool protected = false;
+
+ for_each_sp(pages, sp, parents, i)
+ protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn);
+
+ if (protected) {
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true);
+ flush = false;
+ }
+
+ for_each_sp(pages, sp, parents, i) {
+ kvm_unlink_unsync_page(vcpu->kvm, sp);
+ flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0;
+ mmu_pages_clear_parents(&parents);
+ }
+ if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
+ if (!can_yield) {
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ return -EINTR;
+ }
+
+ cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
+ flush = false;
+ }
+ }
+
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
+ return 0;
+}
+
+static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
+{
+ atomic_set(&sp->write_flooding_count, 0);
+}
+
+static void clear_sp_write_flooding_count(u64 *spte)
+{
+ __clear_sp_write_flooding_count(sptep_to_sp(spte));
+}
+
+/*
+ * The vCPU is required when finding indirect shadow pages; the shadow
+ * page may already exist and syncing it needs the vCPU pointer in
+ * order to read guest page tables. Direct shadow pages are never
+ * unsync, thus @vcpu can be NULL if @role.direct is true.
+ */
+static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm,
+ struct kvm_vcpu *vcpu,
+ gfn_t gfn,
+ struct hlist_head *sp_list,
+ union kvm_mmu_page_role role)
+{
+ struct kvm_mmu_page *sp;
+ int ret;
+ int collisions = 0;
+ LIST_HEAD(invalid_list);
+
+ for_each_valid_sp(kvm, sp, sp_list) {
+ if (sp->gfn != gfn) {
+ collisions++;
+ continue;
+ }
+
+ if (sp->role.word != role.word) {
+ /*
+ * If the guest is creating an upper-level page, zap
+ * unsync pages for the same gfn. While it's possible
+ * the guest is using recursive page tables, in all
+ * likelihood the guest has stopped using the unsync
+ * page and is installing a completely unrelated page.
+ * Unsync pages must not be left as is, because the new
+ * upper-level page will be write-protected.
+ */
+ if (role.level > PG_LEVEL_4K && sp->unsync)
+ kvm_mmu_prepare_zap_page(kvm, sp,
+ &invalid_list);
+ continue;
+ }
+
+ /* unsync and write-flooding only apply to indirect SPs. */
+ if (sp->role.direct)
+ goto out;
+
+ if (sp->unsync) {
+ if (KVM_BUG_ON(!vcpu, kvm))
+ break;
+
+ /*
+ * The page is good, but is stale. kvm_sync_page does
+ * get the latest guest state, but (unlike mmu_unsync_children)
+ * it doesn't write-protect the page or mark it synchronized!
+ * This way the validity of the mapping is ensured, but the
+ * overhead of write protection is not incurred until the
+ * guest invalidates the TLB mapping. This allows multiple
+ * SPs for a single gfn to be unsync.
+ *
+ * If the sync fails, the page is zapped. If so, break
+ * in order to rebuild it.
+ */
+ ret = kvm_sync_page(vcpu, sp, &invalid_list);
+ if (ret < 0)
+ break;
+
+ WARN_ON_ONCE(!list_empty(&invalid_list));
+ if (ret > 0)
+ kvm_flush_remote_tlbs(kvm);
+ }
+
+ __clear_sp_write_flooding_count(sp);
+
+ goto out;
+ }
+
+ sp = NULL;
+ ++kvm->stat.mmu_cache_miss;
+
+out:
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
+ if (collisions > kvm->stat.max_mmu_page_hash_collisions)
+ kvm->stat.max_mmu_page_hash_collisions = collisions;
+ return sp;
+}
+
+/* Caches used when allocating a new shadow page. */
+struct shadow_page_caches {
+ struct kvm_mmu_memory_cache *page_header_cache;
+ struct kvm_mmu_memory_cache *shadow_page_cache;
+ struct kvm_mmu_memory_cache *shadowed_info_cache;
+};
+
+static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
+ struct shadow_page_caches *caches,
+ gfn_t gfn,
+ struct hlist_head *sp_list,
+ union kvm_mmu_page_role role)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
+ sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
+ if (!role.direct)
+ sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
+
+ set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+ INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
+
+ /*
+ * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
+ * depends on valid pages being added to the head of the list. See
+ * comments in kvm_zap_obsolete_pages().
+ */
+ sp->mmu_valid_gen = kvm->arch.mmu_valid_gen;
+ list_add(&sp->link, &kvm->arch.active_mmu_pages);
+ kvm_account_mmu_page(kvm, sp);
+
+ sp->gfn = gfn;
+ sp->role = role;
+ hlist_add_head(&sp->hash_link, sp_list);
+ if (sp_has_gptes(sp))
+ account_shadowed(kvm, sp);
+
+ return sp;
+}
+
+/* Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. */
+static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm,
+ struct kvm_vcpu *vcpu,
+ struct shadow_page_caches *caches,
+ gfn_t gfn,
+ union kvm_mmu_page_role role)
+{
+ struct hlist_head *sp_list;
+ struct kvm_mmu_page *sp;
+ bool created = false;
+
+ sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
+
+ sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role);
+ if (!sp) {
+ created = true;
+ sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role);
+ }
+
+ trace_kvm_mmu_get_page(sp, created);
+ return sp;
+}
+
+static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
+ gfn_t gfn,
+ union kvm_mmu_page_role role)
+{
+ struct shadow_page_caches caches = {
+ .page_header_cache = &vcpu->arch.mmu_page_header_cache,
+ .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
+ .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
+ };
+
+ return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
+}
+
+static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct,
+ unsigned int access)
+{
+ struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
+ union kvm_mmu_page_role role;
+
+ role = parent_sp->role;
+ role.level--;
+ role.access = access;
+ role.direct = direct;
+ role.passthrough = 0;
+
+ /*
+ * If the guest has 4-byte PTEs then that means it's using 32-bit,
+ * 2-level, non-PAE paging. KVM shadows such guests with PAE paging
+ * (i.e. 8-byte PTEs). The difference in PTE size means that KVM must
+ * shadow each guest page table with multiple shadow page tables, which
+ * requires extra bookkeeping in the role.
+ *
+ * Specifically, to shadow the guest's page directory (which covers a
+ * 4GiB address space), KVM uses 4 PAE page directories, each mapping
+ * 1GiB of the address space. @role.quadrant encodes which quarter of
+ * the address space each maps.
+ *
+ * To shadow the guest's page tables (which each map a 4MiB region), KVM
+ * uses 2 PAE page tables, each mapping a 2MiB region. For these,
+ * @role.quadrant encodes which half of the region they map.
+ *
+ * Concretely, a 4-byte PDE consumes bits 31:22, while an 8-byte PDE
+ * consumes bits 29:21. To consume bits 31:30, KVM uses 4 shadow
+ * PDPTEs; those 4 PAE page directories are pre-allocated and their
+ * quadrant is assigned in mmu_alloc_root(). A 4-byte PTE consumes
+ * bits 21:12, while an 8-byte PTE consumes bits 20:12. To consume
+ * bit 21 in the PTE (the child here), KVM propagates that bit to the
+ * quadrant, i.e. sets quadrant to '0' or '1'. The parent 8-byte PDE
+ * covers bit 21 (see above), thus the quadrant is calculated from the
+ * _least_ significant bit of the PDE index.
+ */
+ if (role.has_4_byte_gpte) {
+ WARN_ON_ONCE(role.level != PG_LEVEL_4K);
+ role.quadrant = spte_index(sptep) & 1;
+ }
+
+ return role;
+}
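+
+/*
+ * Worked example for the quadrant logic above (illustrative commentary,
+ * not upstream documentation): a guest 4-byte page table maps 4MiB, but a
+ * PAE shadow page table maps only 2MiB, so KVM needs two shadow pages per
+ * guest table. Bit 21 of the address picks between the two halves, and
+ * bit 21 is exactly the least significant bit of the parent 8-byte PDE
+ * index, hence role.quadrant = spte_index(sptep) & 1 above.
+ */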
+
+static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu,
+ u64 *sptep, gfn_t gfn,
+ bool direct, unsigned int access)
+{
+ union kvm_mmu_page_role role;
+
+ if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
+ return ERR_PTR(-EEXIST);
+
+ role = kvm_mmu_child_role(sptep, direct, access);
+ return kvm_mmu_get_shadow_page(vcpu, gfn, role);
+}
+
+static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
+ struct kvm_vcpu *vcpu, hpa_t root,
+ u64 addr)
+{
+ iterator->addr = addr;
+ iterator->shadow_addr = root;
+ iterator->level = vcpu->arch.mmu->root_role.level;
+
+ if (iterator->level >= PT64_ROOT_4LEVEL &&
+ vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL &&
+ !vcpu->arch.mmu->root_role.direct)
+ iterator->level = PT32E_ROOT_LEVEL;
+
+ if (iterator->level == PT32E_ROOT_LEVEL) {
+ /*
+ * prev_root is currently only used for 64-bit hosts. So only
+ * the active root_hpa is valid here.
+ */
+ BUG_ON(root != vcpu->arch.mmu->root.hpa);
+
+ iterator->shadow_addr
+ = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
+ iterator->shadow_addr &= SPTE_BASE_ADDR_MASK;
+ --iterator->level;
+ if (!iterator->shadow_addr)
+ iterator->level = 0;
+ }
+}
+
+static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
+ struct kvm_vcpu *vcpu, u64 addr)
+{
+ shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa,
+ addr);
+}
+
+static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
+{
+ if (iterator->level < PG_LEVEL_4K)
+ return false;
+
+ iterator->index = SPTE_INDEX(iterator->addr, iterator->level);
+ iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
+ return true;
+}
+
+static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
+ u64 spte)
+{
+ if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) {
+ iterator->level = 0;
+ return;
+ }
+
+ iterator->shadow_addr = spte & SPTE_BASE_ADDR_MASK;
+ --iterator->level;
+}
+
+static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+{
+ __shadow_walk_next(iterator, *iterator->sptep);
+}
+
+static void __link_shadow_page(struct kvm *kvm,
+ struct kvm_mmu_memory_cache *cache, u64 *sptep,
+ struct kvm_mmu_page *sp, bool flush)
+{
+ u64 spte;
+
+ BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
+
+ /*
+ * If an SPTE is present already, it must be a leaf and therefore
+ * a large one. Drop it, and flush the TLB if needed, before
+ * installing sp.
+ */
+ if (is_shadow_present_pte(*sptep))
+ drop_large_spte(kvm, sptep, flush);
+
+ spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
+
+ mmu_spte_set(sptep, spte);
+
+ mmu_page_add_parent_pte(cache, sp, sptep);
+
+ /*
+ * The non-direct sub-pagetable must be updated before linking. For
+ * L1 sp, the pagetable is updated via kvm_sync_page() in
+ * kvm_mmu_find_shadow_page() without write-protecting the gfn,
+ * so sp->unsync can be true or false. For higher level non-direct
+ * sp, the pagetable is updated/synced via mmu_sync_children() in
+ * FNAME(fetch)(), so sp->unsync_children can only be false.
+ * WARN_ON_ONCE() if anything happens unexpectedly.
+ */
+ if (WARN_ON_ONCE(sp->unsync_children) || sp->unsync)
+ mark_unsync(sptep);
+}
+
+static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
+ struct kvm_mmu_page *sp)
+{
+ __link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true);
+}
+
+static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+ unsigned direct_access)
+{
+ if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
+ struct kvm_mmu_page *child;
+
+ /*
+ * For the direct sp, if the guest pte's dirty bit
+ * changed from clean to dirty, it will corrupt the
+ * sp's access, i.e. allow writes through a read-only
+ * sp. Update the spte at this point to get a new sp
+ * with the correct access.
+ */
+ child = spte_to_child_sp(*sptep);
+ if (child->role.access == direct_access)
+ return;
+
+ drop_parent_pte(vcpu->kvm, child, sptep);
+ kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
+ }
+}
+
+/* Returns the number of zapped non-leaf child shadow pages. */
+static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+ u64 *spte, struct list_head *invalid_list)
+{
+ u64 pte;
+ struct kvm_mmu_page *child;
+
+ pte = *spte;
+ if (is_shadow_present_pte(pte)) {
+ if (is_last_spte(pte, sp->role.level)) {
+ drop_spte(kvm, spte);
+ } else {
+ child = spte_to_child_sp(pte);
+ drop_parent_pte(kvm, child, spte);
+
+ /*
+ * Recursively zap nested TDP SPs; parentless SPs are
+ * unlikely to be used again in the near future. This
+ * avoids retaining a large number of stale nested SPs.
+ */
+ if (tdp_enabled && invalid_list &&
+ child->role.guest_mode && !child->parent_ptes.val)
+ return kvm_mmu_prepare_zap_page(kvm, child,
+ invalid_list);
+ }
+ } else if (is_mmio_spte(pte)) {
+ mmu_spte_clear_no_track(spte);
+ }
+ return 0;
+}
+
+static int kvm_mmu_page_unlink_children(struct kvm *kvm,
+ struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+ int zapped = 0;
+ unsigned i;
+
+ for (i = 0; i < SPTE_ENT_PER_PAGE; ++i)
+ zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);
+
+ return zapped;
+}
+
+static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+
+ while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
+ drop_parent_pte(kvm, sp, sptep);
+}
+
+static int mmu_zap_unsync_children(struct kvm *kvm,
+ struct kvm_mmu_page *parent,
+ struct list_head *invalid_list)
+{
+ int i, zapped = 0;
+ struct mmu_page_path parents;
+ struct kvm_mmu_pages pages;
+
+ if (parent->role.level == PG_LEVEL_4K)
+ return 0;
+
+ while (mmu_unsync_walk(parent, &pages)) {
+ struct kvm_mmu_page *sp;
+
+ for_each_sp(pages, sp, parents, i) {
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+ mmu_pages_clear_parents(&parents);
+ zapped++;
+ }
+ }
+
+ return zapped;
+}
+
+static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
+ struct kvm_mmu_page *sp,
+ struct list_head *invalid_list,
+ int *nr_zapped)
+{
+ bool list_unstable, zapped_root = false;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ trace_kvm_mmu_prepare_zap_page(sp);
+ ++kvm->stat.mmu_shadow_zapped;
+ *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
+ *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
+ kvm_mmu_unlink_parents(kvm, sp);
+
+ /* Zapping children means active_mmu_pages has become unstable. */
+ list_unstable = *nr_zapped;
+
+ if (!sp->role.invalid && sp_has_gptes(sp))
+ unaccount_shadowed(kvm, sp);
+
+ if (sp->unsync)
+ kvm_unlink_unsync_page(kvm, sp);
+ if (!sp->root_count) {
+ /* Count self */
+ (*nr_zapped)++;
+
+ /*
+ * Already invalid pages (previously active roots) are not on
+ * the active page list. See list_del() in the "else" case of
+ * !sp->root_count.
+ */
+ if (sp->role.invalid)
+ list_add(&sp->link, invalid_list);
+ else
+ list_move(&sp->link, invalid_list);
+ kvm_unaccount_mmu_page(kvm, sp);
+ } else {
+ /*
+ * Remove the active root from the active page list, the root
+ * will be explicitly freed when the root_count hits zero.
+ */
+ list_del(&sp->link);
+
+ /*
+ * Obsolete pages cannot be used on any vCPUs, see the comment
+ * in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also
+ * treats invalid shadow pages as being obsolete.
+ */
+ zapped_root = !is_obsolete_sp(kvm, sp);
+ }
+
+ if (sp->nx_huge_page_disallowed)
+ unaccount_nx_huge_page(kvm, sp);
+
+ sp->role.invalid = 1;
+
+ /*
+ * Make the request to free obsolete roots after marking the root
+ * invalid, otherwise other vCPUs may not see it as invalid.
+ */
+ if (zapped_root)
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
+ return list_unstable;
+}
+
+static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+ int nr_zapped;
+
+ __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
+ return nr_zapped;
+}
+
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+ struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *sp, *nsp;
+
+ if (list_empty(invalid_list))
+ return;
+
+ /*
+ * We need to make sure everyone sees our modifications to
+ * the page tables and sees changes to vcpu->mode here. The barrier
+ * in kvm_flush_remote_tlbs() achieves this. This pairs
+ * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
+ *
+ * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
+ * guest mode and/or lockless shadow page table walks.
+ */
+ kvm_flush_remote_tlbs(kvm);
+
+ list_for_each_entry_safe(sp, nsp, invalid_list, link) {
+ WARN_ON_ONCE(!sp->role.invalid || sp->root_count);
+ kvm_mmu_free_shadow_page(sp);
+ }
+}
+
+static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
+ unsigned long nr_to_zap)
+{
+ unsigned long total_zapped = 0;
+ struct kvm_mmu_page *sp, *tmp;
+ LIST_HEAD(invalid_list);
+ bool unstable;
+ int nr_zapped;
+
+ if (list_empty(&kvm->arch.active_mmu_pages))
+ return 0;
+
+restart:
+ list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) {
+ /*
+ * Don't zap active root pages, the page itself can't be freed
+ * and zapping it will just force vCPUs to realloc and reload.
+ */
+ if (sp->root_count)
+ continue;
+
+ unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
+ &nr_zapped);
+ total_zapped += nr_zapped;
+ if (total_zapped >= nr_to_zap)
+ break;
+
+ if (unstable)
+ goto restart;
+ }
+
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
+ kvm->stat.mmu_recycled += total_zapped;
+ return total_zapped;
+}
+
+static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
+{
+ if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
+ return kvm->arch.n_max_mmu_pages -
+ kvm->arch.n_used_mmu_pages;
+
+ return 0;
+}
+
+static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
+{
+ unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
+
+ if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
+ return 0;
+
+ kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
+
+ /*
+ * Note, this check is intentionally soft, it only guarantees that one
+ * page is available, while the caller may end up allocating as many as
+ * four pages, e.g. for PAE roots or for 5-level paging. Temporarily
+ * exceeding the (arbitrary by default) limit will not harm the host,
+ * being too aggressive may unnecessarily kill the guest, and getting an
+ * exact count is far more trouble than it's worth, especially in the
+ * page fault paths.
+ */
+ if (!kvm_mmu_available_pages(vcpu->kvm))
+ return -ENOSPC;
+ return 0;
+}
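+
+/*
+ * Illustrative arithmetic for the refill above, assuming the current
+ * values of KVM_MIN_FREE_MMU_PAGES (5) and KVM_REFILL_PAGES (25): with
+ * avail == 3 the fast-path check fails, and KVM zaps up to 25 - 3 = 22 of
+ * the oldest pages so that a burst of faults doesn't immediately run the
+ * VM back up against the limit.
+ */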
+
+/*
+ * Change the number of MMU pages allocated to the VM.
+ * Note: if goal_nr_mmu_pages is too small, you will get a deadlock.
+ */
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
+{
+ write_lock(&kvm->mmu_lock);
+
+ if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
+ kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
+ goal_nr_mmu_pages);
+
+ goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
+ }
+
+ kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
+
+ write_unlock(&kvm->mmu_lock);
+}
+
+int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_mmu_page *sp;
+ LIST_HEAD(invalid_list);
+ int r;
+
+ r = 0;
+ write_lock(&kvm->mmu_lock);
+ for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
+ r = 1;
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ }
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ write_unlock(&kvm->mmu_lock);
+
+ return r;
+}
+
+static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ gpa_t gpa;
+ int r;
+
+ if (vcpu->arch.mmu->root_role.direct)
+ return 0;
+
+ gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
+
+ r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+
+ return r;
+}
+
+static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ trace_kvm_mmu_unsync_page(sp);
+ ++kvm->stat.mmu_unsync;
+ sp->unsync = 1;
+
+ kvm_mmu_mark_parents_unsync(sp);
+}
+
+/*
+ * Attempt to unsync any shadow pages that can be reached by the specified gfn;
+ * KVM is creating a writable mapping for said gfn. Returns 0 if all pages
+ * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must
+ * be write-protected.
+ */
+int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
+ gfn_t gfn, bool can_unsync, bool prefetch)
+{
+ struct kvm_mmu_page *sp;
+ bool locked = false;
+
+ /*
+ * Force write-protection if the page is being tracked. Note, the page
+ * track machinery is used to write-protect upper-level shadow pages,
+ * i.e. this guards the role.level == 4K assertion below!
+ */
+ if (kvm_gfn_is_write_tracked(kvm, slot, gfn))
+ return -EPERM;
+
+ /*
+ * The page is not write-tracked, mark existing shadow pages unsync
+ * unless KVM is synchronizing an unsync SP (can_unsync = false). In
+ * that case, KVM must complete emulation of the guest TLB flush before
+ * allowing shadow pages to become unsync (writable by the guest).
+ */
+ for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
+ if (!can_unsync)
+ return -EPERM;
+
+ if (sp->unsync)
+ continue;
+
+ if (prefetch)
+ return -EEXIST;
+
+ /*
+ * TDP MMU page faults require an additional spinlock as they
+ * run with mmu_lock held for read, not write, and the unsync
+ * logic is not thread safe. Take the spinlock regardless of
+ * the MMU type to avoid extra conditionals/parameters, there's
+ * no meaningful penalty if mmu_lock is held for write.
+ */
+ if (!locked) {
+ locked = true;
+ spin_lock(&kvm->arch.mmu_unsync_pages_lock);
+
+ /*
+ * Recheck after taking the spinlock, a different vCPU
+ * may have since marked the page unsync. A false
+ * positive on the unprotected check above is not
+ * possible as clearing sp->unsync _must_ hold mmu_lock
+ * for write, i.e. unsync cannot transition from 0->1
+ * while this CPU holds mmu_lock for read (or write).
+ */
+ if (READ_ONCE(sp->unsync))
+ continue;
+ }
+
+ WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K);
+ kvm_unsync_page(kvm, sp);
+ }
+ if (locked)
+ spin_unlock(&kvm->arch.mmu_unsync_pages_lock);
+
+ /*
+ * We need to ensure that the marking of unsync pages is visible
+ * before the SPTE is updated to allow writes because
+ * kvm_mmu_sync_roots() checks the unsync flags without holding
+ * the MMU lock and so can race with this. If the SPTE was updated
+ * before the page had been marked as unsync-ed, something like the
+ * following could happen:
+ *
+ * CPU 1 CPU 2
+ * ---------------------------------------------------------------------
+ * 1.2 Host updates SPTE
+ * to be writable
+ * 2.1 Guest writes a GPTE for GVA X.
+ * (GPTE being in the guest page table shadowed
+ * by the SP from CPU 1.)
+ * This reads SPTE during the page table walk.
+ * Since SPTE.W is read as 1, there is no
+ * fault.
+ *
+ * 2.2 Guest issues TLB flush.
+ * That causes a VM Exit.
+ *
+ * 2.3 Walking of unsync pages sees sp->unsync is
+ * false and skips the page.
+ *
+ * 2.4 Guest accesses GVA X.
+ * Since the mapping in the SP was not updated,
+ * so the old mapping for GVA X incorrectly
+ * gets used.
+ * 1.1 Host marks SP
+ * as unsync
+ * (sp->unsync = true)
+ *
+ * The write barrier below ensures that 1.1 happens before 1.2 and thus
+ * the situation in 2.4 does not arise. It pairs with the read barrier
+ * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3.
+ */
+ smp_wmb();
+
+ return 0;
+}
+
+static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
+ u64 *sptep, unsigned int pte_access, gfn_t gfn,
+ kvm_pfn_t pfn, struct kvm_page_fault *fault)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
+ int level = sp->role.level;
+ int was_rmapped = 0;
+ int ret = RET_PF_FIXED;
+ bool flush = false;
+ bool wrprot;
+ u64 spte;
+
+ /* Prefetching always gets a writable pfn. */
+ bool host_writable = !fault || fault->map_writable;
+ bool prefetch = !fault || fault->prefetch;
+ bool write_fault = fault && fault->write;
+
+ if (unlikely(is_noslot_pfn(pfn))) {
+ vcpu->stat.pf_mmio_spte_created++;
+ mark_mmio_spte(vcpu, sptep, gfn, pte_access);
+ return RET_PF_EMULATE;
+ }
+
+ if (is_shadow_present_pte(*sptep)) {
+ /*
+ * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+ * the parent of the now unreachable PTE.
+ */
+ if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
+ struct kvm_mmu_page *child;
+ u64 pte = *sptep;
+
+ child = spte_to_child_sp(pte);
+ drop_parent_pte(vcpu->kvm, child, sptep);
+ flush = true;
+ } else if (pfn != spte_to_pfn(*sptep)) {
+ drop_spte(vcpu->kvm, sptep);
+ flush = true;
+ } else
+ was_rmapped = 1;
+ }
+
+ wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
+ true, host_writable, &spte);
+
+ if (*sptep == spte) {
+ ret = RET_PF_SPURIOUS;
+ } else {
+ flush |= mmu_spte_update(sptep, spte);
+ trace_kvm_mmu_set_spte(level, gfn, sptep);
+ }
+
+ if (wrprot) {
+ if (write_fault)
+ ret = RET_PF_EMULATE;
+ }
+
+ if (flush)
+ kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
+
+ if (!was_rmapped) {
+ WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
+ rmap_add(vcpu, slot, sptep, gfn, pte_access);
+ } else {
+ /* Already rmapped but the pte_access bits may have changed. */
+ kvm_mmu_page_set_access(sp, spte_index(sptep), pte_access);
+ }
+
+ return ret;
+}
+
+static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp,
+ u64 *start, u64 *end)
+{
+ struct page *pages[PTE_PREFETCH_NUM];
+ struct kvm_memory_slot *slot;
+ unsigned int access = sp->role.access;
+ int i, ret;
+ gfn_t gfn;
+
+ gfn = kvm_mmu_page_get_gfn(sp, spte_index(start));
+ slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
+ if (!slot)
+ return -1;
+
+ ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
+ if (ret <= 0)
+ return -1;
+
+ for (i = 0; i < ret; i++, gfn++, start++) {
+ mmu_set_spte(vcpu, slot, start, access, gfn,
+ page_to_pfn(pages[i]), NULL);
+ put_page(pages[i]);
+ }
+
+ return 0;
+}
+
+static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *sptep)
+{
+ u64 *spte, *start = NULL;
+ int i;
+
+ WARN_ON_ONCE(!sp->role.direct);
+
+ i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
+ spte = sp->spt + i;
+
+ for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+ if (is_shadow_present_pte(*spte) || spte == sptep) {
+ if (!start)
+ continue;
+ if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
+ return;
+ start = NULL;
+ } else if (!start)
+ start = spte;
+ }
+ if (start)
+ direct_pte_prefetch_many(vcpu, sp, start, spte);
+}
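+
+/*
+ * Illustrative arithmetic for the block alignment above, assuming the
+ * current PTE_PREFETCH_NUM of 8: for a fault on spte index 13,
+ * "13 & ~(8 - 1)" yields 8, so the scan covers the aligned block of
+ * sptes 8..15 surrounding the faulting entry.
+ */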
+
+static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = sptep_to_sp(sptep);
+
+ /*
+ * Without accessed bits, there's no way to distinguish between
+ * actually accessed translations and prefetched, so disable pte
+ * prefetch if accessed bits aren't available.
+ */
+ if (sp_ad_disabled(sp))
+ return;
+
+ if (sp->role.level > PG_LEVEL_4K)
+ return;
+
+ /*
+ * If addresses are being invalidated, skip prefetching to avoid
+ * accidentally prefetching those addresses.
+ */
+ if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))
+ return;
+
+ __direct_pte_prefetch(vcpu, sp, sptep);
+}
+
+/*
+ * Lookup the mapping level for @gfn in the current mm.
+ *
+ * WARNING! Use of host_pfn_mapping_level() requires the caller and the end
+ * consumer to be tied into KVM's handlers for MMU notifier events!
+ *
+ * There are several ways to safely use this helper:
+ *
+ * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before
+ * consuming it. In this case, mmu_lock doesn't need to be held during the
+ * lookup, but it does need to be held while checking the MMU notifier.
+ *
+ * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation
+ * event for the hva. This can be done by explicitly checking the MMU notifier
+ * or by ensuring that KVM already has a valid mapping that covers the hva.
+ *
+ * - Do not use the result to install new mappings, e.g. use the host mapping
+ * level only to decide whether or not to zap an entry. In this case, it's
+ * not required to hold mmu_lock (though it's highly likely the caller will
+ * want to hold mmu_lock anyways, e.g. to modify SPTEs).
+ *
+ * Note! The lookup can still race with modifications to host page tables, but
+ * the above "rules" ensure KVM will not _consume_ the result of the walk if a
+ * race with the primary MMU occurs.
+ */
+static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
+ const struct kvm_memory_slot *slot)
+{
+ int level = PG_LEVEL_4K;
+ unsigned long hva;
+ unsigned long flags;
+ pgd_t pgd;
+ p4d_t p4d;
+ pud_t pud;
+ pmd_t pmd;
+
+ /*
+ * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
+ * is not solely for performance, it's also necessary to avoid the
+ * "writable" check in __gfn_to_hva_many(), which will always fail on
+ * read-only memslots due to gfn_to_hva() assuming writes. Earlier
+ * page fault steps have already verified the guest isn't writing a
+ * read-only memslot.
+ */
+ hva = __gfn_to_hva_memslot(slot, gfn);
+
+ /*
+ * Disable IRQs to prevent concurrent tear down of host page tables,
+ * e.g. if the primary MMU promotes a P*D to a huge page and then frees
+ * the original page table.
+ */
+ local_irq_save(flags);
+
+ /*
+ * Read each entry once. As above, a non-leaf entry can be promoted to
+ * a huge page _during_ this walk. Re-reading the entry could send the
+ * walk into the weeds, e.g. p*d_large() returns false (sees the old
+ * value) and then p*d_offset() walks into the target huge page instead
+ * of the old page table (sees the new value).
+ */
+ pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
+ if (pgd_none(pgd))
+ goto out;
+
+ p4d = READ_ONCE(*p4d_offset(&pgd, hva));
+ if (p4d_none(p4d) || !p4d_present(p4d))
+ goto out;
+
+ pud = READ_ONCE(*pud_offset(&p4d, hva));
+ if (pud_none(pud) || !pud_present(pud))
+ goto out;
+
+ if (pud_large(pud)) {
+ level = PG_LEVEL_1G;
+ goto out;
+ }
+
+ pmd = READ_ONCE(*pmd_offset(&pud, hva));
+ if (pmd_none(pmd) || !pmd_present(pmd))
+ goto out;
+
+ if (pmd_large(pmd))
+ level = PG_LEVEL_2M;
+
+out:
+ local_irq_restore(flags);
+ return level;
+}
+
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn,
+ int max_level)
+{
+ struct kvm_lpage_info *linfo;
+ int host_level;
+
+ max_level = min(max_level, max_huge_page_level);
+ for ( ; max_level > PG_LEVEL_4K; max_level--) {
+ linfo = lpage_info_slot(gfn, slot, max_level);
+ if (!linfo->disallow_lpage)
+ break;
+ }
+
+ if (max_level == PG_LEVEL_4K)
+ return PG_LEVEL_4K;
+
+ host_level = host_pfn_mapping_level(kvm, gfn, slot);
+ return min(host_level, max_level);
+}
+
+void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ struct kvm_memory_slot *slot = fault->slot;
+ kvm_pfn_t mask;
+
+ fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled;
+
+ if (unlikely(fault->max_level == PG_LEVEL_4K))
+ return;
+
+ if (is_error_noslot_pfn(fault->pfn))
+ return;
+
+ if (kvm_slot_dirty_track_enabled(slot))
+ return;
+
+ /*
+ * Enforce the iTLB multihit workaround after capturing the requested
+ * level, which will be used to do precise, accurate accounting.
+ */
+ fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
+ fault->gfn, fault->max_level);
+ if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
+ return;
+
+ /*
+ * mmu_invalidate_retry() was successful and mmu_lock is held, so
+ * the pmd can't be split from under us.
+ */
+ fault->goal_level = fault->req_level;
+ mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1;
+ VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask));
+ fault->pfn &= ~mask;
+}
+
+void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level)
+{
+ if (cur_level > PG_LEVEL_4K &&
+ cur_level == fault->goal_level &&
+ is_shadow_present_pte(spte) &&
+ !is_large_pte(spte) &&
+ spte_to_child_sp(spte)->nx_huge_page_disallowed) {
+ /*
+ * A small SPTE exists for this pfn, but FNAME(fetch),
+ * direct_map(), or kvm_tdp_mmu_map() would like to create a
+ * large PTE instead: just force them to go down another level,
+ * patching the next 9 bits of the address back into the
+ * pfn for them.
+ */
+ u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) -
+ KVM_PAGES_PER_HPAGE(cur_level - 1);
+ fault->pfn |= fault->gfn & page_mask;
+ fault->goal_level--;
+ }
+}
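+
+/*
+ * Worked example (illustrative): with cur_level == PG_LEVEL_2M, page_mask
+ * above is KVM_PAGES_PER_HPAGE(2M) - KVM_PAGES_PER_HPAGE(4K) = 512 - 1,
+ * i.e. gfn bits 8:0, so "fault->pfn |= fault->gfn & page_mask" grafts the
+ * next 9 address bits onto the pfn before mapping one level lower.
+ */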
+
+static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ struct kvm_shadow_walk_iterator it;
+ struct kvm_mmu_page *sp;
+ int ret;
+ gfn_t base_gfn = fault->gfn;
+
+ kvm_mmu_hugepage_adjust(vcpu, fault);
+
+ trace_kvm_mmu_spte_requested(fault);
+ for_each_shadow_entry(vcpu, fault->addr, it) {
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ if (fault->nx_huge_page_workaround_enabled)
+ disallowed_hugepage_adjust(fault, *it.sptep, it.level);
+
+ base_gfn = gfn_round_for_level(fault->gfn, it.level);
+ if (it.level == fault->goal_level)
+ break;
+
+ sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL);
+ if (sp == ERR_PTR(-EEXIST))
+ continue;
+
+ link_shadow_page(vcpu, it.sptep, sp);
+ if (fault->huge_page_disallowed)
+ account_nx_huge_page(vcpu->kvm, sp,
+ fault->req_level >= it.level);
+ }
+
+ if (WARN_ON_ONCE(it.level != fault->goal_level))
+ return -EFAULT;
+
+ ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
+ base_gfn, fault->pfn, fault);
+ if (ret == RET_PF_SPURIOUS)
+ return ret;
+
+ direct_pte_prefetch(vcpu, it.sptep);
+ return ret;
+}
+
+static void kvm_send_hwpoison_signal(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ unsigned long hva = gfn_to_hva_memslot(slot, gfn);
+
+ send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SHIFT, current);
+}
+
+static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ if (is_sigpending_pfn(fault->pfn)) {
+ kvm_handle_signal_exit(vcpu);
+ return -EINTR;
+ }
+
+ /*
+ * Do not cache the mmio info caused by writing the readonly gfn
+ * into the spte, otherwise a read access on the readonly gfn can
+ * also cause an mmio page fault and be treated as mmio access.
+ */
+ if (fault->pfn == KVM_PFN_ERR_RO_FAULT)
+ return RET_PF_EMULATE;
+
+ if (fault->pfn == KVM_PFN_ERR_HWPOISON) {
+ kvm_send_hwpoison_signal(fault->slot, fault->gfn);
+ return RET_PF_RETRY;
+ }
+
+ return -EFAULT;
+}
+
+static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault,
+ unsigned int access)
+{
+ gva_t gva = fault->is_tdp ? 0 : fault->addr;
+
+ vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
+ access & shadow_mmio_access_mask);
+
+ /*
+ * If MMIO caching is disabled, emulate immediately without
+ * touching the shadow page tables as attempting to install an
+ * MMIO SPTE will just be an expensive nop.
+ */
+ if (unlikely(!enable_mmio_caching))
+ return RET_PF_EMULATE;
+
+ /*
+ * Do not create an MMIO SPTE for a gfn greater than host.MAXPHYADDR,
+ * any guest that generates such gfns is running nested and is being
+ * tricked by L0 userspace (you can observe gfn > L1.MAXPHYADDR if and
+ * only if L1's MAXPHYADDR is inaccurate with respect to the
+ * hardware's).
+ */
+ if (unlikely(fault->gfn > kvm_mmu_max_gfn()))
+ return RET_PF_EMULATE;
+
+ return RET_PF_CONTINUE;
+}
+
+static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
+{
+ /*
+ * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only
+ * reach the common page fault handler if the SPTE has an invalid MMIO
+ * generation number. Refreshing the MMIO generation needs to go down
+ * the slow path. Note, EPT Misconfigs do NOT set the PRESENT flag!
+ */
+ if (fault->rsvd)
+ return false;
+
+ /*
+ * #PF can be fast if:
+ *
+ * 1. The shadow page table entry is not present and A/D bits are
+ * disabled _by KVM_, which could mean that the fault is potentially
+ * caused by access tracking (if enabled). If A/D bits are enabled
+ * by KVM, but disabled by L1 for L2, KVM is forced to disable A/D
+ * bits for L2 and employ access tracking, but the fast page fault
+ * mechanism only supports direct MMUs.
+ * 2. The shadow page table entry is present, the access is a write,
+ * and no reserved bits are set (MMIO SPTEs cannot be "fixed"), i.e.
+ * the fault was caused by a write-protection violation. If the
+ * SPTE is MMU-writable (determined later), the fault can be fixed
+ * by setting the Writable bit, which can be done out of mmu_lock.
+ */
+ if (!fault->present)
+ return !kvm_ad_enabled();
+
+ /*
+ * Note, instruction fetches and writes are mutually exclusive, ignore
+ * the "exec" flag.
+ */
+ return fault->write;
+}
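+
+/*
+ * Illustrative decision table for page_fault_can_be_fast() (a summary of
+ * the logic above, not upstream documentation):
+ *
+ *	fault->rsvd				-> slow path
+ *	!present, A/D bits in use		-> slow path
+ *	!present, A/D bits disabled by KVM	-> fast path (access tracking)
+ *	present, write				-> fast path (W-protection fix)
+ *	present, read or exec			-> slow path
+ */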
+
+/*
+ * Returns true if the SPTE was fixed successfully. Otherwise,
+ * someone else modified the SPTE from its original value.
+ */
+static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault,
+ u64 *sptep, u64 old_spte, u64 new_spte)
+{
+ /*
+ * Theoretically we could also set dirty bit (and flush TLB) here in
+ * order to eliminate unnecessary PML logging. See comments in
+ * set_spte. But fast_page_fault is very unlikely to happen with PML
+ * enabled, so we do not do this. This might result in the same GPA
+ * being logged in the PML buffer again when the write really happens,
+ * and in mark_page_dirty eventually being called twice, but that does
+ * no harm. This also avoids the TLB flush needed after setting the
+ * dirty bit, so non-PML cases won't be impacted.
+ *
+ * Compare with set_spte where instead shadow_dirty_mask is set.
+ */
+ if (!try_cmpxchg64(sptep, &old_spte, new_spte))
+ return false;
+
+ if (is_writable_pte(new_spte) && !is_writable_pte(old_spte))
+ mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn);
+
+ return true;
+}
+
+static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
+{
+ if (fault->exec)
+ return is_executable_pte(spte);
+
+ if (fault->write)
+ return is_writable_pte(spte);
+
+ /* Fault was on Read access */
+ return spte & PT_PRESENT_MASK;
+}
+
+/*
+ * Returns the last level spte pointer of the shadow page walk for the given
+ * gpa, and sets *spte to the spte value. This spte may be non-present. If no
+ * walk could be performed, returns NULL and *spte does not contain valid data.
+ *
+ * Contract:
+ * - Must be called between walk_shadow_page_lockless_{begin,end}.
+ * - The returned sptep must not be used after walk_shadow_page_lockless_end.
+ */
+static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ u64 old_spte;
+ u64 *sptep = NULL;
+
+ for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
+ sptep = iterator.sptep;
+ *spte = old_spte;
+ }
+
+ return sptep;
+}
+
+/*
+ * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
+ */
+static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ struct kvm_mmu_page *sp;
+ int ret = RET_PF_INVALID;
+ u64 spte = 0ull;
+ u64 *sptep = NULL;
+ uint retry_count = 0;
+
+ if (!page_fault_can_be_fast(fault))
+ return ret;
+
+ walk_shadow_page_lockless_begin(vcpu);
+
+ do {
+ u64 new_spte;
+
+ if (tdp_mmu_enabled)
+ sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
+ else
+ sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
+
+ if (!is_shadow_present_pte(spte))
+ break;
+
+ sp = sptep_to_sp(sptep);
+ if (!is_last_spte(spte, sp->role.level))
+ break;
+
+ /*
+ * Check whether the memory access that caused the fault would
+ * still cause it if it were to be performed right now. If not,
+ * then this is a spurious fault caused by a lazily flushed TLB,
+ * or some other CPU has already fixed the PTE after the
+ * current CPU took the fault.
+ *
+ * Need not check the access of upper level table entries since
+ * they are always ACC_ALL.
+ */
+ if (is_access_allowed(fault, spte)) {
+ ret = RET_PF_SPURIOUS;
+ break;
+ }
+
+ new_spte = spte;
+
+ /*
+ * KVM only supports fixing page faults outside of MMU lock for
+ * direct MMUs, nested MMUs are always indirect, and KVM always
+ * uses A/D bits for non-nested MMUs. Thus, if A/D bits are
+ * enabled, the SPTE can't be an access-tracked SPTE.
+ */
+ if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte))
+ new_spte = restore_acc_track_spte(new_spte);
+
+ /*
+ * To keep things simple, only SPTEs that are MMU-writable can
+ * be made fully writable outside of mmu_lock, e.g. only SPTEs
+ * that were write-protected for dirty-logging or access
+ * tracking are handled here. Don't bother checking if the
+ * SPTE is writable to prioritize running with A/D bits enabled.
+ * The is_access_allowed() check above handles the common case
+ * of the fault being spurious, and the SPTE is known to be
+ * shadow-present, i.e. except for access tracking restoration
+ * making the new SPTE writable, the check is wasteful.
+ */
+ if (fault->write && is_mmu_writable_spte(spte)) {
+ new_spte |= PT_WRITABLE_MASK;
+
+ /*
+ * Do not fix write-permission on the large spte when
+ * dirty logging is enabled. Since we only set the
+ * first page dirty in the dirty bitmap in
+ * fast_pf_fix_direct_spte(), the other pages covered
+ * by the spte are missed if its slot has dirty
+ * logging enabled.
+ *
+ * Instead, we let the slow page fault path create a
+ * normal spte to fix the access.
+ */
+ if (sp->role.level > PG_LEVEL_4K &&
+ kvm_slot_dirty_track_enabled(fault->slot))
+ break;
+ }
+
+ /* Verify that the fault can be handled in the fast path */
+ if (new_spte == spte ||
+ !is_access_allowed(fault, new_spte))
+ break;
+
+ /*
+ * Currently, fast page fault only works for direct mapping
+ * since the gfn is not stable for indirect shadow page. See
+ * Documentation/virt/kvm/locking.rst to get more detail.
+ */
+ if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) {
+ ret = RET_PF_FIXED;
+ break;
+ }
+
+ if (++retry_count > 4) {
+ pr_warn_once("Fast #PF retrying more than 4 times.\n");
+ break;
+ }
+
+ } while (true);
+
+ trace_fast_page_fault(vcpu, fault, sptep, spte, ret);
+ walk_shadow_page_lockless_end(vcpu);
+
+ if (ret != RET_PF_INVALID)
+ vcpu->stat.pf_fast++;
+
+ return ret;
+}
+
+static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
+ struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(*root_hpa))
+ return;
+
+ sp = root_to_sp(*root_hpa);
+ if (WARN_ON_ONCE(!sp))
+ return;
+
+ if (is_tdp_mmu_page(sp))
+ kvm_tdp_mmu_put_root(kvm, sp, false);
+ else if (!--sp->root_count && sp->role.invalid)
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+
+ *root_hpa = INVALID_PAGE;
+}
+
+/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
+void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
+ ulong roots_to_free)
+{
+ int i;
+ LIST_HEAD(invalid_list);
+ bool free_active_root;
+
+ WARN_ON_ONCE(roots_to_free & ~KVM_MMU_ROOTS_ALL);
+
+ BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
+
+ /* Before acquiring the MMU lock, see if we need to do any real work. */
+ free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT)
+ && VALID_PAGE(mmu->root.hpa);
+
+ if (!free_active_root) {
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
+ VALID_PAGE(mmu->prev_roots[i].hpa))
+ break;
+
+ if (i == KVM_MMU_NUM_PREV_ROOTS)
+ return;
+ }
+
+ write_lock(&kvm->mmu_lock);
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
+ mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa,
+ &invalid_list);
+
+ if (free_active_root) {
+ if (kvm_mmu_is_dummy_root(mmu->root.hpa)) {
+ /* Nothing to cleanup for dummy roots. */
+ } else if (root_to_sp(mmu->root.hpa)) {
+ mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
+ } else if (mmu->pae_root) {
+ for (i = 0; i < 4; ++i) {
+ if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
+ continue;
+
+ mmu_free_root_page(kvm, &mmu->pae_root[i],
+ &invalid_list);
+ mmu->pae_root[i] = INVALID_PAE_ROOT;
+ }
+ }
+ mmu->root.hpa = INVALID_PAGE;
+ mmu->root.pgd = 0;
+ }
+
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ write_unlock(&kvm->mmu_lock);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
+
+void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
+{
+ unsigned long roots_to_free = 0;
+ struct kvm_mmu_page *sp;
+ hpa_t root_hpa;
+ int i;
+
+ /*
+ * This should not be called while L2 is active; L2 can't invalidate
+ * _only_ its own roots, e.g. INVVPID unconditionally exits.
+ */
+ WARN_ON_ONCE(mmu->root_role.guest_mode);
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ root_hpa = mmu->prev_roots[i].hpa;
+ if (!VALID_PAGE(root_hpa))
+ continue;
+
+ sp = root_to_sp(root_hpa);
+ if (!sp || sp->role.guest_mode)
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+ }
+
+ kvm_mmu_free_roots(kvm, mmu, roots_to_free);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
+
+static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
+ u8 level)
+{
+ union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
+ struct kvm_mmu_page *sp;
+
+ role.level = level;
+ role.quadrant = quadrant;
+
+ WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte);
+ WARN_ON_ONCE(role.direct && role.has_4_byte_gpte);
+
+ sp = kvm_mmu_get_shadow_page(vcpu, gfn, role);
+ ++sp->root_count;
+
+ return __pa(sp->spt);
+}
+
+static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u8 shadow_root_level = mmu->root_role.level;
+ hpa_t root;
+ unsigned i;
+ int r;
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ r = make_mmu_pages_available(vcpu);
+ if (r < 0)
+ goto out_unlock;
+
+ if (tdp_mmu_enabled) {
+ root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
+ mmu->root.hpa = root;
+ } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
+ root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
+ mmu->root.hpa = root;
+ } else if (shadow_root_level == PT32E_ROOT_LEVEL) {
+ if (WARN_ON_ONCE(!mmu->pae_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+
+ for (i = 0; i < 4; ++i) {
+ WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
+
+ root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 0,
+ PT32_ROOT_LEVEL);
+ mmu->pae_root[i] = root | PT_PRESENT_MASK |
+ shadow_me_value;
+ }
+ mmu->root.hpa = __pa(mmu->pae_root);
+ } else {
+ WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
+ r = -EIO;
+ goto out_unlock;
+ }
+
+ /* root.pgd is ignored for direct MMUs. */
+ mmu->root.pgd = 0;
+out_unlock:
+ write_unlock(&vcpu->kvm->mmu_lock);
+ return r;
+}
+
+static int mmu_first_shadow_root_alloc(struct kvm *kvm)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ int r = 0, i, bkt;
+
+ /*
+ * Check if this is the first shadow root being allocated before
+ * taking the lock.
+ */
+ if (kvm_shadow_root_allocated(kvm))
+ return 0;
+
+ mutex_lock(&kvm->slots_arch_lock);
+
+ /* Recheck, under the lock, whether this is the first shadow root. */
+ if (kvm_shadow_root_allocated(kvm))
+ goto out_unlock;
+
+ /*
+ * Check if anything actually needs to be allocated, e.g. all metadata
+ * will be allocated upfront if TDP is disabled.
+ */
+ if (kvm_memslots_have_rmaps(kvm) &&
+ kvm_page_track_write_tracking_enabled(kvm))
+ goto out_success;
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(slot, bkt, slots) {
+ /*
+ * Both of these functions are no-ops if the target is
+ * already allocated, so unconditionally calling both
+ * is safe. Intentionally do NOT free allocations on
+ * failure to avoid having to track which allocations
+ * were made now versus when the memslot was created.
+ * The metadata is guaranteed to be freed when the slot
+ * is freed, and will be kept/used if userspace retries
+ * KVM_RUN instead of killing the VM.
+ */
+ r = memslot_rmap_alloc(slot, slot->npages);
+ if (r)
+ goto out_unlock;
+ r = kvm_page_track_write_tracking_alloc(slot);
+ if (r)
+ goto out_unlock;
+ }
+ }
+
+ /*
+	 * Ensure that shadow_root_allocated becomes true strictly after all
+	 * the related pointers are set; the release pairs with the
+	 * smp_load_acquire() in kvm_shadow_root_allocated().
+ */
+out_success:
+ smp_store_release(&kvm->arch.shadow_root_allocated, true);
+
+out_unlock:
+ mutex_unlock(&kvm->slots_arch_lock);
+ return r;
+}
+
+static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u64 pdptrs[4], pm_mask;
+ gfn_t root_gfn, root_pgd;
+ int quadrant, i, r;
+ hpa_t root;
+
+ root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu);
+ root_gfn = root_pgd >> PAGE_SHIFT;
+
+ if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
+ mmu->root.hpa = kvm_mmu_get_dummy_root();
+ return 0;
+ }
+
+ /*
+ * On SVM, reading PDPTRs might access guest memory, which might fault
+ * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock.
+ */
+ if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
+ for (i = 0; i < 4; ++i) {
+ pdptrs[i] = mmu->get_pdptr(vcpu, i);
+ if (!(pdptrs[i] & PT_PRESENT_MASK))
+ continue;
+
+ if (!kvm_vcpu_is_visible_gfn(vcpu, pdptrs[i] >> PAGE_SHIFT))
+ pdptrs[i] = 0;
+ }
+ }
+
+ r = mmu_first_shadow_root_alloc(vcpu->kvm);
+ if (r)
+ return r;
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ r = make_mmu_pages_available(vcpu);
+ if (r < 0)
+ goto out_unlock;
+
+ /*
+	 * Do we shadow a long mode page table? If so, we need to
+	 * write-protect the guest's page table root.
+ */
+ if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
+ root = mmu_alloc_root(vcpu, root_gfn, 0,
+ mmu->root_role.level);
+ mmu->root.hpa = root;
+ goto set_root_pgd;
+ }
+
+ if (WARN_ON_ONCE(!mmu->pae_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+
+ /*
+	 * We shadow a 32-bit page table. This may be a legacy 2-level
+ * or a PAE 3-level page table. In either case we need to be aware that
+ * the shadow page table may be a PAE or a long mode page table.
+ */
+ pm_mask = PT_PRESENT_MASK | shadow_me_value;
+ if (mmu->root_role.level >= PT64_ROOT_4LEVEL) {
+ pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
+
+ if (WARN_ON_ONCE(!mmu->pml4_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+ mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
+
+ if (mmu->root_role.level == PT64_ROOT_5LEVEL) {
+ if (WARN_ON_ONCE(!mmu->pml5_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+ mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask;
+ }
+ }
+
+ for (i = 0; i < 4; ++i) {
+ WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
+
+ if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
+ if (!(pdptrs[i] & PT_PRESENT_MASK)) {
+ mmu->pae_root[i] = INVALID_PAE_ROOT;
+ continue;
+ }
+ root_gfn = pdptrs[i] >> PAGE_SHIFT;
+ }
+
+ /*
+ * If shadowing 32-bit non-PAE page tables, each PAE page
+ * directory maps one quarter of the guest's non-PAE page
+		 * directory. Otherwise each PAE page directory shadows one
+		 * guest PAE page directory, so the quadrant should be 0.
+ */
+ quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0;
+
+ root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL);
+ mmu->pae_root[i] = root | pm_mask;
+ }
+
+ if (mmu->root_role.level == PT64_ROOT_5LEVEL)
+ mmu->root.hpa = __pa(mmu->pml5_root);
+ else if (mmu->root_role.level == PT64_ROOT_4LEVEL)
+ mmu->root.hpa = __pa(mmu->pml4_root);
+ else
+ mmu->root.hpa = __pa(mmu->pae_root);
+
+set_root_pgd:
+ mmu->root.pgd = root_pgd;
+out_unlock:
+ write_unlock(&vcpu->kvm->mmu_lock);
+
+ return r;
+}
+
+static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL;
+ u64 *pml5_root = NULL;
+ u64 *pml4_root = NULL;
+ u64 *pae_root;
+
+ /*
+ * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
+ * tables are allocated and initialized at root creation as there is no
+ * equivalent level in the guest's NPT to shadow. Allocate the tables
+ * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
+ */
+ if (mmu->root_role.direct ||
+ mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL ||
+ mmu->root_role.level < PT64_ROOT_4LEVEL)
+ return 0;
+
+ /*
+ * NPT, the only paging mode that uses this horror, uses a fixed number
+ * of levels for the shadow page tables, e.g. all MMUs are 4-level or
+	 * all MMUs are 5-level. Thus, this can safely require that pml5_root
+ * is allocated if the other roots are valid and pml5 is needed, as any
+ * prior MMU would also have required pml5.
+ */
+ if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root))
+ return 0;
+
+ /*
+ * The special roots should always be allocated in concert. Yell and
+ * bail if KVM ends up in a state where only one of the roots is valid.
+ */
+ if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
+ (need_pml5 && mmu->pml5_root)))
+ return -EIO;
+
+ /*
+ * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
+ * doesn't need to be decrypted.
+ */
+ pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!pae_root)
+ return -ENOMEM;
+
+#ifdef CONFIG_X86_64
+ pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!pml4_root)
+ goto err_pml4;
+
+ if (need_pml5) {
+ pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!pml5_root)
+ goto err_pml5;
+ }
+#endif
+
+ mmu->pae_root = pae_root;
+ mmu->pml4_root = pml4_root;
+ mmu->pml5_root = pml5_root;
+
+ return 0;
+
+#ifdef CONFIG_X86_64
+err_pml5:
+ free_page((unsigned long)pml4_root);
+err_pml4:
+ free_page((unsigned long)pae_root);
+ return -ENOMEM;
+#endif
+}
+
+static bool is_unsync_root(hpa_t root)
+{
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(root) || kvm_mmu_is_dummy_root(root))
+ return false;
+
+ /*
+ * The read barrier orders the CPU's read of SPTE.W during the page table
+ * walk before the reads of sp->unsync/sp->unsync_children here.
+ *
+ * Even if another CPU was marking the SP as unsync-ed simultaneously,
+ * any guest page table changes are not guaranteed to be visible anyway
+ * until this VCPU issues a TLB flush strictly after those changes are
+ * made. We only need to ensure that the other CPU sets these flags
+ * before any actual changes to the page tables are made. The comments
+ * in mmu_try_to_unsync_pages() describe what could go wrong if this
+ * requirement isn't satisfied.
+ */
+ smp_rmb();
+ sp = root_to_sp(root);
+
+ /*
+	 * PAE roots (somewhat arbitrarily) aren't backed by shadow pages; the
+	 * PDPTEs for a given PAE root need to be synchronized individually.
+ */
+ if (WARN_ON_ONCE(!sp))
+ return false;
+
+ if (sp->unsync || sp->unsync_children)
+ return true;
+
+ return false;
+}
+
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+
+ if (vcpu->arch.mmu->root_role.direct)
+ return;
+
+ if (!VALID_PAGE(vcpu->arch.mmu->root.hpa))
+ return;
+
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+
+ if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
+ hpa_t root = vcpu->arch.mmu->root.hpa;
+
+ if (!is_unsync_root(root))
+ return;
+
+ sp = root_to_sp(root);
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ mmu_sync_children(vcpu, sp, true);
+ write_unlock(&vcpu->kvm->mmu_lock);
+ return;
+ }
+
+ write_lock(&vcpu->kvm->mmu_lock);
+
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu->pae_root[i];
+
+ if (IS_VALID_PAE_ROOT(root)) {
+ sp = spte_to_child_sp(root);
+ mmu_sync_children(vcpu, sp, true);
+ }
+ }
+
+ write_unlock(&vcpu->kvm->mmu_lock);
+}
+
+void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
+{
+ unsigned long roots_to_free = 0;
+ int i;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa))
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+ /* sync prev_roots by simply freeing them */
+ kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free);
+}
+
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gpa_t vaddr, u64 access,
+ struct x86_exception *exception)
+{
+ if (exception)
+ exception->error_code = 0;
+ return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception);
+}
+
+static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+ /*
+ * A nested guest cannot use the MMIO cache if it is using nested
+	 * page tables, because cr2 is an nGPA while the cache stores GPAs.
+ */
+ if (mmu_is_nested(vcpu))
+ return false;
+
+ if (direct)
+ return vcpu_match_mmio_gpa(vcpu, addr);
+
+ return vcpu_match_mmio_gva(vcpu, addr);
+}
+
+/*
+ * Return the level of the lowest level SPTE added to sptes.
+ * That SPTE may be non-present.
+ *
+ * Must be called between walk_shadow_page_lockless_{begin,end}.
+ */
+static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ int leaf = -1;
+ u64 spte;
+
+ for (shadow_walk_init(&iterator, vcpu, addr),
+ *root_level = iterator.level;
+ shadow_walk_okay(&iterator);
+ __shadow_walk_next(&iterator, spte)) {
+ leaf = iterator.level;
+ spte = mmu_spte_get_lockless(iterator.sptep);
+
+ sptes[leaf] = spte;
+ }
+
+ return leaf;
+}
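+
+/*
+ * E.g. with a 4-level root, a walk that terminates at a 2M mapping fills
+ * sptes[4], sptes[3] and sptes[2] and returns leaf == 2 (PG_LEVEL_2M);
+ * sptes[] is indexed by level, and entries below the leaf are left untouched.
+ */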
+
+/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */
+static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
+{
+ u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
+ struct rsvd_bits_validate *rsvd_check;
+ int root, leaf, level;
+ bool reserved = false;
+
+ walk_shadow_page_lockless_begin(vcpu);
+
+ if (is_tdp_mmu_active(vcpu))
+ leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
+ else
+ leaf = get_walk(vcpu, addr, sptes, &root);
+
+ walk_shadow_page_lockless_end(vcpu);
+
+ if (unlikely(leaf < 0)) {
+ *sptep = 0ull;
+ return reserved;
+ }
+
+ *sptep = sptes[leaf];
+
+ /*
+ * Skip reserved bits checks on the terminal leaf if it's not a valid
+ * SPTE. Note, this also (intentionally) skips MMIO SPTEs, which, by
+ * design, always have reserved bits set. The purpose of the checks is
+	 * to detect reserved bits on non-MMIO SPTEs, i.e. buggy SPTEs.
+ */
+ if (!is_shadow_present_pte(sptes[leaf]))
+ leaf++;
+
+ rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
+
+ for (level = root; level >= leaf; level--)
+ reserved |= is_rsvd_spte(rsvd_check, sptes[level], level);
+
+ if (reserved) {
+ pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
+ __func__, addr);
+ for (level = root; level >= leaf; level--)
+ pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
+ sptes[level], level,
+ get_rsvd_bits(rsvd_check, sptes[level], level));
+ }
+
+ return reserved;
+}
+
+static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+ u64 spte;
+ bool reserved;
+
+ if (mmio_info_in_cache(vcpu, addr, direct))
+ return RET_PF_EMULATE;
+
+ reserved = get_mmio_spte(vcpu, addr, &spte);
+ if (WARN_ON_ONCE(reserved))
+ return -EINVAL;
+
+ if (is_mmio_spte(spte)) {
+ gfn_t gfn = get_mmio_spte_gfn(spte);
+ unsigned int access = get_mmio_spte_access(spte);
+
+ if (!check_mmio_spte(vcpu, spte))
+ return RET_PF_INVALID;
+
+ if (direct)
+ addr = 0;
+
+ trace_handle_mmio_page_fault(addr, gfn, access);
+ vcpu_cache_mmio_info(vcpu, addr, gfn, access);
+ return RET_PF_EMULATE;
+ }
+
+ /*
+	 * If the page table was zapped by another CPU, let the CPU fault
+	 * again on the address.
+ */
+ return RET_PF_RETRY;
+}
+
+static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ if (unlikely(fault->rsvd))
+ return false;
+
+ if (!fault->present || !fault->write)
+ return false;
+
+ /*
+	 * The guest is writing a page that is write-tracked, which cannot
+	 * be fixed up by the page fault handler.
+ */
+ if (kvm_gfn_is_write_tracked(vcpu->kvm, fault->slot, fault->gfn))
+ return true;
+
+ return false;
+}
+
+static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ u64 spte;
+
+ walk_shadow_page_lockless_begin(vcpu);
+ for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
+ clear_sp_write_flooding_count(iterator.sptep);
+ walk_shadow_page_lockless_end(vcpu);
+}
+
+static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * Make sure the token value is not 0: the token is (id << 12) |
+	 * vcpu_id, so reset the id when its low 20 bits are zero, else a
+	 * vCPU with vcpu_id == 0 could be handed a token of 0.
+	 */
+ u32 id = vcpu->arch.apf.id;
+
+ if (id << 12 == 0)
+ vcpu->arch.apf.id = 1;
+
+ return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
+}
+
+static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ gfn_t gfn)
+{
+ struct kvm_arch_async_pf arch;
+
+ arch.token = alloc_apf_token(vcpu);
+ arch.gfn = gfn;
+ arch.direct_map = vcpu->arch.mmu->root_role.direct;
+ arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu);
+
+ return kvm_setup_async_pf(vcpu, cr2_or_gpa,
+ kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
+}
+
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
+{
+ int r;
+
+ if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
+ work->wakeup_all)
+ return;
+
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r))
+ return;
+
+ if (!vcpu->arch.mmu->root_role.direct &&
+ work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
+ return;
+
+ kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
+}
+
+static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ struct kvm_memory_slot *slot = fault->slot;
+ bool async;
+
+ /*
+ * Retry the page fault if the gfn hit a memslot that is being deleted
+ * or moved. This ensures any existing SPTEs for the old memslot will
+ * be zapped before KVM inserts a new MMIO SPTE for the gfn.
+ */
+ if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
+ return RET_PF_RETRY;
+
+ if (!kvm_is_visible_memslot(slot)) {
+ /* Don't expose private memslots to L2. */
+ if (is_guest_mode(vcpu)) {
+ fault->slot = NULL;
+ fault->pfn = KVM_PFN_NOSLOT;
+ fault->map_writable = false;
+ return RET_PF_CONTINUE;
+ }
+ /*
+ * If the APIC access page exists but is disabled, go directly
+		 * to emulation without caching the MMIO access or creating an
+		 * MMIO SPTE. That way the cache doesn't need to be purged
+ * when the AVIC is re-enabled.
+ */
+ if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
+ !kvm_apicv_activated(vcpu->kvm))
+ return RET_PF_EMULATE;
+ }
+
+ async = false;
+ fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
+ fault->write, &fault->map_writable,
+ &fault->hva);
+ if (!async)
+		return RET_PF_CONTINUE; /* fault->pfn already holds the correct page */
+
+ if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
+ trace_kvm_try_async_get_page(fault->addr, fault->gfn);
+ if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) {
+ trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn);
+ kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+ return RET_PF_RETRY;
+ } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) {
+ return RET_PF_RETRY;
+ }
+ }
+
+ /*
+ * Allow gup to bail on pending non-fatal signals when it's also allowed
+ * to wait for IO. Note, gup always bails if it is unable to quickly
+ * get a page and a fatal signal, i.e. SIGKILL, is pending.
+ */
+ fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL,
+ fault->write, &fault->map_writable,
+ &fault->hva);
+ return RET_PF_CONTINUE;
+}
+
+static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
+ unsigned int access)
+{
+ int ret;
+
+ fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
+ smp_rmb();
+
+ ret = __kvm_faultin_pfn(vcpu, fault);
+ if (ret != RET_PF_CONTINUE)
+ return ret;
+
+ if (unlikely(is_error_pfn(fault->pfn)))
+ return kvm_handle_error_pfn(vcpu, fault);
+
+ if (unlikely(!fault->slot))
+ return kvm_handle_noslot_fault(vcpu, fault, access);
+
+ return RET_PF_CONTINUE;
+}
+
+/*
+ * Returns true if the page fault is stale and needs to be retried, i.e. if the
+ * root was invalidated by a memslot update or a relevant mmu_notifier fired.
+ */
+static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);
+
+ /* Special roots, e.g. pae_root, are not backed by shadow pages. */
+ if (sp && is_obsolete_sp(vcpu->kvm, sp))
+ return true;
+
+ /*
+ * Roots without an associated shadow page are considered invalid if
+ * there is a pending request to free obsolete roots. The request is
+ * only a hint that the current root _may_ be obsolete and needs to be
+ * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
+ * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
+ * to reload even if no vCPU is actively using the root.
+ */
+ if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
+ return true;
+
+ return fault->slot &&
+ mmu_invalidate_retry_hva(vcpu->kvm, fault->mmu_seq, fault->hva);
+}
+
+static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ int r;
+
+ /* Dummy roots are used only for shadowing bad guest roots. */
+ if (WARN_ON_ONCE(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa)))
+ return RET_PF_RETRY;
+
+ if (page_fault_handle_page_track(vcpu, fault))
+ return RET_PF_EMULATE;
+
+ r = fast_page_fault(vcpu, fault);
+ if (r != RET_PF_INVALID)
+ return r;
+
+ r = mmu_topup_memory_caches(vcpu, false);
+ if (r)
+ return r;
+
+ r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
+ if (r != RET_PF_CONTINUE)
+ return r;
+
+ r = RET_PF_RETRY;
+ write_lock(&vcpu->kvm->mmu_lock);
+
+ if (is_page_fault_stale(vcpu, fault))
+ goto out_unlock;
+
+ r = make_mmu_pages_available(vcpu);
+ if (r)
+ goto out_unlock;
+
+ r = direct_map(vcpu, fault);
+
+out_unlock:
+ write_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(fault->pfn);
+ return r;
+}
+
+static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+	/* This path builds a PAE page table, so 2MB is the largest page we can map. */
+ fault->max_level = PG_LEVEL_2M;
+ return direct_page_fault(vcpu, fault);
+}
+
+int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
+ u64 fault_address, char *insn, int insn_len)
+{
+ int r = 1;
+ u32 flags = vcpu->arch.apf.host_apf_flags;
+
+#ifndef CONFIG_X86_64
+ /* A 64-bit CR2 should be impossible on 32-bit KVM. */
+ if (WARN_ON_ONCE(fault_address >> 32))
+ return -EFAULT;
+#endif
+
+ vcpu->arch.l1tf_flush_l1d = true;
+ if (!flags) {
+ trace_kvm_page_fault(vcpu, fault_address, error_code);
+
+ if (kvm_event_needs_reinjection(vcpu))
+ kvm_mmu_unprotect_page_virt(vcpu, fault_address);
+ r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
+ insn_len);
+ } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
+ vcpu->arch.apf.host_apf_flags = 0;
+ local_irq_disable();
+ kvm_async_pf_task_wait_schedule(fault_address);
+ local_irq_enable();
+ } else {
+ WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
+ }
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
+
+#ifdef CONFIG_X86_64
+static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ int r;
+
+ if (page_fault_handle_page_track(vcpu, fault))
+ return RET_PF_EMULATE;
+
+ r = fast_page_fault(vcpu, fault);
+ if (r != RET_PF_INVALID)
+ return r;
+
+ r = mmu_topup_memory_caches(vcpu, false);
+ if (r)
+ return r;
+
+ r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
+ if (r != RET_PF_CONTINUE)
+ return r;
+
+ r = RET_PF_RETRY;
+ read_lock(&vcpu->kvm->mmu_lock);
+
+ if (is_page_fault_stale(vcpu, fault))
+ goto out_unlock;
+
+ r = kvm_tdp_mmu_map(vcpu, fault);
+
+out_unlock:
+ read_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(fault->pfn);
+ return r;
+}
+#endif
+
+int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ /*
+ * If the guest's MTRRs may be used to compute the "real" memtype,
+ * restrict the mapping level to ensure KVM uses a consistent memtype
+ * across the entire mapping. If the host MTRRs are ignored by TDP
+ * (shadow_memtype_mask is non-zero), and the VM has non-coherent DMA
+ * (DMA doesn't snoop CPU caches), KVM's ABI is to honor the memtype
+ * from the guest's MTRRs so that guest accesses to memory that is
+ * DMA'd aren't cached against the guest's wishes.
+ *
+ * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs,
+ * e.g. KVM will force UC memtype for host MMIO.
+ */
+ if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
+ for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
+ int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
+ gfn_t base = gfn_round_for_level(fault->gfn,
+ fault->max_level);
+
+ if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
+ break;
+ }
+ }
+
+#ifdef CONFIG_X86_64
+ if (tdp_mmu_enabled)
+ return kvm_tdp_mmu_page_fault(vcpu, fault);
+#endif
+
+ return direct_page_fault(vcpu, fault);
+}
+
+static void nonpaging_init_context(struct kvm_mmu *context)
+{
+ context->page_fault = nonpaging_page_fault;
+ context->gva_to_gpa = nonpaging_gva_to_gpa;
+ context->sync_spte = NULL;
+}
+
+static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
+ union kvm_mmu_page_role role)
+{
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(root->hpa))
+ return false;
+
+ if (!role.direct && pgd != root->pgd)
+ return false;
+
+ sp = root_to_sp(root->hpa);
+ if (WARN_ON_ONCE(!sp))
+ return false;
+
+ return role.word == sp->role.word;
+}
+
+/*
+ * Find out if a previously cached root matching the new pgd/role is available,
+ * and insert the current root as the MRU in the cache.
+ * If a matching root is found, it is assigned to kvm_mmu->root and
+ * true is returned.
+ * If no match is found, kvm_mmu->root is left invalid, the LRU root is
+ * evicted to make room for the current root, and false is returned.
+ */
+static bool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu,
+ gpa_t new_pgd,
+ union kvm_mmu_page_role new_role)
+{
+ uint i;
+
+ if (is_root_usable(&mmu->root, new_pgd, new_role))
+ return true;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ /*
+ * The swaps end up rotating the cache like this:
+ * C 0 1 2 3 (on entry to the function)
+ * 0 C 1 2 3
+ * 1 C 0 2 3
+ * 2 C 0 1 3
+ * 3 C 0 1 2 (on exit from the loop)
+ */
+ swap(mmu->root, mmu->prev_roots[i]);
+ if (is_root_usable(&mmu->root, new_pgd, new_role))
+ return true;
+ }
+
+ kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
+ return false;
+}
+
+/*
+ * Find out if a previously cached root matching the new pgd/role is available.
+ * On entry, mmu->root is invalid.
+ * If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry
+ * of the cache becomes invalid, and true is returned.
+ * If no match is found, kvm_mmu->root is left invalid and false is returned.
+ */
+static bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu,
+ gpa_t new_pgd,
+ union kvm_mmu_page_role new_role)
+{
+ uint i;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role))
+ goto hit;
+
+ return false;
+
+hit:
+ swap(mmu->root, mmu->prev_roots[i]);
+ /* Bubble up the remaining roots. */
+ for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++)
+ mmu->prev_roots[i] = mmu->prev_roots[i + 1];
+ mmu->prev_roots[i].hpa = INVALID_PAGE;
+ return true;
+}
+
+static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
+ gpa_t new_pgd, union kvm_mmu_page_role new_role)
+{
+ /*
+ * Limit reuse to 64-bit hosts+VMs without "special" roots in order to
+ * avoid having to deal with PDPTEs and other complexities.
+ */
+ if (VALID_PAGE(mmu->root.hpa) && !root_to_sp(mmu->root.hpa))
+ kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
+
+ if (VALID_PAGE(mmu->root.hpa))
+ return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role);
+ else
+ return cached_root_find_without_current(kvm, mmu, new_pgd, new_role);
+}
+
+void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ union kvm_mmu_page_role new_role = mmu->root_role;
+
+ /*
+	 * Return immediately if no usable root was found; kvm_mmu_reload()
+	 * will establish a valid root prior to the next VM-Enter.
+ */
+ if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role))
+ return;
+
+ /*
+ * It's possible that the cached previous root page is obsolete because
+ * of a change in the MMU generation number. However, changing the
+ * generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS,
+ * which will free the root set here and allocate a new one.
+ */
+ kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
+
+ if (force_flush_and_sync_on_reuse) {
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ }
+
+ /*
+ * The last MMIO access's GVA and GPA are cached in the VCPU. When
+ * switching to a new CR3, that GVA->GPA mapping may no longer be
+ * valid. So clear any cached MMIO info even when we don't need to sync
+ * the shadow page tables.
+ */
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+
+ /*
+ * If this is a direct root page, it doesn't have a write flooding
+ * count. Otherwise, clear the write flooding count.
+ */
+ if (!new_role.direct) {
+ struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);
+
+ if (!WARN_ON_ONCE(!sp))
+ __clear_sp_write_flooding_count(sp);
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
+
+static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
+ unsigned int access)
+{
+ if (unlikely(is_mmio_spte(*sptep))) {
+ if (gfn != get_mmio_spte_gfn(*sptep)) {
+ mmu_spte_clear_no_track(sptep);
+ return true;
+ }
+
+ mark_mmio_spte(vcpu, sptep, gfn, access);
+ return true;
+ }
+
+ return false;
+}
+
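+
+/*
+ * Instantiate the guest page-table walker from paging_tmpl.h three times,
+ * once per guest paging flavor. Each inclusion generates the PTTYPE-prefixed
+ * helpers referenced below, e.g. ept_page_fault(), paging64_gva_to_gpa() and
+ * paging32_sync_spte().
+ */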
+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+#define PTTYPE 64
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+#define PTTYPE 32
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
+ u64 pa_bits_rsvd, int level, bool nx,
+ bool gbpages, bool pse, bool amd)
+{
+ u64 gbpages_bit_rsvd = 0;
+ u64 nonleaf_bit8_rsvd = 0;
+ u64 high_bits_rsvd;
+
+ rsvd_check->bad_mt_xwr = 0;
+
+ if (!gbpages)
+ gbpages_bit_rsvd = rsvd_bits(7, 7);
+
+ if (level == PT32E_ROOT_LEVEL)
+ high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62);
+ else
+ high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
+
+ /* Note, NX doesn't exist in PDPTEs, this is handled below. */
+ if (!nx)
+ high_bits_rsvd |= rsvd_bits(63, 63);
+
+ /*
+ * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
+ * leaf entries) on AMD CPUs only.
+ */
+ if (amd)
+ nonleaf_bit8_rsvd = rsvd_bits(8, 8);
+
+ switch (level) {
+ case PT32_ROOT_LEVEL:
+		/* No reserved bits for 2-level 4K page table entries. */
+ rsvd_check->rsvd_bits_mask[0][1] = 0;
+ rsvd_check->rsvd_bits_mask[0][0] = 0;
+ rsvd_check->rsvd_bits_mask[1][0] =
+ rsvd_check->rsvd_bits_mask[0][0];
+
+ if (!pse) {
+ rsvd_check->rsvd_bits_mask[1][1] = 0;
+ break;
+ }
+
+ if (is_cpuid_PSE36())
+			/* 36-bit PSE 4MB page */
+ rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
+ else
+			/* 32-bit PSE 4MB page */
+ rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
+ break;
+ case PT32E_ROOT_LEVEL:
+ rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
+ high_bits_rsvd |
+ rsvd_bits(5, 8) |
+ rsvd_bits(1, 2); /* PDPTE */
+ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */
+ rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */
+ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
+ rsvd_bits(13, 20); /* large page */
+ rsvd_check->rsvd_bits_mask[1][0] =
+ rsvd_check->rsvd_bits_mask[0][0];
+ break;
+ case PT64_ROOT_5LEVEL:
+ rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd |
+ nonleaf_bit8_rsvd |
+ rsvd_bits(7, 7);
+ rsvd_check->rsvd_bits_mask[1][4] =
+ rsvd_check->rsvd_bits_mask[0][4];
+ fallthrough;
+ case PT64_ROOT_4LEVEL:
+ rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd |
+ nonleaf_bit8_rsvd |
+ rsvd_bits(7, 7);
+ rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd |
+ gbpages_bit_rsvd;
+ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;
+ rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
+ rsvd_check->rsvd_bits_mask[1][3] =
+ rsvd_check->rsvd_bits_mask[0][3];
+ rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd |
+ gbpages_bit_rsvd |
+ rsvd_bits(13, 29);
+ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
+ rsvd_bits(13, 20); /* large page */
+ rsvd_check->rsvd_bits_mask[1][0] =
+ rsvd_check->rsvd_bits_mask[0][0];
+ break;
+ }
+}
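+
+/*
+ * For reference, rsvd_bits_mask is indexed as [huge][level - 1], where "huge"
+ * is bit 7 of the PTE (the PSE/large-page bit): [0][0] covers 4K PTEs, [1][1]
+ * covers large 2M/4M PDEs, and so on; see __is_rsvd_bits_set().
+ */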
+
+static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+ __reset_rsvds_bits_mask(&context->guest_rsvd_check,
+ vcpu->arch.reserved_gpa_bits,
+ context->cpu_role.base.level, is_efer_nx(context),
+ guest_can_use(vcpu, X86_FEATURE_GBPAGES),
+ is_cr4_pse(context),
+ guest_cpuid_is_amd_or_hygon(vcpu));
+}
+
+static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
+ u64 pa_bits_rsvd, bool execonly,
+ int huge_page_level)
+{
+ u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
+ u64 large_1g_rsvd = 0, large_2m_rsvd = 0;
+ u64 bad_mt_xwr;
+
+ if (huge_page_level < PG_LEVEL_1G)
+ large_1g_rsvd = rsvd_bits(7, 7);
+ if (huge_page_level < PG_LEVEL_2M)
+ large_2m_rsvd = rsvd_bits(7, 7);
+
+ rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
+ rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
+ rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd;
+ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd;
+ rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
+
+ /* large page */
+ rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
+ rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
+ rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd;
+ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd;
+ rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
+
+ bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
+ bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */
+ bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */
+ if (!execonly) {
+ /* bits 0..2 must not be 100 unless VMX capabilities allow it */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
+ }
+ rsvd_check->bad_mt_xwr = bad_mt_xwr;
+}
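+
+/*
+ * bad_mt_xwr is consumed as a 64-entry bitmap indexed by an EPT entry's low
+ * six bits (XWR in bits 2:0, memtype in bits 5:3); __is_bad_mt_xwr() tests
+ * BIT_ULL(pte & 0x3f) against it.
+ */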
+
+static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context, bool execonly, int huge_page_level)
+{
+ __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
+ vcpu->arch.reserved_gpa_bits, execonly,
+ huge_page_level);
+}
+
+static inline u64 reserved_hpa_bits(void)
+{
+ return rsvd_bits(shadow_phys_bits, 63);
+}
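+
+/*
+ * rsvd_bits(s, e) builds a mask with bits s..e set, e.g. rsvd_bits(7, 7) ==
+ * BIT_ULL(7); with shadow_phys_bits == 46, reserved_hpa_bits() covers host
+ * physical address bits 63:46.
+ */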
+
+/*
+ * The page table on the host is the shadow page table for the page table
+ * in the guest or an AMD nested guest; its MMU features completely follow
+ * the guest's features.
+ */
+static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+	/* @amd adds a check on bit 8 of SPTEs, which KVM shouldn't use anyway. */
+ bool is_amd = true;
+ /* KVM doesn't use 2-level page tables for the shadow MMU. */
+ bool is_pse = false;
+ struct rsvd_bits_validate *shadow_zero_check;
+ int i;
+
+ WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL);
+
+ shadow_zero_check = &context->shadow_zero_check;
+ __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
+ context->root_role.level,
+ context->root_role.efer_nx,
+ guest_can_use(vcpu, X86_FEATURE_GBPAGES),
+ is_pse, is_amd);
+
+ if (!shadow_me_mask)
+ return;
+
+ for (i = context->root_role.level; --i >= 0;) {
+ /*
+ * So far shadow_me_value is a constant during KVM's life
+ * time. Bits in shadow_me_value are allowed to be set.
+ * Bits in shadow_me_mask but not in shadow_me_value are
+ * not allowed to be set.
+ */
+ shadow_zero_check->rsvd_bits_mask[0][i] |= shadow_me_mask;
+ shadow_zero_check->rsvd_bits_mask[1][i] |= shadow_me_mask;
+ shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_value;
+ shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_value;
+ }
+
+}
+
+static inline bool boot_cpu_is_amd(void)
+{
+ WARN_ON_ONCE(!tdp_enabled);
+ return shadow_x_mask == 0;
+}
+
+/*
+ * For the direct page table on the host, use as many MMU features as
+ * possible; however, KVM currently does not do execution-protection.
+ */
+static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
+{
+ struct rsvd_bits_validate *shadow_zero_check;
+ int i;
+
+ shadow_zero_check = &context->shadow_zero_check;
+
+ if (boot_cpu_is_amd())
+ __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
+ context->root_role.level, true,
+ boot_cpu_has(X86_FEATURE_GBPAGES),
+ false, true);
+ else
+ __reset_rsvds_bits_mask_ept(shadow_zero_check,
+ reserved_hpa_bits(), false,
+ max_huge_page_level);
+
+ if (!shadow_me_mask)
+ return;
+
+ for (i = context->root_role.level; --i >= 0;) {
+ shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
+ shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
+ }
+}
+
+/*
+ * Same as reset_shadow_zero_bits_mask() above, except this applies to the
+ * shadow page table for an Intel nested guest (EPT).
+ */
+static void
+reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
+{
+ __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
+ reserved_hpa_bits(), execonly,
+ max_huge_page_level);
+}
+
+#define BYTE_MASK(access) \
+ ((1 & (access) ? 2 : 0) | \
+ (2 & (access) ? 4 : 0) | \
+ (3 & (access) ? 8 : 0) | \
+ (4 & (access) ? 16 : 0) | \
+ (5 & (access) ? 32 : 0) | \
+ (6 & (access) ? 64 : 0) | \
+ (7 & (access) ? 128 : 0))
+
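+/*
+ * BYTE_MASK(access) sets bit N, for N in 1..7, iff the UWX combination N
+ * shares a bit with @access. E.g., given ACC_EXEC_MASK == 1,
+ * ACC_WRITE_MASK == 2 and ACC_USER_MASK == 4:
+ *
+ *	BYTE_MASK(ACC_EXEC_MASK)  == 0xaa  (combinations 1, 3, 5, 7)
+ *	BYTE_MASK(ACC_WRITE_MASK) == 0xcc  (combinations 2, 3, 6, 7)
+ *	BYTE_MASK(ACC_USER_MASK)  == 0xf0  (combinations 4, 5, 6, 7)
+ */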
+
+static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
+{
+ unsigned byte;
+
+ const u8 x = BYTE_MASK(ACC_EXEC_MASK);
+ const u8 w = BYTE_MASK(ACC_WRITE_MASK);
+ const u8 u = BYTE_MASK(ACC_USER_MASK);
+
+ bool cr4_smep = is_cr4_smep(mmu);
+ bool cr4_smap = is_cr4_smap(mmu);
+ bool cr0_wp = is_cr0_wp(mmu);
+ bool efer_nx = is_efer_nx(mmu);
+
+ for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
+ unsigned pfec = byte << 1;
+
+ /*
+ * Each "*f" variable has a 1 bit for each UWX value
+ * that causes a fault with the given PFEC.
+ */
+
+ /* Faults from writes to non-writable pages */
+ u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
+ /* Faults from user mode accesses to supervisor pages */
+ u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
+		/* Faults from fetches of non-executable pages */
+ u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
+ /* Faults from kernel mode fetches of user pages */
+ u8 smepf = 0;
+ /* Faults from kernel mode accesses of user pages */
+ u8 smapf = 0;
+
+ if (!ept) {
+ /* Faults from kernel mode accesses to user pages */
+ u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
+
+ /* Not really needed: !nx will cause pte.nx to fault */
+ if (!efer_nx)
+ ff = 0;
+
+ /* Allow supervisor writes if !cr0.wp */
+ if (!cr0_wp)
+ wf = (pfec & PFERR_USER_MASK) ? wf : 0;
+
+ /* Disallow supervisor fetches of user code if cr4.smep */
+ if (cr4_smep)
+ smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
+
+ /*
+ * SMAP:kernel-mode data accesses from user-mode
+ * mappings should fault. A fault is considered
+ * as a SMAP violation if all of the following
+ * conditions are true:
+ * - X86_CR4_SMAP is set in CR4
+ * - A user page is accessed
+ * - The access is not a fetch
+ * - The access is supervisor mode
+			 *   - The access is an implicit supervisor access, or X86_EFLAGS_AC is clear
+ *
+ * Here, we cover the first four conditions.
+ * The fifth is computed dynamically in permission_fault();
+ * PFERR_RSVD_MASK bit will be set in PFEC if the access is
+ * *not* subject to SMAP restrictions.
+ */
+ if (cr4_smap)
+ smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
+ }
+
+ mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
+ }
+}
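+
+/*
+ * Worked example for update_permission_bitmask() (non-EPT, CR0.WP=1, no
+ * SMEP/SMAP): a user-mode write has pfec == PFERR_USER_MASK|PFERR_WRITE_MASK,
+ * i.e. byte == 3, wf == (u8)~w == 0x33 and uf == (u8)~u == 0x0f, so
+ * permissions[3] == 0x3f. Only UWX combinations 6 (U+W) and 7 (U+W+X) have
+ * their bits clear: a user write is allowed only to a writable user page.
+ */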
+
+/*
+* PKU is an additional mechanism by which paging controls access to
+* user-mode addresses based on the value in the PKRU register. Protection
+* key violations are reported through a bit in the page fault error code.
+* Unlike other bits of the error code, the PK bit is not known at the
+* call site of e.g. gva_to_gpa; it must be computed directly in
+* permission_fault based on two bits of PKRU, on some machine state (CR4,
+* CR0, EFER, CPL), and on other bits of the error code and the page tables.
+*
+* In particular the following conditions come from the error code, the
+* page tables and the machine state:
+* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
+* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
+* - PK is always zero if U=0 in the page tables
+* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
+*
+* The PKRU bitmask caches the result of these four conditions. The error
+* code (minus the P bit) and the page table's U bit form an index into the
+* PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed
+* with the two bits of the PKRU register corresponding to the protection key.
+* For the first three conditions above the bits will be 00, thus masking
+* away both AD and WD. For all reads or if the last condition holds, WD
+* only will be masked away.
+*/
+static void update_pkru_bitmask(struct kvm_mmu *mmu)
+{
+ unsigned bit;
+ bool wp;
+
+ mmu->pkru_mask = 0;
+
+ if (!is_cr4_pke(mmu))
+ return;
+
+ wp = is_cr0_wp(mmu);
+
+ for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
+ unsigned pfec, pkey_bits;
+ bool check_pkey, check_write, ff, uf, wf, pte_user;
+
+ pfec = bit << 1;
+ ff = pfec & PFERR_FETCH_MASK;
+ uf = pfec & PFERR_USER_MASK;
+ wf = pfec & PFERR_WRITE_MASK;
+
+ /* PFEC.RSVD is replaced by ACC_USER_MASK. */
+ pte_user = pfec & PFERR_RSVD_MASK;
+
+ /*
+		 * Only accesses that are not instruction fetches and are
+		 * to user pages need to be checked.
+ */
+ check_pkey = (!ff && pte_user);
+ /*
+ * write access is controlled by PKRU if it is a
+ * user access or CR0.WP = 1.
+ */
+ check_write = check_pkey && wf && (uf || wp);
+
+ /* PKRU.AD stops both read and write access. */
+ pkey_bits = !!check_pkey;
+ /* PKRU.WD stops write access. */
+ pkey_bits |= (!!check_write) << 1;
+
+ mmu->pkru_mask |= (pkey_bits & 3) << pfec;
+ }
+}
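+
+/*
+ * Rough sketch (paraphrased, see permission_fault() in mmu.h for the
+ * authoritative logic) of how pkru_mask is consumed at fault time:
+ *
+ *	pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3;
+ *	offset = (pfec & ~1) +
+ *		 ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));
+ *	fault |= (pkru_bits & (mmu->pkru_mask >> offset)) != 0;
+ */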
+
+static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
+{
+ if (!is_cr0_pg(mmu))
+ return;
+
+ reset_guest_rsvds_bits_mask(vcpu, mmu);
+ update_permission_bitmask(mmu, false);
+ update_pkru_bitmask(mmu);
+}
+
+static void paging64_init_context(struct kvm_mmu *context)
+{
+ context->page_fault = paging64_page_fault;
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ context->sync_spte = paging64_sync_spte;
+}
+
+static void paging32_init_context(struct kvm_mmu *context)
+{
+ context->page_fault = paging32_page_fault;
+ context->gva_to_gpa = paging32_gva_to_gpa;
+ context->sync_spte = paging32_sync_spte;
+}
+
+static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu,
+ const struct kvm_mmu_role_regs *regs)
+{
+ union kvm_cpu_role role = {0};
+
+ role.base.access = ACC_ALL;
+ role.base.smm = is_smm(vcpu);
+ role.base.guest_mode = is_guest_mode(vcpu);
+ role.ext.valid = 1;
+
+ if (!____is_cr0_pg(regs)) {
+ role.base.direct = 1;
+ return role;
+ }
+
+ role.base.efer_nx = ____is_efer_nx(regs);
+ role.base.cr0_wp = ____is_cr0_wp(regs);
+ role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs);
+ role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs);
+ role.base.has_4_byte_gpte = !____is_cr4_pae(regs);
+
+ if (____is_efer_lma(regs))
+ role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL
+ : PT64_ROOT_4LEVEL;
+ else if (____is_cr4_pae(regs))
+ role.base.level = PT32E_ROOT_LEVEL;
+ else
+ role.base.level = PT32_ROOT_LEVEL;
+
+ role.ext.cr4_smep = ____is_cr4_smep(regs);
+ role.ext.cr4_smap = ____is_cr4_smap(regs);
+ role.ext.cr4_pse = ____is_cr4_pse(regs);
+
+ /* PKEY and LA57 are active iff long mode is active. */
+ role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
+ role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
+ role.ext.efer_lma = ____is_efer_lma(regs);
+ return role;
+}
+
+void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
+{
+ const bool cr0_wp = kvm_is_cr0_bit_set(vcpu, X86_CR0_WP);
+
+ BUILD_BUG_ON((KVM_MMU_CR0_ROLE_BITS & KVM_POSSIBLE_CR0_GUEST_BITS) != X86_CR0_WP);
+ BUILD_BUG_ON((KVM_MMU_CR4_ROLE_BITS & KVM_POSSIBLE_CR4_GUEST_BITS));
+
+ if (is_cr0_wp(mmu) == cr0_wp)
+ return;
+
+ mmu->cpu_role.base.cr0_wp = cr0_wp;
+ reset_guest_paging_metadata(vcpu, mmu);
+}
+
+static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
+{
+	/* tdp_root_level is the architecture-forced level; use it if nonzero. */
+ if (tdp_root_level)
+ return tdp_root_level;
+
+	/*
+	 * Use 5-level TDP if and only if it's useful/necessary, i.e. only
+	 * when the guest's MAXPHYADDR exceeds the 48 bits addressable by
+	 * 4-level paging.
+	 */
+ if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
+ return 4;
+
+ return max_tdp_level;
+}
+
+static union kvm_mmu_page_role
+kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
+ union kvm_cpu_role cpu_role)
+{
+ union kvm_mmu_page_role role = {0};
+
+ role.access = ACC_ALL;
+ role.cr0_wp = true;
+ role.efer_nx = true;
+ role.smm = cpu_role.base.smm;
+ role.guest_mode = cpu_role.base.guest_mode;
+ role.ad_disabled = !kvm_ad_enabled();
+ role.level = kvm_mmu_get_tdp_level(vcpu);
+ role.direct = true;
+ role.has_4_byte_gpte = false;
+
+ return role;
+}
+
+static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
+ union kvm_cpu_role cpu_role)
+{
+ struct kvm_mmu *context = &vcpu->arch.root_mmu;
+ union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role);
+
+ if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
+ root_role.word == context->root_role.word)
+ return;
+
+ context->cpu_role.as_u64 = cpu_role.as_u64;
+ context->root_role.word = root_role.word;
+ context->page_fault = kvm_tdp_page_fault;
+ context->sync_spte = NULL;
+ context->get_guest_pgd = get_guest_cr3;
+ context->get_pdptr = kvm_pdptr_read;
+ context->inject_page_fault = kvm_inject_page_fault;
+
+ if (!is_cr0_pg(context))
+ context->gva_to_gpa = nonpaging_gva_to_gpa;
+ else if (is_cr4_pae(context))
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ else
+ context->gva_to_gpa = paging32_gva_to_gpa;
+
+ reset_guest_paging_metadata(vcpu, context);
+ reset_tdp_shadow_zero_bits_mask(context);
+}
+
+static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+ union kvm_cpu_role cpu_role,
+ union kvm_mmu_page_role root_role)
+{
+ if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
+ root_role.word == context->root_role.word)
+ return;
+
+ context->cpu_role.as_u64 = cpu_role.as_u64;
+ context->root_role.word = root_role.word;
+
+ if (!is_cr0_pg(context))
+ nonpaging_init_context(context);
+ else if (is_cr4_pae(context))
+ paging64_init_context(context);
+ else
+ paging32_init_context(context);
+
+ reset_guest_paging_metadata(vcpu, context);
+ reset_shadow_zero_bits_mask(vcpu, context);
+}
+
+static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
+ union kvm_cpu_role cpu_role)
+{
+ struct kvm_mmu *context = &vcpu->arch.root_mmu;
+ union kvm_mmu_page_role root_role;
+
+ root_role = cpu_role.base;
+
+ /* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */
+ root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL);
+
+ /*
+	 * KVM forces EFER.NX=1 when TDP is disabled; reflect it in the MMU role.
+ * KVM uses NX when TDP is disabled to handle a variety of scenarios,
+ * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
+ * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
+ * The iTLB multi-hit workaround can be toggled at any time, so assume
+ * NX can be used by any non-nested shadow MMU to avoid having to reset
+ * MMU contexts.
+ */
+ root_role.efer_nx = true;
+
+ shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
+}
+
+void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
+ unsigned long cr4, u64 efer, gpa_t nested_cr3)
+{
+ struct kvm_mmu *context = &vcpu->arch.guest_mmu;
+ struct kvm_mmu_role_regs regs = {
+ .cr0 = cr0,
+ .cr4 = cr4 & ~X86_CR4_PKE,
+ .efer = efer,
+ };
+ union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
+ union kvm_mmu_page_role root_role;
+
+ /* NPT requires CR0.PG=1. */
+ WARN_ON_ONCE(cpu_role.base.direct);
+
+ root_role = cpu_role.base;
+ root_role.level = kvm_mmu_get_tdp_level(vcpu);
+ if (root_role.level == PT64_ROOT_5LEVEL &&
+ cpu_role.base.level == PT64_ROOT_4LEVEL)
+ root_role.passthrough = 1;
+
+ shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
+ kvm_mmu_new_pgd(vcpu, nested_cr3);
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
+
+static union kvm_cpu_role
+kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
+ bool execonly, u8 level)
+{
+ union kvm_cpu_role role = {0};
+
+ /*
+ * KVM does not support SMM transfer monitors, and consequently does not
+ * support the "entry to SMM" control either. role.base.smm is always 0.
+ */
+ WARN_ON_ONCE(is_smm(vcpu));
+ role.base.level = level;
+ role.base.has_4_byte_gpte = false;
+ role.base.direct = false;
+ role.base.ad_disabled = !accessed_dirty;
+ role.base.guest_mode = true;
+ role.base.access = ACC_ALL;
+
+ role.ext.word = 0;
+ role.ext.execonly = execonly;
+ role.ext.valid = 1;
+
+ return role;
+}
+
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
+ int huge_page_level, bool accessed_dirty,
+ gpa_t new_eptp)
+{
+ struct kvm_mmu *context = &vcpu->arch.guest_mmu;
+ u8 level = vmx_eptp_page_walk_level(new_eptp);
+ union kvm_cpu_role new_mode =
+ kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
+ execonly, level);
+
+ if (new_mode.as_u64 != context->cpu_role.as_u64) {
+ /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
+ context->cpu_role.as_u64 = new_mode.as_u64;
+ context->root_role.word = new_mode.base.word;
+
+ context->page_fault = ept_page_fault;
+ context->gva_to_gpa = ept_gva_to_gpa;
+ context->sync_spte = ept_sync_spte;
+
+ update_permission_bitmask(context, true);
+ context->pkru_mask = 0;
+ reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
+ reset_ept_shadow_zero_bits_mask(context, execonly);
+ }
+
+ kvm_mmu_new_pgd(vcpu, new_eptp);
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
+
+static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
+ union kvm_cpu_role cpu_role)
+{
+ struct kvm_mmu *context = &vcpu->arch.root_mmu;
+
+ kvm_init_shadow_mmu(vcpu, cpu_role);
+
+ context->get_guest_pgd = get_guest_cr3;
+ context->get_pdptr = kvm_pdptr_read;
+ context->inject_page_fault = kvm_inject_page_fault;
+}
+
+static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
+ union kvm_cpu_role new_mode)
+{
+ struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
+
+ if (new_mode.as_u64 == g_context->cpu_role.as_u64)
+ return;
+
+ g_context->cpu_role.as_u64 = new_mode.as_u64;
+ g_context->get_guest_pgd = get_guest_cr3;
+ g_context->get_pdptr = kvm_pdptr_read;
+ g_context->inject_page_fault = kvm_inject_page_fault;
+
+ /*
+ * L2 page tables are never shadowed, so there is no need to sync
+ * SPTEs.
+ */
+ g_context->sync_spte = NULL;
+
+ /*
+ * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
+ * L1's nested page tables (e.g. EPT12). The nested translation
+ * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
+ * L2's page tables as the first level of translation and L1's
+ * nested page tables as the second level of translation. Basically
+ * the gva_to_gpa functions between mmu and nested_mmu are swapped.
+ */
+ if (!is_paging(vcpu))
+ g_context->gva_to_gpa = nonpaging_gva_to_gpa;
+ else if (is_long_mode(vcpu))
+ g_context->gva_to_gpa = paging64_gva_to_gpa;
+ else if (is_pae(vcpu))
+ g_context->gva_to_gpa = paging64_gva_to_gpa;
+ else
+ g_context->gva_to_gpa = paging32_gva_to_gpa;
+
+ reset_guest_paging_metadata(vcpu, g_context);
+}
+
+void kvm_init_mmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
+ union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
+
+ if (mmu_is_nested(vcpu))
+ init_kvm_nested_mmu(vcpu, cpu_role);
+ else if (tdp_enabled)
+ init_kvm_tdp_mmu(vcpu, cpu_role);
+ else
+ init_kvm_softmmu(vcpu, cpu_role);
+}
+EXPORT_SYMBOL_GPL(kvm_init_mmu);
+
+void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Invalidate all MMU roles to force them to reinitialize as CPUID
+ * information is factored into reserved bit calculations.
+ *
+ * Correctly handling multiple vCPU models with respect to paging and
+	 * physical address properties in a single VM would require tracking
+ * all relevant CPUID information in kvm_mmu_page_role. That is very
+ * undesirable as it would increase the memory requirements for
+ * gfn_write_track (see struct kvm_mmu_page_role comments). For now
+ * that problem is swept under the rug; KVM's CPUID API is horrific and
+ * it's all but impossible to solve it without introducing a new API.
+ */
+ vcpu->arch.root_mmu.root_role.word = 0;
+ vcpu->arch.guest_mmu.root_role.word = 0;
+ vcpu->arch.nested_mmu.root_role.word = 0;
+ vcpu->arch.root_mmu.cpu_role.ext.valid = 0;
+ vcpu->arch.guest_mmu.cpu_role.ext.valid = 0;
+ vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;
+ kvm_mmu_reset_context(vcpu);
+
+ /*
+ * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
+ * kvm_arch_vcpu_ioctl().
+ */
+ KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm);
+}
+
+void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
+{
+ kvm_mmu_unload(vcpu);
+ kvm_init_mmu(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
+
+int kvm_mmu_load(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
+ if (r)
+ goto out;
+ r = mmu_alloc_special_roots(vcpu);
+ if (r)
+ goto out;
+ if (vcpu->arch.mmu->root_role.direct)
+ r = mmu_alloc_direct_roots(vcpu);
+ else
+ r = mmu_alloc_shadow_roots(vcpu);
+ if (r)
+ goto out;
+
+ kvm_mmu_sync_roots(vcpu);
+
+ kvm_mmu_load_pgd(vcpu);
+
+ /*
+	 * Flush any TLB entries for the new root; the provenance of the root
+ * is unknown. Even if KVM ensures there are no stale TLB entries
+ * for a freed root, in theory another hypervisor could have left
+ * stale entries. Flushing on alloc also allows KVM to skip the TLB
+ * flush when freeing a root (see kvm_tdp_mmu_put_root()).
+ */
+ static_call(kvm_x86_flush_tlb_current)(vcpu);
+out:
+ return r;
+}
+
+void kvm_mmu_unload(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
+ WARN_ON_ONCE(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
+ kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+ WARN_ON_ONCE(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+}
+
+static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa)
+{
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(root_hpa))
+ return false;
+
+ /*
+ * When freeing obsolete roots, treat roots as obsolete if they don't
+ * have an associated shadow page, as it's impossible to determine if
+ * such roots are fresh or stale. This does mean KVM will get false
+ * positives and free roots that don't strictly need to be freed, but
+ * such false positives are relatively rare:
+ *
+ * (a) only PAE paging and nested NPT have roots without shadow pages
+ * (or any shadow paging flavor with a dummy root, see note below)
+ * (b) remote reloads due to a memslot update obsoletes _all_ roots
+ * (c) KVM doesn't track previous roots for PAE paging, and the guest
+ * is unlikely to zap an in-use PGD.
+ *
+ * Note! Dummy roots are unique in that they are obsoleted by memslot
+ * _creation_! See also FNAME(fetch).
+ */
+ sp = root_to_sp(root_hpa);
+ return !sp || is_obsolete_sp(kvm, sp);
+}
+
+static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu)
+{
+ unsigned long roots_to_free = 0;
+ int i;
+
+ if (is_obsolete_root(kvm, mmu->root.hpa))
+ roots_to_free |= KVM_MMU_ROOT_CURRENT;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa))
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+ }
+
+ if (roots_to_free)
+ kvm_mmu_free_roots(kvm, mmu, roots_to_free);
+}
+
+void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
+{
+ __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
+ __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
+}
+
+static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
+ int *bytes)
+{
+ u64 gentry = 0;
+ int r;
+
+ /*
+	 * Assume that the PTE write is on a page table of the same type
+	 * as the current vCPU's paging mode, since SPTEs are only updated
+	 * when the modes match.
+ */
+ if (is_pae(vcpu) && *bytes == 4) {
+ /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
+ *gpa &= ~(gpa_t)7;
+ *bytes = 8;
+ }
+
+ if (*bytes == 4 || *bytes == 8) {
+ r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
+ if (r)
+ gentry = 0;
+ }
+
+ return gentry;
+}
+
+/*
+ * If we're seeing too many writes to a page, it may no longer be a page table,
+ * or we may be forking, in which case it is better to unmap the page.
+ */
+static bool detect_write_flooding(struct kvm_mmu_page *sp)
+{
+ /*
+	 * Skip write-flooding detection for SPs at the lowest level (4K),
+	 * because they can become unsync, in which case the guest page is
+	 * no longer write-protected.
+ */
+ if (sp->role.level == PG_LEVEL_4K)
+ return false;
+
+ atomic_inc(&sp->write_flooding_count);
+ return atomic_read(&sp->write_flooding_count) >= 3;
+}
+
+/*
+ * Misaligned accesses are too much trouble to fix up; also, they usually
+ * indicate a page is not used as a page table.
+ */
+static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
+ int bytes)
+{
+ unsigned offset, pte_size, misaligned;
+
+ offset = offset_in_page(gpa);
+ pte_size = sp->role.has_4_byte_gpte ? 4 : 8;
+
+ /*
+	 * Sometimes the OS writes only the last byte to update status bits;
+	 * for example, Linux uses the andb instruction in clear_bit().
+ */
+ if (!(offset & (pte_size - 1)) && bytes == 1)
+ return false;
+
+ misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
+ misaligned |= bytes < 4;
+
+ return misaligned;
+}
+
+static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
+{
+ unsigned page_offset, quadrant;
+ u64 *spte;
+ int level;
+
+ page_offset = offset_in_page(gpa);
+ level = sp->role.level;
+ *nspte = 1;
+ if (sp->role.has_4_byte_gpte) {
+ page_offset <<= 1; /* 32->64 */
+ /*
+ * A 32-bit pde maps 4MB while the shadow pdes map
+ * only 2MB. So we need to double the offset again
+ * and zap two pdes instead of one.
+ */
+ if (level == PT32_ROOT_LEVEL) {
+ page_offset &= ~7; /* kill rounding error */
+ page_offset <<= 1;
+ *nspte = 2;
+ }
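+		/*
+		 * With 4-byte gptes, each shadow page covers only part of the
+		 * guest table page; ignore the write if it lands in a quadrant
+		 * that this sp does not shadow.
+		 */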
+ quadrant = page_offset >> PAGE_SHIFT;
+ page_offset &= ~PAGE_MASK;
+ if (quadrant != sp->role.quadrant)
+ return NULL;
+ }
+
+ spte = &sp->spt[page_offset / sizeof(*spte)];
+ return spte;
+}
+
+void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+ int bytes)
+{
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ struct kvm_mmu_page *sp;
+ LIST_HEAD(invalid_list);
+ u64 entry, gentry, *spte;
+ int npte;
+ bool flush = false;
+
+ /*
+	 * If there are no indirect shadow pages, no guest page is
+	 * write-protected, so there is nothing to do.
+ */
+ if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
+ return;
+
+ write_lock(&vcpu->kvm->mmu_lock);
+
+ gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
+
+ ++vcpu->kvm->stat.mmu_pte_write;
+
+ for_each_gfn_valid_sp_with_gptes(vcpu->kvm, sp, gfn) {
+ if (detect_write_misaligned(sp, gpa, bytes) ||
+ detect_write_flooding(sp)) {
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
+ ++vcpu->kvm->stat.mmu_flooded;
+ continue;
+ }
+
+ spte = get_written_sptes(sp, gpa, &npte);
+ if (!spte)
+ continue;
+
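+		/*
+		 * Zap the affected SPTEs and let the fault path re-shadow the
+		 * written gptes on the next guest access.
+		 */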
+ while (npte--) {
+ entry = *spte;
+ mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
+ if (gentry && sp->role.level != PG_LEVEL_4K)
+ ++vcpu->kvm->stat.mmu_pde_zapped;
+ if (is_shadow_present_pte(entry))
+ flush = true;
+ ++spte;
+ }
+ }
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
+ write_unlock(&vcpu->kvm->mmu_lock);
+}
+
+int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
+ void *insn, int insn_len)
+{
+ int r, emulation_type = EMULTYPE_PF;
+ bool direct = vcpu->arch.mmu->root_role.direct;
+
+ /*
+ * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP
+	 * checks when emulating instructions that trigger implicit accesses.
+ * WARN if hardware generates a fault with an error code that collides
+ * with the KVM-defined value. Clear the flag and continue on, i.e.
+ * don't terminate the VM, as KVM can't possibly be relying on a flag
+ * that KVM doesn't know about.
+ */
+ if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS))
+ error_code &= ~PFERR_IMPLICIT_ACCESS;
+
+ if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
+ return RET_PF_RETRY;
+
+ r = RET_PF_INVALID;
+ if (unlikely(error_code & PFERR_RSVD_MASK)) {
+ r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
+ if (r == RET_PF_EMULATE)
+ goto emulate;
+ }
+
+ if (r == RET_PF_INVALID) {
+ r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
+ lower_32_bits(error_code), false,
+ &emulation_type);
+ if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
+ return -EIO;
+ }
+
+ if (r < 0)
+ return r;
+ if (r != RET_PF_EMULATE)
+ return 1;
+
+ /*
+ * Before emulating the instruction, check if the error code
+	 * was due to an RO violation while translating the guest page.
+ * This can occur when using nested virtualization with nested
+ * paging in both guests. If true, we simply unprotect the page
+ * and resume the guest.
+ */
+ if (vcpu->arch.mmu->root_role.direct &&
+ (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
+ return 1;
+ }
+
+ /*
+	 * vcpu->arch.mmu->page_fault returned RET_PF_EMULATE, but we can still
+ * optimistically try to just unprotect the page and let the processor
+ * re-execute the instruction that caused the page fault. Do not allow
+ * retrying MMIO emulation, as it's not only pointless but could also
+ * cause us to enter an infinite loop because the processor will keep
+ * faulting on the non-existent MMIO address. Retrying an instruction
+ * from a nested guest is also pointless and dangerous as we are only
+ * explicitly shadowing L1's page tables, i.e. unprotecting something
+	 * for L1 isn't going to magically fix whatever issue caused L2 to fail.
+ */
+ if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
+ emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
+emulate:
+ return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
+ insn_len);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
+
+static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ u64 addr, hpa_t root_hpa)
+{
+ struct kvm_shadow_walk_iterator iterator;
+
+ vcpu_clear_mmio_info(vcpu, addr);
+
+ /*
+ * Walking and synchronizing SPTEs both assume they are operating in
+ * the context of the current MMU, and would need to be reworked if
+ * this is ever used to sync the guest_mmu, e.g. to emulate INVEPT.
+ */
+ if (WARN_ON_ONCE(mmu != vcpu->arch.mmu))
+ return;
+
+ if (!VALID_PAGE(root_hpa))
+ return;
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ for_each_shadow_entry_using_root(vcpu, root_hpa, addr, iterator) {
+ struct kvm_mmu_page *sp = sptep_to_sp(iterator.sptep);
+
+ if (sp->unsync) {
+ int ret = kvm_sync_spte(vcpu, sp, iterator.index);
+
+ if (ret < 0)
+ mmu_page_zap_pte(vcpu->kvm, sp, iterator.sptep, NULL);
+ if (ret)
+ kvm_flush_remote_tlbs_sptep(vcpu->kvm, iterator.sptep);
+ }
+
+ if (!sp->unsync_children)
+ break;
+ }
+ write_unlock(&vcpu->kvm->mmu_lock);
+}
+
+void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ u64 addr, unsigned long roots)
+{
+ int i;
+
+ WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL);
+
+ /* It's actually a GPA for vcpu->arch.guest_mmu. */
+ if (mmu != &vcpu->arch.guest_mmu) {
+ /* INVLPG on a non-canonical address is a NOP according to the SDM. */
+ if (is_noncanonical_address(addr, vcpu))
+ return;
+
+ static_call(kvm_x86_flush_tlb_gva)(vcpu, addr);
+ }
+
+ if (!mmu->sync_spte)
+ return;
+
+ if (roots & KVM_MMU_ROOT_CURRENT)
+ __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->root.hpa);
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ if (roots & KVM_MMU_ROOT_PREVIOUS(i))
+ __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->prev_roots[i].hpa);
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_addr);
+
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ /*
+ * INVLPG is required to invalidate any global mappings for the VA,
+ * irrespective of PCID. Blindly sync all roots as it would take
+ * roughly the same amount of work/time to determine whether any of the
+ * previous roots have a global mapping.
+ *
+ * Mappings not reachable via the current or previous cached roots will
+ * be synced when switching to that new cr3, so nothing needs to be
+ * done here for them.
+ */
+ kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL);
+ ++vcpu->stat.invlpg;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
+
+
+void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ unsigned long roots = 0;
+ uint i;
+
+ if (pcid == kvm_get_active_pcid(vcpu))
+ roots |= KVM_MMU_ROOT_CURRENT;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
+ pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd))
+ roots |= KVM_MMU_ROOT_PREVIOUS(i);
+ }
+
+ if (roots)
+ kvm_mmu_invalidate_addr(vcpu, mmu, gva, roots);
+ ++vcpu->stat.invlpg;
+
+ /*
+ * Mappings not reachable via the current cr3 or the prev_roots will be
+ * synced when switching to that cr3, so nothing needs to be done here
+ * for them.
+ */
+}
+
+void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
+ int tdp_max_root_level, int tdp_huge_page_level)
+{
+ tdp_enabled = enable_tdp;
+ tdp_root_level = tdp_forced_root_level;
+ max_tdp_level = tdp_max_root_level;
+
+#ifdef CONFIG_X86_64
+ tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled;
+#endif
+ /*
+ * max_huge_page_level reflects KVM's MMU capabilities irrespective
+ * of kernel support, e.g. KVM may be capable of using 1GB pages when
+ * the kernel is not. But, KVM never creates a page size greater than
+ * what is used by the kernel for any given HVA, i.e. the kernel's
+ * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
+ */
+ if (tdp_enabled)
+ max_huge_page_level = tdp_huge_page_level;
+ else if (boot_cpu_has(X86_FEATURE_GBPAGES))
+ max_huge_page_level = PG_LEVEL_1G;
+ else
+ max_huge_page_level = PG_LEVEL_2M;
+}
+EXPORT_SYMBOL_GPL(kvm_configure_mmu);
+
+/* The return value indicates if tlb flush on all vcpus is needed. */
+typedef bool (*slot_rmaps_handler) (struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot);
+
+static __always_inline bool __walk_slot_rmaps(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ int start_level, int end_level,
+ gfn_t start_gfn, gfn_t end_gfn,
+ bool flush_on_yield, bool flush)
+{
+ struct slot_rmap_walk_iterator iterator;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ for_each_slot_rmap_range(slot, start_level, end_level, start_gfn,
+ end_gfn, &iterator) {
+ if (iterator.rmap)
+ flush |= fn(kvm, iterator.rmap, slot);
+
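+		/*
+		 * If a TLB flush is pending, perform it before yielding
+		 * mmu_lock so that other tasks never observe stale, unflushed
+		 * SPTEs.
+		 */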
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
+ if (flush && flush_on_yield) {
+ kvm_flush_remote_tlbs_range(kvm, start_gfn,
+ iterator.gfn - start_gfn + 1);
+ flush = false;
+ }
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+ }
+ }
+
+ return flush;
+}
+
+static __always_inline bool walk_slot_rmaps(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ int start_level, int end_level,
+ bool flush_on_yield)
+{
+ return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level,
+ slot->base_gfn, slot->base_gfn + slot->npages - 1,
+ flush_on_yield, false);
+}
+
+static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ bool flush_on_yield)
+{
+ return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield);
+}
+
+static void free_mmu_pages(struct kvm_mmu *mmu)
+{
+ if (!tdp_enabled && mmu->pae_root)
+ set_memory_encrypted((unsigned long)mmu->pae_root, 1);
+ free_page((unsigned long)mmu->pae_root);
+ free_page((unsigned long)mmu->pml4_root);
+ free_page((unsigned long)mmu->pml5_root);
+}
+
+static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+{
+ struct page *page;
+ int i;
+
+ mmu->root.hpa = INVALID_PAGE;
+ mmu->root.pgd = 0;
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+
+ /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */
+ if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu)
+ return 0;
+
+ /*
+ * When using PAE paging, the four PDPTEs are treated as 'root' pages,
+ * while the PDP table is a per-vCPU construct that's allocated at MMU
+ * creation. When emulating 32-bit mode, cr3 is only 32 bits even on
+ * x86_64. Therefore we need to allocate the PDP table in the first
+ * 4GB of memory, which happens to fit the DMA32 zone. TDP paging
+ * generally doesn't use PAE paging and can skip allocating the PDP
+ * table. The main exception, handled here, is SVM's 32-bit NPT. The
+ * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
+ * KVM; that horror is handled on-demand by mmu_alloc_special_roots().
+ */
+ if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
+ return 0;
+
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
+ if (!page)
+ return -ENOMEM;
+
+ mmu->pae_root = page_address(page);
+
+ /*
+ * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
+ * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so
+ * that KVM's writes and the CPU's reads get along. Note, this is
+ * only necessary when using shadow paging, as 64-bit NPT can get at
+ * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
+ * by 32-bit kernels (when KVM itself uses 32-bit NPT).
+ */
+ if (!tdp_enabled)
+ set_memory_decrypted((unsigned long)mmu->pae_root, 1);
+ else
+ WARN_ON_ONCE(shadow_me_value);
+
+ for (i = 0; i < 4; ++i)
+ mmu->pae_root[i] = INVALID_PAE_ROOT;
+
+ return 0;
+}
+
+int kvm_mmu_create(struct kvm_vcpu *vcpu)
+{
+ int ret;
+
+ vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
+ vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
+
+ vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
+ vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
+
+ vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
+
+ vcpu->arch.mmu = &vcpu->arch.root_mmu;
+ vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
+
+ ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
+ if (ret)
+ return ret;
+
+ ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
+ if (ret)
+ goto fail_allocate_root;
+
+ return ret;
+ fail_allocate_root:
+ free_mmu_pages(&vcpu->arch.guest_mmu);
+ return ret;
+}
+
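+/* Zap at least this many shadow pages between mmu_lock yields. */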
+#define BATCH_ZAP_PAGES 10
+static void kvm_zap_obsolete_pages(struct kvm *kvm)
+{
+ struct kvm_mmu_page *sp, *node;
+ int nr_zapped, batch = 0;
+ bool unstable;
+
+restart:
+ list_for_each_entry_safe_reverse(sp, node,
+ &kvm->arch.active_mmu_pages, link) {
+ /*
+ * No obsolete valid page exists before a newly created page
+ * since active_mmu_pages is a FIFO list.
+ */
+ if (!is_obsolete_sp(kvm, sp))
+ break;
+
+ /*
+ * Invalid pages should never land back on the list of active
+ * pages. Skip the bogus page, otherwise we'll get stuck in an
+ * infinite loop if the page gets put back on the list (again).
+ */
+ if (WARN_ON_ONCE(sp->role.invalid))
+ continue;
+
+ /*
+ * No need to flush the TLB since we're only zapping shadow
+		 * pages with an obsolete generation number and all vCPUs have
+ * loaded a new root, i.e. the shadow pages being zapped cannot
+ * be in active use by the guest.
+ */
+ if (batch >= BATCH_ZAP_PAGES &&
+ cond_resched_rwlock_write(&kvm->mmu_lock)) {
+ batch = 0;
+ goto restart;
+ }
+
+ unstable = __kvm_mmu_prepare_zap_page(kvm, sp,
+ &kvm->arch.zapped_obsolete_pages, &nr_zapped);
+ batch += nr_zapped;
+
+ if (unstable)
+ goto restart;
+ }
+
+ /*
+ * Kick all vCPUs (via remote TLB flush) before freeing the page tables
+ * to ensure KVM is not in the middle of a lockless shadow page table
+ * walk, which may reference the pages. The remote TLB flush itself is
+ * not required and is simply a convenient way to kick vCPUs as needed.
+ * KVM performs a local TLB flush when allocating a new root (see
+	 * kvm_mmu_load()), and the reload in the caller ensures no vCPUs are
+ * running with an obsolete MMU.
+ */
+ kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
+}
+
+/*
+ * Fast invalidate all shadow pages and use the lock-break technique
+ * to zap obsolete pages.
+ *
+ * This is required when a memslot is being deleted or the VM is being
+ * destroyed; in those cases, the KVM MMU must not use any resource of
+ * the slot(s) in question after this function returns.
+ */
+static void kvm_mmu_zap_all_fast(struct kvm *kvm)
+{
+ lockdep_assert_held(&kvm->slots_lock);
+
+ write_lock(&kvm->mmu_lock);
+ trace_kvm_mmu_zap_all_fast(kvm);
+
+ /*
+ * Toggle mmu_valid_gen between '0' and '1'. Because slots_lock is
+ * held for the entire duration of zapping obsolete pages, it's
+ * impossible for there to be multiple invalid generations associated
+ * with *valid* shadow pages at any given time, i.e. there is exactly
+ * one valid generation and (at most) one invalid generation.
+ */
+ kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
+
+ /*
+ * In order to ensure all vCPUs drop their soon-to-be invalid roots,
+ * invalidating TDP MMU roots must be done while holding mmu_lock for
+ * write and in the same critical section as making the reload request,
+ * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
+ */
+ if (tdp_mmu_enabled)
+ kvm_tdp_mmu_invalidate_all_roots(kvm);
+
+ /*
+	 * Notify all vcpus to reload their shadow page tables and flush the
+	 * TLB.  All vcpus will then switch to new shadow page tables with the
+	 * new mmu_valid_gen.
+	 *
+	 * Note: this must be done under the protection of mmu_lock; otherwise
+	 * a vcpu could purge a shadow page but miss the tlb flush.
+ */
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
+
+ kvm_zap_obsolete_pages(kvm);
+
+ write_unlock(&kvm->mmu_lock);
+
+ /*
+	 * Zap the invalidated TDP MMU roots; all SPTEs must be dropped before
+	 * returning to the caller.  E.g. if the zap is in response to a memslot
+	 * deletion, mmu_notifier callbacks will be unable to reach the SPTEs
+	 * associated with the deleted memslot once the update completes, and
+	 * deferring the zap until the final reference to the root is put would
+	 * lead to use-after-free.
+ */
+ if (tdp_mmu_enabled)
+ kvm_tdp_mmu_zap_invalidated_roots(kvm);
+}
+
+static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
+{
+ return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
+}
+
+void kvm_mmu_init_vm(struct kvm *kvm)
+{
+ INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
+ INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
+ spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
+
+ if (tdp_mmu_enabled)
+ kvm_mmu_init_tdp_mmu(kvm);
+
+ kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
+ kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
+
+ kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO;
+
+ kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
+ kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
+}
+
+static void mmu_free_vm_memory_caches(struct kvm *kvm)
+{
+ kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache);
+ kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache);
+ kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache);
+}
+
+void kvm_mmu_uninit_vm(struct kvm *kvm)
+{
+ if (tdp_mmu_enabled)
+ kvm_mmu_uninit_tdp_mmu(kvm);
+
+ mmu_free_vm_memory_caches(kvm);
+}
+
+static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
+{
+ const struct kvm_memory_slot *memslot;
+ struct kvm_memslots *slots;
+ struct kvm_memslot_iter iter;
+ bool flush = false;
+ gfn_t start, end;
+ int i;
+
+ if (!kvm_memslots_have_rmaps(kvm))
+ return flush;
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+
+ kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
+ memslot = iter.slot;
+ start = max(gfn_start, memslot->base_gfn);
+ end = min(gfn_end, memslot->base_gfn + memslot->npages);
+ if (WARN_ON_ONCE(start >= end))
+ continue;
+
+ flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap,
+ PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+ start, end - 1, true, flush);
+ }
+ }
+
+ return flush;
+}
+
+/*
+ * Invalidate (zap) SPTEs that cover GFNs from gfn_start up to, but not
+ * including, gfn_end.
+ */
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
+{
+ bool flush;
+
+ if (WARN_ON_ONCE(gfn_end <= gfn_start))
+ return;
+
+ write_lock(&kvm->mmu_lock);
+
+ kvm_mmu_invalidate_begin(kvm, 0, -1ul);
+
+ flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
+
+ if (tdp_mmu_enabled)
+ flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start, gfn_end, flush);
+
+ if (flush)
+ kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start);
+
+ kvm_mmu_invalidate_end(kvm, 0, -1ul);
+
+ write_unlock(&kvm->mmu_lock);
+}
+
+static bool slot_rmap_write_protect(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot)
+{
+ return rmap_write_protect(rmap_head, false);
+}
+
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ int start_level)
+{
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ walk_slot_rmaps(kvm, memslot, slot_rmap_write_protect,
+ start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
+ write_unlock(&kvm->mmu_lock);
+ }
+
+ if (tdp_mmu_enabled) {
+ read_lock(&kvm->mmu_lock);
+ kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
+ read_unlock(&kvm->mmu_lock);
+ }
+}
+
+static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min)
+{
+ return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
+}
+
+static bool need_topup_split_caches_or_resched(struct kvm *kvm)
+{
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
+ return true;
+
+ /*
+ * In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed
+ * to split a single huge page. Calculating how many are actually needed
+ * is possible but not worth the complexity.
+ */
+ return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) ||
+ need_topup(&kvm->arch.split_page_header_cache, 1) ||
+ need_topup(&kvm->arch.split_shadow_page_cache, 1);
+}
+
+static int topup_split_caches(struct kvm *kvm)
+{
+ /*
+ * Allocating rmap list entries when splitting huge pages for nested
+ * MMUs is uncommon as KVM needs to use a list if and only if there is
+ * more than one rmap entry for a gfn, i.e. requires an L1 gfn to be
+ * aliased by multiple L2 gfns and/or from multiple nested roots with
+ * different roles. Aliasing gfns when using TDP is atypical for VMMs;
+ * a few gfns are often aliased during boot, e.g. when remapping BIOS,
+ * but aliasing rarely occurs post-boot or for many gfns. If there is
+ * only one rmap entry, rmap->val points directly at that one entry and
+ * doesn't need to allocate a list. Buffer the cache by the default
+ * capacity so that KVM doesn't have to drop mmu_lock to topup if KVM
+ * encounters an aliased gfn or two.
+ */
+ const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS +
+ KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE;
+ int r;
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, capacity,
+ SPLIT_DESC_CACHE_MIN_NR_OBJECTS);
+ if (r)
+ return r;
+
+ r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1);
+ if (r)
+ return r;
+
+ return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1);
+}
+
+static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep)
+{
+ struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
+ struct shadow_page_caches caches = {};
+ union kvm_mmu_page_role role;
+ unsigned int access;
+ gfn_t gfn;
+
+ gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
+ access = kvm_mmu_page_get_access(huge_sp, spte_index(huge_sptep));
+
+ /*
+ * Note, huge page splitting always uses direct shadow pages, regardless
+ * of whether the huge page itself is mapped by a direct or indirect
+ * shadow page, since the huge page region itself is being directly
+ * mapped with smaller pages.
+ */
+ role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access);
+
+ /* Direct SPs do not require a shadowed_info_cache. */
+ caches.page_header_cache = &kvm->arch.split_page_header_cache;
+ caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache;
+
+ /* Safe to pass NULL for vCPU since requesting a direct SP. */
+ return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
+}
+
+static void shadow_mmu_split_huge_page(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ u64 *huge_sptep)
+
+{
+ struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache;
+ u64 huge_spte = READ_ONCE(*huge_sptep);
+ struct kvm_mmu_page *sp;
+ bool flush = false;
+ u64 *sptep, spte;
+ gfn_t gfn;
+ int index;
+
+ sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep);
+
+ for (index = 0; index < SPTE_ENT_PER_PAGE; index++) {
+ sptep = &sp->spt[index];
+ gfn = kvm_mmu_page_get_gfn(sp, index);
+
+ /*
+ * The SP may already have populated SPTEs, e.g. if this huge
+ * page is aliased by multiple sptes with the same access
+ * permissions. These entries are guaranteed to map the same
+ * gfn-to-pfn translation since the SP is direct, so no need to
+ * modify them.
+ *
+ * However, if a given SPTE points to a lower level page table,
+ * that lower level page table may only be partially populated.
+		 * Installing such SPTEs would effectively unmap a portion of the
+ * huge page. Unmapping guest memory always requires a TLB flush
+ * since a subsequent operation on the unmapped regions would
+ * fail to detect the need to flush.
+ */
+ if (is_shadow_present_pte(*sptep)) {
+ flush |= !is_last_spte(*sptep, sp->role.level);
+ continue;
+ }
+
+ spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index);
+ mmu_spte_set(sptep, spte);
+ __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access);
+ }
+
+ __link_shadow_page(kvm, cache, huge_sptep, sp, flush);
+}
+
+static int shadow_mmu_try_split_huge_page(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ u64 *huge_sptep)
+{
+ struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
+ int level, r = 0;
+ gfn_t gfn;
+ u64 spte;
+
+ /* Grab information for the tracepoint before dropping the MMU lock. */
+ gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
+ level = huge_sp->role.level;
+ spte = *huge_sptep;
+
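+	/* Bail if splitting would eat into KVM's minimum free MMU page reserve. */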
+ if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) {
+ r = -ENOSPC;
+ goto out;
+ }
+
+ if (need_topup_split_caches_or_resched(kvm)) {
+ write_unlock(&kvm->mmu_lock);
+ cond_resched();
+ /*
+ * If the topup succeeds, return -EAGAIN to indicate that the
+ * rmap iterator should be restarted because the MMU lock was
+ * dropped.
+ */
+ r = topup_split_caches(kvm) ?: -EAGAIN;
+ write_lock(&kvm->mmu_lock);
+ goto out;
+ }
+
+ shadow_mmu_split_huge_page(kvm, slot, huge_sptep);
+
+out:
+ trace_kvm_mmu_split_huge_page(gfn, spte, level, r);
+ return r;
+}
+
+static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot)
+{
+ struct rmap_iterator iter;
+ struct kvm_mmu_page *sp;
+ u64 *huge_sptep;
+ int r;
+
+restart:
+ for_each_rmap_spte(rmap_head, &iter, huge_sptep) {
+ sp = sptep_to_sp(huge_sptep);
+
+ /* TDP MMU is enabled, so rmap only contains nested MMU SPs. */
+ if (WARN_ON_ONCE(!sp->role.guest_mode))
+ continue;
+
+ /* The rmaps should never contain non-leaf SPTEs. */
+ if (WARN_ON_ONCE(!is_large_pte(*huge_sptep)))
+ continue;
+
+		/* SPs with level >PG_LEVEL_4K should never be unsync. */
+ if (WARN_ON_ONCE(sp->unsync))
+ continue;
+
+ /* Don't bother splitting huge pages on invalid SPs. */
+ if (sp->role.invalid)
+ continue;
+
+ r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep);
+
+ /*
+ * The split succeeded or needs to be retried because the MMU
+ * lock was dropped. Either way, restart the iterator to get it
+ * back into a consistent state.
+ */
+ if (!r || r == -EAGAIN)
+ goto restart;
+
+ /* The split failed and shouldn't be retried (e.g. -ENOMEM). */
+ break;
+ }
+
+ return false;
+}
+
+static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end,
+ int target_level)
+{
+ int level;
+
+ /*
+ * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working
+ * down to the target level. This ensures pages are recursively split
+ * all the way to the target level. There's no need to split pages
+ * already at the target level.
+ */
+ for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--)
+ __walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages,
+ level, level, start, end - 1, true, false);
+}
+
+/* Must be called with the mmu_lock held in write-mode. */
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ u64 start, u64 end,
+ int target_level)
+{
+ if (!tdp_mmu_enabled)
+ return;
+
+ if (kvm_memslots_have_rmaps(kvm))
+ kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
+
+ kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false);
+
+ /*
+	 * A TLB flush is unnecessary at this point for the same reasons as in
+ * kvm_mmu_slot_try_split_huge_pages().
+ */
+}
+
+void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ int target_level)
+{
+ u64 start = memslot->base_gfn;
+ u64 end = start + memslot->npages;
+
+ if (!tdp_mmu_enabled)
+ return;
+
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
+ write_unlock(&kvm->mmu_lock);
+ }
+
+ read_lock(&kvm->mmu_lock);
+ kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
+ read_unlock(&kvm->mmu_lock);
+
+ /*
+ * No TLB flush is necessary here. KVM will flush TLBs after
+ * write-protecting and/or clearing dirty on the newly split SPTEs to
+ * ensure that guest writes are reflected in the dirty log before the
+ * ioctl to enable dirty logging on this memslot completes. Since the
+ * split SPTEs retain the write and dirty bits of the huge SPTE, it is
+ * safe for KVM to decide if a TLB flush is necessary based on the split
+ * SPTEs.
+ */
+}
+
+static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ int need_tlb_flush = 0;
+ struct kvm_mmu_page *sp;
+
+restart:
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
+ sp = sptep_to_sp(sptep);
+
+ /*
+		 * Huge page mappings cannot be created for indirect shadow
+		 * pages, which are found on the last rmap (level = 1) when not
+		 * using tdp; such shadow pages are kept in sync with the guest
+		 * page table, and the guest page table uses 4K mappings if the
+		 * indirect sp has level = 1.
+ */
+ if (sp->role.direct &&
+ sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
+ PG_LEVEL_NUM)) {
+ kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
+
+ if (kvm_available_flush_remote_tlbs_range())
+ kvm_flush_remote_tlbs_sptep(kvm, sptep);
+ else
+ need_tlb_flush = 1;
+
+ goto restart;
+ }
+ }
+
+ return need_tlb_flush;
+}
+
+static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
+{
+ /*
+ * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap
+ * pages that are already mapped at the maximum hugepage level.
+ */
+ if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte,
+ PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
+ kvm_flush_remote_tlbs_memslot(kvm, slot);
+}
+
+void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
+{
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ kvm_rmap_zap_collapsible_sptes(kvm, slot);
+ write_unlock(&kvm->mmu_lock);
+ }
+
+ if (tdp_mmu_enabled) {
+ read_lock(&kvm->mmu_lock);
+ kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
+ read_unlock(&kvm->mmu_lock);
+ }
+}
+
+void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot)
+{
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ /*
+ * Clear dirty bits only on 4k SPTEs since the legacy MMU only
+		 * supports dirty logging at a 4k granularity.
+ */
+ walk_slot_rmaps_4k(kvm, memslot, __rmap_clear_dirty, false);
+ write_unlock(&kvm->mmu_lock);
+ }
+
+ if (tdp_mmu_enabled) {
+ read_lock(&kvm->mmu_lock);
+ kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
+ read_unlock(&kvm->mmu_lock);
+ }
+
+ /*
+ * The caller will flush the TLBs after this function returns.
+ *
+	 * It's also safe to flush TLBs outside of mmu_lock here, as this
+	 * function is currently only used for dirty logging, in which case
+	 * flushing TLBs outside of mmu_lock also guarantees no dirty pages
+	 * will be lost in dirty_bitmap.
+ */
+}
+
+static void kvm_mmu_zap_all(struct kvm *kvm)
+{
+ struct kvm_mmu_page *sp, *node;
+ LIST_HEAD(invalid_list);
+ int ign;
+
+ write_lock(&kvm->mmu_lock);
+restart:
+ list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
+ if (WARN_ON_ONCE(sp->role.invalid))
+ continue;
+ if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
+ goto restart;
+ if (cond_resched_rwlock_write(&kvm->mmu_lock))
+ goto restart;
+ }
+
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
+ if (tdp_mmu_enabled)
+ kvm_tdp_mmu_zap_all(kvm);
+
+ write_unlock(&kvm->mmu_lock);
+}
+
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+ kvm_mmu_zap_all(kvm);
+}
+
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ kvm_mmu_zap_all_fast(kvm);
+}
+
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
+{
+ WARN_ON_ONCE(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
+
+ gen &= MMIO_SPTE_GEN_MASK;
+
+ /*
+ * Generation numbers are incremented in multiples of the number of
+ * address spaces in order to provide unique generations across all
+ * address spaces. Strip what is effectively the address space
+ * modifier prior to checking for a wrap of the MMIO generation so
+ * that a wrap in any address space is detected.
+ */
+ gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
+
+ /*
+ * The very rare case: if the MMIO generation number has wrapped,
+ * zap all shadow pages.
+ */
+ if (unlikely(gen == 0)) {
+ kvm_debug_ratelimited("zapping shadow pages for mmio generation wraparound\n");
+ kvm_mmu_zap_all_fast(kvm);
+ }
+}
+
+static unsigned long mmu_shrink_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct kvm *kvm;
+ int nr_to_scan = sc->nr_to_scan;
+ unsigned long freed = 0;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ int idx;
+ LIST_HEAD(invalid_list);
+
+ /*
+ * Never scan more than sc->nr_to_scan VM instances.
+		 * In practice this condition is never hit, since we do not try
+ * to shrink more than one VM and it is very unlikely to see
+ * !n_used_mmu_pages so many times.
+ */
+ if (!nr_to_scan--)
+ break;
+ /*
+ * n_used_mmu_pages is accessed without holding kvm->mmu_lock
+		 * here. We may skip a VM instance erroneously, but we do not
+ * want to shrink a VM that only started to populate its MMU
+ * anyway.
+ */
+ if (!kvm->arch.n_used_mmu_pages &&
+ !kvm_has_zapped_obsolete_pages(kvm))
+ continue;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ write_lock(&kvm->mmu_lock);
+
+ if (kvm_has_zapped_obsolete_pages(kvm)) {
+ kvm_mmu_commit_zap_page(kvm,
+ &kvm->arch.zapped_obsolete_pages);
+ goto unlock;
+ }
+
+ freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
+
+unlock:
+ write_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ /*
+ * unfair on small ones
+ * per-vm shrinkers cry out
+ * sadness comes quickly
+ */
+ list_move_tail(&kvm->vm_list, &vm_list);
+ break;
+ }
+
+ mutex_unlock(&kvm_lock);
+ return freed;
+}
+
+static unsigned long mmu_shrink_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
+}
+
+static struct shrinker mmu_shrinker = {
+ .count_objects = mmu_shrink_count,
+ .scan_objects = mmu_shrink_scan,
+ .seeks = DEFAULT_SEEKS * 10,
+};
+
+static void mmu_destroy_caches(void)
+{
+ kmem_cache_destroy(pte_list_desc_cache);
+ kmem_cache_destroy(mmu_page_header_cache);
+}
+
+static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp)
+{
+ if (nx_hugepage_mitigation_hard_disabled)
+ return sysfs_emit(buffer, "never\n");
+
+ return param_get_bool(buffer, kp);
+}
+
+static bool get_nx_auto_mode(void)
+{
+ /* Return true when CPU has the bug, and mitigations are ON */
+ return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
+}
+
+static void __set_nx_huge_pages(bool val)
+{
+ nx_huge_pages = itlb_multihit_kvm_mitigation = val;
+}
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
+{
+ bool old_val = nx_huge_pages;
+ bool new_val;
+
+ if (nx_hugepage_mitigation_hard_disabled)
+ return -EPERM;
+
+ /* In "auto" mode deploy workaround only if CPU has the bug. */
+ if (sysfs_streq(val, "off")) {
+ new_val = 0;
+ } else if (sysfs_streq(val, "force")) {
+ new_val = 1;
+ } else if (sysfs_streq(val, "auto")) {
+ new_val = get_nx_auto_mode();
+ } else if (sysfs_streq(val, "never")) {
+ new_val = 0;
+
+ mutex_lock(&kvm_lock);
+ if (!list_empty(&vm_list)) {
+ mutex_unlock(&kvm_lock);
+ return -EBUSY;
+ }
+ nx_hugepage_mitigation_hard_disabled = true;
+ mutex_unlock(&kvm_lock);
+ } else if (kstrtobool(val, &new_val) < 0) {
+ return -EINVAL;
+ }
+
+ __set_nx_huge_pages(new_val);
+
+ if (new_val != old_val) {
+ struct kvm *kvm;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ mutex_lock(&kvm->slots_lock);
+ kvm_mmu_zap_all_fast(kvm);
+ mutex_unlock(&kvm->slots_lock);
+
+ wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
+ }
+ mutex_unlock(&kvm_lock);
+ }
+
+ return 0;
+}
+
+/*
+ * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as
+ * its default value of -1 is technically undefined behavior for a boolean.
+ * Forward the module init call to SPTE code so that it too can handle module
+ * params that need to be resolved/snapshot.
+ */
+void __init kvm_mmu_x86_module_init(void)
+{
+ if (nx_huge_pages == -1)
+ __set_nx_huge_pages(get_nx_auto_mode());
+
+ /*
+ * Snapshot userspace's desire to enable the TDP MMU. Whether or not the
+ * TDP MMU is actually enabled is determined in kvm_configure_mmu()
+ * when the vendor module is loaded.
+ */
+ tdp_mmu_allowed = tdp_mmu_enabled;
+
+ kvm_mmu_spte_module_init();
+}
+
+/*
+ * The bulk of the MMU initialization is deferred until the vendor module is
+ * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need
+ * to be reset when a potentially different vendor module is loaded.
+ */
+int kvm_mmu_vendor_module_init(void)
+{
+ int ret = -ENOMEM;
+
+ /*
+ * MMU roles use union aliasing which is, generally speaking, an
+ * undefined behavior. However, we supposedly know how compilers behave
+	 * and the current status quo is unlikely to change.  The BUILD_BUG_ON()
+	 * guards below will let us know if that assumption becomes false.
+ */
+ BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
+ BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
+ BUILD_BUG_ON(sizeof(union kvm_cpu_role) != sizeof(u64));
+
+ kvm_mmu_reset_all_pte_masks();
+
+ pte_list_desc_cache = kmem_cache_create("pte_list_desc",
+ sizeof(struct pte_list_desc),
+ 0, SLAB_ACCOUNT, NULL);
+ if (!pte_list_desc_cache)
+ goto out;
+
+ mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
+ sizeof(struct kvm_mmu_page),
+ 0, SLAB_ACCOUNT, NULL);
+ if (!mmu_page_header_cache)
+ goto out;
+
+ if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
+ goto out;
+
+ ret = register_shrinker(&mmu_shrinker, "x86-mmu");
+ if (ret)
+ goto out_shrinker;
+
+ return 0;
+
+out_shrinker:
+ percpu_counter_destroy(&kvm_total_used_mmu_pages);
+out:
+ mmu_destroy_caches();
+ return ret;
+}
+
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+ kvm_mmu_unload(vcpu);
+ free_mmu_pages(&vcpu->arch.root_mmu);
+ free_mmu_pages(&vcpu->arch.guest_mmu);
+ mmu_free_memory_caches(vcpu);
+}
+
+void kvm_mmu_vendor_module_exit(void)
+{
+ mmu_destroy_caches();
+ percpu_counter_destroy(&kvm_total_used_mmu_pages);
+ unregister_shrinker(&mmu_shrinker);
+}
+
+/*
+ * Calculate the effective recovery period, accounting for '0' meaning "let KVM
+ * select a halving time of 1 hour". Returns true if recovery is enabled.
+ */
+static bool calc_nx_huge_pages_recovery_period(uint *period)
+{
+ /*
+ * Use READ_ONCE to get the params, this may be called outside of the
+ * param setters, e.g. by the kthread to compute its next timeout.
+ */
+ bool enabled = READ_ONCE(nx_huge_pages);
+ uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+
+ if (!enabled || !ratio)
+ return false;
+
+ *period = READ_ONCE(nx_huge_pages_recovery_period_ms);
+ if (!*period) {
+ /* Make sure the period is not less than one second. */
+ ratio = min(ratio, 3600u);
+ *period = 60 * 60 * 1000 / ratio;
+ }
+ return true;
+}
+
+static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
+{
+ bool was_recovery_enabled, is_recovery_enabled;
+ uint old_period, new_period;
+ int err;
+
+ if (nx_hugepage_mitigation_hard_disabled)
+ return -EPERM;
+
+ was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
+
+ err = param_set_uint(val, kp);
+ if (err)
+ return err;
+
+ is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);
+
+ if (is_recovery_enabled &&
+ (!was_recovery_enabled || old_period > new_period)) {
+ struct kvm *kvm;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
+
+ mutex_unlock(&kvm_lock);
+ }
+
+ return err;
+}
+
+static void kvm_recover_nx_huge_pages(struct kvm *kvm)
+{
+ unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
+ struct kvm_memory_slot *slot;
+ int rcu_idx;
+ struct kvm_mmu_page *sp;
+ unsigned int ratio;
+ LIST_HEAD(invalid_list);
+ bool flush = false;
+ ulong to_zap;
+
+ rcu_idx = srcu_read_lock(&kvm->srcu);
+ write_lock(&kvm->mmu_lock);
+
+ /*
+ * Zapping TDP MMU shadow pages, including the remote TLB flush, must
+ * be done under RCU protection, because the pages are freed via RCU
+ * callback.
+ */
+ rcu_read_lock();
+
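+	/* Zap roughly 1/nx_huge_pages_recovery_ratio of the split pages. */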
+ ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+ to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
+ for ( ; to_zap; --to_zap) {
+ if (list_empty(&kvm->arch.possible_nx_huge_pages))
+ break;
+
+ /*
+		 * We use a separate list instead of just using active_mmu_pages
+		 * because the number of shadow pages that can be replaced with
+		 * an NX huge page is expected to be relatively small compared
+		 * to the total number of shadow pages, and because the TDP MMU
+		 * doesn't use active_mmu_pages.
+ */
+ sp = list_first_entry(&kvm->arch.possible_nx_huge_pages,
+ struct kvm_mmu_page,
+ possible_nx_huge_page_link);
+ WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
+ WARN_ON_ONCE(!sp->role.direct);
+
+ /*
+ * Unaccount and do not attempt to recover any NX Huge Pages
+ * that are being dirty tracked, as they would just be faulted
+ * back in as 4KiB pages. The NX Huge Pages in this slot will be
+ * recovered, along with all the other huge pages in the slot,
+ * when dirty logging is disabled.
+ *
+ * Since gfn_to_memslot() is relatively expensive, it helps to
+		 * skip it if the test cannot possibly return true. On the
+ * other hand, if any memslot has logging enabled, chances are
+ * good that all of them do, in which case unaccount_nx_huge_page()
+ * is much cheaper than zapping the page.
+ *
+ * If a memslot update is in progress, reading an incorrect value
+ * of kvm->nr_memslots_dirty_logging is not a problem: if it is
+ * becoming zero, gfn_to_memslot() will be done unnecessarily; if
+ * it is becoming nonzero, the page will be zapped unnecessarily.
+ * Either way, this only affects efficiency in racy situations,
+ * and not correctness.
+ */
+ slot = NULL;
+ if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
+ struct kvm_memslots *slots;
+
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, sp->gfn);
+ WARN_ON_ONCE(!slot);
+ }
+
+ if (slot && kvm_slot_dirty_track_enabled(slot))
+ unaccount_nx_huge_page(kvm, sp);
+ else if (is_tdp_mmu_page(sp))
+ flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
+ else
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ WARN_ON_ONCE(sp->nx_huge_page_disallowed);
+
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
+ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+ rcu_read_unlock();
+
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+ flush = false;
+
+ rcu_read_lock();
+ }
+ }
+ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+
+ rcu_read_unlock();
+
+ write_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, rcu_idx);
+}
+
+static long get_nx_huge_page_recovery_timeout(u64 start_time)
+{
+ bool enabled;
+ uint period;
+
+ enabled = calc_nx_huge_pages_recovery_period(&period);
+
+ return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
+ : MAX_SCHEDULE_TIMEOUT;
+}
+
+static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data)
+{
+ u64 start_time;
+ long remaining_time;
+
+ while (true) {
+ start_time = get_jiffies_64();
+ remaining_time = get_nx_huge_page_recovery_timeout(start_time);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!kthread_should_stop() && remaining_time > 0) {
+ schedule_timeout(remaining_time);
+ remaining_time = get_nx_huge_page_recovery_timeout(start_time);
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+
+ set_current_state(TASK_RUNNING);
+
+ if (kthread_should_stop())
+ return 0;
+
+ kvm_recover_nx_huge_pages(kvm);
+ }
+}
+
+int kvm_mmu_post_init_vm(struct kvm *kvm)
+{
+ int err;
+
+ if (nx_hugepage_mitigation_hard_disabled)
+ return 0;
+
+ err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0,
+ "kvm-nx-lpage-recovery",
+ &kvm->arch.nx_huge_page_recovery_thread);
+ if (!err)
+ kthread_unpark(kvm->arch.nx_huge_page_recovery_thread);
+
+ return err;
+}
+
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
+{
+ if (kvm->arch.nx_huge_page_recovery_thread)
+ kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
+}
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
new file mode 100644
index 000000000..decc1f153
--- /dev/null
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -0,0 +1,350 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_MMU_INTERNAL_H
+#define __KVM_X86_MMU_INTERNAL_H
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <asm/kvm_host.h>
+
+#ifdef CONFIG_KVM_PROVE_MMU
+#define KVM_MMU_WARN_ON(x) WARN_ON_ONCE(x)
+#else
+#define KVM_MMU_WARN_ON(x) BUILD_BUG_ON_INVALID(x)
+#endif
+
+/* Page table builder macros common to shadow (host) PTEs and guest PTEs. */
+#define __PT_LEVEL_SHIFT(level, bits_per_level) \
+ (PAGE_SHIFT + ((level) - 1) * (bits_per_level))
+#define __PT_INDEX(address, level, bits_per_level) \
+ (((address) >> __PT_LEVEL_SHIFT(level, bits_per_level)) & ((1 << (bits_per_level)) - 1))
+
+#define __PT_LVL_ADDR_MASK(base_addr_mask, level, bits_per_level) \
+ ((base_addr_mask) & ~((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1))
+
+#define __PT_LVL_OFFSET_MASK(base_addr_mask, level, bits_per_level) \
+ ((base_addr_mask) & ((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1))
+
+#define __PT_ENT_PER_PAGE(bits_per_level) (1 << (bits_per_level))
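+
+/*
+ * Example: with 8-byte PTEs (bits_per_level == 9), __PT_LEVEL_SHIFT(2, 9) is
+ * 21, i.e. a level-2 entry maps a 2MB region, and __PT_ENT_PER_PAGE(9) is 512.
+ */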
+
+/*
+ * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT
+ * bit, and thus are guaranteed to be non-zero when valid. And, when a guest
+ * PDPTR is !PRESENT, its corresponding PAE root cannot be set to INVALID_PAGE,
+ * as the CPU would treat that as PRESENT PDPTR with reserved bits set. Use
+ * '0' instead of INVALID_PAGE to indicate an invalid PAE root.
+ */
+#define INVALID_PAE_ROOT 0
+#define IS_VALID_PAE_ROOT(x) (!!(x))
+
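+/* KVM uses the kernel's shared zero page to back its "dummy" roots. */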
+static inline hpa_t kvm_mmu_get_dummy_root(void)
+{
+ return my_zero_pfn(0) << PAGE_SHIFT;
+}
+
+static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page)
+{
+ return is_zero_pfn(shadow_page >> PAGE_SHIFT);
+}
+
+typedef u64 __rcu *tdp_ptep_t;
+
+struct kvm_mmu_page {
+ /*
+ * Note, "link" through "spt" fit in a single 64 byte cache line on
+ * 64-bit kernels, keep it that way unless there's a reason not to.
+ */
+ struct list_head link;
+ struct hlist_node hash_link;
+
+ bool tdp_mmu_page;
+ bool unsync;
+ union {
+ u8 mmu_valid_gen;
+
+ /* Only accessed under slots_lock. */
+ bool tdp_mmu_scheduled_root_to_zap;
+ };
+
+ /*
+ * The shadow page can't be replaced by an equivalent huge page
+ * because it is being used to map an executable page in the guest
+ * and the NX huge page mitigation is enabled.
+ */
+ bool nx_huge_page_disallowed;
+
+ /*
+ * The following two entries are used to key the shadow page in the
+ * hash table.
+ */
+ union kvm_mmu_page_role role;
+ gfn_t gfn;
+
+ u64 *spt;
+
+ /*
+ * Stores the result of the guest translation being shadowed by each
+ * SPTE. KVM shadows two types of guest translations: nGPA -> GPA
+ * (shadow EPT/NPT) and GVA -> GPA (traditional shadow paging). In both
+ * cases the result of the translation is a GPA and a set of access
+ * constraints.
+ *
+ * The GFN is stored in the upper bits (PAGE_SHIFT) and the shadowed
+ * access permissions are stored in the lower bits. Note, for
+ * convenience and uniformity across guests, the access permissions are
+ * stored in KVM format (e.g. ACC_EXEC_MASK) not the raw guest format.
+ */
+ u64 *shadowed_translation;
+
+ /* Currently serving as active root */
+ union {
+ int root_count;
+ refcount_t tdp_mmu_root_count;
+ };
+ unsigned int unsync_children;
+ union {
+ struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
+ tdp_ptep_t ptep;
+ };
+ DECLARE_BITMAP(unsync_child_bitmap, 512);
+
+ /*
+ * Tracks shadow pages that, if zapped, would allow KVM to create an NX
+ * huge page. A shadow page will have nx_huge_page_disallowed set but
+ * not be on the list if a huge page is disallowed for other reasons,
+ * e.g. because KVM is shadowing a PTE at the same gfn, the memslot
+ * isn't properly aligned, etc...
+ */
+ struct list_head possible_nx_huge_page_link;
+#ifdef CONFIG_X86_32
+ /*
+ * Used out of the mmu-lock to avoid reading spte values while an
+ * update is in progress; see the comments in __get_spte_lockless().
+ */
+ int clear_spte_count;
+#endif
+
+ /* Number of writes since the last time traversal visited this page. */
+ atomic_t write_flooding_count;
+
+#ifdef CONFIG_X86_64
+ /* Used for freeing the page asynchronously if it is a TDP MMU page. */
+ struct rcu_head rcu_head;
+#endif
+};
+
+extern struct kmem_cache *mmu_page_header_cache;
+
+static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role)
+{
+ return role.smm ? 1 : 0;
+}
+
+static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
+{
+ return kvm_mmu_role_as_id(sp->role);
+}
+
+static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
+{
+ /*
+ * When using the EPT page-modification log, the GPAs in the CPU dirty
+ * log would come from L2 rather than L1. Therefore, we need to rely
+ * on write protection to record dirty pages, which bypasses PML, since
+ * writes now result in a vmexit. Note, the check on CPU dirty logging
+ * being enabled is mandatory as the bits used to denote WP-only SPTEs
+ * are reserved for PAE paging (32-bit KVM).
+ */
+ return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode;
+}
+
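+/*
+ * Round the gfn down to a level-aligned boundary.  KVM_PAGES_PER_HPAGE() is
+ * a power of two, so its negation is a mask that clears the low-order bits.
+ */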
+static inline gfn_t gfn_round_for_level(gfn_t gfn, int level)
+{
+ return gfn & -KVM_PAGES_PER_HPAGE(level);
+}
+
+int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
+ gfn_t gfn, bool can_unsync, bool prefetch);
+
+void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
+void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
+bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+ struct kvm_memory_slot *slot, u64 gfn,
+ int min_level);
+
+/* Flush the given page (huge or not) of guest memory. */
+static inline void kvm_flush_remote_tlbs_gfn(struct kvm *kvm, gfn_t gfn, int level)
+{
+ kvm_flush_remote_tlbs_range(kvm, gfn_round_for_level(gfn, level),
+ KVM_PAGES_PER_HPAGE(level));
+}
+
+unsigned int pte_list_count(struct kvm_rmap_head *rmap_head);
+
+extern int nx_huge_pages;
+static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
+{
+ return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages;
+}
+
+struct kvm_page_fault {
+ /* arguments to kvm_mmu_do_page_fault. */
+ const gpa_t addr;
+ const u32 error_code;
+ const bool prefetch;
+
+ /* Derived from error_code. */
+ const bool exec;
+ const bool write;
+ const bool present;
+ const bool rsvd;
+ const bool user;
+
+ /* Derived from mmu and global state. */
+ const bool is_tdp;
+ const bool nx_huge_page_workaround_enabled;
+
+ /*
+ * Whether a >4KB mapping can be created or is forbidden due to NX
+ * hugepages.
+ */
+ bool huge_page_disallowed;
+
+ /*
+ * Maximum page size that can be created for this fault; input to
+ * FNAME(fetch), direct_map() and kvm_tdp_mmu_map().
+ */
+ u8 max_level;
+
+ /*
+ * Page size that can be created based on the max_level and the
+ * page size used by the host mapping.
+ */
+ u8 req_level;
+
+ /*
+ * Page size that will be created based on the req_level and
+ * huge_page_disallowed.
+ */
+ u8 goal_level;
+
+ /* Shifted addr, or result of guest page table walk if addr is a gva. */
+ gfn_t gfn;
+
+ /* The memslot containing gfn. May be NULL. */
+ struct kvm_memory_slot *slot;
+
+ /* Outputs of kvm_faultin_pfn. */
+ unsigned long mmu_seq;
+ kvm_pfn_t pfn;
+ hva_t hva;
+ bool map_writable;
+
+ /*
+ * Indicates the guest is trying to write a gfn that contains one or
+ * more of the PTEs used to translate the write itself, i.e. the access
+ * is changing its own translation in the guest page tables.
+ */
+ bool write_fault_to_shadow_pgtable;
+};
+
+int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
+
+/*
+ * Return values of handle_mmio_page_fault(), mmu.page_fault(), fast_page_fault(),
+ * and of course kvm_mmu_do_page_fault().
+ *
+ * RET_PF_CONTINUE: So far, so good, keep handling the page fault.
+ * RET_PF_RETRY: let CPU fault again on the address.
+ * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
+ * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
+ * RET_PF_FIXED: The faulting entry has been fixed.
+ * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
+ *
+ * Any names added to this enum should be exported to userspace for use in
+ * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h
+ *
+ * Note, all values must be greater than or equal to zero so as not to encroach
+ * on -errno return values. Somewhat arbitrarily use '0' for CONTINUE, which
+ * will allow for efficient machine code when checking for CONTINUE, e.g.
+ * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero.
+ */
+enum {
+ RET_PF_CONTINUE = 0,
+ RET_PF_RETRY,
+ RET_PF_EMULATE,
+ RET_PF_INVALID,
+ RET_PF_FIXED,
+ RET_PF_SPURIOUS,
+};
+
+static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ u32 err, bool prefetch, int *emulation_type)
+{
+ struct kvm_page_fault fault = {
+ .addr = cr2_or_gpa,
+ .error_code = err,
+ .exec = err & PFERR_FETCH_MASK,
+ .write = err & PFERR_WRITE_MASK,
+ .present = err & PFERR_PRESENT_MASK,
+ .rsvd = err & PFERR_RSVD_MASK,
+ .user = err & PFERR_USER_MASK,
+ .prefetch = prefetch,
+ .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault),
+ .nx_huge_page_workaround_enabled =
+ is_nx_huge_page_enabled(vcpu->kvm),
+
+ .max_level = KVM_MAX_HUGEPAGE_LEVEL,
+ .req_level = PG_LEVEL_4K,
+ .goal_level = PG_LEVEL_4K,
+ };
+ int r;
+
+ if (vcpu->arch.mmu->root_role.direct) {
+ fault.gfn = fault.addr >> PAGE_SHIFT;
+ fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);
+ }
+
+ /*
+ * Async #PF "faults", a.k.a. prefetch faults, are not faults from the
+ * guest perspective and have already been counted at the time of the
+ * original fault.
+ */
+ if (!prefetch)
+ vcpu->stat.pf_taken++;
+
+ if (IS_ENABLED(CONFIG_RETPOLINE) && fault.is_tdp)
+ r = kvm_tdp_page_fault(vcpu, &fault);
+ else
+ r = vcpu->arch.mmu->page_fault(vcpu, &fault);
+
+ if (fault.write_fault_to_shadow_pgtable && emulation_type)
+ *emulation_type |= EMULTYPE_WRITE_PF_TO_SP;
+
+ /*
+ * Similar to above, prefetch faults aren't truly spurious, and the
+ * async #PF path doesn't do emulation. Do count faults that are fixed
+ * by the async #PF handler though, otherwise they'll never be counted.
+ */
+ if (r == RET_PF_FIXED)
+ vcpu->stat.pf_fixed++;
+ else if (prefetch)
+ ;
+ else if (r == RET_PF_EMULATE)
+ vcpu->stat.pf_emulate++;
+ else if (r == RET_PF_SPURIOUS)
+ vcpu->stat.pf_spurious++;
+ return r;
+}
+
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn,
+ int max_level);
+void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
+void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
+
+void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
+
+void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+
+#endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
new file mode 100644
index 000000000..ae86820ce
--- /dev/null
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -0,0 +1,451 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVMMMU_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_events.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvmmmu
+
+#define KVM_MMU_PAGE_FIELDS \
+ __field(__u8, mmu_valid_gen) \
+ __field(__u64, gfn) \
+ __field(__u32, role) \
+ __field(__u32, root_count) \
+ __field(bool, unsync)
+
+#define KVM_MMU_PAGE_ASSIGN(sp) \
+ __entry->mmu_valid_gen = sp->mmu_valid_gen; \
+ __entry->gfn = sp->gfn; \
+ __entry->role = sp->role.word; \
+ __entry->root_count = sp->root_count; \
+ __entry->unsync = sp->unsync;
+
+#define KVM_MMU_PAGE_PRINTK() ({ \
+ const char *saved_ptr = trace_seq_buffer_ptr(p); \
+ static const char *access_str[] = { \
+ "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \
+ }; \
+ union kvm_mmu_page_role role; \
+ \
+ role.word = __entry->role; \
+ \
+ trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" \
+ " %snxe %sad root %u %s%c", \
+ __entry->mmu_valid_gen, \
+ __entry->gfn, role.level, \
+ role.has_4_byte_gpte ? 4 : 8, \
+ role.quadrant, \
+ role.direct ? " direct" : "", \
+ access_str[role.access], \
+ role.invalid ? " invalid" : "", \
+ role.efer_nx ? "" : "!", \
+ role.ad_disabled ? "!" : "", \
+ __entry->root_count, \
+ __entry->unsync ? "unsync" : "sync", 0); \
+ saved_ptr; \
+ })
+
+#define kvm_mmu_trace_pferr_flags \
+ { PFERR_PRESENT_MASK, "P" }, \
+ { PFERR_WRITE_MASK, "W" }, \
+ { PFERR_USER_MASK, "U" }, \
+ { PFERR_RSVD_MASK, "RSVD" }, \
+ { PFERR_FETCH_MASK, "F" }
+
+TRACE_DEFINE_ENUM(RET_PF_CONTINUE);
+TRACE_DEFINE_ENUM(RET_PF_RETRY);
+TRACE_DEFINE_ENUM(RET_PF_EMULATE);
+TRACE_DEFINE_ENUM(RET_PF_INVALID);
+TRACE_DEFINE_ENUM(RET_PF_FIXED);
+TRACE_DEFINE_ENUM(RET_PF_SPURIOUS);
+
+/*
+ * A pagetable walk has started
+ */
+TRACE_EVENT(
+ kvm_mmu_pagetable_walk,
+ TP_PROTO(u64 addr, u32 pferr),
+ TP_ARGS(addr, pferr),
+
+ TP_STRUCT__entry(
+ __field(__u64, addr)
+ __field(__u32, pferr)
+ ),
+
+ TP_fast_assign(
+ __entry->addr = addr;
+ __entry->pferr = pferr;
+ ),
+
+ TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
+ __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
+);
+
+
+/* We just walked a paging element */
+TRACE_EVENT(
+ kvm_mmu_paging_element,
+ TP_PROTO(u64 pte, int level),
+ TP_ARGS(pte, level),
+
+ TP_STRUCT__entry(
+ __field(__u64, pte)
+ __field(__u32, level)
+ ),
+
+ TP_fast_assign(
+ __entry->pte = pte;
+ __entry->level = level;
+ ),
+
+ TP_printk("pte %llx level %u", __entry->pte, __entry->level)
+);
+
+DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class,
+
+ TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
+ TP_ARGS(table_gfn, index, size),
+
+ TP_STRUCT__entry(
+ __field(__u64, gpa)
+ ),
+
+ TP_fast_assign(
+ __entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
+ + index * size;
+ ),
+
+ TP_printk("gpa %llx", __entry->gpa)
+);
+
+/* We set a pte accessed bit */
+DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit,
+
+ TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
+ TP_ARGS(table_gfn, index, size)
+);
+
+/* We set a pte dirty bit */
+DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit,
+
+ TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
+ TP_ARGS(table_gfn, index, size)
+);
+
+TRACE_EVENT(
+ kvm_mmu_walker_error,
+ TP_PROTO(u32 pferr),
+ TP_ARGS(pferr),
+
+ TP_STRUCT__entry(
+ __field(__u32, pferr)
+ ),
+
+ TP_fast_assign(
+ __entry->pferr = pferr;
+ ),
+
+ TP_printk("pferr %x %s", __entry->pferr,
+ __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
+);
+
+TRACE_EVENT(
+ kvm_mmu_get_page,
+ TP_PROTO(struct kvm_mmu_page *sp, bool created),
+ TP_ARGS(sp, created),
+
+ TP_STRUCT__entry(
+ KVM_MMU_PAGE_FIELDS
+ __field(bool, created)
+ ),
+
+ TP_fast_assign(
+ KVM_MMU_PAGE_ASSIGN(sp)
+ __entry->created = created;
+ ),
+
+ TP_printk("%s %s", KVM_MMU_PAGE_PRINTK(),
+ __entry->created ? "new" : "existing")
+);
+
+DECLARE_EVENT_CLASS(kvm_mmu_page_class,
+
+ TP_PROTO(struct kvm_mmu_page *sp),
+ TP_ARGS(sp),
+
+ TP_STRUCT__entry(
+ KVM_MMU_PAGE_FIELDS
+ ),
+
+ TP_fast_assign(
+ KVM_MMU_PAGE_ASSIGN(sp)
+ ),
+
+ TP_printk("%s", KVM_MMU_PAGE_PRINTK())
+);
+
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+
+ TP_ARGS(sp)
+);
+
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+
+ TP_ARGS(sp)
+);
+
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+
+ TP_ARGS(sp)
+);
+
+TRACE_EVENT(
+ mark_mmio_spte,
+ TP_PROTO(u64 *sptep, gfn_t gfn, u64 spte),
+ TP_ARGS(sptep, gfn, spte),
+
+ TP_STRUCT__entry(
+ __field(void *, sptep)
+ __field(gfn_t, gfn)
+ __field(unsigned, access)
+ __field(unsigned int, gen)
+ ),
+
+ TP_fast_assign(
+ __entry->sptep = sptep;
+ __entry->gfn = gfn;
+ __entry->access = spte & ACC_ALL;
+ __entry->gen = get_mmio_spte_generation(spte);
+ ),
+
+ TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep,
+ __entry->gfn, __entry->access, __entry->gen)
+);
+
+TRACE_EVENT(
+ handle_mmio_page_fault,
+ TP_PROTO(u64 addr, gfn_t gfn, unsigned access),
+ TP_ARGS(addr, gfn, access),
+
+ TP_STRUCT__entry(
+ __field(u64, addr)
+ __field(gfn_t, gfn)
+ __field(unsigned, access)
+ ),
+
+ TP_fast_assign(
+ __entry->addr = addr;
+ __entry->gfn = gfn;
+ __entry->access = access;
+ ),
+
+ TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
+ __entry->access)
+);
+
+TRACE_EVENT(
+ fast_page_fault,
+ TP_PROTO(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
+ u64 *sptep, u64 old_spte, int ret),
+ TP_ARGS(vcpu, fault, sptep, old_spte, ret),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(gpa_t, cr2_or_gpa)
+ __field(u32, error_code)
+ __field(u64 *, sptep)
+ __field(u64, old_spte)
+ __field(u64, new_spte)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu->vcpu_id;
+ __entry->cr2_or_gpa = fault->addr;
+ __entry->error_code = fault->error_code;
+ __entry->sptep = sptep;
+ __entry->old_spte = old_spte;
+ __entry->new_spte = *sptep;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx"
+ " new %llx spurious %d fixed %d", __entry->vcpu_id,
+ __entry->cr2_or_gpa, __print_flags(__entry->error_code, "|",
+ kvm_mmu_trace_pferr_flags), __entry->sptep,
+ __entry->old_spte, __entry->new_spte,
+ __entry->ret == RET_PF_SPURIOUS, __entry->ret == RET_PF_FIXED
+ )
+);
+
+TRACE_EVENT(
+ kvm_mmu_zap_all_fast,
+ TP_PROTO(struct kvm *kvm),
+ TP_ARGS(kvm),
+
+ TP_STRUCT__entry(
+ __field(__u8, mmu_valid_gen)
+ __field(unsigned int, mmu_used_pages)
+ ),
+
+ TP_fast_assign(
+ __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen;
+ __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages;
+ ),
+
+ TP_printk("kvm-mmu-valid-gen %u used_pages %x",
+ __entry->mmu_valid_gen, __entry->mmu_used_pages
+ )
+);
+
+
+TRACE_EVENT(
+ check_mmio_spte,
+ TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen),
+ TP_ARGS(spte, kvm_gen, spte_gen),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, kvm_gen)
+ __field(unsigned int, spte_gen)
+ __field(u64, spte)
+ ),
+
+ TP_fast_assign(
+ __entry->kvm_gen = kvm_gen;
+ __entry->spte_gen = spte_gen;
+ __entry->spte = spte;
+ ),
+
+ TP_printk("spte %llx kvm_gen %x spte-gen %x valid %d", __entry->spte,
+ __entry->kvm_gen, __entry->spte_gen,
+ __entry->kvm_gen == __entry->spte_gen
+ )
+);
+
+TRACE_EVENT(
+ kvm_mmu_set_spte,
+ TP_PROTO(int level, gfn_t gfn, u64 *sptep),
+ TP_ARGS(level, gfn, sptep),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, spte)
+ __field(u64, sptep)
+ __field(u8, level)
+ /* These depend on page entry type, so compute them now. */
+ __field(bool, r)
+ __field(bool, x)
+ __field(signed char, u)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = gfn;
+ __entry->spte = *sptep;
+ __entry->sptep = virt_to_phys(sptep);
+ __entry->level = level;
+ __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
+ __entry->x = is_executable_pte(__entry->spte);
+ __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
+ ),
+
+ TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
+ __entry->gfn, __entry->spte,
+ __entry->r ? "r" : "-",
+ __entry->spte & PT_WRITABLE_MASK ? "w" : "-",
+ __entry->x ? "x" : "-",
+ __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
+ __entry->level, __entry->sptep
+ )
+);
+
+TRACE_EVENT(
+ kvm_mmu_spte_requested,
+ TP_PROTO(struct kvm_page_fault *fault),
+ TP_ARGS(fault),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, pfn)
+ __field(u8, level)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = fault->gfn;
+ __entry->pfn = fault->pfn | (fault->gfn & (KVM_PAGES_PER_HPAGE(fault->goal_level) - 1));
+ __entry->level = fault->goal_level;
+ ),
+
+ TP_printk("gfn %llx pfn %llx level %d",
+ __entry->gfn, __entry->pfn, __entry->level
+ )
+);
+
+TRACE_EVENT(
+ kvm_tdp_mmu_spte_changed,
+ TP_PROTO(int as_id, gfn_t gfn, int level, u64 old_spte, u64 new_spte),
+ TP_ARGS(as_id, gfn, level, old_spte, new_spte),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, old_spte)
+ __field(u64, new_spte)
+ /* Level cannot be larger than 5 on x86, so it fits in a u8. */
+ __field(u8, level)
+ /* as_id can only be 0 or 1 on x86, so it fits in a u8. */
+ __field(u8, as_id)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = gfn;
+ __entry->old_spte = old_spte;
+ __entry->new_spte = new_spte;
+ __entry->level = level;
+ __entry->as_id = as_id;
+ ),
+
+ TP_printk("as id %d gfn %llx level %d old_spte %llx new_spte %llx",
+ __entry->as_id, __entry->gfn, __entry->level,
+ __entry->old_spte, __entry->new_spte
+ )
+);
+
+TRACE_EVENT(
+ kvm_mmu_split_huge_page,
+ TP_PROTO(u64 gfn, u64 spte, int level, int errno),
+ TP_ARGS(gfn, spte, level, errno),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, spte)
+ __field(int, level)
+ __field(int, errno)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = gfn;
+ __entry->spte = spte;
+ __entry->level = level;
+ __entry->errno = errno;
+ ),
+
+ TP_printk("gfn %llx spte %llx level %d errno %d",
+ __entry->gfn, __entry->spte, __entry->level, __entry->errno)
+);
+
+#endif /* _TRACE_KVMMMU_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH mmu
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE mmutrace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
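The include-guard gymnastics above deserve a note: mmutrace.h is parsed more
than once. mmu.c includes it normally, and <trace/define_trace.h> then
re-reads it with TRACE_HEADER_MULTI_READ defined so that each TRACE_EVENT()
can expand a second time into the actual tracepoint definitions, which is
why the guard at the top allows re-entry. A rough standalone analogy of the
"one list, multiple expansions" idea using plain X-macros (a sketch of the
pattern only, not the real ftrace machinery):

#include <stdio.h>

/* One list of events; each pass redefines EVENT() to emit something new. */
#define EVENT_LIST \
	EVENT(pagetable_walk) \
	EVENT(paging_element) \
	EVENT(walker_error)

/* Pass 1: an enum of event ids. */
#define EVENT(name) ev_##name,
enum event_id { EVENT_LIST ev_max };
#undef EVENT

/* Pass 2: the matching name table. */
#define EVENT(name) [ev_##name] = #name,
static const char *event_name[] = { EVENT_LIST };
#undef EVENT

int main(void)
{
	for (int i = 0; i < ev_max; i++)
		printf("%d -> %s\n", i, event_name[i]);
	return 0;
}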
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
new file mode 100644
index 000000000..c87da11f3
--- /dev/null
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Support KVM guest page tracking
+ *
+ * This feature allows us to track page access in the guest. Currently, only
+ * write access is tracked.
+ *
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Author:
+ * Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/lockdep.h>
+#include <linux/kvm_host.h>
+#include <linux/rculist.h>
+
+#include "mmu.h"
+#include "mmu_internal.h"
+#include "page_track.h"
+
+bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
+{
+ return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) ||
+ !tdp_enabled || kvm_shadow_root_allocated(kvm);
+}
+
+void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
+{
+ kvfree(slot->arch.gfn_write_track);
+ slot->arch.gfn_write_track = NULL;
+}
+
+static int __kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot,
+ unsigned long npages)
+{
+ const size_t size = sizeof(*slot->arch.gfn_write_track);
+
+ if (!slot->arch.gfn_write_track)
+ slot->arch.gfn_write_track = __vcalloc(npages, size,
+ GFP_KERNEL_ACCOUNT);
+
+ return slot->arch.gfn_write_track ? 0 : -ENOMEM;
+}
+
+int kvm_page_track_create_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ unsigned long npages)
+{
+ if (!kvm_page_track_write_tracking_enabled(kvm))
+ return 0;
+
+ return __kvm_page_track_write_tracking_alloc(slot, npages);
+}
+
+int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot)
+{
+ return __kvm_page_track_write_tracking_alloc(slot, slot->npages);
+}
+
+static void update_gfn_write_track(struct kvm_memory_slot *slot, gfn_t gfn,
+ short count)
+{
+ int index, val;
+
+ index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
+
+ val = slot->arch.gfn_write_track[index];
+
+ if (WARN_ON_ONCE(val + count < 0 || val + count > USHRT_MAX))
+ return;
+
+ slot->arch.gfn_write_track[index] += count;
+}
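/*
 * A worked example of the bounds check above (values illustrative): the
 * slot keeps one unsigned short counter per 4K gfn, and the check refuses
 * to let it wrap in either direction.
 *
 *	short count = -1;	// one tracker going away
 *	int val = 0;		// but nothing tracks this gfn
 *	val + count == -1	// would underflow: WARN and bail
 *
 * A gfn is considered write-tracked while its counter is non-zero, so
 * add/remove calls must balance exactly.
 */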
+
+void __kvm_write_track_add_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn)
+{
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ lockdep_assert_once(lockdep_is_held(&kvm->slots_lock) ||
+ srcu_read_lock_held(&kvm->srcu));
+
+ if (KVM_BUG_ON(!kvm_page_track_write_tracking_enabled(kvm), kvm))
+ return;
+
+ update_gfn_write_track(slot, gfn, 1);
+
+ /*
+ * A newly added tracker stops large page mappings for the
+ * tracked page.
+ */
+ kvm_mmu_gfn_disallow_lpage(slot, gfn);
+
+ if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
+ kvm_flush_remote_tlbs(kvm);
+}
+
+void __kvm_write_track_remove_gfn(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ lockdep_assert_once(lockdep_is_held(&kvm->slots_lock) ||
+ srcu_read_lock_held(&kvm->srcu));
+
+ if (KVM_BUG_ON(!kvm_page_track_write_tracking_enabled(kvm), kvm))
+ return;
+
+ update_gfn_write_track(slot, gfn, -1);
+
+ /*
+ * Allow large page mappings for the tracked page
+ * after the last tracker is gone.
+ */
+ kvm_mmu_gfn_allow_lpage(slot, gfn);
+}
+
+/*
+ * Check whether write access to the specified guest page is tracked.
+ */
+bool kvm_gfn_is_write_tracked(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ int index;
+
+ if (!slot)
+ return false;
+
+ if (!kvm_page_track_write_tracking_enabled(kvm))
+ return false;
+
+ index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
+ return !!READ_ONCE(slot->arch.gfn_write_track[index]);
+}
+
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+void kvm_page_track_cleanup(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+ cleanup_srcu_struct(&head->track_srcu);
+}
+
+int kvm_page_track_init(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+ INIT_HLIST_HEAD(&head->track_notifier_list);
+ return init_srcu_struct(&head->track_srcu);
+}
+
+/*
+ * Register the notifier so that events for the tracked guest
+ * pages are delivered to it.
+ */
+int kvm_page_track_register_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ if (!kvm || kvm->mm != current->mm)
+ return -ESRCH;
+
+ kvm_get_kvm(kvm);
+
+ head = &kvm->arch.track_notifier_head;
+
+ write_lock(&kvm->mmu_lock);
+ hlist_add_head_rcu(&n->node, &head->track_notifier_list);
+ write_unlock(&kvm->mmu_lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier);
+
+/*
+ * Stop receiving event notifications. This is the inverse operation of
+ * kvm_page_track_register_notifier().
+ */
+void kvm_page_track_unregister_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+
+ write_lock(&kvm->mmu_lock);
+ hlist_del_rcu(&n->node);
+ write_unlock(&kvm->mmu_lock);
+ synchronize_srcu(&head->track_srcu);
+
+ kvm_put_kvm(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier);
+
+/*
+ * Notify registered nodes that write access was intercepted and write
+ * emulation has finished.
+ *
+ * Each node must determine on its own whether the written page is one it
+ * is interested in.
+ */
+void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa, const u8 *new, int bytes)
+{
+ struct kvm_page_track_notifier_head *head;
+ struct kvm_page_track_notifier_node *n;
+ int idx;
+
+ head = &kvm->arch.track_notifier_head;
+
+ if (hlist_empty(&head->track_notifier_list))
+ return;
+
+ idx = srcu_read_lock(&head->track_srcu);
+ hlist_for_each_entry_srcu(n, &head->track_notifier_list, node,
+ srcu_read_lock_held(&head->track_srcu))
+ if (n->track_write)
+ n->track_write(gpa, new, bytes, n);
+ srcu_read_unlock(&head->track_srcu, idx);
+}
+
+/*
+ * Notify external page track nodes that a memory region is being removed from
+ * the VM, e.g. so that users can free any associated metadata.
+ */
+void kvm_page_track_delete_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+ struct kvm_page_track_notifier_head *head;
+ struct kvm_page_track_notifier_node *n;
+ int idx;
+
+ head = &kvm->arch.track_notifier_head;
+
+ if (hlist_empty(&head->track_notifier_list))
+ return;
+
+ idx = srcu_read_lock(&head->track_srcu);
+ hlist_for_each_entry_srcu(n, &head->track_notifier_list, node,
+ srcu_read_lock_held(&head->track_srcu))
+ if (n->track_remove_region)
+ n->track_remove_region(slot->base_gfn, slot->npages, n);
+ srcu_read_unlock(&head->track_srcu, idx);
+}
+
+/*
+ * Add a guest page to the write-tracking pool so that write access to
+ * that page is intercepted.
+ *
+ * @kvm: the guest instance we are interested in.
+ * @gfn: the guest page.
+ */
+int kvm_write_track_add_gfn(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->srcu);
+
+ slot = gfn_to_memslot(kvm, gfn);
+ if (!slot) {
+ srcu_read_unlock(&kvm->srcu, idx);
+ return -EINVAL;
+ }
+
+ write_lock(&kvm->mmu_lock);
+ __kvm_write_track_add_gfn(kvm, slot, gfn);
+ write_unlock(&kvm->mmu_lock);
+
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_write_track_add_gfn);
+
+/*
+ * Remove a guest page from the write-tracking pool, which stops the
+ * interception of write access to that page.
+ *
+ * @kvm: the guest instance we are interested in.
+ * @gfn: the guest page.
+ */
+int kvm_write_track_remove_gfn(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->srcu);
+
+ slot = gfn_to_memslot(kvm, gfn);
+ if (!slot) {
+ srcu_read_unlock(&kvm->srcu, idx);
+ return -EINVAL;
+ }
+
+ write_lock(&kvm->mmu_lock);
+ __kvm_write_track_remove_gfn(kvm, slot, gfn);
+ write_unlock(&kvm->mmu_lock);
+
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_write_track_remove_gfn);
+#endif
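For context, a sketch of how an external consumer (e.g. a mediated-device
driver) might wire into the API exported above when
CONFIG_KVM_EXTERNAL_WRITE_TRACKING is enabled. The callback signatures
follow the call sites in __kvm_page_track_write() and
kvm_page_track_delete_slot(); the demo_* names are hypothetical and error
handling is trimmed.

#include <linux/kvm_host.h>
#include <asm/kvm_page_track.h>

static void demo_track_write(gpa_t gpa, const u8 *new, int bytes,
			     struct kvm_page_track_notifier_node *node)
{
	/* Called after KVM finishes emulating a write to a tracked gfn. */
	pr_info("tracked write: gpa=%llx bytes=%d\n", gpa, bytes);
}

static void demo_track_remove_region(gfn_t base_gfn, unsigned long npages,
				     struct kvm_page_track_notifier_node *node)
{
	/* The memslot covering our tracked gfns is being removed. */
}

static struct kvm_page_track_notifier_node demo_node = {
	.track_write          = demo_track_write,
	.track_remove_region  = demo_track_remove_region,
};

static int demo_attach(struct kvm *kvm, gfn_t gfn)
{
	int r = kvm_page_track_register_notifier(kvm, &demo_node);

	if (r)
		return r;

	/*
	 * Start intercepting writes to @gfn; must be balanced with
	 * kvm_write_track_remove_gfn() before unregistering.
	 */
	return kvm_write_track_add_gfn(kvm, gfn);
}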
diff --git a/arch/x86/kvm/mmu/page_track.h b/arch/x86/kvm/mmu/page_track.h
new file mode 100644
index 000000000..d4d72ed99
--- /dev/null
+++ b/arch/x86/kvm/mmu/page_track.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_PAGE_TRACK_H
+#define __KVM_X86_PAGE_TRACK_H
+
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_page_track.h>
+
+
+bool kvm_page_track_write_tracking_enabled(struct kvm *kvm);
+int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot);
+
+void kvm_page_track_free_memslot(struct kvm_memory_slot *slot);
+int kvm_page_track_create_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ unsigned long npages);
+
+void __kvm_write_track_add_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn);
+void __kvm_write_track_remove_gfn(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn);
+
+bool kvm_gfn_is_write_tracked(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn);
+
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+int kvm_page_track_init(struct kvm *kvm);
+void kvm_page_track_cleanup(struct kvm *kvm);
+
+void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa, const u8 *new, int bytes);
+void kvm_page_track_delete_slot(struct kvm *kvm, struct kvm_memory_slot *slot);
+
+static inline bool kvm_page_track_has_external_user(struct kvm *kvm)
+{
+ return !hlist_empty(&kvm->arch.track_notifier_head.track_notifier_list);
+}
+#else
+static inline int kvm_page_track_init(struct kvm *kvm) { return 0; }
+static inline void kvm_page_track_cleanup(struct kvm *kvm) { }
+
+static inline void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa,
+ const u8 *new, int bytes) { }
+static inline void kvm_page_track_delete_slot(struct kvm *kvm,
+ struct kvm_memory_slot *slot) { }
+
+static inline bool kvm_page_track_has_external_user(struct kvm *kvm) { return false; }
+
+#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */
+
+static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const u8 *new, int bytes)
+{
+ __kvm_page_track_write(vcpu->kvm, gpa, new, bytes);
+
+ kvm_mmu_track_write(vcpu, gpa, new, bytes);
+}
+
+#endif /* __KVM_X86_PAGE_TRACK_H */
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
new file mode 100644
index 000000000..c85255073
--- /dev/null
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -0,0 +1,985 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ */
+
+/*
+ * The MMU needs to be able to access/walk 32-bit and 64-bit guest page tables,
+ * as well as guest EPT tables, so the code in this file is compiled thrice,
+ * once per guest PTE type. The per-type defines are #undef'd at the end.
+ */
+
+#if PTTYPE == 64
+ #define pt_element_t u64
+ #define guest_walker guest_walker64
+ #define FNAME(name) paging##64_##name
+ #define PT_LEVEL_BITS 9
+ #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+ #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
+ #define PT_HAVE_ACCESSED_DIRTY(mmu) true
+ #ifdef CONFIG_X86_64
+ #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
+ #else
+ #define PT_MAX_FULL_LEVELS 2
+ #endif
+#elif PTTYPE == 32
+ #define pt_element_t u32
+ #define guest_walker guest_walker32
+ #define FNAME(name) paging##32_##name
+ #define PT_LEVEL_BITS 10
+ #define PT_MAX_FULL_LEVELS 2
+ #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+ #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
+ #define PT_HAVE_ACCESSED_DIRTY(mmu) true
+
+ #define PT32_DIR_PSE36_SIZE 4
+ #define PT32_DIR_PSE36_SHIFT 13
+ #define PT32_DIR_PSE36_MASK \
+ (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
+#elif PTTYPE == PTTYPE_EPT
+ #define pt_element_t u64
+ #define guest_walker guest_walkerEPT
+ #define FNAME(name) ept_##name
+ #define PT_LEVEL_BITS 9
+ #define PT_GUEST_DIRTY_SHIFT 9
+ #define PT_GUEST_ACCESSED_SHIFT 8
+ #define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled)
+ #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
+#else
+ #error Invalid PTTYPE value
+#endif
+
+/* Common logic, but per-type values. These also need to be undefined. */
+#define PT_BASE_ADDR_MASK ((pt_element_t)(((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
+#define PT_LVL_ADDR_MASK(lvl) __PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
+#define PT_LVL_OFFSET_MASK(lvl) __PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
+#define PT_INDEX(addr, lvl) __PT_INDEX(addr, lvl, PT_LEVEL_BITS)
+
+#define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT)
+#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT)
+
+#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
+#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PG_LEVEL_4K)
+
+/*
+ * The guest_walker structure emulates the behavior of the hardware page
+ * table walker.
+ */
+struct guest_walker {
+ int level;
+ unsigned max_level;
+ gfn_t table_gfn[PT_MAX_FULL_LEVELS];
+ pt_element_t ptes[PT_MAX_FULL_LEVELS];
+ pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
+ gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
+ pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
+ bool pte_writable[PT_MAX_FULL_LEVELS];
+ unsigned int pt_access[PT_MAX_FULL_LEVELS];
+ unsigned int pte_access;
+ gfn_t gfn;
+ struct x86_exception fault;
+};
+
+#if PTTYPE == 32
+static inline gfn_t pse36_gfn_delta(u32 gpte)
+{
+ int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
+
+ return (gpte & PT32_DIR_PSE36_MASK) << shift;
+}
+#endif
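/*
 * A worked PSE-36 example for the helper above: with
 * PT32_DIR_PSE36_SHIFT == 13 and a 4-bit field, gpte bits 16:13 carry
 * physical-address bits 35:32 of a 4M page. shift == 32 - 13 - 12 == 7,
 * so for gpte == 0x6000 (bits 14:13 set):
 *
 *	(gpte & PT32_DIR_PSE36_MASK) << 7 == 0x6000 << 7 == 0x300000
 *
 * i.e. gfn bits 21:20, which become physical bits 33:32 once the gfn
 * is shifted left by PAGE_SHIFT.
 */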
+
+static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
+{
+ return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
+}
+
+static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access,
+ unsigned gpte)
+{
+ unsigned mask;
+
+ /* dirty bit is not supported, so no need to track it */
+ if (!PT_HAVE_ACCESSED_DIRTY(mmu))
+ return;
+
+ BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
+
+ mask = (unsigned)~ACC_WRITE_MASK;
+ /* Allow write access to dirty gptes */
+ mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
+ PT_WRITABLE_MASK;
+ *access &= mask;
+}
+
+static inline int FNAME(is_present_gpte)(unsigned long pte)
+{
+#if PTTYPE != PTTYPE_EPT
+ return pte & PT_PRESENT_MASK;
+#else
+ return pte & 7;
+#endif
+}
+
+static bool FNAME(is_bad_mt_xwr)(struct rsvd_bits_validate *rsvd_check, u64 gpte)
+{
+#if PTTYPE != PTTYPE_EPT
+ return false;
+#else
+ return __is_bad_mt_xwr(rsvd_check, gpte);
+#endif
+}
+
+static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+ return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level) ||
+ FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte);
+}
+
+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ u64 gpte)
+{
+ if (!FNAME(is_present_gpte)(gpte))
+ goto no_present;
+
+ /* Prefetch only accessed entries (unless A/D bits are disabled). */
+ if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) &&
+ !(gpte & PT_GUEST_ACCESSED_MASK))
+ goto no_present;
+
+ if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PG_LEVEL_4K))
+ goto no_present;
+
+ return false;
+
+no_present:
+ drop_spte(vcpu->kvm, spte);
+ return true;
+}
+
+/*
+ * For PTTYPE_EPT, a page table can be executable but not readable
+ * on supported processors. Therefore, set_spte does not automatically
+ * set bit 0 if execute-only is supported. Here, we repurpose ACC_USER_MASK
+ * to signify readability since it isn't used in the EPT case.
+ */
+static inline unsigned FNAME(gpte_access)(u64 gpte)
+{
+ unsigned access;
+#if PTTYPE == PTTYPE_EPT
+ access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
+ ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
+ ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
+#else
+ BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
+ BUILD_BUG_ON(ACC_EXEC_MASK != 1);
+ access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK);
+ /* Combine NX with P (which is set here) to get ACC_EXEC_MASK. */
+ access ^= (gpte >> PT64_NX_SHIFT);
+#endif
+
+ return access;
+}
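/*
 * A concrete trace of the XOR trick above: ACC_EXEC_MASK is bit 0, the
 * same bit as PT_PRESENT_MASK, and the NX bit is gpte bit 63
 * (PT64_NX_SHIFT), so (gpte >> PT64_NX_SHIFT) drops NX into bit 0.
 * For a present gpte (P == 1):
 *
 *	NX == 0: access bit 0 = 1 ^ 0 = 1	-> executable
 *	NX == 1: access bit 0 = 1 ^ 1 = 0	-> not executable
 *
 * Non-present gptes terminate the walk before access is consumed, so
 * the stray XOR result in that case is harmless.
 */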
+
+static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu,
+ struct guest_walker *walker,
+ gpa_t addr, int write_fault)
+{
+ unsigned level, index;
+ pt_element_t pte, orig_pte;
+ pt_element_t __user *ptep_user;
+ gfn_t table_gfn;
+ int ret;
+
+ /* dirty/accessed bits are not supported, so no need to update them */
+ if (!PT_HAVE_ACCESSED_DIRTY(mmu))
+ return 0;
+
+ for (level = walker->max_level; level >= walker->level; --level) {
+ pte = orig_pte = walker->ptes[level - 1];
+ table_gfn = walker->table_gfn[level - 1];
+ ptep_user = walker->ptep_user[level - 1];
+ index = offset_in_page(ptep_user) / sizeof(pt_element_t);
+ if (!(pte & PT_GUEST_ACCESSED_MASK)) {
+ trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
+ pte |= PT_GUEST_ACCESSED_MASK;
+ }
+ if (level == walker->level && write_fault &&
+ !(pte & PT_GUEST_DIRTY_MASK)) {
+ trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
+#if PTTYPE == PTTYPE_EPT
+ if (kvm_x86_ops.nested_ops->write_log_dirty(vcpu, addr))
+ return -EINVAL;
+#endif
+ pte |= PT_GUEST_DIRTY_MASK;
+ }
+ if (pte == orig_pte)
+ continue;
+
+ /*
+ * If the slot is read-only, simply do not process the accessed
+ * and dirty bits. This is the correct thing to do if the slot
+ * is ROM, and page tables in read-as-ROM/write-as-MMIO slots
+ * are only supported if the accessed and dirty bits are already
+ * set in the ROM (so that MMIO writes are never needed).
+ *
+ * Note that NPT does not allow this at all and faults, since
+ * it always wants nested page table entries for the guest
+ * page tables to be writable. And EPT works but will simply
+ * overwrite the read-only memory to set the accessed and dirty
+ * bits.
+ */
+ if (unlikely(!walker->pte_writable[level - 1]))
+ continue;
+
+ ret = __try_cmpxchg_user(ptep_user, &orig_pte, pte, fault);
+ if (ret)
+ return ret;
+
+ kvm_vcpu_mark_page_dirty(vcpu, table_gfn);
+ walker->ptes[level - 1] = pte;
+ }
+ return 0;
+}
+
+static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+ unsigned pkeys = 0;
+#if PTTYPE == 64
+ pte_t pte = {.pte = gpte};
+
+ pkeys = pte_flags_pkey(pte_flags(pte));
+#endif
+ return pkeys;
+}
+
+static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu,
+ unsigned int level, unsigned int gpte)
+{
+ /*
+ * For EPT and PAE paging (both variants), bit 7 is either reserved at
+ * all levels or indicates a huge page (ignoring CR3/EPTP). In either
+ * case, bit 7 being set terminates the walk.
+ */
+#if PTTYPE == 32
+ /*
+ * 32-bit paging requires special handling because bit 7 is ignored if
+ * CR4.PSE=0, not reserved. Clear bit 7 in the gpte if the level is
+ * greater than the last level for which bit 7 is the PAGE_SIZE bit.
+ *
+ * The RHS has bit 7 set iff level < (2 + PSE). If it is clear, bit 7
+ * is not reserved and does not indicate a large page at this level,
+ * so clear PT_PAGE_SIZE_MASK in gpte if that is the case.
+ */
+ gpte &= level - (PT32_ROOT_LEVEL + mmu->cpu_role.ext.cr4_pse);
+#endif
+ /*
+ * PG_LEVEL_4K always terminates. The RHS has bit 7 set
+ * iff level <= PG_LEVEL_4K, which for our purpose means
+ * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
+ */
+ gpte |= level - PG_LEVEL_4K - 1;
+
+ return gpte & PT_PAGE_SIZE_MASK;
+}
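/*
 * Concrete values for the arithmetic above (in KVM's mmu headers,
 * PG_LEVEL_4K == 1 and PT32_ROOT_LEVEL == 2):
 *
 *   gpte |= level - PG_LEVEL_4K - 1:
 *	level 1:  ORs in -1, forcing bit 7 on -> the walk terminates
 *	level 2+: ORs in a small non-negative value, leaving bit 7 alone
 *
 *   32-bit pre-step, gpte &= level - (2 + cr4_pse):
 *	CR4.PSE=0, level 2: ANDs with 0, clearing the (ignored) bit 7
 *	CR4.PSE=1, level 2: ANDs with -1, keeping bit 7 as PAGE_SIZE
 *	level 1:            the mask always has bit 7 set, gpte is kept
 */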
+/*
+ * Fetch a guest pte for a guest virtual address, or for an L2's GPA.
+ */
+static int FNAME(walk_addr_generic)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gpa_t addr, u64 access)
+{
+ int ret;
+ pt_element_t pte;
+ pt_element_t __user *ptep_user;
+ gfn_t table_gfn;
+ u64 pt_access, pte_access;
+ unsigned index, accessed_dirty, pte_pkey;
+ u64 nested_access;
+ gpa_t pte_gpa;
+ bool have_ad;
+ int offset;
+ u64 walk_nx_mask = 0;
+ const int write_fault = access & PFERR_WRITE_MASK;
+ const int user_fault = access & PFERR_USER_MASK;
+ const int fetch_fault = access & PFERR_FETCH_MASK;
+ u16 errcode = 0;
+ gpa_t real_gpa;
+ gfn_t gfn;
+
+ trace_kvm_mmu_pagetable_walk(addr, access);
+retry_walk:
+ walker->level = mmu->cpu_role.base.level;
+ pte = kvm_mmu_get_guest_pgd(vcpu, mmu);
+ have_ad = PT_HAVE_ACCESSED_DIRTY(mmu);
+
+#if PTTYPE == 64
+ walk_nx_mask = 1ULL << PT64_NX_SHIFT;
+ if (walker->level == PT32E_ROOT_LEVEL) {
+ pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
+ trace_kvm_mmu_paging_element(pte, walker->level);
+ if (!FNAME(is_present_gpte)(pte))
+ goto error;
+ --walker->level;
+ }
+#endif
+ walker->max_level = walker->level;
+
+ /*
+ * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
+ * by the MOV to CR instruction are treated as reads and do not cause the
+ * processor to set the dirty flag in any EPT paging-structure entry.
+ */
+ nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;
+
+ pte_access = ~0;
+
+ /*
+ * Queue a page fault for injection if this assertion fails, as callers
+ * assume that walker.fault contains sane info on a walk failure. I.e.
+ * avoid making the situation worse by inducing even worse badness
+ * between when the assertion fails and when KVM kicks the vCPU out to
+ * userspace (because the VM is bugged).
+ */
+ if (KVM_BUG_ON(is_long_mode(vcpu) && !is_pae(vcpu), vcpu->kvm))
+ goto error;
+
+ ++walker->level;
+
+ do {
+ struct kvm_memory_slot *slot;
+ unsigned long host_addr;
+
+ pt_access = pte_access;
+ --walker->level;
+
+ index = PT_INDEX(addr, walker->level);
+ table_gfn = gpte_to_gfn(pte);
+ offset = index * sizeof(pt_element_t);
+ pte_gpa = gfn_to_gpa(table_gfn) + offset;
+
+ BUG_ON(walker->level < 1);
+ walker->table_gfn[walker->level - 1] = table_gfn;
+ walker->pte_gpa[walker->level - 1] = pte_gpa;
+
+ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn),
+ nested_access, &walker->fault);
+
+ /*
+ * FIXME: This can happen if emulation (of an INS/OUTS
+ * instruction) triggers a nested page fault. The exit
+ * qualification / exit info field will incorrectly have
+ * "guest page access" as the nested page fault's cause,
+ * instead of "guest page structure access". To fix this,
+ * the x86_exception struct should be augmented with enough
+ * information to fix the exit_qualification or exit_info_1
+ * fields.
+ */
+ if (unlikely(real_gpa == INVALID_GPA))
+ return 0;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(real_gpa));
+ if (!kvm_is_visible_memslot(slot))
+ goto error;
+
+ host_addr = gfn_to_hva_memslot_prot(slot, gpa_to_gfn(real_gpa),
+ &walker->pte_writable[walker->level - 1]);
+ if (unlikely(kvm_is_error_hva(host_addr)))
+ goto error;
+
+ ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
+ if (unlikely(__get_user(pte, ptep_user)))
+ goto error;
+ walker->ptep_user[walker->level - 1] = ptep_user;
+
+ trace_kvm_mmu_paging_element(pte, walker->level);
+
+ /*
+ * Inverting the NX bit lets us AND it like the other
+ * permission bits.
+ */
+ pte_access = pt_access & (pte ^ walk_nx_mask);
+
+ if (unlikely(!FNAME(is_present_gpte)(pte)))
+ goto error;
+
+ if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) {
+ errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
+ goto error;
+ }
+
+ walker->ptes[walker->level - 1] = pte;
+
+ /* Convert to ACC_*_MASK flags for struct guest_walker. */
+ walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
+ } while (!FNAME(is_last_gpte)(mmu, walker->level, pte));
+
+ pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
+ accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
+
+ /* Convert to ACC_*_MASK flags for struct guest_walker. */
+ walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
+ errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
+ if (unlikely(errcode))
+ goto error;
+
+ gfn = gpte_to_gfn_lvl(pte, walker->level);
+ gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
+
+#if PTTYPE == 32
+ if (walker->level > PG_LEVEL_4K && is_cpuid_PSE36())
+ gfn += pse36_gfn_delta(pte);
+#endif
+
+ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault);
+ if (real_gpa == INVALID_GPA)
+ return 0;
+
+ walker->gfn = real_gpa >> PAGE_SHIFT;
+
+ if (!write_fault)
+ FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte);
+ else
+ /*
+ * On a write fault, fold the dirty bit into accessed_dirty.
+ * For modes without A/D bits support accessed_dirty will be
+ * always clear.
+ */
+ accessed_dirty &= pte >>
+ (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
+
+ if (unlikely(!accessed_dirty)) {
+ ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker,
+ addr, write_fault);
+ if (unlikely(ret < 0))
+ goto error;
+ else if (ret)
+ goto retry_walk;
+ }
+
+ return 1;
+
+error:
+ errcode |= write_fault | user_fault;
+ if (fetch_fault && (is_efer_nx(mmu) || is_cr4_smep(mmu)))
+ errcode |= PFERR_FETCH_MASK;
+
+ walker->fault.vector = PF_VECTOR;
+ walker->fault.error_code_valid = true;
+ walker->fault.error_code = errcode;
+
+#if PTTYPE == PTTYPE_EPT
+ /*
+ * Use PFERR_RSVD_MASK in error_code to tell if an EPT
+ * misconfiguration needs to be injected. The detection is
+ * done by is_rsvd_bits_set() above.
+ *
+ * We set up the value of exit_qualification to inject:
+ * [2:0] - Derived from the access bits. The exit_qualification might be
+ * out of date if it is serving an EPT misconfiguration.
+ * [5:3] - Calculated by the page walk of the guest EPT page tables
+ * [7:8] - Derived from [7:8] of real exit_qualification
+ *
+ * The other bits are set to 0.
+ */
+ if (!(errcode & PFERR_RSVD_MASK)) {
+ vcpu->arch.exit_qualification &= (EPT_VIOLATION_GVA_IS_VALID |
+ EPT_VIOLATION_GVA_TRANSLATED);
+ if (write_fault)
+ vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
+ if (user_fault)
+ vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ;
+ if (fetch_fault)
+ vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
+
+ /*
+ * Note, pte_access holds the raw RWX bits from the EPTE, not
+ * ACC_*_MASK flags!
+ */
+ vcpu->arch.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) <<
+ EPT_VIOLATION_RWX_SHIFT;
+ }
+#endif
+ walker->fault.address = addr;
+ walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
+ walker->fault.async_page_fault = false;
+
+ trace_kvm_mmu_walker_error(walker->fault.error_code);
+ return 0;
+}
+
+static int FNAME(walk_addr)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, gpa_t addr, u64 access)
+{
+ return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr,
+ access);
+}
+
+static bool
+FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ u64 *spte, pt_element_t gpte)
+{
+ struct kvm_memory_slot *slot;
+ unsigned pte_access;
+ gfn_t gfn;
+ kvm_pfn_t pfn;
+
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
+ return false;
+
+ gfn = gpte_to_gfn(gpte);
+ pte_access = sp->role.access & FNAME(gpte_access)(gpte);
+ FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
+
+ slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, pte_access & ACC_WRITE_MASK);
+ if (!slot)
+ return false;
+
+ pfn = gfn_to_pfn_memslot_atomic(slot, gfn);
+ if (is_error_pfn(pfn))
+ return false;
+
+ mmu_set_spte(vcpu, slot, spte, pte_access, gfn, pfn, NULL);
+ kvm_release_pfn_clean(pfn);
+ return true;
+}
+
+static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
+ struct guest_walker *gw, int level)
+{
+ pt_element_t curr_pte;
+ gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
+ u64 mask;
+ int r, index;
+
+ if (level == PG_LEVEL_4K) {
+ mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
+ base_gpa = pte_gpa & ~mask;
+ index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
+
+ r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa,
+ gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
+ curr_pte = gw->prefetch_ptes[index];
+ } else
+ r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa,
+ &curr_pte, sizeof(curr_pte));
+
+ return r || curr_pte != gw->ptes[level - 1];
+}
+
+static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
+ u64 *sptep)
+{
+ struct kvm_mmu_page *sp;
+ pt_element_t *gptep = gw->prefetch_ptes;
+ u64 *spte;
+ int i;
+
+ sp = sptep_to_sp(sptep);
+
+ if (sp->role.level > PG_LEVEL_4K)
+ return;
+
+ /*
+ * If addresses are being invalidated, skip prefetching to avoid
+ * accidentally prefetching those addresses.
+ */
+ if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))
+ return;
+
+ if (sp->role.direct)
+ return __direct_pte_prefetch(vcpu, sp, sptep);
+
+ i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
+ spte = sp->spt + i;
+
+ for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+ if (spte == sptep)
+ continue;
+
+ if (is_shadow_present_pte(*spte))
+ continue;
+
+ if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i]))
+ break;
+ }
+}
+
+/*
+ * Fetch a shadow pte for a specific level in the paging hierarchy.
+ * If the guest tries to write a write-protected page, we need to
+ * emulate this operation; RET_PF_EMULATE is returned to indicate this case.
+ */
+static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
+ struct guest_walker *gw)
+{
+ struct kvm_mmu_page *sp = NULL;
+ struct kvm_shadow_walk_iterator it;
+ unsigned int direct_access, access;
+ int top_level, ret;
+ gfn_t base_gfn = fault->gfn;
+
+ WARN_ON_ONCE(gw->gfn != base_gfn);
+ direct_access = gw->pte_access;
+
+ top_level = vcpu->arch.mmu->cpu_role.base.level;
+ if (top_level == PT32E_ROOT_LEVEL)
+ top_level = PT32_ROOT_LEVEL;
+ /*
+ * Verify that the top-level gpte is still there. Since the page
+ * is a root page, it is either write protected (and cannot be
+ * changed from now on) or it is invalid (in which case, we don't
+ * really care if it changes underneath us after this point).
+ */
+ if (FNAME(gpte_changed)(vcpu, gw, top_level))
+ goto out_gpte_changed;
+
+ if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
+ goto out_gpte_changed;
+
+ /*
+ * Load a new root and retry the faulting instruction in the extremely
+ * unlikely scenario that the guest root gfn became visible between
+ * loading a dummy root and handling the resulting page fault, e.g. if
+ * userspace creates a memslot in the interim.
+ */
+ if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) {
+ kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu);
+ goto out_gpte_changed;
+ }
+
+ for_each_shadow_entry(vcpu, fault->addr, it) {
+ gfn_t table_gfn;
+
+ clear_sp_write_flooding_count(it.sptep);
+ if (it.level == gw->level)
+ break;
+
+ table_gfn = gw->table_gfn[it.level - 2];
+ access = gw->pt_access[it.level - 2];
+ sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn,
+ false, access);
+
+ if (sp != ERR_PTR(-EEXIST)) {
+ /*
+ * We must synchronize the pagetable before linking it
+ * because the guest doesn't need to flush the TLB when
+ * the gpte is changed from non-present to present.
+ * Otherwise, the guest may use the wrong mapping.
+ *
+ * For PG_LEVEL_4K, kvm_mmu_get_page() has already
+ * synchronized it transiently via kvm_sync_page().
+ *
+ * For higher level pagetable, we synchronize it via
+ * the slower mmu_sync_children(). If it needs to
+ * break, some progress has been made; return
+ * RET_PF_RETRY and retry on the next #PF.
+ * KVM_REQ_MMU_SYNC is not necessary but it
+ * expedites the process.
+ */
+ if (sp->unsync_children &&
+ mmu_sync_children(vcpu, sp, false))
+ return RET_PF_RETRY;
+ }
+
+ /*
+ * Verify that the gpte in the page we've just write
+ * protected is still there.
+ */
+ if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
+ goto out_gpte_changed;
+
+ if (sp != ERR_PTR(-EEXIST))
+ link_shadow_page(vcpu, it.sptep, sp);
+
+ if (fault->write && table_gfn == fault->gfn)
+ fault->write_fault_to_shadow_pgtable = true;
+ }
+
+ /*
+ * Adjust the hugepage size _after_ resolving indirect shadow pages.
+ * KVM doesn't support mapping hugepages into the guest for gfns that
+ * are being shadowed by KVM, i.e. allocating a new shadow page may
+ * affect the allowed hugepage size.
+ */
+ kvm_mmu_hugepage_adjust(vcpu, fault);
+
+ trace_kvm_mmu_spte_requested(fault);
+
+ for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ if (fault->nx_huge_page_workaround_enabled)
+ disallowed_hugepage_adjust(fault, *it.sptep, it.level);
+
+ base_gfn = gfn_round_for_level(fault->gfn, it.level);
+ if (it.level == fault->goal_level)
+ break;
+
+ validate_direct_spte(vcpu, it.sptep, direct_access);
+
+ sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn,
+ true, direct_access);
+ if (sp == ERR_PTR(-EEXIST))
+ continue;
+
+ link_shadow_page(vcpu, it.sptep, sp);
+ if (fault->huge_page_disallowed)
+ account_nx_huge_page(vcpu->kvm, sp,
+ fault->req_level >= it.level);
+ }
+
+ if (WARN_ON_ONCE(it.level != fault->goal_level))
+ return -EFAULT;
+
+ ret = mmu_set_spte(vcpu, fault->slot, it.sptep, gw->pte_access,
+ base_gfn, fault->pfn, fault);
+ if (ret == RET_PF_SPURIOUS)
+ return ret;
+
+ FNAME(pte_prefetch)(vcpu, gw, it.sptep);
+ return ret;
+
+out_gpte_changed:
+ return RET_PF_RETRY;
+}
+
+/*
+ * Page fault handler. There are several causes for a page fault:
+ * - there is no shadow pte for the guest pte
+ * - write access through a shadow pte marked read only so that we can set
+ * the dirty bit
+ * - write access to a shadow pte marked read only so we can update the page
+ * dirty bitmap, when userspace requests it
+ * - mmio access; in this case we will never install a present shadow pte
+ * - normal guest page fault due to the guest pte marked not present, not
+ * writable, or not executable
+ *
+ * Returns: one of the RET_PF_* codes from mmu_internal.h, or a negative
+ * value on error.
+ */
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ struct guest_walker walker;
+ int r;
+
+ WARN_ON_ONCE(fault->is_tdp);
+
+ /*
+ * Look up the guest pte for the faulting address.
+ * If PFEC.RSVD is set, this is a shadow page fault.
+ * The bit needs to be cleared before walking guest page tables.
+ */
+ r = FNAME(walk_addr)(&walker, vcpu, fault->addr,
+ fault->error_code & ~PFERR_RSVD_MASK);
+
+ /*
+ * The page is not mapped by the guest. Let the guest handle it.
+ */
+ if (!r) {
+ if (!fault->prefetch)
+ kvm_inject_emulated_page_fault(vcpu, &walker.fault);
+
+ return RET_PF_RETRY;
+ }
+
+ fault->gfn = walker.gfn;
+ fault->max_level = walker.level;
+ fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
+
+ if (page_fault_handle_page_track(vcpu, fault)) {
+ shadow_page_table_clear_flood(vcpu, fault->addr);
+ return RET_PF_EMULATE;
+ }
+
+ r = mmu_topup_memory_caches(vcpu, true);
+ if (r)
+ return r;
+
+ r = kvm_faultin_pfn(vcpu, fault, walker.pte_access);
+ if (r != RET_PF_CONTINUE)
+ return r;
+
+ /*
+ * Do not change pte_access if the pfn is an MMIO page, otherwise
+ * we will cache the incorrect access into the MMIO SPTE.
+ */
+ if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) &&
+ !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) {
+ walker.pte_access |= ACC_WRITE_MASK;
+ walker.pte_access &= ~ACC_USER_MASK;
+
+ /*
+ * If we converted a user page to a kernel page so
+ * that the kernel can write to it when CR0.WP=0, then
+ * we should prevent the kernel from executing it
+ * if SMEP is enabled.
+ */
+ if (is_cr4_smep(vcpu->arch.mmu))
+ walker.pte_access &= ~ACC_EXEC_MASK;
+ }
+
+ r = RET_PF_RETRY;
+ write_lock(&vcpu->kvm->mmu_lock);
+
+ if (is_page_fault_stale(vcpu, fault))
+ goto out_unlock;
+
+ r = make_mmu_pages_available(vcpu);
+ if (r)
+ goto out_unlock;
+ r = FNAME(fetch)(vcpu, fault, &walker);
+
+out_unlock:
+ write_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(fault->pfn);
+ return r;
+}
+
+static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
+{
+ int offset = 0;
+
+ WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K);
+
+ if (PTTYPE == 32)
+ offset = sp->role.quadrant << SPTE_LEVEL_BITS;
+
+ return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
+}
+
+/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gpa_t addr, u64 access,
+ struct x86_exception *exception)
+{
+ struct guest_walker walker;
+ gpa_t gpa = INVALID_GPA;
+ int r;
+
+#ifndef CONFIG_X86_64
+ /* A 64-bit GVA should be impossible on 32-bit KVM. */
+ WARN_ON_ONCE((addr >> 32) && mmu == vcpu->arch.walk_mmu);
+#endif
+
+ r = FNAME(walk_addr_generic)(&walker, vcpu, mmu, addr, access);
+
+ if (r) {
+ gpa = gfn_to_gpa(walker.gfn);
+ gpa |= addr & ~PAGE_MASK;
+ } else if (exception)
+ *exception = walker.fault;
+
+ return gpa;
+}
+
+/*
+ * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is
+ * safe because:
+ * - The spte has a reference to the struct page, so the pfn for a given gfn
+ * can't change unless all sptes pointing to it are nuked first.
+ *
+ * Returns
+ * < 0: failed to sync spte
+ * 0: the spte is synced and no tlb flushing is required
+ * > 0: the spte is synced and tlb flushing is required
+ */
+static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
+{
+ bool host_writable;
+ gpa_t first_pte_gpa;
+ u64 *sptep, spte;
+ struct kvm_memory_slot *slot;
+ unsigned pte_access;
+ pt_element_t gpte;
+ gpa_t pte_gpa;
+ gfn_t gfn;
+
+ if (WARN_ON_ONCE(!sp->spt[i]))
+ return 0;
+
+ first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
+ pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
+
+ if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
+ sizeof(pt_element_t)))
+ return -1;
+
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte))
+ return 1;
+
+ gfn = gpte_to_gfn(gpte);
+ pte_access = sp->role.access;
+ pte_access &= FNAME(gpte_access)(gpte);
+ FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
+
+ if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
+ return 0;
+
+ /*
+ * Drop the SPTE if the new protections would result in a RWX=0
+ * SPTE or if the gfn is changing. The RWX=0 case only affects
+ * EPT with execute-only support, i.e. EPT without an effective
+ * "present" bit, as all other paging modes will create a
+ * read-only SPTE if pte_access is zero.
+ */
+ if ((!pte_access && !shadow_present_mask) ||
+ gfn != kvm_mmu_page_get_gfn(sp, i)) {
+ drop_spte(vcpu->kvm, &sp->spt[i]);
+ return 1;
+ }
+ /*
+ * Do nothing if the permissions are unchanged. The existing SPTE is
+ * still valid, and prefetch_invalid_gpte() has verified that the A/D bits
+ * are set in the "new" gPTE, i.e. there is no danger of missing an A/D
+ * update due to A/D bits being set in the SPTE but not the gPTE.
+ */
+ if (kvm_mmu_page_get_access(sp, i) == pte_access)
+ return 0;
+
+ /* Update the shadowed access bits in case they changed. */
+ kvm_mmu_page_set_access(sp, i, pte_access);
+
+ sptep = &sp->spt[i];
+ spte = *sptep;
+ host_writable = spte & shadow_host_writable_mask;
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ make_spte(vcpu, sp, slot, pte_access, gfn,
+ spte_to_pfn(spte), spte, true, false,
+ host_writable, &spte);
+
+ return mmu_spte_update(sptep, spte);
+}
+
+#undef pt_element_t
+#undef guest_walker
+#undef FNAME
+#undef PT_BASE_ADDR_MASK
+#undef PT_INDEX
+#undef PT_LVL_ADDR_MASK
+#undef PT_LVL_OFFSET_MASK
+#undef PT_LEVEL_BITS
+#undef PT_MAX_FULL_LEVELS
+#undef gpte_to_gfn
+#undef gpte_to_gfn_lvl
+#undef PT_GUEST_ACCESSED_MASK
+#undef PT_GUEST_DIRTY_MASK
+#undef PT_GUEST_DIRTY_SHIFT
+#undef PT_GUEST_ACCESSED_SHIFT
+#undef PT_HAVE_ACCESSED_DIRTY
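The #define/#undef bracket above is C's poor-man's template: mmu.c includes
this file three times with PTTYPE set to 64, 32, and PTTYPE_EPT, stamping
out paging64_*, paging32_* and ept_* instances of every FNAME() function.
A toy standalone version of the same idiom (file names and helpers are
hypothetical):

/* walker_tmpl.h - no include guard; each #include emits one copy. */
static unsigned long FNAME(index)(unsigned long addr, int level)
{
	/* PT_LEVEL_BITS differs per instantiation, as in paging_tmpl.h. */
	return (addr >> (12 + (level - 1) * PT_LEVEL_BITS)) &
	       ((1UL << PT_LEVEL_BITS) - 1);
}
#undef FNAME
#undef PT_LEVEL_BITS

/* main.c */
#include <stdio.h>

#define FNAME(name) walker64_##name
#define PT_LEVEL_BITS 9
#include "walker_tmpl.h"

#define FNAME(name) walker32_##name
#define PT_LEVEL_BITS 10
#include "walker_tmpl.h"

int main(void)
{
	printf("%lu %lu\n", walker64_index(0xdeadbeef000UL, 2),
	       walker32_index(0xbeef000UL, 2));
	return 0;
}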
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
new file mode 100644
index 000000000..4a599130e
--- /dev/null
+++ b/arch/x86/kvm/mmu/spte.c
@@ -0,0 +1,517 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * Macros and functions to access KVM PTEs (also known as SPTEs)
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2020 Red Hat, Inc. and/or its affiliates.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include "mmu.h"
+#include "mmu_internal.h"
+#include "x86.h"
+#include "spte.h"
+
+#include <asm/e820/api.h>
+#include <asm/memtype.h>
+#include <asm/vmx.h>
+
+bool __read_mostly enable_mmio_caching = true;
+static bool __ro_after_init allow_mmio_caching;
+module_param_named(mmio_caching, enable_mmio_caching, bool, 0444);
+EXPORT_SYMBOL_GPL(enable_mmio_caching);
+
+u64 __read_mostly shadow_host_writable_mask;
+u64 __read_mostly shadow_mmu_writable_mask;
+u64 __read_mostly shadow_nx_mask;
+u64 __read_mostly shadow_x_mask; /* mutually exclusive with nx_mask */
+u64 __read_mostly shadow_user_mask;
+u64 __read_mostly shadow_accessed_mask;
+u64 __read_mostly shadow_dirty_mask;
+u64 __read_mostly shadow_mmio_value;
+u64 __read_mostly shadow_mmio_mask;
+u64 __read_mostly shadow_mmio_access_mask;
+u64 __read_mostly shadow_present_mask;
+u64 __read_mostly shadow_memtype_mask;
+u64 __read_mostly shadow_me_value;
+u64 __read_mostly shadow_me_mask;
+u64 __read_mostly shadow_acc_track_mask;
+
+u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
+u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
+
+u8 __read_mostly shadow_phys_bits;
+
+void __init kvm_mmu_spte_module_init(void)
+{
+ /*
+ * Snapshot userspace's desire to allow MMIO caching. Whether or not
+ * KVM can actually enable MMIO caching depends on vendor-specific
+ * hardware capabilities and other module params that can't be resolved
+ * until the vendor module is loaded, i.e. enable_mmio_caching can and
+ * will change when the vendor module is (re)loaded.
+ */
+ allow_mmio_caching = enable_mmio_caching;
+}
+
+static u64 generation_mmio_spte_mask(u64 gen)
+{
+ u64 mask;
+
+ WARN_ON_ONCE(gen & ~MMIO_SPTE_GEN_MASK);
+
+ mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
+ mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
+ return mask;
+}
+
+u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
+{
+ u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
+ u64 spte = generation_mmio_spte_mask(gen);
+ u64 gpa = gfn << PAGE_SHIFT;
+
+ WARN_ON_ONCE(!shadow_mmio_value);
+
+ access &= shadow_mmio_access_mask;
+ spte |= shadow_mmio_value | access;
+ spte |= gpa | shadow_nonpresent_or_rsvd_mask;
+ spte |= (gpa & shadow_nonpresent_or_rsvd_mask)
+ << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
+
+ return spte;
+}
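/*
 * The generation is deliberately split across two disjoint bit ranges
 * of the SPTE (the exact MMIO_SPTE_GEN_* positions live in spte.h and
 * aren't shown here). Schematically, for a low window of N bits:
 *
 *	low  = (gen << LOW_SHIFT)  & LOW_MASK;	// gen bits [N-1:0]
 *	high = (gen << HIGH_SHIFT) & HIGH_MASK;	// the remaining gen bits
 *	spte |= low | high;
 *
 * get_mmio_spte_generation() reverses the split, letting a cached MMIO
 * SPTE be compared against the current memslots generation (see
 * check_mmio_spte in mmutrace.h) and re-faulted once it goes stale.
 */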
+
+static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
+{
+ if (pfn_valid(pfn))
+ return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
+ /*
+ * Some reserved pages, such as those from NVDIMM
+ * DAX devices, are not for MMIO, and can be mapped
+ * with cached memory type for better performance.
+ * However, the check above misidentifies those pages
+ * as MMIO, and KVM would then map them with the UC
+ * memory type, which hurts performance.
+ * Therefore, we check the host memory type in addition
+ * and only treat UC/UC-/WC pages as MMIO.
+ */
+ (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
+
+ return !e820__mapped_raw_any(pfn_to_hpa(pfn),
+ pfn_to_hpa(pfn + 1) - 1,
+ E820_TYPE_RAM);
+}
+
+/*
+ * Returns true if the SPTE has bits that may be set without holding mmu_lock.
+ * The caller is responsible for checking if the SPTE is shadow-present, and
+ * for determining whether or not the caller cares about non-leaf SPTEs.
+ */
+bool spte_has_volatile_bits(u64 spte)
+{
+ /*
+ * Always update the SPTE atomically if it can be updated
+ * outside of mmu_lock: this ensures the dirty bit is not
+ * lost and yields a stable is_writable_pte(), so that a
+ * TLB flush is not missed.
+ */
+ if (!is_writable_pte(spte) && is_mmu_writable_spte(spte))
+ return true;
+
+ if (is_access_track_spte(spte))
+ return true;
+
+ if (spte_ad_enabled(spte)) {
+ if (!(spte & shadow_accessed_mask) ||
+ (is_writable_pte(spte) && !(spte & shadow_dirty_mask)))
+ return true;
+ }
+
+ return false;
+}
+
+bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ const struct kvm_memory_slot *slot,
+ unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
+ u64 old_spte, bool prefetch, bool can_unsync,
+ bool host_writable, u64 *new_spte)
+{
+ int level = sp->role.level;
+ u64 spte = SPTE_MMU_PRESENT_MASK;
+ bool wrprot = false;
+
+ WARN_ON_ONCE(!pte_access && !shadow_present_mask);
+
+ if (sp->role.ad_disabled)
+ spte |= SPTE_TDP_AD_DISABLED;
+ else if (kvm_mmu_page_ad_need_write_protect(sp))
+ spte |= SPTE_TDP_AD_WRPROT_ONLY;
+
+ /*
+ * For the EPT case, shadow_present_mask is 0 if hardware
+ * supports exec-only page table entries. In that case,
+ * ACC_USER_MASK and shadow_user_mask are used to represent
+ * read access. See FNAME(gpte_access) in paging_tmpl.h.
+ */
+ spte |= shadow_present_mask;
+ if (!prefetch)
+ spte |= spte_shadow_accessed_mask(spte);
+
+ /*
+ * For simplicity, enforce the NX huge page mitigation even if not
+ * strictly necessary. KVM could ignore the mitigation if paging is
+ * disabled in the guest, as the guest doesn't have any page tables to
+ * abuse. But to safely ignore the mitigation, KVM would have to
+ * ensure a new MMU is loaded (or all shadow pages zapped) when CR0.PG
+ * is toggled on, and that's a net negative for performance when TDP is
+ * enabled. When TDP is disabled, KVM will always switch to a new MMU
+ * when CR0.PG is toggled, but leveraging that to ignore the mitigation
+ * would tie make_spte() further to vCPU/MMU state, and add complexity
+ * just to optimize a mode that is anything but performance critical.
+ */
+ if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
+ is_nx_huge_page_enabled(vcpu->kvm)) {
+ pte_access &= ~ACC_EXEC_MASK;
+ }
+
+ if (pte_access & ACC_EXEC_MASK)
+ spte |= shadow_x_mask;
+ else
+ spte |= shadow_nx_mask;
+
+ if (pte_access & ACC_USER_MASK)
+ spte |= shadow_user_mask;
+
+ if (level > PG_LEVEL_4K)
+ spte |= PT_PAGE_SIZE_MASK;
+
+ if (shadow_memtype_mask)
+ spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn,
+ kvm_is_mmio_pfn(pfn));
+ if (host_writable)
+ spte |= shadow_host_writable_mask;
+ else
+ pte_access &= ~ACC_WRITE_MASK;
+
+ if (shadow_me_value && !kvm_is_mmio_pfn(pfn))
+ spte |= shadow_me_value;
+
+ spte |= (u64)pfn << PAGE_SHIFT;
+
+ if (pte_access & ACC_WRITE_MASK) {
+ spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask;
+
+ /*
+ * Optimization: for pte sync, if spte was writable the hash
+ * lookup is unnecessary (and expensive). Write protection
+ * is the responsibility of kvm_mmu_get_page / kvm_mmu_sync_roots.
+ * The same reasoning applies to dirty page accounting.
+ */
+ if (is_writable_pte(old_spte))
+ goto out;
+
+ /*
+ * Unsync shadow pages that are reachable by the new, writable
+ * SPTE. Write-protect the SPTE if the page can't be unsync'd,
+ * e.g. it's write-tracked (upper-level SPs) or has one or more
+ * shadow pages and unsync'ing pages is not allowed.
+ */
+ if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, can_unsync, prefetch)) {
+ wrprot = true;
+ pte_access &= ~ACC_WRITE_MASK;
+ spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
+ }
+ }
+
+ if (pte_access & ACC_WRITE_MASK)
+ spte |= spte_shadow_dirty_mask(spte);
+
+out:
+ if (prefetch)
+ spte = mark_spte_for_access_track(spte);
+
+ WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level),
+ "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level,
+ get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level));
+
+ if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) {
+ /* Enforced by kvm_mmu_hugepage_adjust. */
+ WARN_ON_ONCE(level > PG_LEVEL_4K);
+ mark_page_dirty_in_slot(vcpu->kvm, slot, gfn);
+ }
+
+ *new_spte = spte;
+ return wrprot;
+}
+
+static u64 make_spte_executable(u64 spte)
+{
+ bool is_access_track = is_access_track_spte(spte);
+
+ if (is_access_track)
+ spte = restore_acc_track_spte(spte);
+
+ spte &= ~shadow_nx_mask;
+ spte |= shadow_x_mask;
+
+ if (is_access_track)
+ spte = mark_spte_for_access_track(spte);
+
+ return spte;
+}
+
+/*
+ * Construct an SPTE that maps a sub-page of the given huge page SPTE where
+ * `index` identifies which sub-page.
+ *
+ * This is used during huge page splitting to build the SPTEs that make up the
+ * new page table.
+ */
+u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page_role role,
+ int index)
+{
+ u64 child_spte;
+
+ if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte)))
+ return 0;
+
+ if (WARN_ON_ONCE(!is_large_pte(huge_spte)))
+ return 0;
+
+ child_spte = huge_spte;
+
+ /*
+ * The child_spte already has the base address of the huge page being
+ * split. So we just have to OR in the offset to the page at the next
+ * lower level for the given index.
+ */
+ child_spte |= (index * KVM_PAGES_PER_HPAGE(role.level)) << PAGE_SHIFT;
+
+ if (role.level == PG_LEVEL_4K) {
+ child_spte &= ~PT_PAGE_SIZE_MASK;
+
+ /*
+ * When splitting to a 4K page where execution is allowed, mark
+ * the page executable as the NX hugepage mitigation no longer
+ * applies.
+ */
+ if ((role.access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(kvm))
+ child_spte = make_spte_executable(child_spte);
+ }
+
+ return child_spte;
+}
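+
+/*
+ * Worked example (illustrative): when a 2MiB SPTE is split, role.level is
+ * PG_LEVEL_4K and KVM_PAGES_PER_HPAGE(PG_LEVEL_4K) is 1, so child i maps
+ * huge_page_base + (i << PAGE_SHIFT) and has PT_PAGE_SIZE_MASK cleared,
+ * yielding 512 4KiB leaf SPTEs.
+ */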
+
+u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
+{
+ u64 spte = SPTE_MMU_PRESENT_MASK;
+
+ spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
+ shadow_user_mask | shadow_x_mask | shadow_me_value;
+
+ if (ad_disabled)
+ spte |= SPTE_TDP_AD_DISABLED;
+ else
+ spte |= shadow_accessed_mask;
+
+ return spte;
+}
+
+u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
+{
+ u64 new_spte;
+
+ new_spte = old_spte & ~SPTE_BASE_ADDR_MASK;
+ new_spte |= (u64)new_pfn << PAGE_SHIFT;
+
+ new_spte &= ~PT_WRITABLE_MASK;
+ new_spte &= ~shadow_host_writable_mask;
+ new_spte &= ~shadow_mmu_writable_mask;
+
+ new_spte = mark_spte_for_access_track(new_spte);
+
+ return new_spte;
+}
+
+u64 mark_spte_for_access_track(u64 spte)
+{
+ if (spte_ad_enabled(spte))
+ return spte & ~shadow_accessed_mask;
+
+ if (is_access_track_spte(spte))
+ return spte;
+
+ check_spte_writable_invariants(spte);
+
+ WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT),
+ "Access Tracking saved bit locations are not zero\n");
+
+ spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) <<
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT;
+ spte &= ~shadow_acc_track_mask;
+
+ return spte;
+}
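+
+/*
+ * Illustrative round trip, assuming EPT with A/D bits disabled: for an SPTE
+ * with R and X set (bits 0 and 2), mark_spte_for_access_track() copies those
+ * bits to bits 54 and 56 and clears the RWX bits; restore_acc_track_spte()
+ * in spte.h shifts the saved bits back down and clears the saved copies.
+ */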
+
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
+{
+ BUG_ON((u64)(unsigned)access_mask != access_mask);
+ WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
+
+ /*
+ * Reset to the original module param value to honor userspace's desire
+ * to (dis)allow MMIO caching. Update the param itself so that
+ * userspace can see whether or not KVM is actually using MMIO caching.
+ */
+ enable_mmio_caching = allow_mmio_caching;
+ if (!enable_mmio_caching)
+ mmio_value = 0;
+
+ /*
+ * The mask must contain only bits that are carved out specifically for
+ * the MMIO SPTE mask, e.g. to ensure there's no overlap with the MMIO
+ * generation.
+ */
+ if (WARN_ON(mmio_mask & ~SPTE_MMIO_ALLOWED_MASK))
+ mmio_value = 0;
+
+ /*
+ * Disable MMIO caching if the MMIO value collides with the bits that
+ * are used to hold the relocated GFN when the L1TF mitigation is
+ * enabled. This should never fire as there is no known hardware that
+ * can trigger this condition, e.g. SME/SEV CPUs that require a custom
+ * MMIO value are not susceptible to L1TF.
+ */
+ if (WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask <<
+ SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)))
+ mmio_value = 0;
+
+ /*
+ * The masked MMIO value must obviously match itself and a removed SPTE
+ * must not get a false positive. Removed SPTEs and MMIO SPTEs should
+ * never collide as MMIO must set some RWX bits, and removed SPTEs must
+ * not set any RWX bits.
+ */
+ if (WARN_ON((mmio_value & mmio_mask) != mmio_value) ||
+ WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value))
+ mmio_value = 0;
+
+ if (!mmio_value)
+ enable_mmio_caching = false;
+
+ shadow_mmio_value = mmio_value;
+ shadow_mmio_mask = mmio_mask;
+ shadow_mmio_access_mask = access_mask;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+
+void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask)
+{
+ /* shadow_me_value must be a subset of shadow_me_mask */
+ if (WARN_ON(me_value & ~me_mask))
+ me_value = me_mask = 0;
+
+ shadow_me_value = me_value;
+ shadow_me_mask = me_mask;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_me_spte_mask);
+
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
+{
+ shadow_user_mask = VMX_EPT_READABLE_MASK;
+ shadow_accessed_mask = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull;
+ shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull;
+ shadow_nx_mask = 0ull;
+ shadow_x_mask = VMX_EPT_EXECUTABLE_MASK;
+ shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK;
+ /*
+ * EPT overrides the host MTRRs, and so KVM must program the desired
+ * memtype directly into the SPTEs. Note, this mask is just the mask
+ * of all bits that factor into the memtype, the actual memtype must be
+ * dynamically calculated, e.g. to ensure host MMIO is mapped UC.
+ */
+ shadow_memtype_mask = VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT;
+ shadow_acc_track_mask = VMX_EPT_RWX_MASK;
+ shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
+ shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE;
+
+ /*
+ * EPT Misconfigurations are generated if the value of bits 2:0
+ * of an EPT paging-structure entry is 110b (write/execute).
+ */
+ kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE,
+ VMX_EPT_RWX_MASK, 0);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks);
+
+void kvm_mmu_reset_all_pte_masks(void)
+{
+ u8 low_phys_bits;
+ u64 mask;
+
+ shadow_phys_bits = kvm_get_shadow_phys_bits();
+
+ /*
+ * If the CPU has 46 or fewer physical address bits, then set an
+ * appropriate mask to guard against L1TF attacks. Otherwise, it is
+ * assumed that the CPU is not vulnerable to L1TF.
+ *
+ * Some Intel CPUs address the L1 cache using more PA bits than are
+ * reported by CPUID. Use the PA width of the L1 cache when possible
+ * to achieve more effective mitigation, e.g. if system RAM overlaps
+ * the most significant bits of legal physical address space.
+ */
+ shadow_nonpresent_or_rsvd_mask = 0;
+ low_phys_bits = boot_cpu_data.x86_phys_bits;
+ if (boot_cpu_has_bug(X86_BUG_L1TF) &&
+ !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
+ 52 - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)) {
+ low_phys_bits = boot_cpu_data.x86_cache_bits
+ - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
+ shadow_nonpresent_or_rsvd_mask =
+ rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
+ }
+
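+ /*
+ * Arithmetic sketch (illustrative): on a hypothetical part reporting
+ * 46 cache bits, low_phys_bits = 46 - 5 = 41, the reserved mask
+ * covers PA bits 41-45, and the GFN bits overlapping that mask are
+ * shifted up by 5 bits when stashed in a non-present SPTE. The mask
+ * below covers the GFN bits that stay in place.
+ */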
+ shadow_nonpresent_or_rsvd_lower_gfn_mask =
+ GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
+
+ shadow_user_mask = PT_USER_MASK;
+ shadow_accessed_mask = PT_ACCESSED_MASK;
+ shadow_dirty_mask = PT_DIRTY_MASK;
+ shadow_nx_mask = PT64_NX_MASK;
+ shadow_x_mask = 0;
+ shadow_present_mask = PT_PRESENT_MASK;
+
+ /*
+ * For shadow paging and NPT, KVM uses PAT entry '0' to encode WB
+ * memtype in the SPTEs, i.e. relies on host MTRRs to provide the
+ * correct memtype (WB is the "weakest" memtype).
+ */
+ shadow_memtype_mask = 0;
+ shadow_acc_track_mask = 0;
+ shadow_me_mask = 0;
+ shadow_me_value = 0;
+
+ shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITABLE;
+ shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITABLE;
+
+ /*
+ * Set a reserved PA bit in MMIO SPTEs to generate page faults with
+ * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT
+ * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
+ * 52-bit physical addresses then there are no reserved PA bits in the
+ * PTEs and so the reserved PA approach must be disabled.
+ */
+ if (shadow_phys_bits < 52)
+ mask = BIT_ULL(51) | PT_PRESENT_MASK;
+ else
+ mask = 0;
+
+ kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
+}
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
new file mode 100644
index 000000000..a129951c9
--- /dev/null
+++ b/arch/x86/kvm/mmu/spte.h
@@ -0,0 +1,504 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#ifndef KVM_X86_MMU_SPTE_H
+#define KVM_X86_MMU_SPTE_H
+
+#include "mmu.h"
+#include "mmu_internal.h"
+
+/*
+ * A MMU present SPTE is backed by actual memory and may or may not be present
+ * in hardware. E.g. MMIO SPTEs are not considered present. Use bit 11, as it
+ * is ignored by all flavors of SPTEs and checking a low bit often generates
+ * better code than for a high bit, e.g. 56+. MMU present checks are pervasive
+ * enough that the improved code generation is noticeable in KVM's footprint.
+ */
+#define SPTE_MMU_PRESENT_MASK BIT_ULL(11)
+
+/*
+ * TDP SPTEs (more specifically, EPT SPTEs) may not have A/D bits, and may also
+ * be restricted to using write-protection (for L2 when CPU dirty logging, i.e.
+ * PML, is enabled). Use bits 52 and 53 to hold the type of A/D tracking that
+ * must be employed for a given TDP SPTE.
+ *
+ * Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE
+ * paging, including NPT PAE. This scheme works because legacy shadow paging
+ * is guaranteed to have A/D bits and write-protection is forced only for
+ * TDP with CPU dirty logging (PML). If NPT ever gains PML-like support, it
+ * must be restricted to 64-bit KVM.
+ */
+#define SPTE_TDP_AD_SHIFT 52
+#define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_ENABLED (0ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_DISABLED (1ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_WRPROT_ONLY (2ULL << SPTE_TDP_AD_SHIFT)
+static_assert(SPTE_TDP_AD_ENABLED == 0);
+
+#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
+#define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
+#else
+#define SPTE_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
+#endif
+
+#define SPTE_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
+ | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
+
+#define ACC_EXEC_MASK 1
+#define ACC_WRITE_MASK PT_WRITABLE_MASK
+#define ACC_USER_MASK PT_USER_MASK
+#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+
+/* The mask for the R/X bits in EPT PTEs */
+#define SPTE_EPT_READABLE_MASK 0x1ull
+#define SPTE_EPT_EXECUTABLE_MASK 0x4ull
+
+#define SPTE_LEVEL_BITS 9
+#define SPTE_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, SPTE_LEVEL_BITS)
+#define SPTE_INDEX(address, level) __PT_INDEX(address, level, SPTE_LEVEL_BITS)
+#define SPTE_ENT_PER_PAGE __PT_ENT_PER_PAGE(SPTE_LEVEL_BITS)
+
+/*
+ * The mask/shift to use for saving the original R/X bits when marking the PTE
+ * as not-present for access tracking purposes. We do not save the W bit as the
+ * PTEs being access tracked also need to be dirty tracked, so the W bit will be
+ * restored only when a write is attempted to the page. This mask obviously
+ * must not overlap the A/D type mask.
+ */
+#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (SPTE_EPT_READABLE_MASK | \
+ SPTE_EPT_EXECUTABLE_MASK)
+#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54
+#define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
+static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
+
+/*
+ * {DEFAULT,EPT}_SPTE_{HOST,MMU}_WRITABLE are used to keep track of why a given
+ * SPTE is write-protected. See is_writable_pte() for details.
+ */
+
+/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
+#define DEFAULT_SPTE_HOST_WRITABLE BIT_ULL(9)
+#define DEFAULT_SPTE_MMU_WRITABLE BIT_ULL(10)
+
+/*
+ * Low ignored bits are at a premium for EPT, use high ignored bits, taking care
+ * to not overlap the A/D type mask or the saved access bits of access-tracked
+ * SPTEs when A/D bits are disabled.
+ */
+#define EPT_SPTE_HOST_WRITABLE BIT_ULL(57)
+#define EPT_SPTE_MMU_WRITABLE BIT_ULL(58)
+
+static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK));
+static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK));
+static_assert(!(EPT_SPTE_HOST_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
+static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
+
+/* Defined only to keep the above static asserts readable. */
+#undef SHADOW_ACC_TRACK_SAVED_MASK
+
+/*
+ * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
+ * the memslots generation and is derived as follows:
+ *
+ * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10
+ * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62
+ *
+ * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
+ * the MMIO generation number, as doing so would require stealing a bit from
+ * the "real" generation number and thus effectively halve the maximum number
+ * of MMIO generations that can be handled before encountering a wrap (which
+ * requires a full MMU zap). The flag is instead explicitly queried when
+ * checking for MMIO spte cache hits.
+ */
+
+#define MMIO_SPTE_GEN_LOW_START 3
+#define MMIO_SPTE_GEN_LOW_END 10
+
+#define MMIO_SPTE_GEN_HIGH_START 52
+#define MMIO_SPTE_GEN_HIGH_END 62
+
+#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
+ MMIO_SPTE_GEN_LOW_START)
+#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
+ MMIO_SPTE_GEN_HIGH_START)
+static_assert(!(SPTE_MMU_PRESENT_MASK &
+ (MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK)));
+
+/*
+ * The SPTE MMIO mask must NOT overlap the MMIO generation bits or the
+ * MMU-present bit. The generation obviously co-exists with the magic MMIO
+ * mask/value, and MMIO SPTEs are considered !MMU-present.
+ *
+ * The SPTE MMIO mask is allowed to use hardware "present" bits (i.e. all EPT
+ * RWX bits), all physical address bits (legal PA bits are used for "fast" MMIO
+ * and so they're off-limits for generation; additional checks ensure the mask
+ * doesn't overlap legal PA bits), and bit 63 (carved out for future usage).
+ */
+#define SPTE_MMIO_ALLOWED_MASK (BIT_ULL(63) | GENMASK_ULL(51, 12) | GENMASK_ULL(2, 0))
+static_assert(!(SPTE_MMIO_ALLOWED_MASK &
+ (SPTE_MMU_PRESENT_MASK | MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK)));
+
+#define MMIO_SPTE_GEN_LOW_BITS (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1)
+#define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)
+
+/* remember to adjust the comment above as well if you change these */
+static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
+
+#define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0)
+#define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)
+
+#define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
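+
+/*
+ * Purely illustrative sketch (not used by KVM) of the packing direction that
+ * get_mmio_spte_generation() below reverses; the real packing is performed
+ * by make_mmio_spte() in spte.c.
+ */
+static inline u64 mmio_spte_gen_pack_sketch(u64 gen)
+{
+ u64 spte = 0;
+
+ /* Generation bits 0-7 land in SPTE bits 3-10. */
+ spte |= (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
+ /* Generation bits 8-18 land in SPTE bits 52-62. */
+ spte |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
+ return spte;
+}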
+
+extern u64 __read_mostly shadow_host_writable_mask;
+extern u64 __read_mostly shadow_mmu_writable_mask;
+extern u64 __read_mostly shadow_nx_mask;
+extern u64 __read_mostly shadow_x_mask; /* mutually exclusive with nx_mask */
+extern u64 __read_mostly shadow_user_mask;
+extern u64 __read_mostly shadow_accessed_mask;
+extern u64 __read_mostly shadow_dirty_mask;
+extern u64 __read_mostly shadow_mmio_value;
+extern u64 __read_mostly shadow_mmio_mask;
+extern u64 __read_mostly shadow_mmio_access_mask;
+extern u64 __read_mostly shadow_present_mask;
+extern u64 __read_mostly shadow_memtype_mask;
+extern u64 __read_mostly shadow_me_value;
+extern u64 __read_mostly shadow_me_mask;
+
+/*
+ * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED;
+ * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
+ * pages.
+ */
+extern u64 __read_mostly shadow_acc_track_mask;
+
+/*
+ * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
+ * to guard against L1TF attacks.
+ */
+extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
+
+/*
+ * The number of high-order 1 bits to use in the mask above.
+ */
+#define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5
+
+/*
+ * If a thread running without exclusive control of the MMU lock must perform a
+ * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a
+ * non-present intermediate value. Other threads which encounter this value
+ * should not modify the SPTE.
+ *
+ * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
+ * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
+ * vulnerability. Use only low bits to avoid 64-bit immediates.
+ *
+ * Only used by the TDP MMU.
+ */
+#define REMOVED_SPTE 0x5a0ULL
+
+/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
+static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
+
+static inline bool is_removed_spte(u64 spte)
+{
+ return spte == REMOVED_SPTE;
+}
+
+/* Get an SPTE's index into its parent's page table (and the spt array). */
+static inline int spte_index(u64 *sptep)
+{
+ return ((unsigned long)sptep / sizeof(*sptep)) & (SPTE_ENT_PER_PAGE - 1);
+}
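+
+/*
+ * E.g. (illustrative): with a page-aligned 512-entry spt array, a pointer to
+ * entry 13 yields index 13, since the base address divided by 8 contributes
+ * only multiples of 512.
+ */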
+
+/*
+ * In some cases, we need to preserve the GFN of a non-present or reserved
+ * SPTE when we usurp the upper five bits of the physical address space to
+ * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll
+ * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
+ * left into the reserved bits, i.e. the GFN in the SPTE will be split into
+ * high and low parts. This mask covers the lower bits of the GFN.
+ */
+extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
+
+static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
+{
+ struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT);
+
+ return (struct kvm_mmu_page *)page_private(page);
+}
+
+static inline struct kvm_mmu_page *spte_to_child_sp(u64 spte)
+{
+ return to_shadow_page(spte & SPTE_BASE_ADDR_MASK);
+}
+
+static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
+{
+ return to_shadow_page(__pa(sptep));
+}
+
+static inline struct kvm_mmu_page *root_to_sp(hpa_t root)
+{
+ if (kvm_mmu_is_dummy_root(root))
+ return NULL;
+
+ /*
+ * The "root" may be a special root, e.g. a PAE entry, treat it as a
+ * SPTE to ensure any non-PA bits are dropped.
+ */
+ return spte_to_child_sp(root);
+}
+
+static inline bool is_mmio_spte(u64 spte)
+{
+ return (spte & shadow_mmio_mask) == shadow_mmio_value &&
+ likely(enable_mmio_caching);
+}
+
+static inline bool is_shadow_present_pte(u64 pte)
+{
+ return !!(pte & SPTE_MMU_PRESENT_MASK);
+}
+
+/*
+ * Returns true if A/D bits are supported in hardware and are enabled by KVM.
+ * When enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can
+ * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the
+ * scenario where KVM is using A/D bits for L1, but not L2.
+ */
+static inline bool kvm_ad_enabled(void)
+{
+ return !!shadow_accessed_mask;
+}
+
+static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
+{
+ return sp->role.ad_disabled;
+}
+
+static inline bool spte_ad_enabled(u64 spte)
+{
+ KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
+ return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED;
+}
+
+static inline bool spte_ad_need_write_protect(u64 spte)
+{
+ KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
+ /*
+ * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED is '0',
+ * and non-TDP SPTEs will never set these bits. Optimize for 64-bit
+ * TDP and do the A/D type check unconditionally.
+ */
+ return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED;
+}
+
+static inline u64 spte_shadow_accessed_mask(u64 spte)
+{
+ KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
+ return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
+}
+
+static inline u64 spte_shadow_dirty_mask(u64 spte)
+{
+ KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
+ return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
+}
+
+static inline bool is_access_track_spte(u64 spte)
+{
+ return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
+}
+
+static inline bool is_large_pte(u64 pte)
+{
+ return pte & PT_PAGE_SIZE_MASK;
+}
+
+static inline bool is_last_spte(u64 pte, int level)
+{
+ return (level == PG_LEVEL_4K) || is_large_pte(pte);
+}
+
+static inline bool is_executable_pte(u64 spte)
+{
+ return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
+}
+
+static inline kvm_pfn_t spte_to_pfn(u64 pte)
+{
+ return (pte & SPTE_BASE_ADDR_MASK) >> PAGE_SHIFT;
+}
+
+static inline bool is_accessed_spte(u64 spte)
+{
+ u64 accessed_mask = spte_shadow_accessed_mask(spte);
+
+ return accessed_mask ? spte & accessed_mask
+ : !is_access_track_spte(spte);
+}
+
+static inline bool is_dirty_spte(u64 spte)
+{
+ u64 dirty_mask = spte_shadow_dirty_mask(spte);
+
+ return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
+}
+
+static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte,
+ int level)
+{
+ int bit7 = (pte >> 7) & 1;
+
+ return rsvd_check->rsvd_bits_mask[bit7][level-1];
+}
+
+static inline bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check,
+ u64 pte, int level)
+{
+ return pte & get_rsvd_bits(rsvd_check, pte, level);
+}
+
+static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check,
+ u64 pte)
+{
+ return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
+}
+
+static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
+ u64 spte, int level)
+{
+ return __is_bad_mt_xwr(rsvd_check, spte) ||
+ __is_rsvd_bits_set(rsvd_check, spte, level);
+}
+
+/*
+ * A shadow-present leaf SPTE may be non-writable for 4 possible reasons:
+ *
+ * 1. To intercept writes for dirty logging. KVM write-protects huge pages
+ * so that they can be split down into the dirty logging
+ * granularity (4KiB) whenever the guest writes to them. KVM also
+ * write-protects 4KiB pages so that writes can be recorded in the dirty log
+ * (e.g. if not using PML). SPTEs are write-protected for dirty logging
+ * during the VM-ioctls that enable dirty logging.
+ *
+ * 2. To intercept writes to guest page tables that KVM is shadowing. When a
+ * guest writes to its page table the corresponding shadow page table will
+ * be marked "unsync". That way KVM knows which shadow page tables need to
+ * be updated on the next TLB flush, INVLPG, etc. and which do not.
+ *
+ * 3. To prevent guest writes to read-only memory, such as for memory in a
+ * read-only memslot or guest memory backed by a read-only VMA. Writes to
+ * such pages are disallowed entirely.
+ *
+ * 4. To emulate the Accessed bit for SPTEs without A/D bits. Note, in this
+ * case, the SPTE is access-protected, not just write-protected!
+ *
+ * For cases #1 and #4, KVM can safely make such SPTEs writable without taking
+ * mmu_lock as capturing the Accessed/Dirty state doesn't require taking it.
+ * To differentiate #1 and #4 from #2 and #3, KVM uses two software-only bits
+ * in the SPTE:
+ *
+ * shadow_mmu_writable_mask, aka MMU-writable -
+ * Cleared on SPTEs that KVM is currently write-protecting for shadow paging
+ * purposes (case 2 above).
+ *
+ * shadow_host_writable_mask, aka Host-writable -
+ * Cleared on SPTEs that are not host-writable (case 3 above)
+ *
+ * Note, not all possible combinations of PT_WRITABLE_MASK,
+ * shadow_mmu_writable_mask, and shadow_host_writable_mask are valid. A given
+ * SPTE can be in only one of the following states, which map to the
+ * aforementioned cases (case 4 shares case 1's encoding):
+ *
+ * shadow_host_writable_mask | shadow_mmu_writable_mask | PT_WRITABLE_MASK
+ * ------------------------- | ------------------------ | ----------------
+ * 1 | 1 | 1 (writable)
+ * 1 | 1 | 0 (case 1)
+ * 1 | 0 | 0 (case 2)
+ * 0 | 0 | 0 (case 3)
+ *
+ * The valid combinations of these bits are checked by
+ * check_spte_writable_invariants() whenever an SPTE is modified.
+ *
+ * Clearing the MMU-writable bit is always done under the MMU lock and always
+ * accompanied by a TLB flush before dropping the lock to avoid corrupting the
+ * shadow page tables between vCPUs. Write-protecting an SPTE for dirty logging
+ * (which does not clear the MMU-writable bit) does not flush TLBs before
+ * dropping the lock, as it only needs to synchronize guest writes with the
+ * dirty bitmap. Similarly, making the SPTE inaccessible (and non-writable) for
+ * access-tracking via the clear_young() MMU notifier also does not flush TLBs.
+ *
+ * So there is a problem: clearing the MMU-writable bit can encounter a
+ * write-protected SPTE while CPUs still have writable mappings for that SPTE
+ * cached in their TLB. To address this, KVM always flushes TLBs when
+ * write-protecting SPTEs if the MMU-writable bit is set on the old SPTE.
+ *
+ * The Host-writable bit is not modified on present SPTEs; it is only set or
+ * cleared when an SPTE is first faulted in from non-present and then remains
+ * immutable.
+ */
+static inline bool is_writable_pte(unsigned long pte)
+{
+ return pte & PT_WRITABLE_MASK;
+}
+
+/* Note: spte must be a shadow-present leaf SPTE. */
+static inline void check_spte_writable_invariants(u64 spte)
+{
+ if (spte & shadow_mmu_writable_mask)
+ WARN_ONCE(!(spte & shadow_host_writable_mask),
+ KBUILD_MODNAME ": MMU-writable SPTE is not Host-writable: %llx",
+ spte);
+ else
+ WARN_ONCE(is_writable_pte(spte),
+ KBUILD_MODNAME ": Writable SPTE is not MMU-writable: %llx", spte);
+}
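+
+/*
+ * Purely illustrative helper (not part of KVM): maps the three bits back to
+ * the numbered cases in the comment above.
+ */
+static inline int spte_writable_case_sketch(u64 spte)
+{
+ if (spte & PT_WRITABLE_MASK)
+ return 0; /* writable */
+ if (spte & shadow_mmu_writable_mask)
+ return 1; /* case 1 (or case 4, which shares the encoding) */
+ if (spte & shadow_host_writable_mask)
+ return 2; /* case 2 */
+ return 3; /* case 3 */
+}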
+
+static inline bool is_mmu_writable_spte(u64 spte)
+{
+ return spte & shadow_mmu_writable_mask;
+}
+
+static inline u64 get_mmio_spte_generation(u64 spte)
+{
+ u64 gen;
+
+ gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_SHIFT;
+ gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_SHIFT;
+ return gen;
+}
+
+bool spte_has_volatile_bits(u64 spte);
+
+bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ const struct kvm_memory_slot *slot,
+ unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
+ u64 old_spte, bool prefetch, bool can_unsync,
+ bool host_writable, u64 *new_spte);
+u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte,
+ union kvm_mmu_page_role role, int index);
+u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
+u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
+u64 mark_spte_for_access_track(u64 spte);
+
+/* Restore an acc-track PTE back to a regular PTE */
+static inline u64 restore_acc_track_spte(u64 spte)
+{
+ u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
+ & SHADOW_ACC_TRACK_SAVED_BITS_MASK;
+
+ spte &= ~shadow_acc_track_mask;
+ spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
+ spte |= saved_bits;
+
+ return spte;
+}
+
+u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn);
+
+void __init kvm_mmu_spte_module_init(void);
+void kvm_mmu_reset_all_pte_masks(void);
+
+#endif
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
new file mode 100644
index 000000000..bd30ebfb2
--- /dev/null
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "mmu_internal.h"
+#include "tdp_iter.h"
+#include "spte.h"
+
+/*
+ * Recalculates the pointer to the SPTE for the current GFN and level and
+ * rereads the SPTE.
+ */
+static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
+{
+ iter->sptep = iter->pt_path[iter->level - 1] +
+ SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
+ iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
+}
+
+/*
+ * Return the TDP iterator to the root PT and allow it to continue its
+ * traversal over the paging structure from there.
+ */
+void tdp_iter_restart(struct tdp_iter *iter)
+{
+ iter->yielded = false;
+ iter->yielded_gfn = iter->next_last_level_gfn;
+ iter->level = iter->root_level;
+
+ iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level);
+ tdp_iter_refresh_sptep(iter);
+
+ iter->valid = true;
+}
+
+/*
+ * Sets a TDP iterator to walk a pre-order traversal of the paging structure
+ * rooted at root_pt, starting with the walk to translate next_last_level_gfn.
+ */
+void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
+ int min_level, gfn_t next_last_level_gfn)
+{
+ if (WARN_ON_ONCE(!root || (root->role.level < 1) ||
+ (root->role.level > PT64_ROOT_MAX_LEVEL))) {
+ iter->valid = false;
+ return;
+ }
+
+ iter->next_last_level_gfn = next_last_level_gfn;
+ iter->root_level = root->role.level;
+ iter->min_level = min_level;
+ iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt;
+ iter->as_id = kvm_mmu_page_as_id(root);
+
+ tdp_iter_restart(iter);
+}
+
+/*
+ * Given an SPTE and its level, returns a pointer containing the host virtual
+ * address of the child page table referenced by the SPTE. Returns NULL if
+ * there is no such entry.
+ */
+tdp_ptep_t spte_to_child_pt(u64 spte, int level)
+{
+ /*
+ * There's no child entry if this entry isn't present or is a
+ * last-level entry.
+ */
+ if (!is_shadow_present_pte(spte) || is_last_spte(spte, level))
+ return NULL;
+
+ return (tdp_ptep_t)__va(spte_to_pfn(spte) << PAGE_SHIFT);
+}
+
+/*
+ * Steps down one level in the paging structure towards the goal GFN. Returns
+ * true if the iterator was able to step down a level, false otherwise.
+ */
+static bool try_step_down(struct tdp_iter *iter)
+{
+ tdp_ptep_t child_pt;
+
+ if (iter->level == iter->min_level)
+ return false;
+
+ /*
+ * Reread the SPTE before stepping down to avoid traversing into page
+ * tables that are no longer linked from this entry.
+ */
+ iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
+
+ child_pt = spte_to_child_pt(iter->old_spte, iter->level);
+ if (!child_pt)
+ return false;
+
+ iter->level--;
+ iter->pt_path[iter->level - 1] = child_pt;
+ iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level);
+ tdp_iter_refresh_sptep(iter);
+
+ return true;
+}
+
+/*
+ * Steps to the next entry in the current page table, at the current page table
+ * level. The next entry could point to a page backing guest memory or another
+ * page table, or it could be non-present. Returns true if the iterator was
+ * able to step to the next entry in the page table, false if the iterator was
+ * already at the end of the current page table.
+ */
+static bool try_step_side(struct tdp_iter *iter)
+{
+ /*
+ * Check if the iterator is already at the end of the current page
+ * table.
+ */
+ if (SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level) ==
+ (SPTE_ENT_PER_PAGE - 1))
+ return false;
+
+ iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
+ iter->next_last_level_gfn = iter->gfn;
+ iter->sptep++;
+ iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
+
+ return true;
+}
+
+/*
+ * Tries to traverse back up a level in the paging structure so that the walk
+ * can continue from the next entry in the parent page table. Returns true on a
+ * successful step up, false if already in the root page.
+ */
+static bool try_step_up(struct tdp_iter *iter)
+{
+ if (iter->level == iter->root_level)
+ return false;
+
+ iter->level++;
+ iter->gfn = gfn_round_for_level(iter->gfn, iter->level);
+ tdp_iter_refresh_sptep(iter);
+
+ return true;
+}
+
+/*
+ * Step to the next SPTE in a pre-order traversal of the paging structure.
+ * To get to the next SPTE, the iterator either steps down towards the goal
+ * GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a
+ * higher GFN.
+ *
+ * The basic algorithm is as follows:
+ * 1. If the current SPTE is a non-last-level SPTE, step down into the page
+ * table it points to.
+ * 2. If the iterator cannot step down, it will try to step to the next SPTE
+ * in the current page of the paging structure.
+ * 3. If the iterator cannot step to the next entry in the current page, it will
+ * try to step up to the parent paging structure page. In this case, that
+ * SPTE will have already been visited, and so the iterator must also step
+ * to the side again.
+ */
+void tdp_iter_next(struct tdp_iter *iter)
+{
+ if (iter->yielded) {
+ tdp_iter_restart(iter);
+ return;
+ }
+
+ if (try_step_down(iter))
+ return;
+
+ do {
+ if (try_step_side(iter))
+ return;
+ } while (try_step_up(iter));
+ iter->valid = false;
+}
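+
+/*
+ * Example traversal (illustrative), assuming a fully populated two-level
+ * structure: visit the level-2 entry for GFN 0, step down through the
+ * level-1 entries for GFNs 0..511, fail to step side at the last entry,
+ * step up, then step side to the level-2 entry for GFN 512, and repeat.
+ */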
+
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
new file mode 100644
index 000000000..fae559559
--- /dev/null
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef __KVM_X86_MMU_TDP_ITER_H
+#define __KVM_X86_MMU_TDP_ITER_H
+
+#include <linux/kvm_host.h>
+
+#include "mmu.h"
+#include "spte.h"
+
+/*
+ * TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs)
+ * to be zapped while holding mmu_lock for read, and to allow TLB flushes to be
+ * batched without having to collect the list of zapped SPs. Flows that can
+ * remove SPs must service pending TLB flushes prior to dropping RCU protection.
+ */
+static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
+{
+ return READ_ONCE(*rcu_dereference(sptep));
+}
+
+static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
+{
+ return xchg(rcu_dereference(sptep), new_spte);
+}
+
+static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
+{
+ WRITE_ONCE(*rcu_dereference(sptep), new_spte);
+}
+
+/*
+ * SPTEs must be modified atomically if they are shadow-present, leaf
+ * SPTEs, and have volatile bits, i.e. have bits that can be set outside
+ * of mmu_lock. The Writable bit can be set by KVM's fast page fault
+ * handler, and Accessed and Dirty bits can be set by the CPU.
+ *
+ * Note, non-leaf SPTEs do have Accessed bits and those bits are
+ * technically volatile, but KVM doesn't consume the Accessed bit of
+ * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This
+ * logic needs to be reassessed if KVM were to use non-leaf Accessed
+ * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
+ */
+static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level)
+{
+ return is_shadow_present_pte(old_spte) &&
+ is_last_spte(old_spte, level) &&
+ spte_has_volatile_bits(old_spte);
+}
+
+static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
+ u64 new_spte, int level)
+{
+ if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level))
+ return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);
+
+ __kvm_tdp_mmu_write_spte(sptep, new_spte);
+ return old_spte;
+}
+
+static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte,
+ u64 mask, int level)
+{
+ atomic64_t *sptep_atomic;
+
+ if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) {
+ sptep_atomic = (atomic64_t *)rcu_dereference(sptep);
+ return (u64)atomic64_fetch_and(~mask, sptep_atomic);
+ }
+
+ __kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask);
+ return old_spte;
+}
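+
+/*
+ * tdp_mmu_clear_spte_bits() is used, e.g., to clear Dirty/Writable bits when
+ * handling dirty logging and to clear the Accessed bit when aging SPTEs (see
+ * tdp_mmu.c); only the named bits change, so a compare-and-exchange loop is
+ * unnecessary.
+ */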
+
+/*
+ * A TDP iterator performs a pre-order walk over a TDP paging structure.
+ */
+struct tdp_iter {
+ /*
+ * The iterator will traverse the paging structure towards the mapping
+ * for this GFN.
+ */
+ gfn_t next_last_level_gfn;
+ /*
+ * The next_last_level_gfn at the time when the thread last
+ * yielded. Only yielding when the next_last_level_gfn !=
+ * yielded_gfn helps ensure forward progress.
+ */
+ gfn_t yielded_gfn;
+ /* Pointers to the page tables traversed to reach the current SPTE */
+ tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL];
+ /* A pointer to the current SPTE */
+ tdp_ptep_t sptep;
+ /* The lowest GFN mapped by the current SPTE */
+ gfn_t gfn;
+ /* The level of the root page given to the iterator */
+ int root_level;
+ /* The lowest level the iterator should traverse to */
+ int min_level;
+ /* The iterator's current level within the paging structure */
+ int level;
+ /* The address space ID, i.e. SMM vs. regular. */
+ int as_id;
+ /* A snapshot of the value at sptep */
+ u64 old_spte;
+ /*
+ * Whether the iterator has a valid state. This will be false if the
+ * iterator walks off the end of the paging structure.
+ */
+ bool valid;
+ /*
+ * True if KVM dropped mmu_lock and yielded in the middle of a walk, in
+ * which case tdp_iter_next() needs to restart the walk at the root
+ * level instead of advancing to the next entry.
+ */
+ bool yielded;
+};
+
+/*
+ * Iterates over every SPTE mapping the GFN range [start, end) in a
+ * preorder traversal.
+ */
+#define for_each_tdp_pte_min_level(iter, root, min_level, start, end) \
+ for (tdp_iter_start(&iter, root, min_level, start); \
+ iter.valid && iter.gfn < end; \
+ tdp_iter_next(&iter))
+
+#define for_each_tdp_pte(iter, root, start, end) \
+ for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end)
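+
+/*
+ * Usage sketch (illustrative, not a kernel function): count present leaf
+ * SPTEs in [start, end). The real walkers in tdp_mmu.c additionally hold
+ * mmu_lock and RCU and handle yielding.
+ */
+static inline int tdp_count_leaf_sptes_sketch(struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end)
+{
+ struct tdp_iter iter;
+ int n = 0;
+
+ for_each_tdp_pte(iter, root, start, end) {
+ if (is_shadow_present_pte(iter.old_spte) &&
+ is_last_spte(iter.old_spte, iter.level))
+ n++;
+ }
+ return n;
+}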
+
+tdp_ptep_t spte_to_child_pt(u64 pte, int level);
+
+void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
+ int min_level, gfn_t next_last_level_gfn);
+void tdp_iter_next(struct tdp_iter *iter);
+void tdp_iter_restart(struct tdp_iter *iter);
+
+#endif /* __KVM_X86_MMU_TDP_ITER_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
new file mode 100644
index 000000000..6cd4dd631
--- /dev/null
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -0,0 +1,1817 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "mmu.h"
+#include "mmu_internal.h"
+#include "mmutrace.h"
+#include "tdp_iter.h"
+#include "tdp_mmu.h"
+#include "spte.h"
+
+#include <asm/cmpxchg.h>
+#include <trace/events/kvm.h>
+
+/* Initializes the TDP MMU for the VM, if enabled. */
+void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
+{
+ INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
+ spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
+}
+
+/* Arbitrarily returns true so that this may be used in if statements. */
+static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
+ bool shared)
+{
+ if (shared)
+ lockdep_assert_held_read(&kvm->mmu_lock);
+ else
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ return true;
+}
+
+void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
+{
+ /*
+ * Invalidate all roots, which, besides the obvious, schedules all roots
+ * for zapping and thus puts the TDP MMU's reference to each root, i.e.
+ * ultimately frees all roots.
+ */
+ kvm_tdp_mmu_invalidate_all_roots(kvm);
+ kvm_tdp_mmu_zap_invalidated_roots(kvm);
+
+ WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
+ WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
+
+ /*
+ * Ensure that all the outstanding RCU callbacks to free shadow pages
+ * can run before the VM is torn down. Putting the last reference to
+ * zapped roots will create new callbacks.
+ */
+ rcu_barrier();
+}
+
+static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
+{
+ free_page((unsigned long)sp->spt);
+ kmem_cache_free(mmu_page_header_cache, sp);
+}
+
+/*
+ * This is called through call_rcu in order to free TDP page table memory
+ * safely with respect to other kernel threads that may be operating on
+ * the memory.
+ * By only accessing TDP MMU page table memory in an RCU read critical
+ * section, and freeing it after a grace period, lockless access to that
+ * memory won't use it after it is freed.
+ */
+static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
+{
+ struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
+ rcu_head);
+
+ tdp_mmu_free_sp(sp);
+}
+
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared)
+{
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
+ if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
+ return;
+
+ /*
+ * The TDP MMU itself holds a reference to each root until the root is
+ * explicitly invalidated, i.e. the final reference should never be
+ * put for a valid root.
+ */
+ KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);
+
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ list_del_rcu(&root->link);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+ call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
+}
+
+/*
+ * Returns the next root after @prev_root (or the first root if @prev_root is
+ * NULL). A reference to the returned root is acquired, and the reference to
+ * @prev_root is released (the caller obviously must hold a reference to
+ * @prev_root if it's non-NULL).
+ *
+ * If @only_valid is true, invalid roots are skipped.
+ *
+ * Returns NULL if the end of tdp_mmu_roots was reached.
+ */
+static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
+ struct kvm_mmu_page *prev_root,
+ bool shared, bool only_valid)
+{
+ struct kvm_mmu_page *next_root;
+
+ rcu_read_lock();
+
+ if (prev_root)
+ next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &prev_root->link,
+ typeof(*prev_root), link);
+ else
+ next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ typeof(*next_root), link);
+
+ while (next_root) {
+ if ((!only_valid || !next_root->role.invalid) &&
+ kvm_tdp_mmu_get_root(next_root))
+ break;
+
+ next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &next_root->link, typeof(*next_root), link);
+ }
+
+ rcu_read_unlock();
+
+ if (prev_root)
+ kvm_tdp_mmu_put_root(kvm, prev_root, shared);
+
+ return next_root;
+}
+
+/*
+ * Note: this iterator gets and puts references to the roots it iterates over.
+ * This makes it safe to release the MMU lock and yield within the loop, but
+ * if exiting the loop early, the caller must drop the reference to the most
+ * recent root. (Unless keeping a live reference is desirable.)
+ *
+ * If shared is set, this function is operating under the MMU lock in read
+ * mode. In the unlikely event that this thread must free a root, the lock
+ * will be temporarily dropped and reacquired in write mode.
+ */
+#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
+ for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \
+ _root; \
+ _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \
+ if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \
+ kvm_mmu_page_as_id(_root) != _as_id) { \
+ } else
+
+#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
+ __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
+
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _shared) \
+ for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, false); \
+ _root; \
+ _root = tdp_mmu_next_root(_kvm, _root, _shared, false)) \
+ if (!kvm_lockdep_assert_mmu_lock_held(_kvm, _shared)) { \
+ } else
+
+/*
+ * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
+ * the implication being that any flow that holds mmu_lock for read is
+ * inherently yield-friendly and should use the yield-safe variant above.
+ * Holding mmu_lock for write obviates the need for RCU protection as the list
+ * is guaranteed to be stable.
+ */
+#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
+ list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
+ if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
+ kvm_mmu_page_as_id(_root) != _as_id) { \
+ } else
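+
+/*
+ * Note on the "if (...) { } else" construction in the iterators above: it
+ * filters out roots (e.g. those from the wrong address space) while leaving
+ * the caller's loop body attached to the else branch, so the macros still
+ * behave like a plain for statement.
+ */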
+
+static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
+ sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
+
+ return sp;
+}
+
+static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
+ gfn_t gfn, union kvm_mmu_page_role role)
+{
+ INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
+
+ set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+ sp->role = role;
+ sp->gfn = gfn;
+ sp->ptep = sptep;
+ sp->tdp_mmu_page = true;
+
+ trace_kvm_mmu_get_page(sp, true);
+}
+
+static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
+ struct tdp_iter *iter)
+{
+ struct kvm_mmu_page *parent_sp;
+ union kvm_mmu_page_role role;
+
+ parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
+
+ role = parent_sp->role;
+ role.level--;
+
+ tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
+}
+
+hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
+{
+ union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_mmu_page *root;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ /*
+ * Check for an existing root before allocating a new one. Note, the
+ * role check prevents consuming an invalid root.
+ */
+ for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
+ if (root->role.word == role.word &&
+ kvm_tdp_mmu_get_root(root))
+ goto out;
+ }
+
+ root = tdp_mmu_alloc_sp(vcpu);
+ tdp_mmu_init_sp(root, NULL, 0, role);
+
+ /*
+ * TDP MMU roots are kept until they are explicitly invalidated, either
+ * by a memslot update or by the destruction of the VM. Initialize the
+ * refcount to two; one reference for the vCPU, and one reference for
+ * the TDP MMU itself, which is held until the root is invalidated and
+ * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
+ */
+ refcount_set(&root->tdp_mmu_root_count, 2);
+
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+
+out:
+ return __pa(root->spt);
+}
+
+static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
+ u64 old_spte, u64 new_spte, int level,
+ bool shared);
+
+static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ kvm_account_pgtable_pages((void *)sp->spt, +1);
+ atomic64_inc(&kvm->arch.tdp_mmu_pages);
+}
+
+static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ kvm_account_pgtable_pages((void *)sp->spt, -1);
+ atomic64_dec(&kvm->arch.tdp_mmu_pages);
+}
+
+/**
+ * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
+ *
+ * @kvm: kvm instance
+ * @sp: the page to be removed
+ * @shared: This operation may not be running under the exclusive use of
+ * the MMU lock and the operation must synchronize with other
+ * threads that might be adding or removing pages.
+ */
+static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
+ bool shared)
+{
+ tdp_unaccount_mmu_page(kvm, sp);
+
+ if (!sp->nx_huge_page_disallowed)
+ return;
+
+ if (shared)
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ else
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ sp->nx_huge_page_disallowed = false;
+ untrack_possible_nx_huge_page(kvm, sp);
+
+ if (shared)
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+}
+
+/**
+ * handle_removed_pt() - handle a page table removed from the TDP structure
+ *
+ * @kvm: kvm instance
+ * @pt: the page removed from the paging structure
+ * @shared: This operation may not be running under the exclusive use
+ * of the MMU lock and the operation must synchronize with other
+ * threads that might be modifying SPTEs.
+ *
+ * Given a page table that has been removed from the TDP paging structure,
+ * iterates through the page table to clear SPTEs and free child page tables.
+ *
+ * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
+ * protection. Since this thread removed it from the paging structure,
+ * this thread will be responsible for ensuring the page is freed. Hence the
+ * early rcu_dereferences in the function.
+ */
+static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
+ int level = sp->role.level;
+ gfn_t base_gfn = sp->gfn;
+ int i;
+
+ trace_kvm_mmu_prepare_zap_page(sp);
+
+ tdp_mmu_unlink_sp(kvm, sp, shared);
+
+ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
+ tdp_ptep_t sptep = pt + i;
+ gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
+ u64 old_spte;
+
+ if (shared) {
+ /*
+ * Set the SPTE to a nonpresent value that other
+ * threads will not overwrite. If the SPTE was
+ * already marked as removed then another thread
+ * handling a page fault could overwrite it, so
+ * set the SPTE until it is set from some other
+ * value to the removed SPTE value.
+ */
+ for (;;) {
+ old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
+ if (!is_removed_spte(old_spte))
+ break;
+ cpu_relax();
+ }
+ } else {
+ /*
+ * If the SPTE is not MMU-present, there is no backing
+ * page associated with the SPTE and so no side effects
+ * that need to be recorded, and exclusive ownership of
+ * mmu_lock ensures the SPTE can't be made present.
+ * Note, zapping MMIO SPTEs is also unnecessary as they
+ * are guarded by the memslots generation, not by being
+ * unreachable.
+ */
+ old_spte = kvm_tdp_mmu_read_spte(sptep);
+ if (!is_shadow_present_pte(old_spte))
+ continue;
+
+ /*
+ * Use the common helper instead of a raw WRITE_ONCE as
+ * the SPTE needs to be updated atomically if it can be
+ * modified by a different vCPU outside of mmu_lock.
+ * Even though the parent SPTE is !PRESENT, the TLB
+ * hasn't yet been flushed, and both Intel and AMD
+ * document that A/D assists can use upper-level PxE
+ * entries that are cached in the TLB, i.e. the CPU can
+ * still access the page and mark it dirty.
+ *
+ * No retry is needed in the atomic update path as the
+ * sole concern is dropping a Dirty bit, i.e. no other
+ * task can zap/remove the SPTE as mmu_lock is held for
+ * write. Marking the SPTE as a removed SPTE is not
+ * strictly necessary for the same reason, but using
+ * the remove SPTE value keeps the shared/exclusive
+ * paths consistent and allows the handle_changed_spte()
+ * call below to hardcode the new value to REMOVED_SPTE.
+ *
+ * Note, even though dropping a Dirty bit is the only
+ * scenario where a non-atomic update could result in a
+ * functional bug, simply checking the Dirty bit isn't
+ * sufficient as a fast page fault could read the upper
+ * level SPTE before it is zapped, and then make this
+ * target SPTE writable, resume the guest, and set the
+ * Dirty bit between reading the SPTE above and writing
+ * it here.
+ */
+ old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
+ REMOVED_SPTE, level);
+ }
+ handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
+ old_spte, REMOVED_SPTE, level, shared);
+ }
+
+ call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
+}
+
+/**
+ * handle_changed_spte - handle bookkeeping associated with an SPTE change
+ * @kvm: kvm instance
+ * @as_id: the address space of the paging structure the SPTE was a part of
+ * @gfn: the base GFN that was mapped by the SPTE
+ * @old_spte: The value of the SPTE before the change
+ * @new_spte: The value of the SPTE after the change
+ * @level: the level of the PT the SPTE is part of in the paging structure
+ * @shared: This operation may not be running under the exclusive use of
+ * the MMU lock and the operation must synchronize with other
+ * threads that might be modifying SPTEs.
+ *
+ * Handle bookkeeping that might result from the modification of a SPTE. Note,
+ * dirty logging updates are handled in common code, not here (see make_spte()
+ * and fast_pf_fix_direct_spte()).
+ */
+static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
+ u64 old_spte, u64 new_spte, int level,
+ bool shared)
+{
+ bool was_present = is_shadow_present_pte(old_spte);
+ bool is_present = is_shadow_present_pte(new_spte);
+ bool was_leaf = was_present && is_last_spte(old_spte, level);
+ bool is_leaf = is_present && is_last_spte(new_spte, level);
+ bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
+
+ WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
+ WARN_ON_ONCE(level < PG_LEVEL_4K);
+ WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
+
+ /*
+ * If this warning were to trigger it would indicate that there was a
+ * missing MMU notifier or a race with some notifier handler.
+ * A present, leaf SPTE should never be directly replaced with another
+ * present leaf SPTE pointing to a different PFN. A notifier handler
+ * should be zapping the SPTE before the main MM's page table is
+ * changed, or the SPTE should be zeroed, and the TLBs flushed by the
+ * thread before replacement.
+ */
+ if (was_leaf && is_leaf && pfn_changed) {
+ pr_err("Invalid SPTE change: cannot replace a present leaf\n"
+ "SPTE with another present leaf SPTE mapping a\n"
+ "different PFN!\n"
+ "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
+ as_id, gfn, old_spte, new_spte, level);
+
+ /*
+ * Crash the host to prevent error propagation and guest data
+ * corruption.
+ */
+ BUG();
+ }
+
+ if (old_spte == new_spte)
+ return;
+
+ trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
+
+ if (is_leaf)
+ check_spte_writable_invariants(new_spte);
+
+ /*
+ * The only times a SPTE should be changed from a non-present to
+ * non-present state is when an MMIO entry is installed/modified/
+ * removed. In that case, there is nothing to do here.
+ */
+ if (!was_present && !is_present) {
+ /*
+ * If this change does not involve a MMIO SPTE or removed SPTE,
+ * it is unexpected. Log the change, though it should not
+ * impact the guest since both the former and current SPTEs
+ * are nonpresent.
+ */
+ if (WARN_ON_ONCE(!is_mmio_spte(old_spte) &&
+ !is_mmio_spte(new_spte) &&
+ !is_removed_spte(new_spte)))
+ pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
+ "should not be replaced with another,\n"
+ "different nonpresent SPTE, unless one or both\n"
+ "are MMIO SPTEs, or the new SPTE is\n"
+ "a temporary removed SPTE.\n"
+ "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
+ as_id, gfn, old_spte, new_spte, level);
+ return;
+ }
+
+ if (is_leaf != was_leaf)
+ kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
+
+ if (was_leaf && is_dirty_spte(old_spte) &&
+ (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
+ kvm_set_pfn_dirty(spte_to_pfn(old_spte));
+
+ /*
+ * Recursively handle child PTs if the change removed a subtree from
+ * the paging structure. Note the WARN on the PFN changing without the
+ * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
+ * pages are kernel allocations and should never be migrated.
+ */
+ if (was_present && !was_leaf &&
+ (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
+ handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
+
+ if (was_leaf && is_accessed_spte(old_spte) &&
+ (!is_present || !is_accessed_spte(new_spte) || pfn_changed))
+ kvm_set_pfn_accessed(spte_to_pfn(old_spte));
+}
+
+/*
+ * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
+ * and handle the associated bookkeeping. Do not mark the page dirty
+ * in KVM's dirty bitmaps.
+ *
+ * If setting the SPTE fails because it has changed, iter->old_spte will be
+ * refreshed to the current value of the spte.
+ *
+ * @kvm: kvm instance
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @new_spte: The value the SPTE should be set to
+ * Return:
+ * * 0 - If the SPTE was set.
+ * * -EBUSY - If the SPTE cannot be set. In this case this function will have
+ * no side-effects other than setting iter->old_spte to the last
+ * known value of the spte.
+ */
+static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
+{
+ u64 *sptep = rcu_dereference(iter->sptep);
+
+ /*
+ * The caller is responsible for ensuring the old SPTE is not a REMOVED
+ * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
+ * and pre-checking before inserting a new SPTE is advantageous as it
+ * avoids unnecessary work.
+ */
+ WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
+
+ lockdep_assert_held_read(&kvm->mmu_lock);
+
+ /*
+ * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
+ * does not hold the mmu_lock. On failure, i.e. if a different logical
+ * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
+ * the current value, so the caller operates on fresh data, e.g. if it
+ * retries tdp_mmu_set_spte_atomic().
+ */
+ if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
+ return -EBUSY;
+
+ handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+ new_spte, iter->level, true);
+
+ return 0;
+}
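+
+/*
+ * On -EBUSY, callers typically recompute their new SPTE from the refreshed
+ * iter->old_spte and either retry or bail out so the fault is retried;
+ * tdp_mmu_zap_spte_atomic() below is one in-file consumer.
+ */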
+
+static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter)
+{
+ int ret;
+
+ /*
+ * Freeze the SPTE by setting it to a special,
+ * non-present value. This will stop other threads from
+ * immediately installing a present entry in its place
+ * before the TLBs are flushed.
+ */
+ ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
+ if (ret)
+ return ret;
+
+ kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);
+
+ /*
+ * No other thread can overwrite the removed SPTE as they must either
+ * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
+ * overwrite the special removed SPTE value. No bookkeeping is needed
+ * here since the SPTE is going from non-present to non-present. Use
+ * the raw write helper to avoid an unnecessary check on volatile bits.
+ */
+ __kvm_tdp_mmu_write_spte(iter->sptep, 0);
+
+ return 0;
+}
+
+/*
+ * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
+ * @kvm: KVM instance
+ * @as_id: Address space ID, i.e. regular vs. SMM
+ * @sptep: Pointer to the SPTE
+ * @old_spte: The current value of the SPTE
+ * @new_spte: The new value that will be set for the SPTE
+ * @gfn: The base GFN that was (or will be) mapped by the SPTE
+ * @level: The level _containing_ the SPTE (its parent PT's level)
+ *
+ * Returns the old SPTE value, which _may_ be different than @old_spte if the
+ * SPTE had volatile bits.
+ */
+static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
+ u64 old_spte, u64 new_spte, gfn_t gfn, int level)
+{
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ /*
+ * No thread should be using this function to set SPTEs to or from the
+ * temporary removed SPTE value.
+ * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
+ * should be used. If operating under the MMU lock in write mode, the
+ * use of the removed SPTE should not be necessary.
+ */
+ WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte));
+
+ old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
+
+ handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
+ return old_spte;
+}
+
+static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
+ u64 new_spte)
+{
+ WARN_ON_ONCE(iter->yielded);
+ iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
+ iter->old_spte, new_spte,
+ iter->gfn, iter->level);
+}
+
+#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
+ for_each_tdp_pte(_iter, _root, _start, _end)
+
+#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
+ tdp_root_for_each_pte(_iter, _root, _start, _end) \
+ if (!is_shadow_present_pte(_iter.old_spte) || \
+ !is_last_spte(_iter.old_spte, _iter.level)) \
+ continue; \
+ else
+
+#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
+ for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end)
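+
+/*
+ * Editor's note: an illustrative, non-upstream sketch of how the iterator
+ * macros above are used.  tdp_root_for_each_leaf_pte() visits only present,
+ * last-level SPTEs, so the loop body can assume a leaf without re-checking:
+ *
+ *	struct tdp_iter iter;
+ *
+ *	rcu_read_lock();
+ *	tdp_root_for_each_leaf_pte(iter, root, start, end)
+ *		pr_debug("leaf spte %llx at gfn %llx\n",
+ *			 iter.old_spte, iter.gfn);
+ *	rcu_read_unlock();
+ */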
+
+/*
+ * Yield if the MMU lock is contended or this thread needs to return control
+ * to the scheduler.
+ *
+ * If this function should yield and flush is set, it will perform a remote
+ * TLB flush before yielding.
+ *
+ * If this function yields, iter->yielded is set and the caller must skip to
+ * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
+ * over the paging structures to allow the iterator to continue its traversal
+ * from the paging structure root.
+ *
+ * Returns true if this function yielded.
+ */
+static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
+ struct tdp_iter *iter,
+ bool flush, bool shared)
+{
+ WARN_ON_ONCE(iter->yielded);
+
+ /* Ensure forward progress has been made before yielding. */
+ if (iter->next_last_level_gfn == iter->yielded_gfn)
+ return false;
+
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+
+ rcu_read_unlock();
+
+ if (shared)
+ cond_resched_rwlock_read(&kvm->mmu_lock);
+ else
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+
+ rcu_read_lock();
+
+ WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
+
+ iter->yielded = true;
+ }
+
+ return iter->yielded;
+}
+
+static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
+{
+ /*
+ * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
+ * a gpa range that would exceed the max gfn, and KVM does not create
+ * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
+ * the slow emulation path every time.
+ */
+ return kvm_mmu_max_gfn() + 1;
+}
+
+static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared, int zap_level)
+{
+ struct tdp_iter iter;
+
+ gfn_t end = tdp_mmu_max_gfn_exclusive();
+ gfn_t start = 0;
+
+ for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
+ continue;
+
+ if (!is_shadow_present_pte(iter.old_spte))
+ continue;
+
+ if (iter.level > zap_level)
+ continue;
+
+ if (!shared)
+ tdp_mmu_iter_set_spte(kvm, &iter, 0);
+ else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
+ goto retry;
+ }
+}
+
+static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared)
+{
+ /*
+ * The root must have an elevated refcount so that it's reachable via
+ * mmu_notifier callbacks, which allows this path to yield and drop
+ * mmu_lock. When handling an unmap/release mmu_notifier command, KVM
+ * must drop all references to relevant pages prior to completing the
+ * callback. Dropping mmu_lock with an unreachable root would result
+ * in zapping SPTEs after a relevant mmu_notifier callback completes
+ * and lead to use-after-free as zapping a SPTE triggers "writeback" of
+ * dirty accessed bits to the SPTE's associated struct page.
+ */
+ WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
+
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
+ rcu_read_lock();
+
+ /*
+ * To avoid RCU stalls due to recursively removing huge swaths of SPs,
+ * split the zap into two passes. On the first pass, zap at the 1gb
+ * level, and then zap top-level SPs on the second pass. "1gb" is not
+ * arbitrary, as KVM must be able to zap a 1gb shadow page without
+ * inducing a stall to allow in-place replacement with a 1gb hugepage.
+ *
+ * Because zapping a SP recurses on its children, stepping down to
+ * PG_LEVEL_4K in the iterator itself is unnecessary.
+ */
+ __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
+ __tdp_mmu_zap_root(kvm, root, shared, root->role.level);
+
+ rcu_read_unlock();
+}
+
+bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ u64 old_spte;
+
+ /*
+ * This helper intentionally doesn't allow zapping a root shadow page,
+ * which doesn't have a parent page table and thus no associated entry.
+ */
+ if (WARN_ON_ONCE(!sp->ptep))
+ return false;
+
+ old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
+ if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
+ return false;
+
+ tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
+ sp->gfn, sp->role.level + 1);
+
+ return true;
+}
+
+/*
+ * If can_yield is true, this function will release the MMU lock and reschedule
+ * if the scheduler needs the CPU or there is contention on the MMU lock. If
+ * it cannot yield, it will not release the MMU lock or reschedule, and
+ * the caller must ensure it does not supply too large a GFN range, or the
+ * operation can cause a soft lockup.
+ */
+static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end, bool can_yield, bool flush)
+{
+ struct tdp_iter iter;
+
+ end = min(end, tdp_mmu_max_gfn_exclusive());
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ rcu_read_lock();
+
+ for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
+ if (can_yield &&
+ tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
+ flush = false;
+ continue;
+ }
+
+ if (!is_shadow_present_pte(iter.old_spte) ||
+ !is_last_spte(iter.old_spte, iter.level))
+ continue;
+
+ tdp_mmu_iter_set_spte(kvm, &iter, 0);
+ flush = true;
+ }
+
+ rcu_read_unlock();
+
+ /*
+ * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
+ * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
+ */
+ return flush;
+}
+
+/*
+ * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
+ * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
+ * more SPTEs were zapped since the MMU lock was last acquired.
+ */
+bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
+{
+ struct kvm_mmu_page *root;
+
+ for_each_tdp_mmu_root_yield_safe(kvm, root, false)
+ flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
+
+ return flush;
+}
+
+void kvm_tdp_mmu_zap_all(struct kvm *kvm)
+{
+ struct kvm_mmu_page *root;
+
+ /*
+ * Zap all roots, including invalid roots, as all SPTEs must be dropped
+ * before returning to the caller. Zap directly even if the root is
+ * also being zapped by a worker. Walking zapped top-level SPTEs isn't
+ * all that expensive and mmu_lock is already held, which means the
+ * worker has yielded, i.e. flushing the work instead of zapping here
+ * isn't guaranteed to be any faster.
+ *
+	 * A TLB flush is unnecessary, as KVM zaps everything if and only if the VM
+ * is being destroyed or the userspace VMM has exited. In both cases,
+ * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
+ */
+ for_each_tdp_mmu_root_yield_safe(kvm, root, false)
+ tdp_mmu_zap_root(kvm, root, false);
+}
+
+/*
+ * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
+ * zap" completes.
+ */
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
+{
+ struct kvm_mmu_page *root;
+
+ read_lock(&kvm->mmu_lock);
+
+ for_each_tdp_mmu_root_yield_safe(kvm, root, true) {
+ if (!root->tdp_mmu_scheduled_root_to_zap)
+ continue;
+
+ root->tdp_mmu_scheduled_root_to_zap = false;
+ KVM_BUG_ON(!root->role.invalid, kvm);
+
+ /*
+ * A TLB flush is not necessary as KVM performs a local TLB
+ * flush when allocating a new root (see kvm_mmu_load()), and
+ * when migrating a vCPU to a different pCPU. Note, the local
+ * TLB flush on reuse also invalidates paging-structure-cache
+ * entries, i.e. TLB entries for intermediate paging structures,
+ * that may be zapped, as such entries are associated with the
+ * ASID on both VMX and SVM.
+ */
+ tdp_mmu_zap_root(kvm, root, true);
+
+ /*
+		 * The reference needs to be put *after* zapping the root, as
+		 * the root must be reachable by mmu_notifiers while it's being
+		 * zapped.
+ */
+ kvm_tdp_mmu_put_root(kvm, root, true);
+ }
+
+ read_unlock(&kvm->mmu_lock);
+}
+
+/*
+ * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
+ * is about to be zapped, e.g. in response to a memslots update. The actual
+ * zapping is done separately so that it happens with mmu_lock held for read,
+ * whereas invalidating roots must be done with mmu_lock held for write (unless
+ * the VM is being destroyed).
+ *
+ * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
+ * See kvm_tdp_mmu_get_vcpu_root_hpa().
+ */
+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
+{
+ struct kvm_mmu_page *root;
+
+ /*
+ * mmu_lock must be held for write to ensure that a root doesn't become
+ * invalid while there are active readers (invalidating a root while
+ * there are active readers may or may not be problematic in practice,
+ * but it's uncharted territory and not supported).
+ *
+ * Waive the assertion if there are no users of @kvm, i.e. the VM is
+ * being destroyed after all references have been put, or if no vCPUs
+ * have been created (which means there are no roots), i.e. the VM is
+ * being destroyed in an error path of KVM_CREATE_VM.
+ */
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
+ refcount_read(&kvm->users_count) && kvm->created_vcpus)
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ /*
+ * As above, mmu_lock isn't held when destroying the VM! There can't
+ * be other references to @kvm, i.e. nothing else can invalidate roots
+ * or get/put references to roots.
+ */
+ list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
+ /*
+ * Note, invalid roots can outlive a memslot update! Invalid
+ * roots must be *zapped* before the memslot update completes,
+ * but a different task can acquire a reference and keep the
+		 * root alive after it's been zapped.
+ */
+ if (!root->role.invalid) {
+ root->tdp_mmu_scheduled_root_to_zap = true;
+ root->role.invalid = true;
+ }
+ }
+}
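+
+/*
+ * Editor's note: a hedged, non-upstream sketch of how the two halves are
+ * assumed to pair up on a memslot update: roots are invalidated under
+ * mmu_lock held for write, then zapped later under mmu_lock held for read:
+ *
+ *	write_lock(&kvm->mmu_lock);
+ *	kvm_tdp_mmu_invalidate_all_roots(kvm);
+ *	write_unlock(&kvm->mmu_lock);
+ *	...
+ *	kvm_tdp_mmu_zap_invalidated_roots(kvm);	// takes mmu_lock for read
+ */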
+
+/*
+ * Installs a last-level SPTE to handle a TDP page fault.
+ * (NPT/EPT violation/misconfiguration)
+ */
+static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault,
+ struct tdp_iter *iter)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
+ u64 new_spte;
+ int ret = RET_PF_FIXED;
+ bool wrprot = false;
+
+ if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
+ return RET_PF_RETRY;
+
+ if (unlikely(!fault->slot))
+ new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
+ else
+ wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
+ fault->pfn, iter->old_spte, fault->prefetch, true,
+ fault->map_writable, &new_spte);
+
+ if (new_spte == iter->old_spte)
+ ret = RET_PF_SPURIOUS;
+ else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
+ return RET_PF_RETRY;
+ else if (is_shadow_present_pte(iter->old_spte) &&
+ !is_last_spte(iter->old_spte, iter->level))
+ kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
+
+ /*
+ * If the page fault was caused by a write but the page is write
+ * protected, emulation is needed. If the emulation was skipped,
+ * the vCPU would have the same fault again.
+ */
+	if (wrprot && fault->write)
+		ret = RET_PF_EMULATE;
+
+ /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
+ if (unlikely(is_mmio_spte(new_spte))) {
+ vcpu->stat.pf_mmio_spte_created++;
+ trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
+ new_spte);
+ ret = RET_PF_EMULATE;
+ } else {
+ trace_kvm_mmu_set_spte(iter->level, iter->gfn,
+ rcu_dereference(iter->sptep));
+ }
+
+ return ret;
+}
+
+/*
+ * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
+ * provided page table.
+ *
+ * @kvm: kvm instance
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @sp: The new TDP page table to install.
+ * @shared: This operation is running under the MMU lock in read mode.
+ *
+ * Returns: 0 if the new page table was installed. Non-0 if the page table
+ * could not be installed (e.g. the atomic compare-exchange failed).
+ */
+static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool shared)
+{
+ u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
+ int ret = 0;
+
+ if (shared) {
+ ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
+ if (ret)
+ return ret;
+ } else {
+ tdp_mmu_iter_set_spte(kvm, iter, spte);
+ }
+
+ tdp_account_mmu_page(kvm, sp);
+
+ return 0;
+}
+
+static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool shared);
+
+/*
+ * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
+ * page tables and SPTEs to translate the faulting guest physical address.
+ */
+int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ struct kvm *kvm = vcpu->kvm;
+ struct tdp_iter iter;
+ struct kvm_mmu_page *sp;
+ int ret = RET_PF_RETRY;
+
+ kvm_mmu_hugepage_adjust(vcpu, fault);
+
+ trace_kvm_mmu_spte_requested(fault);
+
+ rcu_read_lock();
+
+ tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
+ int r;
+
+ if (fault->nx_huge_page_workaround_enabled)
+ disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
+
+ /*
+ * If SPTE has been frozen by another thread, just give up and
+ * retry, avoiding unnecessary page table allocation and free.
+ */
+ if (is_removed_spte(iter.old_spte))
+ goto retry;
+
+ if (iter.level == fault->goal_level)
+ goto map_target_level;
+
+ /* Step down into the lower level page table if it exists. */
+ if (is_shadow_present_pte(iter.old_spte) &&
+ !is_large_pte(iter.old_spte))
+ continue;
+
+ /*
+ * The SPTE is either non-present or points to a huge page that
+ * needs to be split.
+ */
+ sp = tdp_mmu_alloc_sp(vcpu);
+ tdp_mmu_init_child_sp(sp, &iter);
+
+ sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
+
+ if (is_shadow_present_pte(iter.old_spte))
+ r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
+ else
+ r = tdp_mmu_link_sp(kvm, &iter, sp, true);
+
+ /*
+ * Force the guest to retry if installing an upper level SPTE
+ * failed, e.g. because a different task modified the SPTE.
+ */
+ if (r) {
+ tdp_mmu_free_sp(sp);
+ goto retry;
+ }
+
+ if (fault->huge_page_disallowed &&
+ fault->req_level >= iter.level) {
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ if (sp->nx_huge_page_disallowed)
+ track_possible_nx_huge_page(kvm, sp);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+ }
+ }
+
+ /*
+ * The walk aborted before reaching the target level, e.g. because the
+ * iterator detected an upper level SPTE was frozen during traversal.
+ */
+ WARN_ON_ONCE(iter.level == fault->goal_level);
+ goto retry;
+
+map_target_level:
+ ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
+
+retry:
+ rcu_read_unlock();
+ return ret;
+}
+
+bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
+ bool flush)
+{
+ struct kvm_mmu_page *root;
+
+ __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false, false)
+ flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
+ range->may_block, flush);
+
+ return flush;
+}
+
+typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_gfn_range *range);
+
+static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
+ struct kvm_gfn_range *range,
+ tdp_handler_t handler)
+{
+ struct kvm_mmu_page *root;
+ struct tdp_iter iter;
+ bool ret = false;
+
+ /*
+ * Don't support rescheduling, none of the MMU notifiers that funnel
+ * into this helper allow blocking; it'd be dead, wasteful code.
+ */
+ for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
+ rcu_read_lock();
+
+ tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
+ ret |= handler(kvm, &iter, range);
+
+ rcu_read_unlock();
+ }
+
+ return ret;
+}
+
+/*
+ * Mark the SPTEs mapping GFNs in the range [start, end) unaccessed and return
+ * non-zero if any of the GFNs in the range have been accessed.
+ *
+ * No need to mark the corresponding PFN as accessed as this call is coming
+ * from the clear_young() or clear_flush_young() notifier, which uses the
+ * return value to determine if the page has been accessed.
+ */
+static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_gfn_range *range)
+{
+ u64 new_spte;
+
+ /* If we have a non-accessed entry we don't need to change the pte. */
+ if (!is_accessed_spte(iter->old_spte))
+ return false;
+
+ if (spte_ad_enabled(iter->old_spte)) {
+ iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
+ iter->old_spte,
+ shadow_accessed_mask,
+ iter->level);
+ new_spte = iter->old_spte & ~shadow_accessed_mask;
+ } else {
+ /*
+ * Capture the dirty status of the page, so that it doesn't get
+ * lost when the SPTE is marked for access tracking.
+ */
+ if (is_writable_pte(iter->old_spte))
+ kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));
+
+ new_spte = mark_spte_for_access_track(iter->old_spte);
+ iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
+ iter->old_spte, new_spte,
+ iter->level);
+ }
+
+ trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
+ iter->old_spte, new_spte);
+ return true;
+}
+
+bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
+}
+
+static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_gfn_range *range)
+{
+ return is_accessed_spte(iter->old_spte);
+}
+
+bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
+}
+
+static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_gfn_range *range)
+{
+ u64 new_spte;
+
+ /* Huge pages aren't expected to be modified without first being zapped. */
+ WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end);
+
+ if (iter->level != PG_LEVEL_4K ||
+ !is_shadow_present_pte(iter->old_spte))
+ return false;
+
+ /*
+ * Note, when changing a read-only SPTE, it's not strictly necessary to
+ * zero the SPTE before setting the new PFN, but doing so preserves the
+	 * invariant that the PFN of a present leaf SPTE can never change.
+ * See handle_changed_spte().
+ */
+ tdp_mmu_iter_set_spte(kvm, iter, 0);
+
+ if (!pte_write(range->arg.pte)) {
+ new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
+ pte_pfn(range->arg.pte));
+
+ tdp_mmu_iter_set_spte(kvm, iter, new_spte);
+ }
+
+ return true;
+}
+
+/*
+ * Handle the changed_pte MMU notifier for the TDP MMU.
+ * data is a pointer to the new pte_t mapping the HVA specified by the MMU
+ * notifier.
+ * Returns non-zero if a flush is needed before releasing the MMU lock.
+ */
+bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ /*
+ * No need to handle the remote TLB flush under RCU protection, the
+ * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
+ * shadow page. See the WARN on pfn_changed in handle_changed_spte().
+ */
+ return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
+}
+
+/*
+ * Remove write access from all SPTEs at or above min_level that map GFNs
+ * [start, end). Returns true if an SPTE has been changed and the TLBs need to
+ * be flushed.
+ */
+static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end, int min_level)
+{
+ struct tdp_iter iter;
+ u64 new_spte;
+ bool spte_set = false;
+
+ rcu_read_lock();
+
+ BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
+
+ for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
+ continue;
+
+ if (!is_shadow_present_pte(iter.old_spte) ||
+ !is_last_spte(iter.old_spte, iter.level) ||
+ !(iter.old_spte & PT_WRITABLE_MASK))
+ continue;
+
+ new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
+
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
+ goto retry;
+
+ spte_set = true;
+ }
+
+ rcu_read_unlock();
+ return spte_set;
+}
+
+/*
+ * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
+ * only affect leaf SPTEs down to min_level.
+ * Returns true if an SPTE has been changed and the TLBs need to be flushed.
+ */
+bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, int min_level)
+{
+ struct kvm_mmu_page *root;
+ bool spte_set = false;
+
+ lockdep_assert_held_read(&kvm->mmu_lock);
+
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+ spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
+ slot->base_gfn + slot->npages, min_level);
+
+ return spte_set;
+}
+
+static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
+{
+ struct kvm_mmu_page *sp;
+
+ gfp |= __GFP_ZERO;
+
+ sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
+ if (!sp)
+ return NULL;
+
+ sp->spt = (void *)__get_free_page(gfp);
+ if (!sp->spt) {
+ kmem_cache_free(mmu_page_header_cache, sp);
+ return NULL;
+ }
+
+ return sp;
+}
+
+static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
+ struct tdp_iter *iter,
+ bool shared)
+{
+ struct kvm_mmu_page *sp;
+
+ /*
+ * Since we are allocating while under the MMU lock we have to be
+ * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
+ * reclaim and to avoid making any filesystem callbacks (which can end
+ * up invoking KVM MMU notifiers, resulting in a deadlock).
+ *
+ * If this allocation fails we drop the lock and retry with reclaim
+ * allowed.
+ */
+ sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
+ if (sp)
+ return sp;
+
+ rcu_read_unlock();
+
+ if (shared)
+ read_unlock(&kvm->mmu_lock);
+ else
+ write_unlock(&kvm->mmu_lock);
+
+ iter->yielded = true;
+ sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
+
+ if (shared)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
+
+ rcu_read_lock();
+
+ return sp;
+}
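+
+/*
+ * Editor's note: an illustrative, non-upstream sketch of the caller-side
+ * contract: when tdp_mmu_alloc_sp_for_split() had to drop mmu_lock to
+ * allocate, it sets iter->yielded and the caller must restart the walk
+ * before linking @sp (mirrors the loop in tdp_mmu_split_huge_pages_root()):
+ *
+ *	sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
+ *	if (!sp)
+ *		return -ENOMEM;
+ *	if (iter.yielded)
+ *		continue;	// the iterator re-walks from the root
+ */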
+
+/* Note, the caller is responsible for initializing @sp. */
+static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool shared)
+{
+ const u64 huge_spte = iter->old_spte;
+ const int level = iter->level;
+ int ret, i;
+
+ /*
+ * No need for atomics when writing to sp->spt since the page table has
+ * not been linked in yet and thus is not reachable from any other CPU.
+ */
+ for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
+ sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
+
+ /*
+ * Replace the huge spte with a pointer to the populated lower level
+ * page table. Since we are making this change without a TLB flush vCPUs
+ * will see a mix of the split mappings and the original huge mapping,
+ * depending on what's currently in their TLB. This is fine from a
+ * correctness standpoint since the translation will be the same either
+ * way.
+ */
+ ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
+ if (ret)
+ goto out;
+
+ /*
+	 * tdp_mmu_link_sp() will handle subtracting the huge page we are
+	 * overwriting from the page stats. But we have to manually update
+ * the page stats with the new present child pages.
+ */
+ kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
+
+out:
+ trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
+ return ret;
+}
+
+static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
+ struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end,
+ int target_level, bool shared)
+{
+ struct kvm_mmu_page *sp = NULL;
+ struct tdp_iter iter;
+ int ret = 0;
+
+ rcu_read_lock();
+
+ /*
+ * Traverse the page table splitting all huge pages above the target
+ * level into one lower level. For example, if we encounter a 1GB page
+ * we split it into 512 2MB pages.
+ *
+ * Since the TDP iterator uses a pre-order traversal, we are guaranteed
+ * to visit an SPTE before ever visiting its children, which means we
+ * will correctly recursively split huge pages that are more than one
+	 * level above the target level (e.g. splitting a 1GB page into 512 2MB
+	 * pages, and then splitting each of those into 512 4KB pages).
+ */
+ for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
+ continue;
+
+ if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
+ continue;
+
+ if (!sp) {
+ sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
+ if (!sp) {
+ ret = -ENOMEM;
+ trace_kvm_mmu_split_huge_page(iter.gfn,
+ iter.old_spte,
+ iter.level, ret);
+ break;
+ }
+
+ if (iter.yielded)
+ continue;
+ }
+
+ tdp_mmu_init_child_sp(sp, &iter);
+
+ if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
+ goto retry;
+
+ sp = NULL;
+ }
+
+ rcu_read_unlock();
+
+ /*
+ * It's possible to exit the loop having never used the last sp if, for
+	 * example, a vCPU doing NX huge page splitting wins the race and
+ * installs its own sp in place of the last sp we tried to split.
+ */
+ if (sp)
+ tdp_mmu_free_sp(sp);
+
+ return ret;
+}
+
+/*
+ * Try to split all huge pages mapped by the TDP MMU down to the target level.
+ */
+void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end,
+ int target_level, bool shared)
+{
+ struct kvm_mmu_page *root;
+ int r = 0;
+
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
+ r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
+ if (r) {
+ kvm_tdp_mmu_put_root(kvm, root, shared);
+ break;
+ }
+ }
+}
+
+/*
+ * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
+ * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
+ * If AD bits are not enabled, this will require clearing the writable bit on
+ * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
+ * be flushed.
+ */
+static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end)
+{
+ u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
+ struct tdp_iter iter;
+ bool spte_set = false;
+
+ rcu_read_lock();
+
+ tdp_root_for_each_leaf_pte(iter, root, start, end) {
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
+ continue;
+
+ if (!is_shadow_present_pte(iter.old_spte))
+ continue;
+
+ KVM_MMU_WARN_ON(kvm_ad_enabled() &&
+ spte_ad_need_write_protect(iter.old_spte));
+
+ if (!(iter.old_spte & dbit))
+ continue;
+
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
+ goto retry;
+
+ spte_set = true;
+ }
+
+ rcu_read_unlock();
+ return spte_set;
+}
+
+/*
+ * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
+ * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
+ * If AD bits are not enabled, this will require clearing the writable bit on
+ * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
+ * be flushed.
+ */
+bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
+{
+ struct kvm_mmu_page *root;
+ bool spte_set = false;
+
+ lockdep_assert_held_read(&kvm->mmu_lock);
+
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+ spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
+ slot->base_gfn + slot->npages);
+
+ return spte_set;
+}
+
+/*
+ * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
+ * set in mask, starting at gfn. The given memslot is expected to contain all
+ * the GFNs represented by set bits in the mask. If AD bits are enabled,
+ * clearing the dirty status will involve clearing the dirty bit on each SPTE
+ * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
+ */
+static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t gfn, unsigned long mask, bool wrprot)
+{
+ u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
+ shadow_dirty_mask;
+ struct tdp_iter iter;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ rcu_read_lock();
+
+ tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
+ gfn + BITS_PER_LONG) {
+ if (!mask)
+ break;
+
+ KVM_MMU_WARN_ON(kvm_ad_enabled() &&
+ spte_ad_need_write_protect(iter.old_spte));
+
+ if (iter.level > PG_LEVEL_4K ||
+ !(mask & (1UL << (iter.gfn - gfn))))
+ continue;
+
+ mask &= ~(1UL << (iter.gfn - gfn));
+
+ if (!(iter.old_spte & dbit))
+ continue;
+
+ iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
+ iter.old_spte, dbit,
+ iter.level);
+
+ trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
+ iter.old_spte,
+ iter.old_spte & ~dbit);
+ kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
+ }
+
+ rcu_read_unlock();
+}
+
+/*
+ * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
+ * set in mask, starting at gfn. The given memslot is expected to contain all
+ * the GFNs represented by set bits in the mask. If AD bits are enabled,
+ * clearing the dirty status will involve clearing the dirty bit on each SPTE
+ * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
+ */
+void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn, unsigned long mask,
+ bool wrprot)
+{
+ struct kvm_mmu_page *root;
+
+ for_each_tdp_mmu_root(kvm, root, slot->as_id)
+ clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
+}
+
+static void zap_collapsible_spte_range(struct kvm *kvm,
+ struct kvm_mmu_page *root,
+ const struct kvm_memory_slot *slot)
+{
+ gfn_t start = slot->base_gfn;
+ gfn_t end = start + slot->npages;
+ struct tdp_iter iter;
+ int max_mapping_level;
+
+ rcu_read_lock();
+
+ for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
+ continue;
+
+ if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
+ !is_shadow_present_pte(iter.old_spte))
+ continue;
+
+ /*
+ * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with
+ * a large page size, then its parent would have been zapped
+ * instead of stepping down.
+ */
+ if (is_last_spte(iter.old_spte, iter.level))
+ continue;
+
+ /*
+ * If iter.gfn resides outside of the slot, i.e. the page for
+ * the current level overlaps but is not contained by the slot,
+ * then the SPTE can't be made huge. More importantly, trying
+ * to query that info from slot->arch.lpage_info will cause an
+ * out-of-bounds access.
+ */
+ if (iter.gfn < start || iter.gfn >= end)
+ continue;
+
+ max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
+ iter.gfn, PG_LEVEL_NUM);
+ if (max_mapping_level < iter.level)
+ continue;
+
+ /* Note, a successful atomic zap also does a remote TLB flush. */
+ if (tdp_mmu_zap_spte_atomic(kvm, &iter))
+ goto retry;
+ }
+
+ rcu_read_unlock();
+}
+
+/*
+ * Zap non-leaf SPTEs (and free their associated page tables) which could
+ * be replaced by huge pages, for GFNs within the slot.
+ */
+void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
+{
+ struct kvm_mmu_page *root;
+
+ lockdep_assert_held_read(&kvm->mmu_lock);
+
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+ zap_collapsible_spte_range(kvm, root, slot);
+}
+
+/*
+ * Removes write access on the last level SPTE mapping this GFN and unsets the
+ * MMU-writable bit to ensure future writes continue to be intercepted.
+ * Returns true if an SPTE was set and a TLB flush is needed.
+ */
+static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t gfn, int min_level)
+{
+ struct tdp_iter iter;
+ u64 new_spte;
+ bool spte_set = false;
+
+ BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
+
+ rcu_read_lock();
+
+ for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
+ if (!is_shadow_present_pte(iter.old_spte) ||
+ !is_last_spte(iter.old_spte, iter.level))
+ continue;
+
+ new_spte = iter.old_spte &
+ ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
+
+ if (new_spte == iter.old_spte)
+ break;
+
+ tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
+ spte_set = true;
+ }
+
+ rcu_read_unlock();
+
+ return spte_set;
+}
+
+/*
+ * Removes write access on the last level SPTE mapping this GFN and unsets the
+ * MMU-writable bit to ensure future writes continue to be intercepted.
+ * Returns true if an SPTE was set and a TLB flush is needed.
+ */
+bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int min_level)
+{
+ struct kvm_mmu_page *root;
+ bool spte_set = false;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ for_each_tdp_mmu_root(kvm, root, slot->as_id)
+ spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
+
+ return spte_set;
+}
+
+/*
+ * Return the level of the lowest level SPTE added to sptes.
+ * That SPTE may be non-present.
+ *
+ * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
+ */
+int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
+ int *root_level)
+{
+ struct tdp_iter iter;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ gfn_t gfn = addr >> PAGE_SHIFT;
+ int leaf = -1;
+
+ *root_level = vcpu->arch.mmu->root_role.level;
+
+ tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
+ leaf = iter.level;
+ sptes[leaf] = iter.old_spte;
+ }
+
+ return leaf;
+}
+
+/*
+ * Returns the last level spte pointer of the shadow page walk for the given
+ * gpa, and sets *spte to the spte value. This spte may be non-present. If no
+ * walk could be performed, returns NULL and *spte does not contain valid data.
+ *
+ * Contract:
+ * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
+ * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
+ *
+ * WARNING: This function is only intended to be called during fast_page_fault.
+ */
+u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
+ u64 *spte)
+{
+ struct tdp_iter iter;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ gfn_t gfn = addr >> PAGE_SHIFT;
+ tdp_ptep_t sptep = NULL;
+
+ tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
+ *spte = iter.old_spte;
+ sptep = iter.sptep;
+ }
+
+ /*
+ * Perform the rcu_dereference to get the raw spte pointer value since
+ * we are passing it up to fast_page_fault, which is shared with the
+ * legacy MMU and thus does not retain the TDP MMU-specific __rcu
+ * annotation.
+ *
+ * This is safe since fast_page_fault obeys the contracts of this
+ * function as well as all TDP MMU contracts around modifying SPTEs
+ * outside of mmu_lock.
+ */
+ return rcu_dereference(sptep);
+}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
new file mode 100644
index 000000000..733a3aef3
--- /dev/null
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __KVM_X86_MMU_TDP_MMU_H
+#define __KVM_X86_MMU_TDP_MMU_H
+
+#include <linux/kvm_host.h>
+
+#include "spte.h"
+
+void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
+void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
+
+hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
+
+__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
+{
+ return refcount_inc_not_zero(&root->tdp_mmu_root_count);
+}
+
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared);
+
+bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush);
+bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
+void kvm_tdp_mmu_zap_all(struct kvm *kvm);
+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
+
+int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
+
+bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
+ bool flush);
+bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+
+bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, int min_level);
+bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
+ const struct kvm_memory_slot *slot);
+void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn, unsigned long mask,
+ bool wrprot);
+void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *slot);
+
+bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int min_level);
+
+void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end,
+ int target_level, bool shared);
+
+static inline void kvm_tdp_mmu_walk_lockless_begin(void)
+{
+ rcu_read_lock();
+}
+
+static inline void kvm_tdp_mmu_walk_lockless_end(void)
+{
+ rcu_read_unlock();
+}
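+
+/*
+ * Editor's note: illustrative, non-upstream usage of the lockless-walk
+ * contract below; the array sizing is an assumption here, following the
+ * callers in mmu.c, with sptes[] indexed by level:
+ *
+ *	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
+ *	int leaf, root_level;
+ *
+ *	kvm_tdp_mmu_walk_lockless_begin();
+ *	leaf = kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, &root_level);
+ *	kvm_tdp_mmu_walk_lockless_end();
+ */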
+
+int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
+ int *root_level);
+u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
+ u64 *spte);
+
+#ifdef CONFIG_X86_64
+static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; }
+#else
+static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }
+#endif
+
+#endif /* __KVM_X86_MMU_TDP_MMU_H */
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
new file mode 100644
index 000000000..3eb6e7f47
--- /dev/null
+++ b/arch/x86/kvm/mtrr.c
@@ -0,0 +1,720 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vMTRR implementation
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ * Marcelo Tosatti <mtosatti@redhat.com>
+ * Paolo Bonzini <pbonzini@redhat.com>
+ * Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include <asm/mtrr.h>
+
+#include "cpuid.h"
+#include "mmu.h"
+
+#define IA32_MTRR_DEF_TYPE_E (1ULL << 11)
+#define IA32_MTRR_DEF_TYPE_FE (1ULL << 10)
+#define IA32_MTRR_DEF_TYPE_TYPE_MASK (0xff)
+
+static bool is_mtrr_base_msr(unsigned int msr)
+{
+ /* MTRR base MSRs use even numbers, masks use odd numbers. */
+ return !(msr & 0x1);
+}
+
+static struct kvm_mtrr_range *var_mtrr_msr_to_range(struct kvm_vcpu *vcpu,
+ unsigned int msr)
+{
+ int index = (msr - MTRRphysBase_MSR(0)) / 2;
+
+ return &vcpu->arch.mtrr_state.var_ranges[index];
+}
+
+static bool msr_mtrr_valid(unsigned msr)
+{
+ switch (msr) {
+ case MTRRphysBase_MSR(0) ... MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1):
+ case MSR_MTRRfix64K_00000:
+ case MSR_MTRRfix16K_80000:
+ case MSR_MTRRfix16K_A0000:
+ case MSR_MTRRfix4K_C0000:
+ case MSR_MTRRfix4K_C8000:
+ case MSR_MTRRfix4K_D0000:
+ case MSR_MTRRfix4K_D8000:
+ case MSR_MTRRfix4K_E0000:
+ case MSR_MTRRfix4K_E8000:
+ case MSR_MTRRfix4K_F0000:
+ case MSR_MTRRfix4K_F8000:
+ case MSR_MTRRdefType:
+ return true;
+ }
+ return false;
+}
+
+static bool valid_mtrr_type(unsigned t)
+{
+ return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
+}
+
+static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ int i;
+ u64 mask;
+
+ if (!msr_mtrr_valid(msr))
+ return false;
+
+ if (msr == MSR_MTRRdefType) {
+ if (data & ~0xcff)
+ return false;
+ return valid_mtrr_type(data & 0xff);
+ } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
+ for (i = 0; i < 8 ; i++)
+ if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
+ return false;
+ return true;
+ }
+
+ /* variable MTRRs */
+ WARN_ON(!(msr >= MTRRphysBase_MSR(0) &&
+ msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1)));
+
+ mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
+ if ((msr & 1) == 0) {
+ /* MTRR base */
+ if (!valid_mtrr_type(data & 0xff))
+ return false;
+ mask |= 0xf00;
+ } else
+ /* MTRR mask */
+ mask |= 0x7ff;
+
+ return (data & mask) == 0;
+}
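+
+/*
+ * Editor's note: a worked example for the mask check above (not upstream
+ * code), assuming a guest MAXPHYADDR of 36.  For a variable MTRR base MSR,
+ * the combined mask is the reserved GPA bits (63:36) plus bits 11:8:
+ *
+ *	data = 0x0000000080000006  ->  valid   (type WB, no reserved bits)
+ *	data = 0x0000001000000006  ->  invalid (bit 36 is reserved)
+ *	data = 0x0000000080000106  ->  invalid (bit 8 is reserved)
+ */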
+
+static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
+{
+ return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_E);
+}
+
+static bool fixed_mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
+{
+ return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_FE);
+}
+
+static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state)
+{
+ return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK;
+}
+
+static u8 mtrr_disabled_type(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Intel SDM 11.11.2.2: all MTRRs are disabled when
+ * IA32_MTRR_DEF_TYPE.E bit is cleared, and the UC
+ * memory type is applied to all of physical memory.
+ *
+ * However, virtual machines can be run with CPUID such that
+ * there are no MTRRs. In that case, the firmware will never
+	 * enable MTRRs, and it is obviously undesirable to run the
+	 * guest entirely with UC memory, so WB is used instead.
+ */
+ if (guest_cpuid_has(vcpu, X86_FEATURE_MTRR))
+ return MTRR_TYPE_UNCACHABLE;
+ else
+ return MTRR_TYPE_WRBACK;
+}
+
+/*
+ * Three terms are used in the following code:
+ * - segment: an address segment covered by the fixed MTRRs.
+ * - unit: one MSR entry within a segment.
+ * - range: a sub-range of a unit that is mapped with one memory cache type.
+ */
+struct fixed_mtrr_segment {
+ u64 start;
+ u64 end;
+
+ int range_shift;
+
+ /* the start position in kvm_mtrr.fixed_ranges[]. */
+ int range_start;
+};
+
+static struct fixed_mtrr_segment fixed_seg_table[] = {
+ /* MSR_MTRRfix64K_00000, 1 unit. 64K fixed mtrr. */
+ {
+ .start = 0x0,
+ .end = 0x80000,
+ .range_shift = 16, /* 64K */
+ .range_start = 0,
+ },
+
+ /*
+ * MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000, 2 units,
+ * 16K fixed mtrr.
+ */
+ {
+ .start = 0x80000,
+ .end = 0xc0000,
+ .range_shift = 14, /* 16K */
+ .range_start = 8,
+ },
+
+ /*
+ * MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000, 8 units,
+ * 4K fixed mtrr.
+ */
+ {
+ .start = 0xc0000,
+ .end = 0x100000,
+		.range_shift = 12, /* 4K */
+ .range_start = 24,
+ }
+};
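+
+/*
+ * Editor's note: a worked example of the table above (not upstream code).
+ * Segment 1 covers [0x80000, 0xc0000) with 16K ranges, so each unit spans
+ * 8 << 14 = 128K and its two MSRs map as:
+ *
+ *	MSR_MTRRfix16K_80000 (unit 0) -> [0x80000, 0xa0000), ranges 8..15
+ *	MSR_MTRRfix16K_A0000 (unit 1) -> [0xa0000, 0xc0000), ranges 16..23
+ */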
+
+/*
+ * Each unit is covered by one MSR, and one MSR entry contains
+ * 8 ranges, so the unit size is always 8 * 2^range_shift.
+ */
+static u64 fixed_mtrr_seg_unit_size(int seg)
+{
+ return 8 << fixed_seg_table[seg].range_shift;
+}
+
+static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit)
+{
+ switch (msr) {
+ case MSR_MTRRfix64K_00000:
+ *seg = 0;
+ *unit = 0;
+ break;
+ case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000:
+ *seg = 1;
+ *unit = array_index_nospec(
+ msr - MSR_MTRRfix16K_80000,
+ MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1);
+ break;
+ case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
+ *seg = 2;
+ *unit = array_index_nospec(
+ msr - MSR_MTRRfix4K_C0000,
+ MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1);
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+static void fixed_mtrr_seg_unit_range(int seg, int unit, u64 *start, u64 *end)
+{
+ struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+ u64 unit_size = fixed_mtrr_seg_unit_size(seg);
+
+ *start = mtrr_seg->start + unit * unit_size;
+ *end = *start + unit_size;
+ WARN_ON(*end > mtrr_seg->end);
+}
+
+static int fixed_mtrr_seg_unit_range_index(int seg, int unit)
+{
+ struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+
+ WARN_ON(mtrr_seg->start + unit * fixed_mtrr_seg_unit_size(seg)
+ > mtrr_seg->end);
+
+ /* each unit has 8 ranges. */
+ return mtrr_seg->range_start + 8 * unit;
+}
+
+static int fixed_mtrr_seg_end_range_index(int seg)
+{
+ struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+ int n;
+
+ n = (mtrr_seg->end - mtrr_seg->start) >> mtrr_seg->range_shift;
+ return mtrr_seg->range_start + n - 1;
+}
+
+static bool fixed_msr_to_range(u32 msr, u64 *start, u64 *end)
+{
+ int seg, unit;
+
+ if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
+ return false;
+
+ fixed_mtrr_seg_unit_range(seg, unit, start, end);
+ return true;
+}
+
+static int fixed_msr_to_range_index(u32 msr)
+{
+ int seg, unit;
+
+ if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
+ return -1;
+
+ return fixed_mtrr_seg_unit_range_index(seg, unit);
+}
+
+static int fixed_mtrr_addr_to_seg(u64 addr)
+{
+ struct fixed_mtrr_segment *mtrr_seg;
+ int seg, seg_num = ARRAY_SIZE(fixed_seg_table);
+
+ for (seg = 0; seg < seg_num; seg++) {
+ mtrr_seg = &fixed_seg_table[seg];
+ if (mtrr_seg->start <= addr && addr < mtrr_seg->end)
+ return seg;
+ }
+
+ return -1;
+}
+
+static int fixed_mtrr_addr_seg_to_range_index(u64 addr, int seg)
+{
+ struct fixed_mtrr_segment *mtrr_seg;
+ int index;
+
+ mtrr_seg = &fixed_seg_table[seg];
+ index = mtrr_seg->range_start;
+ index += (addr - mtrr_seg->start) >> mtrr_seg->range_shift;
+ return index;
+}
+
+static u64 fixed_mtrr_range_end_addr(int seg, int index)
+{
+ struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+ int pos = index - mtrr_seg->range_start;
+
+ return mtrr_seg->start + ((pos + 1) << mtrr_seg->range_shift);
+}
+
+static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end)
+{
+ u64 mask;
+
+ *start = range->base & PAGE_MASK;
+
+ mask = range->mask & PAGE_MASK;
+
+	/*
+	 * This cannot overflow because writing to the reserved bits of
+	 * variable MTRRs causes a #GP.
+	 */
+ *end = (*start | ~mask) + 1;
+}
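+
+/*
+ * Editor's note: a worked example for var_mtrr_range() (not upstream code),
+ * assuming a guest MAXPHYADDR of 36 and a WB range covering [2GB, 4GB):
+ *
+ *	range->base = 0x0000000080000006  (base 0x80000000, type WB)
+ *	range->mask = 0xffffffff80000800  (2GB mask, valid bit 11, plus the
+ *					   reserved bits that
+ *					   set_var_mtrr_msr() stuffs in)
+ *
+ *	*start = 0x80000000
+ *	~(mask & PAGE_MASK) = 0x7fffffff, so
+ *	*end   = (0x80000000 | 0x7fffffff) + 1 = 0x100000000
+ */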
+
+static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+ gfn_t start, end;
+
+ if (!tdp_enabled || !kvm_arch_has_noncoherent_dma(vcpu->kvm))
+ return;
+
+ if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType)
+ return;
+
+ /* fixed MTRRs. */
+ if (fixed_msr_to_range(msr, &start, &end)) {
+ if (!fixed_mtrr_is_enabled(mtrr_state))
+ return;
+ } else if (msr == MSR_MTRRdefType) {
+ start = 0x0;
+ end = ~0ULL;
+ } else {
+ /* variable range MTRRs. */
+ var_mtrr_range(var_mtrr_msr_to_range(vcpu, msr), &start, &end);
+ }
+
+ kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end));
+}
+
+static bool var_mtrr_range_is_valid(struct kvm_mtrr_range *range)
+{
+ return (range->mask & (1 << 11)) != 0;
+}
+
+static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+ struct kvm_mtrr_range *tmp, *cur;
+
+ cur = var_mtrr_msr_to_range(vcpu, msr);
+
+ /* remove the entry if it's in the list. */
+ if (var_mtrr_range_is_valid(cur))
+ list_del(&cur->node);
+
+ /*
+ * Set all illegal GPA bits in the mask, since those bits must
+ * implicitly be 0. The bits are then cleared when reading them.
+ */
+ if (is_mtrr_base_msr(msr))
+ cur->base = data;
+ else
+ cur->mask = data | kvm_vcpu_reserved_gpa_bits_raw(vcpu);
+
+ /* add it to the list if it's enabled. */
+ if (var_mtrr_range_is_valid(cur)) {
+ list_for_each_entry(tmp, &mtrr_state->head, node)
+ if (cur->base >= tmp->base)
+ break;
+ list_add_tail(&cur->node, &tmp->node);
+ }
+}
+
+int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ int index;
+
+ if (!kvm_mtrr_valid(vcpu, msr, data))
+ return 1;
+
+ index = fixed_msr_to_range_index(msr);
+ if (index >= 0)
+ *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index] = data;
+ else if (msr == MSR_MTRRdefType)
+ vcpu->arch.mtrr_state.deftype = data;
+ else
+ set_var_mtrr_msr(vcpu, msr, data);
+
+ update_mtrr(vcpu, msr);
+ return 0;
+}
+
+int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ int index;
+
+ /* MSR_MTRRcap is a readonly MSR. */
+ if (msr == MSR_MTRRcap) {
+ /*
+ * SMRR = 0
+ * WC = 1
+ * FIX = 1
+ * VCNT = KVM_NR_VAR_MTRR
+ */
+ *pdata = 0x500 | KVM_NR_VAR_MTRR;
+ return 0;
+ }
+
+ if (!msr_mtrr_valid(msr))
+ return 1;
+
+ index = fixed_msr_to_range_index(msr);
+ if (index >= 0) {
+ *pdata = *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index];
+ } else if (msr == MSR_MTRRdefType) {
+ *pdata = vcpu->arch.mtrr_state.deftype;
+ } else {
+ /* Variable MTRRs */
+ if (is_mtrr_base_msr(msr))
+ *pdata = var_mtrr_msr_to_range(vcpu, msr)->base;
+ else
+ *pdata = var_mtrr_msr_to_range(vcpu, msr)->mask;
+
+ *pdata &= ~kvm_vcpu_reserved_gpa_bits_raw(vcpu);
+ }
+
+ return 0;
+}
+
+void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu)
+{
+ INIT_LIST_HEAD(&vcpu->arch.mtrr_state.head);
+}
+
+struct mtrr_iter {
+ /* input fields. */
+ struct kvm_mtrr *mtrr_state;
+ u64 start;
+ u64 end;
+
+ /* output fields. */
+ int mem_type;
+ /* mtrr is completely disabled? */
+ bool mtrr_disabled;
+ /* [start, end) is not fully covered in MTRRs? */
+ bool partial_map;
+
+ /* private fields. */
+ union {
+ /* used for fixed MTRRs. */
+ struct {
+ int index;
+ int seg;
+ };
+
+ /* used for var MTRRs. */
+ struct {
+ struct kvm_mtrr_range *range;
+			/* the max address covered so far by var MTRRs. */
+ u64 start_max;
+ };
+ };
+
+ bool fixed;
+};
+
+static bool mtrr_lookup_fixed_start(struct mtrr_iter *iter)
+{
+ int seg, index;
+
+ if (!fixed_mtrr_is_enabled(iter->mtrr_state))
+ return false;
+
+ seg = fixed_mtrr_addr_to_seg(iter->start);
+ if (seg < 0)
+ return false;
+
+ iter->fixed = true;
+ index = fixed_mtrr_addr_seg_to_range_index(iter->start, seg);
+ iter->index = index;
+ iter->seg = seg;
+ return true;
+}
+
+static bool match_var_range(struct mtrr_iter *iter,
+ struct kvm_mtrr_range *range)
+{
+ u64 start, end;
+
+ var_mtrr_range(range, &start, &end);
+ if (!(start >= iter->end || end <= iter->start)) {
+ iter->range = range;
+
+ /*
+		 * This function is called while walking the kvm_mtrr.head
+		 * list in order of increasing base address, so @range is the
+		 * lowest-based range that overlaps [iter->start_max, iter->end).
+ */
+ iter->partial_map |= iter->start_max < start;
+
+		/* update the max address that has been covered. */
+ iter->start_max = max(iter->start_max, end);
+ return true;
+ }
+
+ return false;
+}
+
+static void __mtrr_lookup_var_next(struct mtrr_iter *iter)
+{
+ struct kvm_mtrr *mtrr_state = iter->mtrr_state;
+
+ list_for_each_entry_continue(iter->range, &mtrr_state->head, node)
+ if (match_var_range(iter, iter->range))
+ return;
+
+ iter->range = NULL;
+ iter->partial_map |= iter->start_max < iter->end;
+}
+
+static void mtrr_lookup_var_start(struct mtrr_iter *iter)
+{
+ struct kvm_mtrr *mtrr_state = iter->mtrr_state;
+
+ iter->fixed = false;
+ iter->start_max = iter->start;
+ iter->range = NULL;
+ iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node);
+
+ __mtrr_lookup_var_next(iter);
+}
+
+static void mtrr_lookup_fixed_next(struct mtrr_iter *iter)
+{
+ /* terminate the lookup. */
+ if (fixed_mtrr_range_end_addr(iter->seg, iter->index) >= iter->end) {
+ iter->fixed = false;
+ iter->range = NULL;
+ return;
+ }
+
+ iter->index++;
+
+	/* done with all fixed MTRRs; fall back to the variable MTRRs. */
+ if (iter->index >= ARRAY_SIZE(iter->mtrr_state->fixed_ranges))
+ return mtrr_lookup_var_start(iter);
+
+ /* switch to next segment. */
+ if (iter->index > fixed_mtrr_seg_end_range_index(iter->seg))
+ iter->seg++;
+}
+
+static void mtrr_lookup_var_next(struct mtrr_iter *iter)
+{
+ __mtrr_lookup_var_next(iter);
+}
+
+static void mtrr_lookup_start(struct mtrr_iter *iter)
+{
+ if (!mtrr_is_enabled(iter->mtrr_state)) {
+ iter->mtrr_disabled = true;
+ return;
+ }
+
+ if (!mtrr_lookup_fixed_start(iter))
+ mtrr_lookup_var_start(iter);
+}
+
+static void mtrr_lookup_init(struct mtrr_iter *iter,
+ struct kvm_mtrr *mtrr_state, u64 start, u64 end)
+{
+ iter->mtrr_state = mtrr_state;
+ iter->start = start;
+ iter->end = end;
+ iter->mtrr_disabled = false;
+ iter->partial_map = false;
+ iter->fixed = false;
+ iter->range = NULL;
+
+ mtrr_lookup_start(iter);
+}
+
+static bool mtrr_lookup_okay(struct mtrr_iter *iter)
+{
+ if (iter->fixed) {
+ iter->mem_type = iter->mtrr_state->fixed_ranges[iter->index];
+ return true;
+ }
+
+ if (iter->range) {
+ iter->mem_type = iter->range->base & 0xff;
+ return true;
+ }
+
+ return false;
+}
+
+static void mtrr_lookup_next(struct mtrr_iter *iter)
+{
+ if (iter->fixed)
+ mtrr_lookup_fixed_next(iter);
+ else
+ mtrr_lookup_var_next(iter);
+}
+
+#define mtrr_for_each_mem_type(_iter_, _mtrr_, _gpa_start_, _gpa_end_) \
+ for (mtrr_lookup_init(_iter_, _mtrr_, _gpa_start_, _gpa_end_); \
+ mtrr_lookup_okay(_iter_); mtrr_lookup_next(_iter_))
+
+u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+ struct mtrr_iter iter;
+ u64 start, end;
+ int type = -1;
+ const int wt_wb_mask = (1 << MTRR_TYPE_WRBACK)
+ | (1 << MTRR_TYPE_WRTHROUGH);
+
+ start = gfn_to_gpa(gfn);
+ end = start + PAGE_SIZE;
+
+ mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
+ int curr_type = iter.mem_type;
+
+ /*
+ * Please refer to Intel SDM Volume 3: 11.11.4.1 MTRR
+ * Precedences.
+ */
+
+ if (type == -1) {
+ type = curr_type;
+ continue;
+ }
+
+ /*
+ * If two or more variable memory ranges match and the
+ * memory types are identical, then that memory type is
+ * used.
+ */
+ if (type == curr_type)
+ continue;
+
+ /*
+ * If two or more variable memory ranges match and one of
+		 * the memory types is UC, the UC memory type is used.
+ */
+ if (curr_type == MTRR_TYPE_UNCACHABLE)
+ return MTRR_TYPE_UNCACHABLE;
+
+ /*
+ * If two or more variable memory ranges match and the
+ * memory types are WT and WB, the WT memory type is used.
+ */
+ if (((1 << type) & wt_wb_mask) &&
+ ((1 << curr_type) & wt_wb_mask)) {
+ type = MTRR_TYPE_WRTHROUGH;
+ continue;
+ }
+
+ /*
+ * For overlaps not defined by the above rules, processor
+ * behavior is undefined.
+ */
+
+ /* We use WB for this undefined behavior. :( */
+ return MTRR_TYPE_WRBACK;
+ }
+
+ if (iter.mtrr_disabled)
+ return mtrr_disabled_type(vcpu);
+
+ /* not contained in any MTRRs. */
+ if (type == -1)
+ return mtrr_default_type(mtrr_state);
+
+ /*
+	 * We checked just one page, so it is impossible for it to be
+	 * only partially covered by MTRRs.
+ */
+ WARN_ON(iter.partial_map);
+
+ return type;
+}
+EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type);
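+
+/*
+ * Editor's note: worked examples of the precedence rules above (not from
+ * upstream).  For a page covered by two overlapping variable ranges:
+ *
+ *	WB + WT -> WT  (WT wins among WT/WB)
+ *	WB + UC -> UC  (UC always wins)
+ *	WC + WT -> WB  (undefined per the SDM; KVM falls back to WB)
+ */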
+
+bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
+ int page_num)
+{
+ struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+ struct mtrr_iter iter;
+ u64 start, end;
+ int type = -1;
+
+ start = gfn_to_gpa(gfn);
+ end = gfn_to_gpa(gfn + page_num);
+ mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
+ if (type == -1) {
+ type = iter.mem_type;
+ continue;
+ }
+
+ if (type != iter.mem_type)
+ return false;
+ }
+
+ if (iter.mtrr_disabled)
+ return true;
+
+ if (!iter.partial_map)
+ return true;
+
+ if (type == -1)
+ return true;
+
+ return type == mtrr_default_type(mtrr_state);
+}
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
new file mode 100644
index 000000000..dc8e8e907
--- /dev/null
+++ b/arch/x86/kvm/pmu.c
@@ -0,0 +1,941 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine -- Performance Monitoring Unit support
+ *
+ * Copyright 2015 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Avi Kivity <avi@redhat.com>
+ * Gleb Natapov <gleb@redhat.com>
+ * Wei Huang <wei@redhat.com>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
+#include <asm/perf_event.h>
+#include <asm/cpu_device_id.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "pmu.h"
+
+/* This is enough to filter the vast majority of currently defined events. */
+#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300
+
+struct x86_pmu_capability __read_mostly kvm_pmu_cap;
+EXPORT_SYMBOL_GPL(kvm_pmu_cap);
+
+/* Precise Distribution of Instructions Retired (PDIR) */
+static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL),
+ /* Instruction-Accurate PDIR (PDIR++) */
+ X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
+ {}
+};
+
+/* Precise Distribution (PDist) */
+static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
+ X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
+ {}
+};
+
+/* NOTE:
+ * - Each perf counter is defined as "struct kvm_pmc";
+ * - There are two types of perf counters: general purpose (gp) and fixed.
+ * gp counters are stored in gp_counters[] and fixed counters are stored
+ * in fixed_counters[] respectively. Both of them are part of "struct
+ * kvm_pmu";
+ * - pmu.c understands the difference between gp counters and fixed counters.
+ *   However, AMD doesn't support fixed counters;
+ * - There are three types of index used to access perf counters (PMC):
+ *   1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
+ *      has MSR_K7_PERFCTRn and, for families 15H and later,
+ *      MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are
+ *      aliased to MSR_K7_PERFCTRn.
+ *   2. MSR Index (named idx): This is normally used by the RDPMC instruction.
+ *      For instance, the AMD RDPMC instruction uses 0000_0003h in ECX to
+ *      access C001_0007h (MSR_K7_PERFCTR3). Intel has a similar mechanism,
+ *      except that it also supports fixed counters. idx can be used as an
+ *      index into the gp and fixed counter arrays.
+ * 3. Global PMC Index (named pmc): pmc is an index specific to PMU
+ * code. Each pmc, stored in kvm_pmc.idx field, is unique across
+ * all perf counters (both gp and fixed). The mapping relationship
+ * between pmc and perf counters is as the following:
+ * * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters
+ * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
+ * * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
+ * and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
+ */
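+
+/*
+ * Illustrative example (assuming an Intel vPMU with 8 gp and 3 fixed
+ * counters): gp counter N has pmc == N and is backed by MSR_IA32_PERFCTR0 + N,
+ * while fixed counter N has pmc == INTEL_PMC_IDX_FIXED + N and is backed by
+ * MSR_CORE_PERF_FIXED_CTR0 + N.  For RDPMC, idx is simply N for the gp
+ * counters and (1u << 30) | N for the fixed counters.
+ */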
+
+static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;
+
+#define KVM_X86_PMU_OP(func) \
+ DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func, \
+ *(((struct kvm_pmu_ops *)0)->func));
+#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
+#include <asm/kvm-x86-pmu-ops.h>
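+
+/*
+ * For reference, for an op such as "refresh", the KVM_X86_PMU_OP() invocation
+ * above expands to roughly:
+ *
+ *	DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_refresh,
+ *				*(((struct kvm_pmu_ops *)0)->refresh));
+ *
+ * i.e. one NULL static call is defined per op, typed after the matching
+ * kvm_pmu_ops member, and kvm_pmu_ops_update() below patches each static
+ * call to the vendor's implementation.
+ */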
+
+void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
+{
+ memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops));
+
+#define __KVM_X86_PMU_OP(func) \
+ static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func);
+#define KVM_X86_PMU_OP(func) \
+ WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
+#define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
+#include <asm/kvm-x86-pmu-ops.h>
+#undef __KVM_X86_PMU_OP
+}
+
+static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+ bool skip_pmi = false;
+
+ if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
+ if (!in_pmi) {
+ /*
+			 * TODO: KVM is currently _choosing_ to not generate
+			 * records for emulated instructions, which avoids a
+			 * BUFFER_OVF PMI when there are no records.  Strictly
+			 * speaking, records should be generated in this
+			 * context as well, to improve sampling accuracy.
+ */
+ skip_pmi = true;
+ } else {
+ /* Indicate PEBS overflow PMI to guest. */
+ skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
+ (unsigned long *)&pmu->global_status);
+ }
+ } else {
+ __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+ }
+
+ if (pmc->intr && !skip_pmi)
+ kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
+}
+
+static void kvm_perf_overflow(struct perf_event *perf_event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct kvm_pmc *pmc = perf_event->overflow_handler_context;
+
+ /*
+ * Ignore overflow events for counters that are scheduled to be
+ * reprogrammed, e.g. if a PMI for the previous event races with KVM's
+ * handling of a related guest WRMSR.
+ */
+ if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
+ return;
+
+ __kvm_perf_overflow(pmc, true);
+
+ kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+}
+
+static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc)
+{
+ /*
+	 * For model-specific PEBS counters with special capabilities
+	 * (PDIR, PDIR++, PDIST), KVM needs to raise the event's precise
+	 * level to the maximum value (currently 3, for backwards
+	 * compatibility) so that the perf subsystem assigns a hardware
+	 * counter with that capability to the vPMC.
+ */
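+	/*
+	 * Note: pmc->idx == 0 is gp counter 0 (where PDIST is available), and
+	 * pmc->idx == 32 is INTEL_PMC_IDX_FIXED, i.e. fixed counter 0 (where
+	 * PDIR is available).
+	 */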
+ if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) ||
+ (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu)))
+ return 3;
+
+ /*
+	 * A non-zero precision level turns an ordinary guest event into a
+	 * guest PEBS event, which makes the host PEBS PMI handler determine
+	 * whether a PEBS overflow PMI comes from the host counters or the
+	 * guest.
+ */
+ return 1;
+}
+
+static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
+ bool exclude_user, bool exclude_kernel,
+ bool intr)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+ struct perf_event *event;
+ struct perf_event_attr attr = {
+ .type = type,
+ .size = sizeof(attr),
+ .pinned = true,
+ .exclude_idle = true,
+ .exclude_host = 1,
+ .exclude_user = exclude_user,
+ .exclude_kernel = exclude_kernel,
+ .config = config,
+ };
+ bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);
+
+ attr.sample_period = get_sample_period(pmc, pmc->counter);
+
+ if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
+ guest_cpuid_is_intel(pmc->vcpu)) {
+ /*
+ * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
+ * period. Just clear the sample period so at least
+ * allocating the counter doesn't fail.
+ */
+ attr.sample_period = 0;
+ }
+ if (pebs) {
+ /*
+ * For most PEBS hardware events, the difference in the software
+ * precision levels of guest and host PEBS events will not affect
+ * the accuracy of the PEBS profiling result, because the "event IP"
+ * in the PEBS record is calibrated on the guest side.
+ */
+ attr.precise_ip = pmc_get_pebs_precise_level(pmc);
+ }
+
+ event = perf_event_create_kernel_counter(&attr, -1, current,
+ kvm_perf_overflow, pmc);
+ if (IS_ERR(event)) {
+ pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
+ PTR_ERR(event), pmc->idx);
+ return PTR_ERR(event);
+ }
+
+ pmc->perf_event = event;
+ pmc_to_pmu(pmc)->event_count++;
+ pmc->is_paused = false;
+ pmc->intr = intr || pebs;
+ return 0;
+}
+
+static void pmc_pause_counter(struct kvm_pmc *pmc)
+{
+ u64 counter = pmc->counter;
+
+ if (!pmc->perf_event || pmc->is_paused)
+ return;
+
+ /* update counter, reset event value to avoid redundant accumulation */
+ counter += perf_event_pause(pmc->perf_event, true);
+ pmc->counter = counter & pmc_bitmask(pmc);
+ pmc->is_paused = true;
+}
+
+static bool pmc_resume_counter(struct kvm_pmc *pmc)
+{
+ if (!pmc->perf_event)
+ return false;
+
+ /* recalibrate sample period and check if it's accepted by perf core */
+ if (is_sampling_event(pmc->perf_event) &&
+ perf_event_period(pmc->perf_event,
+ get_sample_period(pmc, pmc->counter)))
+ return false;
+
+ if (test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) !=
+ (!!pmc->perf_event->attr.precise_ip))
+ return false;
+
+	/* Reuse the perf_event, as pmc_reprogram_counter() would do. */
+ perf_event_enable(pmc->perf_event);
+ pmc->is_paused = false;
+
+ return true;
+}
+
+static void pmc_release_perf_event(struct kvm_pmc *pmc)
+{
+ if (pmc->perf_event) {
+ perf_event_release_kernel(pmc->perf_event);
+ pmc->perf_event = NULL;
+ pmc->current_config = 0;
+ pmc_to_pmu(pmc)->event_count--;
+ }
+}
+
+static void pmc_stop_counter(struct kvm_pmc *pmc)
+{
+ if (pmc->perf_event) {
+ pmc->counter = pmc_read_counter(pmc);
+ pmc_release_perf_event(pmc);
+ }
+}
+
+static int filter_cmp(const void *pa, const void *pb, u64 mask)
+{
+ u64 a = *(u64 *)pa & mask;
+ u64 b = *(u64 *)pb & mask;
+
+ return (a > b) - (a < b);
+}
+
+static int filter_sort_cmp(const void *pa, const void *pb)
+{
+ return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT |
+ KVM_PMU_MASKED_ENTRY_EXCLUDE));
+}
+
+/*
+ * For the event filter, searching is done on the 'includes' list and
+ * 'excludes' list separately rather than on the 'events' list (which
+ * has both). As a result the exclude bit can be ignored.
+ */
+static int filter_event_cmp(const void *pa, const void *pb)
+{
+ return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT));
+}
+
+static int find_filter_index(u64 *events, u64 nevents, u64 key)
+{
+ u64 *fe = bsearch(&key, events, nevents, sizeof(events[0]),
+ filter_event_cmp);
+
+ if (!fe)
+ return -1;
+
+ return fe - events;
+}
+
+static bool is_filter_entry_match(u64 filter_event, u64 umask)
+{
+ u64 mask = filter_event >> (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8);
+ u64 match = filter_event & KVM_PMU_MASKED_ENTRY_UMASK_MATCH;
+
+ BUILD_BUG_ON((KVM_PMU_ENCODE_MASKED_ENTRY(0, 0xff, 0, false) >>
+ (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8)) !=
+ ARCH_PERFMON_EVENTSEL_UMASK);
+
+ return (umask & mask) == match;
+}
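+
+/*
+ * Example (hypothetical masked entry): with umask_mask == 0xff and
+ * umask_match == 0x01, only guest events whose unit mask is exactly 0x01
+ * match, as (umask & 0xff) must equal 0x01.  A umask_mask of 0xf0 would
+ * instead compare only the high nibble of the guest's unit mask.
+ */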
+
+static bool filter_contains_match(u64 *events, u64 nevents, u64 eventsel)
+{
+ u64 event_select = eventsel & kvm_pmu_ops.EVENTSEL_EVENT;
+ u64 umask = eventsel & ARCH_PERFMON_EVENTSEL_UMASK;
+ int i, index;
+
+ index = find_filter_index(events, nevents, event_select);
+ if (index < 0)
+ return false;
+
+ /*
+ * Entries are sorted by the event select. Walk the list in both
+ * directions to process all entries with the targeted event select.
+ */
+ for (i = index; i < nevents; i++) {
+ if (filter_event_cmp(&events[i], &event_select))
+ break;
+
+ if (is_filter_entry_match(events[i], umask))
+ return true;
+ }
+
+ for (i = index - 1; i >= 0; i--) {
+ if (filter_event_cmp(&events[i], &event_select))
+ break;
+
+ if (is_filter_entry_match(events[i], umask))
+ return true;
+ }
+
+ return false;
+}
+
+static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f,
+ u64 eventsel)
+{
+ if (filter_contains_match(f->includes, f->nr_includes, eventsel) &&
+ !filter_contains_match(f->excludes, f->nr_excludes, eventsel))
+ return f->action == KVM_PMU_EVENT_ALLOW;
+
+ return f->action == KVM_PMU_EVENT_DENY;
+}
+
+static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
+ int idx)
+{
+ int fixed_idx = idx - INTEL_PMC_IDX_FIXED;
+
+ if (filter->action == KVM_PMU_EVENT_DENY &&
+ test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
+ return false;
+ if (filter->action == KVM_PMU_EVENT_ALLOW &&
+ !test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
+ return false;
+
+ return true;
+}
+
+static bool check_pmu_event_filter(struct kvm_pmc *pmc)
+{
+ struct kvm_x86_pmu_event_filter *filter;
+ struct kvm *kvm = pmc->vcpu->kvm;
+
+ filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
+ if (!filter)
+ return true;
+
+ if (pmc_is_gp(pmc))
+ return is_gp_event_allowed(filter, pmc->eventsel);
+
+ return is_fixed_event_allowed(filter, pmc->idx);
+}
+
+static bool pmc_event_is_allowed(struct kvm_pmc *pmc)
+{
+ return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) &&
+ static_call(kvm_x86_pmu_hw_event_available)(pmc) &&
+ check_pmu_event_filter(pmc);
+}
+
+static void reprogram_counter(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+ u64 eventsel = pmc->eventsel;
+ u64 new_config = eventsel;
+ u8 fixed_ctr_ctrl;
+
+ pmc_pause_counter(pmc);
+
+ if (!pmc_event_is_allowed(pmc))
+ goto reprogram_complete;
+
+ if (pmc->counter < pmc->prev_counter)
+ __kvm_perf_overflow(pmc, false);
+
+ if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
+ printk_once("kvm pmu: pin control bit is ignored\n");
+
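+	/*
+	 * Per-counter bits in IA32_FIXED_CTR_CTRL: 0x1 enables counting in
+	 * ring 0 (OS), 0x2 enables counting in rings > 0 (USR), and 0x8
+	 * enables the overflow interrupt (PMI).
+	 */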
+ if (pmc_is_fixed(pmc)) {
+ fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
+ pmc->idx - INTEL_PMC_IDX_FIXED);
+ if (fixed_ctr_ctrl & 0x1)
+ eventsel |= ARCH_PERFMON_EVENTSEL_OS;
+ if (fixed_ctr_ctrl & 0x2)
+ eventsel |= ARCH_PERFMON_EVENTSEL_USR;
+ if (fixed_ctr_ctrl & 0x8)
+ eventsel |= ARCH_PERFMON_EVENTSEL_INT;
+ new_config = (u64)fixed_ctr_ctrl;
+ }
+
+ if (pmc->current_config == new_config && pmc_resume_counter(pmc))
+ goto reprogram_complete;
+
+ pmc_release_perf_event(pmc);
+
+ pmc->current_config = new_config;
+
+ /*
+ * If reprogramming fails, e.g. due to contention, leave the counter's
+	 * reprogram bit set, i.e. opportunistically try again on the next PMU
+ * refresh. Don't make a new request as doing so can stall the guest
+ * if reprogramming repeatedly fails.
+ */
+ if (pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
+ (eventsel & pmu->raw_event_mask),
+ !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
+ !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
+ eventsel & ARCH_PERFMON_EVENTSEL_INT))
+ return;
+
+reprogram_complete:
+ clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
+ pmc->prev_counter = 0;
+}
+
+void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int bit;
+
+ for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
+ struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit);
+
+ if (unlikely(!pmc)) {
+ clear_bit(bit, pmu->reprogram_pmi);
+ continue;
+ }
+
+ reprogram_counter(pmc);
+ }
+
+ /*
+ * Unused perf_events are only released if the corresponding MSRs
+ * weren't accessed during the last vCPU time slice. kvm_arch_sched_in
+ * triggers KVM_REQ_PMU if cleanup is needed.
+ */
+ if (unlikely(pmu->need_cleanup))
+ kvm_pmu_cleanup(vcpu);
+}
+
+/* Check whether idx is a valid index for accessing the PMU. */
+bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
+{
+ return static_call(kvm_x86_pmu_is_valid_rdpmc_ecx)(vcpu, idx);
+}
+
+bool is_vmware_backdoor_pmc(u32 pmc_idx)
+{
+ switch (pmc_idx) {
+ case VMWARE_BACKDOOR_PMC_HOST_TSC:
+ case VMWARE_BACKDOOR_PMC_REAL_TIME:
+ case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
+ return true;
+ }
+ return false;
+}
+
+static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
+{
+ u64 ctr_val;
+
+ switch (idx) {
+ case VMWARE_BACKDOOR_PMC_HOST_TSC:
+ ctr_val = rdtsc();
+ break;
+ case VMWARE_BACKDOOR_PMC_REAL_TIME:
+ ctr_val = ktime_get_boottime_ns();
+ break;
+ case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
+ ctr_val = ktime_get_boottime_ns() +
+ vcpu->kvm->arch.kvmclock_offset;
+ break;
+ default:
+ return 1;
+ }
+
+ *data = ctr_val;
+ return 0;
+}
+
+int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
+{
+ bool fast_mode = idx & (1u << 31);
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u64 mask = fast_mode ? ~0u : ~0ull;
+
+ if (!pmu->version)
+ return 1;
+
+ if (is_vmware_backdoor_pmc(idx))
+ return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
+
+ pmc = static_call(kvm_x86_pmu_rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
+ if (!pmc)
+ return 1;
+
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) &&
+ (static_call(kvm_x86_get_cpl)(vcpu) != 0) &&
+ kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
+ return 1;
+
+ *data = pmc_read_counter(pmc) & mask;
+ return 0;
+}
+
+void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
+{
+ if (lapic_in_kernel(vcpu)) {
+ static_call_cond(kvm_x86_pmu_deliver_pmi)(vcpu);
+ kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
+ }
+}
+
+bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ switch (msr) {
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ return kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu));
+ default:
+ break;
+ }
+ return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) ||
+ static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr);
+}
+
+static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc = static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr);
+
+ if (pmc)
+ __set_bit(pmc->idx, pmu->pmc_in_use);
+}
+
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ u32 msr = msr_info->index;
+
+ switch (msr) {
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+ msr_info->data = pmu->global_status;
+ break;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ msr_info->data = pmu->global_ctrl;
+ break;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ msr_info->data = 0;
+ break;
+ default:
+ return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info);
+ }
+
+ return 0;
+}
+
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+ u64 diff;
+
+ /*
+ * Note, AMD ignores writes to reserved bits and read-only PMU MSRs,
+ * whereas Intel generates #GP on attempts to write reserved/RO MSRs.
+ */
+ switch (msr) {
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ if (!msr_info->host_initiated)
+ return 1; /* RO MSR */
+ fallthrough;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+		/* Per the PPR, this MSR is read-only; writes are ignored. */
+ if (!msr_info->host_initiated)
+ break;
+
+ if (data & pmu->global_status_mask)
+ return 1;
+
+ pmu->global_status = data;
+ break;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+ data &= ~pmu->global_ctrl_mask;
+ fallthrough;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ if (!kvm_valid_perf_global_ctrl(pmu, data))
+ return 1;
+
+ if (pmu->global_ctrl != data) {
+ diff = pmu->global_ctrl ^ data;
+ pmu->global_ctrl = data;
+ reprogram_counters(pmu, diff);
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ /*
+		 * GLOBAL_OVF_CTRL, a.k.a. GLOBAL_STATUS_RESET, clears bits in
+ * GLOBAL_STATUS, and so the set of reserved bits is the same.
+ */
+ if (data & pmu->global_status_mask)
+ return 1;
+ fallthrough;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
+ if (!msr_info->host_initiated)
+ pmu->global_status &= ~data;
+ break;
+ default:
+ kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
+ return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
+ }
+
+ return 0;
+}
+
+void kvm_pmu_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ int i;
+
+ pmu->need_cleanup = false;
+
+ bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX);
+
+ for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
+ pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
+ if (!pmc)
+ continue;
+
+ pmc_stop_counter(pmc);
+ pmc->counter = 0;
+
+ if (pmc_is_gp(pmc))
+ pmc->eventsel = 0;
+ }
+
+ pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;
+
+ static_call_cond(kvm_x86_pmu_reset)(vcpu);
+}
+
+/*
+ * Refresh the PMU configuration for the vCPU, e.g. if userspace changes CPUID
+ * and/or PERF_CAPABILITIES.
+ */
+void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+ if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
+ return;
+
+ /*
+ * Stop/release all existing counters/events before realizing the new
+ * vPMU model.
+ */
+ kvm_pmu_reset(vcpu);
+
+ bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX);
+ static_call(kvm_x86_pmu_refresh)(vcpu);
+}
+
+void kvm_pmu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ memset(pmu, 0, sizeof(*pmu));
+ static_call(kvm_x86_pmu_init)(vcpu);
+ pmu->event_count = 0;
+ pmu->need_cleanup = false;
+ kvm_pmu_refresh(vcpu);
+}
+
+/* Release perf_events for vPMCs that have been unused for a full time slice. */
+void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc = NULL;
+ DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
+ int i;
+
+ pmu->need_cleanup = false;
+
+ bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
+ pmu->pmc_in_use, X86_PMC_IDX_MAX);
+
+ for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) {
+ pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
+
+ if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc))
+ pmc_stop_counter(pmc);
+ }
+
+ static_call_cond(kvm_x86_pmu_cleanup)(vcpu);
+
+ bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
+}
+
+void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
+{
+ kvm_pmu_reset(vcpu);
+}
+
+static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
+{
+ pmc->prev_counter = pmc->counter;
+ pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);
+ kvm_pmu_request_counter_reprogram(pmc);
+}
+
+static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
+ unsigned int perf_hw_id)
+{
+ return !((pmc->eventsel ^ perf_get_hw_event_config(perf_hw_id)) &
+ AMD64_RAW_EVENT_MASK_NB);
+}
+
+static inline bool cpl_is_matched(struct kvm_pmc *pmc)
+{
+ bool select_os, select_user;
+ u64 config;
+
+ if (pmc_is_gp(pmc)) {
+ config = pmc->eventsel;
+ select_os = config & ARCH_PERFMON_EVENTSEL_OS;
+ select_user = config & ARCH_PERFMON_EVENTSEL_USR;
+ } else {
+ config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
+ pmc->idx - INTEL_PMC_IDX_FIXED);
+ select_os = config & 0x1;
+ select_user = config & 0x2;
+ }
+
+ return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
+}
+
+void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ int i;
+
+ for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
+ pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
+
+ if (!pmc || !pmc_event_is_allowed(pmc))
+ continue;
+
+ /* Ignore checks for edge detect, pin control, invert and CMASK bits */
+ if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc))
+ kvm_pmu_incr_counter(pmc);
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);
+
+static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter)
+{
+ u64 mask = kvm_pmu_ops.EVENTSEL_EVENT |
+ KVM_PMU_MASKED_ENTRY_UMASK_MASK |
+ KVM_PMU_MASKED_ENTRY_UMASK_MATCH |
+ KVM_PMU_MASKED_ENTRY_EXCLUDE;
+ int i;
+
+ for (i = 0; i < filter->nevents; i++) {
+ if (filter->events[i] & ~mask)
+ return false;
+ }
+
+ return true;
+}
+
+static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter)
+{
+ int i, j;
+
+ for (i = 0, j = 0; i < filter->nevents; i++) {
+ /*
+ * Skip events that are impossible to match against a guest
+ * event. When filtering, only the event select + unit mask
+ * of the guest event is used. To maintain backwards
+ * compatibility, impossible filters can't be rejected :-(
+ */
+ if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT |
+ ARCH_PERFMON_EVENTSEL_UMASK))
+ continue;
+ /*
+		 * Convert userspace events to a common in-kernel event so
+		 * that only one code path is needed to support both types.
+		 * Use masked events for the in-kernel representation, as
+		 * they are flexible enough to handle both cases.  Converting
+		 * to a masked event only requires adding an "all ones"
+		 * umask_mask (unmasked filter events don't support EXCLUDE).
+ */
+ filter->events[j++] = filter->events[i] |
+ (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT);
+ }
+
+ filter->nevents = j;
+}
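+
+/*
+ * Example: a legacy (unmasked) filter entry for event select 0xc0 with unit
+ * mask 0x00 becomes a masked entry with umask_mask == 0xff and
+ * umask_match == 0x00, which matches only a guest unit mask of exactly 0x00
+ * and thus preserves the legacy exact-match semantics.
+ */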
+
+static int prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter)
+{
+ int i;
+
+ if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS))
+ convert_to_masked_filter(filter);
+ else if (!is_masked_filter_valid(filter))
+ return -EINVAL;
+
+ /*
+ * Sort entries by event select and includes vs. excludes so that all
+ * entries for a given event select can be processed efficiently during
+ * filtering. The EXCLUDE flag uses a more significant bit than the
+ * event select, and so the sorted list is also effectively split into
+ * includes and excludes sub-lists.
+ */
+ sort(&filter->events, filter->nevents, sizeof(filter->events[0]),
+ filter_sort_cmp, NULL);
+
+ i = filter->nevents;
+ /* Find the first EXCLUDE event (only supported for masked events). */
+ if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) {
+ for (i = 0; i < filter->nevents; i++) {
+ if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE)
+ break;
+ }
+ }
+
+ filter->nr_includes = i;
+ filter->nr_excludes = filter->nevents - filter->nr_includes;
+ filter->includes = filter->events;
+ filter->excludes = filter->events + filter->nr_includes;
+
+ return 0;
+}
+
+int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_pmu_event_filter __user *user_filter = argp;
+ struct kvm_x86_pmu_event_filter *filter;
+ struct kvm_pmu_event_filter tmp;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ size_t size;
+ int r;
+
+ if (copy_from_user(&tmp, user_filter, sizeof(tmp)))
+ return -EFAULT;
+
+ if (tmp.action != KVM_PMU_EVENT_ALLOW &&
+ tmp.action != KVM_PMU_EVENT_DENY)
+ return -EINVAL;
+
+ if (tmp.flags & ~KVM_PMU_EVENT_FLAGS_VALID_MASK)
+ return -EINVAL;
+
+ if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
+ return -E2BIG;
+
+ size = struct_size(filter, events, tmp.nevents);
+ filter = kzalloc(size, GFP_KERNEL_ACCOUNT);
+ if (!filter)
+ return -ENOMEM;
+
+ filter->action = tmp.action;
+ filter->nevents = tmp.nevents;
+ filter->fixed_counter_bitmap = tmp.fixed_counter_bitmap;
+ filter->flags = tmp.flags;
+
+ r = -EFAULT;
+ if (copy_from_user(filter->events, user_filter->events,
+ sizeof(filter->events[0]) * filter->nevents))
+ goto cleanup;
+
+ r = prepare_filter_lists(filter);
+ if (r)
+ goto cleanup;
+
+ mutex_lock(&kvm->lock);
+ filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
+ mutex_is_locked(&kvm->lock));
+ mutex_unlock(&kvm->lock);
+ synchronize_srcu_expedited(&kvm->srcu);
+
+ BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) >
+ sizeof(((struct kvm_pmu *)0)->__reprogram_pmi));
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ atomic64_set(&vcpu_to_pmu(vcpu)->__reprogram_pmi, -1ull);
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_PMU);
+
+ r = 0;
+cleanup:
+ kfree(filter);
+ return r;
+}
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
new file mode 100644
index 000000000..a46aa9b25
--- /dev/null
+++ b/arch/x86/kvm/pmu.h
@@ -0,0 +1,257 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_PMU_H
+#define __KVM_X86_PMU_H
+
+#include <linux/nospec.h>
+
+#define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu)
+#define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu))
+#define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu)
+
+#define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \
+ MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)
+
+/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
+#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf)
+
+#define VMWARE_BACKDOOR_PMC_HOST_TSC 0x10000
+#define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001
+#define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002
+
+struct kvm_pmu_ops {
+ bool (*hw_event_available)(struct kvm_pmc *pmc);
+ struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx);
+ struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu,
+ unsigned int idx, u64 *mask);
+ struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr);
+ bool (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx);
+ bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr);
+ int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+ int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+ void (*refresh)(struct kvm_vcpu *vcpu);
+ void (*init)(struct kvm_vcpu *vcpu);
+ void (*reset)(struct kvm_vcpu *vcpu);
+ void (*deliver_pmi)(struct kvm_vcpu *vcpu);
+ void (*cleanup)(struct kvm_vcpu *vcpu);
+
+ const u64 EVENTSEL_EVENT;
+ const int MAX_NR_GP_COUNTERS;
+ const int MIN_NR_GP_COUNTERS;
+};
+
+void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops);
+
+static inline bool kvm_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu)
+{
+ /*
+ * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is
+ * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is
+ * greater than zero. However, KVM only exposes and emulates the MSR
+ * to/for the guest if the guest PMU supports at least "Architectural
+ * Performance Monitoring Version 2".
+ *
+ * AMD's version of PERF_GLOBAL_CTRL conveniently shows up with v2.
+ */
+ return pmu->version > 1;
+}
+
+static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ return pmu->counter_bitmask[pmc->type];
+}
+
+static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
+{
+ u64 counter, enabled, running;
+
+ counter = pmc->counter;
+ if (pmc->perf_event && !pmc->is_paused)
+ counter += perf_event_read_value(pmc->perf_event,
+ &enabled, &running);
+ /* FIXME: Scaling needed? */
+ return counter & pmc_bitmask(pmc);
+}
+
+static inline void pmc_write_counter(struct kvm_pmc *pmc, u64 val)
+{
+ pmc->counter += val - pmc_read_counter(pmc);
+ pmc->counter &= pmc_bitmask(pmc);
+}
+
+static inline bool pmc_is_gp(struct kvm_pmc *pmc)
+{
+ return pmc->type == KVM_PMC_GP;
+}
+
+static inline bool pmc_is_fixed(struct kvm_pmc *pmc)
+{
+ return pmc->type == KVM_PMC_FIXED;
+}
+
+static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu,
+ u64 data)
+{
+ return !(pmu->global_ctrl_mask & data);
+}
+
+/*
+ * Returns the general purpose PMC with the specified MSR.  Note that it can
+ * be used for both PERFCTRn and EVNTSELn; that is why it accepts base as a
+ * parameter to tell them apart.
+ */
+static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
+ u32 base)
+{
+ if (msr >= base && msr < base + pmu->nr_arch_gp_counters) {
+ u32 index = array_index_nospec(msr - base,
+ pmu->nr_arch_gp_counters);
+
+ return &pmu->gp_counters[index];
+ }
+
+ return NULL;
+}
+
+/* Returns the fixed PMC with the specified MSR. */
+static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
+{
+ int base = MSR_CORE_PERF_FIXED_CTR0;
+
+ if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) {
+ u32 index = array_index_nospec(msr - base,
+ pmu->nr_arch_fixed_counters);
+
+ return &pmu->fixed_counters[index];
+ }
+
+ return NULL;
+}
+
+static inline u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
+{
+ u64 sample_period = (-counter_value) & pmc_bitmask(pmc);
+
+ if (!sample_period)
+ sample_period = pmc_bitmask(pmc) + 1;
+ return sample_period;
+}
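+
+/*
+ * Worked example: for a 48-bit counter (pmc_bitmask() == (1ULL << 48) - 1)
+ * programmed to 0xfffffffffffe, the sample period is 2, i.e. the perf event
+ * fires after two increments, exactly when the guest's counter would
+ * overflow.  A counter value of 0 yields a full-width period.
+ */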
+
+static inline void pmc_update_sample_period(struct kvm_pmc *pmc)
+{
+ if (!pmc->perf_event || pmc->is_paused ||
+ !is_sampling_event(pmc->perf_event))
+ return;
+
+ perf_event_period(pmc->perf_event,
+ get_sample_period(pmc, pmc->counter));
+}
+
+static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ if (pmc_is_fixed(pmc))
+ return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
+ pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3;
+
+ return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
+}
+
+extern struct x86_pmu_capability kvm_pmu_cap;
+
+static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
+{
+ bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL;
+ int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS;
+
+ /*
+ * Hybrid PMUs don't play nice with virtualization without careful
+ * configuration by userspace, and KVM's APIs for reporting supported
+ * vPMU features do not account for hybrid PMUs. Disable vPMU support
+ * for hybrid PMUs until KVM gains a way to let userspace opt-in.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ enable_pmu = false;
+
+ if (enable_pmu) {
+ perf_get_x86_pmu_capability(&kvm_pmu_cap);
+
+ /*
+		 * WARN if perf did NOT disable the hardware PMU when the
+		 * architecturally required number of GP counters isn't
+		 * present, i.e. if there is a non-zero number of counters,
+		 * but fewer than what is architecturally required.
+ */
+ if (!kvm_pmu_cap.num_counters_gp ||
+ WARN_ON_ONCE(kvm_pmu_cap.num_counters_gp < min_nr_gp_ctrs))
+ enable_pmu = false;
+ else if (is_intel && !kvm_pmu_cap.version)
+ enable_pmu = false;
+ }
+
+ if (!enable_pmu) {
+ memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap));
+ return;
+ }
+
+ kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2);
+ kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp,
+ pmu_ops->MAX_NR_GP_COUNTERS);
+ kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed,
+ KVM_PMC_MAX_FIXED);
+}
+
+static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc)
+{
+ set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
+ kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+}
+
+static inline void reprogram_counters(struct kvm_pmu *pmu, u64 diff)
+{
+ int bit;
+
+ if (!diff)
+ return;
+
+ for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
+ set_bit(bit, pmu->reprogram_pmi);
+ kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu));
+}
+
+/*
+ * Check if a PMC is enabled by comparing it against global_ctrl bits.
+ *
+ * If the vPMU doesn't have global_ctrl MSR, all vPMCs are enabled.
+ */
+static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ if (!kvm_pmu_has_perf_global_ctrl(pmu))
+ return true;
+
+ return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
+}
+
+void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
+void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
+int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
+bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx);
+bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr);
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+void kvm_pmu_refresh(struct kvm_vcpu *vcpu);
+void kvm_pmu_reset(struct kvm_vcpu *vcpu);
+void kvm_pmu_init(struct kvm_vcpu *vcpu);
+void kvm_pmu_cleanup(struct kvm_vcpu *vcpu);
+void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
+int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp);
+void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id);
+
+bool is_vmware_backdoor_pmc(u32 pmc_idx);
+
+extern struct kvm_pmu_ops intel_pmu_ops;
+extern struct kvm_pmu_ops amd_pmu_ops;
+#endif /* __KVM_X86_PMU_H */
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
new file mode 100644
index 000000000..b81650678
--- /dev/null
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -0,0 +1,224 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_KVM_REVERSE_CPUID_H
+#define ARCH_X86_KVM_REVERSE_CPUID_H
+
+#include <uapi/asm/kvm.h>
+#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
+
+/*
+ * Hardware-defined CPUID leaves that are either scattered by the kernel or are
+ * unknown to the kernel, but need to be directly used by KVM. Note, these
+ * word values conflict with the kernel's "bug" caps, but KVM doesn't use those.
+ */
+enum kvm_only_cpuid_leafs {
+ CPUID_12_EAX = NCAPINTS,
+ CPUID_7_1_EDX,
+ CPUID_8000_0007_EDX,
+ CPUID_8000_0022_EAX,
+ NR_KVM_CPU_CAPS,
+
+ NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
+};
+
+/*
+ * Define a KVM-only feature flag.
+ *
+ * For features that are scattered by cpufeatures.h, __feature_translate() also
+ * needs to be updated to translate the kernel-defined feature into the
+ * KVM-defined feature.
+ *
+ * For features that are 100% KVM-only, i.e. not defined by cpufeatures.h,
+ * forego the intermediate KVM_X86_FEATURE and directly define X86_FEATURE_* so
+ * that X86_FEATURE_* can be used in KVM. No __feature_translate() handling is
+ * needed in this case.
+ */
+#define KVM_X86_FEATURE(w, f) ((w)*32 + (f))
+
+/* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */
+#define KVM_X86_FEATURE_SGX1 KVM_X86_FEATURE(CPUID_12_EAX, 0)
+#define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1)
+#define KVM_X86_FEATURE_SGX_EDECCSSA KVM_X86_FEATURE(CPUID_12_EAX, 11)
+
+/* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */
+#define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4)
+#define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5)
+#define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8)
+#define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14)
+
+/* CPUID level 0x80000007 (EDX). */
+#define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8)
+
+/* CPUID level 0x80000022 (EAX) */
+#define KVM_X86_FEATURE_PERFMON_V2 KVM_X86_FEATURE(CPUID_8000_0022_EAX, 0)
+
+struct cpuid_reg {
+ u32 function;
+ u32 index;
+ int reg;
+};
+
+static const struct cpuid_reg reverse_cpuid[] = {
+ [CPUID_1_EDX] = { 1, 0, CPUID_EDX},
+ [CPUID_8000_0001_EDX] = {0x80000001, 0, CPUID_EDX},
+ [CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX},
+ [CPUID_1_ECX] = { 1, 0, CPUID_ECX},
+ [CPUID_C000_0001_EDX] = {0xc0000001, 0, CPUID_EDX},
+ [CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX},
+ [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX},
+ [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX},
+ [CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX},
+ [CPUID_6_EAX] = { 6, 0, CPUID_EAX},
+ [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
+ [CPUID_7_ECX] = { 7, 0, CPUID_ECX},
+ [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
+ [CPUID_7_EDX] = { 7, 0, CPUID_EDX},
+ [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX},
+ [CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX},
+ [CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX},
+ [CPUID_7_1_EDX] = { 7, 1, CPUID_EDX},
+ [CPUID_8000_0007_EDX] = {0x80000007, 0, CPUID_EDX},
+ [CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX},
+ [CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX},
+};
+
+/*
+ * Reverse CPUID and its derivatives can only be used for hardware-defined
+ * feature words, i.e. words whose bits directly correspond to a CPUID leaf.
+ * Retrieving a feature bit or masking guest CPUID from a Linux-defined word
+ * is nonsensical as the bit number/mask is an arbitrary software-defined value
+ * and can't be used by KVM to query/control guest capabilities. And obviously
+ * the leaf being queried must have an entry in the lookup table.
+ */
+static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
+{
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_1);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_2);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_3);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_4);
+ BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid));
+ BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
+}
+
+/*
+ * Translate feature bits that are scattered in the kernel's cpufeatures word
+ * into KVM feature words that align with hardware's definitions.
+ */
+static __always_inline u32 __feature_translate(int x86_feature)
+{
+ if (x86_feature == X86_FEATURE_SGX1)
+ return KVM_X86_FEATURE_SGX1;
+ else if (x86_feature == X86_FEATURE_SGX2)
+ return KVM_X86_FEATURE_SGX2;
+ else if (x86_feature == X86_FEATURE_SGX_EDECCSSA)
+ return KVM_X86_FEATURE_SGX_EDECCSSA;
+ else if (x86_feature == X86_FEATURE_CONSTANT_TSC)
+ return KVM_X86_FEATURE_CONSTANT_TSC;
+ else if (x86_feature == X86_FEATURE_PERFMON_V2)
+ return KVM_X86_FEATURE_PERFMON_V2;
+
+ return x86_feature;
+}
+
+static __always_inline u32 __feature_leaf(int x86_feature)
+{
+ return __feature_translate(x86_feature) / 32;
+}
+
+/*
+ * Retrieve the bit mask from an X86_FEATURE_* definition. Features contain
+ * the hardware defined bit number (stored in bits 4:0) and a software defined
+ * "word" (stored in bits 31:5). The word is used to index into arrays of
+ * bit masks that hold the per-cpu feature capabilities, e.g. this_cpu_has().
+ */
+static __always_inline u32 __feature_bit(int x86_feature)
+{
+ x86_feature = __feature_translate(x86_feature);
+
+ reverse_cpuid_check(x86_feature / 32);
+ return 1 << (x86_feature & 31);
+}
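+
+/*
+ * Example: X86_FEATURE_XSAVES is defined as word 10, bit 3, and word 10 is
+ * CPUID_D_1_EAX (CPUID.0xD.1:EAX), so __feature_leaf() returns CPUID_D_1_EAX
+ * and __feature_bit() returns BIT(3).
+ */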
+
+#define feature_bit(name) __feature_bit(X86_FEATURE_##name)
+
+static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature)
+{
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
+
+ reverse_cpuid_check(x86_leaf);
+ return reverse_cpuid[x86_leaf];
+}
+
+static __always_inline u32 *__cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry,
+ u32 reg)
+{
+ switch (reg) {
+ case CPUID_EAX:
+ return &entry->eax;
+ case CPUID_EBX:
+ return &entry->ebx;
+ case CPUID_ECX:
+ return &entry->ecx;
+ case CPUID_EDX:
+ return &entry->edx;
+ default:
+ BUILD_BUG();
+ return NULL;
+ }
+}
+
+static __always_inline u32 *cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature);
+
+ return __cpuid_entry_get_reg(entry, cpuid.reg);
+}
+
+static __always_inline u32 cpuid_entry_get(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+ return *reg & __feature_bit(x86_feature);
+}
+
+static __always_inline bool cpuid_entry_has(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ return cpuid_entry_get(entry, x86_feature);
+}
+
+static __always_inline void cpuid_entry_clear(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+ *reg &= ~__feature_bit(x86_feature);
+}
+
+static __always_inline void cpuid_entry_set(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+ *reg |= __feature_bit(x86_feature);
+}
+
+static __always_inline void cpuid_entry_change(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature,
+ bool set)
+{
+ u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+ /*
+ * Open coded instead of using cpuid_entry_{clear,set}() to coerce the
+ * compiler into using CMOV instead of Jcc when possible.
+ */
+ if (set)
+ *reg |= __feature_bit(x86_feature);
+ else
+ *reg &= ~__feature_bit(x86_feature);
+}
+
+#endif /* ARCH_X86_KVM_REVERSE_CPUID_H */
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
new file mode 100644
index 000000000..b42111a24
--- /dev/null
+++ b/arch/x86/kvm/smm.c
@@ -0,0 +1,648 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include "x86.h"
+#include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
+#include "smm.h"
+#include "cpuid.h"
+#include "trace.h"
+
+#define CHECK_SMRAM32_OFFSET(field, offset) \
+ ASSERT_STRUCT_OFFSET(struct kvm_smram_state_32, field, offset - 0xFE00)
+
+#define CHECK_SMRAM64_OFFSET(field, offset) \
+ ASSERT_STRUCT_OFFSET(struct kvm_smram_state_64, field, offset - 0xFE00)
+
+static void check_smram_offsets(void)
+{
+ /* 32 bit SMRAM image */
+ CHECK_SMRAM32_OFFSET(reserved1, 0xFE00);
+ CHECK_SMRAM32_OFFSET(smbase, 0xFEF8);
+ CHECK_SMRAM32_OFFSET(smm_revision, 0xFEFC);
+ CHECK_SMRAM32_OFFSET(io_inst_restart, 0xFF00);
+ CHECK_SMRAM32_OFFSET(auto_hlt_restart, 0xFF02);
+ CHECK_SMRAM32_OFFSET(io_restart_rdi, 0xFF04);
+ CHECK_SMRAM32_OFFSET(io_restart_rcx, 0xFF08);
+ CHECK_SMRAM32_OFFSET(io_restart_rsi, 0xFF0C);
+ CHECK_SMRAM32_OFFSET(io_restart_rip, 0xFF10);
+ CHECK_SMRAM32_OFFSET(cr4, 0xFF14);
+ CHECK_SMRAM32_OFFSET(reserved2, 0xFF18);
+ CHECK_SMRAM32_OFFSET(int_shadow, 0xFF1A);
+ CHECK_SMRAM32_OFFSET(reserved3, 0xFF1B);
+ CHECK_SMRAM32_OFFSET(ds, 0xFF2C);
+ CHECK_SMRAM32_OFFSET(fs, 0xFF38);
+ CHECK_SMRAM32_OFFSET(gs, 0xFF44);
+ CHECK_SMRAM32_OFFSET(idtr, 0xFF50);
+ CHECK_SMRAM32_OFFSET(tr, 0xFF5C);
+ CHECK_SMRAM32_OFFSET(gdtr, 0xFF6C);
+ CHECK_SMRAM32_OFFSET(ldtr, 0xFF78);
+ CHECK_SMRAM32_OFFSET(es, 0xFF84);
+ CHECK_SMRAM32_OFFSET(cs, 0xFF90);
+ CHECK_SMRAM32_OFFSET(ss, 0xFF9C);
+ CHECK_SMRAM32_OFFSET(es_sel, 0xFFA8);
+ CHECK_SMRAM32_OFFSET(cs_sel, 0xFFAC);
+ CHECK_SMRAM32_OFFSET(ss_sel, 0xFFB0);
+ CHECK_SMRAM32_OFFSET(ds_sel, 0xFFB4);
+ CHECK_SMRAM32_OFFSET(fs_sel, 0xFFB8);
+ CHECK_SMRAM32_OFFSET(gs_sel, 0xFFBC);
+ CHECK_SMRAM32_OFFSET(ldtr_sel, 0xFFC0);
+ CHECK_SMRAM32_OFFSET(tr_sel, 0xFFC4);
+ CHECK_SMRAM32_OFFSET(dr7, 0xFFC8);
+ CHECK_SMRAM32_OFFSET(dr6, 0xFFCC);
+ CHECK_SMRAM32_OFFSET(gprs, 0xFFD0);
+ CHECK_SMRAM32_OFFSET(eip, 0xFFF0);
+ CHECK_SMRAM32_OFFSET(eflags, 0xFFF4);
+ CHECK_SMRAM32_OFFSET(cr3, 0xFFF8);
+ CHECK_SMRAM32_OFFSET(cr0, 0xFFFC);
+
+ /* 64 bit SMRAM image */
+ CHECK_SMRAM64_OFFSET(es, 0xFE00);
+ CHECK_SMRAM64_OFFSET(cs, 0xFE10);
+ CHECK_SMRAM64_OFFSET(ss, 0xFE20);
+ CHECK_SMRAM64_OFFSET(ds, 0xFE30);
+ CHECK_SMRAM64_OFFSET(fs, 0xFE40);
+ CHECK_SMRAM64_OFFSET(gs, 0xFE50);
+ CHECK_SMRAM64_OFFSET(gdtr, 0xFE60);
+ CHECK_SMRAM64_OFFSET(ldtr, 0xFE70);
+ CHECK_SMRAM64_OFFSET(idtr, 0xFE80);
+ CHECK_SMRAM64_OFFSET(tr, 0xFE90);
+ CHECK_SMRAM64_OFFSET(io_restart_rip, 0xFEA0);
+ CHECK_SMRAM64_OFFSET(io_restart_rcx, 0xFEA8);
+ CHECK_SMRAM64_OFFSET(io_restart_rsi, 0xFEB0);
+ CHECK_SMRAM64_OFFSET(io_restart_rdi, 0xFEB8);
+ CHECK_SMRAM64_OFFSET(io_restart_dword, 0xFEC0);
+ CHECK_SMRAM64_OFFSET(reserved1, 0xFEC4);
+ CHECK_SMRAM64_OFFSET(io_inst_restart, 0xFEC8);
+ CHECK_SMRAM64_OFFSET(auto_hlt_restart, 0xFEC9);
+ CHECK_SMRAM64_OFFSET(amd_nmi_mask, 0xFECA);
+ CHECK_SMRAM64_OFFSET(int_shadow, 0xFECB);
+ CHECK_SMRAM64_OFFSET(reserved2, 0xFECC);
+ CHECK_SMRAM64_OFFSET(efer, 0xFED0);
+ CHECK_SMRAM64_OFFSET(svm_guest_flag, 0xFED8);
+ CHECK_SMRAM64_OFFSET(svm_guest_vmcb_gpa, 0xFEE0);
+ CHECK_SMRAM64_OFFSET(svm_guest_virtual_int, 0xFEE8);
+ CHECK_SMRAM64_OFFSET(reserved3, 0xFEF0);
+ CHECK_SMRAM64_OFFSET(smm_revison, 0xFEFC);
+ CHECK_SMRAM64_OFFSET(smbase, 0xFF00);
+ CHECK_SMRAM64_OFFSET(reserved4, 0xFF04);
+ CHECK_SMRAM64_OFFSET(ssp, 0xFF18);
+ CHECK_SMRAM64_OFFSET(svm_guest_pat, 0xFF20);
+ CHECK_SMRAM64_OFFSET(svm_host_efer, 0xFF28);
+ CHECK_SMRAM64_OFFSET(svm_host_cr4, 0xFF30);
+ CHECK_SMRAM64_OFFSET(svm_host_cr3, 0xFF38);
+ CHECK_SMRAM64_OFFSET(svm_host_cr0, 0xFF40);
+ CHECK_SMRAM64_OFFSET(cr4, 0xFF48);
+ CHECK_SMRAM64_OFFSET(cr3, 0xFF50);
+ CHECK_SMRAM64_OFFSET(cr0, 0xFF58);
+ CHECK_SMRAM64_OFFSET(dr7, 0xFF60);
+ CHECK_SMRAM64_OFFSET(dr6, 0xFF68);
+ CHECK_SMRAM64_OFFSET(rflags, 0xFF70);
+ CHECK_SMRAM64_OFFSET(rip, 0xFF78);
+ CHECK_SMRAM64_OFFSET(gprs, 0xFF80);
+
+ BUILD_BUG_ON(sizeof(union kvm_smram) != 512);
+}
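+
+/*
+ * Note: the offsets above are relative to SMBASE.  The state-save area
+ * occupies the top 512 bytes of the SMRAM image (0xFE00..0xFFFF), which is
+ * why each CHECK_SMRAM*_OFFSET() subtracts 0xFE00 and why union kvm_smram
+ * must be exactly 512 bytes.
+ */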
+
+#undef CHECK_SMRAM64_OFFSET
+#undef CHECK_SMRAM32_OFFSET
+
+void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
+{
+ trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm);
+
+ if (entering_smm) {
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ } else {
+ vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK);
+
+ /* Process a latched INIT or SMI, if any. */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
+ * Even if KVM_SET_SREGS2 loaded PDPTRs out of band,
+ * on SMM exit we still need to reload them from
+ * guest memory
+ */
+ vcpu->arch.pdptrs_from_userspace = false;
+ }
+
+ kvm_mmu_reset_context(vcpu);
+}
+
+void process_smi(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.smi_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
+static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
+{
+	u32 flags = 0;
+
+ flags |= seg->g << 23;
+ flags |= seg->db << 22;
+ flags |= seg->l << 21;
+ flags |= seg->avl << 20;
+ flags |= seg->present << 15;
+ flags |= seg->dpl << 13;
+ flags |= seg->s << 12;
+ flags |= seg->type << 8;
+ return flags;
+}
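+
+/*
+ * The packed layout mirrors the access-rights bits of a segment descriptor:
+ * type in bits 8-11, S in bit 12, DPL in bits 13-14, P in bit 15, and
+ * AVL/L/DB/G in bits 20-23.  rsm_set_desc_flags() below performs the inverse
+ * unpacking on RSM.
+ */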
+
+static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu,
+ struct kvm_smm_seg_state_32 *state,
+ u32 *selector, int n)
+{
+ struct kvm_segment seg;
+
+ kvm_get_segment(vcpu, &seg, n);
+ *selector = seg.selector;
+ state->base = seg.base;
+ state->limit = seg.limit;
+ state->flags = enter_smm_get_segment_flags(&seg);
+}
+
+#ifdef CONFIG_X86_64
+static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu,
+ struct kvm_smm_seg_state_64 *state,
+ int n)
+{
+ struct kvm_segment seg;
+
+ kvm_get_segment(vcpu, &seg, n);
+ state->selector = seg.selector;
+ state->attributes = enter_smm_get_segment_flags(&seg) >> 8;
+ state->limit = seg.limit;
+ state->base = seg.base;
+}
+#endif
+
+static void enter_smm_save_state_32(struct kvm_vcpu *vcpu,
+ struct kvm_smram_state_32 *smram)
+{
+ struct desc_ptr dt;
+ unsigned long val;
+ int i;
+
+ smram->cr0 = kvm_read_cr0(vcpu);
+ smram->cr3 = kvm_read_cr3(vcpu);
+ smram->eflags = kvm_get_rflags(vcpu);
+ smram->eip = kvm_rip_read(vcpu);
+
+ for (i = 0; i < 8; i++)
+ smram->gprs[i] = kvm_register_read_raw(vcpu, i);
+
+ kvm_get_dr(vcpu, 6, &val);
+ smram->dr6 = (u32)val;
+ kvm_get_dr(vcpu, 7, &val);
+ smram->dr7 = (u32)val;
+
+ enter_smm_save_seg_32(vcpu, &smram->tr, &smram->tr_sel, VCPU_SREG_TR);
+ enter_smm_save_seg_32(vcpu, &smram->ldtr, &smram->ldtr_sel, VCPU_SREG_LDTR);
+
+ static_call(kvm_x86_get_gdt)(vcpu, &dt);
+ smram->gdtr.base = dt.address;
+ smram->gdtr.limit = dt.size;
+
+ static_call(kvm_x86_get_idt)(vcpu, &dt);
+ smram->idtr.base = dt.address;
+ smram->idtr.limit = dt.size;
+
+ enter_smm_save_seg_32(vcpu, &smram->es, &smram->es_sel, VCPU_SREG_ES);
+ enter_smm_save_seg_32(vcpu, &smram->cs, &smram->cs_sel, VCPU_SREG_CS);
+ enter_smm_save_seg_32(vcpu, &smram->ss, &smram->ss_sel, VCPU_SREG_SS);
+
+ enter_smm_save_seg_32(vcpu, &smram->ds, &smram->ds_sel, VCPU_SREG_DS);
+ enter_smm_save_seg_32(vcpu, &smram->fs, &smram->fs_sel, VCPU_SREG_FS);
+ enter_smm_save_seg_32(vcpu, &smram->gs, &smram->gs_sel, VCPU_SREG_GS);
+
+ smram->cr4 = kvm_read_cr4(vcpu);
+ smram->smm_revision = 0x00020000;
+ smram->smbase = vcpu->arch.smbase;
+
+ smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+}
+
+#ifdef CONFIG_X86_64
+static void enter_smm_save_state_64(struct kvm_vcpu *vcpu,
+ struct kvm_smram_state_64 *smram)
+{
+ struct desc_ptr dt;
+ unsigned long val;
+ int i;
+
+ for (i = 0; i < 16; i++)
+ smram->gprs[15 - i] = kvm_register_read_raw(vcpu, i);
+
+ smram->rip = kvm_rip_read(vcpu);
+ smram->rflags = kvm_get_rflags(vcpu);
+
+ kvm_get_dr(vcpu, 6, &val);
+ smram->dr6 = val;
+ kvm_get_dr(vcpu, 7, &val);
+ smram->dr7 = val;
+
+ smram->cr0 = kvm_read_cr0(vcpu);
+ smram->cr3 = kvm_read_cr3(vcpu);
+ smram->cr4 = kvm_read_cr4(vcpu);
+
+ smram->smbase = vcpu->arch.smbase;
+ smram->smm_revison = 0x00020064;
+
+ smram->efer = vcpu->arch.efer;
+
+ enter_smm_save_seg_64(vcpu, &smram->tr, VCPU_SREG_TR);
+
+ static_call(kvm_x86_get_idt)(vcpu, &dt);
+ smram->idtr.limit = dt.size;
+ smram->idtr.base = dt.address;
+
+ enter_smm_save_seg_64(vcpu, &smram->ldtr, VCPU_SREG_LDTR);
+
+ static_call(kvm_x86_get_gdt)(vcpu, &dt);
+ smram->gdtr.limit = dt.size;
+ smram->gdtr.base = dt.address;
+
+ enter_smm_save_seg_64(vcpu, &smram->es, VCPU_SREG_ES);
+ enter_smm_save_seg_64(vcpu, &smram->cs, VCPU_SREG_CS);
+ enter_smm_save_seg_64(vcpu, &smram->ss, VCPU_SREG_SS);
+ enter_smm_save_seg_64(vcpu, &smram->ds, VCPU_SREG_DS);
+ enter_smm_save_seg_64(vcpu, &smram->fs, VCPU_SREG_FS);
+ enter_smm_save_seg_64(vcpu, &smram->gs, VCPU_SREG_GS);
+
+ smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+}
+#endif
+
+void enter_smm(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs, ds;
+ struct desc_ptr dt;
+ unsigned long cr0;
+ union kvm_smram smram;
+
+ check_smram_offsets();
+
+ memset(smram.bytes, 0, sizeof(smram.bytes));
+
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ enter_smm_save_state_64(vcpu, &smram.smram64);
+ else
+#endif
+ enter_smm_save_state_32(vcpu, &smram.smram32);
+
+ /*
+	 * Give the vendor's .enter_smm() callback a chance to make
+	 * ISA-specific changes to the vCPU state (e.g. leave guest mode)
+	 * after we've saved the state into the SMM state-save area.
+	 *
+	 * Kill the VM in the unlikely case of failure, because the VM
+	 * may be left in an undefined state.
+ */
+ if (static_call(kvm_x86_enter_smm)(vcpu, &smram))
+ goto error;
+
+ kvm_smm_changed(vcpu, true);
+
+ if (kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, &smram, sizeof(smram)))
+ goto error;
+
+ if (static_call(kvm_x86_get_nmi_mask)(vcpu))
+ vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
+ else
+ static_call(kvm_x86_set_nmi_mask)(vcpu, true);
+
+ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
+ kvm_rip_write(vcpu, 0x8000);
+
+ static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0);
+
+ cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
+ static_call(kvm_x86_set_cr0)(vcpu, cr0);
+ vcpu->arch.cr0 = cr0;
+
+ static_call(kvm_x86_set_cr4)(vcpu, 0);
+
+ /* Undocumented: IDT limit is set to zero on entry to SMM. */
+ dt.address = dt.size = 0;
+ static_call(kvm_x86_set_idt)(vcpu, &dt);
+
+ if (WARN_ON_ONCE(kvm_set_dr(vcpu, 7, DR7_FIXED_1)))
+ goto error;
+
+ cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
+ cs.base = vcpu->arch.smbase;
+
+ ds.selector = 0;
+ ds.base = 0;
+
+ cs.limit = ds.limit = 0xffffffff;
+ cs.type = ds.type = 0x3;
+ cs.dpl = ds.dpl = 0;
+ cs.db = ds.db = 0;
+ cs.s = ds.s = 1;
+ cs.l = ds.l = 0;
+ cs.g = ds.g = 1;
+ cs.avl = ds.avl = 0;
+ cs.present = ds.present = 1;
+ cs.unusable = ds.unusable = 0;
+ cs.padding = ds.padding = 0;
+
+ kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
+
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ if (static_call(kvm_x86_set_efer)(vcpu, 0))
+ goto error;
+#endif
+
+ kvm_update_cpuid_runtime(vcpu);
+ kvm_mmu_reset_context(vcpu);
+ return;
+error:
+ kvm_vm_dead(vcpu->kvm);
+}
+
+static void rsm_set_desc_flags(struct kvm_segment *desc, u32 flags)
+{
+ desc->g = (flags >> 23) & 1;
+ desc->db = (flags >> 22) & 1;
+ desc->l = (flags >> 21) & 1;
+ desc->avl = (flags >> 20) & 1;
+ desc->present = (flags >> 15) & 1;
+ desc->dpl = (flags >> 13) & 3;
+ desc->s = (flags >> 12) & 1;
+ desc->type = (flags >> 8) & 15;
+
+ desc->unusable = !desc->present;
+ desc->padding = 0;
+}
+
+static int rsm_load_seg_32(struct kvm_vcpu *vcpu,
+ const struct kvm_smm_seg_state_32 *state,
+ u16 selector, int n)
+{
+ struct kvm_segment desc;
+
+ desc.selector = selector;
+ desc.base = state->base;
+ desc.limit = state->limit;
+ rsm_set_desc_flags(&desc, state->flags);
+ kvm_set_segment(vcpu, &desc, n);
+ return X86EMUL_CONTINUE;
+}
+
+#ifdef CONFIG_X86_64
+
+static int rsm_load_seg_64(struct kvm_vcpu *vcpu,
+ const struct kvm_smm_seg_state_64 *state,
+ int n)
+{
+ struct kvm_segment desc;
+
+ desc.selector = state->selector;
+ rsm_set_desc_flags(&desc, state->attributes << 8);
+ desc.limit = state->limit;
+ desc.base = state->base;
+ kvm_set_segment(vcpu, &desc, n);
+ return X86EMUL_CONTINUE;
+}
+#endif
+
+static int rsm_enter_protected_mode(struct kvm_vcpu *vcpu,
+ u64 cr0, u64 cr3, u64 cr4)
+{
+ int bad;
+ u64 pcid;
+
+ /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */
+ pcid = 0;
+ if (cr4 & X86_CR4_PCIDE) {
+ pcid = cr3 & 0xfff;
+ cr3 &= ~0xfff;
+ }
+
+ bad = kvm_set_cr3(vcpu, cr3);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ /*
+ * First enable PAE, long mode needs it before CR0.PG = 1 is set.
+ * Then enable protected mode. However, PCID cannot be enabled
+ * if EFER.LMA=0, so set it separately.
+ */
+ bad = kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PCIDE);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ bad = kvm_set_cr0(vcpu, cr0);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ if (cr4 & X86_CR4_PCIDE) {
+ bad = kvm_set_cr4(vcpu, cr4);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+ if (pcid) {
+ bad = kvm_set_cr3(vcpu, cr3 | pcid);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+ }
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
+ const struct kvm_smram_state_32 *smstate)
+{
+ struct kvm_vcpu *vcpu = ctxt->vcpu;
+ struct desc_ptr dt;
+ int i, r;
+
+ ctxt->eflags = smstate->eflags | X86_EFLAGS_FIXED;
+ ctxt->_eip = smstate->eip;
+
+ for (i = 0; i < 8; i++)
+ *reg_write(ctxt, i) = smstate->gprs[i];
+
+ if (kvm_set_dr(vcpu, 6, smstate->dr6))
+ return X86EMUL_UNHANDLEABLE;
+ if (kvm_set_dr(vcpu, 7, smstate->dr7))
+ return X86EMUL_UNHANDLEABLE;
+
+ rsm_load_seg_32(vcpu, &smstate->tr, smstate->tr_sel, VCPU_SREG_TR);
+ rsm_load_seg_32(vcpu, &smstate->ldtr, smstate->ldtr_sel, VCPU_SREG_LDTR);
+
+ dt.address = smstate->gdtr.base;
+ dt.size = smstate->gdtr.limit;
+ static_call(kvm_x86_set_gdt)(vcpu, &dt);
+
+ dt.address = smstate->idtr.base;
+ dt.size = smstate->idtr.limit;
+ static_call(kvm_x86_set_idt)(vcpu, &dt);
+
+ rsm_load_seg_32(vcpu, &smstate->es, smstate->es_sel, VCPU_SREG_ES);
+ rsm_load_seg_32(vcpu, &smstate->cs, smstate->cs_sel, VCPU_SREG_CS);
+ rsm_load_seg_32(vcpu, &smstate->ss, smstate->ss_sel, VCPU_SREG_SS);
+
+ rsm_load_seg_32(vcpu, &smstate->ds, smstate->ds_sel, VCPU_SREG_DS);
+ rsm_load_seg_32(vcpu, &smstate->fs, smstate->fs_sel, VCPU_SREG_FS);
+ rsm_load_seg_32(vcpu, &smstate->gs, smstate->gs_sel, VCPU_SREG_GS);
+
+ vcpu->arch.smbase = smstate->smbase;
+
+ r = rsm_enter_protected_mode(vcpu, smstate->cr0,
+ smstate->cr3, smstate->cr4);
+
+ if (r != X86EMUL_CONTINUE)
+ return r;
+
+ static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0);
+ ctxt->interruptibility = (u8)smstate->int_shadow;
+
+ return r;
+}
+
+#ifdef CONFIG_X86_64
+static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
+ const struct kvm_smram_state_64 *smstate)
+{
+ struct kvm_vcpu *vcpu = ctxt->vcpu;
+ struct desc_ptr dt;
+ int i, r;
+
+ for (i = 0; i < 16; i++)
+ *reg_write(ctxt, i) = smstate->gprs[15 - i];
+
+ ctxt->_eip = smstate->rip;
+ ctxt->eflags = smstate->rflags | X86_EFLAGS_FIXED;
+
+ if (kvm_set_dr(vcpu, 6, smstate->dr6))
+ return X86EMUL_UNHANDLEABLE;
+ if (kvm_set_dr(vcpu, 7, smstate->dr7))
+ return X86EMUL_UNHANDLEABLE;
+
+ vcpu->arch.smbase = smstate->smbase;
+
+ if (kvm_set_msr(vcpu, MSR_EFER, smstate->efer & ~EFER_LMA))
+ return X86EMUL_UNHANDLEABLE;
+
+ rsm_load_seg_64(vcpu, &smstate->tr, VCPU_SREG_TR);
+
+ dt.size = smstate->idtr.limit;
+ dt.address = smstate->idtr.base;
+ static_call(kvm_x86_set_idt)(vcpu, &dt);
+
+ rsm_load_seg_64(vcpu, &smstate->ldtr, VCPU_SREG_LDTR);
+
+ dt.size = smstate->gdtr.limit;
+ dt.address = smstate->gdtr.base;
+ static_call(kvm_x86_set_gdt)(vcpu, &dt);
+
+ r = rsm_enter_protected_mode(vcpu, smstate->cr0, smstate->cr3, smstate->cr4);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+
+ rsm_load_seg_64(vcpu, &smstate->es, VCPU_SREG_ES);
+ rsm_load_seg_64(vcpu, &smstate->cs, VCPU_SREG_CS);
+ rsm_load_seg_64(vcpu, &smstate->ss, VCPU_SREG_SS);
+ rsm_load_seg_64(vcpu, &smstate->ds, VCPU_SREG_DS);
+ rsm_load_seg_64(vcpu, &smstate->fs, VCPU_SREG_FS);
+ rsm_load_seg_64(vcpu, &smstate->gs, VCPU_SREG_GS);
+
+ static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0);
+ ctxt->interruptibility = (u8)smstate->int_shadow;
+
+ return X86EMUL_CONTINUE;
+}
+#endif
+
+int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
+{
+ struct kvm_vcpu *vcpu = ctxt->vcpu;
+ unsigned long cr0;
+ union kvm_smram smram;
+ u64 smbase;
+ int ret;
+
+ smbase = vcpu->arch.smbase;
+
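+	/* The 512-byte SMM state-save area lives at SMBASE + 0xfe00. */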
+ ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfe00, smram.bytes, sizeof(smram));
+ if (ret < 0)
+ return X86EMUL_UNHANDLEABLE;
+
+ if ((vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK) == 0)
+ static_call(kvm_x86_set_nmi_mask)(vcpu, false);
+
+ kvm_smm_changed(vcpu, false);
+
+ /*
+ * Get back to real mode, to prepare a safe state in which to load
+ * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU
+ * supports long mode.
+ */
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) {
+ struct kvm_segment cs_desc;
+ unsigned long cr4;
+
+ /* Zero CR4.PCIDE before CR0.PG. */
+ cr4 = kvm_read_cr4(vcpu);
+ if (cr4 & X86_CR4_PCIDE)
+ kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PCIDE);
+
+ /* A 32-bit code segment is required to clear EFER.LMA. */
+ memset(&cs_desc, 0, sizeof(cs_desc));
+ cs_desc.type = 0xb;
+ cs_desc.s = cs_desc.g = cs_desc.present = 1;
+ kvm_set_segment(vcpu, &cs_desc, VCPU_SREG_CS);
+ }
+#endif
+
+ /* For the 64-bit case, this will clear EFER.LMA. */
+ cr0 = kvm_read_cr0(vcpu);
+ if (cr0 & X86_CR0_PE)
+ kvm_set_cr0(vcpu, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
+
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) {
+ unsigned long cr4, efer;
+
+ /* Clear CR4.PAE before clearing EFER.LME. */
+ cr4 = kvm_read_cr4(vcpu);
+ if (cr4 & X86_CR4_PAE)
+ kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PAE);
+
+ /* And finally go back to 32-bit mode. */
+ efer = 0;
+ kvm_set_msr(vcpu, MSR_EFER, efer);
+ }
+#endif
+
+ /*
+ * Give leave_smm() a chance to make ISA-specific changes to the vCPU
+ * state (e.g. enter guest mode) before loading state from the SMM
+ * state-save area.
+ */
+ if (static_call(kvm_x86_leave_smm)(vcpu, &smram))
+ return X86EMUL_UNHANDLEABLE;
+
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ return rsm_load_state_64(ctxt, &smram.smram64);
+ else
+#endif
+ return rsm_load_state_32(ctxt, &smram.smram32);
+}
diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h
new file mode 100644
index 000000000..a1cf2ac5b
--- /dev/null
+++ b/arch/x86/kvm/smm.h
@@ -0,0 +1,168 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_KVM_SMM_H
+#define ASM_KVM_SMM_H
+
+#include <linux/build_bug.h>
+
+#ifdef CONFIG_KVM_SMM
+
+
+/*
+ * 32 bit KVM's emulated SMM layout. Based on Intel P6 layout
+ * (https://www.sandpile.org/x86/smm.htm).
+ */
+
+struct kvm_smm_seg_state_32 {
+ u32 flags;
+ u32 limit;
+ u32 base;
+} __packed;
+
+struct kvm_smram_state_32 {
+ u32 reserved1[62];
+ u32 smbase;
+ u32 smm_revision;
+ u16 io_inst_restart;
+ u16 auto_hlt_restart;
+ u32 io_restart_rdi;
+ u32 io_restart_rcx;
+ u32 io_restart_rsi;
+ u32 io_restart_rip;
+ u32 cr4;
+
+ /* A20M#, CPL, shutdown and other reserved/undocumented fields */
+ u16 reserved2;
+ u8 int_shadow; /* KVM extension */
+ u8 reserved3[17];
+
+ struct kvm_smm_seg_state_32 ds;
+ struct kvm_smm_seg_state_32 fs;
+ struct kvm_smm_seg_state_32 gs;
+ struct kvm_smm_seg_state_32 idtr; /* IDTR has only base and limit */
+ struct kvm_smm_seg_state_32 tr;
+ u32 reserved;
+ struct kvm_smm_seg_state_32 gdtr; /* GDTR has only base and limit */
+ struct kvm_smm_seg_state_32 ldtr;
+ struct kvm_smm_seg_state_32 es;
+ struct kvm_smm_seg_state_32 cs;
+ struct kvm_smm_seg_state_32 ss;
+
+ u32 es_sel;
+ u32 cs_sel;
+ u32 ss_sel;
+ u32 ds_sel;
+ u32 fs_sel;
+ u32 gs_sel;
+ u32 ldtr_sel;
+ u32 tr_sel;
+
+ u32 dr7;
+ u32 dr6;
+ u32 gprs[8]; /* GPRS in the "natural" X86 order (EAX/ECX/EDX.../EDI) */
+ u32 eip;
+ u32 eflags;
+ u32 cr3;
+ u32 cr0;
+} __packed;
+
+
+/* 64 bit KVM's emulated SMM layout. Based on AMD64 layout */
+
+struct kvm_smm_seg_state_64 {
+ u16 selector;
+ u16 attributes;
+ u32 limit;
+ u64 base;
+};
+
+struct kvm_smram_state_64 {
+
+ struct kvm_smm_seg_state_64 es;
+ struct kvm_smm_seg_state_64 cs;
+ struct kvm_smm_seg_state_64 ss;
+ struct kvm_smm_seg_state_64 ds;
+ struct kvm_smm_seg_state_64 fs;
+ struct kvm_smm_seg_state_64 gs;
+	struct kvm_smm_seg_state_64 gdtr; /* GDTR has only base and limit */
+	struct kvm_smm_seg_state_64 ldtr;
+	struct kvm_smm_seg_state_64 idtr; /* IDTR has only base and limit */
+ struct kvm_smm_seg_state_64 tr;
+
+ /* I/O restart and auto halt restart are not implemented by KVM */
+ u64 io_restart_rip;
+ u64 io_restart_rcx;
+ u64 io_restart_rsi;
+ u64 io_restart_rdi;
+ u32 io_restart_dword;
+ u32 reserved1;
+ u8 io_inst_restart;
+ u8 auto_hlt_restart;
+ u8 amd_nmi_mask; /* Documented in AMD BKDG as NMI mask, not used by KVM */
+ u8 int_shadow;
+ u32 reserved2;
+
+ u64 efer;
+
+ /*
+	 * The two fields below are implemented on AMD only, to store the
+	 * SVM guest VMCB address if the #SMI was received while in guest mode.
+ */
+ u64 svm_guest_flag;
+ u64 svm_guest_vmcb_gpa;
+ u64 svm_guest_virtual_int; /* unknown purpose, not implemented */
+
+ u32 reserved3[3];
+ u32 smm_revison;
+ u32 smbase;
+ u32 reserved4[5];
+
+ /* ssp and svm_* fields below are not implemented by KVM */
+ u64 ssp;
+ u64 svm_guest_pat;
+ u64 svm_host_efer;
+ u64 svm_host_cr4;
+ u64 svm_host_cr3;
+ u64 svm_host_cr0;
+
+ u64 cr4;
+ u64 cr3;
+ u64 cr0;
+ u64 dr7;
+ u64 dr6;
+ u64 rflags;
+ u64 rip;
+	u64 gprs[16]; /* GPRS in a reversed "natural" X86 order (R15/R14/../RCX/RAX). */
+};
+
+union kvm_smram {
+ struct kvm_smram_state_64 smram64;
+ struct kvm_smram_state_32 smram32;
+ u8 bytes[512];
+};
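+
+/* Both layouts above must exactly overlay the 512-byte SMM state-save area. */
+static_assert(sizeof(union kvm_smram) == 512);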
+
+static inline int kvm_inject_smi(struct kvm_vcpu *vcpu)
+{
+ kvm_make_request(KVM_REQ_SMI, vcpu);
+ return 0;
+}
+
+static inline bool is_smm(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hflags & HF_SMM_MASK;
+}
+
+void kvm_smm_changed(struct kvm_vcpu *vcpu, bool in_smm);
+void enter_smm(struct kvm_vcpu *vcpu);
+int emulator_leave_smm(struct x86_emulate_ctxt *ctxt);
+void process_smi(struct kvm_vcpu *vcpu);
+#else
+static inline int kvm_inject_smi(struct kvm_vcpu *vcpu) { return -ENOTTY; }
+static inline bool is_smm(struct kvm_vcpu *vcpu) { return false; }
+
+/*
+ * emulator_leave_smm is used as a function pointer, so the
+ * stub is defined in x86.c.
+ */
+#endif
+
+#endif
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
new file mode 100644
index 000000000..4b74ea91f
--- /dev/null
+++ b/arch/x86/kvm/svm/avic.c
@@ -0,0 +1,1221 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_types.h>
+#include <linux/hashtable.h>
+#include <linux/amd-iommu.h>
+#include <linux/kvm_host.h>
+
+#include <asm/irq_remapping.h>
+
+#include "trace.h"
+#include "lapic.h"
+#include "x86.h"
+#include "irq.h"
+#include "svm.h"
+
+/*
+ * Encode the arbitrary VM ID and the vCPU's default APIC ID, i.e the vCPU ID,
+ * into the GATag so that KVM can retrieve the correct vCPU from a GALog entry
+ * if an interrupt can't be delivered, e.g. because the vCPU isn't running.
+ *
+ * For the vCPU ID, use however many bits are currently allowed for the max
+ * guest physical APIC ID (limited by the size of the physical ID table), and
+ * use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the
+ * size of the GATag is defined by hardware (32 bits), but is an opaque value
+ * as far as hardware is concerned.
+ */
+#define AVIC_VCPU_ID_MASK AVIC_PHYSICAL_MAX_INDEX_MASK
+
+#define AVIC_VM_ID_SHIFT HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK)
+#define AVIC_VM_ID_MASK (GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT)
+
+#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK)
+#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK)
+
+#define __AVIC_GATAG(vm_id, vcpu_id) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
+ ((vcpu_id) & AVIC_VCPU_ID_MASK))
+#define AVIC_GATAG(vm_id, vcpu_id) \
+({ \
+ u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_id); \
+ \
+ WARN_ON_ONCE(AVIC_GATAG_TO_VCPUID(ga_tag) != (vcpu_id)); \
+ WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id)); \
+ ga_tag; \
+})
+
+static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u);
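+
+/*
+ * For example, assuming AVIC_PHYSICAL_MAX_INDEX_MASK is 9 bits wide,
+ * AVIC_GATAG(1, 3) encodes to (1 << 9) | 3, from which AVIC_GATAG_TO_VMID()
+ * and AVIC_GATAG_TO_VCPUID() recover 1 and 3 respectively.
+ */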
+
+static bool force_avic;
+module_param_unsafe(force_avic, bool, 0444);
+
+/*
+ * Note:
+ * This hash table is used to map a VM ID to a struct kvm_svm when handling
+ * an AMD IOMMU GALog notification, in order to schedule in a particular vCPU.
+ */
+#define SVM_VM_DATA_HASH_BITS 8
+static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
+static u32 next_vm_id = 0;
+static bool next_vm_id_wrapped = false;
+static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
+bool x2avic_enabled;
+
+/*
+ * This is a wrapper of struct amd_iommu_ir_data.
+ */
+struct amd_svm_iommu_ir {
+ struct list_head node; /* Used by SVM for per-vcpu ir_list */
+ void *data; /* Storing pointer to struct amd_ir_data */
+};
+
+static void avic_activate_vmcb(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
+ vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
+
+ vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+
+ /*
+ * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
+ * accesses, while interrupt injection to a running vCPU can be
+ * achieved using AVIC doorbell. KVM disables the APIC access page
+ * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
+ * AVIC in hybrid mode activates only the doorbell mechanism.
+ */
+ if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
+ vmcb->control.int_ctl |= X2APIC_MODE_MASK;
+ vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID;
+ /* Disabling MSR intercept for x2APIC registers */
+ svm_set_x2apic_msr_interception(svm, false);
+ } else {
+ /*
+ * Flush the TLB, the guest may have inserted a non-APIC
+ * mapping into the TLB while AVIC was disabled.
+ */
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
+
+ /* For xAVIC and hybrid-xAVIC modes */
+ vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
+ /* Enabling MSR intercept for x2APIC registers */
+ svm_set_x2apic_msr_interception(svm, true);
+ }
+}
+
+static void avic_deactivate_vmcb(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
+ vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
+
+ /*
+ * If running nested and the guest uses its own MSR bitmap, there
+ * is no need to update L0's msr bitmap
+ */
+ if (is_guest_mode(&svm->vcpu) &&
+ vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))
+ return;
+
+ /* Enabling MSR intercept for x2APIC registers */
+ svm_set_x2apic_msr_interception(svm, true);
+}
+
+/*
+ * Note:
+ * This function is called from the IOMMU driver to notify SVM to schedule
+ * in a particular vCPU of a particular VM.
+ */
+int avic_ga_log_notifier(u32 ga_tag)
+{
+ unsigned long flags;
+ struct kvm_svm *kvm_svm;
+ struct kvm_vcpu *vcpu = NULL;
+ u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
+ u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
+
+ pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
+ trace_kvm_avic_ga_log(vm_id, vcpu_id);
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
+ if (kvm_svm->avic_vm_id != vm_id)
+ continue;
+ vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
+ break;
+ }
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+ /* Note:
+ * At this point, the IOMMU should have already set the pending
+ * bit in the vAPIC backing page. So, we just need to schedule
+ * in the vcpu.
+ */
+ if (vcpu)
+ kvm_vcpu_wake_up(vcpu);
+
+ return 0;
+}
+
+void avic_vm_destroy(struct kvm *kvm)
+{
+ unsigned long flags;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+
+ if (!enable_apicv)
+ return;
+
+ if (kvm_svm->avic_logical_id_table_page)
+ __free_page(kvm_svm->avic_logical_id_table_page);
+ if (kvm_svm->avic_physical_id_table_page)
+ __free_page(kvm_svm->avic_physical_id_table_page);
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ hash_del(&kvm_svm->hnode);
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+}
+
+int avic_vm_init(struct kvm *kvm)
+{
+ unsigned long flags;
+ int err = -ENOMEM;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+ struct kvm_svm *k2;
+ struct page *p_page;
+ struct page *l_page;
+ u32 vm_id;
+
+ if (!enable_apicv)
+ return 0;
+
+ /* Allocating physical APIC ID table (4KB) */
+ p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!p_page)
+ goto free_avic;
+
+ kvm_svm->avic_physical_id_table_page = p_page;
+
+ /* Allocating logical APIC ID table (4KB) */
+ l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!l_page)
+ goto free_avic;
+
+ kvm_svm->avic_logical_id_table_page = l_page;
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ again:
+ vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
+ if (vm_id == 0) { /* id is 1-based, zero is not okay */
+ next_vm_id_wrapped = 1;
+ goto again;
+ }
+ /* Is it still in use? Only possible if wrapped at least once */
+ if (next_vm_id_wrapped) {
+ hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
+ if (k2->avic_vm_id == vm_id)
+ goto again;
+ }
+ }
+ kvm_svm->avic_vm_id = vm_id;
+ hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+ return 0;
+
+free_avic:
+ avic_vm_destroy(kvm);
+ return err;
+}
+
+void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
+ phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
+ phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
+ phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
+
+ vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
+ vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
+ vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
+ vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK;
+
+ if (kvm_apicv_activated(svm->vcpu.kvm))
+ avic_activate_vmcb(svm);
+ else
+ avic_deactivate_vmcb(svm);
+}
+
+static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
+ unsigned int index)
+{
+ u64 *avic_physical_id_table;
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+
+ if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) ||
+ (index > X2AVIC_MAX_PHYSICAL_ID))
+ return NULL;
+
+ avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
+
+ return &avic_physical_id_table[index];
+}
+
+static int avic_init_backing_page(struct kvm_vcpu *vcpu)
+{
+ u64 *entry, new_entry;
+ int id = vcpu->vcpu_id;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
+ (id > X2AVIC_MAX_PHYSICAL_ID))
+ return -EINVAL;
+
+ if (!vcpu->arch.apic->regs)
+ return -EINVAL;
+
+ if (kvm_apicv_activated(vcpu->kvm)) {
+ int ret;
+
+ /*
+ * Note, AVIC hardware walks the nested page table to check
+ * permissions, but does not use the SPA address specified in
+ * the leaf SPTE since it uses address in the AVIC_BACKING_PAGE
+ * pointer field of the VMCB.
+ */
+ ret = kvm_alloc_apic_access_page(vcpu->kvm);
+ if (ret)
+ return ret;
+ }
+
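+	/*
+	 * The backing page is the vAPIC register page itself, so hardware and
+	 * KVM operate on the same copy of the APIC state.
+	 */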
+ svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);
+
+ /* Setting AVIC backing page address in the phy APIC ID table */
+ entry = avic_get_physical_id_entry(vcpu, id);
+ if (!entry)
+ return -EINVAL;
+
+ new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
+ AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
+ AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
+ WRITE_ONCE(*entry, new_entry);
+
+ svm->avic_physical_id_cache = entry;
+
+ return 0;
+}
+
+void avic_ring_doorbell(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Note, the vCPU could get migrated to a different pCPU at any point,
+ * which could result in signalling the wrong/previous pCPU. But if
+ * that happens the vCPU is guaranteed to do a VMRUN (after being
+ * migrated) and thus will process pending interrupts, i.e. a doorbell
+ * is not needed (and the spurious one is harmless).
+ */
+ int cpu = READ_ONCE(vcpu->cpu);
+
+ if (cpu != get_cpu()) {
+ wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
+ trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu));
+ }
+ put_cpu();
+}
+
+
+static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl)
+{
+ vcpu->arch.apic->irr_pending = true;
+ svm_complete_interrupt_delivery(vcpu,
+ icrl & APIC_MODE_MASK,
+ icrl & APIC_INT_LEVELTRIG,
+ icrl & APIC_VECTOR_MASK);
+}
+
+static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id,
+ u32 icrl)
+{
+ /*
+ * KVM inhibits AVIC if any vCPU ID diverges from the vCPUs APIC ID,
+ * i.e. APIC ID == vCPU ID.
+ */
+ struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id);
+
+ /* Once again, nothing to do if the target vCPU doesn't exist. */
+ if (unlikely(!target_vcpu))
+ return;
+
+ avic_kick_vcpu(target_vcpu, icrl);
+}
+
+static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table,
+ u32 logid_index, u32 icrl)
+{
+ u32 physical_id;
+
+ if (avic_logical_id_table) {
+ u32 logid_entry = avic_logical_id_table[logid_index];
+
+ /* Nothing to do if the logical destination is invalid. */
+ if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
+ return;
+
+ physical_id = logid_entry &
+ AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+ } else {
+ /*
+ * For x2APIC, the logical APIC ID is a read-only value that is
+ * derived from the x2APIC ID, thus the x2APIC ID can be found
+ * by reversing the calculation (stored in logid_index). Note,
+ * bits 31:20 of the x2APIC ID aren't propagated to the logical
+		 * ID, but KVM limits the x2APIC ID to KVM_MAX_VCPU_IDS.
+ */
+ physical_id = logid_index;
+ }
+
+ avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl);
+}
+
+/*
+ * A fast-path version of avic_kick_target_vcpus(), which attempts to match
+ * destination APIC ID to vCPU without looping through all vCPUs.
+ */
+static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
+ u32 icrl, u32 icrh, u32 index)
+{
+ int dest_mode = icrl & APIC_DEST_MASK;
+ int shorthand = icrl & APIC_SHORT_MASK;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+ u32 dest;
+
+ if (shorthand != APIC_DEST_NOSHORT)
+ return -EINVAL;
+
+ if (apic_x2apic_mode(source))
+ dest = icrh;
+ else
+ dest = GET_XAPIC_DEST_FIELD(icrh);
+
+ if (dest_mode == APIC_DEST_PHYSICAL) {
+ /* broadcast destination, use slow path */
+ if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
+ return -EINVAL;
+ if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(dest != index))
+ return -EINVAL;
+
+ avic_kick_vcpu_by_physical_id(kvm, dest, icrl);
+ } else {
+ u32 *avic_logical_id_table;
+ unsigned long bitmap, i;
+ u32 cluster;
+
+ if (apic_x2apic_mode(source)) {
+ /* 16 bit dest mask, 16 bit cluster id */
+ bitmap = dest & 0xFFFF;
+ cluster = (dest >> 16) << 4;
+ } else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
+			/* 8 bit dest mask */
+ bitmap = dest;
+ cluster = 0;
+ } else {
+			/* 4 bit dest mask, 4 bit cluster id */
+ bitmap = dest & 0xF;
+ cluster = (dest >> 4) << 2;
+ }
+
+ /* Nothing to do if there are no destinations in the cluster. */
+ if (unlikely(!bitmap))
+ return 0;
+
+ if (apic_x2apic_mode(source))
+ avic_logical_id_table = NULL;
+ else
+ avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);
+
+ /*
+ * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
+ * IDs, thus each bit in the destination is guaranteed to map
+ * to at most one vCPU.
+ */
+ for_each_set_bit(i, &bitmap, 16)
+ avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table,
+ cluster + i, icrl);
+ }
+
+ return 0;
+}
+
+static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
+ u32 icrl, u32 icrh, u32 index)
+{
+ u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh);
+ unsigned long i;
+ struct kvm_vcpu *vcpu;
+
+ if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index))
+ return;
+
+ trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index);
+
+ /*
+ * Wake any target vCPUs that are blocking, i.e. waiting for a wake
+ * event. There's no need to signal doorbells, as hardware has handled
+ * vCPUs that were in guest at the time of the IPI, and vCPUs that have
+ * since entered the guest will have processed pending IRQs at VMRUN.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
+ dest, icrl & APIC_DEST_MASK))
+ avic_kick_vcpu(vcpu, icrl);
+ }
+}
+
+int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
+ u32 icrl = svm->vmcb->control.exit_info_1;
+ u32 id = svm->vmcb->control.exit_info_2 >> 32;
+ u32 index = svm->vmcb->control.exit_info_2 & 0x1FF;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
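+	/*
+	 * exit_info_1 holds the ICR value (ICRH in the upper 32 bits), and
+	 * exit_info_2 holds the failure reason in its upper 32 bits and the
+	 * AVIC physical table index of the destination in its low bits.
+	 */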
+ trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
+
+ switch (id) {
+ case AVIC_IPI_FAILURE_INVALID_TARGET:
+ case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
+ /*
+ * Emulate IPIs that are not handled by AVIC hardware, which
+ * only virtualizes Fixed, Edge-Triggered INTRs, and falls over
+ * if _any_ targets are invalid, e.g. if the logical mode mask
+ * is a superset of running vCPUs.
+ *
+ * The exit is a trap, e.g. ICR holds the correct value and RIP
+ * has been advanced, KVM is responsible only for emulating the
+ * IPI. Sadly, hardware may sometimes leave the BUSY flag set,
+ * in which case KVM needs to emulate the ICR write as well in
+ * order to clear the BUSY flag.
+ */
+ if (icrl & APIC_ICR_BUSY)
+ kvm_apic_write_nodecode(vcpu, APIC_ICR);
+ else
+ kvm_apic_send_ipi(apic, icrl, icrh);
+ break;
+ case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
+ /*
+ * At this point, we expect that the AVIC HW has already
+ * set the appropriate IRR bits on the valid target
+ * vcpus. So, we just need to kick the appropriate vcpu.
+ */
+ avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
+ break;
+ case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
+ WARN_ONCE(1, "Invalid backing page\n");
+ break;
+ case AVIC_IPI_FAILURE_INVALID_IPI_VECTOR:
+ /* Invalid IPI with vector < 16 */
+ break;
+ default:
+ vcpu_unimpl(vcpu, "Unknown avic incomplete IPI interception\n");
+ }
+
+ return 1;
+}
+
+unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu))
+ return APICV_INHIBIT_REASON_NESTED;
+ return 0;
+}
+
+static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+ u32 *logical_apic_id_table;
+ u32 cluster, index;
+
+ ldr = GET_APIC_LOGICAL_ID(ldr);
+
+ if (flat) {
+ cluster = 0;
+ } else {
+ cluster = (ldr >> 4);
+ if (cluster >= 0xf)
+ return NULL;
+ ldr &= 0xf;
+ }
+ if (!ldr || !is_power_of_2(ldr))
+ return NULL;
+
+ index = __ffs(ldr);
+ if (WARN_ON_ONCE(index > 7))
+ return NULL;
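+	/* In cluster mode, each cluster owns four consecutive table entries. */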
+ index += (cluster << 2);
+
+ logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
+
+ return &logical_apic_id_table[index];
+}
+
+static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
+{
+ bool flat;
+ u32 *entry, new_entry;
+
+ flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
+ entry = avic_get_logical_id_entry(vcpu, ldr, flat);
+ if (!entry)
+ return;
+
+ new_entry = READ_ONCE(*entry);
+ new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+ new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
+ new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
+ WRITE_ONCE(*entry, new_entry);
+}
+
+static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool flat = svm->dfr_reg == APIC_DFR_FLAT;
+ u32 *entry;
+
+ /* Note: x2AVIC does not use logical APIC ID table */
+ if (apic_x2apic_mode(vcpu->arch.apic))
+ return;
+
+ entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
+ if (entry)
+ clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
+}
+
+static void avic_handle_ldr_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
+ u32 id = kvm_xapic_id(vcpu->arch.apic);
+
+ /* AVIC does not support LDR update for x2APIC */
+ if (apic_x2apic_mode(vcpu->arch.apic))
+ return;
+
+ if (ldr == svm->ldr_reg)
+ return;
+
+ avic_invalidate_logical_id_entry(vcpu);
+
+ svm->ldr_reg = ldr;
+ avic_ldr_write(vcpu, id, ldr);
+}
+
+static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
+
+ if (svm->dfr_reg == dfr)
+ return;
+
+ avic_invalidate_logical_id_entry(vcpu);
+ svm->dfr_reg = dfr;
+}
+
+static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
+{
+ u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 &
+ AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+
+ switch (offset) {
+ case APIC_LDR:
+ avic_handle_ldr_update(vcpu);
+ break;
+ case APIC_DFR:
+ avic_handle_dfr_update(vcpu);
+ break;
+ case APIC_RRR:
+ /* Ignore writes to Read Remote Data, it's read-only. */
+ return 1;
+ default:
+ break;
+ }
+
+ kvm_apic_write_nodecode(vcpu, offset);
+ return 1;
+}
+
+static bool is_avic_unaccelerated_access_trap(u32 offset)
+{
+ bool ret = false;
+
+ switch (offset) {
+ case APIC_ID:
+ case APIC_EOI:
+ case APIC_RRR:
+ case APIC_LDR:
+ case APIC_DFR:
+ case APIC_SPIV:
+ case APIC_ESR:
+ case APIC_ICR:
+ case APIC_LVTT:
+ case APIC_LVTTHMR:
+ case APIC_LVTPC:
+ case APIC_LVT0:
+ case APIC_LVT1:
+ case APIC_LVTERR:
+ case APIC_TMICT:
+ case APIC_TDCR:
+ ret = true;
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret = 0;
+ u32 offset = svm->vmcb->control.exit_info_1 &
+ AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+ u32 vector = svm->vmcb->control.exit_info_2 &
+ AVIC_UNACCEL_ACCESS_VECTOR_MASK;
+ bool write = (svm->vmcb->control.exit_info_1 >> 32) &
+ AVIC_UNACCEL_ACCESS_WRITE_MASK;
+ bool trap = is_avic_unaccelerated_access_trap(offset);
+
+ trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
+ trap, write, vector);
+ if (trap) {
+ /* Handling Trap */
+ WARN_ONCE(!write, "svm: Handling trap read.\n");
+ ret = avic_unaccel_trap_write(vcpu);
+ } else {
+ /* Handling Fault */
+ ret = kvm_emulate_instruction(vcpu, 0);
+ }
+
+ return ret;
+}
+
+int avic_init_vcpu(struct vcpu_svm *svm)
+{
+ int ret;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
+ return 0;
+
+ ret = avic_init_backing_page(vcpu);
+ if (ret)
+ return ret;
+
+ INIT_LIST_HEAD(&svm->ir_list);
+ spin_lock_init(&svm->ir_list_lock);
+ svm->dfr_reg = APIC_DFR_FLAT;
+
+ return ret;
+}
+
+void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
+{
+ avic_handle_dfr_update(vcpu);
+ avic_handle_ldr_update(vcpu);
+}
+
+static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
+{
+ int ret = 0;
+ unsigned long flags;
+ struct amd_svm_iommu_ir *ir;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm))
+ return 0;
+
+ /*
+ * Here, we go through the per-vcpu ir_list to update all existing
+ * interrupt remapping table entry targeting this vcpu.
+ */
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+
+ if (list_empty(&svm->ir_list))
+ goto out;
+
+ list_for_each_entry(ir, &svm->ir_list, node) {
+ if (activate)
+ ret = amd_iommu_activate_guest_mode(ir->data);
+ else
+ ret = amd_iommu_deactivate_guest_mode(ir->data);
+ if (ret)
+ break;
+ }
+out:
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+ return ret;
+}
+
+static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+ unsigned long flags;
+ struct amd_svm_iommu_ir *cur;
+
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+ list_for_each_entry(cur, &svm->ir_list, node) {
+ if (cur->data != pi->ir_data)
+ continue;
+ list_del(&cur->node);
+ kfree(cur);
+ break;
+ }
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+ int ret = 0;
+ unsigned long flags;
+ struct amd_svm_iommu_ir *ir;
+ u64 entry;
+
+	/*
+	 * In some cases, the existing IRTE is updated and re-set, so we
+	 * need to check here if it's already been added to the ir_list.
+	 */
+ if (pi->ir_data && (pi->prev_ga_tag != 0)) {
+ struct kvm *kvm = svm->vcpu.kvm;
+ u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
+ struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+ struct vcpu_svm *prev_svm;
+
+ if (!prev_vcpu) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ prev_svm = to_svm(prev_vcpu);
+ svm_ir_list_del(prev_svm, pi);
+ }
+
+	/*
+	 * Allocate a new amd_svm_iommu_ir entry, which will be added to
+	 * the per-vCPU ir_list.
+	 */
+ ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
+ if (!ir) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ir->data = pi->ir_data;
+
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+
+ /*
+ * Update the target pCPU for IOMMU doorbells if the vCPU is running.
+ * If the vCPU is NOT running, i.e. is blocking or scheduled out, KVM
+	 * will update the pCPU info when the vCPU is awakened and/or scheduled in.
+ * See also avic_vcpu_load().
+ */
+ entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
+ amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK,
+ true, pi->ir_data);
+
+ list_add(&ir->node, &svm->ir_list);
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+out:
+ return ret;
+}
+
+/*
+ * Note:
+ * The HW cannot support posting multicast/broadcast
+ * interrupts to a vCPU, so we still use legacy interrupt
+ * remapping for these kinds of interrupts.
+ *
+ * For lowest-priority interrupts, we only support
+ * those with a single CPU as the destination, e.g. the user
+ * configures the interrupts via /proc/irq or uses
+ * irqbalance to make the interrupts single-CPU.
+ */
+static int
+get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
+ struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
+{
+ struct kvm_lapic_irq irq;
+ struct kvm_vcpu *vcpu = NULL;
+
+ kvm_set_msi_irq(kvm, e, &irq);
+
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
+ !kvm_irq_is_postable(&irq)) {
+ pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
+ __func__, irq.vector);
+ return -1;
+ }
+
+ pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
+ irq.vector);
+ *svm = to_svm(vcpu);
+ vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
+ vcpu_info->vector = irq.vector;
+
+ return 0;
+}
+
+/*
+ * avic_pi_update_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ struct kvm_kernel_irq_routing_entry *e;
+ struct kvm_irq_routing_table *irq_rt;
+ int idx, ret = 0;
+
+ if (!kvm_arch_has_assigned_device(kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return 0;
+
+ pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
+ __func__, host_irq, guest_irq, set);
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+
+ if (guest_irq >= irq_rt->nr_rt_entries ||
+ hlist_empty(&irq_rt->map[guest_irq])) {
+ pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
+ guest_irq, irq_rt->nr_rt_entries);
+ goto out;
+ }
+
+ hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+ struct vcpu_data vcpu_info;
+ struct vcpu_svm *svm = NULL;
+
+ if (e->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+
+		/*
+		 * Here, we set up legacy mode in the following cases:
+		 * 1. The interrupt cannot be targeted at a specific vcpu.
+		 * 2. The posted interrupt is being unset.
+		 * 3. APIC virtualization is disabled for the vcpu.
+		 * 4. The IRQ has an incompatible delivery mode (SMI, INIT, etc.).
+		 */
+ if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
+ kvm_vcpu_apicv_active(&svm->vcpu)) {
+ struct amd_iommu_pi_data pi;
+
+ /* Try to enable guest_mode in IRTE */
+ pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
+ AVIC_HPA_MASK);
+ pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
+ svm->vcpu.vcpu_id);
+ pi.is_guest_mode = true;
+ pi.vcpu_data = &vcpu_info;
+ ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+			/*
+			 * Here, we have successfully set up vcpu affinity in
+			 * IOMMU guest mode. Now, store the posted interrupt
+			 * information in the per-vcpu ir_list so that it can
+			 * be referenced directly when updating vcpu scheduling
+			 * information in the IOMMU irte.
+			 */
+ if (!ret && pi.is_guest_mode)
+ svm_ir_list_add(svm, &pi);
+ } else {
+ /* Use legacy mode in IRTE */
+ struct amd_iommu_pi_data pi;
+
+			/*
+			 * Here, pi is used to:
+			 * - Tell the IOMMU to use legacy mode for this interrupt.
+			 * - Retrieve the ga_tag of the prior interrupt remapping data.
+			 */
+ pi.prev_ga_tag = 0;
+ pi.is_guest_mode = false;
+ ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+			/*
+			 * Check if the posted interrupt was previously set up
+			 * in guest mode by checking if the ga_tag was cached.
+			 * If so, we need to clean up the per-vcpu ir_list.
+			 */
+ if (!ret && pi.prev_ga_tag) {
+ int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
+ struct kvm_vcpu *vcpu;
+
+ vcpu = kvm_get_vcpu_by_id(kvm, id);
+ if (vcpu)
+ svm_ir_list_del(to_svm(vcpu), &pi);
+ }
+ }
+
+ if (!ret && svm) {
+ trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
+ e->gsi, vcpu_info.vector,
+ vcpu_info.pi_desc_addr, set);
+ }
+
+ if (ret < 0) {
+ pr_err("%s: failed to update PI IRTE\n", __func__);
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+ return ret;
+}
+
+static inline int
+avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
+{
+ int ret = 0;
+ struct amd_svm_iommu_ir *ir;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ lockdep_assert_held(&svm->ir_list_lock);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm))
+ return 0;
+
+ /*
+ * Here, we go through the per-vcpu ir_list to update all existing
+ * interrupt remapping table entry targeting this vcpu.
+ */
+ if (list_empty(&svm->ir_list))
+ return 0;
+
+ list_for_each_entry(ir, &svm->ir_list, node) {
+ ret = amd_iommu_update_ga(cpu, r, ir->data);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ u64 entry;
+ int h_physical_id = kvm_cpu_get_apicid(cpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long flags;
+
+ lockdep_assert_preemption_disabled();
+
+ if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
+ return;
+
+ /*
+ * No need to update anything if the vCPU is blocking, i.e. if the vCPU
+ * is being scheduled in after being preempted. The CPU entries in the
+ * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
+ * If the vCPU was migrated, its new CPU value will be stuffed when the
+ * vCPU unblocks.
+ */
+ if (kvm_vcpu_is_blocking(vcpu))
+ return;
+
+ /*
+ * Grab the per-vCPU interrupt remapping lock even if the VM doesn't
+ * _currently_ have assigned devices, as that can change. Holding
+ * ir_list_lock ensures that either svm_ir_list_add() will consume
+ * up-to-date entry information, or that this task will wait until
+ * svm_ir_list_add() completes to set the new target pCPU.
+ */
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+
+ entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+ entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
+ entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+ WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+ avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
+
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+void avic_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ u64 entry;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long flags;
+
+ lockdep_assert_preemption_disabled();
+
+ /*
+ * Note, reading the Physical ID entry outside of ir_list_lock is safe
+ * as only the pCPU that has loaded (or is loading) the vCPU is allowed
+ * to modify the entry, and preemption is disabled. I.e. the vCPU
+ * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
+ * recursively.
+ */
+ entry = READ_ONCE(*(svm->avic_physical_id_cache));
+
+ /* Nothing to do if IsRunning == '0' due to vCPU blocking. */
+ if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
+ return;
+
+ /*
+ * Take and hold the per-vCPU interrupt remapping lock while updating
+ * the Physical ID entry even though the lock doesn't protect against
+ * multiple writers (see above). Holding ir_list_lock ensures that
+ * either svm_ir_list_add() will consume up-to-date entry information,
+ * or that this task will wait until svm_ir_list_add() completes to
+ * mark the vCPU as not running.
+ */
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+
+ avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ if (!lapic_in_kernel(vcpu) || !enable_apicv)
+ return;
+
+ if (kvm_vcpu_apicv_active(vcpu)) {
+		/*
+		 * During AVIC temporary deactivation, the guest may update the
+		 * APIC ID, DFR and LDR registers, which would not be trapped
+ * by avic_unaccelerated_access_interception(). In this case,
+ * we need to check and update the AVIC logical APIC ID table
+ * accordingly before re-activating.
+ */
+ avic_apicv_post_state_restore(vcpu);
+ avic_activate_vmcb(svm);
+ } else {
+ avic_deactivate_vmcb(svm);
+ }
+ vmcb_mark_dirty(vmcb, VMCB_AVIC);
+}
+
+void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+ bool activated = kvm_vcpu_apicv_active(vcpu);
+
+ if (!enable_apicv)
+ return;
+
+ avic_refresh_virtual_apic_mode(vcpu);
+
+ if (activated)
+ avic_vcpu_load(vcpu, vcpu->cpu);
+ else
+ avic_vcpu_put(vcpu);
+
+ avic_set_pi_irte_mode(vcpu, activated);
+}
+
+void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ /*
+ * Unload the AVIC when the vCPU is about to block, _before_
+ * the vCPU actually blocks.
+ *
+ * Any IRQs that arrive before IsRunning=0 will not cause an
+ * incomplete IPI vmexit on the source, therefore vIRR will also
+ * be checked by kvm_vcpu_check_block() before blocking. The
+ * memory barrier implicit in set_current_state orders writing
+ * IsRunning=0 before reading the vIRR. The processor needs a
+ * matching memory barrier on interrupt delivery between writing
+ * IRR and reading IsRunning; the lack of this barrier might be
+	 * the cause of erratum #1235.
+ */
+ avic_vcpu_put(vcpu);
+}
+
+void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ avic_vcpu_load(vcpu, vcpu->cpu);
+}
+
+/*
+ * Note:
+ * - The module param avic enables both xAPIC and x2APIC modes.
+ * - The hypervisor can support both xAVIC and x2AVIC in the same guest.
+ * - The mode can be switched at run-time.
+ */
+bool avic_hardware_setup(void)
+{
+ if (!npt_enabled)
+ return false;
+
+ /* AVIC is a prerequisite for x2AVIC. */
+ if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) {
+ if (boot_cpu_has(X86_FEATURE_X2AVIC)) {
+ pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled");
+ pr_warn(FW_BUG "Try enable AVIC using force_avic option");
+ }
+ return false;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_AVIC)) {
+ pr_info("AVIC enabled\n");
+ } else if (force_avic) {
+ /*
+		 * Some older systems do not advertise AVIC support.
+		 * See the Revision Guide for the specific AMD processor for more details.
+ */
+ pr_warn("AVIC is not supported in CPUID but force enabled");
+ pr_warn("Your system might crash and burn");
+ }
+
+ /* AVIC is a prerequisite for x2AVIC. */
+ x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
+ if (x2avic_enabled)
+ pr_info("x2AVIC enabled\n");
+
+ amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+
+ return true;
+}
diff --git a/arch/x86/kvm/svm/hyperv.c b/arch/x86/kvm/svm/hyperv.c
new file mode 100644
index 000000000..088f6429b
--- /dev/null
+++ b/arch/x86/kvm/svm/hyperv.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD SVM specific code for Hyper-V on KVM.
+ *
+ * Copyright 2022 Red Hat, Inc. and/or its affiliates.
+ */
+#include "hyperv.h"
+
+void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.exit_code = HV_SVM_EXITCODE_ENL;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH;
+ svm->vmcb->control.exit_info_2 = 0;
+ nested_svm_vmexit(svm);
+}
diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h
new file mode 100644
index 000000000..02f4784b5
--- /dev/null
+++ b/arch/x86/kvm/svm/hyperv.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Common Hyper-V on KVM and KVM on Hyper-V definitions (SVM).
+ */
+
+#ifndef __ARCH_X86_KVM_SVM_HYPERV_H__
+#define __ARCH_X86_KVM_SVM_HYPERV_H__
+
+#include <asm/mshyperv.h>
+
+#include "../hyperv.h"
+#include "svm.h"
+
+static inline void nested_svm_hv_update_vm_vp_ids(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu)
+ return;
+
+ hv_vcpu->nested.pa_page_gpa = hve->partition_assist_page;
+ hv_vcpu->nested.vm_id = hve->hv_vm_id;
+ hv_vcpu->nested.vp_id = hve->hv_vp_id;
+}
+
+static inline bool nested_svm_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu)
+ return false;
+
+ if (!hve->hv_enlightenments_control.nested_flush_hypercall)
+ return false;
+
+ return hv_vcpu->vp_assist_page.nested_control.features.directhypercall;
+}
+
+void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu);
+
+#endif /* __ARCH_X86_KVM_SVM_HYPERV_H__ */
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
new file mode 100644
index 000000000..60891b9ce
--- /dev/null
+++ b/arch/x86/kvm/svm/nested.c
@@ -0,0 +1,1818 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+#include <linux/kernel.h>
+
+#include <asm/msr-index.h>
+#include <asm/debugreg.h>
+
+#include "kvm_emulate.h"
+#include "trace.h"
+#include "mmu.h"
+#include "x86.h"
+#include "smm.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "svm.h"
+#include "hyperv.h"
+
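+/*
+ * CC wraps a nested VM-Enter consistency check: it evaluates the condition
+ * and, if the check fails, reports the failed check (e.g. via tracepoint)
+ * before returning the result.
+ */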
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
+
+static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (vmcb->control.exit_code != SVM_EXIT_NPF) {
+ /*
+ * TODO: track the cause of the nested page fault, and
+ * correctly fill in the high bits of exit_info_1.
+ */
+ vmcb->control.exit_code = SVM_EXIT_NPF;
+ vmcb->control.exit_code_hi = 0;
+ vmcb->control.exit_info_1 = (1ULL << 32);
+ vmcb->control.exit_info_2 = fault->address;
+ }
+
+ vmcb->control.exit_info_1 &= ~0xffffffffULL;
+ vmcb->control.exit_info_1 |= fault->error_code;
+
+ nested_svm_vmexit(svm);
+}
+
+static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 cr3 = svm->nested.ctl.nested_cr3;
+ u64 pdpte;
+ int ret;
+
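+	/*
+	 * Read the index'th 8-byte PDPTE from the PAE PDPT referenced by the
+	 * guest's nested CR3 (which need not be page aligned).
+	 */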
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
+ offset_in_page(cr3) + index * 8, 8);
+ if (ret)
+ return 0;
+ return pdpte;
+}
+
+static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return svm->nested.ctl.nested_cr3;
+}
+
+static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ WARN_ON(mmu_is_nested(vcpu));
+
+ vcpu->arch.mmu = &vcpu->arch.guest_mmu;
+
+ /*
+	 * The NPT format depends on L1's CR4 and EFER, which are in vmcb01. Note,
+ * when called via KVM_SET_NESTED_STATE, that state may _not_ match current
+ * vCPU state. CR0.WP is explicitly ignored, while CR0.PG is required.
+ */
+ kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4,
+ svm->vmcb01.ptr->save.efer,
+ svm->nested.ctl.nested_cr3);
+ vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
+ vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
+ vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
+ vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
+}
+
+static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.mmu = &vcpu->arch.root_mmu;
+ vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
+}
+
+static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
+{
+ if (!guest_can_use(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD))
+ return true;
+
+ if (!nested_npt_enabled(svm))
+ return true;
+
+ if (!(svm->nested.ctl.virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK))
+ return true;
+
+ return false;
+}
+
+void recalc_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *c, *h;
+ struct vmcb_ctrl_area_cached *g;
+ unsigned int i;
+
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ if (!is_guest_mode(&svm->vcpu))
+ return;
+
+ c = &svm->vmcb->control;
+ h = &svm->vmcb01.ptr->control;
+ g = &svm->nested.ctl;
+
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ c->intercepts[i] = h->intercepts[i];
+
+ if (g->int_ctl & V_INTR_MASKING_MASK) {
+ /*
+ * If L2 is active and V_INTR_MASKING is enabled in vmcb12,
+ * disable intercept of CR8 writes as L2's CR8 does not affect
+ * any interrupt KVM may want to inject.
+ *
+ * Similarly, disable intercept of virtual interrupts (used to
+ * detect interrupt windows) if the saved RFLAGS.IF is '0', as
+ * the effective RFLAGS.IF for L1 interrupts will never be set
+ * while L2 is running (L2's RFLAGS.IF doesn't affect L1 IRQs).
+ */
+ vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
+ if (!(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF))
+ vmcb_clr_intercept(c, INTERCEPT_VINTR);
+ }
+
+ /*
+ * We want to see VMMCALLs from a nested guest only when Hyper-V L2 TLB
+ * flush feature is enabled.
+ */
+ if (!nested_svm_l2_tlb_flush_enabled(&svm->vcpu))
+ vmcb_clr_intercept(c, INTERCEPT_VMMCALL);
+
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ c->intercepts[i] |= g->intercepts[i];
+
+ /* If SMI is not intercepted, ignore guest SMI intercept as well */
+ if (!intercept_smi)
+ vmcb_clr_intercept(c, INTERCEPT_SMI);
+
+ if (nested_vmcb_needs_vls_intercept(svm)) {
+ /*
+ * If the virtual VMLOAD/VMSAVE is not enabled for the L2,
+ * we must intercept these instructions to correctly
+ * emulate them in case L1 doesn't intercept them.
+ */
+ vmcb_set_intercept(c, INTERCEPT_VMLOAD);
+ vmcb_set_intercept(c, INTERCEPT_VMSAVE);
+ } else {
+ WARN_ON(!(c->virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK));
+ }
+}
+
+/*
+ * Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function
+ * is optimized in that it only merges the parts where the KVM MSR permission bitmap
+ * may contain zero bits.
+ */
+static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
+{
+ struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
+ int i;
+
+ /*
+ * MSR bitmap update can be skipped when:
+ * - MSR bitmap for L1 hasn't changed.
+ * - Nested hypervisor (L1) is attempting to launch the same L2 as
+ * before.
+ * - Nested hypervisor (L1) is using Hyper-V emulation interface and
+ * tells KVM (L0) there were no changes in MSR bitmap for L2.
+ */
+ if (!svm->nested.force_msr_bitmap_recalc &&
+ kvm_hv_hypercall_enabled(&svm->vcpu) &&
+ hve->hv_enlightenments_control.msr_bitmap &&
+ (svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS)))
+ goto set_msrpm_base_pa;
+
+ if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
+ return true;
+
+ for (i = 0; i < MSRPM_OFFSETS; i++) {
+ u32 value, p;
+ u64 offset;
+
+ if (msrpm_offsets[i] == 0xffffffff)
+ break;
+
+ p = msrpm_offsets[i];
+
+ /* x2apic msrs are intercepted always for the nested guest */
+ if (is_x2apic_msrpm_offset(p))
+ continue;
+
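+		/* msrpm_offsets[] values are in u32 units; scale to a byte offset. */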
+ offset = svm->nested.ctl.msrpm_base_pa + (p * 4);
+
+ if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
+ return false;
+
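+		/* A set bit means "intercept"; OR'ing yields the union of L0's and L1's intercepts. */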
+ svm->nested.msrpm[p] = svm->msrpm[p] | value;
+ }
+
+ svm->nested.force_msr_bitmap_recalc = false;
+
+set_msrpm_base_pa:
+ svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
+
+ return true;
+}
+
+/*
+ * Bits 11:0 of the bitmap address are ignored by hardware.
+ */
+static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size)
+{
+ u64 addr = PAGE_ALIGN(pa);
+
+ return kvm_vcpu_is_legal_gpa(vcpu, addr) &&
+ kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1);
+}
+
+static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
+ struct vmcb_ctrl_area_cached *control)
+{
+ if (CC(!vmcb12_is_intercept(control, INTERCEPT_VMRUN)))
+ return false;
+
+ if (CC(control->asid == 0))
+ return false;
+
+ if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled))
+ return false;
+
+ if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa,
+ MSRPM_SIZE)))
+ return false;
+ if (CC(!nested_svm_check_bitmap_pa(vcpu, control->iopm_base_pa,
+ IOPM_SIZE)))
+ return false;
+
+ if (CC((control->int_ctl & V_NMI_ENABLE_MASK) &&
+ !vmcb12_is_intercept(control, INTERCEPT_NMI))) {
+ return false;
+ }
+
+ return true;
+}
+
+/* Common checks that apply to both L1 and L2 state. */
+static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu,
+ struct vmcb_save_area_cached *save)
+{
+ if (CC(!(save->efer & EFER_SVME)))
+ return false;
+
+ if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) ||
+ CC(save->cr0 & ~0xffffffffULL))
+ return false;
+
+ if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7)))
+ return false;
+
+ /*
+ * These checks are also performed by KVM_SET_SREGS,
+ * except that EFER.LMA is not checked by SVM against
+ * CR0.PG && EFER.LME.
+ */
+ if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) {
+ if (CC(!(save->cr4 & X86_CR4_PAE)) ||
+ CC(!(save->cr0 & X86_CR0_PE)) ||
+ CC(kvm_vcpu_is_illegal_gpa(vcpu, save->cr3)))
+ return false;
+ }
+
+ /* Note, SVM doesn't have any additional restrictions on CR4. */
+ if (CC(!__kvm_is_valid_cr4(vcpu, save->cr4)))
+ return false;
+
+ if (CC(!kvm_valid_efer(vcpu, save->efer)))
+ return false;
+
+ return true;
+}
+
+static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_save_area_cached *save = &svm->nested.save;
+
+ return __nested_vmcb_check_save(vcpu, save);
+}
+
+static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_ctrl_area_cached *ctl = &svm->nested.ctl;
+
+ return __nested_vmcb_check_controls(vcpu, ctl);
+}
+
+static
+void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
+ struct vmcb_ctrl_area_cached *to,
+ struct vmcb_control_area *from)
+{
+ unsigned int i;
+
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ to->intercepts[i] = from->intercepts[i];
+
+ to->iopm_base_pa = from->iopm_base_pa;
+ to->msrpm_base_pa = from->msrpm_base_pa;
+ to->tsc_offset = from->tsc_offset;
+ to->tlb_ctl = from->tlb_ctl;
+ to->int_ctl = from->int_ctl;
+ to->int_vector = from->int_vector;
+ to->int_state = from->int_state;
+ to->exit_code = from->exit_code;
+ to->exit_code_hi = from->exit_code_hi;
+ to->exit_info_1 = from->exit_info_1;
+ to->exit_info_2 = from->exit_info_2;
+ to->exit_int_info = from->exit_int_info;
+ to->exit_int_info_err = from->exit_int_info_err;
+ to->nested_ctl = from->nested_ctl;
+ to->event_inj = from->event_inj;
+ to->event_inj_err = from->event_inj_err;
+ to->next_rip = from->next_rip;
+ to->nested_cr3 = from->nested_cr3;
+ to->virt_ext = from->virt_ext;
+ to->pause_filter_count = from->pause_filter_count;
+ to->pause_filter_thresh = from->pause_filter_thresh;
+
+ /* Copy asid here because nested_vmcb_check_controls will check it. */
+ to->asid = from->asid;
+ to->msrpm_base_pa &= ~0x0fffULL;
+ to->iopm_base_pa &= ~0x0fffULL;
+
+ /* Hyper-V extensions (Enlightened VMCB) */
+ if (kvm_hv_hypercall_enabled(vcpu)) {
+ to->clean = from->clean;
+ memcpy(&to->hv_enlightenments, &from->hv_enlightenments,
+ sizeof(to->hv_enlightenments));
+ }
+}
+
+void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
+ struct vmcb_control_area *control)
+{
+ __nested_copy_vmcb_control_to_cache(&svm->vcpu, &svm->nested.ctl, control);
+}
+
+static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to,
+ struct vmcb_save_area *from)
+{
+ /*
+ * Copy only fields that are validated, as we need them
+ * to avoid TOC/TOU races.
+ */
+ to->efer = from->efer;
+ to->cr0 = from->cr0;
+ to->cr3 = from->cr3;
+ to->cr4 = from->cr4;
+
+ to->dr6 = from->dr6;
+ to->dr7 = from->dr7;
+}
+
+void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
+ struct vmcb_save_area *save)
+{
+ __nested_copy_vmcb_save_to_cache(&svm->nested.save, save);
+}
+
+/*
+ * Synchronize fields that are written by the processor, so that
+ * they can be copied back into the vmcb12.
+ */
+void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
+{
+	u32 mask;
+
+ svm->nested.ctl.event_inj = svm->vmcb->control.event_inj;
+ svm->nested.ctl.event_inj_err = svm->vmcb->control.event_inj_err;
+
+ /* Only a few fields of int_ctl are written by the processor. */
+ mask = V_IRQ_MASK | V_TPR_MASK;
+ /*
+ * Don't sync vmcb02 V_IRQ back to vmcb12 if KVM (L0) is intercepting
+ * virtual interrupts in order to request an interrupt window, as KVM
+ * has usurped vmcb02's int_ctl. If an interrupt window opens before
+ * the next VM-Exit, svm_clear_vintr() will restore vmcb12's int_ctl.
+ * If no window opens, V_IRQ will be correctly preserved in vmcb12's
+ * int_ctl (because it was never recognized while L2 was running).
+ */
+ if (svm_is_intercept(svm, INTERCEPT_VINTR) &&
+ !test_bit(INTERCEPT_VINTR, (unsigned long *)svm->nested.ctl.intercepts))
+ mask &= ~V_IRQ_MASK;
+
+ if (nested_vgif_enabled(svm))
+ mask |= V_GIF_MASK;
+
+ if (nested_vnmi_enabled(svm))
+ mask |= V_NMI_BLOCKING_MASK | V_NMI_PENDING_MASK;
+
+ svm->nested.ctl.int_ctl &= ~mask;
+ svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask;
+}
+
+/*
+ * Transfer any event that L0 or L1 wanted to inject into L2 to
+ * EXIT_INT_INFO.
+ */
+static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
+ struct vmcb *vmcb12)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u32 exit_int_info = 0;
+ unsigned int nr;
+
+ if (vcpu->arch.exception.injected) {
+ nr = vcpu->arch.exception.vector;
+ exit_int_info = nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT;
+
+ if (vcpu->arch.exception.has_error_code) {
+ exit_int_info |= SVM_EVTINJ_VALID_ERR;
+ vmcb12->control.exit_int_info_err =
+ vcpu->arch.exception.error_code;
+ }
+
+ } else if (vcpu->arch.nmi_injected) {
+ exit_int_info = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
+
+ } else if (vcpu->arch.interrupt.injected) {
+ nr = vcpu->arch.interrupt.nr;
+ exit_int_info = nr | SVM_EVTINJ_VALID;
+
+ if (vcpu->arch.interrupt.soft)
+ exit_int_info |= SVM_EVTINJ_TYPE_SOFT;
+ else
+ exit_int_info |= SVM_EVTINJ_TYPE_INTR;
+ }
+
+ vmcb12->control.exit_int_info = exit_int_info;
+}
+
+static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ /*
+	 * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or
+	 * L2's VP_ID upon request from the guest. Make sure we check for
+	 * pending entries in the right FIFO upon L1/L2 transition, as these
+	 * requests are posted by other vCPUs asynchronously.
+	 */
+ if (to_hv_vcpu(vcpu) && npt_enabled)
+ kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu);
+
+ /*
+ * TODO: optimize unconditional TLB flush/MMU sync. A partial list of
+ * things to fix before this can be conditional:
+ *
+ * - Flush TLBs for both L1 and L2 remote TLB flush
+ * - Honor L1's request to flush an ASID on nested VMRUN
+ * - Sync nested NPT MMU on VMRUN that flushes L2's ASID[*]
+ * - Don't crush a pending TLB flush in vmcb02 on nested VMRUN
+ * - Flush L1's ASID on KVM_REQ_TLB_FLUSH_GUEST
+ *
+ * [*] Unlike nested EPT, SVM's ASID management can invalidate nested
+ * NPT guest-physical mappings on VMRUN.
+ */
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+}
+
+/*
+ * Load guest's/host's cr3 on nested vmentry or vmexit. @nested_npt is true
+ * if we are emulating VM-Entry into a guest with NPT enabled.
+ */
+static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
+ bool nested_npt, bool reload_pdptrs)
+{
+ if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3)))
+ return -EINVAL;
+
+ if (reload_pdptrs && !nested_npt && is_pae_paging(vcpu) &&
+ CC(!load_pdptrs(vcpu, cr3)))
+ return -EINVAL;
+
+ vcpu->arch.cr3 = cr3;
+
+ /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
+ kvm_init_mmu(vcpu);
+
+ if (!nested_npt)
+ kvm_mmu_new_pgd(vcpu, cr3);
+
+ return 0;
+}
+
+void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
+{
+ if (!svm->nested.vmcb02.ptr)
+ return;
+
+ /* FIXME: merge g_pat from vmcb01 and vmcb12. */
+ svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat;
+}
+
+static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+{
+ bool new_vmcb12 = false;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ nested_vmcb02_compute_g_pat(svm);
+
+ /* Load the nested guest state */
+ if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
+ new_vmcb12 = true;
+ svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa;
+ svm->nested.force_msr_bitmap_recalc = true;
+ }
+
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
+ vmcb02->save.es = vmcb12->save.es;
+ vmcb02->save.cs = vmcb12->save.cs;
+ vmcb02->save.ss = vmcb12->save.ss;
+ vmcb02->save.ds = vmcb12->save.ds;
+ vmcb02->save.cpl = vmcb12->save.cpl;
+ vmcb_mark_dirty(vmcb02, VMCB_SEG);
+ }
+
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
+ vmcb02->save.gdtr = vmcb12->save.gdtr;
+ vmcb02->save.idtr = vmcb12->save.idtr;
+ vmcb_mark_dirty(vmcb02, VMCB_DT);
+ }
+
+ kvm_set_rflags(vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
+
+ svm_set_efer(vcpu, svm->nested.save.efer);
+
+ svm_set_cr0(vcpu, svm->nested.save.cr0);
+ svm_set_cr4(vcpu, svm->nested.save.cr4);
+
+ svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+
+ kvm_rax_write(vcpu, vmcb12->save.rax);
+ kvm_rsp_write(vcpu, vmcb12->save.rsp);
+ kvm_rip_write(vcpu, vmcb12->save.rip);
+
+ /* In case we don't even reach vcpu_run, the fields are not updated */
+ vmcb02->save.rax = vmcb12->save.rax;
+ vmcb02->save.rsp = vmcb12->save.rsp;
+ vmcb02->save.rip = vmcb12->save.rip;
+
+	/* These bits will be set properly on the first execution when new_vmcb12 is true */
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
+ vmcb02->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1;
+ svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW;
+ vmcb_mark_dirty(vmcb02, VMCB_DR);
+ }
+
+ if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+ /*
+ * Reserved bits of DEBUGCTL are ignored. Be consistent with
+ * svm_set_msr's definition of reserved bits.
+ */
+ svm_copy_lbrs(vmcb02, vmcb12);
+ vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS;
+ svm_update_lbrv(&svm->vcpu);
+
+ } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
+ svm_copy_lbrs(vmcb02, vmcb01);
+ }
+}
+
+static inline bool is_evtinj_soft(u32 evtinj)
+{
+ u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;
+ u8 vector = evtinj & SVM_EVTINJ_VEC_MASK;
+
+ if (!(evtinj & SVM_EVTINJ_VALID))
+ return false;
+
+ if (type == SVM_EVTINJ_TYPE_SOFT)
+ return true;
+
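+	/* #BP and #OF, generated by INT3 and INTO, are the soft exceptions. */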
+ return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector);
+}
+
+static bool is_evtinj_nmi(u32 evtinj)
+{
+ u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;
+
+ if (!(evtinj & SVM_EVTINJ_VALID))
+ return false;
+
+ return type == SVM_EVTINJ_TYPE_NMI;
+}
+
+static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
+ unsigned long vmcb12_rip,
+ unsigned long vmcb12_csbase)
+{
+ u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK;
+ u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
+
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+ u32 pause_count12;
+ u32 pause_thresh12;
+
+ /*
+ * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
+ * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
+ */
+
+ if (guest_can_use(vcpu, X86_FEATURE_VGIF) &&
+ (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
+ int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
+ else
+ int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
+
+ if (vnmi) {
+ if (vmcb01->control.int_ctl & V_NMI_PENDING_MASK) {
+ svm->vcpu.arch.nmi_pending++;
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ }
+ if (nested_vnmi_enabled(svm))
+ int_ctl_vmcb12_bits |= (V_NMI_PENDING_MASK |
+ V_NMI_ENABLE_MASK |
+ V_NMI_BLOCKING_MASK);
+ }
+
+ /* Copied from vmcb01. msrpm_base can be overwritten later. */
+ vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
+ vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
+ vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
+
+ /* Done at vmrun: asid. */
+
+ /* Also overwritten later if necessary. */
+ vmcb02->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
+ /* nested_cr3. */
+ if (nested_npt_enabled(svm))
+ nested_svm_init_mmu_context(vcpu);
+
+ vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
+ vcpu->arch.l1_tsc_offset,
+ svm->nested.ctl.tsc_offset,
+ svm->tsc_ratio_msr);
+
+ vmcb02->control.tsc_offset = vcpu->arch.tsc_offset;
+
+ if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) &&
+ svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio)
+ nested_svm_update_tsc_ratio_msr(vcpu);
+
+ vmcb02->control.int_ctl =
+ (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) |
+ (vmcb01->control.int_ctl & int_ctl_vmcb01_bits);
+
+ vmcb02->control.int_vector = svm->nested.ctl.int_vector;
+ vmcb02->control.int_state = svm->nested.ctl.int_state;
+ vmcb02->control.event_inj = svm->nested.ctl.event_inj;
+ vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err;
+
+ /*
+ * next_rip is consumed on VMRUN as the return address pushed on the
+ * stack for injected soft exceptions/interrupts. If nrips is exposed
+ * to L1, take it verbatim from vmcb12. If nrips is supported in
+ * hardware but not exposed to L1, stuff the actual L2 RIP to emulate
+ * what a nrips=0 CPU would do (L1 is responsible for advancing RIP
+ * prior to injecting the event).
+ */
+ if (guest_can_use(vcpu, X86_FEATURE_NRIPS))
+ vmcb02->control.next_rip = svm->nested.ctl.next_rip;
+ else if (boot_cpu_has(X86_FEATURE_NRIPS))
+ vmcb02->control.next_rip = vmcb12_rip;
+
+ svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj);
+ if (is_evtinj_soft(vmcb02->control.event_inj)) {
+ svm->soft_int_injected = true;
+ svm->soft_int_csbase = vmcb12_csbase;
+ svm->soft_int_old_rip = vmcb12_rip;
+ if (guest_can_use(vcpu, X86_FEATURE_NRIPS))
+ svm->soft_int_next_rip = svm->nested.ctl.next_rip;
+ else
+ svm->soft_int_next_rip = vmcb12_rip;
+ }
+
+ vmcb02->control.virt_ext = vmcb01->control.virt_ext &
+ LBR_CTL_ENABLE_MASK;
+ if (guest_can_use(vcpu, X86_FEATURE_LBRV))
+ vmcb02->control.virt_ext |=
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK);
+
+ if (!nested_vmcb_needs_vls_intercept(svm))
+ vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+
+ if (guest_can_use(vcpu, X86_FEATURE_PAUSEFILTER))
+ pause_count12 = svm->nested.ctl.pause_filter_count;
+ else
+ pause_count12 = 0;
+ if (guest_can_use(vcpu, X86_FEATURE_PFTHRESHOLD))
+ pause_thresh12 = svm->nested.ctl.pause_filter_thresh;
+ else
+ pause_thresh12 = 0;
+ if (kvm_pause_in_guest(svm->vcpu.kvm)) {
+ /* use guest values since host doesn't intercept PAUSE */
+ vmcb02->control.pause_filter_count = pause_count12;
+ vmcb02->control.pause_filter_thresh = pause_thresh12;
+
+ } else {
+ /* start from host values otherwise */
+ vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count;
+ vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh;
+
+ /* ... but ensure filtering is disabled if so requested. */
+ if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
+ if (!pause_count12)
+ vmcb02->control.pause_filter_count = 0;
+ if (!pause_thresh12)
+ vmcb02->control.pause_filter_thresh = 0;
+ }
+ }
+
+ nested_svm_transition_tlb_flush(vcpu);
+
+ /* Enter Guest-Mode */
+ enter_guest_mode(vcpu);
+
+ /*
+ * Merge guest and host intercepts - must be called with vcpu in
+ * guest-mode to take effect.
+ */
+ recalc_intercepts(svm);
+}
+
+static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+{
+ /*
+ * Some VMCB state is shared between L1 and L2 and thus has to be
+ * moved at the time of nested vmrun and vmexit.
+ *
+ * VMLOAD/VMSAVE state would also belong in this category, but KVM
+ * always performs VMLOAD and VMSAVE from the VMCB01.
+ */
+ to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl;
+}
+
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
+ struct vmcb *vmcb12, bool from_vmrun)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ trace_kvm_nested_vmenter(svm->vmcb->save.rip,
+ vmcb12_gpa,
+ vmcb12->save.rip,
+ vmcb12->control.int_ctl,
+ vmcb12->control.event_inj,
+ vmcb12->control.nested_ctl,
+ vmcb12->control.nested_cr3,
+ vmcb12->save.cr3,
+ KVM_ISA_SVM);
+
+ trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
+ vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
+ vmcb12->control.intercepts[INTERCEPT_EXCEPTION],
+ vmcb12->control.intercepts[INTERCEPT_WORD3],
+ vmcb12->control.intercepts[INTERCEPT_WORD4],
+ vmcb12->control.intercepts[INTERCEPT_WORD5]);
+
+ svm->nested.vmcb12_gpa = vmcb12_gpa;
+
+ WARN_ON(svm->vmcb == svm->nested.vmcb02.ptr);
+
+ nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr);
+
+ svm_switch_vmcb(svm, &svm->nested.vmcb02);
+ nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base);
+ nested_vmcb02_prepare_save(svm, vmcb12);
+
+ ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3,
+ nested_npt_enabled(svm), from_vmrun);
+ if (ret)
+ return ret;
+
+ if (!from_vmrun)
+ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+
+ svm_set_gif(svm, true);
+
+ if (kvm_vcpu_apicv_active(vcpu))
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+
+ nested_svm_hv_update_vm_vp_ids(vcpu);
+
+ return 0;
+}
+
+int nested_svm_vmrun(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+ struct vmcb *vmcb12;
+ struct kvm_host_map map;
+ u64 vmcb12_gpa;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+
+ if (!svm->nested.hsave_msr) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ if (is_smm(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+	/* This fails when the VP assist page is enabled but the supplied GPA is bogus */
+ ret = kvm_hv_verify_vp_assist(vcpu);
+ if (ret) {
+ kvm_inject_gp(vcpu, 0);
+ return ret;
+ }
+
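+	/* VMRUN takes the physical address of the VMCB12 in RAX. */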
+ vmcb12_gpa = svm->vmcb->save.rax;
+ ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map);
+ if (ret == -EINVAL) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ } else if (ret) {
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ ret = kvm_skip_emulated_instruction(vcpu);
+
+ vmcb12 = map.hva;
+
+ if (WARN_ON_ONCE(!svm->nested.initialized))
+ return -EINVAL;
+
+ nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
+ nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
+
+ if (!nested_vmcb_check_save(vcpu) ||
+ !nested_vmcb_check_controls(vcpu)) {
+ vmcb12->control.exit_code = SVM_EXIT_ERR;
+ vmcb12->control.exit_code_hi = 0;
+ vmcb12->control.exit_info_1 = 0;
+ vmcb12->control.exit_info_2 = 0;
+ goto out;
+ }
+
+ /*
+ * Since vmcb01 is not in use, we can use it to store some of the L1
+ * state.
+ */
+ vmcb01->save.efer = vcpu->arch.efer;
+ vmcb01->save.cr0 = kvm_read_cr0(vcpu);
+ vmcb01->save.cr4 = vcpu->arch.cr4;
+ vmcb01->save.rflags = kvm_get_rflags(vcpu);
+ vmcb01->save.rip = kvm_rip_read(vcpu);
+
+ if (!npt_enabled)
+ vmcb01->save.cr3 = kvm_read_cr3(vcpu);
+
+ svm->nested.nested_run_pending = 1;
+
+ if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true))
+ goto out_exit_err;
+
+ if (nested_svm_vmrun_msrpm(svm))
+ goto out;
+
+out_exit_err:
+ svm->nested.nested_run_pending = 0;
+ svm->nmi_l1_to_l2 = false;
+ svm->soft_int_injected = false;
+
+ svm->vmcb->control.exit_code = SVM_EXIT_ERR;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+
+ nested_svm_vmexit(svm);
+
+out:
+ kvm_vcpu_unmap(vcpu, &map, true);
+
+ return ret;
+}
+
+/* Copy state save area fields which are handled by VMRUN */
+void svm_copy_vmrun_state(struct vmcb_save_area *to_save,
+ struct vmcb_save_area *from_save)
+{
+ to_save->es = from_save->es;
+ to_save->cs = from_save->cs;
+ to_save->ss = from_save->ss;
+ to_save->ds = from_save->ds;
+ to_save->gdtr = from_save->gdtr;
+ to_save->idtr = from_save->idtr;
+ to_save->rflags = from_save->rflags | X86_EFLAGS_FIXED;
+ to_save->efer = from_save->efer;
+ to_save->cr0 = from_save->cr0;
+ to_save->cr3 = from_save->cr3;
+ to_save->cr4 = from_save->cr4;
+ to_save->rax = from_save->rax;
+ to_save->rsp = from_save->rsp;
+ to_save->rip = from_save->rip;
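+	/* VMRUN is only legal at CPL 0, so the saved host CPL is always 0. */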
+ to_save->cpl = 0;
+}
+
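+/* Copy state save area fields which are handled by VMLOAD/VMSAVE. */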
+void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
+{
+ to_vmcb->save.fs = from_vmcb->save.fs;
+ to_vmcb->save.gs = from_vmcb->save.gs;
+ to_vmcb->save.tr = from_vmcb->save.tr;
+ to_vmcb->save.ldtr = from_vmcb->save.ldtr;
+ to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
+ to_vmcb->save.star = from_vmcb->save.star;
+ to_vmcb->save.lstar = from_vmcb->save.lstar;
+ to_vmcb->save.cstar = from_vmcb->save.cstar;
+ to_vmcb->save.sfmask = from_vmcb->save.sfmask;
+ to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
+ to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
+ to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
+}
+
+int nested_svm_vmexit(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+ struct vmcb *vmcb12;
+ struct kvm_host_map map;
+ int rc;
+
+ rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
+ if (rc) {
+ if (rc == -EINVAL)
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ vmcb12 = map.hva;
+
+ /* Exit Guest-Mode */
+ leave_guest_mode(vcpu);
+ svm->nested.vmcb12_gpa = 0;
+ WARN_ON_ONCE(svm->nested.nested_run_pending);
+
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+
+ /* in case we halted in L2 */
+ svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+ /* Give the current vmcb to the guest */
+
+ vmcb12->save.es = vmcb02->save.es;
+ vmcb12->save.cs = vmcb02->save.cs;
+ vmcb12->save.ss = vmcb02->save.ss;
+ vmcb12->save.ds = vmcb02->save.ds;
+ vmcb12->save.gdtr = vmcb02->save.gdtr;
+ vmcb12->save.idtr = vmcb02->save.idtr;
+ vmcb12->save.efer = svm->vcpu.arch.efer;
+ vmcb12->save.cr0 = kvm_read_cr0(vcpu);
+ vmcb12->save.cr3 = kvm_read_cr3(vcpu);
+ vmcb12->save.cr2 = vmcb02->save.cr2;
+ vmcb12->save.cr4 = svm->vcpu.arch.cr4;
+ vmcb12->save.rflags = kvm_get_rflags(vcpu);
+ vmcb12->save.rip = kvm_rip_read(vcpu);
+ vmcb12->save.rsp = kvm_rsp_read(vcpu);
+ vmcb12->save.rax = kvm_rax_read(vcpu);
+ vmcb12->save.dr7 = vmcb02->save.dr7;
+ vmcb12->save.dr6 = svm->vcpu.arch.dr6;
+ vmcb12->save.cpl = vmcb02->save.cpl;
+
+ vmcb12->control.int_state = vmcb02->control.int_state;
+ vmcb12->control.exit_code = vmcb02->control.exit_code;
+ vmcb12->control.exit_code_hi = vmcb02->control.exit_code_hi;
+ vmcb12->control.exit_info_1 = vmcb02->control.exit_info_1;
+ vmcb12->control.exit_info_2 = vmcb02->control.exit_info_2;
+
+ if (vmcb12->control.exit_code != SVM_EXIT_ERR)
+ nested_save_pending_event_to_vmcb12(svm, vmcb12);
+
+ if (guest_can_use(vcpu, X86_FEATURE_NRIPS))
+ vmcb12->control.next_rip = vmcb02->control.next_rip;
+
+ vmcb12->control.int_ctl = svm->nested.ctl.int_ctl;
+ vmcb12->control.event_inj = svm->nested.ctl.event_inj;
+ vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;
+
+ if (!kvm_pause_in_guest(vcpu->kvm)) {
+ vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count;
+ vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
+	}
+
+ nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
+
+ svm_switch_vmcb(svm, &svm->vmcb01);
+
+ /*
+ * Rules for synchronizing int_ctl bits from vmcb02 to vmcb01:
+ *
+ * V_IRQ, V_IRQ_VECTOR, V_INTR_PRIO_MASK, V_IGN_TPR: If L1 doesn't
+ * intercept interrupts, then KVM will use vmcb02's V_IRQ (and related
+ * flags) to detect interrupt windows for L1 IRQs (even if L1 uses
+ * virtual interrupt masking). Raise KVM_REQ_EVENT to ensure that
+ * KVM re-requests an interrupt window if necessary, which implicitly
+	 * copies these bits from vmcb02 to vmcb01.
+ *
+ * V_TPR: If L1 doesn't use virtual interrupt masking, then L1's vTPR
+ * is stored in vmcb02, but its value doesn't need to be copied from/to
+ * vmcb01 because it is copied from/to the virtual APIC's TPR register
+ * on each VM entry/exit.
+ *
+ * V_GIF: If nested vGIF is not used, KVM uses vmcb02's V_GIF for L1's
+ * V_GIF. However, GIF is architecturally clear on each VM exit, thus
+ * there is no need to copy V_GIF from vmcb02 to vmcb01.
+ */
+ if (!nested_exit_on_intr(svm))
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+
+ if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+ svm_copy_lbrs(vmcb12, vmcb02);
+ svm_update_lbrv(vcpu);
+ } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
+ svm_copy_lbrs(vmcb01, vmcb02);
+ svm_update_lbrv(vcpu);
+ }
+
+ if (vnmi) {
+ if (vmcb02->control.int_ctl & V_NMI_BLOCKING_MASK)
+ vmcb01->control.int_ctl |= V_NMI_BLOCKING_MASK;
+ else
+ vmcb01->control.int_ctl &= ~V_NMI_BLOCKING_MASK;
+
+ if (vcpu->arch.nmi_pending) {
+ vcpu->arch.nmi_pending--;
+ vmcb01->control.int_ctl |= V_NMI_PENDING_MASK;
+ } else {
+ vmcb01->control.int_ctl &= ~V_NMI_PENDING_MASK;
+ }
+ }
+
+	/*
+	 * On vmexit the GIF is set to false and no event can be
+	 * injected into L1.
+	 */
+ svm_set_gif(svm, false);
+ vmcb01->control.exit_int_info = 0;
+
+ svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset;
+ if (vmcb01->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
+ vmcb01->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+ vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
+ }
+
+ if (kvm_caps.has_tsc_control &&
+ vcpu->arch.tsc_scaling_ratio != vcpu->arch.l1_tsc_scaling_ratio) {
+ vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
+ svm_write_tsc_multiplier(vcpu);
+ }
+
+ svm->nested.ctl.nested_cr3 = 0;
+
+ /*
+ * Restore processor state that had been saved in vmcb01
+ */
+ kvm_set_rflags(vcpu, vmcb01->save.rflags);
+ svm_set_efer(vcpu, vmcb01->save.efer);
+ svm_set_cr0(vcpu, vmcb01->save.cr0 | X86_CR0_PE);
+ svm_set_cr4(vcpu, vmcb01->save.cr4);
+ kvm_rax_write(vcpu, vmcb01->save.rax);
+ kvm_rsp_write(vcpu, vmcb01->save.rsp);
+ kvm_rip_write(vcpu, vmcb01->save.rip);
+
+ svm->vcpu.arch.dr7 = DR7_FIXED_1;
+ kvm_update_dr7(&svm->vcpu);
+
+ trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
+ vmcb12->control.exit_info_1,
+ vmcb12->control.exit_info_2,
+ vmcb12->control.exit_int_info,
+ vmcb12->control.exit_int_info_err,
+ KVM_ISA_SVM);
+
+ kvm_vcpu_unmap(vcpu, &map, true);
+
+ nested_svm_transition_tlb_flush(vcpu);
+
+ nested_svm_uninit_mmu_context(vcpu);
+
+ rc = nested_svm_load_cr3(vcpu, vmcb01->save.cr3, false, true);
+ if (rc)
+ return 1;
+
+ /*
+ * Drop what we picked up for L2 via svm_complete_interrupts() so it
+ * doesn't end up in L1.
+ */
+ svm->vcpu.arch.nmi_injected = false;
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
+
+ /*
+ * If we are here following the completion of a VMRUN that
+ * is being single-stepped, queue the pending #DB intercept
+	 * right now so that it can be accounted for before we execute
+ * L1's next instruction.
+ */
+ if (unlikely(vmcb01->save.rflags & X86_EFLAGS_TF))
+ kvm_queue_exception(&(svm->vcpu), DB_VECTOR);
+
+	/*
+	 * Un-inhibit the AVIC right away, so that other vCPUs can start
+	 * to benefit from it as soon as possible.
+	 */
+ if (kvm_apicv_activated(vcpu->kvm))
+ __kvm_vcpu_update_apicv(vcpu);
+
+ return 0;
+}
+
+static void nested_svm_triple_fault(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SHUTDOWN))
+ return;
+
+ kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN);
+}
+
+int svm_allocate_nested(struct vcpu_svm *svm)
+{
+ struct page *vmcb02_page;
+
+ if (svm->nested.initialized)
+ return 0;
+
+ vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!vmcb02_page)
+ return -ENOMEM;
+ svm->nested.vmcb02.ptr = page_address(vmcb02_page);
+ svm->nested.vmcb02.pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);
+
+ svm->nested.msrpm = svm_vcpu_alloc_msrpm();
+ if (!svm->nested.msrpm)
+ goto err_free_vmcb02;
+ svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm);
+
+ svm->nested.initialized = true;
+ return 0;
+
+err_free_vmcb02:
+ __free_page(vmcb02_page);
+ return -ENOMEM;
+}
+
+void svm_free_nested(struct vcpu_svm *svm)
+{
+ if (!svm->nested.initialized)
+ return;
+
+ if (WARN_ON_ONCE(svm->vmcb != svm->vmcb01.ptr))
+ svm_switch_vmcb(svm, &svm->vmcb01);
+
+ svm_vcpu_free_msrpm(svm->nested.msrpm);
+ svm->nested.msrpm = NULL;
+
+ __free_page(virt_to_page(svm->nested.vmcb02.ptr));
+ svm->nested.vmcb02.ptr = NULL;
+
+ /*
+ * When last_vmcb12_gpa matches the current vmcb12 gpa,
+ * some vmcb12 fields are not loaded if they are marked clean
+ * in the vmcb12, since in this case they are up to date already.
+ *
+ * When the vmcb02 is freed, this optimization becomes invalid.
+ */
+ svm->nested.last_vmcb12_gpa = INVALID_GPA;
+
+ svm->nested.initialized = false;
+}
+
+void svm_leave_nested(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (is_guest_mode(vcpu)) {
+ svm->nested.nested_run_pending = 0;
+ svm->nested.vmcb12_gpa = INVALID_GPA;
+
+ leave_guest_mode(vcpu);
+
+ svm_switch_vmcb(svm, &svm->vmcb01);
+
+ nested_svm_uninit_mmu_context(vcpu);
+ vmcb_mark_all_dirty(svm->vmcb);
+
+ if (kvm_apicv_activated(vcpu->kvm))
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ }
+
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+}
+
+static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
+{
+ u32 offset, msr, value;
+ int write, mask;
+
+ if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
+ return NESTED_EXIT_HOST;
+
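+	/*
+	 * Each MSR is covered by two bits in the permission map, the even
+	 * bit for reads and the odd bit for writes, with 16 MSRs packed
+	 * into each 32-bit chunk. E.g. a write to an MSR whose low nibble
+	 * is 0x3 is governed by bit 2 * 3 + 1 = 7 of its chunk.
+	 */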
+ msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ offset = svm_msrpm_offset(msr);
+ write = svm->vmcb->control.exit_info_1 & 1;
+ mask = 1 << ((2 * (msr & 0xf)) + write);
+
+ if (offset == MSR_INVALID)
+ return NESTED_EXIT_DONE;
+
+	/* The offset is in 32-bit chunks; convert it to a byte offset. */
+ offset *= 4;
+
+ if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset, &value, 4))
+ return NESTED_EXIT_DONE;
+
+ return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+}
+
+static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
+{
+ unsigned port, size, iopm_len;
+ u16 val, mask;
+ u8 start_bit;
+ u64 gpa;
+
+ if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT)))
+ return NESTED_EXIT_HOST;
+
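+	/*
+	 * The I/O permission map has one intercept bit per port, so an
+	 * access may straddle a byte boundary. E.g. a 4-byte access to
+	 * port 0x3fd starts at bit 5 and spans two bytes of the map.
+	 */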
+ port = svm->vmcb->control.exit_info_1 >> 16;
+ size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
+ SVM_IOIO_SIZE_SHIFT;
+ gpa = svm->nested.ctl.iopm_base_pa + (port / 8);
+ start_bit = port % 8;
+ iopm_len = (start_bit + size > 8) ? 2 : 1;
+ mask = (0xf >> (4 - size)) << start_bit;
+ val = 0;
+
+ if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
+ return NESTED_EXIT_DONE;
+
+ return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+}
+
+static int nested_svm_intercept(struct vcpu_svm *svm)
+{
+ u32 exit_code = svm->vmcb->control.exit_code;
+ int vmexit = NESTED_EXIT_HOST;
+
+ switch (exit_code) {
+ case SVM_EXIT_MSR:
+ vmexit = nested_svm_exit_handled_msr(svm);
+ break;
+ case SVM_EXIT_IOIO:
+ vmexit = nested_svm_intercept_ioio(svm);
+ break;
+ case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
+ if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
+ if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
+ /*
+ * Host-intercepted exceptions have been checked already in
+ * nested_svm_exit_special. There is nothing to do here,
+ * the vmexit is injected by svm_check_nested_events.
+ */
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ case SVM_EXIT_ERR: {
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ default: {
+ if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
+ vmexit = NESTED_EXIT_DONE;
+ }
+ }
+
+ return vmexit;
+}
+
+int nested_svm_exit_handled(struct vcpu_svm *svm)
+{
+ int vmexit;
+
+ vmexit = nested_svm_intercept(svm);
+
+ if (vmexit == NESTED_EXIT_DONE)
+ nested_svm_vmexit(svm);
+
+ return vmexit;
+}
+
+int nested_svm_check_permissions(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->arch.efer & EFER_SVME) || !is_paging(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (to_svm(vcpu)->vmcb->save.cpl) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ return 0;
+}
+
+static bool nested_svm_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
+ u32 error_code)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(vector));
+}
+
+static void nested_svm_inject_exception_vmexit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+
+ vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + ex->vector;
+ vmcb->control.exit_code_hi = 0;
+
+ if (ex->has_error_code)
+ vmcb->control.exit_info_1 = ex->error_code;
+
+ /*
+ * EXITINFO2 is undefined for all exception intercepts other
+ * than #PF.
+ */
+ if (ex->vector == PF_VECTOR) {
+ if (ex->has_payload)
+ vmcb->control.exit_info_2 = ex->payload;
+ else
+ vmcb->control.exit_info_2 = vcpu->arch.cr2;
+ } else if (ex->vector == DB_VECTOR) {
+ /* See kvm_check_and_inject_events(). */
+ kvm_deliver_exception_payload(vcpu, ex);
+
+ if (vcpu->arch.dr7 & DR7_GD) {
+ vcpu->arch.dr7 &= ~DR7_GD;
+ kvm_update_dr7(vcpu);
+ }
+ } else {
+ WARN_ON(ex->has_payload);
+ }
+
+ nested_svm_vmexit(svm);
+}
+
+static inline bool nested_exit_on_init(struct vcpu_svm *svm)
+{
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
+}
+
+static int svm_check_nested_events(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ /*
+ * Only a pending nested run blocks a pending exception. If there is a
+ * previously injected event, the pending exception occurred while said
+ * event was being delivered and thus needs to be handled.
+ */
+ bool block_nested_exceptions = svm->nested.nested_run_pending;
+ /*
+ * New events (not exceptions) are only recognized at instruction
+ * boundaries. If an event needs reinjection, then KVM is handling a
+ * VM-Exit that occurred _during_ instruction execution; new events are
+ * blocked until the instruction completes.
+ */
+ bool block_nested_events = block_nested_exceptions ||
+ kvm_event_needs_reinjection(vcpu);
+
+ if (lapic_in_kernel(vcpu) &&
+ test_bit(KVM_APIC_INIT, &apic->pending_events)) {
+ if (block_nested_events)
+ return -EBUSY;
+ if (!nested_exit_on_init(svm))
+ return 0;
+ nested_svm_simple_vmexit(svm, SVM_EXIT_INIT);
+ return 0;
+ }
+
+ if (vcpu->arch.exception_vmexit.pending) {
+ if (block_nested_exceptions)
+ return -EBUSY;
+ nested_svm_inject_exception_vmexit(vcpu);
+ return 0;
+ }
+
+ if (vcpu->arch.exception.pending) {
+ if (block_nested_exceptions)
+ return -EBUSY;
+ return 0;
+ }
+
+#ifdef CONFIG_KVM_SMM
+ if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ if (!nested_exit_on_smi(svm))
+ return 0;
+ nested_svm_simple_vmexit(svm, SVM_EXIT_SMI);
+ return 0;
+ }
+#endif
+
+ if (vcpu->arch.nmi_pending && !svm_nmi_blocked(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ if (!nested_exit_on_nmi(svm))
+ return 0;
+ nested_svm_simple_vmexit(svm, SVM_EXIT_NMI);
+ return 0;
+ }
+
+ if (kvm_cpu_has_interrupt(vcpu) && !svm_interrupt_blocked(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ if (!nested_exit_on_intr(svm))
+ return 0;
+ trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
+ nested_svm_simple_vmexit(svm, SVM_EXIT_INTR);
+ return 0;
+ }
+
+ return 0;
+}
+
+int nested_svm_exit_special(struct vcpu_svm *svm)
+{
+ u32 exit_code = svm->vmcb->control.exit_code;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ switch (exit_code) {
+ case SVM_EXIT_INTR:
+ case SVM_EXIT_NMI:
+ case SVM_EXIT_NPF:
+ return NESTED_EXIT_HOST;
+ case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
+ u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
+
+ if (svm->vmcb01.ptr->control.intercepts[INTERCEPT_EXCEPTION] &
+ excp_bits)
+ return NESTED_EXIT_HOST;
+ else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
+ svm->vcpu.arch.apf.host_apf_flags)
+ /* Trap async PF even if not shadowing */
+ return NESTED_EXIT_HOST;
+ break;
+ }
+ case SVM_EXIT_VMMCALL:
+ /* Hyper-V L2 TLB flush hypercall is handled by L0 */
+ if (guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
+ nested_svm_l2_tlb_flush_enabled(vcpu) &&
+ kvm_hv_is_tlb_flush_hcall(vcpu))
+ return NESTED_EXIT_HOST;
+ break;
+ default:
+ break;
+ }
+
+ return NESTED_EXIT_CONTINUE;
+}
+
+void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ vcpu->arch.tsc_scaling_ratio =
+ kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio,
+ svm->tsc_ratio_msr);
+ svm_write_tsc_multiplier(vcpu);
+}
+
+/* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */
+static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
+ struct vmcb_ctrl_area_cached *from)
+{
+ unsigned int i;
+
+ memset(dst, 0, sizeof(struct vmcb_control_area));
+
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ dst->intercepts[i] = from->intercepts[i];
+
+ dst->iopm_base_pa = from->iopm_base_pa;
+ dst->msrpm_base_pa = from->msrpm_base_pa;
+ dst->tsc_offset = from->tsc_offset;
+ dst->asid = from->asid;
+ dst->tlb_ctl = from->tlb_ctl;
+ dst->int_ctl = from->int_ctl;
+ dst->int_vector = from->int_vector;
+ dst->int_state = from->int_state;
+ dst->exit_code = from->exit_code;
+ dst->exit_code_hi = from->exit_code_hi;
+ dst->exit_info_1 = from->exit_info_1;
+ dst->exit_info_2 = from->exit_info_2;
+ dst->exit_int_info = from->exit_int_info;
+ dst->exit_int_info_err = from->exit_int_info_err;
+ dst->nested_ctl = from->nested_ctl;
+ dst->event_inj = from->event_inj;
+ dst->event_inj_err = from->event_inj_err;
+ dst->next_rip = from->next_rip;
+ dst->nested_cr3 = from->nested_cr3;
+ dst->virt_ext = from->virt_ext;
+ dst->pause_filter_count = from->pause_filter_count;
+ dst->pause_filter_thresh = from->pause_filter_thresh;
+ /* 'clean' and 'hv_enlightenments' are not changed by KVM */
+}
+
+static int svm_get_nested_state(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ u32 user_data_size)
+{
+ struct vcpu_svm *svm;
+ struct vmcb_control_area *ctl;
+ unsigned long r;
+ struct kvm_nested_state kvm_state = {
+ .flags = 0,
+ .format = KVM_STATE_NESTED_FORMAT_SVM,
+ .size = sizeof(kvm_state),
+ };
+ struct vmcb __user *user_vmcb = (struct vmcb __user *)
+ &user_kvm_nested_state->data.svm[0];
+
+ if (!vcpu)
+ return kvm_state.size + KVM_STATE_NESTED_SVM_VMCB_SIZE;
+
+ svm = to_svm(vcpu);
+
+ if (user_data_size < kvm_state.size)
+ goto out;
+
+ /* First fill in the header and copy it out. */
+ if (is_guest_mode(vcpu)) {
+ kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb12_gpa;
+ kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE;
+ kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
+
+ if (svm->nested.nested_run_pending)
+ kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
+ }
+
+ if (gif_set(svm))
+ kvm_state.flags |= KVM_STATE_NESTED_GIF_SET;
+
+ if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
+ return -EFAULT;
+
+ if (!is_guest_mode(vcpu))
+ goto out;
+
+ /*
+ * Copy over the full size of the VMCB rather than just the size
+ * of the structs.
+ */
+ if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE))
+ return -EFAULT;
+
+ ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);
+ if (!ctl)
+ return -ENOMEM;
+
+ nested_copy_vmcb_cache_to_control(ctl, &svm->nested.ctl);
+ r = copy_to_user(&user_vmcb->control, ctl,
+ sizeof(user_vmcb->control));
+ kfree(ctl);
+ if (r)
+ return -EFAULT;
+
+ if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save,
+ sizeof(user_vmcb->save)))
+ return -EFAULT;
+out:
+ return kvm_state.size;
+}
+
+static int svm_set_nested_state(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ struct kvm_nested_state *kvm_state)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb __user *user_vmcb = (struct vmcb __user *)
+ &user_kvm_nested_state->data.svm[0];
+ struct vmcb_control_area *ctl;
+ struct vmcb_save_area *save;
+ struct vmcb_save_area_cached save_cached;
+ struct vmcb_ctrl_area_cached ctl_cached;
+ unsigned long cr0;
+ int ret;
+
+ BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) >
+ KVM_STATE_NESTED_SVM_VMCB_SIZE);
+
+ if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM)
+ return -EINVAL;
+
+ if (kvm_state->flags & ~(KVM_STATE_NESTED_GUEST_MODE |
+ KVM_STATE_NESTED_RUN_PENDING |
+ KVM_STATE_NESTED_GIF_SET))
+ return -EINVAL;
+
+ /*
+ * If in guest mode, vcpu->arch.efer actually refers to the L2 guest's
+ * EFER.SVME, but EFER.SVME still has to be 1 for VMRUN to succeed.
+ */
+ if (!(vcpu->arch.efer & EFER_SVME)) {
+ /* GIF=1 and no guest mode are required if SVME=0. */
+ if (kvm_state->flags != KVM_STATE_NESTED_GIF_SET)
+ return -EINVAL;
+ }
+
+ /* SMM temporarily disables SVM, so we cannot be in guest mode. */
+ if (is_smm(vcpu) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
+ return -EINVAL;
+
+ if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) {
+ svm_leave_nested(vcpu);
+ svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
+ return 0;
+ }
+
+ if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa))
+ return -EINVAL;
+ if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE)
+ return -EINVAL;
+
+ ret = -ENOMEM;
+ ctl = kzalloc(sizeof(*ctl), GFP_KERNEL_ACCOUNT);
+ save = kzalloc(sizeof(*save), GFP_KERNEL_ACCOUNT);
+ if (!ctl || !save)
+ goto out_free;
+
+ ret = -EFAULT;
+ if (copy_from_user(ctl, &user_vmcb->control, sizeof(*ctl)))
+ goto out_free;
+ if (copy_from_user(save, &user_vmcb->save, sizeof(*save)))
+ goto out_free;
+
+ ret = -EINVAL;
+ __nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl);
+ if (!__nested_vmcb_check_controls(vcpu, &ctl_cached))
+ goto out_free;
+
+ /*
+ * Processor state contains L2 state. Check that it is
+ * valid for guest mode (see nested_vmcb_check_save).
+ */
+ cr0 = kvm_read_cr0(vcpu);
+ if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
+ goto out_free;
+
+ /*
+ * Validate host state saved from before VMRUN (see
+ * nested_svm_check_permissions).
+ */
+ __nested_copy_vmcb_save_to_cache(&save_cached, save);
+ if (!(save->cr0 & X86_CR0_PG) ||
+ !(save->cr0 & X86_CR0_PE) ||
+ (save->rflags & X86_EFLAGS_VM) ||
+ !__nested_vmcb_check_save(vcpu, &save_cached))
+ goto out_free;
+
+	/*
+	 * All checks done, we can enter guest mode. Userspace provides
+	 * vmcb12.control, which will be combined with L1 and stored into
+	 * vmcb02, and the L1 save state, which we store in vmcb01.
+	 * If needed, L2 registers are moved from the current VMCB to vmcb02.
+	 */
+
+ if (is_guest_mode(vcpu))
+ svm_leave_nested(vcpu);
+ else
+ svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
+
+ svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
+
+ svm->nested.nested_run_pending =
+ !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
+
+ svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
+
+ svm_copy_vmrun_state(&svm->vmcb01.ptr->save, save);
+ nested_copy_vmcb_control_to_cache(svm, ctl);
+
+ svm_switch_vmcb(svm, &svm->nested.vmcb02);
+ nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base);
+
+	/*
+	 * While the nested guest CR3 is already checked and set by
+	 * KVM_SET_SREGS, it was set before the nested state was loaded,
+	 * so the MMU might not be initialized correctly.
+	 * Set it again to fix this.
+	 */
+ ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
+ nested_npt_enabled(svm), false);
+ if (WARN_ON_ONCE(ret))
+ goto out_free;
+
+ svm->nested.force_msr_bitmap_recalc = true;
+
+ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+ ret = 0;
+out_free:
+ kfree(save);
+ kfree(ctl);
+
+ return ret;
+}
+
+static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (WARN_ON(!is_guest_mode(vcpu)))
+ return true;
+
+ if (!vcpu->arch.pdptrs_from_userspace &&
+ !nested_npt_enabled(svm) && is_pae_paging(vcpu))
+		/*
+		 * Reload the guest's PDPTRs: after a migration the guest CR3
+		 * might be restored prior to setting the nested state, which
+		 * can lead to loading the wrong PDPTRs.
+		 */
+ if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
+ return false;
+
+ if (!nested_svm_vmrun_msrpm(svm)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return false;
+ }
+
+ if (kvm_hv_verify_vp_assist(vcpu))
+ return false;
+
+ return true;
+}
+
+struct kvm_x86_nested_ops svm_nested_ops = {
+ .leave_nested = svm_leave_nested,
+ .is_exception_vmexit = nested_svm_is_exception_vmexit,
+ .check_events = svm_check_nested_events,
+ .triple_fault = nested_svm_triple_fault,
+ .get_nested_state_pages = svm_get_nested_state_pages,
+ .get_state = svm_get_nested_state,
+ .set_state = svm_set_nested_state,
+ .hv_inject_synthetic_vmexit_post_tlb_flush = svm_hv_inject_synthetic_vmexit_post_tlb_flush,
+};
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
new file mode 100644
index 000000000..3fd47de14
--- /dev/null
+++ b/arch/x86/kvm/svm/pmu.c
@@ -0,0 +1,250 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM PMU support for AMD
+ *
+ * Copyright 2015, Red Hat, Inc. and/or its affiliates.
+ *
+ * Author:
+ * Wei Huang <wei@redhat.com>
+ *
+ * Implementation is based on pmu_intel.c file
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "pmu.h"
+#include "svm.h"
+
+enum pmu_type {
+ PMU_TYPE_COUNTER = 0,
+ PMU_TYPE_EVNTSEL,
+};
+
+static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
+{
+ unsigned int num_counters = pmu->nr_arch_gp_counters;
+
+ if (pmc_idx >= num_counters)
+ return NULL;
+
+ return &pmu->gp_counters[array_index_nospec(pmc_idx, num_counters)];
+}
+
+static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
+ enum pmu_type type)
+{
+ struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
+ unsigned int idx;
+
+ if (!vcpu->kvm->arch.enable_pmu)
+ return NULL;
+
+ switch (msr) {
+ case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE))
+ return NULL;
+ /*
+ * Each PMU counter has a pair of CTL and CTR MSRs. CTLn
+ * MSRs (accessed via EVNTSEL) are even, CTRn MSRs are odd.
+ */
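+	/* E.g. PERF_CTL2 (base + 4) and PERF_CTR2 (base + 5) both yield idx 2. */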
+ idx = (unsigned int)((msr - MSR_F15H_PERF_CTL0) / 2);
+ if (!(msr & 0x1) != (type == PMU_TYPE_EVNTSEL))
+ return NULL;
+ break;
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+ if (type != PMU_TYPE_EVNTSEL)
+ return NULL;
+ idx = msr - MSR_K7_EVNTSEL0;
+ break;
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+ if (type != PMU_TYPE_COUNTER)
+ return NULL;
+ idx = msr - MSR_K7_PERFCTR0;
+ break;
+ default:
+ return NULL;
+ }
+
+ return amd_pmc_idx_to_pmc(pmu, idx);
+}
+
+static bool amd_hw_event_available(struct kvm_pmc *pmc)
+{
+ return true;
+}
+
+static bool amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
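+	/* Ignore bits 31:30, used as counter type flags by RDPMC; validate the index. */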
+ idx &= ~(3u << 30);
+
+ return idx < pmu->nr_arch_gp_counters;
+}
+
+/* idx is the ECX register of RDPMC instruction */
+static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
+ unsigned int idx, u64 *mask)
+{
+ return amd_pmc_idx_to_pmc(vcpu_to_pmu(vcpu), idx & ~(3u << 30));
+}
+
+static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
+ pmc = pmc ? pmc : get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+
+ return pmc;
+}
+
+static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ switch (msr) {
+ case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3:
+ return pmu->version > 0;
+ case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
+ return guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE);
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
+ return pmu->version > 1;
+ default:
+ if (msr > MSR_F15H_PERF_CTR5 &&
+ msr < MSR_F15H_PERF_CTL0 + 2 * pmu->nr_arch_gp_counters)
+ return pmu->version > 1;
+ break;
+ }
+
+ return amd_msr_idx_to_pmc(vcpu, msr);
+}
+
+static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u32 msr = msr_info->index;
+
+ /* MSR_PERFCTRn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
+ if (pmc) {
+ msr_info->data = pmc_read_counter(pmc);
+ return 0;
+ }
+ /* MSR_EVNTSELn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+ if (pmc) {
+ msr_info->data = pmc->eventsel;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+
+ /* MSR_PERFCTRn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
+ if (pmc) {
+ pmc_write_counter(pmc, data);
+ pmc_update_sample_period(pmc);
+ return 0;
+ }
+ /* MSR_EVNTSELn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+ if (pmc) {
+ data &= ~pmu->reserved_bits;
+ if (data != pmc->eventsel) {
+ pmc->eventsel = data;
+ kvm_pmu_request_counter_reprogram(pmc);
+ }
+ return 0;
+ }
+
+ return 1;
+}
+
+static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ union cpuid_0x80000022_ebx ebx;
+
+ pmu->version = 1;
+ if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) {
+ pmu->version = 2;
+ /*
+ * Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest
+ * CPUID entry is guaranteed to be non-NULL.
+ */
+ BUILD_BUG_ON(x86_feature_cpuid(X86_FEATURE_PERFMON_V2).function != 0x80000022 ||
+ x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index);
+ ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx;
+ pmu->nr_arch_gp_counters = ebx.split.num_core_pmc;
+ } else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
+ pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;
+ } else {
+ pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
+ }
+
+ pmu->nr_arch_gp_counters = min_t(unsigned int, pmu->nr_arch_gp_counters,
+ kvm_pmu_cap.num_counters_gp);
+
+ if (pmu->version > 1) {
+ pmu->global_ctrl_mask = ~((1ull << pmu->nr_arch_gp_counters) - 1);
+ pmu->global_status_mask = pmu->global_ctrl_mask;
+ }
+
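+	/* AMD general-purpose counters are 48 bits wide. */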
+ pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
+ pmu->reserved_bits = 0xfffffff000280000ull;
+ pmu->raw_event_mask = AMD64_RAW_EVENT_MASK;
+	/* Not applicable to AMD; clear them to prevent any fallout. */
+ pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+ pmu->nr_arch_fixed_counters = 0;
+ bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters);
+}
+
+static void amd_pmu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int i;
+
+ BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > AMD64_NUM_COUNTERS_CORE);
+ BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > INTEL_PMC_MAX_GENERIC);
+
+ for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC ; i++) {
+ pmu->gp_counters[i].type = KVM_PMC_GP;
+ pmu->gp_counters[i].vcpu = vcpu;
+ pmu->gp_counters[i].idx = i;
+ pmu->gp_counters[i].current_config = 0;
+ }
+}
+
+struct kvm_pmu_ops amd_pmu_ops __initdata = {
+ .hw_event_available = amd_hw_event_available,
+ .pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
+ .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc,
+ .msr_idx_to_pmc = amd_msr_idx_to_pmc,
+ .is_valid_rdpmc_ecx = amd_is_valid_rdpmc_ecx,
+ .is_valid_msr = amd_is_valid_msr,
+ .get_msr = amd_pmu_get_msr,
+ .set_msr = amd_pmu_set_msr,
+ .refresh = amd_pmu_refresh,
+ .init = amd_pmu_init,
+ .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT,
+ .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC,
+ .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS,
+};
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
new file mode 100644
index 000000000..4900c0780
--- /dev/null
+++ b/arch/x86/kvm/svm/sev.c
@@ -0,0 +1,3140 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM-SEV support
+ *
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+#include <linux/kernel.h>
+#include <linux/highmem.h>
+#include <linux/psp.h>
+#include <linux/psp-sev.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/misc_cgroup.h>
+#include <linux/processor.h>
+#include <linux/trace_events.h>
+
+#include <asm/pkru.h>
+#include <asm/trapnr.h>
+#include <asm/fpu/xcr.h>
+#include <asm/debugreg.h>
+
+#include "mmu.h"
+#include "x86.h"
+#include "svm.h"
+#include "svm_ops.h"
+#include "cpuid.h"
+#include "trace.h"
+
+#ifndef CONFIG_KVM_AMD_SEV
+/*
+ * When this config is not defined, the SEV feature is not supported and the
+ * APIs in this file are not used, but the file still gets compiled into the
+ * KVM AMD module.
+ *
+ * In that case the MISC_CG_RES_SEV and MISC_CG_RES_SEV_ES entries are absent
+ * from enum misc_res_type {} defined in linux/misc_cgroup.h.
+ *
+ * The macros below allow compilation to succeed.
+ */
+#define MISC_CG_RES_SEV MISC_CG_RES_TYPES
+#define MISC_CG_RES_SEV_ES MISC_CG_RES_TYPES
+#endif
+
+#ifdef CONFIG_KVM_AMD_SEV
+/* enable/disable SEV support */
+static bool sev_enabled = true;
+module_param_named(sev, sev_enabled, bool, 0444);
+
+/* enable/disable SEV-ES support */
+static bool sev_es_enabled = true;
+module_param_named(sev_es, sev_es_enabled, bool, 0444);
+
+/* enable/disable SEV-ES DebugSwap support */
+static bool sev_es_debug_swap_enabled = true;
+module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444);
+#else
+#define sev_enabled false
+#define sev_es_enabled false
+#define sev_es_debug_swap_enabled false
+#endif /* CONFIG_KVM_AMD_SEV */
+
+static u8 sev_enc_bit;
+static DECLARE_RWSEM(sev_deactivate_lock);
+static DEFINE_MUTEX(sev_bitmap_lock);
+unsigned int max_sev_asid;
+static unsigned int min_sev_asid;
+static unsigned long sev_me_mask;
+static unsigned int nr_asids;
+static unsigned long *sev_asid_bitmap;
+static unsigned long *sev_reclaim_asid_bitmap;
+
+struct enc_region {
+ struct list_head list;
+ unsigned long npages;
+ struct page **pages;
+ unsigned long uaddr;
+ unsigned long size;
+};
+
+/* Called with the sev_bitmap_lock held, or on shutdown */
+static int sev_flush_asids(int min_asid, int max_asid)
+{
+ int ret, asid, error = 0;
+
+ /* Check if there are any ASIDs to reclaim before performing a flush */
+ asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid);
+ if (asid > max_asid)
+ return -EBUSY;
+
+ /*
+ * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail,
+ * so it must be guarded.
+ */
+ down_write(&sev_deactivate_lock);
+
+ wbinvd_on_all_cpus();
+ ret = sev_guest_df_flush(&error);
+
+ up_write(&sev_deactivate_lock);
+
+ if (ret)
+ pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error);
+
+ return ret;
+}
+
+static inline bool is_mirroring_enc_context(struct kvm *kvm)
+{
+ return !!to_kvm_svm(kvm)->sev_info.enc_context_owner;
+}
+
+/* Must be called with the sev_bitmap_lock held */
+static bool __sev_recycle_asids(int min_asid, int max_asid)
+{
+ if (sev_flush_asids(min_asid, max_asid))
+ return false;
+
+ /* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */
+ bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap,
+ nr_asids);
+ bitmap_zero(sev_reclaim_asid_bitmap, nr_asids);
+
+ return true;
+}
+
+static int sev_misc_cg_try_charge(struct kvm_sev_info *sev)
+{
+ enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+ return misc_cg_try_charge(type, sev->misc_cg, 1);
+}
+
+static void sev_misc_cg_uncharge(struct kvm_sev_info *sev)
+{
+ enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+ misc_cg_uncharge(type, sev->misc_cg, 1);
+}
+
+static int sev_asid_new(struct kvm_sev_info *sev)
+{
+ int asid, min_asid, max_asid, ret;
+ bool retry = true;
+
+ WARN_ON(sev->misc_cg);
+ sev->misc_cg = get_current_misc_cg();
+ ret = sev_misc_cg_try_charge(sev);
+ if (ret) {
+ put_misc_cg(sev->misc_cg);
+ sev->misc_cg = NULL;
+ return ret;
+ }
+
+ mutex_lock(&sev_bitmap_lock);
+
+	/*
+	 * SEV-enabled guests must use ASIDs from min_sev_asid to max_sev_asid.
+	 * SEV-ES-enabled guests can use ASIDs from 1 to min_sev_asid - 1.
+	 */
+ min_asid = sev->es_active ? 1 : min_sev_asid;
+ max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid;
+again:
+ asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid);
+ if (asid > max_asid) {
+ if (retry && __sev_recycle_asids(min_asid, max_asid)) {
+ retry = false;
+ goto again;
+ }
+ mutex_unlock(&sev_bitmap_lock);
+ ret = -EBUSY;
+ goto e_uncharge;
+ }
+
+ __set_bit(asid, sev_asid_bitmap);
+
+ mutex_unlock(&sev_bitmap_lock);
+
+ return asid;
+e_uncharge:
+ sev_misc_cg_uncharge(sev);
+ put_misc_cg(sev->misc_cg);
+ sev->misc_cg = NULL;
+ return ret;
+}
+
+static int sev_get_asid(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return sev->asid;
+}
+
+static void sev_asid_free(struct kvm_sev_info *sev)
+{
+ struct svm_cpu_data *sd;
+ int cpu;
+
+ mutex_lock(&sev_bitmap_lock);
+
+ __set_bit(sev->asid, sev_reclaim_asid_bitmap);
+
+ for_each_possible_cpu(cpu) {
+ sd = per_cpu_ptr(&svm_data, cpu);
+ sd->sev_vmcbs[sev->asid] = NULL;
+ }
+
+ mutex_unlock(&sev_bitmap_lock);
+
+ sev_misc_cg_uncharge(sev);
+ put_misc_cg(sev->misc_cg);
+ sev->misc_cg = NULL;
+}
+
+static void sev_decommission(unsigned int handle)
+{
+ struct sev_data_decommission decommission;
+
+ if (!handle)
+ return;
+
+ decommission.handle = handle;
+ sev_guest_decommission(&decommission, NULL);
+}
+
+static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
+{
+ struct sev_data_deactivate deactivate;
+
+ if (!handle)
+ return;
+
+ deactivate.handle = handle;
+
+ /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
+ down_read(&sev_deactivate_lock);
+ sev_guest_deactivate(&deactivate, NULL);
+ up_read(&sev_deactivate_lock);
+
+ sev_decommission(handle);
+}
+
+static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ int asid, ret;
+
+ if (kvm->created_vcpus)
+ return -EINVAL;
+
+ ret = -EBUSY;
+ if (unlikely(sev->active))
+ return ret;
+
+ sev->active = true;
+ sev->es_active = argp->id == KVM_SEV_ES_INIT;
+ asid = sev_asid_new(sev);
+ if (asid < 0)
+ goto e_no_asid;
+ sev->asid = asid;
+
+ ret = sev_platform_init(&argp->error);
+ if (ret)
+ goto e_free;
+
+ INIT_LIST_HEAD(&sev->regions_list);
+ INIT_LIST_HEAD(&sev->mirror_vms);
+
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV);
+
+ return 0;
+
+e_free:
+ sev_asid_free(sev);
+ sev->asid = 0;
+e_no_asid:
+ sev->es_active = false;
+ sev->active = false;
+ return ret;
+}
+
+static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
+{
+ struct sev_data_activate activate;
+ int asid = sev_get_asid(kvm);
+ int ret;
+
+ /* activate ASID on the given handle */
+ activate.handle = handle;
+ activate.asid = asid;
+ ret = sev_guest_activate(&activate, error);
+
+ return ret;
+}
+
+static int __sev_issue_cmd(int fd, int id, void *data, int *error)
+{
+ struct fd f;
+ int ret;
+
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ ret = sev_issue_cmd_external_user(f.file, id, data, error);
+
+ fdput(f);
+ return ret;
+}
+
+static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return __sev_issue_cmd(sev->fd, id, data, error);
+}
+
+static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_launch_start start;
+ struct kvm_sev_launch_start params;
+ void *dh_blob, *session_blob;
+ int *error = &argp->error;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ memset(&start, 0, sizeof(start));
+
+ dh_blob = NULL;
+ if (params.dh_uaddr) {
+ dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
+ if (IS_ERR(dh_blob))
+ return PTR_ERR(dh_blob);
+
+ start.dh_cert_address = __sme_set(__pa(dh_blob));
+ start.dh_cert_len = params.dh_len;
+ }
+
+ session_blob = NULL;
+ if (params.session_uaddr) {
+ session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len);
+ if (IS_ERR(session_blob)) {
+ ret = PTR_ERR(session_blob);
+ goto e_free_dh;
+ }
+
+ start.session_address = __sme_set(__pa(session_blob));
+ start.session_len = params.session_len;
+ }
+
+ start.handle = params.handle;
+ start.policy = params.policy;
+
+ /* create memory encryption context */
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, &start, error);
+ if (ret)
+ goto e_free_session;
+
+ /* Bind ASID to this guest */
+ ret = sev_bind_asid(kvm, start.handle, error);
+ if (ret) {
+ sev_decommission(start.handle);
+ goto e_free_session;
+ }
+
+ /* return handle to userspace */
+ params.handle = start.handle;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
+ sev_unbind_asid(kvm, start.handle);
+ ret = -EFAULT;
+ goto e_free_session;
+ }
+
+ sev->handle = start.handle;
+ sev->fd = argp->sev_fd;
+
+e_free_session:
+ kfree(session_blob);
+e_free_dh:
+ kfree(dh_blob);
+ return ret;
+}
+
+static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
+ unsigned long ulen, unsigned long *n,
+ int write)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ unsigned long npages, size;
+ int npinned;
+ unsigned long locked, lock_limit;
+ struct page **pages;
+ unsigned long first, last;
+ int ret;
+
+ lockdep_assert_held(&kvm->lock);
+
+ if (ulen == 0 || uaddr + ulen < uaddr)
+ return ERR_PTR(-EINVAL);
+
+ /* Calculate number of pages. */
+ first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
+ last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
+ npages = (last - first + 1);
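+	/* E.g. uaddr = 0x1234, ulen = 0x2000 spans pages 1-3, so npages = 3. */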
+
+ locked = sev->pages_locked + npages;
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+ pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ if (WARN_ON_ONCE(npages > INT_MAX))
+ return ERR_PTR(-EINVAL);
+
+ /* Avoid using vmalloc for smaller buffers. */
+ size = npages * sizeof(struct page *);
+ if (size > PAGE_SIZE)
+ pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ else
+ pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
+
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ /* Pin the user virtual address. */
+ npinned = pin_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
+ if (npinned != npages) {
+ pr_err("SEV: Failure locking %lu pages.\n", npages);
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ *n = npages;
+ sev->pages_locked = locked;
+
+ return pages;
+
+err:
+ if (npinned > 0)
+ unpin_user_pages(pages, npinned);
+
+ kvfree(pages);
+ return ERR_PTR(ret);
+}
+
+static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
+ unsigned long npages)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ unpin_user_pages(pages, npages);
+ kvfree(pages);
+ sev->pages_locked -= npages;
+}
+
+static void sev_clflush_pages(struct page *pages[], unsigned long npages)
+{
+ uint8_t *page_virtual;
+ unsigned long i;
+
+ if (this_cpu_has(X86_FEATURE_SME_COHERENT) || npages == 0 ||
+ pages == NULL)
+ return;
+
+ for (i = 0; i < npages; i++) {
+ page_virtual = kmap_local_page(pages[i]);
+ clflush_cache_range(page_virtual, PAGE_SIZE);
+ kunmap_local(page_virtual);
+ cond_resched();
+ }
+}
+
+static unsigned long get_num_contig_pages(unsigned long idx,
+ struct page **inpages, unsigned long npages)
+{
+ unsigned long paddr, next_paddr;
+ unsigned long i = idx + 1, pages = 1;
+
+ /* find the number of contiguous pages starting from idx */
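+ /* e.g. pages at physical frames {8, 9, 10, 50} with idx = 0 yield 3 */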
+ paddr = __sme_page_pa(inpages[idx]);
+ while (i < npages) {
+ next_paddr = __sme_page_pa(inpages[i++]);
+ if ((paddr + PAGE_SIZE) == next_paddr) {
+ pages++;
+ paddr = next_paddr;
+ continue;
+ }
+ break;
+ }
+
+ return pages;
+}
+
+static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_launch_update_data params;
+ struct sev_data_launch_update_data data;
+ struct page **inpages;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ vaddr = params.uaddr;
+ size = params.len;
+ vaddr_end = vaddr + size;
+
+ /* Lock the user memory. */
+ inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
+ if (IS_ERR(inpages))
+ return PTR_ERR(inpages);
+
+ /*
+ * Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in
+ * place; the cache may contain the data that was written unencrypted.
+ */
+ sev_clflush_pages(inpages, npages);
+
+ data.reserved = 0;
+ data.handle = sev->handle;
+
+ for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
+ int offset, len;
+
+ /*
+ * If the user buffer is not page-aligned, calculate the offset
+ * within the page.
+ */
+ offset = vaddr & (PAGE_SIZE - 1);
+
+ /* Calculate the number of pages that can be encrypted in one go. */
+ pages = get_num_contig_pages(i, inpages, npages);
+
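+ /*
+ * e.g. offset = 0xff0 with two contiguous pages and size = 0x20 gives
+ * len = min(2 * 4096 - 0xff0, 0x20) = 0x20.
+ */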
+ len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
+
+ data.len = len;
+ data.address = __sme_page_pa(inpages[i]) + offset;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, &data, &argp->error);
+ if (ret)
+ goto e_unpin;
+
+ size -= len;
+ next_vaddr = vaddr + len;
+ }
+
+e_unpin:
+ /* The memory contents were updated, mark the pages dirty */
+ for (i = 0; i < npages; i++) {
+ set_page_dirty_lock(inpages[i]);
+ mark_page_accessed(inpages[i]);
+ }
+ /* unlock the user pages */
+ sev_unpin_memory(kvm, inpages, npages);
+ return ret;
+}
+
+static int sev_es_sync_vmsa(struct vcpu_svm *svm)
+{
+ struct sev_es_save_area *save = svm->sev_es.vmsa;
+
+ /* Check some debug related fields before encrypting the VMSA */
+ if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1))
+ return -EINVAL;
+
+ /*
+ * SEV-ES will use a VMSA that is pointed to by the VMCB, not
+ * the traditional VMSA that is part of the VMCB. Copy the
+ * traditional VMSA as it has been built so far (in prep
+ * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state.
+ */
+ memcpy(save, &svm->vmcb->save, sizeof(svm->vmcb->save));
+
+ /* Sync registers */
+ save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX];
+ save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX];
+ save->rcx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ save->rdx = svm->vcpu.arch.regs[VCPU_REGS_RDX];
+ save->rsp = svm->vcpu.arch.regs[VCPU_REGS_RSP];
+ save->rbp = svm->vcpu.arch.regs[VCPU_REGS_RBP];
+ save->rsi = svm->vcpu.arch.regs[VCPU_REGS_RSI];
+ save->rdi = svm->vcpu.arch.regs[VCPU_REGS_RDI];
+#ifdef CONFIG_X86_64
+ save->r8 = svm->vcpu.arch.regs[VCPU_REGS_R8];
+ save->r9 = svm->vcpu.arch.regs[VCPU_REGS_R9];
+ save->r10 = svm->vcpu.arch.regs[VCPU_REGS_R10];
+ save->r11 = svm->vcpu.arch.regs[VCPU_REGS_R11];
+ save->r12 = svm->vcpu.arch.regs[VCPU_REGS_R12];
+ save->r13 = svm->vcpu.arch.regs[VCPU_REGS_R13];
+ save->r14 = svm->vcpu.arch.regs[VCPU_REGS_R14];
+ save->r15 = svm->vcpu.arch.regs[VCPU_REGS_R15];
+#endif
+ save->rip = svm->vcpu.arch.regs[VCPU_REGS_RIP];
+
+ /* Sync some non-GPR registers before encrypting */
+ save->xcr0 = svm->vcpu.arch.xcr0;
+ save->pkru = svm->vcpu.arch.pkru;
+ save->xss = svm->vcpu.arch.ia32_xss;
+ save->dr6 = svm->vcpu.arch.dr6;
+
+ if (sev_es_debug_swap_enabled)
+ save->sev_features |= SVM_SEV_FEAT_DEBUG_SWAP;
+
+ pr_debug("Virtual Machine Save Area (VMSA):\n");
+ print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false);
+
+ return 0;
+}
+
+static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu,
+ int *error)
+{
+ struct sev_data_launch_update_vmsa vmsa;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ if (vcpu->guest_debug) {
+ pr_warn_once("KVM_SET_GUEST_DEBUG for SEV-ES guest is not supported");
+ return -EINVAL;
+ }
+
+ /* Perform some pre-encryption checks against the VMSA */
+ ret = sev_es_sync_vmsa(svm);
+ if (ret)
+ return ret;
+
+ /*
+ * The LAUNCH_UPDATE_VMSA command will perform in-place encryption of
+ * the VMSA memory content (i.e. it will write the same memory region
+ * with the guest's key), so invalidate it first.
+ */
+ clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE);
+
+ vmsa.reserved = 0;
+ vmsa.handle = to_kvm_svm(kvm)->sev_info.handle;
+ vmsa.address = __sme_pa(svm->sev_es.vmsa);
+ vmsa.len = PAGE_SIZE;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error);
+ if (ret)
+ return ret;
+
+ vcpu->arch.guest_state_protected = true;
+ return 0;
+}
+
+static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ int ret;
+
+ if (!sev_es_guest(kvm))
+ return -ENOTTY;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ ret = mutex_lock_killable(&vcpu->mutex);
+ if (ret)
+ return ret;
+
+ ret = __sev_launch_update_vmsa(kvm, vcpu, &argp->error);
+
+ mutex_unlock(&vcpu->mutex);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ void __user *measure = (void __user *)(uintptr_t)argp->data;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_launch_measure data;
+ struct kvm_sev_launch_measure params;
+ void __user *p = NULL;
+ void *blob = NULL;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, measure, sizeof(params)))
+ return -EFAULT;
+
+ memset(&data, 0, sizeof(data));
+
+ /* User wants to query the blob length */
+ if (!params.len)
+ goto cmd;
+
+ p = (void __user *)(uintptr_t)params.uaddr;
+ if (p) {
+ if (params.len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
+
+ blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT);
+ if (!blob)
+ return -ENOMEM;
+
+ data.address = __psp_pa(blob);
+ data.len = params.len;
+ }
+
+cmd:
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, &data, &argp->error);
+
+ /*
+ * If we only queried the blob length, the FW responded with the
+ * expected data, i.e. the required length.
+ */
+ if (!params.len)
+ goto done;
+
+ if (ret)
+ goto e_free_blob;
+
+ if (blob) {
+ if (copy_to_user(p, blob, params.len))
+ ret = -EFAULT;
+ }
+
+done:
+ params.len = data.len;
+ if (copy_to_user(measure, &params, sizeof(params)))
+ ret = -EFAULT;
+e_free_blob:
+ kfree(blob);
+ return ret;
+}
+
+static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_launch_finish data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error);
+}
+
+static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_guest_status params;
+ struct sev_data_guest_status data;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ memset(&data, 0, sizeof(data));
+
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error);
+ if (ret)
+ return ret;
+
+ params.policy = data.policy;
+ params.state = data.state;
+ params.handle = data.handle;
+
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
+ unsigned long dst, int size,
+ int *error, bool enc)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_dbg data;
+
+ data.reserved = 0;
+ data.handle = sev->handle;
+ data.dst_addr = dst;
+ data.src_addr = src;
+ data.len = size;
+
+ return sev_issue_cmd(kvm,
+ enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
+ &data, error);
+}
+
+static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
+ unsigned long dst_paddr, int sz, int *err)
+{
+ int offset;
+
+ /*
+ * It's safe to read more than was asked for; the caller must ensure
+ * that the destination has enough space.
+ */
+ offset = src_paddr & 15;
+ src_paddr = round_down(src_paddr, 16);
+ sz = round_up(sz + offset, 16);
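+ /*
+ * e.g. src_paddr = 0x1003 and sz = 0x11 become offset = 3 and a
+ * 32-byte read of the aligned region starting at 0x1000.
+ */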
+
+ return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
+}
+
+static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
+ void __user *dst_uaddr,
+ unsigned long dst_paddr,
+ int size, int *err)
+{
+ struct page *tpage = NULL;
+ int ret, offset;
+
+ /* If the inputs are not 16-byte aligned then use an intermediate buffer */
+ if (!IS_ALIGNED(dst_paddr, 16) ||
+ !IS_ALIGNED(paddr, 16) ||
+ !IS_ALIGNED(size, 16)) {
+ tpage = (void *)alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!tpage)
+ return -ENOMEM;
+
+ dst_paddr = __sme_page_pa(tpage);
+ }
+
+ ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err);
+ if (ret)
+ goto e_free;
+
+ if (tpage) {
+ offset = paddr & 15;
+ if (copy_to_user(dst_uaddr, page_address(tpage) + offset, size))
+ ret = -EFAULT;
+ }
+
+e_free:
+ if (tpage)
+ __free_page(tpage);
+
+ return ret;
+}
+
+static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
+ void __user *vaddr,
+ unsigned long dst_paddr,
+ void __user *dst_vaddr,
+ int size, int *error)
+{
+ struct page *src_tpage = NULL;
+ struct page *dst_tpage = NULL;
+ int ret, len = size;
+
+ /* If the source buffer is not 16-byte aligned then use an intermediate buffer */
+ if (!IS_ALIGNED((unsigned long)vaddr, 16)) {
+ src_tpage = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!src_tpage)
+ return -ENOMEM;
+
+ if (copy_from_user(page_address(src_tpage), vaddr, size)) {
+ __free_page(src_tpage);
+ return -EFAULT;
+ }
+
+ paddr = __sme_page_pa(src_tpage);
+ }
+
+ /*
+ * If destination buffer or length is not aligned then do read-modify-write:
+ * - decrypt destination in an intermediate buffer
+ * - copy the source buffer in an intermediate buffer
+ * - use the intermediate buffer as source buffer
+ */
+ if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
+ int dst_offset;
+
+ dst_tpage = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!dst_tpage) {
+ ret = -ENOMEM;
+ goto e_free;
+ }
+
+ ret = __sev_dbg_decrypt(kvm, dst_paddr,
+ __sme_page_pa(dst_tpage), size, error);
+ if (ret)
+ goto e_free;
+
+ /*
+ * If the source is a kernel buffer then use memcpy(), otherwise
+ * copy_from_user().
+ */
+ dst_offset = dst_paddr & 15;
+
+ if (src_tpage)
+ memcpy(page_address(dst_tpage) + dst_offset,
+ page_address(src_tpage), size);
+ else {
+ if (copy_from_user(page_address(dst_tpage) + dst_offset,
+ vaddr, size)) {
+ ret = -EFAULT;
+ goto e_free;
+ }
+ }
+
+ paddr = __sme_page_pa(dst_tpage);
+ dst_paddr = round_down(dst_paddr, 16);
+ len = round_up(size, 16);
+ }
+
+ ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true);
+
+e_free:
+ if (src_tpage)
+ __free_page(src_tpage);
+ if (dst_tpage)
+ __free_page(dst_tpage);
+ return ret;
+}
+
+static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
+{
+ unsigned long vaddr, vaddr_end, next_vaddr;
+ unsigned long dst_vaddr;
+ struct page **src_p, **dst_p;
+ struct kvm_sev_dbg debug;
+ unsigned long n;
+ unsigned int size;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug)))
+ return -EFAULT;
+
+ if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
+ return -EINVAL;
+ if (!debug.dst_uaddr)
+ return -EINVAL;
+
+ vaddr = debug.src_uaddr;
+ size = debug.len;
+ vaddr_end = vaddr + size;
+ dst_vaddr = debug.dst_uaddr;
+
+ for (; vaddr < vaddr_end; vaddr = next_vaddr) {
+ int len, s_off, d_off;
+
+ /* lock userspace source and destination page */
+ src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0);
+ if (IS_ERR(src_p))
+ return PTR_ERR(src_p);
+
+ dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1);
+ if (IS_ERR(dst_p)) {
+ sev_unpin_memory(kvm, src_p, n);
+ return PTR_ERR(dst_p);
+ }
+
+ /*
+ * Flush (on non-coherent CPUs) before DBG_{DE,EN}CRYPT read or modify
+ * the pages; flush the destination too so that future accesses do not
+ * see stale data.
+ */
+ sev_clflush_pages(src_p, 1);
+ sev_clflush_pages(dst_p, 1);
+
+ /*
+ * Since the user buffers may not be page-aligned, calculate the
+ * offsets within the page.
+ */
+ s_off = vaddr & ~PAGE_MASK;
+ d_off = dst_vaddr & ~PAGE_MASK;
+ len = min_t(size_t, (PAGE_SIZE - s_off), size);
+
+ if (dec)
+ ret = __sev_dbg_decrypt_user(kvm,
+ __sme_page_pa(src_p[0]) + s_off,
+ (void __user *)dst_vaddr,
+ __sme_page_pa(dst_p[0]) + d_off,
+ len, &argp->error);
+ else
+ ret = __sev_dbg_encrypt_user(kvm,
+ __sme_page_pa(src_p[0]) + s_off,
+ (void __user *)vaddr,
+ __sme_page_pa(dst_p[0]) + d_off,
+ (void __user *)dst_vaddr,
+ len, &argp->error);
+
+ sev_unpin_memory(kvm, src_p, n);
+ sev_unpin_memory(kvm, dst_p, n);
+
+ if (ret)
+ goto err;
+
+ next_vaddr = vaddr + len;
+ dst_vaddr = dst_vaddr + len;
+ size -= len;
+ }
+err:
+ return ret;
+}
+
+static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_launch_secret data;
+ struct kvm_sev_launch_secret params;
+ struct page **pages;
+ void *blob, *hdr;
+ unsigned long n, i;
+ int ret, offset;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ /*
+ * Flush (on non-coherent CPUs) before LAUNCH_SECRET encrypts pages in
+ * place; the cache may contain the data that was written unencrypted.
+ */
+ sev_clflush_pages(pages, n);
+
+ /*
+ * The secret must be copied into a contiguous memory region; verify
+ * that the userspace pages are physically contiguous before issuing
+ * the command.
+ */
+ if (get_num_contig_pages(0, pages, n) != n) {
+ ret = -EINVAL;
+ goto e_unpin_memory;
+ }
+
+ memset(&data, 0, sizeof(data));
+
+ offset = params.guest_uaddr & (PAGE_SIZE - 1);
+ data.guest_address = __sme_page_pa(pages[0]) + offset;
+ data.guest_len = params.guest_len;
+
+ blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
+ if (IS_ERR(blob)) {
+ ret = PTR_ERR(blob);
+ goto e_unpin_memory;
+ }
+
+ data.trans_address = __psp_pa(blob);
+ data.trans_len = params.trans_len;
+
+ hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
+ if (IS_ERR(hdr)) {
+ ret = PTR_ERR(hdr);
+ goto e_free_blob;
+ }
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
+
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error);
+
+ kfree(hdr);
+
+e_free_blob:
+ kfree(blob);
+e_unpin_memory:
+ /* The memory contents were updated, mark the pages dirty */
+ for (i = 0; i < n; i++) {
+ set_page_dirty_lock(pages[i]);
+ mark_page_accessed(pages[i]);
+ }
+ sev_unpin_memory(kvm, pages, n);
+ return ret;
+}
+
+static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ void __user *report = (void __user *)(uintptr_t)argp->data;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_attestation_report data;
+ struct kvm_sev_attestation_report params;
+ void __user *p;
+ void *blob = NULL;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ memset(&data, 0, sizeof(data));
+
+ /* User wants to query the blob length */
+ if (!params.len)
+ goto cmd;
+
+ p = (void __user *)(uintptr_t)params.uaddr;
+ if (p) {
+ if (params.len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
+
+ blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT);
+ if (!blob)
+ return -ENOMEM;
+
+ data.address = __psp_pa(blob);
+ data.len = params.len;
+ memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce));
+ }
+cmd:
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error);
+ /*
+ * If we only queried the blob length, the FW responded with the
+ * expected data, i.e. the required length.
+ */
+ if (!params.len)
+ goto done;
+
+ if (ret)
+ goto e_free_blob;
+
+ if (blob) {
+ if (copy_to_user(p, blob, params.len))
+ ret = -EFAULT;
+ }
+
+done:
+ params.len = data.len;
+ if (copy_to_user(report, &params, sizeof(params)))
+ ret = -EFAULT;
+e_free_blob:
+ kfree(blob);
+ return ret;
+}
+
+/* Userspace wants to query session length. */
+static int
+__sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
+ struct kvm_sev_send_start *params)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_start data;
+ int ret;
+
+ memset(&data, 0, sizeof(data));
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
+
+ params->session_len = data.session_len;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+ sizeof(struct kvm_sev_send_start)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_start data;
+ struct kvm_sev_send_start params;
+ void *amd_certs, *session_data;
+ void *pdh_cert, *plat_certs;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_send_start)))
+ return -EFAULT;
+
+ /* if session_len is zero, userspace wants to query the session length */
+ if (!params.session_len)
+ return __sev_send_start_query_session_length(kvm, argp,
+ &params);
+
+ /* some sanity checks */
+ if (!params.pdh_cert_uaddr || !params.pdh_cert_len ||
+ !params.session_uaddr || params.session_len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
+
+ /* allocate the memory to hold the session data blob */
+ session_data = kzalloc(params.session_len, GFP_KERNEL_ACCOUNT);
+ if (!session_data)
+ return -ENOMEM;
+
+ /* copy the certificate blobs from userspace */
+ pdh_cert = psp_copy_user_blob(params.pdh_cert_uaddr,
+ params.pdh_cert_len);
+ if (IS_ERR(pdh_cert)) {
+ ret = PTR_ERR(pdh_cert);
+ goto e_free_session;
+ }
+
+ plat_certs = psp_copy_user_blob(params.plat_certs_uaddr,
+ params.plat_certs_len);
+ if (IS_ERR(plat_certs)) {
+ ret = PTR_ERR(plat_certs);
+ goto e_free_pdh;
+ }
+
+ amd_certs = psp_copy_user_blob(params.amd_certs_uaddr,
+ params.amd_certs_len);
+ if (IS_ERR(amd_certs)) {
+ ret = PTR_ERR(amd_certs);
+ goto e_free_plat_cert;
+ }
+
+ /* populate the FW SEND_START field with system physical address */
+ memset(&data, 0, sizeof(data));
+ data.pdh_cert_address = __psp_pa(pdh_cert);
+ data.pdh_cert_len = params.pdh_cert_len;
+ data.plat_certs_address = __psp_pa(plat_certs);
+ data.plat_certs_len = params.plat_certs_len;
+ data.amd_certs_address = __psp_pa(amd_certs);
+ data.amd_certs_len = params.amd_certs_len;
+ data.session_address = __psp_pa(session_data);
+ data.session_len = params.session_len;
+ data.handle = sev->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
+
+ if (!ret && copy_to_user((void __user *)(uintptr_t)params.session_uaddr,
+ session_data, params.session_len)) {
+ ret = -EFAULT;
+ goto e_free_amd_cert;
+ }
+
+ params.policy = data.policy;
+ params.session_len = data.session_len;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, &params,
+ sizeof(struct kvm_sev_send_start)))
+ ret = -EFAULT;
+
+e_free_amd_cert:
+ kfree(amd_certs);
+e_free_plat_cert:
+ kfree(plat_certs);
+e_free_pdh:
+ kfree(pdh_cert);
+e_free_session:
+ kfree(session_data);
+ return ret;
+}
+
+/* Userspace wants to query either header or trans length. */
+static int
+__sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
+ struct kvm_sev_send_update_data *params)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_update_data data;
+ int ret;
+
+ memset(&data, 0, sizeof(data));
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
+
+ params->hdr_len = data.hdr_len;
+ params->trans_len = data.trans_len;
+
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+ sizeof(struct kvm_sev_send_update_data)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_update_data data;
+ struct kvm_sev_send_update_data params;
+ void *hdr, *trans_data;
+ struct page **guest_page;
+ unsigned long n;
+ int ret, offset;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_send_update_data)))
+ return -EFAULT;
+
+ /* userspace wants to query either header or trans length */
+ if (!params.trans_len || !params.hdr_len)
+ return __sev_send_update_data_query_lengths(kvm, argp, &params);
+
+ if (!params.trans_uaddr || !params.guest_uaddr ||
+ !params.guest_len || !params.hdr_uaddr)
+ return -EINVAL;
+
+ /* Check if we are crossing a page boundary */
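+ /* (e.g. guest_uaddr = 0x1ff0 with guest_len = 0x20 would cross one) */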
+ offset = params.guest_uaddr & (PAGE_SIZE - 1);
+ if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE)
+ return -EINVAL;
+
+ /* Pin guest memory */
+ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+ PAGE_SIZE, &n, 0);
+ if (IS_ERR(guest_page))
+ return PTR_ERR(guest_page);
+
+ /* allocate memory for header and transport buffer */
+ ret = -ENOMEM;
+ hdr = kzalloc(params.hdr_len, GFP_KERNEL_ACCOUNT);
+ if (!hdr)
+ goto e_unpin;
+
+ trans_data = kzalloc(params.trans_len, GFP_KERNEL_ACCOUNT);
+ if (!trans_data)
+ goto e_free_hdr;
+
+ memset(&data, 0, sizeof(data));
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
+ data.trans_address = __psp_pa(trans_data);
+ data.trans_len = params.trans_len;
+
+ /* The SEND_UPDATE_DATA command requires the C-bit to always be set. */
+ data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+ data.guest_address |= sev_me_mask;
+ data.guest_len = params.guest_len;
+ data.handle = sev->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
+
+ if (ret)
+ goto e_free_trans_data;
+
+ /* copy transport buffer to user space */
+ if (copy_to_user((void __user *)(uintptr_t)params.trans_uaddr,
+ trans_data, params.trans_len)) {
+ ret = -EFAULT;
+ goto e_free_trans_data;
+ }
+
+ /* Copy packet header to userspace. */
+ if (copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr,
+ params.hdr_len))
+ ret = -EFAULT;
+
+e_free_trans_data:
+ kfree(trans_data);
+e_free_hdr:
+ kfree(hdr);
+e_unpin:
+ sev_unpin_memory(kvm, guest_page, n);
+
+ return ret;
+}
+
+static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_finish data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error);
+}
+
+static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_cancel data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error);
+}
+
+static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_receive_start start;
+ struct kvm_sev_receive_start params;
+ int *error = &argp->error;
+ void *session_data;
+ void *pdh_data;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ /* Get the parameters from userspace */
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_receive_start)))
+ return -EFAULT;
+
+ /* some sanity checks */
+ if (!params.pdh_uaddr || !params.pdh_len ||
+ !params.session_uaddr || !params.session_len)
+ return -EINVAL;
+
+ pdh_data = psp_copy_user_blob(params.pdh_uaddr, params.pdh_len);
+ if (IS_ERR(pdh_data))
+ return PTR_ERR(pdh_data);
+
+ session_data = psp_copy_user_blob(params.session_uaddr,
+ params.session_len);
+ if (IS_ERR(session_data)) {
+ ret = PTR_ERR(session_data);
+ goto e_free_pdh;
+ }
+
+ memset(&start, 0, sizeof(start));
+ start.handle = params.handle;
+ start.policy = params.policy;
+ start.pdh_cert_address = __psp_pa(pdh_data);
+ start.pdh_cert_len = params.pdh_len;
+ start.session_address = __psp_pa(session_data);
+ start.session_len = params.session_len;
+
+ /* create memory encryption context */
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_RECEIVE_START, &start,
+ error);
+ if (ret)
+ goto e_free_session;
+
+ /* Bind ASID to this guest */
+ ret = sev_bind_asid(kvm, start.handle, error);
+ if (ret) {
+ sev_decommission(start.handle);
+ goto e_free_session;
+ }
+
+ params.handle = start.handle;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data,
+ &params, sizeof(struct kvm_sev_receive_start))) {
+ ret = -EFAULT;
+ sev_unbind_asid(kvm, start.handle);
+ goto e_free_session;
+ }
+
+ sev->handle = start.handle;
+ sev->fd = argp->sev_fd;
+
+e_free_session:
+ kfree(session_data);
+e_free_pdh:
+ kfree(pdh_data);
+
+ return ret;
+}
+
+static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_receive_update_data params;
+ struct sev_data_receive_update_data data;
+ void *hdr = NULL, *trans = NULL;
+ struct page **guest_page;
+ unsigned long n;
+ int ret, offset;
+
+ if (!sev_guest(kvm))
+ return -EINVAL;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_receive_update_data)))
+ return -EFAULT;
+
+ if (!params.hdr_uaddr || !params.hdr_len ||
+ !params.guest_uaddr || !params.guest_len ||
+ !params.trans_uaddr || !params.trans_len)
+ return -EINVAL;
+
+ /* Check if we are crossing a page boundary */
+ offset = params.guest_uaddr & (PAGE_SIZE - 1);
+ if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE)
+ return -EINVAL;
+
+ hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
+ if (IS_ERR(hdr))
+ return PTR_ERR(hdr);
+
+ trans = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto e_free_hdr;
+ }
+
+ memset(&data, 0, sizeof(data));
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
+ data.trans_address = __psp_pa(trans);
+ data.trans_len = params.trans_len;
+
+ /* Pin guest memory */
+ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+ PAGE_SIZE, &n, 1);
+ if (IS_ERR(guest_page)) {
+ ret = PTR_ERR(guest_page);
+ goto e_free_trans;
+ }
+
+ /*
+ * Flush (on non-coherent CPUs) before RECEIVE_UPDATE_DATA: the PSP
+ * encrypts the written data with the guest's key, and the cache may
+ * contain dirty, unencrypted data.
+ */
+ sev_clflush_pages(guest_page, n);
+
+ /* The RECEIVE_UPDATE_DATA command requires the C-bit to always be set. */
+ data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+ data.guest_address |= sev_me_mask;
+ data.guest_len = params.guest_len;
+ data.handle = sev->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data,
+ &argp->error);
+
+ sev_unpin_memory(kvm, guest_page, n);
+
+e_free_trans:
+ kfree(trans);
+e_free_hdr:
+ kfree(hdr);
+
+ return ret;
+}
+
+static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_receive_finish data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error);
+}
+
+static bool is_cmd_allowed_from_mirror(u32 cmd_id)
+{
+ /*
+ * Allow mirror VMs to call KVM_SEV_LAUNCH_UPDATE_VMSA so that SEV-ES
+ * can be enabled on their vCPUs. Also allow the debugging and status
+ * commands.
+ */
+ if (cmd_id == KVM_SEV_LAUNCH_UPDATE_VMSA ||
+ cmd_id == KVM_SEV_GUEST_STATUS || cmd_id == KVM_SEV_DBG_DECRYPT ||
+ cmd_id == KVM_SEV_DBG_ENCRYPT)
+ return true;
+
+ return false;
+}
+
+static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
+{
+ struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
+ struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
+ int r = -EBUSY;
+
+ if (dst_kvm == src_kvm)
+ return -EINVAL;
+
+ /*
+ * Bail if these VMs are already involved in a migration to avoid
+ * deadlock between two VMs trying to migrate to/from each other.
+ */
+ if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1))
+ return -EBUSY;
+
+ if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1))
+ goto release_dst;
+
+ r = -EINTR;
+ if (mutex_lock_killable(&dst_kvm->lock))
+ goto release_src;
+ if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING))
+ goto unlock_dst;
+ return 0;
+
+unlock_dst:
+ mutex_unlock(&dst_kvm->lock);
+release_src:
+ atomic_set_release(&src_sev->migration_in_progress, 0);
+release_dst:
+ atomic_set_release(&dst_sev->migration_in_progress, 0);
+ return r;
+}
+
+static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
+{
+ struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
+ struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
+
+ mutex_unlock(&dst_kvm->lock);
+ mutex_unlock(&src_kvm->lock);
+ atomic_set_release(&dst_sev->migration_in_progress, 0);
+ atomic_set_release(&src_sev->migration_in_progress, 0);
+}
+
+/* vCPU mutex subclasses. */
+enum sev_migration_role {
+ SEV_MIGRATION_SOURCE = 0,
+ SEV_MIGRATION_TARGET,
+ SEV_NR_MIGRATION_ROLES,
+};
+
+static int sev_lock_vcpus_for_migration(struct kvm *kvm,
+ enum sev_migration_role role)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i, j;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (mutex_lock_killable_nested(&vcpu->mutex, role))
+ goto out_unlock;
+
+#ifdef CONFIG_PROVE_LOCKING
+ if (!i)
+ /*
+ * Reset the role to one that avoids colliding with
+ * the role used for the first vcpu mutex.
+ */
+ role = SEV_NR_MIGRATION_ROLES;
+ else
+ mutex_release(&vcpu->mutex.dep_map, _THIS_IP_);
+#endif
+ }
+
+ return 0;
+
+out_unlock:
+
+ kvm_for_each_vcpu(j, vcpu, kvm) {
+ if (i == j)
+ break;
+
+#ifdef CONFIG_PROVE_LOCKING
+ if (j)
+ mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_);
+#endif
+
+ mutex_unlock(&vcpu->mutex);
+ }
+ return -EINTR;
+}
+
+static void sev_unlock_vcpus_for_migration(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ bool first = true;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (first)
+ first = false;
+ else
+ mutex_acquire(&vcpu->mutex.dep_map,
+ SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_);
+
+ mutex_unlock(&vcpu->mutex);
+ }
+}
+
+static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
+{
+ struct kvm_sev_info *dst = &to_kvm_svm(dst_kvm)->sev_info;
+ struct kvm_sev_info *src = &to_kvm_svm(src_kvm)->sev_info;
+ struct kvm_vcpu *dst_vcpu, *src_vcpu;
+ struct vcpu_svm *dst_svm, *src_svm;
+ struct kvm_sev_info *mirror;
+ unsigned long i;
+
+ dst->active = true;
+ dst->asid = src->asid;
+ dst->handle = src->handle;
+ dst->pages_locked = src->pages_locked;
+ dst->enc_context_owner = src->enc_context_owner;
+ dst->es_active = src->es_active;
+
+ src->asid = 0;
+ src->active = false;
+ src->handle = 0;
+ src->pages_locked = 0;
+ src->enc_context_owner = NULL;
+ src->es_active = false;
+
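+ /* Move the entire regions_list from the source to the destination. */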
+ list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
+
+ /*
+ * If this VM has mirrors, "transfer" each mirror's refcount of the
+ * source to the destination (this KVM). The caller holds a reference
+ * to the source, so there's no danger of use-after-free.
+ */
+ list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms);
+ list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) {
+ kvm_get_kvm(dst_kvm);
+ kvm_put_kvm(src_kvm);
+ mirror->enc_context_owner = dst_kvm;
+ }
+
+ /*
+ * If this VM is a mirror, remove the old mirror from the owners list
+ * and add the new mirror to the list.
+ */
+ if (is_mirroring_enc_context(dst_kvm)) {
+ struct kvm_sev_info *owner_sev_info =
+ &to_kvm_svm(dst->enc_context_owner)->sev_info;
+
+ list_del(&src->mirror_entry);
+ list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms);
+ }
+
+ kvm_for_each_vcpu(i, dst_vcpu, dst_kvm) {
+ dst_svm = to_svm(dst_vcpu);
+
+ sev_init_vmcb(dst_svm);
+
+ if (!dst->es_active)
+ continue;
+
+ /*
+ * Note, the source is not required to have the same number of
+ * vCPUs as the destination when migrating a vanilla SEV VM.
+ */
+ src_vcpu = kvm_get_vcpu(src_kvm, i);
+ src_svm = to_svm(src_vcpu);
+
+ /*
+ * Transfer VMSA and GHCB state to the destination. Nullify and
+ * clear source fields as appropriate, the state now belongs to
+ * the destination.
+ */
+ memcpy(&dst_svm->sev_es, &src_svm->sev_es, sizeof(src_svm->sev_es));
+ dst_svm->vmcb->control.ghcb_gpa = src_svm->vmcb->control.ghcb_gpa;
+ dst_svm->vmcb->control.vmsa_pa = src_svm->vmcb->control.vmsa_pa;
+ dst_vcpu->arch.guest_state_protected = true;
+
+ memset(&src_svm->sev_es, 0, sizeof(src_svm->sev_es));
+ src_svm->vmcb->control.ghcb_gpa = INVALID_PAGE;
+ src_svm->vmcb->control.vmsa_pa = INVALID_PAGE;
+ src_vcpu->arch.guest_state_protected = false;
+ }
+}
+
+static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src)
+{
+ struct kvm_vcpu *src_vcpu;
+ unsigned long i;
+
+ if (!sev_es_guest(src))
+ return 0;
+
+ if (atomic_read(&src->online_vcpus) != atomic_read(&dst->online_vcpus))
+ return -EINVAL;
+
+ kvm_for_each_vcpu(i, src_vcpu, src) {
+ if (!src_vcpu->arch.guest_state_protected)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
+{
+ struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *src_sev, *cg_cleanup_sev;
+ struct fd f = fdget(source_fd);
+ struct kvm *source_kvm;
+ bool charged = false;
+ int ret;
+
+ if (!f.file)
+ return -EBADF;
+
+ if (!file_is_kvm(f.file)) {
+ ret = -EBADF;
+ goto out_fput;
+ }
+
+ source_kvm = f.file->private_data;
+ ret = sev_lock_two_vms(kvm, source_kvm);
+ if (ret)
+ goto out_fput;
+
+ if (sev_guest(kvm) || !sev_guest(source_kvm)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ src_sev = &to_kvm_svm(source_kvm)->sev_info;
+
+ dst_sev->misc_cg = get_current_misc_cg();
+ cg_cleanup_sev = dst_sev;
+ if (dst_sev->misc_cg != src_sev->misc_cg) {
+ ret = sev_misc_cg_try_charge(dst_sev);
+ if (ret)
+ goto out_dst_cgroup;
+ charged = true;
+ }
+
+ ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE);
+ if (ret)
+ goto out_dst_cgroup;
+ ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET);
+ if (ret)
+ goto out_dst_vcpu;
+
+ ret = sev_check_source_vcpus(kvm, source_kvm);
+ if (ret)
+ goto out_source_vcpu;
+
+ sev_migrate_from(kvm, source_kvm);
+ kvm_vm_dead(source_kvm);
+ cg_cleanup_sev = src_sev;
+ ret = 0;
+
+out_source_vcpu:
+ sev_unlock_vcpus_for_migration(source_kvm);
+out_dst_vcpu:
+ sev_unlock_vcpus_for_migration(kvm);
+out_dst_cgroup:
+ /* Operates on the source on success, on the destination on failure. */
+ if (charged)
+ sev_misc_cg_uncharge(cg_cleanup_sev);
+ put_misc_cg(cg_cleanup_sev->misc_cg);
+ cg_cleanup_sev->misc_cg = NULL;
+out_unlock:
+ sev_unlock_two_vms(kvm, source_kvm);
+out_fput:
+ fdput(f);
+ return ret;
+}
+
+int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_sev_cmd sev_cmd;
+ int r;
+
+ if (!sev_enabled)
+ return -ENOTTY;
+
+ if (!argp)
+ return 0;
+
+ if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd)))
+ return -EFAULT;
+
+ mutex_lock(&kvm->lock);
+
+ /* Only the enc_context_owner handles some memory enc operations. */
+ if (is_mirroring_enc_context(kvm) &&
+ !is_cmd_allowed_from_mirror(sev_cmd.id)) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ switch (sev_cmd.id) {
+ case KVM_SEV_ES_INIT:
+ if (!sev_es_enabled) {
+ r = -ENOTTY;
+ goto out;
+ }
+ fallthrough;
+ case KVM_SEV_INIT:
+ r = sev_guest_init(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_START:
+ r = sev_launch_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_UPDATE_DATA:
+ r = sev_launch_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_UPDATE_VMSA:
+ r = sev_launch_update_vmsa(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_MEASURE:
+ r = sev_launch_measure(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_FINISH:
+ r = sev_launch_finish(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_GUEST_STATUS:
+ r = sev_guest_status(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_DBG_DECRYPT:
+ r = sev_dbg_crypt(kvm, &sev_cmd, true);
+ break;
+ case KVM_SEV_DBG_ENCRYPT:
+ r = sev_dbg_crypt(kvm, &sev_cmd, false);
+ break;
+ case KVM_SEV_LAUNCH_SECRET:
+ r = sev_launch_secret(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_GET_ATTESTATION_REPORT:
+ r = sev_get_attestation_report(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_START:
+ r = sev_send_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_UPDATE_DATA:
+ r = sev_send_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_FINISH:
+ r = sev_send_finish(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_CANCEL:
+ r = sev_send_cancel(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_START:
+ r = sev_receive_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_UPDATE_DATA:
+ r = sev_receive_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_FINISH:
+ r = sev_receive_finish(kvm, &sev_cmd);
+ break;
+ default:
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd)))
+ r = -EFAULT;
+
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+int sev_mem_enc_register_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct enc_region *region;
+ int ret = 0;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ /* If kvm is mirroring an encryption context, it isn't responsible for it */
+ if (is_mirroring_enc_context(kvm))
+ return -EINVAL;
+
+ if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
+ return -EINVAL;
+
+ region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
+ if (!region)
+ return -ENOMEM;
+
+ mutex_lock(&kvm->lock);
+ region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
+ if (IS_ERR(region->pages)) {
+ ret = PTR_ERR(region->pages);
+ mutex_unlock(&kvm->lock);
+ goto e_free;
+ }
+
+ region->uaddr = range->addr;
+ region->size = range->size;
+
+ list_add_tail(&region->list, &sev->regions_list);
+ mutex_unlock(&kvm->lock);
+
+ /*
+ * The guest may change the memory encryption attribute from C=0 -> C=1
+ * or vice versa for this memory range. Let's make sure the caches are
+ * flushed to ensure that guest data gets written into memory with the
+ * correct C-bit.
+ */
+ sev_clflush_pages(region->pages, region->npages);
+
+ return ret;
+
+e_free:
+ kfree(region);
+ return ret;
+}
+
+static struct enc_region *
+find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct list_head *head = &sev->regions_list;
+ struct enc_region *i;
+
+ list_for_each_entry(i, head, list) {
+ if (i->uaddr == range->addr &&
+ i->size == range->size)
+ return i;
+ }
+
+ return NULL;
+}
+
+static void __unregister_enc_region_locked(struct kvm *kvm,
+ struct enc_region *region)
+{
+ sev_unpin_memory(kvm, region->pages, region->npages);
+ list_del(&region->list);
+ kfree(region);
+}
+
+int sev_mem_enc_unregister_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
+{
+ struct enc_region *region;
+ int ret;
+
+ /* If kvm is mirroring an encryption context, it isn't responsible for it */
+ if (is_mirroring_enc_context(kvm))
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+
+ if (!sev_guest(kvm)) {
+ ret = -ENOTTY;
+ goto failed;
+ }
+
+ region = find_enc_region(kvm, range);
+ if (!region) {
+ ret = -EINVAL;
+ goto failed;
+ }
+
+ /*
+ * Ensure that all guest tagged cache entries are flushed before
+ * releasing the pages back to the system for use. CLFLUSH will
+ * not do this, so issue a WBINVD.
+ */
+ wbinvd_on_all_cpus();
+
+ __unregister_enc_region_locked(kvm, region);
+
+ mutex_unlock(&kvm->lock);
+ return 0;
+
+failed:
+ mutex_unlock(&kvm->lock);
+ return ret;
+}
+
+int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
+{
+ struct fd f = fdget(source_fd);
+ struct kvm *source_kvm;
+ struct kvm_sev_info *source_sev, *mirror_sev;
+ int ret;
+
+ if (!f.file)
+ return -EBADF;
+
+ if (!file_is_kvm(f.file)) {
+ ret = -EBADF;
+ goto e_source_fput;
+ }
+
+ source_kvm = f.file->private_data;
+ ret = sev_lock_two_vms(kvm, source_kvm);
+ if (ret)
+ goto e_source_fput;
+
+ /*
+ * Mirrors of mirrors should work, but let's not get silly. Also
+ * disallow out-of-band SEV/SEV-ES init if the target is already an
+ * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being
+ * created after SEV/SEV-ES initialization, e.g. to init intercepts.
+ */
+ if (sev_guest(kvm) || !sev_guest(source_kvm) ||
+ is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) {
+ ret = -EINVAL;
+ goto e_unlock;
+ }
+
+ /*
+ * The mirror kvm holds an enc_context_owner ref so its ASID can't
+ * disappear until we're done with it.
+ */
+ source_sev = &to_kvm_svm(source_kvm)->sev_info;
+ kvm_get_kvm(source_kvm);
+ mirror_sev = &to_kvm_svm(kvm)->sev_info;
+ list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
+
+ /* Set enc_context_owner and copy its encryption context over */
+ mirror_sev->enc_context_owner = source_kvm;
+ mirror_sev->active = true;
+ mirror_sev->asid = source_sev->asid;
+ mirror_sev->fd = source_sev->fd;
+ mirror_sev->es_active = source_sev->es_active;
+ mirror_sev->handle = source_sev->handle;
+ INIT_LIST_HEAD(&mirror_sev->regions_list);
+ INIT_LIST_HEAD(&mirror_sev->mirror_vms);
+ ret = 0;
+
+ /*
+ * Do not copy ap_jump_table: the mirror does not share the same KVM
+ * context as the original, and the two may have different memory
+ * views.
+ */
+
+e_unlock:
+ sev_unlock_two_vms(kvm, source_kvm);
+e_source_fput:
+ fdput(f);
+ return ret;
+}
+
+void sev_vm_destroy(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct list_head *head = &sev->regions_list;
+ struct list_head *pos, *q;
+
+ if (!sev_guest(kvm))
+ return;
+
+ WARN_ON(!list_empty(&sev->mirror_vms));
+
+ /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
+ if (is_mirroring_enc_context(kvm)) {
+ struct kvm *owner_kvm = sev->enc_context_owner;
+
+ mutex_lock(&owner_kvm->lock);
+ list_del(&sev->mirror_entry);
+ mutex_unlock(&owner_kvm->lock);
+ kvm_put_kvm(owner_kvm);
+ return;
+ }
+
+ /*
+ * Ensure that all guest tagged cache entries are flushed before
+ * releasing the pages back to the system for use. CLFLUSH will
+ * not do this, so issue a WBINVD.
+ */
+ wbinvd_on_all_cpus();
+
+ /*
+ * If userspace was terminated before unregistering the memory regions,
+ * unpin all the registered memory.
+ */
+ if (!list_empty(head)) {
+ list_for_each_safe(pos, q, head) {
+ __unregister_enc_region_locked(kvm,
+ list_entry(pos, struct enc_region, list));
+ cond_resched();
+ }
+ }
+
+ sev_unbind_asid(kvm, sev->handle);
+ sev_asid_free(sev);
+}
+
+void __init sev_set_cpu_caps(void)
+{
+ if (!sev_enabled)
+ kvm_cpu_cap_clear(X86_FEATURE_SEV);
+ if (!sev_es_enabled)
+ kvm_cpu_cap_clear(X86_FEATURE_SEV_ES);
+}
+
+void __init sev_hardware_setup(void)
+{
+#ifdef CONFIG_KVM_AMD_SEV
+ unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
+ bool sev_es_supported = false;
+ bool sev_supported = false;
+
+ if (!sev_enabled || !npt_enabled || !nrips)
+ goto out;
+
+ /*
+ * SEV must obviously be supported in hardware. Sanity check that the
+ * CPU supports decode assists, which is mandatory for SEV guests to
+ * support instruction emulation.
+ */
+ if (!boot_cpu_has(X86_FEATURE_SEV) ||
+ WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)))
+ goto out;
+
+ /* Retrieve SEV CPUID information */
+ cpuid(0x8000001f, &eax, &ebx, &ecx, &edx);
+
+ /* Set encryption bit location for SEV-ES guests */
+ sev_enc_bit = ebx & 0x3f;
+
+ /* Maximum number of encrypted guests supported simultaneously */
+ max_sev_asid = ecx;
+ if (!max_sev_asid)
+ goto out;
+
+ /* Minimum ASID value that should be used for an SEV guest */
+ min_sev_asid = edx;
+ sev_me_mask = 1UL << (ebx & 0x3f);
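+ /*
+ * e.g. a reported C-bit position of 47 yields sev_enc_bit = 47 and
+ * sev_me_mask = 1UL << 47.
+ */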
+
+ /*
+ * Initialize SEV ASID bitmaps. Allocate space for ASID 0 in the bitmap,
+ * even though it's never used, so that the bitmap is indexed by the
+ * actual ASID.
+ */
+ nr_asids = max_sev_asid + 1;
+ sev_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL);
+ if (!sev_asid_bitmap)
+ goto out;
+
+ sev_reclaim_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL);
+ if (!sev_reclaim_asid_bitmap) {
+ bitmap_free(sev_asid_bitmap);
+ sev_asid_bitmap = NULL;
+ goto out;
+ }
+
+ sev_asid_count = max_sev_asid - min_sev_asid + 1;
+ WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count));
+ sev_supported = true;
+
+ /* SEV-ES support requested? */
+ if (!sev_es_enabled)
+ goto out;
+
+ /*
+ * SEV-ES requires MMIO caching as KVM doesn't have access to the guest
+ * instruction stream, i.e. can't emulate in response to a #NPF and
+ * instead relies on #NPF(RSVD) being reflected into the guest as #VC
+ * (the guest can then do a #VMGEXIT to request MMIO emulation).
+ */
+ if (!enable_mmio_caching)
+ goto out;
+
+ /* Does the CPU support SEV-ES? */
+ if (!boot_cpu_has(X86_FEATURE_SEV_ES))
+ goto out;
+
+ /* Has the system been allocated ASIDs for SEV-ES? */
+ if (min_sev_asid == 1)
+ goto out;
+
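+ /* e.g. min_sev_asid = 100 leaves ASIDs 1 - 99 for SEV-ES guests */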
+ sev_es_asid_count = min_sev_asid - 1;
+ WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count));
+ sev_es_supported = true;
+
+out:
+ if (boot_cpu_has(X86_FEATURE_SEV))
+ pr_info("SEV %s (ASIDs %u - %u)\n",
+ sev_supported ? "enabled" : "disabled",
+ min_sev_asid, max_sev_asid);
+ if (boot_cpu_has(X86_FEATURE_SEV_ES))
+ pr_info("SEV-ES %s (ASIDs %u - %u)\n",
+ sev_es_supported ? "enabled" : "disabled",
+ min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1);
+
+ sev_enabled = sev_supported;
+ sev_es_enabled = sev_es_supported;
+ if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) ||
+ !cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP))
+ sev_es_debug_swap_enabled = false;
+#endif
+}
+
+void sev_hardware_unsetup(void)
+{
+ if (!sev_enabled)
+ return;
+
+ /* No need to take sev_bitmap_lock, all VMs have been destroyed. */
+ sev_flush_asids(1, max_sev_asid);
+
+ bitmap_free(sev_asid_bitmap);
+ bitmap_free(sev_reclaim_asid_bitmap);
+
+ misc_cg_set_capacity(MISC_CG_RES_SEV, 0);
+ misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0);
+}
+
+int sev_cpu_init(struct svm_cpu_data *sd)
+{
+ if (!sev_enabled)
+ return 0;
+
+ sd->sev_vmcbs = kcalloc(nr_asids, sizeof(void *), GFP_KERNEL);
+ if (!sd->sev_vmcbs)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/*
+ * Pages used by hardware to hold guest encrypted state must be flushed before
+ * returning them to the system.
+ */
+static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
+{
+ int asid = to_kvm_svm(vcpu->kvm)->sev_info.asid;
+
+ /*
+ * Note! The address must be a kernel address, as regular page walk
+ * checks are performed by VM_PAGE_FLUSH, i.e. operating on a user
+ * address is non-deterministic and unsafe. This function deliberately
+ * takes a pointer to deter passing in a user address.
+ */
+ unsigned long addr = (unsigned long)va;
+
+ /*
+ * If the CPU enforces cache coherency for encrypted mappings of the
+ * same physical page, a cache line flush is sufficient. NOTE: the
+ * cache flush is still needed in order to work properly with DMA
+ * devices.
+ */
+ if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) {
+ clflush_cache_range(va, PAGE_SIZE);
+ return;
+ }
+
+ /*
+ * VM Page Flush takes a host virtual address and a guest ASID. Fall
+ * back to WBINVD if this faults so as not to make any problems worse
+ * by leaving stale encrypted data in the cache.
+ */
+ if (WARN_ON_ONCE(wrmsrl_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid)))
+ goto do_wbinvd;
+
+ return;
+
+do_wbinvd:
+ wbinvd_on_all_cpus();
+}
+
+void sev_guest_memory_reclaimed(struct kvm *kvm)
+{
+ if (!sev_guest(kvm))
+ return;
+
+ wbinvd_on_all_cpus();
+}
+
+void sev_free_vcpu(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm;
+
+ if (!sev_es_guest(vcpu->kvm))
+ return;
+
+ svm = to_svm(vcpu);
+
+ if (vcpu->arch.guest_state_protected)
+ sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);
+
+ __free_page(virt_to_page(svm->sev_es.vmsa));
+
+ if (svm->sev_es.ghcb_sa_free)
+ kvfree(svm->sev_es.ghcb_sa);
+}
+
+static void dump_ghcb(struct vcpu_svm *svm)
+{
+ struct ghcb *ghcb = svm->sev_es.ghcb;
+ unsigned int nbits;
+
+ /* Re-use the dump_invalid_vmcb module parameter */
+ if (!dump_invalid_vmcb) {
+ pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
+ return;
+ }
+
+ nbits = sizeof(ghcb->save.valid_bitmap) * 8;
+
+ pr_err("GHCB (GPA=%016llx):\n", svm->vmcb->control.ghcb_gpa);
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code",
+ ghcb->save.sw_exit_code, ghcb_sw_exit_code_is_valid(ghcb));
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1",
+ ghcb->save.sw_exit_info_1, ghcb_sw_exit_info_1_is_valid(ghcb));
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2",
+ ghcb->save.sw_exit_info_2, ghcb_sw_exit_info_2_is_valid(ghcb));
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch",
+ ghcb->save.sw_scratch, ghcb_sw_scratch_is_valid(ghcb));
+ pr_err("%-20s%*pb\n", "valid_bitmap", nbits, ghcb->save.valid_bitmap);
+}
+
+static void sev_es_sync_to_ghcb(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct ghcb *ghcb = svm->sev_es.ghcb;
+
+ /*
+ * The GHCB protocol so far allows for the following data
+ * to be returned:
+ * GPRs RAX, RBX, RCX, RDX
+ *
+ * Copy their values, even if they may not have been written during the
+ * VM-Exit. It's the guest's responsibility to not consume random data.
+ */
+ ghcb_set_rax(ghcb, vcpu->arch.regs[VCPU_REGS_RAX]);
+ ghcb_set_rbx(ghcb, vcpu->arch.regs[VCPU_REGS_RBX]);
+ ghcb_set_rcx(ghcb, vcpu->arch.regs[VCPU_REGS_RCX]);
+ ghcb_set_rdx(ghcb, vcpu->arch.regs[VCPU_REGS_RDX]);
+}
+
+static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct ghcb *ghcb = svm->sev_es.ghcb;
+ u64 exit_code;
+
+ /*
+ * The GHCB protocol so far allows for the following data
+ * to be supplied:
+ * GPRs RAX, RBX, RCX, RDX
+ * XCR0
+ * CPL
+ *
+ * VMMCALL allows the guest to provide extra registers. KVM also
+ * expects RSI for hypercalls, so include that, too.
+ *
+ * Copy their values to the appropriate location if supplied.
+ */
+ memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
+
+ BUILD_BUG_ON(sizeof(svm->sev_es.valid_bitmap) != sizeof(ghcb->save.valid_bitmap));
+ memcpy(&svm->sev_es.valid_bitmap, &ghcb->save.valid_bitmap, sizeof(ghcb->save.valid_bitmap));
+
+ vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm, ghcb);
+ vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm, ghcb);
+ vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm, ghcb);
+ vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm, ghcb);
+ vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm, ghcb);
+
+ svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm, ghcb);
+
+ if (kvm_ghcb_xcr0_is_valid(svm)) {
+ vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb);
+ kvm_update_cpuid_runtime(vcpu);
+ }
+
+ /* Copy the GHCB exit information into the VMCB fields */
+ exit_code = ghcb_get_sw_exit_code(ghcb);
+ control->exit_code = lower_32_bits(exit_code);
+ control->exit_code_hi = upper_32_bits(exit_code);
+ control->exit_info_1 = ghcb_get_sw_exit_info_1(ghcb);
+ control->exit_info_2 = ghcb_get_sw_exit_info_2(ghcb);
+ svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm, ghcb);
+
+ /* Clear the valid entries fields */
+ memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
+}
+
+static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control)
+{
+ return (((u64)control->exit_code_hi) << 32) | control->exit_code;
+}
+
+static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u64 exit_code;
+ u64 reason;
+
+ /*
+ * Retrieve the exit code now even though it may not be marked valid
+ * as it could help with debugging.
+ */
+ exit_code = kvm_ghcb_get_sw_exit_code(control);
+
+ /* Only GHCB Usage code 0 is supported */
+ if (svm->sev_es.ghcb->ghcb_usage) {
+ reason = GHCB_ERR_INVALID_USAGE;
+ goto vmgexit_err;
+ }
+
+ reason = GHCB_ERR_MISSING_INPUT;
+
+ if (!kvm_ghcb_sw_exit_code_is_valid(svm) ||
+ !kvm_ghcb_sw_exit_info_1_is_valid(svm) ||
+ !kvm_ghcb_sw_exit_info_2_is_valid(svm))
+ goto vmgexit_err;
+
+ switch (exit_code) {
+ case SVM_EXIT_READ_DR7:
+ break;
+ case SVM_EXIT_WRITE_DR7:
+ if (!kvm_ghcb_rax_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_RDTSC:
+ break;
+ case SVM_EXIT_RDPMC:
+ if (!kvm_ghcb_rcx_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_CPUID:
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rcx_is_valid(svm))
+ goto vmgexit_err;
+ if (vcpu->arch.regs[VCPU_REGS_RAX] == 0xd)
+ if (!kvm_ghcb_xcr0_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_INVD:
+ break;
+ case SVM_EXIT_IOIO:
+ if (control->exit_info_1 & SVM_IOIO_STR_MASK) {
+ if (!kvm_ghcb_sw_scratch_is_valid(svm))
+ goto vmgexit_err;
+ } else {
+ if (!(control->exit_info_1 & SVM_IOIO_TYPE_MASK))
+ if (!kvm_ghcb_rax_is_valid(svm))
+ goto vmgexit_err;
+ }
+ break;
+ case SVM_EXIT_MSR:
+ if (!kvm_ghcb_rcx_is_valid(svm))
+ goto vmgexit_err;
+ if (control->exit_info_1) {
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rdx_is_valid(svm))
+ goto vmgexit_err;
+ }
+ break;
+ case SVM_EXIT_VMMCALL:
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_cpl_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_RDTSCP:
+ break;
+ case SVM_EXIT_WBINVD:
+ break;
+ case SVM_EXIT_MONITOR:
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rcx_is_valid(svm) ||
+ !kvm_ghcb_rdx_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_MWAIT:
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rcx_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_VMGEXIT_MMIO_READ:
+ case SVM_VMGEXIT_MMIO_WRITE:
+ if (!kvm_ghcb_sw_scratch_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_VMGEXIT_NMI_COMPLETE:
+ case SVM_VMGEXIT_AP_HLT_LOOP:
+ case SVM_VMGEXIT_AP_JUMP_TABLE:
+ case SVM_VMGEXIT_UNSUPPORTED_EVENT:
+ break;
+ default:
+ reason = GHCB_ERR_INVALID_EVENT;
+ goto vmgexit_err;
+ }
+
+ return 0;
+
+vmgexit_err:
+ if (reason == GHCB_ERR_INVALID_USAGE) {
+ vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n",
+ svm->sev_es.ghcb->ghcb_usage);
+ } else if (reason == GHCB_ERR_INVALID_EVENT) {
+ vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n",
+ exit_code);
+ } else {
+ vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n",
+ exit_code);
+ dump_ghcb(svm);
+ }
+
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, reason);
+
+ /* Resume the guest to "return" the error code. */
+ return 1;
+}
+
+void sev_es_unmap_ghcb(struct vcpu_svm *svm)
+{
+ if (!svm->sev_es.ghcb)
+ return;
+
+ if (svm->sev_es.ghcb_sa_free) {
+ /*
+ * The scratch area lives outside the GHCB, so there is a
+ * buffer that, depending on the operation performed, may
+ * need to be synced, then freed.
+ */
+ if (svm->sev_es.ghcb_sa_sync) {
+ kvm_write_guest(svm->vcpu.kvm,
+ svm->sev_es.sw_scratch,
+ svm->sev_es.ghcb_sa,
+ svm->sev_es.ghcb_sa_len);
+ svm->sev_es.ghcb_sa_sync = false;
+ }
+
+ kvfree(svm->sev_es.ghcb_sa);
+ svm->sev_es.ghcb_sa = NULL;
+ svm->sev_es.ghcb_sa_free = false;
+ }
+
+ trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->sev_es.ghcb);
+
+ sev_es_sync_to_ghcb(svm);
+
+ kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map, true);
+ svm->sev_es.ghcb = NULL;
+}
+
+void pre_sev_run(struct vcpu_svm *svm, int cpu)
+{
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
+ int asid = sev_get_asid(svm->vcpu.kvm);
+
+ /* Assign the ASID allocated to this SEV guest */
+ svm->asid = asid;
+
+ /*
+ * Flush guest TLB:
+ *
+ * 1) when a different VMCB for the same ASID is to be run on the same host CPU, or
+ * 2) when this VMCB was executed on a different host CPU in previous VMRUNs.
+ */
+ if (sd->sev_vmcbs[asid] == svm->vmcb &&
+ svm->vcpu.arch.last_vmentry_cpu == cpu)
+ return;
+
+ sd->sev_vmcbs[asid] = svm->vmcb;
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+ vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
+}
+
+#define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE)
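+/*
+ * Map the guest-supplied scratch area for a VMGEXIT. Two layouts are
+ * possible: the scratch area either lives inside the GHCB's shared
+ * buffer, in which case it is used in place, or it is a separate guest
+ * buffer that must be bounced through a bounded kernel allocation.
+ * Illustrative sketch (offsets per the GHCB layout, assumed here rather
+ * than taken from this file): with the shared buffer spanning
+ * [ghcb_gpa + 0x800, ghcb_gpa + 0xff0), a scratch GPA of
+ * ghcb_gpa + 0x800 with len 0x100 is used in place, while any scratch
+ * GPA outside the GHCB page is copied in via kvm_read_guest(), capped
+ * at GHCB_SCRATCH_AREA_LIMIT (16 pages).
+ */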
+static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ u64 ghcb_scratch_beg, ghcb_scratch_end;
+ u64 scratch_gpa_beg, scratch_gpa_end;
+ void *scratch_va;
+
+ scratch_gpa_beg = svm->sev_es.sw_scratch;
+ if (!scratch_gpa_beg) {
+ pr_err("vmgexit: scratch gpa not provided\n");
+ goto e_scratch;
+ }
+
+ scratch_gpa_end = scratch_gpa_beg + len;
+ if (scratch_gpa_end < scratch_gpa_beg) {
+ pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n",
+ len, scratch_gpa_beg);
+ goto e_scratch;
+ }
+
+ if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) {
+ /* Scratch area begins within GHCB */
+ ghcb_scratch_beg = control->ghcb_gpa +
+ offsetof(struct ghcb, shared_buffer);
+ ghcb_scratch_end = control->ghcb_gpa +
+ offsetof(struct ghcb, reserved_0xff0);
+
+ /*
+ * If the scratch area begins within the GHCB, it must be
+ * completely contained in the GHCB shared buffer area.
+ */
+ if (scratch_gpa_beg < ghcb_scratch_beg ||
+ scratch_gpa_end > ghcb_scratch_end) {
+ pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n",
+ scratch_gpa_beg, scratch_gpa_end);
+ goto e_scratch;
+ }
+
+ scratch_va = (void *)svm->sev_es.ghcb;
+ scratch_va += (scratch_gpa_beg - control->ghcb_gpa);
+ } else {
+ /*
+ * The guest memory must be read into a kernel buffer, so
+ * limit the size.
+ */
+ if (len > GHCB_SCRATCH_AREA_LIMIT) {
+ pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n",
+ len, GHCB_SCRATCH_AREA_LIMIT);
+ goto e_scratch;
+ }
+ scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT);
+ if (!scratch_va)
+ return -ENOMEM;
+
+ if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) {
+ /* Unable to copy scratch area from guest */
+ pr_err("vmgexit: kvm_read_guest for scratch area failed\n");
+
+ kvfree(scratch_va);
+ return -EFAULT;
+ }
+
+ /*
+ * The scratch area is outside the GHCB. The operation will
+ * dictate whether the buffer needs to be synced before running
+ * the vCPU next time (i.e. a read was requested so the data
+ * must be written back to the guest memory).
+ */
+ svm->sev_es.ghcb_sa_sync = sync;
+ svm->sev_es.ghcb_sa_free = true;
+ }
+
+ svm->sev_es.ghcb_sa = scratch_va;
+ svm->sev_es.ghcb_sa_len = len;
+
+ return 0;
+
+e_scratch:
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_SCRATCH_AREA);
+
+ return 1;
+}
+
+static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask,
+ unsigned int pos)
+{
+ svm->vmcb->control.ghcb_gpa &= ~(mask << pos);
+ svm->vmcb->control.ghcb_gpa |= (value & mask) << pos;
+}
+
+static u64 get_ghcb_msr_bits(struct vcpu_svm *svm, u64 mask, unsigned int pos)
+{
+ return (svm->vmcb->control.ghcb_gpa >> pos) & mask;
+}
+
+static void set_ghcb_msr(struct vcpu_svm *svm, u64 value)
+{
+ svm->vmcb->control.ghcb_gpa = value;
+}
+
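+/*
+ * Handle the GHCB MSR protocol, used when the guest has not established
+ * a full GHCB page. Requests and responses are packed into the GHCB GPA
+ * MSR itself: per the GHCB specification, the GHCB_MSR_INFO_MASK bits
+ * carry the request/response code and the remaining bits carry the
+ * payload. Illustrative sketch of a CPUID round trip (the
+ * GHCB_MSR_CPUID_* field positions are assumed, not restated, here):
+ *
+ * guest: ghcb_gpa = CPUID_REQ | (reg << REG_POS) | (fn << FUNC_POS)
+ * host: ghcb_gpa = CPUID_RESP | (reg << REG_POS) | (val << VALUE_POS)
+ */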
+static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u64 ghcb_info;
+ int ret = 1;
+
+ ghcb_info = control->ghcb_gpa & GHCB_MSR_INFO_MASK;
+
+ trace_kvm_vmgexit_msr_protocol_enter(svm->vcpu.vcpu_id,
+ control->ghcb_gpa);
+
+ switch (ghcb_info) {
+ case GHCB_MSR_SEV_INFO_REQ:
+ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX,
+ GHCB_VERSION_MIN,
+ sev_enc_bit));
+ break;
+ case GHCB_MSR_CPUID_REQ: {
+ u64 cpuid_fn, cpuid_reg, cpuid_value;
+
+ cpuid_fn = get_ghcb_msr_bits(svm,
+ GHCB_MSR_CPUID_FUNC_MASK,
+ GHCB_MSR_CPUID_FUNC_POS);
+
+ /* Initialize the registers needed by the CPUID intercept */
+ vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn;
+ vcpu->arch.regs[VCPU_REGS_RCX] = 0;
+
+ ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID);
+ if (!ret) {
+ /* Error, keep GHCB MSR value as-is */
+ break;
+ }
+
+ cpuid_reg = get_ghcb_msr_bits(svm,
+ GHCB_MSR_CPUID_REG_MASK,
+ GHCB_MSR_CPUID_REG_POS);
+ if (cpuid_reg == 0)
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RAX];
+ else if (cpuid_reg == 1)
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RBX];
+ else if (cpuid_reg == 2)
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RCX];
+ else
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RDX];
+
+ set_ghcb_msr_bits(svm, cpuid_value,
+ GHCB_MSR_CPUID_VALUE_MASK,
+ GHCB_MSR_CPUID_VALUE_POS);
+
+ set_ghcb_msr_bits(svm, GHCB_MSR_CPUID_RESP,
+ GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ }
+ case GHCB_MSR_TERM_REQ: {
+ u64 reason_set, reason_code;
+
+ reason_set = get_ghcb_msr_bits(svm,
+ GHCB_MSR_TERM_REASON_SET_MASK,
+ GHCB_MSR_TERM_REASON_SET_POS);
+ reason_code = get_ghcb_msr_bits(svm,
+ GHCB_MSR_TERM_REASON_MASK,
+ GHCB_MSR_TERM_REASON_POS);
+ pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
+ reason_set, reason_code);
+
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
+ vcpu->run->system_event.ndata = 1;
+ vcpu->run->system_event.data[0] = control->ghcb_gpa;
+
+ return 0;
+ }
+ default:
+ /* Error, keep GHCB MSR value as-is */
+ break;
+ }
+
+ trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id,
+ control->ghcb_gpa, ret);
+
+ return ret;
+}
+
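+/*
+ * Top-level VMGEXIT handler. If any GHCB_MSR_INFO_MASK bits are set in
+ * the GHCB GPA, the guest is using the MSR protocol; otherwise the value
+ * is the page-aligned GPA of a full GHCB, which is mapped, validated and
+ * then dispatched on its software exit code.
+ */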
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ u64 ghcb_gpa, exit_code;
+ int ret;
+
+ /* Validate the GHCB */
+ ghcb_gpa = control->ghcb_gpa;
+ if (ghcb_gpa & GHCB_MSR_INFO_MASK)
+ return sev_handle_vmgexit_msr_protocol(svm);
+
+ if (!ghcb_gpa) {
+ vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n");
+
+ /* Without a GHCB, just return right back to the guest */
+ return 1;
+ }
+
+ if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) {
+ /* Unable to map GHCB from guest */
+ vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
+ ghcb_gpa);
+
+ /* Without a GHCB, just return right back to the guest */
+ return 1;
+ }
+
+ svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva;
+
+ trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb);
+
+ sev_es_sync_from_ghcb(svm);
+ ret = sev_es_validate_vmgexit(svm);
+ if (ret)
+ return ret;
+
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 0);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 0);
+
+ exit_code = kvm_ghcb_get_sw_exit_code(control);
+ switch (exit_code) {
+ case SVM_VMGEXIT_MMIO_READ:
+ ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
+ if (ret)
+ break;
+
+ ret = kvm_sev_es_mmio_read(vcpu,
+ control->exit_info_1,
+ control->exit_info_2,
+ svm->sev_es.ghcb_sa);
+ break;
+ case SVM_VMGEXIT_MMIO_WRITE:
+ ret = setup_vmgexit_scratch(svm, false, control->exit_info_2);
+ if (ret)
+ break;
+
+ ret = kvm_sev_es_mmio_write(vcpu,
+ control->exit_info_1,
+ control->exit_info_2,
+ svm->sev_es.ghcb_sa);
+ break;
+ case SVM_VMGEXIT_NMI_COMPLETE:
+ ++vcpu->stat.nmi_window_exits;
+ svm->nmi_masked = false;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ ret = 1;
+ break;
+ case SVM_VMGEXIT_AP_HLT_LOOP:
+ ret = kvm_emulate_ap_reset_hold(vcpu);
+ break;
+ case SVM_VMGEXIT_AP_JUMP_TABLE: {
+ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
+
+ switch (control->exit_info_1) {
+ case 0:
+ /* Set AP jump table address */
+ sev->ap_jump_table = control->exit_info_2;
+ break;
+ case 1:
+ /* Get AP jump table address */
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, sev->ap_jump_table);
+ break;
+ default:
+ pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n",
+ control->exit_info_1);
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT);
+ }
+
+ ret = 1;
+ break;
+ }
+ case SVM_VMGEXIT_UNSUPPORTED_EVENT:
+ vcpu_unimpl(vcpu,
+ "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
+ control->exit_info_1, control->exit_info_2);
+ ret = -EINVAL;
+ break;
+ default:
+ ret = svm_invoke_exit_handler(vcpu, exit_code);
+ }
+
+ return ret;
+}
+
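+/*
+ * Handle string I/O for an SEV-ES guest. exit_info_2 carries the rep
+ * count, so e.g. a "rep outsw" with a count of 5 and a size of 2 needs
+ * a 10-byte scratch buffer; count * size is checked for overflow before
+ * sizing the bounce buffer.
+ */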
+int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
+{
+ int count;
+ int bytes;
+ int r;
+
+ if (svm->vmcb->control.exit_info_2 > INT_MAX)
+ return -EINVAL;
+
+ count = svm->vmcb->control.exit_info_2;
+ if (unlikely(check_mul_overflow(count, size, &bytes)))
+ return -EINVAL;
+
+ r = setup_vmgexit_scratch(svm, in, bytes);
+ if (r)
+ return r;
+
+ return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa,
+ count, in);
+}
+
+static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) {
+ bool v_tsc_aux = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
+
+ set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux);
+ }
+}
+
+void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_cpuid_entry2 *best;
+
+ /* For SEV guests, the memory encryption bit is not reserved in CR3. */
+ best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
+ if (best)
+ vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
+
+ if (sev_es_guest(svm->vcpu.kvm))
+ sev_es_vcpu_after_set_cpuid(svm);
+}
+
+static void sev_es_init_vmcb(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE;
+ svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
+
+ /*
+ * An SEV-ES guest requires a VMSA area that is separate from the
+ * VMCB page. Do not include the encryption mask on the VMSA physical
+ * address since hardware will access it using the guest key. Note,
+ * the VMSA will be NULL if this vCPU is the destination for intrahost
+ * migration, and will be copied later.
+ */
+ if (svm->sev_es.vmsa)
+ svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
+
+ /* Can't intercept CR register access, HV can't modify CR registers */
+ svm_clr_intercept(svm, INTERCEPT_CR0_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR4_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR8_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
+ svm_clr_intercept(svm, INTERCEPT_CR4_WRITE);
+ svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
+
+ svm_clr_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+
+ /* Track EFER/CR register changes */
+ svm_set_intercept(svm, TRAP_EFER_WRITE);
+ svm_set_intercept(svm, TRAP_CR0_WRITE);
+ svm_set_intercept(svm, TRAP_CR4_WRITE);
+ svm_set_intercept(svm, TRAP_CR8_WRITE);
+
+ vmcb->control.intercepts[INTERCEPT_DR] = 0;
+ if (!sev_es_debug_swap_enabled) {
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
+ recalc_intercepts(svm);
+ } else {
+ /*
+ * Disable #DB intercept iff DebugSwap is enabled. KVM doesn't
+ * allow debugging SEV-ES guests, and enables DebugSwap iff
+ * NO_NESTED_DATA_BP is supported, so there's no reason to
+ * intercept #DB when DebugSwap is enabled. For simplicity
+ * with respect to guest debug, intercept #DB for other VMs
+ * even if NO_NESTED_DATA_BP is supported, i.e. even if the
+ * guest can't DoS the CPU with infinite #DB vectoring.
+ */
+ clr_exception_intercept(svm, DB_VECTOR);
+ }
+
+ /* Can't intercept XSETBV, HV can't modify XCR0 directly */
+ svm_clr_intercept(svm, INTERCEPT_XSETBV);
+
+ /* Clear intercepts on selected MSRs */
+ set_msr_interception(vcpu, svm->msrpm, MSR_EFER, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_CR_PAT, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+}
+
+void sev_init_vmcb(struct vcpu_svm *svm)
+{
+ svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
+ clr_exception_intercept(svm, UD_VECTOR);
+
+ /*
+ * Don't intercept #GP for SEV guests, e.g. for the VMware backdoor, as
+ * KVM can't decrypt guest memory to decode the faulting instruction.
+ */
+ clr_exception_intercept(svm, GP_VECTOR);
+
+ if (sev_es_guest(svm->vcpu.kvm))
+ sev_es_init_vmcb(svm);
+}
+
+void sev_es_vcpu_reset(struct vcpu_svm *svm)
+{
+ /*
+ * Set the GHCB MSR value as per the GHCB specification when emulating
+ * vCPU RESET for an SEV-ES guest.
+ */
+ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX,
+ GHCB_VERSION_MIN,
+ sev_enc_bit));
+}
+
+void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa)
+{
+ /*
+ * All host state for SEV-ES guests is categorized into three swap types
+ * based on how it is handled by hardware during a world switch:
+ *
+ * A: VMRUN: Host state saved in host save area
+ * VMEXIT: Host state loaded from host save area
+ *
+ * B: VMRUN: Host state _NOT_ saved in host save area
+ * VMEXIT: Host state loaded from host save area
+ *
+ * C: VMRUN: Host state _NOT_ saved in host save area
+ * VMEXIT: Host state initialized to default(reset) values
+ *
+ * Manually save type-B state, i.e. state that is loaded by VMEXIT but
+ * isn't saved by VMRUN, that isn't already saved by VMSAVE (performed
+ * by common SVM code).
+ */
+ hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ hostsa->pkru = read_pkru();
+ hostsa->xss = host_xss;
+
+ /*
+ * If DebugSwap is enabled, debug registers are loaded but NOT saved by
+ * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both
+ * saves and loads debug registers (Type-A).
+ */
+ if (sev_es_debug_swap_enabled) {
+ hostsa->dr0 = native_get_debugreg(0);
+ hostsa->dr1 = native_get_debugreg(1);
+ hostsa->dr2 = native_get_debugreg(2);
+ hostsa->dr3 = native_get_debugreg(3);
+ hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0);
+ hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1);
+ hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2);
+ hostsa->dr3_addr_mask = amd_get_dr_addr_mask(3);
+ }
+}
+
+void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /* First SIPI: Use the values as initially set by the VMM */
+ if (!svm->sev_es.received_first_sipi) {
+ svm->sev_es.received_first_sipi = true;
+ return;
+ }
+
+ /*
+ * Subsequent SIPI: Return from an AP Reset Hold VMGEXIT, where
+ * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a
+ * non-zero value.
+ */
+ if (!svm->sev_es.ghcb)
+ return;
+
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
+}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
new file mode 100644
index 000000000..77f1eeefc
--- /dev/null
+++ b/arch/x86/kvm/svm/svm.c
@@ -0,0 +1,5381 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+
+#include "irq.h"
+#include "mmu.h"
+#include "kvm_cache_regs.h"
+#include "x86.h"
+#include "smm.h"
+#include "cpuid.h"
+#include "pmu.h"
+
+#include <linux/module.h>
+#include <linux/mod_devicetable.h>
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/amd-iommu.h>
+#include <linux/sched.h>
+#include <linux/trace_events.h>
+#include <linux/slab.h>
+#include <linux/hashtable.h>
+#include <linux/objtool.h>
+#include <linux/psp-sev.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/rwsem.h>
+#include <linux/cc_platform.h>
+#include <linux/smp.h>
+
+#include <asm/apic.h>
+#include <asm/perf_event.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+#include <asm/debugreg.h>
+#include <asm/kvm_para.h>
+#include <asm/irq_remapping.h>
+#include <asm/spec-ctrl.h>
+#include <asm/cpu_device_id.h>
+#include <asm/traps.h>
+#include <asm/reboot.h>
+#include <asm/fpu/api.h>
+
+#include <trace/events/ipi.h>
+
+#include "trace.h"
+
+#include "svm.h"
+#include "svm_ops.h"
+
+#include "kvm_onhyperv.h"
+#include "svm_onhyperv.h"
+
+MODULE_AUTHOR("Qumranet");
+MODULE_LICENSE("GPL");
+
+#ifdef MODULE
+static const struct x86_cpu_id svm_cpu_id[] = {
+ X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
+#endif
+
+#define SEG_TYPE_LDT 2
+#define SEG_TYPE_BUSY_TSS16 3
+
+static bool erratum_383_found __read_mostly;
+
+u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
+
+/*
+ * Set osvw_len to a higher value when updated Revision Guides
+ * are published and we know what the new status bits are
+ */
+static uint64_t osvw_len = 4, osvw_status;
+
+static DEFINE_PER_CPU(u64, current_tsc_ratio);
+
+#define X2APIC_MSR(x) (APIC_BASE_MSR + (x >> 4))
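+/*
+ * For example, X2APIC_MSR(APIC_ID) yields 0x802: APIC_ID is APIC MMIO
+ * offset 0x20, and (0x20 >> 4) added to APIC_BASE_MSR (0x800) selects
+ * the corresponding x2APIC MSR.
+ */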
+
+static const struct svm_direct_access_msrs {
+ u32 index; /* Index of the MSR */
+ bool always; /* True if intercept is initially cleared */
+} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
+ { .index = MSR_STAR, .always = true },
+ { .index = MSR_IA32_SYSENTER_CS, .always = true },
+ { .index = MSR_IA32_SYSENTER_EIP, .always = false },
+ { .index = MSR_IA32_SYSENTER_ESP, .always = false },
+#ifdef CONFIG_X86_64
+ { .index = MSR_GS_BASE, .always = true },
+ { .index = MSR_FS_BASE, .always = true },
+ { .index = MSR_KERNEL_GS_BASE, .always = true },
+ { .index = MSR_LSTAR, .always = true },
+ { .index = MSR_CSTAR, .always = true },
+ { .index = MSR_SYSCALL_MASK, .always = true },
+#endif
+ { .index = MSR_IA32_SPEC_CTRL, .always = false },
+ { .index = MSR_IA32_PRED_CMD, .always = false },
+ { .index = MSR_IA32_FLUSH_CMD, .always = false },
+ { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
+ { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
+ { .index = MSR_IA32_LASTINTFROMIP, .always = false },
+ { .index = MSR_IA32_LASTINTTOIP, .always = false },
+ { .index = MSR_EFER, .always = false },
+ { .index = MSR_IA32_CR_PAT, .always = false },
+ { .index = MSR_AMD64_SEV_ES_GHCB, .always = true },
+ { .index = MSR_TSC_AUX, .always = false },
+ { .index = X2APIC_MSR(APIC_ID), .always = false },
+ { .index = X2APIC_MSR(APIC_LVR), .always = false },
+ { .index = X2APIC_MSR(APIC_TASKPRI), .always = false },
+ { .index = X2APIC_MSR(APIC_ARBPRI), .always = false },
+ { .index = X2APIC_MSR(APIC_PROCPRI), .always = false },
+ { .index = X2APIC_MSR(APIC_EOI), .always = false },
+ { .index = X2APIC_MSR(APIC_RRR), .always = false },
+ { .index = X2APIC_MSR(APIC_LDR), .always = false },
+ { .index = X2APIC_MSR(APIC_DFR), .always = false },
+ { .index = X2APIC_MSR(APIC_SPIV), .always = false },
+ { .index = X2APIC_MSR(APIC_ISR), .always = false },
+ { .index = X2APIC_MSR(APIC_TMR), .always = false },
+ { .index = X2APIC_MSR(APIC_IRR), .always = false },
+ { .index = X2APIC_MSR(APIC_ESR), .always = false },
+ { .index = X2APIC_MSR(APIC_ICR), .always = false },
+ { .index = X2APIC_MSR(APIC_ICR2), .always = false },
+
+ /*
+ * Note:
+ * AMD does not virtualize APIC TSC-deadline timer mode, but it is
+ * emulated by KVM. When bit 18 of the APIC LVTT register (0x832) is
+ * set, the AVIC hardware generates a #GP fault. Therefore, always
+ * intercept MSR 0x832 and do not set up a direct_access_msrs entry for it.
+ */
+ { .index = X2APIC_MSR(APIC_LVTTHMR), .always = false },
+ { .index = X2APIC_MSR(APIC_LVTPC), .always = false },
+ { .index = X2APIC_MSR(APIC_LVT0), .always = false },
+ { .index = X2APIC_MSR(APIC_LVT1), .always = false },
+ { .index = X2APIC_MSR(APIC_LVTERR), .always = false },
+ { .index = X2APIC_MSR(APIC_TMICT), .always = false },
+ { .index = X2APIC_MSR(APIC_TMCCT), .always = false },
+ { .index = X2APIC_MSR(APIC_TDCR), .always = false },
+ { .index = MSR_INVALID, .always = false },
+};
+
+/*
+ * These 2 parameters are used to configure the controls for Pause-Loop Exiting:
+ * pause_filter_count: On processors that support pause filtering (indicated
+ * by CPUID Fn8000_000A_EDX), the VMCB provides a 16-bit pause filter
+ * count value. On VMRUN this value is loaded into an internal counter.
+ * Each time a pause instruction is executed, this counter is decremented
+ * until it reaches zero, at which time a #VMEXIT is generated if pause
+ * intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
+ * Intercept Filtering for more details.
+ * This value also indicates whether PLE logic is enabled.
+ *
+ * pause_filter_thresh: In addition, some processor families support advanced
+ * pause filtering (also indicated by CPUID Fn8000_000A_EDX), which places an
+ * upper bound on the amount of time a guest is allowed to execute in a pause
+ * loop.
+ * In this mode, a 16-bit pause filter threshold field is added in the
+ * VMCB. The threshold value is a cycle count that is used to reset the
+ * pause counter. As with simple pause filtering, VMRUN loads the pause
+ * count value from VMCB into an internal counter. Then, on each pause
+ * instruction the hardware checks the elapsed number of cycles since
+ * the most recent pause instruction against the pause filter threshold.
+ * If the elapsed cycle count is greater than the pause filter threshold,
+ * then the internal pause count is reloaded from the VMCB and execution
+ * continues. If the elapsed cycle count is less than the pause filter
+ * threshold, then the internal pause count is decremented. If the count
+ * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
+ * triggered. If advanced pause filtering is supported and pause filter
+ * threshold field is set to zero, the filter will operate in the simpler,
+ * count only mode.
+ */
+
+static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
+module_param(pause_filter_thresh, ushort, 0444);
+
+static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
+module_param(pause_filter_count, ushort, 0444);
+
+/* Default doubles per-vcpu window every exit. */
+static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
+module_param(pause_filter_count_grow, ushort, 0444);
+
+/* Default resets per-vcpu window every exit to pause_filter_count. */
+static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
+module_param(pause_filter_count_shrink, ushort, 0444);
+
+/* Default is to compute the maximum so we can never overflow. */
+static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
+module_param(pause_filter_count_max, ushort, 0444);
+
+/*
+ * Use nested page tables by default. Note, NPT may get forced off by
+ * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
+ */
+bool npt_enabled = true;
+module_param_named(npt, npt_enabled, bool, 0444);
+
+/* allow nested virtualization in KVM/SVM */
+static int nested = true;
+module_param(nested, int, S_IRUGO);
+
+/* enable/disable Next RIP Save */
+int nrips = true;
+module_param(nrips, int, 0444);
+
+/* enable/disable Virtual VMLOAD VMSAVE */
+static int vls = true;
+module_param(vls, int, 0444);
+
+/* enable/disable Virtual GIF */
+int vgif = true;
+module_param(vgif, int, 0444);
+
+/* enable/disable LBR virtualization */
+static int lbrv = true;
+module_param(lbrv, int, 0444);
+
+static int tsc_scaling = true;
+module_param(tsc_scaling, int, 0444);
+
+/*
+ * enable/disable AVIC. Because the defaults for APICv support differ
+ * between VMX and SVM, we cannot use module_param_named.
+ */
+static bool avic;
+module_param(avic, bool, 0444);
+
+bool __read_mostly dump_invalid_vmcb;
+module_param(dump_invalid_vmcb, bool, 0644);
+
+bool intercept_smi = true;
+module_param(intercept_smi, bool, 0444);
+
+bool vnmi = true;
+module_param(vnmi, bool, 0444);
+
+static bool svm_gp_erratum_intercept = true;
+
+static u8 rsm_ins_bytes[] = "\x0f\xaa";
+
+static unsigned long iopm_base;
+
+DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
+
+/*
+ * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via
+ * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
+ *
+ * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
+ * defer the restoration of TSC_AUX until the CPU returns to userspace.
+ */
+static int tsc_aux_uret_slot __read_mostly = -1;
+
+static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
+
+#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
+#define MSRS_RANGE_SIZE 2048
+#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
+
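+/*
+ * The MSR permission bitmap uses two bits (read and write intercept) per
+ * MSR, i.e. four MSRs per byte, with a 2048-byte region per MSR range.
+ * Worked example, derived from the definitions above: for MSR_EFER
+ * (0xc0000080), range 1 at 0xc0000000 matches, the byte offset is
+ * (0x80 / 4) + 2048 = 2080, and the returned u32 offset is 2080 / 4 = 520.
+ */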
+u32 svm_msrpm_offset(u32 msr)
+{
+ u32 offset;
+ int i;
+
+ for (i = 0; i < NUM_MSR_MAPS; i++) {
+ if (msr < msrpm_ranges[i] ||
+ msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
+ continue;
+
+ offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
+ offset += (i * MSRS_RANGE_SIZE); /* add range offset */
+
+ /* Now we have the u8 offset - but need the u32 offset */
+ return offset / 4;
+ }
+
+ /* MSR not in any range */
+ return MSR_INVALID;
+}
+
+static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);
+
+static int get_npt_level(void)
+{
+#ifdef CONFIG_X86_64
+ return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
+#else
+ return PT32E_ROOT_LEVEL;
+#endif
+}
+
+int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 old_efer = vcpu->arch.efer;
+
+ vcpu->arch.efer = efer;
+
+ if (!npt_enabled) {
+ /* Shadow paging assumes NX to be available. */
+ efer |= EFER_NX;
+
+ if (!(efer & EFER_LMA))
+ efer &= ~EFER_LME;
+ }
+
+ if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
+ if (!(efer & EFER_SVME)) {
+ svm_leave_nested(vcpu);
+ svm_set_gif(svm, true);
+ /* #GP intercept is still needed for vmware backdoor */
+ if (!enable_vmware_backdoor)
+ clr_exception_intercept(svm, GP_VECTOR);
+
+ /*
+ * Free the nested guest state, unless we are in SMM.
+ * In this case we will return to the nested guest
+ * as soon as we leave SMM.
+ */
+ if (!is_smm(vcpu))
+ svm_free_nested(svm);
+
+ } else {
+ int ret = svm_allocate_nested(svm);
+
+ if (ret) {
+ vcpu->arch.efer = old_efer;
+ return ret;
+ }
+
+ /*
+ * Never intercept #GP for SEV guests, KVM can't
+ * decrypt guest memory to work around the erratum.
+ */
+ if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
+ set_exception_intercept(svm, GP_VECTOR);
+ }
+ }
+
+ svm->vmcb->save.efer = efer | EFER_SVME;
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
+ return 0;
+}
+
+static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 ret = 0;
+
+ if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
+ ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
+ return ret;
+}
+
+static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (mask == 0)
+ svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
+ else
+ svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
+}
+
+static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len);
+
+static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
+ bool commit_side_effects)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long old_rflags;
+
+ /*
+ * SEV-ES does not expose the next RIP. The RIP update is controlled by
+ * the type of exit and the #VC handler in the guest.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ goto done;
+
+ if (nrips && svm->vmcb->control.next_rip != 0) {
+ WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
+ svm->next_rip = svm->vmcb->control.next_rip;
+ }
+
+ if (!svm->next_rip) {
+ /*
+ * FIXME: Drop this when kvm_emulate_instruction() does the
+ * right thing and treats "can't emulate" as outright failure
+ * for EMULTYPE_SKIP.
+ */
+ if (!svm_can_emulate_instruction(vcpu, EMULTYPE_SKIP, NULL, 0))
+ return 0;
+
+ if (unlikely(!commit_side_effects))
+ old_rflags = svm->vmcb->save.rflags;
+
+ if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
+ return 0;
+
+ if (unlikely(!commit_side_effects))
+ svm->vmcb->save.rflags = old_rflags;
+ } else {
+ kvm_rip_write(vcpu, svm->next_rip);
+ }
+
+done:
+ if (likely(commit_side_effects))
+ svm_set_interrupt_shadow(vcpu, 0);
+
+ return 1;
+}
+
+static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ return __svm_skip_emulated_instruction(vcpu, true);
+}
+
+static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
+{
+ unsigned long rip, old_rip = kvm_rip_read(vcpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * Due to architectural shortcomings, the CPU doesn't always provide
+ * NextRIP, e.g. if KVM intercepted an exception that occurred while
+ * the CPU was vectoring an INTO/INT3 in the guest. Temporarily skip
+ * the instruction even if NextRIP is supported to acquire the next
+ * RIP so that it can be shoved into the NextRIP field, otherwise
+ * hardware will fail to advance guest RIP during event injection.
+ * Drop the exception/interrupt if emulation fails and effectively
+ * retry the instruction, it's the least awful option. If NRIPS is
+ * in use, the skip must not commit any side effects such as clearing
+ * the interrupt shadow or RFLAGS.RF.
+ */
+ if (!__svm_skip_emulated_instruction(vcpu, !nrips))
+ return -EIO;
+
+ rip = kvm_rip_read(vcpu);
+
+ /*
+ * Save the injection information, even when using next_rip, as the
+ * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
+ * doesn't complete due to a VM-Exit occurring while the CPU is
+ * vectoring the event. Decoding the instruction isn't guaranteed to
+ * work as there may be no backing instruction, e.g. if the event is
+ * being injected by L1 for L2, or if the guest is patching INT3 into
+ * a different instruction.
+ */
+ svm->soft_int_injected = true;
+ svm->soft_int_csbase = svm->vmcb->save.cs.base;
+ svm->soft_int_old_rip = old_rip;
+ svm->soft_int_next_rip = rip;
+
+ if (nrips)
+ kvm_rip_write(vcpu, old_rip);
+
+ if (static_cpu_has(X86_FEATURE_NRIPS))
+ svm->vmcb->control.next_rip = rip;
+
+ return 0;
+}
+
+static void svm_inject_exception(struct kvm_vcpu *vcpu)
+{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ kvm_deliver_exception_payload(vcpu, ex);
+
+ if (kvm_exception_is_soft(ex->vector) &&
+ svm_update_soft_interrupt_rip(vcpu))
+ return;
+
+ svm->vmcb->control.event_inj = ex->vector
+ | SVM_EVTINJ_VALID
+ | (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
+ | SVM_EVTINJ_TYPE_EXEPT;
+ svm->vmcb->control.event_inj_err = ex->error_code;
+}
+
+static void svm_init_erratum_383(void)
+{
+ u32 low, high;
+ int err;
+ u64 val;
+
+ if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
+ return;
+
+ /* Use _safe variants to not break nested virtualization */
+ val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
+ if (err)
+ return;
+
+ val |= (1ULL << 47);
+
+ low = lower_32_bits(val);
+ high = upper_32_bits(val);
+
+ native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
+
+ erratum_383_found = true;
+}
+
+static void svm_init_osvw(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Guests should see errata 400 and 415 as fixed (assuming that
+ * HLT and IO instructions are intercepted).
+ */
+ vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
+ vcpu->arch.osvw.status = osvw_status & ~(6ULL);
+
+ /*
+ * By increasing the VCPU's osvw.length to 3 we are telling the guest that
+ * all osvw.status bits inside that length, including bit 0 (which is
+ * reserved for erratum 298), are valid. However, if the host processor's
+ * osvw_len is 0 then osvw_status[0] carries no information. We need to
+ * be conservative here and therefore we tell the guest that erratum 298
+ * is present (because we really don't know).
+ */
+ if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
+ vcpu->arch.osvw.status |= 1;
+}
+
+static bool __kvm_is_svm_supported(void)
+{
+ int cpu = smp_processor_id();
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ u64 vm_cr;
+
+ if (c->x86_vendor != X86_VENDOR_AMD &&
+ c->x86_vendor != X86_VENDOR_HYGON) {
+ pr_err("CPU %d isn't AMD or Hygon\n", cpu);
+ return false;
+ }
+
+ if (!cpu_has(c, X86_FEATURE_SVM)) {
+ pr_err("SVM not supported by CPU %d\n", cpu);
+ return false;
+ }
+
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
+ pr_info("KVM is unsupported when running as an SEV guest\n");
+ return false;
+ }
+
+ rdmsrl(MSR_VM_CR, vm_cr);
+ if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) {
+ pr_err("SVM disabled (by BIOS) in MSR_VM_CR on CPU %d\n", cpu);
+ return false;
+ }
+
+ return true;
+}
+
+static bool kvm_is_svm_supported(void)
+{
+ bool supported;
+
+ migrate_disable();
+ supported = __kvm_is_svm_supported();
+ migrate_enable();
+
+ return supported;
+}
+
+static int svm_check_processor_compat(void)
+{
+ if (!__kvm_is_svm_supported())
+ return -EIO;
+
+ return 0;
+}
+
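+/*
+ * Write the TSC ratio MSR, caching the value to elide redundant WRMSRs.
+ * Per the APM, the multiplier is an 8.32 fixed-point value, so the
+ * default 1:1 ratio is 1ULL << 32; e.g. running a guest at half the host
+ * TSC frequency would use a multiplier of 1ULL << 31.
+ */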
+static void __svm_write_tsc_multiplier(u64 multiplier)
+{
+ if (multiplier == __this_cpu_read(current_tsc_ratio))
+ return;
+
+ wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
+ __this_cpu_write(current_tsc_ratio, multiplier);
+}
+
+static inline void kvm_cpu_svm_disable(void)
+{
+ uint64_t efer;
+
+ wrmsrl(MSR_VM_HSAVE_PA, 0);
+ rdmsrl(MSR_EFER, efer);
+ if (efer & EFER_SVME) {
+ /*
+ * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and
+ * NMI aren't blocked.
+ */
+ stgi();
+ wrmsrl(MSR_EFER, efer & ~EFER_SVME);
+ }
+}
+
+static void svm_emergency_disable(void)
+{
+ kvm_rebooting = true;
+
+ kvm_cpu_svm_disable();
+}
+
+static void svm_hardware_disable(void)
+{
+ /* Make sure we clean up behind us */
+ if (tsc_scaling)
+ __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
+
+ kvm_cpu_svm_disable();
+
+ amd_pmu_disable_virt();
+}
+
+static int svm_hardware_enable(void)
+{
+ struct svm_cpu_data *sd;
+ uint64_t efer;
+ int me = raw_smp_processor_id();
+
+ rdmsrl(MSR_EFER, efer);
+ if (efer & EFER_SVME)
+ return -EBUSY;
+
+ sd = per_cpu_ptr(&svm_data, me);
+ sd->asid_generation = 1;
+ sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
+ sd->next_asid = sd->max_asid + 1;
+ sd->min_asid = max_sev_asid + 1;
+
+ wrmsrl(MSR_EFER, efer | EFER_SVME);
+
+ wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa);
+
+ if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ /*
+ * Set the default value, even if we don't use TSC scaling,
+ * to avoid leaving a stale value in the MSR.
+ */
+ __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
+ }
+
+ /*
+ * Get OSVW bits.
+ *
+ * Note that it is possible to have a system with mixed processor
+ * revisions and therefore different OSVW bits. If the bits are not the same
+ * on different processors, then choose the worst case (i.e. if an erratum
+ * is present on one processor and not on another, then assume that the
+ * erratum is present everywhere).
+ */
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
+ uint64_t len, status = 0;
+ int err;
+
+ len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
+ if (!err)
+ status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
+ &err);
+
+ if (err)
+ osvw_status = osvw_len = 0;
+ else {
+ if (len < osvw_len)
+ osvw_len = len;
+ osvw_status |= status;
+ osvw_status &= (1ULL << osvw_len) - 1;
+ }
+ } else
+ osvw_status = osvw_len = 0;
+
+ svm_init_erratum_383();
+
+ amd_pmu_enable_virt();
+
+ /*
+ * If TSC_AUX virtualization is supported, TSC_AUX becomes a swap type
+ * "B" field (see sev_es_prepare_switch_to_guest()) for SEV-ES guests.
+ * Since Linux does not change the value of TSC_AUX once set, prime the
+ * TSC_AUX field now to avoid a RDMSR on every vCPU run.
+ */
+ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) {
+ struct sev_es_save_area *hostsa;
+ u32 __maybe_unused msr_hi;
+
+ hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
+
+ rdmsr(MSR_TSC_AUX, hostsa->tsc_aux, msr_hi);
+ }
+
+ return 0;
+}
+
+static void svm_cpu_uninit(int cpu)
+{
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
+
+ if (!sd->save_area)
+ return;
+
+ kfree(sd->sev_vmcbs);
+ __free_page(sd->save_area);
+ sd->save_area_pa = 0;
+ sd->save_area = NULL;
+}
+
+static int svm_cpu_init(int cpu)
+{
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
+ int ret = -ENOMEM;
+
+ memset(sd, 0, sizeof(struct svm_cpu_data));
+ sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!sd->save_area)
+ return ret;
+
+ ret = sev_cpu_init(sd);
+ if (ret)
+ goto free_save_area;
+
+ sd->save_area_pa = __sme_page_pa(sd->save_area);
+ return 0;
+
+free_save_area:
+ __free_page(sd->save_area);
+ sd->save_area = NULL;
+ return ret;
+}
+
+static void set_dr_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
+
+ recalc_intercepts(svm);
+}
+
+static void clr_dr_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb->control.intercepts[INTERCEPT_DR] = 0;
+
+ recalc_intercepts(svm);
+}
+
+static int direct_access_msr_slot(u32 msr)
+{
+ u32 i;
+
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
+ if (direct_access_msrs[i].index == msr)
+ return i;
+
+ return -ENOENT;
+}
+
+static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
+ int write)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int slot = direct_access_msr_slot(msr);
+
+ if (slot == -ENOENT)
+ return;
+
+ /* Set the shadow bitmaps to the desired intercept states */
+ if (read)
+ set_bit(slot, svm->shadow_msr_intercept.read);
+ else
+ clear_bit(slot, svm->shadow_msr_intercept.read);
+
+ if (write)
+ set_bit(slot, svm->shadow_msr_intercept.write);
+ else
+ clear_bit(slot, svm->shadow_msr_intercept.write);
+}
+
+static bool valid_msr_intercept(u32 index)
+{
+ return direct_access_msr_slot(index) != -ENOENT;
+}
+
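+/*
+ * Bit layout within the permission bitmap: each u32 covers 16 MSRs, with
+ * bit 2 * (msr & 0xf) intercepting reads and bit 2 * (msr & 0xf) + 1
+ * intercepting writes. E.g. for MSR_EFER (0xc0000080), svm_msrpm_offset()
+ * returns u32 index 520 and the write-intercept bit is bit 1.
+ */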
+static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
+{
+ u8 bit_write;
+ unsigned long tmp;
+ u32 offset;
+ u32 *msrpm;
+
+ /*
+ * For the non-nested case:
+ * If the L01 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ *
+ * For the nested case:
+ * If the L02 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ */
+ msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm :
+ to_svm(vcpu)->msrpm;
+
+ offset = svm_msrpm_offset(msr);
+ bit_write = 2 * (msr & 0x0f) + 1;
+
+ BUG_ON(offset == MSR_INVALID);
+
+ tmp = msrpm[offset];
+
+ return test_bit(bit_write, &tmp);
+}
+
+static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
+ u32 msr, int read, int write)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u8 bit_read, bit_write;
+ unsigned long tmp;
+ u32 offset;
+
+ /*
+ * If this warning triggers, extend the direct_access_msrs list at the
+ * beginning of the file.
+ */
+ WARN_ON(!valid_msr_intercept(msr));
+
+ /* Enforce interception of MSRs that the MSR filter does not allow */
+ if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
+ read = 0;
+
+ if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
+ write = 0;
+
+ offset = svm_msrpm_offset(msr);
+ bit_read = 2 * (msr & 0x0f);
+ bit_write = 2 * (msr & 0x0f) + 1;
+
+ BUG_ON(offset == MSR_INVALID);
+
+ tmp = msrpm[offset];
+
+ read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
+ write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
+
+ msrpm[offset] = tmp;
+
+ svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
+ svm->nested.force_msr_bitmap_recalc = true;
+}
+
+void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
+ int read, int write)
+{
+ set_shadow_msr_intercept(vcpu, msr, read, write);
+ set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
+}
+
+u32 *svm_vcpu_alloc_msrpm(void)
+{
+ unsigned int order = get_order(MSRPM_SIZE);
+ struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
+ u32 *msrpm;
+
+ if (!pages)
+ return NULL;
+
+ msrpm = page_address(pages);
+ memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
+
+ return msrpm;
+}
+
+void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
+{
+ int i;
+
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+ if (!direct_access_msrs[i].always)
+ continue;
+ set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
+ }
+}
+
+void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
+{
+ int i;
+
+ if (intercept == svm->x2avic_msrs_intercepted)
+ return;
+
+ if (!x2avic_enabled)
+ return;
+
+ for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
+ int index = direct_access_msrs[i].index;
+
+ if ((index < APIC_BASE_MSR) ||
+ (index > APIC_BASE_MSR + 0xff))
+ continue;
+ set_msr_interception(&svm->vcpu, svm->msrpm, index,
+ !intercept, !intercept);
+ }
+
+ svm->x2avic_msrs_intercepted = intercept;
+}
+
+void svm_vcpu_free_msrpm(u32 *msrpm)
+{
+ __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
+}
+
+static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 i;
+
+ /*
+ * Set intercept permissions for all direct access MSRs again. They
+ * will automatically get filtered through the MSR filter, so we are
+ * back in sync after this.
+ */
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+ u32 msr = direct_access_msrs[i].index;
+ u32 read = test_bit(i, svm->shadow_msr_intercept.read);
+ u32 write = test_bit(i, svm->shadow_msr_intercept.write);
+
+ set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
+ }
+}
+
+static void add_msr_offset(u32 offset)
+{
+ int i;
+
+ for (i = 0; i < MSRPM_OFFSETS; ++i) {
+
+ /* Offset already in list? */
+ if (msrpm_offsets[i] == offset)
+ return;
+
+ /* Slot used by another offset? */
+ if (msrpm_offsets[i] != MSR_INVALID)
+ continue;
+
+ /* Add offset to list */
+ msrpm_offsets[i] = offset;
+
+ return;
+ }
+
+ /*
+ * If this BUG triggers, the msrpm_offsets table has overflowed. Just
+ * increase MSRPM_OFFSETS in this case.
+ */
+ BUG();
+}
+
+static void init_msrpm_offsets(void)
+{
+ int i;
+
+ memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
+
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+ u32 offset;
+
+ offset = svm_msrpm_offset(direct_access_msrs[i].index);
+ BUG_ON(offset == MSR_INVALID);
+
+ add_msr_offset(offset);
+ }
+}
+
+void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
+{
+ to_vmcb->save.dbgctl = from_vmcb->save.dbgctl;
+ to_vmcb->save.br_from = from_vmcb->save.br_from;
+ to_vmcb->save.br_to = from_vmcb->save.br_to;
+ to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from;
+ to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to;
+
+ vmcb_mark_dirty(to_vmcb, VMCB_LBR);
+}
+
+static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+
+ /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
+ if (is_guest_mode(vcpu))
+ svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
+}
+
+static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
+
+ /*
+ * Move the LBR msrs back to the vmcb01 to avoid copying them
+ * on nested guest entries.
+ */
+ if (is_guest_mode(vcpu))
+ svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
+}
+
+static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm)
+{
+ /*
+ * If LBR virtualization is disabled, the LBR MSRs are always kept in
+ * vmcb01. If LBR virtualization is enabled and L1 is running VMs of
+ * its own, the MSRs are moved between vmcb01 and vmcb02 as needed.
+ */
+ return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb :
+ svm->vmcb01.ptr;
+}
+
+void svm_update_lbrv(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
+ bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) ||
+ (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
+
+ if (enable_lbrv == current_enable_lbrv)
+ return;
+
+ if (enable_lbrv)
+ svm_enable_lbrv(vcpu);
+ else
+ svm_disable_lbrv(vcpu);
+}
+
+void disable_nmi_singlestep(struct vcpu_svm *svm)
+{
+ svm->nmi_singlestep = false;
+
+ if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
+ /* Clear our flags if they were not set by the guest */
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
+ svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
+ svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
+ }
+}
+
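+/*
+ * Grow the PLE window on a PAUSE exit so that a vCPU that legitimately
+ * spins is not continually interrupted. Assuming the usual defaults
+ * (count 3000, grow factor 2), the per-vCPU window doubles on each such
+ * exit until it reaches pause_filter_count_max.
+ */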
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ int old = control->pause_filter_count;
+
+ if (kvm_pause_in_guest(vcpu->kvm))
+ return;
+
+ control->pause_filter_count = __grow_ple_window(old,
+ pause_filter_count,
+ pause_filter_count_grow,
+ pause_filter_count_max);
+
+ if (control->pause_filter_count != old) {
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+ }
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ int old = control->pause_filter_count;
+
+ if (kvm_pause_in_guest(vcpu->kvm))
+ return;
+
+ control->pause_filter_count =
+ __shrink_ple_window(old,
+ pause_filter_count,
+ pause_filter_count_shrink,
+ pause_filter_count);
+ if (control->pause_filter_count != old) {
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+ }
+}
+
+static void svm_hardware_unsetup(void)
+{
+ int cpu;
+
+ sev_hardware_unsetup();
+
+ for_each_possible_cpu(cpu)
+ svm_cpu_uninit(cpu);
+
+ __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
+ get_order(IOPM_SIZE));
+ iopm_base = 0;
+}
+
+static void init_seg(struct vmcb_seg *seg)
+{
+ seg->selector = 0;
+ seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
+ SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
+ seg->limit = 0xffff;
+ seg->base = 0;
+}
+
+static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
+{
+ seg->selector = 0;
+ seg->attrib = SVM_SELECTOR_P_MASK | type;
+ seg->limit = 0xffff;
+ seg->base = 0;
+}
+
+static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return svm->nested.ctl.tsc_offset;
+}
+
+static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return svm->tsc_ratio_msr;
+}
+
+static void svm_write_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
+ svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+}
+
+void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+ preempt_disable();
+ if (to_svm(vcpu)->guest_state_loaded)
+ __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
+ preempt_enable();
+}
+
+/* Evaluate instruction intercepts that depend on guest CPUID features. */
+static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
+ struct vcpu_svm *svm)
+{
+ /*
+ * Intercept INVPCID if shadow paging is enabled to sync/free shadow
+ * roots, or if INVPCID is disabled in the guest to inject #UD.
+ */
+ if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
+ if (!npt_enabled ||
+ !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
+ svm_set_intercept(svm, INTERCEPT_INVPCID);
+ else
+ svm_clr_intercept(svm, INTERCEPT_INVPCID);
+ }
+
+ if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
+ if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
+ svm_clr_intercept(svm, INTERCEPT_RDTSCP);
+ else
+ svm_set_intercept(svm, INTERCEPT_RDTSCP);
+ }
+}
+
+static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (guest_cpuid_is_intel(vcpu)) {
+ /*
+ * We must intercept SYSENTER_EIP and SYSENTER_ESP
+ * accesses because the processor only stores 32 bits.
+ * For the same reason we cannot use virtual VMLOAD/VMSAVE.
+ */
+ svm_set_intercept(svm, INTERCEPT_VMLOAD);
+ svm_set_intercept(svm, INTERCEPT_VMSAVE);
+ svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
+ } else {
+ /*
+ * If hardware supports Virtual VMLOAD VMSAVE then enable it
+ * in the VMCB and clear intercepts to avoid #VMEXITs.
+ */
+ if (vls) {
+ svm_clr_intercept(svm, INTERCEPT_VMLOAD);
+ svm_clr_intercept(svm, INTERCEPT_VMSAVE);
+ svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+ }
+ /* No need to intercept these MSRs */
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
+ }
+}
+
+static void init_vmcb(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+ struct vmcb_control_area *control = &vmcb->control;
+ struct vmcb_save_area *save = &vmcb->save;
+
+ svm_set_intercept(svm, INTERCEPT_CR0_READ);
+ svm_set_intercept(svm, INTERCEPT_CR3_READ);
+ svm_set_intercept(svm, INTERCEPT_CR4_READ);
+ svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
+ svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
+ svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
+ if (!kvm_vcpu_apicv_active(vcpu))
+ svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
+
+ set_dr_intercepts(svm);
+
+ set_exception_intercept(svm, PF_VECTOR);
+ set_exception_intercept(svm, UD_VECTOR);
+ set_exception_intercept(svm, MC_VECTOR);
+ set_exception_intercept(svm, AC_VECTOR);
+ set_exception_intercept(svm, DB_VECTOR);
+ /*
+ * Guest access to VMware backdoor ports could legitimately
+ * trigger #GP because of the TSS I/O permission bitmap.
+ * We intercept those #GPs and allow access anyway,
+ * as VMware does.
+ */
+ if (enable_vmware_backdoor)
+ set_exception_intercept(svm, GP_VECTOR);
+
+ svm_set_intercept(svm, INTERCEPT_INTR);
+ svm_set_intercept(svm, INTERCEPT_NMI);
+
+ if (intercept_smi)
+ svm_set_intercept(svm, INTERCEPT_SMI);
+
+ svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+ svm_set_intercept(svm, INTERCEPT_RDPMC);
+ svm_set_intercept(svm, INTERCEPT_CPUID);
+ svm_set_intercept(svm, INTERCEPT_INVD);
+ svm_set_intercept(svm, INTERCEPT_INVLPG);
+ svm_set_intercept(svm, INTERCEPT_INVLPGA);
+ svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
+ svm_set_intercept(svm, INTERCEPT_MSR_PROT);
+ svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
+ svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
+ svm_set_intercept(svm, INTERCEPT_VMRUN);
+ svm_set_intercept(svm, INTERCEPT_VMMCALL);
+ svm_set_intercept(svm, INTERCEPT_VMLOAD);
+ svm_set_intercept(svm, INTERCEPT_VMSAVE);
+ svm_set_intercept(svm, INTERCEPT_STGI);
+ svm_set_intercept(svm, INTERCEPT_CLGI);
+ svm_set_intercept(svm, INTERCEPT_SKINIT);
+ svm_set_intercept(svm, INTERCEPT_WBINVD);
+ svm_set_intercept(svm, INTERCEPT_XSETBV);
+ svm_set_intercept(svm, INTERCEPT_RDPRU);
+ svm_set_intercept(svm, INTERCEPT_RSM);
+
+ if (!kvm_mwait_in_guest(vcpu->kvm)) {
+ svm_set_intercept(svm, INTERCEPT_MONITOR);
+ svm_set_intercept(svm, INTERCEPT_MWAIT);
+ }
+
+ if (!kvm_hlt_in_guest(vcpu->kvm))
+ svm_set_intercept(svm, INTERCEPT_HLT);
+
+ control->iopm_base_pa = __sme_set(iopm_base);
+ control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
+ control->int_ctl = V_INTR_MASKING_MASK;
+
+ init_seg(&save->es);
+ init_seg(&save->ss);
+ init_seg(&save->ds);
+ init_seg(&save->fs);
+ init_seg(&save->gs);
+
+ save->cs.selector = 0xf000;
+ save->cs.base = 0xffff0000;
+ /* Executable/Readable Code Segment */
+ save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
+ SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
+ save->cs.limit = 0xffff;
+
+ save->gdtr.base = 0;
+ save->gdtr.limit = 0xffff;
+ save->idtr.base = 0;
+ save->idtr.limit = 0xffff;
+
+ init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
+ init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
+
+ if (npt_enabled) {
+ /* Setup VMCB for Nested Paging */
+ control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
+ svm_clr_intercept(svm, INTERCEPT_INVLPG);
+ clr_exception_intercept(svm, PF_VECTOR);
+ svm_clr_intercept(svm, INTERCEPT_CR3_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
+ save->g_pat = vcpu->arch.pat;
+ save->cr3 = 0;
+ }
+ svm->current_vmcb->asid_generation = 0;
+ svm->asid = 0;
+
+ svm->nested.vmcb12_gpa = INVALID_GPA;
+ svm->nested.last_vmcb12_gpa = INVALID_GPA;
+
+ if (!kvm_pause_in_guest(vcpu->kvm)) {
+ control->pause_filter_count = pause_filter_count;
+ if (pause_filter_thresh)
+ control->pause_filter_thresh = pause_filter_thresh;
+ svm_set_intercept(svm, INTERCEPT_PAUSE);
+ } else {
+ svm_clr_intercept(svm, INTERCEPT_PAUSE);
+ }
+
+ svm_recalc_instruction_intercepts(vcpu, svm);
+
+ /*
+ * If the host supports V_SPEC_CTRL then disable the interception
+ * of MSR_IA32_SPEC_CTRL.
+ */
+ if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_init_vmcb(svm, vmcb);
+
+ if (vnmi)
+ svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK;
+
+ if (vgif) {
+ svm_clr_intercept(svm, INTERCEPT_STGI);
+ svm_clr_intercept(svm, INTERCEPT_CLGI);
+ svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
+ }
+
+ if (sev_guest(vcpu->kvm))
+ sev_init_vmcb(svm);
+
+ svm_hv_init_vmcb(vmcb);
+ init_vmcb_after_set_cpuid(vcpu);
+
+ vmcb_mark_all_dirty(vmcb);
+
+ enable_gif(svm);
+}
+
+static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm_vcpu_init_msrpm(vcpu, svm->msrpm);
+
+ svm_init_osvw(vcpu);
+ vcpu->arch.microcode_version = 0x01000065;
+ svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;
+
+ svm->nmi_masked = false;
+ svm->awaiting_iret_completion = false;
+
+ if (sev_es_guest(vcpu->kvm))
+ sev_es_vcpu_reset(svm);
+}
+
+static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->spec_ctrl = 0;
+ svm->virt_spec_ctrl = 0;
+
+ init_vmcb(vcpu);
+
+ if (!init_event)
+ __svm_vcpu_reset(vcpu);
+}
+
+void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
+{
+ svm->current_vmcb = target_vmcb;
+ svm->vmcb = target_vmcb->ptr;
+}
+
+static int svm_vcpu_create(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm;
+ struct page *vmcb01_page;
+ struct page *vmsa_page = NULL;
+ int err;
+
+ BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
+ svm = to_svm(vcpu);
+
+ err = -ENOMEM;
+ vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!vmcb01_page)
+ goto out;
+
+ if (sev_es_guest(vcpu->kvm)) {
+ /*
+ * SEV-ES guests require a separate VMSA page used to contain
+ * the encrypted register state of the guest.
+ */
+ vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!vmsa_page)
+ goto error_free_vmcb_page;
+
+ /*
+ * SEV-ES guests maintain an encrypted version of their FPU
+ * state which is restored and saved on VMRUN and VMEXIT.
+ * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
+ * do xsave/xrstor on it.
+ */
+ fpstate_set_confidential(&vcpu->arch.guest_fpu);
+ }
+
+ err = avic_init_vcpu(svm);
+ if (err)
+ goto error_free_vmsa_page;
+
+ svm->msrpm = svm_vcpu_alloc_msrpm();
+ if (!svm->msrpm) {
+ err = -ENOMEM;
+ goto error_free_vmsa_page;
+ }
+
+ svm->x2avic_msrs_intercepted = true;
+
+ svm->vmcb01.ptr = page_address(vmcb01_page);
+ svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
+ svm_switch_vmcb(svm, &svm->vmcb01);
+
+ if (vmsa_page)
+ svm->sev_es.vmsa = page_address(vmsa_page);
+
+ svm->guest_state_loaded = false;
+
+ return 0;
+
+error_free_vmsa_page:
+ if (vmsa_page)
+ __free_page(vmsa_page);
+error_free_vmcb_page:
+ __free_page(vmcb01_page);
+out:
+ return err;
+}
+
+static void svm_clear_current_vmcb(struct vmcb *vmcb)
+{
+ int i;
+
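+ /*
+ * cmpxchg() clears the per-CPU cached pointer only where it still
+ * refers to this vmcb, leaving other vCPUs' cached entries intact.
+ */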
+ for_each_online_cpu(i)
+ cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL);
+}
+
+static void svm_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * The vmcb page can be recycled, causing a false negative in
+ * svm_vcpu_load(). So, ensure that no logical CPU has this
+ * vmcb page recorded as its current vmcb.
+ */
+ svm_clear_current_vmcb(svm->vmcb);
+
+ svm_leave_nested(vcpu);
+ svm_free_nested(svm);
+
+ sev_free_vcpu(vcpu);
+
+ __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
+ __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
+}
+
+static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
+
+ if (sev_es_guest(vcpu->kvm))
+ sev_es_unmap_ghcb(svm);
+
+ if (svm->guest_state_loaded)
+ return;
+
+ /*
+ * Save additional host state that will be restored on VMEXIT (sev-es)
+ * or subsequent vmload of host save area.
+ */
+ vmsave(sd->save_area_pa);
+ if (sev_es_guest(vcpu->kvm)) {
+ struct sev_es_save_area *hostsa;
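+ /*
+ * Per the APM, the SEV-ES host save state begins at offset 0x400
+ * of the host save area page.
+ */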
+ hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
+
+ sev_es_prepare_switch_to_guest(hostsa);
+ }
+
+ if (tsc_scaling)
+ __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
+
+ /*
+ * TSC_AUX is always virtualized for SEV-ES guests when the feature is
+ * available. The user return MSR support is not required in this case
+ * because TSC_AUX is restored on #VMEXIT from the host save area
+ * (which has been initialized in svm_hardware_enable()).
+ */
+ if (likely(tsc_aux_uret_slot >= 0) &&
+ (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm)))
+ kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
+
+ svm->guest_state_loaded = true;
+}
+
+static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
+{
+ to_svm(vcpu)->guest_state_loaded = false;
+}
+
+static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
+
+ if (sd->current_vmcb != svm->vmcb) {
+ sd->current_vmcb = svm->vmcb;
+
+ if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT))
+ indirect_branch_prediction_barrier();
+ }
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_vcpu_load(vcpu, cpu);
+}
+
+static void svm_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_vcpu_put(vcpu);
+
+ svm_prepare_host_switch(vcpu);
+
+ ++vcpu->stat.host_state_reload;
+}
+
+static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long rflags = svm->vmcb->save.rflags;
+
+ if (svm->nmi_singlestep) {
+ /* Hide our flags if they were not set by the guest */
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
+ rflags &= ~X86_EFLAGS_TF;
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
+ rflags &= ~X86_EFLAGS_RF;
+ }
+ return rflags;
+}
+
+static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ if (to_svm(vcpu)->nmi_singlestep)
+ rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+
+ /*
+ * Any change of EFLAGS.VM is accompanied by a reload of SS
+ * (caused by either a task switch or an inter-privilege IRET),
+ * so we do not need to update the CPL here.
+ */
+ to_svm(vcpu)->vmcb->save.rflags = rflags;
+}
+
+static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
+{
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
+
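+ /*
+ * RFLAGS of an SEV-ES guest is encrypted and not accessible;
+ * hardware instead reports the guest's interruptibility in
+ * int_state.
+ */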
+ return sev_es_guest(vcpu->kvm)
+ ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
+ : kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
+}
+
+static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+{
+ kvm_register_mark_available(vcpu, reg);
+
+ switch (reg) {
+ case VCPU_EXREG_PDPTR:
+ /*
+ * When !npt_enabled, mmu->pdptrs[] is already up-to-date because,
+ * per the SDM, the PDPTEs are reloaded whenever the relevant
+ * control registers are written.
+ */
+ if (npt_enabled)
+ load_pdptrs(vcpu, kvm_read_cr3(vcpu));
+ break;
+ default:
+ KVM_BUG_ON(1, vcpu->kvm);
+ }
+}
+
+static void svm_set_vintr(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control;
+
+ /* The following fields are ignored when AVIC is enabled. */
+ WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
+
+ svm_set_intercept(svm, INTERCEPT_VINTR);
+
+ /*
+ * Recalculating intercepts may have cleared the VINTR intercept. If
+ * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF
+ * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN.
+ * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as
+ * interrupts will never be unblocked while L2 is running.
+ */
+ if (!svm_is_intercept(svm, INTERCEPT_VINTR))
+ return;
+
+ /*
+ * This is just a dummy VINTR whose only purpose is to cause a vmexit.
+ * Actual injection of virtual interrupts happens through EVENTINJ.
+ */
+ control = &svm->vmcb->control;
+ control->int_vector = 0x0;
+ control->int_ctl &= ~V_INTR_PRIO_MASK;
+ control->int_ctl |= V_IRQ_MASK |
+ ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
+}
+
+static void svm_clear_vintr(struct vcpu_svm *svm)
+{
+ svm_clr_intercept(svm, INTERCEPT_VINTR);
+
+ /* Drop int_ctl fields related to VINTR injection. */
+ svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
+ if (is_guest_mode(&svm->vcpu)) {
+ svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
+
+ WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
+ (svm->nested.ctl.int_ctl & V_TPR_MASK));
+
+ svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
+ V_IRQ_INJECTION_BITS_MASK;
+
+ svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
+ }
+
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
+}
+
+static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
+{
+ struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+ struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
+
+ switch (seg) {
+ case VCPU_SREG_CS: return &save->cs;
+ case VCPU_SREG_DS: return &save->ds;
+ case VCPU_SREG_ES: return &save->es;
+ case VCPU_SREG_FS: return &save01->fs;
+ case VCPU_SREG_GS: return &save01->gs;
+ case VCPU_SREG_SS: return &save->ss;
+ case VCPU_SREG_TR: return &save01->tr;
+ case VCPU_SREG_LDTR: return &save01->ldtr;
+ }
+ BUG();
+ return NULL;
+}
+
+static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+ return s->base;
+}
+
+static void svm_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+ var->base = s->base;
+ var->limit = s->limit;
+ var->selector = s->selector;
+ var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
+ var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
+ var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
+ var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
+ var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
+ var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
+ var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
+
+ /*
+ * AMD CPUs circa 2014 track the G bit for all segments except CS.
+ * However, the SVM spec states that the G bit is not observed by the
+ * CPU, and some VMware virtual CPUs drop the G bit for all segments.
+ * So let's synthesize a legal G bit for all segments; this helps
+ * when running KVM nested. It also helps cross-vendor migration,
+ * because Intel's VM-entry has a check on the 'G' bit.
+ */
+ var->g = s->limit > 0xfffff;
+
+ /*
+ * AMD's VMCB does not have an explicit unusable field, so emulate it
+ * for cross-vendor migration purposes by treating "not present" as
+ * unusable.
+ */
+ var->unusable = !var->present;
+
+ switch (seg) {
+ case VCPU_SREG_TR:
+ /*
+ * Work around a bug where the busy flag in the tr selector
+ * isn't exposed
+ */
+ var->type |= 0x2;
+ break;
+ case VCPU_SREG_DS:
+ case VCPU_SREG_ES:
+ case VCPU_SREG_FS:
+ case VCPU_SREG_GS:
+ /*
+ * The accessed bit must always be set in the segment
+ * descriptor cache: although it can be cleared in the
+ * descriptor itself, the cached bit always remains 1.
+ * Since Intel has a check on this, set it here to support
+ * cross-vendor migration.
+ */
+ if (!var->unusable)
+ var->type |= 0x1;
+ break;
+ case VCPU_SREG_SS:
+ /*
+ * On AMD CPUs sometimes the DB bit in the segment
+ * descriptor is left as 1, although the whole segment has
+ * been made unusable. Clear it here to pass an Intel VMX
+ * entry check when cross vendor migrating.
+ */
+ if (var->unusable)
+ var->db = 0;
+ /* This is symmetric with svm_set_segment() */
+ var->dpl = to_svm(vcpu)->vmcb->save.cpl;
+ break;
+ }
+}
+
+static int svm_get_cpl(struct kvm_vcpu *vcpu)
+{
+ struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+
+ return save->cpl;
+}
+
+static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+ struct kvm_segment cs;
+
+ svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ *db = cs.db;
+ *l = cs.l;
+}
+
+static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ dt->size = svm->vmcb->save.idtr.limit;
+ dt->address = svm->vmcb->save.idtr.base;
+}
+
+static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.idtr.limit = dt->size;
+ svm->vmcb->save.idtr.base = dt->address;
+ vmcb_mark_dirty(svm->vmcb, VMCB_DT);
+}
+
+static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ dt->size = svm->vmcb->save.gdtr.limit;
+ dt->address = svm->vmcb->save.gdtr.base;
+}
+
+static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.gdtr.limit = dt->size;
+ svm->vmcb->save.gdtr.base = dt->address;
+ vmcb_mark_dirty(svm->vmcb, VMCB_DT);
+}
+
+static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * For guests that don't set guest_state_protected, the cr3 update is
+ * handled via kvm_mmu_load() while entering the guest. For guests
+ * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
+ * VMCB save area now, since the save area will become the initial
+ * contents of the VMSA, and future VMCB save area updates won't be
+ * seen.
+ */
+ if (sev_es_guest(vcpu->kvm)) {
+ svm->vmcb->save.cr3 = cr3;
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
+ }
+}
+
+static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ return true;
+}
+
+void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 hcr0 = cr0;
+ bool old_paging = is_paging(vcpu);
+
+#ifdef CONFIG_X86_64
+ if (vcpu->arch.efer & EFER_LME) {
+ if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
+ vcpu->arch.efer |= EFER_LMA;
+ if (!vcpu->arch.guest_state_protected)
+ svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
+ }
+
+ if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
+ vcpu->arch.efer &= ~EFER_LMA;
+ if (!vcpu->arch.guest_state_protected)
+ svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
+ }
+ }
+#endif
+ vcpu->arch.cr0 = cr0;
+
+ if (!npt_enabled) {
+ hcr0 |= X86_CR0_PG | X86_CR0_WP;
+ if (old_paging != is_paging(vcpu))
+ svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
+ }
+
+ /*
+ * Re-enable caching here because the QEMU BIOS
+ * does not do it - otherwise there is some delay at
+ * reboot.
+ */
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
+ hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+
+ svm->vmcb->save.cr0 = hcr0;
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
+
+ /*
+ * SEV-ES guests must always keep the CR intercepts cleared. CR
+ * tracking is done using the CR write traps.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return;
+
+ if (hcr0 == cr0) {
+ /* Selective CR0 write remains on. */
+ svm_clr_intercept(svm, INTERCEPT_CR0_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
+ } else {
+ svm_set_intercept(svm, INTERCEPT_CR0_READ);
+ svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
+ }
+}
+
+static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ return true;
+}
+
+void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
+ unsigned long old_cr4 = vcpu->arch.cr4;
+
+ if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
+ svm_flush_tlb_current(vcpu);
+
+ vcpu->arch.cr4 = cr4;
+ if (!npt_enabled) {
+ cr4 |= X86_CR4_PAE;
+
+ if (!is_paging(vcpu))
+ cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
+ }
+ cr4 |= host_cr4_mce;
+ to_svm(vcpu)->vmcb->save.cr4 = cr4;
+ vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
+
+ if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
+ kvm_update_cpuid_runtime(vcpu);
+}
+
+static void svm_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+ s->base = var->base;
+ s->limit = var->limit;
+ s->selector = var->selector;
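+ /* Pack the attribute bits into the VMCB's compressed 12-bit attrib format. */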
+ s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
+ s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
+ s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
+ s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
+ s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
+ s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
+ s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
+ s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
+
+ /*
+ * This is always accurate, except if SYSRET returned to a segment
+ * with SS.DPL != 3. Intel does not have this quirk, and always
+ * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
+ * would entail passing the CPL to userspace and back.
+ */
+ if (seg == VCPU_SREG_SS)
+ /* This is symmetric with svm_get_segment() */
+ svm->vmcb->save.cpl = (var->dpl & 3);
+
+ vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
+}
+
+static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ clr_exception_intercept(svm, BP_VECTOR);
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+ set_exception_intercept(svm, BP_VECTOR);
+ }
+}
+
+static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
+{
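+ /*
+ * ASIDs are handed out per physical CPU. When the pool is exhausted,
+ * bump the generation (invalidating all ASIDs previously handed out
+ * on this CPU), restart at min_asid and flush all guest TLB entries.
+ */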
+ if (sd->next_asid > sd->max_asid) {
+ ++sd->asid_generation;
+ sd->next_asid = sd->min_asid;
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
+ vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
+ }
+
+ svm->current_vmcb->asid_generation = sd->asid_generation;
+ svm->asid = sd->next_asid++;
+}
+
+static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
+{
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (svm->vcpu.arch.guest_state_protected)
+ return;
+
+ if (unlikely(value != vmcb->save.dr6)) {
+ vmcb->save.dr6 = value;
+ vmcb_mark_dirty(vmcb, VMCB_DR);
+ }
+}
+
+static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm)))
+ return;
+
+ get_debugreg(vcpu->arch.db[0], 0);
+ get_debugreg(vcpu->arch.db[1], 1);
+ get_debugreg(vcpu->arch.db[2], 2);
+ get_debugreg(vcpu->arch.db[3], 3);
+ /*
+ * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
+ * because db_interception might need it. We can do it before vmentry.
+ */
+ vcpu->arch.dr6 = svm->vmcb->save.dr6;
+ vcpu->arch.dr7 = svm->vmcb->save.dr7;
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
+ set_dr_intercepts(svm);
+}
+
+static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (vcpu->arch.guest_state_protected)
+ return;
+
+ svm->vmcb->save.dr7 = value;
+ vmcb_mark_dirty(svm->vmcb, VMCB_DR);
+}
+
+static int pf_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ u64 fault_address = svm->vmcb->control.exit_info_2;
+ u64 error_code = svm->vmcb->control.exit_info_1;
+
+ return kvm_handle_page_fault(vcpu, error_code, fault_address,
+ static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
+ svm->vmcb->control.insn_bytes : NULL,
+ svm->vmcb->control.insn_len);
+}
+
+static int npf_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ u64 fault_address = svm->vmcb->control.exit_info_2;
+ u64 error_code = svm->vmcb->control.exit_info_1;
+
+ trace_kvm_page_fault(vcpu, fault_address, error_code);
+ return kvm_mmu_page_fault(vcpu, fault_address, error_code,
+ static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
+ svm->vmcb->control.insn_bytes : NULL,
+ svm->vmcb->control.insn_len);
+}
+
+static int db_interception(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!(vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
+ !svm->nmi_singlestep) {
+ u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
+ kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
+ return 1;
+ }
+
+ if (svm->nmi_singlestep) {
+ disable_nmi_singlestep(svm);
+ /* Make sure we check for pending NMIs upon entry */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
+
+ if (vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
+ kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
+ kvm_run->debug.arch.pc =
+ svm->vmcb->save.cs.base + svm->vmcb->save.rip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int bp_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_run *kvm_run = vcpu->run;
+
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
+ kvm_run->debug.arch.exception = BP_VECTOR;
+ return 0;
+}
+
+static int ud_interception(struct kvm_vcpu *vcpu)
+{
+ return handle_ud(vcpu);
+}
+
+static int ac_interception(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
+ return 1;
+}
+
+static bool is_erratum_383(void)
+{
+ int err, i;
+ u64 value;
+
+ if (!erratum_383_found)
+ return false;
+
+ value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
+ if (err)
+ return false;
+
+ /* Bit 62 may or may not be set for this MCE */
+ value &= ~(1ULL << 62);
+
+ if (value != 0xb600000000010015ULL)
+ return false;
+
+ /* Clear MCi_STATUS registers */
+ for (i = 0; i < 6; ++i)
+ native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
+
+ value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
+ if (!err) {
+ u32 low, high;
+
+ value &= ~(1ULL << 2);
+ low = lower_32_bits(value);
+ high = upper_32_bits(value);
+
+ native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
+ }
+
+ /* Flush tlb to evict multi-match entries */
+ __flush_tlb_all();
+
+ return true;
+}
+
+static void svm_handle_mce(struct kvm_vcpu *vcpu)
+{
+ if (is_erratum_383()) {
+ /*
+ * Erratum 383 triggered. Guest state is corrupt so kill the
+ * guest.
+ */
+ pr_err("Guest triggered AMD Erratum 383\n");
+
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+
+ return;
+ }
+
+ /*
+ * On an #MC intercept the MCE handler is not called automatically in
+ * the host. So do it by hand here.
+ */
+ kvm_machine_check();
+}
+
+static int mc_interception(struct kvm_vcpu *vcpu)
+{
+ return 1;
+}
+
+static int shutdown_interception(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * The VM save area has already been encrypted so it
+ * cannot be reinitialized - just terminate.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return -EINVAL;
+
+ /*
+ * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put
+ * the VMCB in a known good state. Unfortunately, KVM doesn't have
+ * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
+ * userspace. From a platform perspective, INIT is acceptable behavior as
+ * there exist bare metal platforms that automatically INIT the CPU
+ * in response to shutdown.
+ */
+ clear_page(svm->vmcb);
+ kvm_vcpu_reset(vcpu, true);
+
+ kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+ return 0;
+}
+
+static int io_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
+ int size, in, string;
+ unsigned port;
+
+ ++vcpu->stat.io_exits;
+ string = (io_info & SVM_IOIO_STR_MASK) != 0;
+ in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
+ port = io_info >> 16;
+ size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
+
+ if (string) {
+ if (sev_es_guest(vcpu->kvm))
+ return sev_es_string_io(svm, size, port, in);
+ else
+ return kvm_emulate_instruction(vcpu, 0);
+ }
+
+ svm->next_rip = svm->vmcb->control.exit_info_2;
+
+ return kvm_fast_pio(vcpu, size, port, in);
+}
+
+static int nmi_interception(struct kvm_vcpu *vcpu)
+{
+ return 1;
+}
+
+static int smi_interception(struct kvm_vcpu *vcpu)
+{
+ return 1;
+}
+
+static int intr_interception(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.irq_exits;
+ return 1;
+}
+
+static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb12;
+ struct kvm_host_map map;
+ int ret;
+
+ if (nested_svm_check_permissions(vcpu))
+ return 1;
+
+ ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
+ if (ret) {
+ if (ret == -EINVAL)
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ vmcb12 = map.hva;
+
+ ret = kvm_skip_emulated_instruction(vcpu);
+
+ if (vmload) {
+ svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
+ svm->sysenter_eip_hi = 0;
+ svm->sysenter_esp_hi = 0;
+ } else {
+ svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
+ }
+
+ kvm_vcpu_unmap(vcpu, &map, true);
+
+ return ret;
+}
+
+static int vmload_interception(struct kvm_vcpu *vcpu)
+{
+ return vmload_vmsave_interception(vcpu, true);
+}
+
+static int vmsave_interception(struct kvm_vcpu *vcpu)
+{
+ return vmload_vmsave_interception(vcpu, false);
+}
+
+static int vmrun_interception(struct kvm_vcpu *vcpu)
+{
+ if (nested_svm_check_permissions(vcpu))
+ return 1;
+
+ return nested_svm_vmrun(vcpu);
+}
+
+enum {
+ NONE_SVM_INSTR,
+ SVM_INSTR_VMRUN,
+ SVM_INSTR_VMLOAD,
+ SVM_INSTR_VMSAVE,
+};
+
+/* Return NONE_SVM_INSTR if not an SVM instruction, otherwise the decoded opcode */
+static int svm_instr_opcode(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+
+ if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
+ return NONE_SVM_INSTR;
+
+ switch (ctxt->modrm) {
+ case 0xd8: /* VMRUN */
+ return SVM_INSTR_VMRUN;
+ case 0xda: /* VMLOAD */
+ return SVM_INSTR_VMLOAD;
+ case 0xdb: /* VMSAVE */
+ return SVM_INSTR_VMSAVE;
+ default:
+ break;
+ }
+
+ return NONE_SVM_INSTR;
+}
+
+static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
+{
+ const int guest_mode_exit_codes[] = {
+ [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
+ [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
+ [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
+ };
+ int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
+ [SVM_INSTR_VMRUN] = vmrun_interception,
+ [SVM_INSTR_VMLOAD] = vmload_interception,
+ [SVM_INSTR_VMSAVE] = vmsave_interception,
+ };
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ if (is_guest_mode(vcpu)) {
+ /* Returns '1' or -errno on failure, '0' on success. */
+ ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
+ if (ret)
+ return ret;
+ return 1;
+ }
+ return svm_instr_handlers[opcode](vcpu);
+}
+
+/*
+ * #GP handling code. Note that #GP can be triggered under the following two
+ * cases:
+ * 1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
+ *    some AMD CPUs when the EAX operand of these instructions points into
+ *    a reserved memory region (e.g. SMM memory on the host).
+ * 2) VMware backdoor
+ */
+static int gp_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 error_code = svm->vmcb->control.exit_info_1;
+ int opcode;
+
+ /* Both #GP cases have zero error_code */
+ if (error_code)
+ goto reinject;
+
+ /* Decode the instruction for usage later */
+ if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
+ goto reinject;
+
+ opcode = svm_instr_opcode(vcpu);
+
+ if (opcode == NONE_SVM_INSTR) {
+ if (!enable_vmware_backdoor)
+ goto reinject;
+
+ /*
+ * VMware backdoor emulation on #GP interception only handles
+ * IN{S}, OUT{S}, and RDPMC.
+ */
+ if (!is_guest_mode(vcpu))
+ return kvm_emulate_instruction(vcpu,
+ EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
+ } else {
+ /* All SVM instructions expect page aligned RAX */
+ if (svm->vmcb->save.rax & ~PAGE_MASK)
+ goto reinject;
+
+ return emulate_svm_instr(vcpu, opcode);
+ }
+
+reinject:
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+ return 1;
+}
+
+void svm_set_gif(struct vcpu_svm *svm, bool value)
+{
+ if (value) {
+ /*
+ * If VGIF is enabled, the STGI intercept is only added to
+ * detect the opening of the SMI/NMI window; remove it now.
+ * Likewise, clear the VINTR intercept, we will set it
+ * again while processing KVM_REQ_EVENT if needed.
+ */
+ if (vgif)
+ svm_clr_intercept(svm, INTERCEPT_STGI);
+ if (svm_is_intercept(svm, INTERCEPT_VINTR))
+ svm_clear_vintr(svm);
+
+ enable_gif(svm);
+ if (svm->vcpu.arch.smi_pending ||
+ svm->vcpu.arch.nmi_pending ||
+ kvm_cpu_has_injectable_intr(&svm->vcpu) ||
+ kvm_apic_has_pending_init_or_sipi(&svm->vcpu))
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ } else {
+ disable_gif(svm);
+
+ /*
+ * After a CLGI no interrupts should come. But if vGIF is
+ * in use, we still rely on the VINTR intercept (rather than
+ * STGI) to detect an open interrupt window.
+ */
+ if (!vgif)
+ svm_clear_vintr(svm);
+ }
+}
+
+static int stgi_interception(struct kvm_vcpu *vcpu)
+{
+ int ret;
+
+ if (nested_svm_check_permissions(vcpu))
+ return 1;
+
+ ret = kvm_skip_emulated_instruction(vcpu);
+ svm_set_gif(to_svm(vcpu), true);
+ return ret;
+}
+
+static int clgi_interception(struct kvm_vcpu *vcpu)
+{
+ int ret;
+
+ if (nested_svm_check_permissions(vcpu))
+ return 1;
+
+ ret = kvm_skip_emulated_instruction(vcpu);
+ svm_set_gif(to_svm(vcpu), false);
+ return ret;
+}
+
+static int invlpga_interception(struct kvm_vcpu *vcpu)
+{
+ gva_t gva = kvm_rax_read(vcpu);
+ u32 asid = kvm_rcx_read(vcpu);
+
+ /* FIXME: Handle an address size prefix. */
+ if (!is_long_mode(vcpu))
+ gva = (u32)gva;
+
+ trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
+
+ /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
+ kvm_mmu_invlpg(vcpu, gva);
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int skinit_interception(struct kvm_vcpu *vcpu)
+{
+ trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
+
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
+static int task_switch_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u16 tss_selector;
+ int reason;
+ int int_type = svm->vmcb->control.exit_int_info &
+ SVM_EXITINTINFO_TYPE_MASK;
+ int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
+ uint32_t type =
+ svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
+ uint32_t idt_v =
+ svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
+ bool has_error_code = false;
+ u32 error_code = 0;
+
+ tss_selector = (u16)svm->vmcb->control.exit_info_1;
+
+ if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
+ reason = TASK_SWITCH_IRET;
+ else if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
+ reason = TASK_SWITCH_JMP;
+ else if (idt_v)
+ reason = TASK_SWITCH_GATE;
+ else
+ reason = TASK_SWITCH_CALL;
+
+ if (reason == TASK_SWITCH_GATE) {
+ switch (type) {
+ case SVM_EXITINTINFO_TYPE_NMI:
+ vcpu->arch.nmi_injected = false;
+ break;
+ case SVM_EXITINTINFO_TYPE_EXEPT:
+ if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
+ has_error_code = true;
+ error_code =
+ (u32)svm->vmcb->control.exit_info_2;
+ }
+ kvm_clear_exception_queue(vcpu);
+ break;
+ case SVM_EXITINTINFO_TYPE_INTR:
+ case SVM_EXITINTINFO_TYPE_SOFT:
+ kvm_clear_interrupt_queue(vcpu);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (reason != TASK_SWITCH_GATE ||
+ int_type == SVM_EXITINTINFO_TYPE_SOFT ||
+ (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
+ (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
+ if (!svm_skip_emulated_instruction(vcpu))
+ return 0;
+ }
+
+ if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
+ int_vec = -1;
+
+ return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
+ has_error_code, error_code);
+}
+
+static void svm_clr_iret_intercept(struct vcpu_svm *svm)
+{
+ if (!sev_es_guest(svm->vcpu.kvm))
+ svm_clr_intercept(svm, INTERCEPT_IRET);
+}
+
+static void svm_set_iret_intercept(struct vcpu_svm *svm)
+{
+ if (!sev_es_guest(svm->vcpu.kvm))
+ svm_set_intercept(svm, INTERCEPT_IRET);
+}
+
+static int iret_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ WARN_ON_ONCE(sev_es_guest(vcpu->kvm));
+
+ ++vcpu->stat.nmi_window_exits;
+ svm->awaiting_iret_completion = true;
+
+ svm_clr_iret_intercept(svm);
+ svm->nmi_iret_rip = kvm_rip_read(vcpu);
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return 1;
+}
+
+static int invlpg_interception(struct kvm_vcpu *vcpu)
+{
+ if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return kvm_emulate_instruction(vcpu, 0);
+
+ kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int emulate_on_interception(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_instruction(vcpu, 0);
+}
+
+static int rsm_interception(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
+}
+
+static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
+ unsigned long val)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long cr0 = vcpu->arch.cr0;
+ bool ret = false;
+
+ if (!is_guest_mode(vcpu) ||
+ (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
+ return false;
+
+ cr0 &= ~SVM_CR0_SELECTIVE_MASK;
+ val &= ~SVM_CR0_SELECTIVE_MASK;
+
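+ /*
+ * The CR0_SEL_WRITE intercept fires only when bits other than TS and
+ * MP change, so compare the old and new values with those bits masked.
+ */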
+ if (cr0 ^ val) {
+ svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+ ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
+ }
+
+ return ret;
+}
+
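+/*
+ * Bit 63 of exit_info_1 is set when decode assists captured valid MOV-CR
+ * information; the GPR number is then in the low bits.
+ */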
+#define CR_VALID (1ULL << 63)
+
+static int cr_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int reg, cr;
+ unsigned long val;
+ int err;
+
+ if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_on_interception(vcpu);
+
+ if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
+ return emulate_on_interception(vcpu);
+
+ reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+ if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
+ cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
+ else
+ cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
+
+ err = 0;
+ if (cr >= 16) { /* mov to cr */
+ cr -= 16;
+ val = kvm_register_read(vcpu, reg);
+ trace_kvm_cr_write(cr, val);
+ switch (cr) {
+ case 0:
+ if (!check_selective_cr0_intercepted(vcpu, val))
+ err = kvm_set_cr0(vcpu, val);
+ else
+ return 1;
+
+ break;
+ case 3:
+ err = kvm_set_cr3(vcpu, val);
+ break;
+ case 4:
+ err = kvm_set_cr4(vcpu, val);
+ break;
+ case 8:
+ err = kvm_set_cr8(vcpu, val);
+ break;
+ default:
+ WARN(1, "unhandled write to CR%d", cr);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ } else { /* mov from cr */
+ switch (cr) {
+ case 0:
+ val = kvm_read_cr0(vcpu);
+ break;
+ case 2:
+ val = vcpu->arch.cr2;
+ break;
+ case 3:
+ val = kvm_read_cr3(vcpu);
+ break;
+ case 4:
+ val = kvm_read_cr4(vcpu);
+ break;
+ case 8:
+ val = kvm_get_cr8(vcpu);
+ break;
+ default:
+ WARN(1, "unhandled read from CR%d", cr);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ kvm_register_write(vcpu, reg, val);
+ trace_kvm_cr_read(cr, val);
+ }
+ return kvm_complete_insn_gp(vcpu, err);
+}
+
+static int cr_trap(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long old_value, new_value;
+ unsigned int cr;
+ int ret = 0;
+
+ new_value = (unsigned long)svm->vmcb->control.exit_info_1;
+
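+ /* The CRx write-trap exit codes are contiguous, so this yields the CR number. */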
+ cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
+ switch (cr) {
+ case 0:
+ old_value = kvm_read_cr0(vcpu);
+ svm_set_cr0(vcpu, new_value);
+
+ kvm_post_set_cr0(vcpu, old_value, new_value);
+ break;
+ case 4:
+ old_value = kvm_read_cr4(vcpu);
+ svm_set_cr4(vcpu, new_value);
+
+ kvm_post_set_cr4(vcpu, old_value, new_value);
+ break;
+ case 8:
+ ret = kvm_set_cr8(vcpu, new_value);
+ break;
+ default:
+ WARN(1, "unhandled CR%d write trap", cr);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ return kvm_complete_insn_gp(vcpu, ret);
+}
+
+static int dr_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int reg, dr;
+ unsigned long val;
+ int err = 0;
+
+ /*
+ * SEV-ES intercepts DR7 only to disable guest debugging, and the
+ * guest issues a VMGEXIT for DR7 writes only. KVM cannot change DR7
+ * (it is always swapped as type 'A') so return early.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return 1;
+
+ if (vcpu->guest_debug == 0) {
+ /*
+ * No more DR vmexits; force a reload of the debug registers
+ * and reenter on this instruction. The next vmexit will
+ * retrieve the full state of the debug registers.
+ */
+ clr_dr_intercepts(svm);
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+ return 1;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_on_interception(vcpu);
+
+ reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+ dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
+ if (dr >= 16) { /* mov to DRn */
+ dr -= 16;
+ val = kvm_register_read(vcpu, reg);
+ err = kvm_set_dr(vcpu, dr, val);
+ } else {
+ kvm_get_dr(vcpu, dr, &val);
+ kvm_register_write(vcpu, reg, val);
+ }
+
+ return kvm_complete_insn_gp(vcpu, err);
+}
+
+static int cr8_write_interception(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ u8 cr8_prev = kvm_get_cr8(vcpu);
+ /* instruction emulation calls kvm_set_cr8() */
+ r = cr_interception(vcpu);
+ if (lapic_in_kernel(vcpu))
+ return r;
+ if (cr8_prev <= kvm_get_cr8(vcpu))
+ return r;
+ vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
+ return 0;
+}
+
+static int efer_trap(struct kvm_vcpu *vcpu)
+{
+ struct msr_data msr_info;
+ int ret;
+
+ /*
+ * Clear the EFER_SVME bit from EFER. The SVM code always sets this
+ * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
+ * whether the guest has X86_FEATURE_SVM; clearing the bit avoids a
+ * spurious failure when the guest lacks that feature.
+ */
+ msr_info.host_initiated = false;
+ msr_info.index = MSR_EFER;
+ msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
+ ret = kvm_set_msr_common(vcpu, &msr_info);
+
+ return kvm_complete_insn_gp(vcpu, ret);
+}
+
+static int svm_get_msr_feature(struct kvm_msr_entry *msr)
+{
+ msr->data = 0;
+
+ switch (msr->index) {
+ case MSR_AMD64_DE_CFG:
+ if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
+ msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
+ break;
+ default:
+ return KVM_MSR_RET_INVALID;
+ }
+
+ return 0;
+}
+
+static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ switch (msr_info->index) {
+ case MSR_AMD64_TSC_RATIO:
+ if (!msr_info->host_initiated &&
+ !guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR))
+ return 1;
+ msr_info->data = svm->tsc_ratio_msr;
+ break;
+ case MSR_STAR:
+ msr_info->data = svm->vmcb01.ptr->save.star;
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_LSTAR:
+ msr_info->data = svm->vmcb01.ptr->save.lstar;
+ break;
+ case MSR_CSTAR:
+ msr_info->data = svm->vmcb01.ptr->save.cstar;
+ break;
+ case MSR_KERNEL_GS_BASE:
+ msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
+ break;
+ case MSR_SYSCALL_MASK:
+ msr_info->data = svm->vmcb01.ptr->save.sfmask;
+ break;
+#endif
+ case MSR_IA32_SYSENTER_CS:
+ msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
+ if (guest_cpuid_is_intel(vcpu))
+ msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
+ if (guest_cpuid_is_intel(vcpu))
+ msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
+ break;
+ case MSR_TSC_AUX:
+ msr_info->data = svm->tsc_aux;
+ break;
+ case MSR_IA32_DEBUGCTLMSR:
+ msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl;
+ break;
+ case MSR_IA32_LASTBRANCHFROMIP:
+ msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from;
+ break;
+ case MSR_IA32_LASTBRANCHTOIP:
+ msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to;
+ break;
+ case MSR_IA32_LASTINTFROMIP:
+ msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from;
+ break;
+ case MSR_IA32_LASTINTTOIP:
+ msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to;
+ break;
+ case MSR_VM_HSAVE_PA:
+ msr_info->data = svm->nested.hsave_msr;
+ break;
+ case MSR_VM_CR:
+ msr_info->data = svm->nested.vm_cr_msr;
+ break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_has_spec_ctrl_msr(vcpu))
+ return 1;
+
+ if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ msr_info->data = svm->vmcb->save.spec_ctrl;
+ else
+ msr_info->data = svm->spec_ctrl;
+ break;
+ case MSR_AMD64_VIRT_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
+ return 1;
+
+ msr_info->data = svm->virt_spec_ctrl;
+ break;
+ case MSR_F15H_IC_CFG: {
+
+ int family, model;
+
+ family = guest_cpuid_family(vcpu);
+ model = guest_cpuid_model(vcpu);
+
+ if (family < 0 || model < 0)
+ return kvm_get_msr_common(vcpu, msr_info);
+
+ msr_info->data = 0;
+
+ if (family == 0x15 &&
+ (model >= 0x2 && model < 0x20))
+ msr_info->data = 0x1E;
+ }
+ break;
+ case MSR_AMD64_DE_CFG:
+ msr_info->data = svm->msr_decfg;
+ break;
+ default:
+ return kvm_get_msr_common(vcpu, msr_info);
+ }
+ return 0;
+}
+
+static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
+ return kvm_complete_insn_gp(vcpu, err);
+
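+ /*
+ * Report the failure to the SEV-ES guest through the GHCB: a
+ * sw_exit_info_1 of 1 signals an error, and sw_exit_info_2 carries
+ * a #GP event for the guest to take.
+ */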
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
+ X86_TRAP_GP |
+ SVM_EVTINJ_TYPE_EXEPT |
+ SVM_EVTINJ_VALID);
+ return 1;
+}
+
+static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int svm_dis, chg_mask;
+
+ if (data & ~SVM_VM_CR_VALID_MASK)
+ return 1;
+
+ chg_mask = SVM_VM_CR_VALID_MASK;
+
+ if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
+ chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
+
+ svm->nested.vm_cr_msr &= ~chg_mask;
+ svm->nested.vm_cr_msr |= (data & chg_mask);
+
+ svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
+
+ /* check for svm_disable while efer.svme is set */
+ if (svm_dis && (vcpu->arch.efer & EFER_SVME))
+ return 1;
+
+ return 0;
+}
+
+static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret = 0;
+
+ u32 ecx = msr->index;
+ u64 data = msr->data;
+ switch (ecx) {
+ case MSR_AMD64_TSC_RATIO:
+
+ if (!guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) {
+
+ if (!msr->host_initiated)
+ return 1;
+ /*
+ * In case TSC scaling is not enabled, always
+ * leave this MSR at the default value.
+ *
+ * Due to a bug in QEMU 6.2.0, it would try to set
+ * this MSR to 0 if TSC scaling is not enabled.
+ * Ignore this value as well.
+ */
+ if (data != 0 && data != svm->tsc_ratio_msr)
+ return 1;
+ break;
+ }
+
+ if (data & SVM_TSC_RATIO_RSVD)
+ return 1;
+
+ svm->tsc_ratio_msr = data;
+
+ if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) &&
+ is_guest_mode(vcpu))
+ nested_svm_update_tsc_ratio_msr(vcpu);
+
+ break;
+ case MSR_IA32_CR_PAT:
+ ret = kvm_set_msr_common(vcpu, msr);
+ if (ret)
+ break;
+
+ svm->vmcb01.ptr->save.g_pat = data;
+ if (is_guest_mode(vcpu))
+ nested_vmcb02_compute_g_pat(svm);
+ vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
+ break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr->host_initiated &&
+ !guest_has_spec_ctrl_msr(vcpu))
+ return 1;
+
+ if (kvm_spec_ctrl_test_value(data))
+ return 1;
+
+ if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ svm->vmcb->save.spec_ctrl = data;
+ else
+ svm->spec_ctrl = data;
+ if (!data)
+ break;
+
+ /*
+ * For non-nested:
+ * When it's written (to non-zero) for the first time, pass
+ * it through.
+ *
+ * For nested:
+ * The handling of the MSR bitmap for L2 guests is done in
+ * nested_svm_vmrun_msrpm.
+ * We update the L1 MSR bit as well since it will end up
+ * touching the MSR anyway now.
+ */
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+ break;
+ case MSR_AMD64_VIRT_SPEC_CTRL:
+ if (!msr->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
+ return 1;
+
+ if (data & ~SPEC_CTRL_SSBD)
+ return 1;
+
+ svm->virt_spec_ctrl = data;
+ break;
+ case MSR_STAR:
+ svm->vmcb01.ptr->save.star = data;
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_LSTAR:
+ svm->vmcb01.ptr->save.lstar = data;
+ break;
+ case MSR_CSTAR:
+ svm->vmcb01.ptr->save.cstar = data;
+ break;
+ case MSR_KERNEL_GS_BASE:
+ svm->vmcb01.ptr->save.kernel_gs_base = data;
+ break;
+ case MSR_SYSCALL_MASK:
+ svm->vmcb01.ptr->save.sfmask = data;
+ break;
+#endif
+ case MSR_IA32_SYSENTER_CS:
+ svm->vmcb01.ptr->save.sysenter_cs = data;
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
+ /*
+ * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} MSRs
+ * when we spoof an Intel vendor ID (for cross-vendor migration).
+ * In this case we use this intercept to track the high
+ * 32-bit part of these MSRs to support Intel's
+ * implementation of SYSENTER/SYSEXIT.
+ */
+ svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
+ svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
+ break;
+ case MSR_TSC_AUX:
+ /*
+ * TSC_AUX is always virtualized for SEV-ES guests when the
+ * feature is available. The user return MSR support is not
+ * required in this case because TSC_AUX is restored on #VMEXIT
+ * from the host save area (which has been initialized in
+ * svm_hardware_enable()).
+ */
+ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm))
+ break;
+
+ /*
+ * TSC_AUX is usually changed only during boot and never read
+ * directly. Intercept TSC_AUX instead of exposing it to the
+ * guest via direct_access_msrs, and switch it via user return.
+ */
+ preempt_disable();
+ ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
+ preempt_enable();
+ if (ret)
+ break;
+
+ svm->tsc_aux = data;
+ break;
+ case MSR_IA32_DEBUGCTLMSR:
+ if (!lbrv) {
+ kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
+ break;
+ }
+ if (data & DEBUGCTL_RESERVED_BITS)
+ return 1;
+
+ svm_get_lbr_vmcb(svm)->save.dbgctl = data;
+ svm_update_lbrv(vcpu);
+ break;
+ case MSR_VM_HSAVE_PA:
+ /*
+ * Old kernels did not validate the value written to
+ * MSR_VM_HSAVE_PA. Allow KVM_SET_MSR to set an invalid
+ * value to allow live migrating buggy or malicious guests
+ * originating from those kernels.
+ */
+ if (!msr->host_initiated && !page_address_valid(vcpu, data))
+ return 1;
+
+ svm->nested.hsave_msr = data & PAGE_MASK;
+ break;
+ case MSR_VM_CR:
+ return svm_set_vm_cr(vcpu, data);
+ case MSR_VM_IGNNE:
+ kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
+ break;
+ case MSR_AMD64_DE_CFG: {
+ struct kvm_msr_entry msr_entry;
+
+ msr_entry.index = msr->index;
+ if (svm_get_msr_feature(&msr_entry))
+ return 1;
+
+ /* Check the supported bits */
+ if (data & ~msr_entry.data)
+ return 1;
+
+ /* Don't allow the guest to change a bit, #GP */
+ if (!msr->host_initiated && (data ^ msr_entry.data))
+ return 1;
+
+ svm->msr_decfg = data;
+ break;
+ }
+ default:
+ return kvm_set_msr_common(vcpu, msr);
+ }
+ return ret;
+}
+
+static int msr_interception(struct kvm_vcpu *vcpu)
+{
+ if (to_svm(vcpu)->vmcb->control.exit_info_1)
+ return kvm_emulate_wrmsr(vcpu);
+ else
+ return kvm_emulate_rdmsr(vcpu);
+}
+
+static int interrupt_window_interception(struct kvm_vcpu *vcpu)
+{
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ svm_clear_vintr(to_svm(vcpu));
+
+ /*
+ * If not running nested, for AVIC, the only reason to end up here is
+ * ExtINTs. In this case AVIC was temporarily disabled for
+ * requesting the IRQ window and we have to re-enable it.
+ *
+ * If running nested, still remove the VM wide AVIC inhibit to
+ * support the case in which the interrupt window was requested when
+ * the vCPU was not running nested.
+ *
+ * All vCPUs which still run nested will keep their
+ * AVIC inhibited due to the per-vCPU AVIC inhibition.
+ */
+ kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
+
+ ++vcpu->stat.irq_window_exits;
+ return 1;
+}
+
+static int pause_interception(struct kvm_vcpu *vcpu)
+{
+ bool in_kernel;
+ /*
+ * CPL is not made available for an SEV-ES guest, therefore
+ * vcpu->arch.preempted_in_kernel can never be true. Just
+ * set in_kernel to false as well.
+ */
+ in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
+
+ grow_ple_window(vcpu);
+
+ kvm_vcpu_on_spin(vcpu, in_kernel);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int invpcid_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long type;
+ gva_t gva;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ /*
+ * For an INVPCID intercept:
+ * EXITINFO1 provides the linear address of the memory operand.
+ * EXITINFO2 provides the contents of the register operand.
+ */
+ type = svm->vmcb->control.exit_info_2;
+ gva = svm->vmcb->control.exit_info_1;
+
+ return kvm_handle_invpcid(vcpu, type, gva);
+}
+
+static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
+ [SVM_EXIT_READ_CR0] = cr_interception,
+ [SVM_EXIT_READ_CR3] = cr_interception,
+ [SVM_EXIT_READ_CR4] = cr_interception,
+ [SVM_EXIT_READ_CR8] = cr_interception,
+ [SVM_EXIT_CR0_SEL_WRITE] = cr_interception,
+ [SVM_EXIT_WRITE_CR0] = cr_interception,
+ [SVM_EXIT_WRITE_CR3] = cr_interception,
+ [SVM_EXIT_WRITE_CR4] = cr_interception,
+ [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
+ [SVM_EXIT_READ_DR0] = dr_interception,
+ [SVM_EXIT_READ_DR1] = dr_interception,
+ [SVM_EXIT_READ_DR2] = dr_interception,
+ [SVM_EXIT_READ_DR3] = dr_interception,
+ [SVM_EXIT_READ_DR4] = dr_interception,
+ [SVM_EXIT_READ_DR5] = dr_interception,
+ [SVM_EXIT_READ_DR6] = dr_interception,
+ [SVM_EXIT_READ_DR7] = dr_interception,
+ [SVM_EXIT_WRITE_DR0] = dr_interception,
+ [SVM_EXIT_WRITE_DR1] = dr_interception,
+ [SVM_EXIT_WRITE_DR2] = dr_interception,
+ [SVM_EXIT_WRITE_DR3] = dr_interception,
+ [SVM_EXIT_WRITE_DR4] = dr_interception,
+ [SVM_EXIT_WRITE_DR5] = dr_interception,
+ [SVM_EXIT_WRITE_DR6] = dr_interception,
+ [SVM_EXIT_WRITE_DR7] = dr_interception,
+ [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
+ [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
+ [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
+ [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
+ [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
+ [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
+ [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception,
+ [SVM_EXIT_INTR] = intr_interception,
+ [SVM_EXIT_NMI] = nmi_interception,
+ [SVM_EXIT_SMI] = smi_interception,
+ [SVM_EXIT_VINTR] = interrupt_window_interception,
+ [SVM_EXIT_RDPMC] = kvm_emulate_rdpmc,
+ [SVM_EXIT_CPUID] = kvm_emulate_cpuid,
+ [SVM_EXIT_IRET] = iret_interception,
+ [SVM_EXIT_INVD] = kvm_emulate_invd,
+ [SVM_EXIT_PAUSE] = pause_interception,
+ [SVM_EXIT_HLT] = kvm_emulate_halt,
+ [SVM_EXIT_INVLPG] = invlpg_interception,
+ [SVM_EXIT_INVLPGA] = invlpga_interception,
+ [SVM_EXIT_IOIO] = io_interception,
+ [SVM_EXIT_MSR] = msr_interception,
+ [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
+ [SVM_EXIT_SHUTDOWN] = shutdown_interception,
+ [SVM_EXIT_VMRUN] = vmrun_interception,
+ [SVM_EXIT_VMMCALL] = kvm_emulate_hypercall,
+ [SVM_EXIT_VMLOAD] = vmload_interception,
+ [SVM_EXIT_VMSAVE] = vmsave_interception,
+ [SVM_EXIT_STGI] = stgi_interception,
+ [SVM_EXIT_CLGI] = clgi_interception,
+ [SVM_EXIT_SKINIT] = skinit_interception,
+ [SVM_EXIT_RDTSCP] = kvm_handle_invalid_op,
+ [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd,
+ [SVM_EXIT_MONITOR] = kvm_emulate_monitor,
+ [SVM_EXIT_MWAIT] = kvm_emulate_mwait,
+ [SVM_EXIT_XSETBV] = kvm_emulate_xsetbv,
+ [SVM_EXIT_RDPRU] = kvm_handle_invalid_op,
+ [SVM_EXIT_EFER_WRITE_TRAP] = efer_trap,
+ [SVM_EXIT_CR0_WRITE_TRAP] = cr_trap,
+ [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap,
+ [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap,
+ [SVM_EXIT_INVPCID] = invpcid_interception,
+ [SVM_EXIT_NPF] = npf_interception,
+ [SVM_EXIT_RSM] = rsm_interception,
+ [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
+ [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
+ [SVM_EXIT_VMGEXIT] = sev_handle_vmgexit,
+};
+
+static void dump_vmcb(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct vmcb_save_area *save = &svm->vmcb->save;
+ struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
+
+ if (!dump_invalid_vmcb) {
+ pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
+ return;
+ }
+
+ pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
+ svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
+ pr_err("VMCB Control Area:\n");
+ pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
+ pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
+ pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
+ pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
+ pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
+ pr_err("%-20s%08x %08x\n", "intercepts:",
+ control->intercepts[INTERCEPT_WORD3],
+ control->intercepts[INTERCEPT_WORD4]);
+ pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
+ pr_err("%-20s%d\n", "pause filter threshold:",
+ control->pause_filter_thresh);
+ pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
+ pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
+ pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
+ pr_err("%-20s%d\n", "asid:", control->asid);
+ pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
+ pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
+ pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
+ pr_err("%-20s%08x\n", "int_state:", control->int_state);
+ pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
+ pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
+ pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
+ pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
+ pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
+ pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
+ pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
+ pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
+ pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
+ pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
+ pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
+ pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
+ pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
+ pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
+ pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
+ pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
+ pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
+ pr_err("VMCB State Save Area:\n");
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "es:",
+ save->es.selector, save->es.attrib,
+ save->es.limit, save->es.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "cs:",
+ save->cs.selector, save->cs.attrib,
+ save->cs.limit, save->cs.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "ss:",
+ save->ss.selector, save->ss.attrib,
+ save->ss.limit, save->ss.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "ds:",
+ save->ds.selector, save->ds.attrib,
+ save->ds.limit, save->ds.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "fs:",
+ save01->fs.selector, save01->fs.attrib,
+ save01->fs.limit, save01->fs.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "gs:",
+ save01->gs.selector, save01->gs.attrib,
+ save01->gs.limit, save01->gs.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "gdtr:",
+ save->gdtr.selector, save->gdtr.attrib,
+ save->gdtr.limit, save->gdtr.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "ldtr:",
+ save01->ldtr.selector, save01->ldtr.attrib,
+ save01->ldtr.limit, save01->ldtr.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "idtr:",
+ save->idtr.selector, save->idtr.attrib,
+ save->idtr.limit, save->idtr.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "tr:",
+ save01->tr.selector, save01->tr.attrib,
+ save01->tr.limit, save01->tr.base);
+ pr_err("vmpl: %d cpl: %d efer: %016llx\n",
+ save->vmpl, save->cpl, save->efer);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "cr0:", save->cr0, "cr2:", save->cr2);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "cr3:", save->cr3, "cr4:", save->cr4);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "dr6:", save->dr6, "dr7:", save->dr7);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rip:", save->rip, "rflags:", save->rflags);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rsp:", save->rsp, "rax:", save->rax);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "star:", save01->star, "lstar:", save01->lstar);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "cstar:", save01->cstar, "sfmask:", save01->sfmask);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "kernel_gs_base:", save01->kernel_gs_base,
+ "sysenter_cs:", save01->sysenter_cs);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "sysenter_esp:", save01->sysenter_esp,
+ "sysenter_eip:", save01->sysenter_eip);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "br_from:", save->br_from, "br_to:", save->br_to);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "excp_from:", save->last_excp_from,
+ "excp_to:", save->last_excp_to);
+}
+
+static bool svm_check_exit_valid(u64 exit_code)
+{
+ return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
+ svm_exit_handlers[exit_code]);
+}
+
+static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
+{
+ vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
+ dump_vmcb(vcpu);
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
+ vcpu->run->internal.ndata = 2;
+ vcpu->run->internal.data[0] = exit_code;
+ vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+ return 0;
+}
+
+int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
+{
+ if (!svm_check_exit_valid(exit_code))
+ return svm_handle_invalid_exit(vcpu, exit_code);
+
+#ifdef CONFIG_RETPOLINE
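+ /*
+ * With retpolines, the indirect call through svm_exit_handlers[] is
+ * expensive; open-code the most common exit reasons as direct calls.
+ */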
+ if (exit_code == SVM_EXIT_MSR)
+ return msr_interception(vcpu);
+ else if (exit_code == SVM_EXIT_VINTR)
+ return interrupt_window_interception(vcpu);
+ else if (exit_code == SVM_EXIT_INTR)
+ return intr_interception(vcpu);
+ else if (exit_code == SVM_EXIT_HLT)
+ return kvm_emulate_halt(vcpu);
+ else if (exit_code == SVM_EXIT_NPF)
+ return npf_interception(vcpu);
+#endif
+ return svm_exit_handlers[exit_code](vcpu);
+}
+
+static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2,
+ u32 *intr_info, u32 *error_code)
+{
+ struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+
+ *reason = control->exit_code;
+ *info1 = control->exit_info_1;
+ *info2 = control->exit_info_2;
+ *intr_info = control->exit_int_info;
+ if ((*intr_info & SVM_EXITINTINFO_VALID) &&
+ (*intr_info & SVM_EXITINTINFO_VALID_ERR))
+ *error_code = control->exit_int_info_err;
+ else
+ *error_code = 0;
+}
+
+static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_run *kvm_run = vcpu->run;
+ u32 exit_code = svm->vmcb->control.exit_code;
+
+ /* SEV-ES guests must use the CR write traps to track CR registers. */
+ if (!sev_es_guest(vcpu->kvm)) {
+ if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
+ vcpu->arch.cr0 = svm->vmcb->save.cr0;
+ if (npt_enabled)
+ vcpu->arch.cr3 = svm->vmcb->save.cr3;
+ }
+
+ if (is_guest_mode(vcpu)) {
+ int vmexit;
+
+ trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
+
+ vmexit = nested_svm_exit_special(svm);
+
+ if (vmexit == NESTED_EXIT_CONTINUE)
+ vmexit = nested_svm_exit_handled(svm);
+
+ if (vmexit == NESTED_EXIT_DONE)
+ return 1;
+ }
+
+ if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
+ kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ kvm_run->fail_entry.hardware_entry_failure_reason
+ = svm->vmcb->control.exit_code;
+ kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
+ dump_vmcb(vcpu);
+ return 0;
+ }
+
+ if (exit_fastpath != EXIT_FASTPATH_NONE)
+ return 1;
+
+ return svm_invoke_exit_handler(vcpu, exit_code);
+}
+
+static void pre_svm_run(struct kvm_vcpu *vcpu)
+{
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * If the previous vmrun of the vmcb occurred on a different physical
+ * cpu, then mark the vmcb dirty and assign a new asid. Hardware's
+ * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
+ */
+ if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
+ svm->current_vmcb->asid_generation = 0;
+ vmcb_mark_all_dirty(svm->vmcb);
+ svm->current_vmcb->cpu = vcpu->cpu;
+ }
+
+ if (sev_guest(vcpu->kvm))
+ return pre_sev_run(svm, vcpu->cpu);
+
+ /* FIXME: handle wraparound of asid_generation */
+ if (svm->current_vmcb->asid_generation != sd->asid_generation)
+ new_asid(svm, sd);
+}
+
+static void svm_inject_nmi(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
+
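+ /*
+ * Note: an NMI that L1 injects into L2 presumably doesn't alter L1's
+ * NMI blocking, so KVM's masking/IRET-intercept tracking is skipped.
+ */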
+ if (svm->nmi_l1_to_l2)
+ return;
+
+ svm->nmi_masked = true;
+ svm_set_iret_intercept(svm);
+ ++vcpu->stat.nmi_injections;
+}
+
+static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!is_vnmi_enabled(svm))
+ return false;
+
+ return !!(svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK);
+}
+
+static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!is_vnmi_enabled(svm))
+ return false;
+
+ if (svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK)
+ return false;
+
+ svm->vmcb->control.int_ctl |= V_NMI_PENDING_MASK;
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
+
+ /*
+ * Because the pending NMI is serviced by hardware, KVM can't know when
+ * the NMI is "injected", but for all intents and purposes, passing the
+ * NMI off to hardware counts as injection.
+ */
+ ++vcpu->stat.nmi_injections;
+
+ return true;
+}
+
+static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 type;
+
+ if (vcpu->arch.interrupt.soft) {
+ if (svm_update_soft_interrupt_rip(vcpu))
+ return;
+
+ type = SVM_EVTINJ_TYPE_SOFT;
+ } else {
+ type = SVM_EVTINJ_TYPE_INTR;
+ }
+
+ trace_kvm_inj_virq(vcpu->arch.interrupt.nr,
+ vcpu->arch.interrupt.soft, reinjected);
+ ++vcpu->stat.irq_injections;
+
+ svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
+ SVM_EVTINJ_VALID | type;
+}
+
+void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
+ int trig_mode, int vector)
+{
+ /*
+ * apic->apicv_active must be read after vcpu->mode.
+ * Pairs with smp_store_release in vcpu_enter_guest.
+ */
+ bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
+
+ /* Note, this is called iff the local APIC is in-kernel. */
+ if (!READ_ONCE(vcpu->arch.apic->apicv_active)) {
+ /* Process the interrupt via kvm_check_and_inject_events(). */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ return;
+ }
+
+ trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
+ if (in_guest_mode) {
+ /*
+ * Signal the doorbell to tell hardware to inject the IRQ. If
+ * the vCPU exits the guest before the doorbell chimes, hardware
+ * will automatically process AVIC interrupts at the next VMRUN.
+ */
+ avic_ring_doorbell(vcpu);
+ } else {
+ /*
+ * Wake the vCPU if it was blocking. KVM will then detect the
+ * pending IRQ when checking if the vCPU has a wake event.
+ */
+ kvm_vcpu_wake_up(vcpu);
+ }
+}
+
+static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
+{
+ kvm_lapic_set_irr(vector, apic);
+
+ /*
+ * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
+ * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
+ * the read of guest_mode. This guarantees that either VMRUN will see
+ * and process the new vIRR entry, or that svm_complete_interrupt_delivery
+ * will signal the doorbell if the CPU has already entered the guest.
+ */
+ smp_mb__after_atomic();
+ svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
+}
+
+static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * SEV-ES guests must always keep the CR intercepts cleared. CR
+ * tracking is done using the CR write traps.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return;
+
+ if (nested_svm_virtualize_tpr(vcpu))
+ return;
+
+ svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
+
+ if (irr == -1)
+ return;
+
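+ /*
+ * Re-arm the intercept if the highest pending IRQ is blocked by the
+ * TPR, so that KVM is notified when the guest lowers its TPR.
+ */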
+ if (tpr >= irr)
+ svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
+}
+
+static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (is_vnmi_enabled(svm))
+ return svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK;
+ else
+ return svm->nmi_masked;
+}
+
+static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (is_vnmi_enabled(svm)) {
+ if (masked)
+ svm->vmcb->control.int_ctl |= V_NMI_BLOCKING_MASK;
+ else
+ svm->vmcb->control.int_ctl &= ~V_NMI_BLOCKING_MASK;
+
+ } else {
+ svm->nmi_masked = masked;
+ if (masked)
+ svm_set_iret_intercept(svm);
+ else
+ svm_clr_iret_intercept(svm);
+ }
+}
+
+bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (!gif_set(svm))
+ return true;
+
+ if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
+ return false;
+
+ if (svm_get_nmi_mask(vcpu))
+ return true;
+
+ return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK;
+}
+
+static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (svm->nested.nested_run_pending)
+ return -EBUSY;
+
+ if (svm_nmi_blocked(vcpu))
+ return 0;
+
+ /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
+ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
+ return -EBUSY;
+ return 1;
+}
+
+bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (!gif_set(svm))
+ return true;
+
+ if (is_guest_mode(vcpu)) {
+ /* As long as interrupts are being delivered... */
+ if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
+ ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
+ : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
+ return true;
+
+ /* ... vmexits aren't blocked by the interrupt shadow */
+ if (nested_exit_on_intr(svm))
+ return false;
+ } else {
+ if (!svm_get_if_flag(vcpu))
+ return true;
+ }
+
+ return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
+}
+
+static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (svm->nested.nested_run_pending)
+ return -EBUSY;
+
+ if (svm_interrupt_blocked(vcpu))
+ return 0;
+
+ /*
+ * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
+ * e.g. if the IRQ arrived asynchronously after checking nested events.
+ */
+ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
+ return -EBUSY;
+
+ return 1;
+}
+
+static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
+ * 1, because that's a separate STGI/VMRUN intercept. The next time we
+ * get that intercept, this function will be called again though and
+ * we'll get the vintr intercept. However, if the vGIF feature is
+ * enabled, the STGI interception will not occur. Enable the irq
+ * window under the assumption that the hardware will set the GIF.
+ */
+ if (vgif || gif_set(svm)) {
+ /*
+ * IRQ window is not needed when AVIC is enabled,
+ * unless we have pending ExtINT since it cannot be injected
+ * via AVIC. In that case, KVM needs to temporarily disable AVIC,
+ * and fall back to injecting the IRQ via V_IRQ.
+ *
+ * If running nested, AVIC is already locally inhibited
+ * on this vCPU, therefore there is no need to request
+ * the VM wide AVIC inhibition.
+ */
+ if (!is_guest_mode(vcpu))
+ kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
+
+ svm_set_vintr(svm);
+ }
+}
+
+static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * KVM should never request an NMI window when vNMI is enabled, as KVM
+ * allows at most one to-be-injected NMI and one pending NMI, i.e. if
+ * two NMIs arrive simultaneously, KVM will inject one and set
+ * V_NMI_PENDING for the other. WARN, but continue with the standard
+ * single-step approach to try and salvage the pending NMI.
+ */
+ WARN_ON_ONCE(is_vnmi_enabled(svm));
+
+ if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion)
+ return; /* IRET will cause a vm exit */
+
+ /*
+ * SEV-ES guests are responsible for signaling when a vCPU is ready to
+ * receive a new NMI, as SEV-ES guests can't be single-stepped, i.e.
+ * KVM can't intercept and single-step IRET to detect when NMIs are
+ * unblocked (architecturally speaking). See SVM_VMGEXIT_NMI_COMPLETE.
+ *
+ * Note, GIF is guaranteed to be '1' for SEV-ES guests as hardware
+ * ignores SEV-ES guest writes to EFER.SVME *and* CLGI/STGI are not
+ * supported NAEs in the GHCB protocol.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return;
+
+ if (!gif_set(svm)) {
+ if (vgif)
+ svm_set_intercept(svm, INTERCEPT_STGI);
+ return; /* STGI will cause a vm exit */
+ }
+
+ /*
+ * Something prevents the NMI from being injected. Single-step over
+ * the problem (IRET, exception injection, or interrupt shadow).
+ */
+ svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
+ svm->nmi_singlestep = true;
+ svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+}
+
+static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries.
+ * A TLB flush for the current ASID flushes both "host" and "guest" TLB
+ * entries, and thus is a superset of Hyper-V's fine grained flushing.
+ */
+ kvm_hv_vcpu_purge_flush_tlb(vcpu);
+
+ /*
+ * Flush only the current ASID even if the TLB flush was invoked via
+ * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all
+ * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
+ * unconditionally does a TLB flush on both nested VM-Enter and nested
+ * VM-Exit (via kvm_mmu_reset_context()).
+ */
+ if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+ else
+ svm->current_vmcb->asid_generation--;
+}
+
+static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+ hpa_t root_tdp = vcpu->arch.mmu->root.hpa;
+
+ /*
+ * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly
+ * flush the NPT mappings via hypercall as flushing the ASID only
+ * affects virtual to physical mappings, it does not invalidate guest
+ * physical to host physical mappings.
+ */
+ if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp))
+ hyperv_flush_guest_mapping(root_tdp);
+
+ svm_flush_tlb_asid(vcpu);
+}
+
+static void svm_flush_tlb_all(struct kvm_vcpu *vcpu)
+{
+ /*
+ * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB
+ * flushes should be routed to hv_flush_remote_tlbs() without requesting
+ * a "regular" remote flush. Reaching this point means either there's
+ * a KVM bug or a prior hv_flush_remote_tlbs() call failed, both of
+ * which might be fatal to the guest. Yell, but try to recover.
+ */
+ if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu)))
+ hv_flush_remote_tlbs(vcpu->kvm);
+
+ svm_flush_tlb_asid(vcpu);
+}
+
+static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ invlpga(gva, svm->vmcb->control.asid);
+}
+
+static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (nested_svm_virtualize_tpr(vcpu))
+ return;
+
+ if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
+ int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
+ kvm_set_cr8(vcpu, cr8);
+ }
+}
+
+static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 cr8;
+
+ if (nested_svm_virtualize_tpr(vcpu) ||
+ kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ cr8 = kvm_get_cr8(vcpu);
+ svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
+ svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
+}
+
+static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
+ int type)
+{
+ bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT);
+ bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
+ * associated with the original soft exception/interrupt. next_rip is
+ * cleared on all exits that can occur while vectoring an event, so KVM
+ * needs to manually set next_rip for re-injection. Unlike the !nrips
+ * case below, this needs to be done if and only if KVM is re-injecting
+ * the same event, i.e. if the event is a soft exception/interrupt,
+ * otherwise next_rip is unused on VMRUN.
+ */
+ if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) &&
+ kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase))
+ svm->vmcb->control.next_rip = svm->soft_int_next_rip;
+ /*
+ * If NRIPS isn't enabled, KVM must manually advance RIP prior to
+ * injecting the soft exception/interrupt. That advancement needs to
+ * be unwound if vectoring didn't complete. Note, the new event may
+ * not be the injected event, e.g. if KVM injected an INTn, the INTn
+ * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
+ * be the reported vectored event, but RIP still needs to be unwound.
+ */
+ else if (!nrips && (is_soft || is_exception) &&
+ kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase))
+ kvm_rip_write(vcpu, svm->soft_int_old_rip);
+}
+
+static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u8 vector;
+ int type;
+ u32 exitintinfo = svm->vmcb->control.exit_int_info;
+ bool nmi_l1_to_l2 = svm->nmi_l1_to_l2;
+ bool soft_int_injected = svm->soft_int_injected;
+
+ svm->nmi_l1_to_l2 = false;
+ svm->soft_int_injected = false;
+
+ /*
+ * If we've made progress since setting awaiting_iret_completion, we've
+ * executed an IRET and can allow NMI injection.
+ */
+ if (svm->awaiting_iret_completion &&
+ kvm_rip_read(vcpu) != svm->nmi_iret_rip) {
+ svm->awaiting_iret_completion = false;
+ svm->nmi_masked = false;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
+
+ vcpu->arch.nmi_injected = false;
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
+
+ if (!(exitintinfo & SVM_EXITINTINFO_VALID))
+ return;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
+ type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
+
+ if (soft_int_injected)
+ svm_complete_soft_interrupt(vcpu, vector, type);
+
+ switch (type) {
+ case SVM_EXITINTINFO_TYPE_NMI:
+ vcpu->arch.nmi_injected = true;
+ svm->nmi_l1_to_l2 = nmi_l1_to_l2;
+ break;
+ case SVM_EXITINTINFO_TYPE_EXEPT:
+ /*
+ * Never re-inject a #VC exception.
+ */
+ if (vector == X86_TRAP_VC)
+ break;
+
+ if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
+ u32 err = svm->vmcb->control.exit_int_info_err;
+ kvm_requeue_exception_e(vcpu, vector, err);
+
+ } else
+ kvm_requeue_exception(vcpu, vector);
+ break;
+ case SVM_EXITINTINFO_TYPE_INTR:
+ kvm_queue_interrupt(vcpu, vector, false);
+ break;
+ case SVM_EXITINTINFO_TYPE_SOFT:
+ kvm_queue_interrupt(vcpu, vector, true);
+ break;
+ default:
+ break;
+ }
+}
+
+static void svm_cancel_injection(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+
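+ /*
+ * Stash the cancelled event in exit_int_info so that
+ * svm_complete_interrupts() requeues it as a pending event.
+ */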
+ control->exit_int_info = control->event_inj;
+ control->exit_int_info_err = control->event_inj_err;
+ control->event_inj = 0;
+ svm_complete_interrupts(vcpu);
+}
+
+static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ return 1;
+}
+
+static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+{
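+ /* exit_info_1 is '1' for WRMSR, '0' for RDMSR; only WRMSR has a fastpath. */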
+ if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
+ to_svm(vcpu)->vmcb->control.exit_info_1)
+ return handle_fastpath_set_msr_irqoff(vcpu);
+
+ return EXIT_FASTPATH_NONE;
+}
+
+static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ guest_state_enter_irqoff();
+
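+ /* Mitigate the AMD "DIV0" speculative divider-state leak before entry. */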
+ amd_clear_divider();
+
+ if (sev_es_guest(vcpu->kvm))
+ __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
+ else
+ __svm_vcpu_run(svm, spec_ctrl_intercepted);
+
+ guest_state_exit_irqoff();
+}
+
+static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
+
+ trace_kvm_entry(vcpu);
+
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
+ /*
+ * Disable singlestep if we're injecting an interrupt/exception.
+ * We don't want our modified rflags to be pushed on the stack where
+ * we might not be able to easily reset them if we disabled NMI
+ * singlestep later.
+ */
+ if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
+ /*
+ * Event injection happens before external interrupts cause a
+ * vmexit and interrupts are disabled here, so smp_send_reschedule
+ * is enough to force an immediate vmexit.
+ */
+ disable_nmi_singlestep(svm);
+ smp_send_reschedule(vcpu->cpu);
+ }
+
+ pre_svm_run(vcpu);
+
+ sync_lapic_to_cr8(vcpu);
+
+ if (unlikely(svm->asid != svm->vmcb->control.asid)) {
+ svm->vmcb->control.asid = svm->asid;
+ vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
+ }
+ svm->vmcb->save.cr2 = vcpu->arch.cr2;
+
+ svm_hv_update_vp_id(svm->vmcb, vcpu);
+
+ /*
+ * Run with all-zero DR6 unless needed, so that we can get the exact cause
+ * of a #DB.
+ */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+ svm_set_dr6(svm, vcpu->arch.dr6);
+ else
+ svm_set_dr6(svm, DR6_ACTIVE_LOW);
+
+ clgi();
+ kvm_load_guest_xsave_state(vcpu);
+
+ kvm_wait_lapic_expire(vcpu);
+
+ /*
+ * If this vCPU has touched SPEC_CTRL, restore the guest's value if
+ * it's non-zero. Since vmentry is serialising on affected CPUs, there
+ * is no need to worry about the conditional branch over the wrmsr
+ * being speculatively taken.
+ */
+ if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ x86_spec_ctrl_set_guest(svm->virt_spec_ctrl);
+
+ svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
+
+ if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ x86_spec_ctrl_restore_host(svm->virt_spec_ctrl);
+
+ if (!sev_es_guest(vcpu->kvm)) {
+ vcpu->arch.cr2 = svm->vmcb->save.cr2;
+ vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+ vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+ vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
+ }
+ vcpu->arch.regs_dirty = 0;
+
+ if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
+ kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
+
+ kvm_load_host_xsave_state(vcpu);
+ stgi();
+
+ /* Any pending NMI will happen here */
+
+ if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
+ kvm_after_interrupt(vcpu);
+
+ sync_cr8_to_lapic(vcpu);
+
+ svm->next_rip = 0;
+ if (is_guest_mode(vcpu)) {
+ nested_sync_control_from_vmcb02(svm);
+
+ /* Track VMRUNs that have made it past consistency checking */
+ if (svm->nested.nested_run_pending &&
+ svm->vmcb->control.exit_code != SVM_EXIT_ERR)
+ ++vcpu->stat.nested_run;
+
+ svm->nested.nested_run_pending = 0;
+ }
+
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+ vmcb_mark_all_clean(svm->vmcb);
+
+ /* if exit due to PF check for async PF */
+ if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
+ vcpu->arch.apf.host_apf_flags =
+ kvm_read_and_reset_apf_flags();
+
+ vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
+
+ /*
+ * We need to handle MC intercepts here before the vcpu has a chance to
+ * change the physical cpu.
+ */
+ if (unlikely(svm->vmcb->control.exit_code ==
+ SVM_EXIT_EXCP_BASE + MC_VECTOR))
+ svm_handle_mce(vcpu);
+
+ trace_kvm_exit(vcpu, KVM_ISA_SVM);
+
+ svm_complete_interrupts(vcpu);
+
+ if (is_guest_mode(vcpu))
+ return EXIT_FASTPATH_NONE;
+
+ return svm_exit_handlers_fastpath(vcpu);
+}
+
+static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+ int root_level)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long cr3;
+
+ if (npt_enabled) {
+ svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
+ vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
+
+ hv_track_root_tdp(vcpu, root_hpa);
+
+ cr3 = vcpu->arch.cr3;
+ } else if (root_level >= PT64_ROOT_4LEVEL) {
+ cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
+ } else {
+ /* PCID in the guest should be impossible with a 32-bit MMU. */
+ WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
+ cr3 = root_hpa;
+ }
+
+ svm->vmcb->save.cr3 = cr3;
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
+}
+
+static void
+svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+{
+ /*
+ * Patch in the VMMCALL instruction:
+ */
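+ /* VMMCALL: 0f 01 d9 */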
+ hypercall[0] = 0x0f;
+ hypercall[1] = 0x01;
+ hypercall[2] = 0xd9;
+}
+
+/*
+ * The kvm parameter can be NULL (module initialization, or invocation before
+ * VM creation). Be sure to check the kvm parameter before using it.
+ */
+static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
+{
+ switch (index) {
+ case MSR_IA32_MCG_EXT_CTL:
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
+ return false;
+ case MSR_IA32_SMBASE:
+ if (!IS_ENABLED(CONFIG_KVM_SMM))
+ return false;
+ /* SEV-ES guests do not support SMM, so report false */
+ if (kvm && sev_es_guest(kvm))
+ return false;
+ break;
+ default:
+ break;
+ }
+
+ return true;
+}
+
+static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * SVM doesn't provide a way to disable just XSAVES in the guest; KVM
+ * can only disable all XSAVE variants by disallowing CR4.OSXSAVE from
+ * being set. As a result, if the host has XSAVE and XSAVES, and the
+ * guest has XSAVE enabled, the guest can execute XSAVES without
+ * faulting. Treat XSAVES as enabled in this case regardless of
+ * whether it's advertised to the guest so that KVM context switches
+ * XSS on VM-Enter/VM-Exit. Failure to do so would effectively give
+ * the guest read/write access to the host's XSS.
+ */
+ if (boot_cpu_has(X86_FEATURE_XSAVE) &&
+ boot_cpu_has(X86_FEATURE_XSAVES) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
+ kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES);
+
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV);
+
+ /*
+ * Intercept VMLOAD if the vCPU mode is Intel in order to emulate that
+ * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing
+ * SVM on Intel is bonkers and extremely unlikely to work).
+ */
+ if (!guest_cpuid_is_intel(vcpu))
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
+
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VGIF);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VNMI);
+
+ svm_recalc_instruction_intercepts(vcpu, svm);
+
+ if (boot_cpu_has(X86_FEATURE_IBPB))
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
+ !!guest_has_pred_cmd_msr(vcpu));
+
+ if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
+ !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
+
+ if (sev_guest(vcpu->kvm))
+ sev_vcpu_after_set_cpuid(svm);
+
+ init_vmcb_after_set_cpuid(vcpu);
+}
+
+static bool svm_has_wbinvd_exit(void)
+{
+ return true;
+}
+
+#define PRE_EX(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_PRE_EXCEPT, }
+#define POST_EX(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_POST_EXCEPT, }
+#define POST_MEM(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_POST_MEMACCESS, }
+
+static const struct __x86_intercept {
+ u32 exit_code;
+ enum x86_intercept_stage stage;
+} x86_intercept_map[] = {
+ [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0),
+ [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0),
+ [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0),
+ [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0),
+ [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ),
+ [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ),
+ [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE),
+ [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE),
+ [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ),
+ [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ),
+ [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE),
+ [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE),
+ [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN),
+ [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL),
+ [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD),
+ [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE),
+ [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI),
+ [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI),
+ [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT),
+ [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA),
+ [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP),
+ [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR),
+ [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT),
+ [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG),
+ [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD),
+ [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD),
+ [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR),
+ [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC),
+ [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR),
+ [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC),
+ [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID),
+ [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM),
+ [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE),
+ [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF),
+ [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF),
+ [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT),
+ [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET),
+ [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP),
+ [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT),
+ [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_xsetbv] = PRE_EX(SVM_EXIT_XSETBV),
+};
+
+#undef PRE_EX
+#undef POST_EX
+#undef POST_MEM
+
+static int svm_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage,
+ struct x86_exception *exception)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int vmexit, ret = X86EMUL_CONTINUE;
+ struct __x86_intercept icpt_info;
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
+ goto out;
+
+ icpt_info = x86_intercept_map[info->intercept];
+
+ if (stage != icpt_info.stage)
+ goto out;
+
+ switch (icpt_info.exit_code) {
+ case SVM_EXIT_READ_CR0:
+ if (info->intercept == x86_intercept_cr_read)
+ icpt_info.exit_code += info->modrm_reg;
+ break;
+ case SVM_EXIT_WRITE_CR0: {
+ unsigned long cr0, val;
+
+ if (info->intercept == x86_intercept_cr_write)
+ icpt_info.exit_code += info->modrm_reg;
+
+ if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
+ info->intercept == x86_intercept_clts)
+ break;
+
+ if (!(vmcb12_is_intercept(&svm->nested.ctl,
+ INTERCEPT_SELECTIVE_CR0)))
+ break;
+
+ cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
+ val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
+
+ if (info->intercept == x86_intercept_lmsw) {
+ cr0 &= 0xfUL;
+ val &= 0xfUL;
+ /* lmsw can't clear PE - catch this here */
+ if (cr0 & X86_CR0_PE)
+ val |= X86_CR0_PE;
+ }
+
+ if (cr0 ^ val)
+ icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+
+ break;
+ }
+ case SVM_EXIT_READ_DR0:
+ case SVM_EXIT_WRITE_DR0:
+ icpt_info.exit_code += info->modrm_reg;
+ break;
+ case SVM_EXIT_MSR:
+ if (info->intercept == x86_intercept_wrmsr)
+ vmcb->control.exit_info_1 = 1;
+ else
+ vmcb->control.exit_info_1 = 0;
+ break;
+ case SVM_EXIT_PAUSE:
+ /*
+ * We only get this intercept for a plain NOP, but PAUSE is
+ * encoded as REP NOP, so check for the REP prefix here.
+ */
+ if (info->rep_prefix != REPE_PREFIX)
+ goto out;
+ break;
+ case SVM_EXIT_IOIO: {
+ u64 exit_info;
+ u32 bytes;
+
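+ /*
+ * Build exit_info_1 in the SVM IOIO intercept format: port number in
+ * bits 31:16, type/size/REP/string flags in the low bits (APM vol. 2).
+ */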
+ if (info->intercept == x86_intercept_in ||
+ info->intercept == x86_intercept_ins) {
+ exit_info = ((info->src_val & 0xffff) << 16) |
+ SVM_IOIO_TYPE_MASK;
+ bytes = info->dst_bytes;
+ } else {
+ exit_info = (info->dst_val & 0xffff) << 16;
+ bytes = info->src_bytes;
+ }
+
+ if (info->intercept == x86_intercept_outs ||
+ info->intercept == x86_intercept_ins)
+ exit_info |= SVM_IOIO_STR_MASK;
+
+ if (info->rep_prefix)
+ exit_info |= SVM_IOIO_REP_MASK;
+
+ bytes = min(bytes, 4u);
+
+ exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
+
+ exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
+
+ vmcb->control.exit_info_1 = exit_info;
+ vmcb->control.exit_info_2 = info->next_rip;
+
+ break;
+ }
+ default:
+ break;
+ }
+
+ /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
+ if (static_cpu_has(X86_FEATURE_NRIPS))
+ vmcb->control.next_rip = info->next_rip;
+ vmcb->control.exit_code = icpt_info.exit_code;
+ vmexit = nested_svm_exit_handled(svm);
+
+ ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
+ : X86EMUL_CONTINUE;
+
+out:
+ return ret;
+}
+
+static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+{
+ if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
+ vcpu->arch.at_instruction_boundary = true;
+}
+
+static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+ if (!kvm_pause_in_guest(vcpu->kvm))
+ shrink_ple_window(vcpu);
+}
+
+static void svm_setup_mce(struct kvm_vcpu *vcpu)
+{
+ /* [63:9] are reserved. */
+ vcpu->arch.mcg_cap &= 0x1ff;
+}
+
+#ifdef CONFIG_KVM_SMM
+bool svm_smi_blocked(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /* Per APM Vol.2 15.22.2 "Response to SMI" */
+ if (!gif_set(svm))
+ return true;
+
+ return is_smm(vcpu);
+}
+
+static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (svm->nested.nested_run_pending)
+ return -EBUSY;
+
+ if (svm_smi_blocked(vcpu))
+ return 0;
+
+ /* An SMI must not be injected into L2 if it's supposed to VM-Exit. */
+ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
+ return -EBUSY;
+
+ return 1;
+}
+
+static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_host_map map_save;
+ int ret;
+
+ if (!is_guest_mode(vcpu))
+ return 0;
+
+ /*
+ * 32-bit SMRAM format doesn't preserve EFER and SVM state. Userspace is
+ * responsible for ensuring nested SVM and SMIs are mutually exclusive.
+ */
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ return 1;
+
+ smram->smram64.svm_guest_flag = 1;
+ smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa;
+
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
+ ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
+ if (ret)
+ return ret;
+
+ /*
+ * KVM uses VMCB01 to store L1 host state while L2 runs but
+ * VMCB01 is going to be used during SMM and thus the state will
+ * be lost. Temporarily save non-VMLOAD/VMSAVE state to the host save
+ * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the
+ * format of the area is identical to the guest save area offset
+ * by 0x400 (matches the offset of 'struct vmcb_save_area'
+ * within 'struct vmcb'). Note: HSAVE area may also be used by
+ * L1 hypervisor to save additional host context (e.g. KVM does
+ * that, see svm_prepare_switch_to_guest()) which must be
+ * preserved.
+ */
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
+ return 1;
+
+ BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
+
+ svm_copy_vmrun_state(map_save.hva + 0x400,
+ &svm->vmcb01.ptr->save);
+
+ kvm_vcpu_unmap(vcpu, &map_save, true);
+ return 0;
+}
+
+static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_host_map map, map_save;
+ struct vmcb *vmcb12;
+ int ret;
+
+ const struct kvm_smram_state_64 *smram64 = &smram->smram64;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ return 0;
+
+ /* Non-zero if SMI arrived while vCPU was in guest mode. */
+ if (!smram64->svm_guest_flag)
+ return 0;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+ return 1;
+
+ if (!(smram64->efer & EFER_SVME))
+ return 1;
+
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map))
+ return 1;
+
+ ret = 1;
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
+ goto unmap_map;
+
+ if (svm_allocate_nested(svm))
+ goto unmap_save;
+
+ /*
+ * Restore L1 host state from L1 HSAVE area as VMCB01 was
+ * used during SMM (see svm_enter_smm())
+ */
+
+ svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
+
+ /*
+ * Enter the nested guest now
+ */
+
+ vmcb_mark_all_dirty(svm->vmcb01.ptr);
+
+ vmcb12 = map.hva;
+ nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
+ nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
+ ret = enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, vmcb12, false);
+
+ if (ret)
+ goto unmap_save;
+
+ svm->nested.nested_run_pending = 1;
+
+unmap_save:
+ kvm_vcpu_unmap(vcpu, &map_save, true);
+unmap_map:
+ kvm_vcpu_unmap(vcpu, &map, true);
+ return ret;
+}
+
+static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!gif_set(svm)) {
+ if (vgif)
+ svm_set_intercept(svm, INTERCEPT_STGI);
+ /* STGI will cause a vm exit */
+ } else {
+ /* We must be in SMM; RSM will cause a vmexit anyway. */
+ }
+}
+#endif
+
+static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
+{
+ bool smep, smap, is_user;
+ u64 error_code;
+
+ /* Emulation is always possible when KVM has access to all guest state. */
+ if (!sev_guest(vcpu->kvm))
+ return true;
+
+ /* #UD and #GP should never be intercepted for SEV guests. */
+ WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
+ EMULTYPE_TRAP_UD_FORCED |
+ EMULTYPE_VMWARE_GP));
+
+ /*
+ * Emulation is impossible for SEV-ES guests as KVM doesn't have access
+ * to guest register state.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return false;
+
+ /*
+ * Emulation is possible if the instruction is already decoded, e.g.
+ * when completing I/O after returning from userspace.
+ */
+ if (emul_type & EMULTYPE_NO_DECODE)
+ return true;
+
+ /*
+ * Emulation is possible for SEV guests if and only if a prefilled
+ * buffer containing the bytes of the intercepted instruction is
+ * available. SEV guest memory is encrypted with a guest specific key
+ * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and
+ * decode garbage.
+ *
+ * If KVM is NOT trying to simply skip an instruction, inject #UD if
+ * KVM reached this point without an instruction buffer. In practice,
+ * this path should never be hit by a well-behaved guest, e.g. KVM
+ * doesn't intercept #UD or #GP for SEV guests, but this path is still
+ * theoretically reachable, e.g. via unaccelerated fault-like AVIC
+ * access, and needs to be handled by KVM to avoid putting the guest
+ * into an infinite loop. Injecting #UD is somewhat arbitrary, but
+ * it's the least awful option given the lack of insight into the guest.
+ *
+ * If KVM is trying to skip an instruction, simply resume the guest.
+ * If a #NPF occurs while the guest is vectoring an INT3/INTO, then KVM
+ * will attempt to re-inject the INT3/INTO and skip the instruction.
+ * In that scenario, retrying the INT3/INTO and hoping the guest will
+ * make forward progress is the only option that has a chance of
+ * success (and in practice it will work the vast majority of the time).
+ */
+ if (unlikely(!insn)) {
+ if (!(emul_type & EMULTYPE_SKIP))
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return false;
+ }
+
+ /*
+ * Emulate for SEV guests if the insn buffer is not empty. The buffer
+ * will be empty if the DecodeAssist microcode cannot fetch bytes for
+ * the faulting instruction because the code fetch itself faulted, e.g.
+ * the guest attempted to fetch from emulated MMIO or a guest page
+ * table used to translate CS:RIP resides in emulated MMIO.
+ */
+ if (likely(insn_len))
+ return true;
+
+ /*
+ * Detect and work around Erratum 1096 (Fam 17h, models 00h-0Fh).
+ *
+ * Erratum:
+ * When the CPU raises #NPF on a guest data access and vCPU CR4.SMAP=1,
+ * it is possible that the CPU microcode implementing DecodeAssist will
+ * fail to read guest memory at CS:RIP and vmcb.GuestIntrBytes will
+ * incorrectly be '0'. This happens because microcode reads CS:RIP
+ * using a _data_ load uop with CPL=0 privileges. If the load hits a
+ * SMAP #PF, ucode gives up and does not fill the instruction bytes
+ * buffer.
+ *
+ * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
+ * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
+ * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
+ * GuestIntrBytes field of the VMCB.
+ *
+ * This does _not_ mean that the erratum has been encountered, as the
+ * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
+ * #PF, e.g. if the guest attempted to execute from emulated MMIO and
+ * encountered a reserved/not-present #PF.
+ *
+ * To hit the erratum, the following conditions must be true:
+ * 1. CR4.SMAP=1 (obviously).
+ * 2. CR4.SMEP=0 || CPL=3. If SMEP=1 and CPL<3, the erratum cannot
+ * have been hit as the guest would have encountered a SMEP
+ * violation #PF, not a #NPF.
+ * 3. The #NPF is not due to a code fetch, in which case failure to
+ * retrieve the instruction bytes is legitimate (see above).
+ *
+ * In addition, don't apply the erratum workaround if the #NPF occurred
+ * while translating guest page tables (see below).
+ */
+ error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
+ if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
+ goto resume_guest;
+
+ smep = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMEP);
+ smap = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMAP);
+ is_user = svm_get_cpl(vcpu) == 3;
+ if (smap && (!smep || is_user)) {
+ pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n");
+
+ /*
+ * If the fault occurred in userspace, arbitrarily inject #GP
+ * to avoid killing the guest and to hopefully avoid confusing
+ * the guest kernel too much, e.g. injecting #PF would not be
+ * coherent with respect to the guest's page tables. Request
+ * triple fault if the fault occurred in the kernel as there's
+ * no fault that KVM can inject without confusing the guest.
+ * In practice, the triple fault is moot as no sane SEV kernel
+ * will execute from user memory while also running with SMAP=1.
+ */
+ if (is_user)
+ kvm_inject_gp(vcpu, 0);
+ else
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ }
+
+resume_guest:
+ /*
+ * If the erratum was not hit, simply resume the guest and let it fault
+ * again. While awful, e.g. the vCPU may get stuck in an infinite loop
+ * if the fault is at CPL=0, it's the lesser of all evils. Exiting to
+ * userspace will kill the guest, and letting the emulator read garbage
+ * will yield random behavior and potentially corrupt the guest.
+ *
+ * Simply resuming the guest is technically not a violation of the SEV
+ * architecture. AMD's APM states that all code fetches and page table
+ * accesses for SEV guests are encrypted, regardless of the C-Bit. The
+ * APM also states that encrypted accesses to MMIO are "ignored", but
+ * doesn't explicitly define "ignored", i.e. doing nothing and letting
+ * the guest spin is technically "ignoring" the access.
+ */
+ return false;
+}
+
+static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return !gif_set(svm);
+}
+
+static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
+{
+ if (!sev_es_guest(vcpu->kvm))
+ return kvm_vcpu_deliver_sipi_vector(vcpu, vector);
+
+ sev_vcpu_deliver_sipi_vector(vcpu, vector);
+}
+
+static void svm_vm_destroy(struct kvm *kvm)
+{
+ avic_vm_destroy(kvm);
+ sev_vm_destroy(kvm);
+}
+
+static int svm_vm_init(struct kvm *kvm)
+{
+ if (!pause_filter_count || !pause_filter_thresh)
+ kvm->arch.pause_in_guest = true;
+
+ if (enable_apicv) {
+ int ret = avic_vm_init(kvm);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static struct kvm_x86_ops svm_x86_ops __initdata = {
+ .name = KBUILD_MODNAME,
+
+ .check_processor_compatibility = svm_check_processor_compat,
+
+ .hardware_unsetup = svm_hardware_unsetup,
+ .hardware_enable = svm_hardware_enable,
+ .hardware_disable = svm_hardware_disable,
+ .has_emulated_msr = svm_has_emulated_msr,
+
+ .vcpu_create = svm_vcpu_create,
+ .vcpu_free = svm_vcpu_free,
+ .vcpu_reset = svm_vcpu_reset,
+
+ .vm_size = sizeof(struct kvm_svm),
+ .vm_init = svm_vm_init,
+ .vm_destroy = svm_vm_destroy,
+
+ .prepare_switch_to_guest = svm_prepare_switch_to_guest,
+ .vcpu_load = svm_vcpu_load,
+ .vcpu_put = svm_vcpu_put,
+ .vcpu_blocking = avic_vcpu_blocking,
+ .vcpu_unblocking = avic_vcpu_unblocking,
+
+ .update_exception_bitmap = svm_update_exception_bitmap,
+ .get_msr_feature = svm_get_msr_feature,
+ .get_msr = svm_get_msr,
+ .set_msr = svm_set_msr,
+ .get_segment_base = svm_get_segment_base,
+ .get_segment = svm_get_segment,
+ .set_segment = svm_set_segment,
+ .get_cpl = svm_get_cpl,
+ .get_cs_db_l_bits = svm_get_cs_db_l_bits,
+ .is_valid_cr0 = svm_is_valid_cr0,
+ .set_cr0 = svm_set_cr0,
+ .post_set_cr3 = sev_post_set_cr3,
+ .is_valid_cr4 = svm_is_valid_cr4,
+ .set_cr4 = svm_set_cr4,
+ .set_efer = svm_set_efer,
+ .get_idt = svm_get_idt,
+ .set_idt = svm_set_idt,
+ .get_gdt = svm_get_gdt,
+ .set_gdt = svm_set_gdt,
+ .set_dr7 = svm_set_dr7,
+ .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
+ .cache_reg = svm_cache_reg,
+ .get_rflags = svm_get_rflags,
+ .set_rflags = svm_set_rflags,
+ .get_if_flag = svm_get_if_flag,
+
+ .flush_tlb_all = svm_flush_tlb_all,
+ .flush_tlb_current = svm_flush_tlb_current,
+ .flush_tlb_gva = svm_flush_tlb_gva,
+ .flush_tlb_guest = svm_flush_tlb_asid,
+
+ .vcpu_pre_run = svm_vcpu_pre_run,
+ .vcpu_run = svm_vcpu_run,
+ .handle_exit = svm_handle_exit,
+ .skip_emulated_instruction = svm_skip_emulated_instruction,
+ .update_emulated_instruction = NULL,
+ .set_interrupt_shadow = svm_set_interrupt_shadow,
+ .get_interrupt_shadow = svm_get_interrupt_shadow,
+ .patch_hypercall = svm_patch_hypercall,
+ .inject_irq = svm_inject_irq,
+ .inject_nmi = svm_inject_nmi,
+ .is_vnmi_pending = svm_is_vnmi_pending,
+ .set_vnmi_pending = svm_set_vnmi_pending,
+ .inject_exception = svm_inject_exception,
+ .cancel_injection = svm_cancel_injection,
+ .interrupt_allowed = svm_interrupt_allowed,
+ .nmi_allowed = svm_nmi_allowed,
+ .get_nmi_mask = svm_get_nmi_mask,
+ .set_nmi_mask = svm_set_nmi_mask,
+ .enable_nmi_window = svm_enable_nmi_window,
+ .enable_irq_window = svm_enable_irq_window,
+ .update_cr8_intercept = svm_update_cr8_intercept,
+ .set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
+ .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
+ .apicv_post_state_restore = avic_apicv_post_state_restore,
+ .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS,
+
+ .get_exit_info = svm_get_exit_info,
+
+ .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
+
+ .has_wbinvd_exit = svm_has_wbinvd_exit,
+
+ .get_l2_tsc_offset = svm_get_l2_tsc_offset,
+ .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
+ .write_tsc_offset = svm_write_tsc_offset,
+ .write_tsc_multiplier = svm_write_tsc_multiplier,
+
+ .load_mmu_pgd = svm_load_mmu_pgd,
+
+ .check_intercept = svm_check_intercept,
+ .handle_exit_irqoff = svm_handle_exit_irqoff,
+
+ .request_immediate_exit = __kvm_request_immediate_exit,
+
+ .sched_in = svm_sched_in,
+
+ .nested_ops = &svm_nested_ops,
+
+ .deliver_interrupt = svm_deliver_interrupt,
+ .pi_update_irte = avic_pi_update_irte,
+ .setup_mce = svm_setup_mce,
+
+#ifdef CONFIG_KVM_SMM
+ .smi_allowed = svm_smi_allowed,
+ .enter_smm = svm_enter_smm,
+ .leave_smm = svm_leave_smm,
+ .enable_smi_window = svm_enable_smi_window,
+#endif
+
+ .mem_enc_ioctl = sev_mem_enc_ioctl,
+ .mem_enc_register_region = sev_mem_enc_register_region,
+ .mem_enc_unregister_region = sev_mem_enc_unregister_region,
+ .guest_memory_reclaimed = sev_guest_memory_reclaimed,
+
+ .vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
+ .vm_move_enc_context_from = sev_vm_move_enc_context_from,
+
+ .can_emulate_instruction = svm_can_emulate_instruction,
+
+ .apic_init_signal_blocked = svm_apic_init_signal_blocked,
+
+ .msr_filter_changed = svm_msr_filter_changed,
+ .complete_emulated_msr = svm_complete_emulated_msr,
+
+ .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
+ .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
+};
+
+/*
+ * The default MMIO mask is a single bit (excluding the present bit),
+ * which could conflict with the memory encryption bit. Check for
+ * memory encryption support and override the default MMIO mask if
+ * memory encryption is enabled.
+ */
+static __init void svm_adjust_mmio_mask(void)
+{
+ unsigned int enc_bit, mask_bit;
+ u64 msr, mask;
+
+ /* If there is no memory encryption support, use existing mask */
+ if (cpuid_eax(0x80000000) < 0x8000001f)
+ return;
+
+ /* If memory encryption is not enabled, use existing mask */
+ rdmsrl(MSR_AMD64_SYSCFG, msr);
+ if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
+ return;
+
+ enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
+ mask_bit = boot_cpu_data.x86_phys_bits;
+
+ /* Increment the mask bit if it is the same as the encryption bit */
+ if (enc_bit == mask_bit)
+ mask_bit++;
+
+ /*
+ * If the mask bit location is below 52, then some bits above the
+ * physical addressing limit will always be reserved, so use the
+ * rsvd_bits() function to generate the mask. This mask, along with
+ * the present bit, will be used to generate a page fault with
+ * PFERR.RSVD = 1.
+ *
+ * If the mask bit location is 52 (or above), then clear the mask.
+ */
+ mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
+
+ kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
+}
+
+static __init void svm_set_cpu_caps(void)
+{
+ kvm_set_cpu_caps();
+
+ kvm_caps.supported_perf_cap = 0;
+ kvm_caps.supported_xss = 0;
+
+ /* CPUID 0x80000001 and 0x8000000A (SVM features) */
+ if (nested) {
+ kvm_cpu_cap_set(X86_FEATURE_SVM);
+ kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
+
+ if (nrips)
+ kvm_cpu_cap_set(X86_FEATURE_NRIPS);
+
+ if (npt_enabled)
+ kvm_cpu_cap_set(X86_FEATURE_NPT);
+
+ if (tsc_scaling)
+ kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
+
+ if (vls)
+ kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
+ if (lbrv)
+ kvm_cpu_cap_set(X86_FEATURE_LBRV);
+
+ if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
+ kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
+
+ if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
+ kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
+
+ if (vgif)
+ kvm_cpu_cap_set(X86_FEATURE_VGIF);
+
+ if (vnmi)
+ kvm_cpu_cap_set(X86_FEATURE_VNMI);
+
+ /* Nested VM can receive #VMEXIT instead of triggering #GP */
+ kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
+ }
+
+ /* CPUID 0x80000008 */
+ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
+ boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
+
+ if (enable_pmu) {
+ /*
+ * Enumerate support for PERFCTR_CORE if and only if KVM has
+ * access to enough counters to virtualize "core" support,
+ * otherwise limit vPMU support to the legacy number of counters.
+ */
+ if (kvm_pmu_cap.num_counters_gp < AMD64_NUM_COUNTERS_CORE)
+ kvm_pmu_cap.num_counters_gp = min(AMD64_NUM_COUNTERS,
+ kvm_pmu_cap.num_counters_gp);
+ else
+ kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE);
+
+ if (kvm_pmu_cap.version != 2 ||
+ !kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE))
+ kvm_cpu_cap_clear(X86_FEATURE_PERFMON_V2);
+ }
+
+ /* CPUID 0x8000001F (SME/SEV features) */
+ sev_set_cpu_caps();
+}
+
+static __init int svm_hardware_setup(void)
+{
+ int cpu;
+ struct page *iopm_pages;
+ void *iopm_va;
+ int r;
+ unsigned int order = get_order(IOPM_SIZE);
+
+ /*
+ * NX is required for shadow paging and for NPT if the NX huge pages
+ * mitigation is enabled.
+ */
+ if (!boot_cpu_has(X86_FEATURE_NX)) {
+ pr_err_ratelimited("NX (Execute Disable) not supported\n");
+ return -EOPNOTSUPP;
+ }
+ kvm_enable_efer_bits(EFER_NX);
+
+ iopm_pages = alloc_pages(GFP_KERNEL, order);
+
+ if (!iopm_pages)
+ return -ENOMEM;
+
+ iopm_va = page_address(iopm_pages);
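+ /* Set all bits so that every I/O port access is intercepted by default. */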
+ memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
+ iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
+
+ init_msrpm_offsets();
+
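+ /* MPX was never implemented on AMD CPUs; don't expose its XSAVE features. */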
+ kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
+ XFEATURE_MASK_BNDCSR);
+
+ if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
+ kvm_enable_efer_bits(EFER_FFXSR);
+
+ if (tsc_scaling) {
+ if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ tsc_scaling = false;
+ } else {
+ pr_info("TSC scaling supported\n");
+ kvm_caps.has_tsc_control = true;
+ }
+ }
+ kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
+ kvm_caps.tsc_scaling_ratio_frac_bits = 32;
+
+ tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
+
+ if (boot_cpu_has(X86_FEATURE_AUTOIBRS))
+ kvm_enable_efer_bits(EFER_AUTOIBRS);
+
+ /* Check for pause filtering support */
+ if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
+ pause_filter_count = 0;
+ pause_filter_thresh = 0;
+ } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
+ pause_filter_thresh = 0;
+ }
+
+ if (nested) {
+ pr_info("Nested Virtualization enabled\n");
+ kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
+ }
+
+ /*
+ * KVM's MMU doesn't support using 2-level paging for itself, and thus
+ * NPT isn't supported if the host is using 2-level paging since host
+ * CR4 is unchanged on VMRUN.
+ */
+ if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
+ npt_enabled = false;
+
+ if (!boot_cpu_has(X86_FEATURE_NPT))
+ npt_enabled = false;
+
+ /* Force VM NPT level equal to the host's paging level */
+ kvm_configure_mmu(npt_enabled, get_npt_level(),
+ get_npt_level(), PG_LEVEL_1G);
+ pr_info("Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
+
+ /* Setup shadow_me_value and shadow_me_mask */
+ kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
+
+ svm_adjust_mmio_mask();
+
+ nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS);
+
+ /*
+ * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
+ * may be modified by svm_adjust_mmio_mask()), as well as nrips.
+ */
+ sev_hardware_setup();
+
+ svm_hv_hardware_setup();
+
+ for_each_possible_cpu(cpu) {
+ r = svm_cpu_init(cpu);
+ if (r)
+ goto err;
+ }
+
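+ /* Enable AVIC, and thus APICv, only if the module param and hardware both allow it. */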
+ enable_apicv = avic = avic && avic_hardware_setup();
+
+ if (!enable_apicv) {
+ svm_x86_ops.vcpu_blocking = NULL;
+ svm_x86_ops.vcpu_unblocking = NULL;
+ svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
+ } else if (!x2avic_enabled) {
+ svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
+ }
+
+ if (vls) {
+ if (!npt_enabled ||
+ !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
+ !IS_ENABLED(CONFIG_X86_64)) {
+ vls = false;
+ } else {
+ pr_info("Virtual VMLOAD VMSAVE supported\n");
+ }
+ }
+
+ if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
+ svm_gp_erratum_intercept = false;
+
+ if (vgif) {
+ if (!boot_cpu_has(X86_FEATURE_VGIF))
+ vgif = false;
+ else
+ pr_info("Virtual GIF supported\n");
+ }
+
+ vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_VNMI);
+ if (vnmi)
+ pr_info("Virtual NMI enabled\n");
+
+ if (!vnmi) {
+ svm_x86_ops.is_vnmi_pending = NULL;
+ svm_x86_ops.set_vnmi_pending = NULL;
+ }
+
+ if (lbrv) {
+ if (!boot_cpu_has(X86_FEATURE_LBRV))
+ lbrv = false;
+ else
+ pr_info("LBR virtualization supported\n");
+ }
+
+ if (!enable_pmu)
+ pr_info("PMU virtualization is disabled\n");
+
+ svm_set_cpu_caps();
+
+ /*
+ * It seems that on AMD processors the PTE's accessed bit is
+ * set by the CPU hardware before the NPF vmexit.
+ * This is not expected behaviour and our tests fail because
+ * of it.
+ * A workaround here is to disable support for
+ * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
+ * In this case userspace can know if there is support using
+ * the KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to
+ * handle it.
+ * If future AMD CPU models change the behaviour described above,
+ * this variable can be changed accordingly.
+ */
+ allow_smaller_maxphyaddr = !npt_enabled;
+
+ return 0;
+
+err:
+ svm_hardware_unsetup();
+ return r;
+}
+
+static struct kvm_x86_init_ops svm_init_ops __initdata = {
+ .hardware_setup = svm_hardware_setup,
+
+ .runtime_ops = &svm_x86_ops,
+ .pmu_ops = &amd_pmu_ops,
+};
+
+static void __svm_exit(void)
+{
+ kvm_x86_vendor_exit();
+
+ cpu_emergency_unregister_virt_callback(svm_emergency_disable);
+}
+
+static int __init svm_init(void)
+{
+ int r;
+
+ __unused_size_checks();
+
+ if (!kvm_is_svm_supported())
+ return -EOPNOTSUPP;
+
+ r = kvm_x86_vendor_init(&svm_init_ops);
+ if (r)
+ return r;
+
+ cpu_emergency_register_virt_callback(svm_emergency_disable);
+
+ /*
+ * Common KVM initialization _must_ come last, after this, /dev/kvm is
+ * exposed to userspace!
+ */
+ r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm),
+ THIS_MODULE);
+ if (r)
+ goto err_kvm_init;
+
+ return 0;
+
+err_kvm_init:
+ __svm_exit();
+ return r;
+}
+
+static void __exit svm_exit(void)
+{
+ kvm_exit();
+ __svm_exit();
+}
+
+module_init(svm_init)
+module_exit(svm_exit)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
new file mode 100644
index 000000000..be67ab7fd
--- /dev/null
+++ b/arch/x86/kvm/svm/svm.h
@@ -0,0 +1,725 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ */
+
+#ifndef __SVM_SVM_H
+#define __SVM_SVM_H
+
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+#include <linux/bits.h>
+
+#include <asm/svm.h>
+#include <asm/sev-common.h>
+
+#include "cpuid.h"
+#include "kvm_cache_regs.h"
+
+#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
+
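+/* Per the APM, the IOPM is 12 KiB (3 pages) and the MSRPM is 8 KiB (2 pages). */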
+#define IOPM_SIZE (PAGE_SIZE * 3)
+#define MSRPM_SIZE (PAGE_SIZE * 2)
+
+#define MAX_DIRECT_ACCESS_MSRS 46
+#define MSRPM_OFFSETS 32
+extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
+extern bool npt_enabled;
+extern int nrips;
+extern int vgif;
+extern bool intercept_smi;
+extern bool x2avic_enabled;
+extern bool vnmi;
+
+/*
+ * Clean bits in VMCB.
+ * VMCB_ALL_CLEAN_MASK might also need to
+ * be updated if this enum is modified.
+ */
+enum {
+ VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
+ pause filter count */
+ VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */
+ VMCB_ASID, /* ASID */
+ VMCB_INTR, /* int_ctl, int_vector */
+ VMCB_NPT, /* npt_en, nCR3, gPAT */
+ VMCB_CR, /* CR0, CR3, CR4, EFER */
+ VMCB_DR, /* DR6, DR7 */
+ VMCB_DT, /* GDT, IDT */
+ VMCB_SEG, /* CS, DS, SS, ES, CPL */
+ VMCB_CR2, /* CR2 only */
+ VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
+ VMCB_AVIC, /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE,
+ * AVIC PHYSICAL_TABLE pointer,
+ * AVIC LOGICAL_TABLE pointer
+ */
+ VMCB_SW = 31, /* Reserved for hypervisor/software use */
+};
+
+#define VMCB_ALL_CLEAN_MASK ( \
+ (1U << VMCB_INTERCEPTS) | (1U << VMCB_PERM_MAP) | \
+ (1U << VMCB_ASID) | (1U << VMCB_INTR) | \
+ (1U << VMCB_NPT) | (1U << VMCB_CR) | (1U << VMCB_DR) | \
+ (1U << VMCB_DT) | (1U << VMCB_SEG) | (1U << VMCB_CR2) | \
+ (1U << VMCB_LBR) | (1U << VMCB_AVIC) | \
+ (1U << VMCB_SW))
+
+/* TPR and CR2 are always written before VMRUN */
+#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
+
+struct kvm_sev_info {
+ bool active; /* SEV enabled guest */
+ bool es_active; /* SEV-ES enabled guest */
+ unsigned int asid; /* ASID used for this guest */
+ unsigned int handle; /* SEV firmware handle */
+ int fd; /* SEV device fd */
+ unsigned long pages_locked; /* Number of pages locked */
+ struct list_head regions_list; /* List of registered regions */
+ u64 ap_jump_table; /* SEV-ES AP Jump Table address */
+ struct kvm *enc_context_owner; /* Owner of copied encryption context */
+ struct list_head mirror_vms; /* List of VMs mirroring */
+ struct list_head mirror_entry; /* Use as a list entry of mirrors */
+ struct misc_cg *misc_cg; /* For misc cgroup accounting */
+ atomic_t migration_in_progress;
+};
+
+struct kvm_svm {
+ struct kvm kvm;
+
+ /* Struct members for AVIC */
+ u32 avic_vm_id;
+ struct page *avic_logical_id_table_page;
+ struct page *avic_physical_id_table_page;
+ struct hlist_node hnode;
+
+ struct kvm_sev_info sev_info;
+};
+
+struct kvm_vcpu;
+
+struct kvm_vmcb_info {
+ struct vmcb *ptr;
+ unsigned long pa;
+ int cpu;
+ uint64_t asid_generation;
+};
+
+struct vmcb_save_area_cached {
+ u64 efer;
+ u64 cr4;
+ u64 cr3;
+ u64 cr0;
+ u64 dr7;
+ u64 dr6;
+};
+
+struct vmcb_ctrl_area_cached {
+ u32 intercepts[MAX_INTERCEPT];
+ u16 pause_filter_thresh;
+ u16 pause_filter_count;
+ u64 iopm_base_pa;
+ u64 msrpm_base_pa;
+ u64 tsc_offset;
+ u32 asid;
+ u8 tlb_ctl;
+ u32 int_ctl;
+ u32 int_vector;
+ u32 int_state;
+ u32 exit_code;
+ u32 exit_code_hi;
+ u64 exit_info_1;
+ u64 exit_info_2;
+ u32 exit_int_info;
+ u32 exit_int_info_err;
+ u64 nested_ctl;
+ u32 event_inj;
+ u32 event_inj_err;
+ u64 next_rip;
+ u64 nested_cr3;
+ u64 virt_ext;
+ u32 clean;
+ union {
+ struct hv_vmcb_enlightenments hv_enlightenments;
+ u8 reserved_sw[32];
+ };
+};
+
+struct svm_nested_state {
+ struct kvm_vmcb_info vmcb02;
+ u64 hsave_msr;
+ u64 vm_cr_msr;
+ u64 vmcb12_gpa;
+ u64 last_vmcb12_gpa;
+
+ /* These are the merged vectors */
+ u32 *msrpm;
+
+ /*
+ * A VMRUN has started but has not yet been performed, so
+ * we cannot inject a nested vmexit yet.
+ */
+ bool nested_run_pending;
+
+ /* cache for control fields of the guest */
+ struct vmcb_ctrl_area_cached ctl;
+
+ /*
+ * Note: this struct is not kept up-to-date while L2 runs; it is only
+ * valid within nested_svm_vmrun.
+ */
+ struct vmcb_save_area_cached save;
+
+ bool initialized;
+
+ /*
+ * Indicates whether MSR bitmap for L2 needs to be rebuilt due to
+ * changes in MSR bitmap for L1 or switching to a different L2. Note,
+ * this flag can only be used reliably in conjunction with a paravirt L1
+ * which informs L0 whether any changes to MSR bitmap for L2 were done
+ * on its side.
+ */
+ bool force_msr_bitmap_recalc;
+};
+
+struct vcpu_sev_es_state {
+ /* SEV-ES support */
+ struct sev_es_save_area *vmsa;
+ struct ghcb *ghcb;
+ u8 valid_bitmap[16];
+ struct kvm_host_map ghcb_map;
+ bool received_first_sipi;
+
+ /* SEV-ES scratch area support */
+ u64 sw_scratch;
+ void *ghcb_sa;
+ u32 ghcb_sa_len;
+ bool ghcb_sa_sync;
+ bool ghcb_sa_free;
+};
+
+struct vcpu_svm {
+ struct kvm_vcpu vcpu;
+ /* vmcb always points at current_vmcb->ptr, it's purely a shorthand. */
+ struct vmcb *vmcb;
+ struct kvm_vmcb_info vmcb01;
+ struct kvm_vmcb_info *current_vmcb;
+ u32 asid;
+ u32 sysenter_esp_hi;
+ u32 sysenter_eip_hi;
+ uint64_t tsc_aux;
+
+ u64 msr_decfg;
+
+ u64 next_rip;
+
+ u64 spec_ctrl;
+
+ u64 tsc_ratio_msr;
+ /*
+ * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be
+ * translated into the appropriate L2_CFG bits on the host to
+ * perform speculative control.
+ */
+ u64 virt_spec_ctrl;
+
+ u32 *msrpm;
+
+ ulong nmi_iret_rip;
+
+ struct svm_nested_state nested;
+
+ /* NMI mask value, used when vNMI is not enabled */
+ bool nmi_masked;
+
+ /*
+ * True when NMIs are still masked but guest IRET was just intercepted
+ * and KVM is waiting for RIP to change, which will signal that the
+ * intercepted IRET was retired and thus NMI can be unmasked.
+ */
+ bool awaiting_iret_completion;
+
+ /*
+ * Set when KVM is awaiting IRET completion and needs to inject NMIs as
+ * soon as the IRET completes (e.g. NMI is pending injection). KVM
+ * temporarily steals RFLAGS.TF to single-step the guest in this case
+ * in order to regain control as soon as the NMI-blocking condition
+ * goes away.
+ */
+ bool nmi_singlestep;
+ u64 nmi_singlestep_guest_rflags;
+
+ bool nmi_l1_to_l2;
+
+ unsigned long soft_int_csbase;
+ unsigned long soft_int_old_rip;
+ unsigned long soft_int_next_rip;
+ bool soft_int_injected;
+
+ u32 ldr_reg;
+ u32 dfr_reg;
+ struct page *avic_backing_page;
+ u64 *avic_physical_id_cache;
+
+ /*
+ * Per-vcpu list of struct amd_svm_iommu_ir:
+ * This is used mainly to store interrupt remapping information used
+ * when updating the vCPU affinity. This avoids the need to scan for
+ * the IRTE and try to match the ga_tag in the IOMMU driver.
+ */
+ struct list_head ir_list;
+ spinlock_t ir_list_lock;
+
+ /* Save desired MSR intercept (read: pass-through) state */
+ struct {
+ DECLARE_BITMAP(read, MAX_DIRECT_ACCESS_MSRS);
+ DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS);
+ } shadow_msr_intercept;
+
+ struct vcpu_sev_es_state sev_es;
+
+ bool guest_state_loaded;
+
+ bool x2avic_msrs_intercepted;
+
+ /* Guest GIF value, used when vGIF is not enabled */
+ bool guest_gif;
+};
+
+struct svm_cpu_data {
+ u64 asid_generation;
+ u32 max_asid;
+ u32 next_asid;
+ u32 min_asid;
+
+ struct page *save_area;
+ unsigned long save_area_pa;
+
+ struct vmcb *current_vmcb;
+
+ /* index = sev_asid, value = vmcb pointer */
+ struct vmcb **sev_vmcbs;
+};
+
+DECLARE_PER_CPU(struct svm_cpu_data, svm_data);
+
+void recalc_intercepts(struct vcpu_svm *svm);
+
+static __always_inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
+{
+ return container_of(kvm, struct kvm_svm, kvm);
+}
+
+static __always_inline bool sev_guest(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_AMD_SEV
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return sev->active;
+#else
+ return false;
+#endif
+}
+
+static __always_inline bool sev_es_guest(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_AMD_SEV
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return sev->es_active && !WARN_ON_ONCE(!sev->active);
+#else
+ return false;
+#endif
+}
+
+static inline void vmcb_mark_all_dirty(struct vmcb *vmcb)
+{
+ vmcb->control.clean = 0;
+}
+
+static inline void vmcb_mark_all_clean(struct vmcb *vmcb)
+{
+ vmcb->control.clean = VMCB_ALL_CLEAN_MASK
+ & ~VMCB_ALWAYS_DIRTY_MASK;
+}
+
+static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit)
+{
+ vmcb->control.clean &= ~(1 << bit);
+}
+
+static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit)
+{
+ return !test_bit(bit, (unsigned long *)&vmcb->control.clean);
+}
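+
+/*
+ * A minimal usage sketch for the helpers above (assuming a valid vmcb
+ * pointer): after software updates a cached VMCB field, the matching clean
+ * bit must be cleared so hardware reloads that state on the next VMRUN:
+ *
+ *	vmcb->control.asid = new_asid;
+ *	vmcb_mark_dirty(vmcb, VMCB_ASID);
+ */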
+
+static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
+{
+ return container_of(vcpu, struct vcpu_svm, vcpu);
+}
+
+/*
+ * Only the PDPTRs are loaded on demand into the shadow MMU. All other
+ * fields are synchronized on VM-Exit, because accessing the VMCB is cheap.
+ *
+ * CR3 might be out of date in the VMCB but it is not marked dirty; instead,
+ * KVM_REQ_LOAD_MMU_PGD is always requested when the cached vcpu->arch.cr3
+ * is changed. svm_load_mmu_pgd() then syncs the new CR3 value into the VMCB.
+ */
+#define SVM_REGS_LAZY_LOAD_SET (1 << VCPU_EXREG_PDPTR)
+
+static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit)
+{
+ WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+ __set_bit(bit, (unsigned long *)&control->intercepts);
+}
+
+static inline void vmcb_clr_intercept(struct vmcb_control_area *control, u32 bit)
+{
+ WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+ __clear_bit(bit, (unsigned long *)&control->intercepts);
+}
+
+static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit)
+{
+ WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+ return test_bit(bit, (unsigned long *)&control->intercepts);
+}
+
+static inline bool vmcb12_is_intercept(struct vmcb_ctrl_area_cached *control, u32 bit)
+{
+ WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+ return test_bit(bit, (unsigned long *)&control->intercepts);
+}
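+
+/*
+ * The helpers above treat ->intercepts[] as one flat bitmap of
+ * 32 * MAX_INTERCEPT bits, so any intercept enum from asm/svm.h indexes it
+ * directly, e.g. (illustrative):
+ *
+ *	vmcb_set_intercept(&vmcb->control, INTERCEPT_VMRUN);
+ *	if (vmcb_is_intercept(&vmcb->control, INTERCEPT_VMRUN))
+ *		...
+ */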
+
+static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ WARN_ON_ONCE(bit >= 32);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ WARN_ON_ONCE(bit >= 32);
+ vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb_set_intercept(&vmcb->control, bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb_clr_intercept(&vmcb->control, bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
+{
+ return vmcb_is_intercept(&svm->vmcb->control, bit);
+}
+
+static inline bool nested_vgif_enabled(struct vcpu_svm *svm)
+{
+ return guest_can_use(&svm->vcpu, X86_FEATURE_VGIF) &&
+ (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK);
+}
+
+static inline struct vmcb *get_vgif_vmcb(struct vcpu_svm *svm)
+{
+ if (!vgif)
+ return NULL;
+
+ if (is_guest_mode(&svm->vcpu) && !nested_vgif_enabled(svm))
+ return svm->nested.vmcb02.ptr;
+ else
+ return svm->vmcb01.ptr;
+}
+
+static inline void enable_gif(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = get_vgif_vmcb(svm);
+
+ if (vmcb)
+ vmcb->control.int_ctl |= V_GIF_MASK;
+ else
+ svm->guest_gif = true;
+}
+
+static inline void disable_gif(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = get_vgif_vmcb(svm);
+
+ if (vmcb)
+ vmcb->control.int_ctl &= ~V_GIF_MASK;
+ else
+ svm->guest_gif = false;
+}
+
+static inline bool gif_set(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = get_vgif_vmcb(svm);
+
+ if (vmcb)
+ return !!(vmcb->control.int_ctl & V_GIF_MASK);
+ else
+ return svm->guest_gif;
+}
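+
+/*
+ * A minimal usage sketch: STGI/CLGI emulation flips the virtual GIF via
+ * enable_gif()/disable_gif(), transparently falling back to svm->guest_gif
+ * when vGIF is unavailable, so injection paths only need:
+ *
+ *	if (gif_set(svm))
+ *		... interrupts/NMIs may be taken ...
+ */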
+
+static inline bool nested_npt_enabled(struct vcpu_svm *svm)
+{
+ return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
+}
+
+static inline bool nested_vnmi_enabled(struct vcpu_svm *svm)
+{
+ return guest_can_use(&svm->vcpu, X86_FEATURE_VNMI) &&
+ (svm->nested.ctl.int_ctl & V_NMI_ENABLE_MASK);
+}
+
+static inline bool is_x2apic_msrpm_offset(u32 offset)
+{
+ /* 4 msrs per u8, and 4 u8 in u32 */
+ u32 msr = offset * 16;
+
+ return (msr >= APIC_BASE_MSR) &&
+ (msr < (APIC_BASE_MSR + 0x100));
+}
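+
+/*
+ * Worked example for the arithmetic above: each MSR consumes two bits in
+ * the permission map (one read bit, one write bit), so a u8 covers 4 MSRs
+ * and a u32-granular offset covers 16.  The first x2APIC MSR, APIC_BASE_MSR
+ * (0x800), thus maps to offset 0x800 / 16 = 0x80, for which
+ * is_x2apic_msrpm_offset() returns true.
+ */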
+
+static inline struct vmcb *get_vnmi_vmcb_l1(struct vcpu_svm *svm)
+{
+ if (!vnmi)
+ return NULL;
+
+ if (is_guest_mode(&svm->vcpu))
+ return NULL;
+ else
+ return svm->vmcb01.ptr;
+}
+
+static inline bool is_vnmi_enabled(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = get_vnmi_vmcb_l1(svm);
+
+ if (vmcb)
+ return !!(vmcb->control.int_ctl & V_NMI_ENABLE_MASK);
+ else
+ return false;
+}
+
+/* svm.c */
+#define MSR_INVALID 0xffffffffU
+
+#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
+
+extern bool dump_invalid_vmcb;
+
+u32 svm_msrpm_offset(u32 msr);
+u32 *svm_vcpu_alloc_msrpm(void);
+void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm);
+void svm_vcpu_free_msrpm(u32 *msrpm);
+void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
+void svm_update_lbrv(struct kvm_vcpu *vcpu);
+
+int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
+void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+void disable_nmi_singlestep(struct vcpu_svm *svm);
+bool svm_smi_blocked(struct kvm_vcpu *vcpu);
+bool svm_nmi_blocked(struct kvm_vcpu *vcpu);
+bool svm_interrupt_blocked(struct kvm_vcpu *vcpu);
+void svm_set_gif(struct vcpu_svm *svm, bool value);
+int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code);
+void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
+ int read, int write);
+void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool disable);
+void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
+ int trig_mode, int vec);
+
+/* nested.c */
+
+#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
+#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
+#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */
+
+static inline bool nested_svm_virtualize_tpr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return is_guest_mode(vcpu) && (svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK);
+}
+
+static inline bool nested_exit_on_smi(struct vcpu_svm *svm)
+{
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SMI);
+}
+
+static inline bool nested_exit_on_intr(struct vcpu_svm *svm)
+{
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INTR);
+}
+
+static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
+{
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
+}
+
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu,
+ u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun);
+void svm_leave_nested(struct kvm_vcpu *vcpu);
+void svm_free_nested(struct vcpu_svm *svm);
+int svm_allocate_nested(struct vcpu_svm *svm);
+int nested_svm_vmrun(struct kvm_vcpu *vcpu);
+void svm_copy_vmrun_state(struct vmcb_save_area *to_save,
+ struct vmcb_save_area *from_save);
+void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
+int nested_svm_vmexit(struct vcpu_svm *svm);
+
+static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code)
+{
+ svm->vmcb->control.exit_code = exit_code;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+ return nested_svm_vmexit(svm);
+}
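+
+/*
+ * Example (illustrative): synthesizing a nested exit that carries no exit
+ * information, e.g. forwarding a pending physical interrupt to L1, reduces
+ * to:
+ *
+ *	return nested_svm_simple_vmexit(svm, SVM_EXIT_INTR);
+ */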
+
+int nested_svm_exit_handled(struct vcpu_svm *svm);
+int nested_svm_check_permissions(struct kvm_vcpu *vcpu);
+int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
+ bool has_error_code, u32 error_code);
+int nested_svm_exit_special(struct vcpu_svm *svm);
+void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu);
+void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu);
+void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
+ struct vmcb_control_area *control);
+void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
+ struct vmcb_save_area *save);
+void nested_sync_control_from_vmcb02(struct vcpu_svm *svm);
+void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm);
+void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb);
+
+extern struct kvm_x86_nested_ops svm_nested_ops;
+
+/* avic.c */
+#define AVIC_REQUIRED_APICV_INHIBITS \
+( \
+ BIT(APICV_INHIBIT_REASON_DISABLE) | \
+ BIT(APICV_INHIBIT_REASON_ABSENT) | \
+ BIT(APICV_INHIBIT_REASON_HYPERV) | \
+ BIT(APICV_INHIBIT_REASON_NESTED) | \
+ BIT(APICV_INHIBIT_REASON_IRQWIN) | \
+ BIT(APICV_INHIBIT_REASON_PIT_REINJ) | \
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
+ BIT(APICV_INHIBIT_REASON_SEV) | \
+ BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) | \
+ BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) \
+)
+
+bool avic_hardware_setup(void);
+int avic_ga_log_notifier(u32 ga_tag);
+void avic_vm_destroy(struct kvm *kvm);
+int avic_vm_init(struct kvm *kvm);
+void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);
+int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
+int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
+int avic_init_vcpu(struct vcpu_svm *svm);
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void avic_vcpu_put(struct kvm_vcpu *vcpu);
+void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
+void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
+int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set);
+void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
+void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
+void avic_ring_doorbell(struct kvm_vcpu *vcpu);
+unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu);
+void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu);
+
+
+/* sev.c */
+
+#define GHCB_VERSION_MAX 1ULL
+#define GHCB_VERSION_MIN 1ULL
+
+
+extern unsigned int max_sev_asid;
+
+void sev_vm_destroy(struct kvm *kvm);
+int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp);
+int sev_mem_enc_register_region(struct kvm *kvm,
+ struct kvm_enc_region *range);
+int sev_mem_enc_unregister_region(struct kvm *kvm,
+ struct kvm_enc_region *range);
+int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd);
+int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
+void sev_guest_memory_reclaimed(struct kvm *kvm);
+
+void pre_sev_run(struct vcpu_svm *svm, int cpu);
+void __init sev_set_cpu_caps(void);
+void __init sev_hardware_setup(void);
+void sev_hardware_unsetup(void);
+int sev_cpu_init(struct svm_cpu_data *sd);
+void sev_init_vmcb(struct vcpu_svm *svm);
+void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm);
+void sev_free_vcpu(struct kvm_vcpu *vcpu);
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
+int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
+void sev_es_vcpu_reset(struct vcpu_svm *svm);
+void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
+void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa);
+void sev_es_unmap_ghcb(struct vcpu_svm *svm);
+
+/* vmenter.S */
+
+void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
+void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
+
+#define DEFINE_KVM_GHCB_ACCESSORS(field) \
+ static __always_inline bool kvm_ghcb_##field##_is_valid(const struct vcpu_svm *svm) \
+ { \
+ return test_bit(GHCB_BITMAP_IDX(field), \
+ (unsigned long *)&svm->sev_es.valid_bitmap); \
+ } \
+ \
+ static __always_inline u64 kvm_ghcb_get_##field##_if_valid(struct vcpu_svm *svm, struct ghcb *ghcb) \
+ { \
+ return kvm_ghcb_##field##_is_valid(svm) ? ghcb->save.field : 0; \
+ } \
+
+DEFINE_KVM_GHCB_ACCESSORS(cpl)
+DEFINE_KVM_GHCB_ACCESSORS(rax)
+DEFINE_KVM_GHCB_ACCESSORS(rcx)
+DEFINE_KVM_GHCB_ACCESSORS(rdx)
+DEFINE_KVM_GHCB_ACCESSORS(rbx)
+DEFINE_KVM_GHCB_ACCESSORS(rsi)
+DEFINE_KVM_GHCB_ACCESSORS(sw_exit_code)
+DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_1)
+DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_2)
+DEFINE_KVM_GHCB_ACCESSORS(sw_scratch)
+DEFINE_KVM_GHCB_ACCESSORS(xcr0)
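+
+/*
+ * A minimal usage sketch for the generated accessors (assuming local svm
+ * and ghcb pointers): VMGEXIT handling copies a register out of the GHCB
+ * only if the guest marked it valid, without open-coding the bitmap test:
+ *
+ *	if (kvm_ghcb_rax_is_valid(svm))
+ *		vcpu->arch.regs[VCPU_REGS_RAX] = ghcb->save.rax;
+ */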
+
+#endif
diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c
new file mode 100644
index 000000000..7af8422d3
--- /dev/null
+++ b/arch/x86/kvm/svm/svm_onhyperv.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM L1 hypervisor optimizations on Hyper-V for SVM.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+
+#include <asm/mshyperv.h>
+
+#include "svm.h"
+#include "svm_ops.h"
+
+#include "hyperv.h"
+#include "kvm_onhyperv.h"
+#include "svm_onhyperv.h"
+
+int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ struct hv_vmcb_enlightenments *hve;
+ struct hv_partition_assist_pg **p_hv_pa_pg =
+ &to_kvm_hv(vcpu->kvm)->hv_pa_pg;
+
+ if (!*p_hv_pa_pg)
+ *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
+
+ if (!*p_hv_pa_pg)
+ return -ENOMEM;
+
+ hve = &to_svm(vcpu)->vmcb->control.hv_enlightenments;
+
+ hve->partition_assist_page = __pa(*p_hv_pa_pg);
+ hve->hv_vm_id = (unsigned long)vcpu->kvm;
+ if (!hve->hv_enlightenments_control.nested_flush_hypercall) {
+ hve->hv_enlightenments_control.nested_flush_hypercall = 1;
+ vmcb_mark_dirty(to_svm(vcpu)->vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS);
+ }
+
+ return 0;
+}
+
diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
new file mode 100644
index 000000000..f85bc617f
--- /dev/null
+++ b/arch/x86/kvm/svm/svm_onhyperv.h
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * KVM L1 hypervisor optimizations on Hyper-V for SVM.
+ */
+
+#ifndef __ARCH_X86_KVM_SVM_ONHYPERV_H__
+#define __ARCH_X86_KVM_SVM_ONHYPERV_H__
+
+#include <asm/mshyperv.h>
+
+#if IS_ENABLED(CONFIG_HYPERV)
+
+#include "kvm_onhyperv.h"
+#include "svm/hyperv.h"
+
+static struct kvm_x86_ops svm_x86_ops;
+
+int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu);
+
+static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu)
+{
+ struct hv_vmcb_enlightenments *hve = &to_svm(vcpu)->vmcb->control.hv_enlightenments;
+
+ return ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB &&
+ !!hve->hv_enlightenments_control.enlightened_npt_tlb;
+}
+
+static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
+{
+ struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
+
+ BUILD_BUG_ON(sizeof(vmcb->control.hv_enlightenments) !=
+ sizeof(vmcb->control.reserved_sw));
+
+ if (npt_enabled &&
+ ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB)
+ hve->hv_enlightenments_control.enlightened_npt_tlb = 1;
+
+ if (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)
+ hve->hv_enlightenments_control.msr_bitmap = 1;
+}
+
+static inline __init void svm_hv_hardware_setup(void)
+{
+ if (npt_enabled &&
+ ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) {
+ pr_info(KBUILD_MODNAME ": Hyper-V enlightened NPT TLB flush enabled\n");
+ svm_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
+ svm_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
+ }
+
+ if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) {
+ int cpu;
+
+ pr_info(KBUILD_MODNAME ": Hyper-V Direct TLB Flush enabled\n");
+ for_each_online_cpu(cpu) {
+ struct hv_vp_assist_page *vp_ap =
+ hv_get_vp_assist_page(cpu);
+
+ if (!vp_ap)
+ continue;
+
+ vp_ap->nested_control.features.directhypercall = 1;
+ }
+ svm_x86_ops.enable_l2_tlb_flush =
+ svm_hv_enable_l2_tlb_flush;
+ }
+}
+
+static inline void svm_hv_vmcb_dirty_nested_enlightenments(
+ struct kvm_vcpu *vcpu)
+{
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
+ struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
+
+ if (hve->hv_enlightenments_control.msr_bitmap)
+ vmcb_mark_dirty(vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS);
+}
+
+static inline void svm_hv_update_vp_id(struct vmcb *vmcb, struct kvm_vcpu *vcpu)
+{
+ struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
+ u32 vp_index = kvm_hv_get_vpindex(vcpu);
+
+ if (hve->hv_vp_id != vp_index) {
+ hve->hv_vp_id = vp_index;
+ vmcb_mark_dirty(vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS);
+ }
+}
+#else
+
+static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+
+static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
+{
+}
+
+static inline __init void svm_hv_hardware_setup(void)
+{
+}
+
+static inline void svm_hv_vmcb_dirty_nested_enlightenments(
+ struct kvm_vcpu *vcpu)
+{
+}
+
+static inline void svm_hv_update_vp_id(struct vmcb *vmcb,
+ struct kvm_vcpu *vcpu)
+{
+}
+#endif /* CONFIG_HYPERV */
+
+#endif /* __ARCH_X86_KVM_SVM_ONHYPERV_H__ */
diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h
new file mode 100644
index 000000000..36c8af87a
--- /dev/null
+++ b/arch/x86/kvm/svm/svm_ops.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_SVM_OPS_H
+#define __KVM_X86_SVM_OPS_H
+
+#include <linux/compiler_types.h>
+
+#include "x86.h"
+
+#define svm_asm(insn, clobber...) \
+do { \
+ asm_volatile_goto("1: " __stringify(insn) "\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ ::: clobber : fault); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
+
+#define svm_asm1(insn, op1, clobber...) \
+do { \
+ asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ :: op1 : clobber : fault); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
+
+#define svm_asm2(insn, op1, op2, clobber...) \
+do { \
+ asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ :: op1, op2 : clobber : fault); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
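+
+/*
+ * For example, svm_asm(clgi) expands to roughly (illustrative):
+ *
+ *	asm_volatile_goto("1: clgi\n\t"
+ *			  _ASM_EXTABLE(1b, %l[fault])
+ *			  ::: : fault);
+ *	return;
+ * fault:
+ *	kvm_spurious_fault();
+ *
+ * so a faulting instruction (e.g. CLGI with EFER.SVME clear) is routed to
+ * the extable fixup instead of oopsing.
+ */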
+
+static inline void clgi(void)
+{
+ svm_asm(clgi);
+}
+
+static inline void stgi(void)
+{
+ svm_asm(stgi);
+}
+
+static inline void invlpga(unsigned long addr, u32 asid)
+{
+ svm_asm2(invlpga, "c"(asid), "a"(addr));
+}
+
+/*
+ * Despite being a physical address, the portion of rAX that is consumed by
+ * VMSAVE, VMLOAD, etc... is still controlled by the effective address size,
+ * hence 'unsigned long' instead of 'hpa_t'.
+ */
+static __always_inline void vmsave(unsigned long pa)
+{
+ svm_asm1(vmsave, "a" (pa), "memory");
+}
+
+#endif /* __KVM_X86_SVM_OPS_H */
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
new file mode 100644
index 000000000..ef2ebabb0
--- /dev/null
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -0,0 +1,390 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/bitsperlong.h>
+#include <asm/kvm_vcpu_regs.h>
+#include <asm/nospec-branch.h>
+#include "kvm-asm-offsets.h"
+
+#define WORD_SIZE (BITS_PER_LONG / 8)
+
+/* Intentionally omit RAX as it's context switched by hardware */
+#define VCPU_RCX (SVM_vcpu_arch_regs + __VCPU_REGS_RCX * WORD_SIZE)
+#define VCPU_RDX (SVM_vcpu_arch_regs + __VCPU_REGS_RDX * WORD_SIZE)
+#define VCPU_RBX (SVM_vcpu_arch_regs + __VCPU_REGS_RBX * WORD_SIZE)
+/* Intentionally omit RSP as it's context switched by hardware */
+#define VCPU_RBP (SVM_vcpu_arch_regs + __VCPU_REGS_RBP * WORD_SIZE)
+#define VCPU_RSI (SVM_vcpu_arch_regs + __VCPU_REGS_RSI * WORD_SIZE)
+#define VCPU_RDI (SVM_vcpu_arch_regs + __VCPU_REGS_RDI * WORD_SIZE)
+
+#ifdef CONFIG_X86_64
+#define VCPU_R8 (SVM_vcpu_arch_regs + __VCPU_REGS_R8 * WORD_SIZE)
+#define VCPU_R9 (SVM_vcpu_arch_regs + __VCPU_REGS_R9 * WORD_SIZE)
+#define VCPU_R10 (SVM_vcpu_arch_regs + __VCPU_REGS_R10 * WORD_SIZE)
+#define VCPU_R11 (SVM_vcpu_arch_regs + __VCPU_REGS_R11 * WORD_SIZE)
+#define VCPU_R12 (SVM_vcpu_arch_regs + __VCPU_REGS_R12 * WORD_SIZE)
+#define VCPU_R13 (SVM_vcpu_arch_regs + __VCPU_REGS_R13 * WORD_SIZE)
+#define VCPU_R14 (SVM_vcpu_arch_regs + __VCPU_REGS_R14 * WORD_SIZE)
+#define VCPU_R15 (SVM_vcpu_arch_regs + __VCPU_REGS_R15 * WORD_SIZE)
+#endif
+
+#define SVM_vmcb01_pa (SVM_vmcb01 + KVM_VMCB_pa)
+
+.section .noinstr.text, "ax"
+
+.macro RESTORE_GUEST_SPEC_CTRL
+ /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
+ ALTERNATIVE_2 "", \
+ "jmp 800f", X86_FEATURE_MSR_SPEC_CTRL, \
+ "", X86_FEATURE_V_SPEC_CTRL
+801:
+.endm
+.macro RESTORE_GUEST_SPEC_CTRL_BODY
+800:
+ /*
+ * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the
+ * host's, write the MSR. This is kept out-of-line so that the common
+ * case does not have to jump.
+ *
+ * IMPORTANT: To avoid RSB underflow attacks and any other nastiness,
+ * there must not be any returns or indirect branches between this code
+ * and vmentry.
+ */
+ movl SVM_spec_ctrl(%_ASM_DI), %eax
+ cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax
+ je 801b
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+ xor %edx, %edx
+ wrmsr
+ jmp 801b
+.endm
+
+.macro RESTORE_HOST_SPEC_CTRL
+ /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
+ ALTERNATIVE_2 "", \
+ "jmp 900f", X86_FEATURE_MSR_SPEC_CTRL, \
+ "", X86_FEATURE_V_SPEC_CTRL
+901:
+.endm
+.macro RESTORE_HOST_SPEC_CTRL_BODY
+900:
+ /* Same for after vmexit. */
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+
+ /*
+ * Load the value that the guest had written into MSR_IA32_SPEC_CTRL,
+ * if it was not intercepted during guest execution.
+ */
+ cmpb $0, (%_ASM_SP)
+ jnz 998f
+ rdmsr
+ movl %eax, SVM_spec_ctrl(%_ASM_DI)
+998:
+
+ /* Now restore the host value of the MSR if different from the guest's. */
+ movl PER_CPU_VAR(x86_spec_ctrl_current), %eax
+ cmp SVM_spec_ctrl(%_ASM_DI), %eax
+ je 901b
+ xor %edx, %edx
+ wrmsr
+ jmp 901b
+.endm
+
+
+/**
+ * __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode
+ * @svm: struct vcpu_svm *
+ * @spec_ctrl_intercepted: bool
+ */
+SYM_FUNC_START(__svm_vcpu_run)
+ push %_ASM_BP
+#ifdef CONFIG_X86_64
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+#else
+ push %edi
+ push %esi
+#endif
+ push %_ASM_BX
+
+ /*
+ * Save variables needed after vmexit on the stack, in inverse
+ * order compared to when they are needed.
+ */
+
+ /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */
+ push %_ASM_ARG2
+
+ /* Needed to restore access to percpu variables. */
+ __ASM_SIZE(push) PER_CPU_VAR(svm_data + SD_save_area_pa)
+
+ /* Finally save @svm. */
+ push %_ASM_ARG1
+
+.ifnc _ASM_ARG1, _ASM_DI
+ /*
+ * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX
+ * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL.
+ */
+ mov %_ASM_ARG1, %_ASM_DI
+.endif
+
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_GUEST_SPEC_CTRL
+
+ /*
+ * Use a single vmcb (vmcb01 because it's always valid) for
+ * context switching guest state via VMLOAD/VMSAVE, that way
+ * the state doesn't need to be copied between vmcb01 and
+ * vmcb02 when switching vmcbs for nested virtualization.
+ */
+ mov SVM_vmcb01_pa(%_ASM_DI), %_ASM_AX
+1: vmload %_ASM_AX
+2:
+
+ /* Get svm->current_vmcb->pa into RAX. */
+ mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX
+ mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX
+
+ /* Load guest registers. */
+ mov VCPU_RCX(%_ASM_DI), %_ASM_CX
+ mov VCPU_RDX(%_ASM_DI), %_ASM_DX
+ mov VCPU_RBX(%_ASM_DI), %_ASM_BX
+ mov VCPU_RBP(%_ASM_DI), %_ASM_BP
+ mov VCPU_RSI(%_ASM_DI), %_ASM_SI
+#ifdef CONFIG_X86_64
+ mov VCPU_R8 (%_ASM_DI), %r8
+ mov VCPU_R9 (%_ASM_DI), %r9
+ mov VCPU_R10(%_ASM_DI), %r10
+ mov VCPU_R11(%_ASM_DI), %r11
+ mov VCPU_R12(%_ASM_DI), %r12
+ mov VCPU_R13(%_ASM_DI), %r13
+ mov VCPU_R14(%_ASM_DI), %r14
+ mov VCPU_R15(%_ASM_DI), %r15
+#endif
+ mov VCPU_RDI(%_ASM_DI), %_ASM_DI
+
+ /* Enter guest mode */
+ sti
+
+3: vmrun %_ASM_AX
+4:
+ cli
+
+ /* Pop @svm to RAX while it's the only available register. */
+ pop %_ASM_AX
+
+ /* Save all guest registers. */
+ mov %_ASM_CX, VCPU_RCX(%_ASM_AX)
+ mov %_ASM_DX, VCPU_RDX(%_ASM_AX)
+ mov %_ASM_BX, VCPU_RBX(%_ASM_AX)
+ mov %_ASM_BP, VCPU_RBP(%_ASM_AX)
+ mov %_ASM_SI, VCPU_RSI(%_ASM_AX)
+ mov %_ASM_DI, VCPU_RDI(%_ASM_AX)
+#ifdef CONFIG_X86_64
+ mov %r8, VCPU_R8 (%_ASM_AX)
+ mov %r9, VCPU_R9 (%_ASM_AX)
+ mov %r10, VCPU_R10(%_ASM_AX)
+ mov %r11, VCPU_R11(%_ASM_AX)
+ mov %r12, VCPU_R12(%_ASM_AX)
+ mov %r13, VCPU_R13(%_ASM_AX)
+ mov %r14, VCPU_R14(%_ASM_AX)
+ mov %r15, VCPU_R15(%_ASM_AX)
+#endif
+
+ /* @svm can stay in RDI from now on. */
+ mov %_ASM_AX, %_ASM_DI
+
+ mov SVM_vmcb01_pa(%_ASM_DI), %_ASM_AX
+5: vmsave %_ASM_AX
+6:
+
+ /* Restores GSBASE among other things, allowing access to percpu data. */
+ pop %_ASM_AX
+7: vmload %_ASM_AX
+8:
+
+#ifdef CONFIG_RETPOLINE
+ /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
+ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
+#endif
+
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_HOST_SPEC_CTRL
+
+ /*
+ * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
+ * untrained as soon as we exit the VM and are back to the
+ * kernel. This should be done before re-enabling interrupts
+ * because interrupt handlers won't sanitize 'ret' if the return is
+ * from the kernel.
+ */
+ UNTRAIN_RET_VM
+
+ /*
+ * Clear all general purpose registers except RSP and RAX to prevent
+ * speculative use of the guest's values, even those that are reloaded
+ * via the stack. In theory, an L1 cache miss when restoring registers
+ * could lead to speculative execution with the guest's values.
+ * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially
+ * free. RSP and RAX are exempt as they are restored by hardware
+ * during VM-Exit.
+ */
+ xor %ecx, %ecx
+ xor %edx, %edx
+ xor %ebx, %ebx
+ xor %ebp, %ebp
+ xor %esi, %esi
+ xor %edi, %edi
+#ifdef CONFIG_X86_64
+ xor %r8d, %r8d
+ xor %r9d, %r9d
+ xor %r10d, %r10d
+ xor %r11d, %r11d
+ xor %r12d, %r12d
+ xor %r13d, %r13d
+ xor %r14d, %r14d
+ xor %r15d, %r15d
+#endif
+
+ /* "Pop" @spec_ctrl_intercepted. */
+ pop %_ASM_BX
+
+ pop %_ASM_BX
+
+#ifdef CONFIG_X86_64
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+#else
+ pop %esi
+ pop %edi
+#endif
+ pop %_ASM_BP
+ RET
+
+ RESTORE_GUEST_SPEC_CTRL_BODY
+ RESTORE_HOST_SPEC_CTRL_BODY
+
+10: cmpb $0, kvm_rebooting
+ jne 2b
+ ud2
+30: cmpb $0, kvm_rebooting
+ jne 4b
+ ud2
+50: cmpb $0, kvm_rebooting
+ jne 6b
+ ud2
+70: cmpb $0, kvm_rebooting
+ jne 8b
+ ud2
+
+ _ASM_EXTABLE(1b, 10b)
+ _ASM_EXTABLE(3b, 30b)
+ _ASM_EXTABLE(5b, 50b)
+ _ASM_EXTABLE(7b, 70b)
+
+SYM_FUNC_END(__svm_vcpu_run)
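+
+/*
+ * A sketch of the C-side contract (assuming the svm.c caller): this is
+ * invoked from the noinstr vcpu-run path as
+ *
+ *	__svm_vcpu_run(svm, spec_ctrl_intercepted);
+ *
+ * with interrupts disabled; the STI above re-enables them only for VMRUN,
+ * relying on the one-instruction STI shadow, and CLI masks them again
+ * immediately after #VMEXIT.
+ */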
+
+/**
+ * __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode
+ * @svm: struct vcpu_svm *
+ * @spec_ctrl_intercepted: bool
+ */
+SYM_FUNC_START(__svm_sev_es_vcpu_run)
+ push %_ASM_BP
+#ifdef CONFIG_X86_64
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+#else
+ push %edi
+ push %esi
+#endif
+ push %_ASM_BX
+
+ /*
+ * Save variables needed after vmexit on the stack, in inverse
+ * order compared to when they are needed.
+ */
+
+ /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */
+ push %_ASM_ARG2
+
+ /* Save @svm. */
+ push %_ASM_ARG1
+
+.ifnc _ASM_ARG1, _ASM_DI
+ /*
+ * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX
+ * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL.
+ */
+ mov %_ASM_ARG1, %_ASM_DI
+.endif
+
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_GUEST_SPEC_CTRL
+
+ /* Get svm->current_vmcb->pa into RAX. */
+ mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX
+ mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX
+
+ /* Enter guest mode */
+ sti
+
+1: vmrun %_ASM_AX
+
+2: cli
+
+ /* Pop @svm to RDI, guest registers have been saved already. */
+ pop %_ASM_DI
+
+#ifdef CONFIG_RETPOLINE
+ /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
+ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
+#endif
+
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_HOST_SPEC_CTRL
+
+ /*
+ * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
+ * untrained as soon as we exit the VM and are back to the
+ * kernel. This should be done before re-enabling interrupts
+ * because interrupt handlers won't sanitize RET if the return is
+ * from the kernel.
+ */
+ UNTRAIN_RET_VM
+
+ /* "Pop" @spec_ctrl_intercepted. */
+ pop %_ASM_BX
+
+ pop %_ASM_BX
+
+#ifdef CONFIG_X86_64
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+#else
+ pop %esi
+ pop %edi
+#endif
+ pop %_ASM_BP
+ RET
+
+ RESTORE_GUEST_SPEC_CTRL_BODY
+ RESTORE_HOST_SPEC_CTRL_BODY
+
+3: cmpb $0, kvm_rebooting
+ jne 2b
+ ud2
+
+ _ASM_EXTABLE(1b, 3b)
+
+SYM_FUNC_END(__svm_sev_es_vcpu_run)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
new file mode 100644
index 000000000..838433798
--- /dev/null
+++ b/arch/x86/kvm/trace.h
@@ -0,0 +1,1842 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_H
+
+#include <linux/tracepoint.h>
+#include <asm/vmx.h>
+#include <asm/svm.h>
+#include <asm/clocksource.h>
+#include <asm/pvclock-abi.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+
+/*
+ * Tracepoint for guest mode entry.
+ */
+TRACE_EVENT(kvm_entry,
+ TP_PROTO(struct kvm_vcpu *vcpu),
+ TP_ARGS(vcpu),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( unsigned long, rip )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu->vcpu_id;
+ __entry->rip = kvm_rip_read(vcpu);
+ ),
+
+ TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip)
+);
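+
+/*
+ * Each TRACE_EVENT() here generates a trace_<name>() helper, so (for
+ * example) the vcpu-run path emits this event with a single call:
+ *
+ *	trace_kvm_entry(vcpu);
+ */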
+
+/*
+ * Tracepoint for hypercall.
+ */
+TRACE_EVENT(kvm_hypercall,
+ TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3),
+ TP_ARGS(nr, a0, a1, a2, a3),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, nr )
+ __field( unsigned long, a0 )
+ __field( unsigned long, a1 )
+ __field( unsigned long, a2 )
+ __field( unsigned long, a3 )
+ ),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->a0 = a0;
+ __entry->a1 = a1;
+ __entry->a2 = a2;
+ __entry->a3 = a3;
+ ),
+
+ TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx",
+ __entry->nr, __entry->a0, __entry->a1, __entry->a2,
+ __entry->a3)
+);
+
+/*
+ * Tracepoint for Hyper-V hypercall.
+ */
+TRACE_EVENT(kvm_hv_hypercall,
+ TP_PROTO(__u16 code, bool fast, __u16 var_cnt, __u16 rep_cnt,
+ __u16 rep_idx, __u64 ingpa, __u64 outgpa),
+ TP_ARGS(code, fast, var_cnt, rep_cnt, rep_idx, ingpa, outgpa),
+
+ TP_STRUCT__entry(
+ __field( __u16, rep_cnt )
+ __field( __u16, rep_idx )
+ __field( __u64, ingpa )
+ __field( __u64, outgpa )
+ __field( __u16, code )
+ __field( __u16, var_cnt )
+ __field( bool, fast )
+ ),
+
+ TP_fast_assign(
+ __entry->rep_cnt = rep_cnt;
+ __entry->rep_idx = rep_idx;
+ __entry->ingpa = ingpa;
+ __entry->outgpa = outgpa;
+ __entry->code = code;
+ __entry->var_cnt = var_cnt;
+ __entry->fast = fast;
+ ),
+
+ TP_printk("code 0x%x %s var_cnt 0x%x rep_cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
+ __entry->code, __entry->fast ? "fast" : "slow",
+ __entry->var_cnt, __entry->rep_cnt, __entry->rep_idx,
+ __entry->ingpa, __entry->outgpa)
+);
+
+TRACE_EVENT(kvm_hv_hypercall_done,
+ TP_PROTO(u64 result),
+ TP_ARGS(result),
+
+ TP_STRUCT__entry(
+ __field(__u64, result)
+ ),
+
+ TP_fast_assign(
+ __entry->result = result;
+ ),
+
+ TP_printk("result 0x%llx", __entry->result)
+);
+
+/*
+ * Tracepoint for Xen hypercall.
+ */
+TRACE_EVENT(kvm_xen_hypercall,
+ TP_PROTO(u8 cpl, unsigned long nr,
+ unsigned long a0, unsigned long a1, unsigned long a2,
+ unsigned long a3, unsigned long a4, unsigned long a5),
+ TP_ARGS(cpl, nr, a0, a1, a2, a3, a4, a5),
+
+ TP_STRUCT__entry(
+ __field(u8, cpl)
+ __field(unsigned long, nr)
+ __field(unsigned long, a0)
+ __field(unsigned long, a1)
+ __field(unsigned long, a2)
+ __field(unsigned long, a3)
+ __field(unsigned long, a4)
+ __field(unsigned long, a5)
+ ),
+
+ TP_fast_assign(
+ __entry->cpl = cpl;
+ __entry->nr = nr;
+ __entry->a0 = a0;
+ __entry->a1 = a1;
+ __entry->a2 = a2;
+ __entry->a3 = a3;
+ __entry->a4 = a4;
+ __entry->a5 = a5;
+ ),
+
+ TP_printk("cpl %d nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx a4 0x%lx a5 %lx",
+ __entry->cpl, __entry->nr,
+ __entry->a0, __entry->a1, __entry->a2,
+ __entry->a3, __entry->a4, __entry->a5)
+);
+
+
+
+/*
+ * Tracepoint for PIO.
+ */
+
+#define KVM_PIO_IN 0
+#define KVM_PIO_OUT 1
+
+TRACE_EVENT(kvm_pio,
+ TP_PROTO(unsigned int rw, unsigned int port, unsigned int size,
+ unsigned int count, const void *data),
+ TP_ARGS(rw, port, size, count, data),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, port )
+ __field( unsigned int, size )
+ __field( unsigned int, count )
+ __field( unsigned int, val )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->port = port;
+ __entry->size = size;
+ __entry->count = count;
+ if (size == 1)
+ __entry->val = *(unsigned char *)data;
+ else if (size == 2)
+ __entry->val = *(unsigned short *)data;
+ else
+ __entry->val = *(unsigned int *)data;
+ ),
+
+ TP_printk("pio_%s at 0x%x size %d count %d val 0x%x %s",
+ __entry->rw ? "write" : "read",
+ __entry->port, __entry->size, __entry->count, __entry->val,
+ __entry->count > 1 ? "(...)" : "")
+);
+
+/*
+ * Tracepoint for fast mmio.
+ */
+TRACE_EVENT(kvm_fast_mmio,
+ TP_PROTO(u64 gpa),
+ TP_ARGS(gpa),
+
+ TP_STRUCT__entry(
+ __field(u64, gpa)
+ ),
+
+ TP_fast_assign(
+ __entry->gpa = gpa;
+ ),
+
+ TP_printk("fast mmio at gpa 0x%llx", __entry->gpa)
+);
+
+/*
+ * Tracepoint for cpuid.
+ */
+TRACE_EVENT(kvm_cpuid,
+ TP_PROTO(unsigned int function, unsigned int index, unsigned long rax,
+ unsigned long rbx, unsigned long rcx, unsigned long rdx,
+ bool found, bool used_max_basic),
+ TP_ARGS(function, index, rax, rbx, rcx, rdx, found, used_max_basic),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, function )
+ __field( unsigned int, index )
+ __field( unsigned long, rax )
+ __field( unsigned long, rbx )
+ __field( unsigned long, rcx )
+ __field( unsigned long, rdx )
+ __field( bool, found )
+ __field( bool, used_max_basic )
+ ),
+
+ TP_fast_assign(
+ __entry->function = function;
+ __entry->index = index;
+ __entry->rax = rax;
+ __entry->rbx = rbx;
+ __entry->rcx = rcx;
+ __entry->rdx = rdx;
+ __entry->found = found;
+ __entry->used_max_basic = used_max_basic;
+ ),
+
+ TP_printk("func %x idx %x rax %lx rbx %lx rcx %lx rdx %lx, cpuid entry %s%s",
+ __entry->function, __entry->index, __entry->rax,
+ __entry->rbx, __entry->rcx, __entry->rdx,
+ __entry->found ? "found" : "not found",
+ __entry->used_max_basic ? ", used max basic" : "")
+);
+
+#define AREG(x) { APIC_##x, "APIC_" #x }
+
+#define kvm_trace_symbol_apic \
+ AREG(ID), AREG(LVR), AREG(TASKPRI), AREG(ARBPRI), AREG(PROCPRI), \
+ AREG(EOI), AREG(RRR), AREG(LDR), AREG(DFR), AREG(SPIV), AREG(ISR), \
+ AREG(TMR), AREG(IRR), AREG(ESR), AREG(ICR), AREG(ICR2), AREG(LVTT), \
+ AREG(LVTTHMR), AREG(LVTPC), AREG(LVT0), AREG(LVT1), AREG(LVTERR), \
+ AREG(TMICT), AREG(TMCCT), AREG(TDCR), AREG(SELF_IPI), AREG(EFEAT), \
+ AREG(ECTRL)
+/*
+ * Tracepoint for apic access.
+ */
+TRACE_EVENT(kvm_apic,
+ TP_PROTO(unsigned int rw, unsigned int reg, u64 val),
+ TP_ARGS(rw, reg, val),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, reg )
+ __field( u64, val )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->reg = reg;
+ __entry->val = val;
+ ),
+
+ TP_printk("apic_%s %s = 0x%llx",
+ __entry->rw ? "write" : "read",
+ __print_symbolic(__entry->reg, kvm_trace_symbol_apic),
+ __entry->val)
+);
+
+#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val)
+#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val)
+
+#define KVM_ISA_VMX 1
+#define KVM_ISA_SVM 2
+
+#define kvm_print_exit_reason(exit_reason, isa) \
+ (isa == KVM_ISA_VMX) ? \
+ __print_symbolic(exit_reason & 0xffff, VMX_EXIT_REASONS) : \
+ __print_symbolic(exit_reason, SVM_EXIT_REASONS), \
+ (isa == KVM_ISA_VMX && exit_reason & ~0xffff) ? " " : "", \
+ (isa == KVM_ISA_VMX) ? \
+ __print_flags(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : ""
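+
+/*
+ * Worked example for the macro above: on VMX only the low 16 bits encode
+ * the basic exit reason, so e.g. exit_reason 0x80000021 prints the symbolic
+ * name for reason 0x21 (invalid guest state) followed by the decoded high
+ * flags (failed VM-entry), whereas on SVM the full value is matched against
+ * SVM_EXIT_REASONS.
+ */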
+
+#define TRACE_EVENT_KVM_EXIT(name) \
+TRACE_EVENT(name, \
+ TP_PROTO(struct kvm_vcpu *vcpu, u32 isa), \
+ TP_ARGS(vcpu, isa), \
+ \
+ TP_STRUCT__entry( \
+ __field( unsigned int, exit_reason ) \
+ __field( unsigned long, guest_rip ) \
+ __field( u32, isa ) \
+ __field( u64, info1 ) \
+ __field( u64, info2 ) \
+ __field( u32, intr_info ) \
+ __field( u32, error_code ) \
+ __field( unsigned int, vcpu_id ) \
+ ), \
+ \
+ TP_fast_assign( \
+ __entry->guest_rip = kvm_rip_read(vcpu); \
+ __entry->isa = isa; \
+ __entry->vcpu_id = vcpu->vcpu_id; \
+ static_call(kvm_x86_get_exit_info)(vcpu, \
+ &__entry->exit_reason, \
+ &__entry->info1, \
+ &__entry->info2, \
+ &__entry->intr_info, \
+ &__entry->error_code); \
+ ), \
+ \
+ TP_printk("vcpu %u reason %s%s%s rip 0x%lx info1 0x%016llx " \
+ "info2 0x%016llx intr_info 0x%08x error_code 0x%08x", \
+ __entry->vcpu_id, \
+ kvm_print_exit_reason(__entry->exit_reason, __entry->isa), \
+ __entry->guest_rip, __entry->info1, __entry->info2, \
+ __entry->intr_info, __entry->error_code) \
+)
+
+/*
+ * Tracepoint for kvm guest exit:
+ */
+TRACE_EVENT_KVM_EXIT(kvm_exit);
+
+/*
+ * Tracepoint for kvm interrupt injection:
+ */
+TRACE_EVENT(kvm_inj_virq,
+ TP_PROTO(unsigned int vector, bool soft, bool reinjected),
+ TP_ARGS(vector, soft, reinjected),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vector )
+ __field( bool, soft )
+ __field( bool, reinjected )
+ ),
+
+ TP_fast_assign(
+ __entry->vector = vector;
+ __entry->soft = soft;
+ __entry->reinjected = reinjected;
+ ),
+
+ TP_printk("%s 0x%x%s",
+ __entry->soft ? "Soft/INTn" : "IRQ", __entry->vector,
+ __entry->reinjected ? " [reinjected]" : "")
+);
+
+#define EXS(x) { x##_VECTOR, "#" #x }
+
+#define kvm_trace_sym_exc \
+ EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \
+ EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \
+ EXS(MF), EXS(AC), EXS(MC)
+
+/*
+ * Tracepoint for kvm interrupt injection:
+ */
+TRACE_EVENT(kvm_inj_exception,
+ TP_PROTO(unsigned exception, bool has_error, unsigned error_code,
+ bool reinjected),
+ TP_ARGS(exception, has_error, error_code, reinjected),
+
+ TP_STRUCT__entry(
+ __field( u8, exception )
+ __field( u8, has_error )
+ __field( u32, error_code )
+ __field( bool, reinjected )
+ ),
+
+ TP_fast_assign(
+ __entry->exception = exception;
+ __entry->has_error = has_error;
+ __entry->error_code = error_code;
+ __entry->reinjected = reinjected;
+ ),
+
+ TP_printk("%s%s%s%s%s",
+ __print_symbolic(__entry->exception, kvm_trace_sym_exc),
+ !__entry->has_error ? "" : " (",
+ !__entry->has_error ? "" : __print_symbolic(__entry->error_code, { }),
+ !__entry->has_error ? "" : ")",
+ __entry->reinjected ? " [reinjected]" : "")
+);
+
+/*
+ * Tracepoint for page fault.
+ */
+TRACE_EVENT(kvm_page_fault,
+ TP_PROTO(struct kvm_vcpu *vcpu, u64 fault_address, u64 error_code),
+ TP_ARGS(vcpu, fault_address, error_code),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( unsigned long, guest_rip )
+ __field( u64, fault_address )
+ __field( u64, error_code )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu->vcpu_id;
+ __entry->guest_rip = kvm_rip_read(vcpu);
+ __entry->fault_address = fault_address;
+ __entry->error_code = error_code;
+ ),
+
+ TP_printk("vcpu %u rip 0x%lx address 0x%016llx error_code 0x%llx",
+ __entry->vcpu_id, __entry->guest_rip,
+ __entry->fault_address, __entry->error_code)
+);
+
+/*
+ * Tracepoint for guest MSR access.
+ */
+TRACE_EVENT(kvm_msr,
+ TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception),
+ TP_ARGS(write, ecx, data, exception),
+
+ TP_STRUCT__entry(
+ __field( unsigned, write )
+ __field( u32, ecx )
+ __field( u64, data )
+ __field( u8, exception )
+ ),
+
+ TP_fast_assign(
+ __entry->write = write;
+ __entry->ecx = ecx;
+ __entry->data = data;
+ __entry->exception = exception;
+ ),
+
+ TP_printk("msr_%s %x = 0x%llx%s",
+ __entry->write ? "write" : "read",
+ __entry->ecx, __entry->data,
+ __entry->exception ? " (#GP)" : "")
+);
+
+#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false)
+#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false)
+#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true)
+#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true)
+
+/*
+ * Tracepoint for guest CR access.
+ */
+TRACE_EVENT(kvm_cr,
+ TP_PROTO(unsigned int rw, unsigned int cr, unsigned long val),
+ TP_ARGS(rw, cr, val),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, cr )
+ __field( unsigned long, val )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->cr = cr;
+ __entry->val = val;
+ ),
+
+ TP_printk("cr_%s %x = 0x%lx",
+ __entry->rw ? "write" : "read",
+ __entry->cr, __entry->val)
+);
+
+#define trace_kvm_cr_read(cr, val) trace_kvm_cr(0, cr, val)
+#define trace_kvm_cr_write(cr, val) trace_kvm_cr(1, cr, val)
+
+TRACE_EVENT(kvm_pic_set_irq,
+ TP_PROTO(__u8 chip, __u8 pin, __u8 elcr, __u8 imr, bool coalesced),
+ TP_ARGS(chip, pin, elcr, imr, coalesced),
+
+ TP_STRUCT__entry(
+ __field( __u8, chip )
+ __field( __u8, pin )
+ __field( __u8, elcr )
+ __field( __u8, imr )
+ __field( bool, coalesced )
+ ),
+
+ TP_fast_assign(
+ __entry->chip = chip;
+ __entry->pin = pin;
+ __entry->elcr = elcr;
+ __entry->imr = imr;
+ __entry->coalesced = coalesced;
+ ),
+
+ TP_printk("chip %u pin %u (%s%s)%s",
+ __entry->chip, __entry->pin,
+ (__entry->elcr & (1 << __entry->pin)) ? "level":"edge",
+ (__entry->imr & (1 << __entry->pin)) ? "|masked":"",
+ __entry->coalesced ? " (coalesced)" : "")
+);
+
+#define kvm_apic_dst_shorthand \
+ {0x0, "dst"}, \
+ {0x1, "self"}, \
+ {0x2, "all"}, \
+ {0x3, "all-but-self"}
+
+TRACE_EVENT(kvm_apic_ipi,
+ TP_PROTO(__u32 icr_low, __u32 dest_id),
+ TP_ARGS(icr_low, dest_id),
+
+ TP_STRUCT__entry(
+ __field( __u32, icr_low )
+ __field( __u32, dest_id )
+ ),
+
+ TP_fast_assign(
+ __entry->icr_low = icr_low;
+ __entry->dest_id = dest_id;
+ ),
+
+ TP_printk("dst %x vec %u (%s|%s|%s|%s|%s)",
+ __entry->dest_id, (u8)__entry->icr_low,
+ __print_symbolic((__entry->icr_low >> 8 & 0x7),
+ kvm_deliver_mode),
+ (__entry->icr_low & (1<<11)) ? "logical" : "physical",
+ (__entry->icr_low & (1<<14)) ? "assert" : "de-assert",
+ (__entry->icr_low & (1<<15)) ? "level" : "edge",
+ __print_symbolic((__entry->icr_low >> 18 & 0x3),
+ kvm_apic_dst_shorthand))
+);
+
+TRACE_EVENT(kvm_apic_accept_irq,
+ TP_PROTO(__u32 apicid, __u16 dm, __u16 tm, __u8 vec),
+ TP_ARGS(apicid, dm, tm, vec),
+
+ TP_STRUCT__entry(
+ __field( __u32, apicid )
+ __field( __u16, dm )
+ __field( __u16, tm )
+ __field( __u8, vec )
+ ),
+
+ TP_fast_assign(
+ __entry->apicid = apicid;
+ __entry->dm = dm;
+ __entry->tm = tm;
+ __entry->vec = vec;
+ ),
+
+ TP_printk("apicid %x vec %u (%s|%s)",
+ __entry->apicid, __entry->vec,
+ __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode),
+ __entry->tm ? "level" : "edge")
+);
+
+TRACE_EVENT(kvm_eoi,
+ TP_PROTO(struct kvm_lapic *apic, int vector),
+ TP_ARGS(apic, vector),
+
+ TP_STRUCT__entry(
+ __field( __u32, apicid )
+ __field( int, vector )
+ ),
+
+ TP_fast_assign(
+ __entry->apicid = apic->vcpu->vcpu_id;
+ __entry->vector = vector;
+ ),
+
+ TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
+);
+
+TRACE_EVENT(kvm_pv_eoi,
+ TP_PROTO(struct kvm_lapic *apic, int vector),
+ TP_ARGS(apic, vector),
+
+ TP_STRUCT__entry(
+ __field( __u32, apicid )
+ __field( int, vector )
+ ),
+
+ TP_fast_assign(
+ __entry->apicid = apic->vcpu->vcpu_id;
+ __entry->vector = vector;
+ ),
+
+ TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
+);
+
+/*
+ * Tracepoint for nested VMRUN
+ */
+TRACE_EVENT(kvm_nested_vmenter,
+ TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl,
+ __u32 event_inj, bool tdp_enabled, __u64 guest_tdp_pgd,
+ __u64 guest_cr3, __u32 isa),
+ TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, tdp_enabled,
+ guest_tdp_pgd, guest_cr3, isa),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( __u64, vmcb )
+ __field( __u64, nested_rip )
+ __field( __u32, int_ctl )
+ __field( __u32, event_inj )
+ __field( bool, tdp_enabled )
+ __field( __u64, guest_pgd )
+ __field( __u32, isa )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ __entry->vmcb = vmcb;
+ __entry->nested_rip = nested_rip;
+ __entry->int_ctl = int_ctl;
+ __entry->event_inj = event_inj;
+ __entry->tdp_enabled = tdp_enabled;
+ __entry->guest_pgd = tdp_enabled ? guest_tdp_pgd : guest_cr3;
+ __entry->isa = isa;
+ ),
+
+ TP_printk("rip: 0x%016llx %s: 0x%016llx nested_rip: 0x%016llx "
+ "int_ctl: 0x%08x event_inj: 0x%08x nested_%s=%s %s: 0x%016llx",
+ __entry->rip,
+ __entry->isa == KVM_ISA_VMX ? "vmcs" : "vmcb",
+ __entry->vmcb,
+ __entry->nested_rip,
+ __entry->int_ctl,
+ __entry->event_inj,
+ __entry->isa == KVM_ISA_VMX ? "ept" : "npt",
+ __entry->tdp_enabled ? "y" : "n",
+ !__entry->tdp_enabled ? "guest_cr3" :
+ __entry->isa == KVM_ISA_VMX ? "nested_eptp" : "nested_cr3",
+ __entry->guest_pgd)
+);
+
+TRACE_EVENT(kvm_nested_intercepts,
+ TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions,
+ __u32 intercept1, __u32 intercept2, __u32 intercept3),
+ TP_ARGS(cr_read, cr_write, exceptions, intercept1,
+ intercept2, intercept3),
+
+ TP_STRUCT__entry(
+ __field( __u16, cr_read )
+ __field( __u16, cr_write )
+ __field( __u32, exceptions )
+ __field( __u32, intercept1 )
+ __field( __u32, intercept2 )
+ __field( __u32, intercept3 )
+ ),
+
+ TP_fast_assign(
+ __entry->cr_read = cr_read;
+ __entry->cr_write = cr_write;
+ __entry->exceptions = exceptions;
+ __entry->intercept1 = intercept1;
+ __entry->intercept2 = intercept2;
+ __entry->intercept3 = intercept3;
+ ),
+
+ TP_printk("cr_read: %04x cr_write: %04x excp: %08x "
+ "intercepts: %08x %08x %08x",
+ __entry->cr_read, __entry->cr_write, __entry->exceptions,
+ __entry->intercept1, __entry->intercept2, __entry->intercept3)
+);
+/*
+ * Tracepoint for #VMEXIT while nested
+ */
+TRACE_EVENT_KVM_EXIT(kvm_nested_vmexit);
+
+/*
+ * Tracepoint for #VMEXIT reinjected to the guest
+ */
+TRACE_EVENT(kvm_nested_vmexit_inject,
+ TP_PROTO(__u32 exit_code,
+ __u64 exit_info1, __u64 exit_info2,
+ __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
+ TP_ARGS(exit_code, exit_info1, exit_info2,
+ exit_int_info, exit_int_info_err, isa),
+
+ TP_STRUCT__entry(
+ __field( __u32, exit_code )
+ __field( __u64, exit_info1 )
+ __field( __u64, exit_info2 )
+ __field( __u32, exit_int_info )
+ __field( __u32, exit_int_info_err )
+ __field( __u32, isa )
+ ),
+
+ TP_fast_assign(
+ __entry->exit_code = exit_code;
+ __entry->exit_info1 = exit_info1;
+ __entry->exit_info2 = exit_info2;
+ __entry->exit_int_info = exit_int_info;
+ __entry->exit_int_info_err = exit_int_info_err;
+ __entry->isa = isa;
+ ),
+
+ TP_printk("reason: %s%s%s ext_inf1: 0x%016llx "
+ "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
+ kvm_print_exit_reason(__entry->exit_code, __entry->isa),
+ __entry->exit_info1, __entry->exit_info2,
+ __entry->exit_int_info, __entry->exit_int_info_err)
+);
+
+/*
+ * Tracepoint for nested #vmexit because of interrupt pending
+ */
+TRACE_EVENT(kvm_nested_intr_vmexit,
+ TP_PROTO(__u64 rip),
+ TP_ARGS(rip),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ ),
+
+ TP_printk("rip: 0x%016llx", __entry->rip)
+);
+
+/*
+ * Tracepoint for the INVLPGA instruction
+ */
+TRACE_EVENT(kvm_invlpga,
+ TP_PROTO(__u64 rip, int asid, u64 address),
+ TP_ARGS(rip, asid, address),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( int, asid )
+ __field( __u64, address )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ __entry->asid = asid;
+ __entry->address = address;
+ ),
+
+ TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx",
+ __entry->rip, __entry->asid, __entry->address)
+);
+
+/*
+ * Tracepoint for the SKINIT instruction
+ */
+TRACE_EVENT(kvm_skinit,
+ TP_PROTO(__u64 rip, __u32 slb),
+ TP_ARGS(rip, slb),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( __u32, slb )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ __entry->slb = slb;
+ ),
+
+ TP_printk("rip: 0x%016llx slb: 0x%08x",
+ __entry->rip, __entry->slb)
+);
+
+#define KVM_EMUL_INSN_F_CR0_PE (1 << 0)
+#define KVM_EMUL_INSN_F_EFL_VM (1 << 1)
+#define KVM_EMUL_INSN_F_CS_D (1 << 2)
+#define KVM_EMUL_INSN_F_CS_L (1 << 3)
+
+#define kvm_trace_symbol_emul_flags \
+ { 0, "real" }, \
+ { KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_EFL_VM, "vm16" }, \
+ { KVM_EMUL_INSN_F_CR0_PE, "prot16" }, \
+ { KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_D, "prot32" }, \
+ { KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_L, "prot64" }
+
+#define kei_decode_mode(mode) ({ \
+ u8 flags = 0xff; \
+ switch (mode) { \
+ case X86EMUL_MODE_REAL: \
+ flags = 0; \
+ break; \
+ case X86EMUL_MODE_VM86: \
+ flags = KVM_EMUL_INSN_F_EFL_VM; \
+ break; \
+ case X86EMUL_MODE_PROT16: \
+ flags = KVM_EMUL_INSN_F_CR0_PE; \
+ break; \
+ case X86EMUL_MODE_PROT32: \
+ flags = KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_D; \
+ break; \
+ case X86EMUL_MODE_PROT64: \
+ flags = KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_L; \
+ break; \
+ } \
+ flags; \
+ })
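+
+/*
+ * For example, kei_decode_mode(X86EMUL_MODE_PROT64) yields
+ * KVM_EMUL_INSN_F_CR0_PE | KVM_EMUL_INSN_F_CS_L, which TP_printk() below
+ * renders as "prot64" via kvm_trace_symbol_emul_flags.
+ */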
+
+TRACE_EVENT(kvm_emulate_insn,
+ TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed),
+ TP_ARGS(vcpu, failed),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( __u32, csbase )
+ __field( __u8, len )
+ __array( __u8, insn, 15 )
+ __field( __u8, flags )
+ __field( __u8, failed )
+ ),
+
+ TP_fast_assign(
+ __entry->csbase = static_call(kvm_x86_get_segment_base)(vcpu, VCPU_SREG_CS);
+ __entry->len = vcpu->arch.emulate_ctxt->fetch.ptr
+ - vcpu->arch.emulate_ctxt->fetch.data;
+ __entry->rip = vcpu->arch.emulate_ctxt->_eip - __entry->len;
+ memcpy(__entry->insn,
+ vcpu->arch.emulate_ctxt->fetch.data,
+ 15);
+ __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt->mode);
+ __entry->failed = failed;
+ ),
+
+ TP_printk("%x:%llx:%s (%s)%s",
+ __entry->csbase, __entry->rip,
+ __print_hex(__entry->insn, __entry->len),
+ __print_symbolic(__entry->flags,
+ kvm_trace_symbol_emul_flags),
+ __entry->failed ? " failed" : ""
+ )
+ );
+
+#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
+#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
+
+TRACE_EVENT(
+ vcpu_match_mmio,
+ TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match),
+ TP_ARGS(gva, gpa, write, gpa_match),
+
+ TP_STRUCT__entry(
+ __field(gva_t, gva)
+ __field(gpa_t, gpa)
+ __field(bool, write)
+ __field(bool, gpa_match)
+ ),
+
+ TP_fast_assign(
+ __entry->gva = gva;
+ __entry->gpa = gpa;
+ __entry->write = write;
+ __entry->gpa_match = gpa_match;
+ ),
+
+ TP_printk("gva %#lx gpa %#llx %s %s", __entry->gva, __entry->gpa,
+ __entry->write ? "Write" : "Read",
+ __entry->gpa_match ? "GPA" : "GVA")
+);
+
+TRACE_EVENT(kvm_write_tsc_offset,
+ TP_PROTO(unsigned int vcpu_id, __u64 previous_tsc_offset,
+ __u64 next_tsc_offset),
+ TP_ARGS(vcpu_id, previous_tsc_offset, next_tsc_offset),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( __u64, previous_tsc_offset )
+ __field( __u64, next_tsc_offset )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->previous_tsc_offset = previous_tsc_offset;
+ __entry->next_tsc_offset = next_tsc_offset;
+ ),
+
+ TP_printk("vcpu=%u prev=%llu next=%llu", __entry->vcpu_id,
+ __entry->previous_tsc_offset, __entry->next_tsc_offset)
+);
+
+#ifdef CONFIG_X86_64
+
+#define host_clocks \
+ {VDSO_CLOCKMODE_NONE, "none"}, \
+ {VDSO_CLOCKMODE_TSC, "tsc"} \
+
+TRACE_EVENT(kvm_update_master_clock,
+ TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched),
+ TP_ARGS(use_master_clock, host_clock, offset_matched),
+
+ TP_STRUCT__entry(
+ __field( bool, use_master_clock )
+ __field( unsigned int, host_clock )
+ __field( bool, offset_matched )
+ ),
+
+ TP_fast_assign(
+ __entry->use_master_clock = use_master_clock;
+ __entry->host_clock = host_clock;
+ __entry->offset_matched = offset_matched;
+ ),
+
+ TP_printk("masterclock %d hostclock %s offsetmatched %u",
+ __entry->use_master_clock,
+ __print_symbolic(__entry->host_clock, host_clocks),
+ __entry->offset_matched)
+);
+
+TRACE_EVENT(kvm_track_tsc,
+ TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched,
+ unsigned int online_vcpus, bool use_master_clock,
+ unsigned int host_clock),
+ TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock,
+ host_clock),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( unsigned int, nr_vcpus_matched_tsc )
+ __field( unsigned int, online_vcpus )
+ __field( bool, use_master_clock )
+ __field( unsigned int, host_clock )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->nr_vcpus_matched_tsc = nr_matched;
+ __entry->online_vcpus = online_vcpus;
+ __entry->use_master_clock = use_master_clock;
+ __entry->host_clock = host_clock;
+ ),
+
+ TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u"
+ " hostclock %s",
+ __entry->vcpu_id, __entry->use_master_clock,
+ __entry->nr_vcpus_matched_tsc, __entry->online_vcpus,
+ __print_symbolic(__entry->host_clock, host_clocks))
+);
+
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Tracepoint for PML full VMEXIT.
+ */
+TRACE_EVENT(kvm_pml_full,
+ TP_PROTO(unsigned int vcpu_id),
+ TP_ARGS(vcpu_id),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ ),
+
+ TP_printk("vcpu %d: PML full", __entry->vcpu_id)
+);
+
+TRACE_EVENT(kvm_ple_window_update,
+ TP_PROTO(unsigned int vcpu_id, unsigned int new, unsigned int old),
+ TP_ARGS(vcpu_id, new, old),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( unsigned int, new )
+ __field( unsigned int, old )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->new = new;
+ __entry->old = old;
+ ),
+
+ TP_printk("vcpu %u old %u new %u (%s)",
+ __entry->vcpu_id, __entry->old, __entry->new,
+ __entry->old < __entry->new ? "grew" : "shrank")
+);
+
+TRACE_EVENT(kvm_pvclock_update,
+ TP_PROTO(unsigned int vcpu_id, struct pvclock_vcpu_time_info *pvclock),
+ TP_ARGS(vcpu_id, pvclock),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( __u32, version )
+ __field( __u64, tsc_timestamp )
+ __field( __u64, system_time )
+ __field( __u32, tsc_to_system_mul )
+ __field( __s8, tsc_shift )
+ __field( __u8, flags )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->version = pvclock->version;
+ __entry->tsc_timestamp = pvclock->tsc_timestamp;
+ __entry->system_time = pvclock->system_time;
+ __entry->tsc_to_system_mul = pvclock->tsc_to_system_mul;
+ __entry->tsc_shift = pvclock->tsc_shift;
+ __entry->flags = pvclock->flags;
+ ),
+
+ TP_printk("vcpu_id %u, pvclock { version %u, tsc_timestamp 0x%llx, "
+ "system_time 0x%llx, tsc_to_system_mul 0x%x, tsc_shift %d, "
+ "flags 0x%x }",
+ __entry->vcpu_id,
+ __entry->version,
+ __entry->tsc_timestamp,
+ __entry->system_time,
+ __entry->tsc_to_system_mul,
+ __entry->tsc_shift,
+ __entry->flags)
+);
+
+TRACE_EVENT(kvm_wait_lapic_expire,
+ TP_PROTO(unsigned int vcpu_id, s64 delta),
+ TP_ARGS(vcpu_id, delta),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( s64, delta )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->delta = delta;
+ ),
+
+ TP_printk("vcpu %u: delta %lld (%s)",
+ __entry->vcpu_id,
+ __entry->delta,
+ __entry->delta < 0 ? "early" : "late")
+);
+
+TRACE_EVENT(kvm_smm_transition,
+ TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering),
+ TP_ARGS(vcpu_id, smbase, entering),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( u64, smbase )
+ __field( bool, entering )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->smbase = smbase;
+ __entry->entering = entering;
+ ),
+
+ TP_printk("vcpu %u: %s SMM, smbase 0x%llx",
+ __entry->vcpu_id,
+ __entry->entering ? "entering" : "leaving",
+ __entry->smbase)
+);
+
+/*
+ * Tracepoint for VT-d posted-interrupts.
+ */
+TRACE_EVENT(kvm_pi_irte_update,
+ TP_PROTO(unsigned int host_irq, unsigned int vcpu_id,
+ unsigned int gsi, unsigned int gvec,
+ u64 pi_desc_addr, bool set),
+ TP_ARGS(host_irq, vcpu_id, gsi, gvec, pi_desc_addr, set),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, host_irq )
+ __field( unsigned int, vcpu_id )
+ __field( unsigned int, gsi )
+ __field( unsigned int, gvec )
+ __field( u64, pi_desc_addr )
+ __field( bool, set )
+ ),
+
+ TP_fast_assign(
+ __entry->host_irq = host_irq;
+ __entry->vcpu_id = vcpu_id;
+ __entry->gsi = gsi;
+ __entry->gvec = gvec;
+ __entry->pi_desc_addr = pi_desc_addr;
+ __entry->set = set;
+ ),
+
+ TP_printk("VT-d PI is %s for irq %u, vcpu %u, gsi: 0x%x, "
+ "gvec: 0x%x, pi_desc_addr: 0x%llx",
+ __entry->set ? "enabled and being updated" : "disabled",
+ __entry->host_irq,
+ __entry->vcpu_id,
+ __entry->gsi,
+ __entry->gvec,
+ __entry->pi_desc_addr)
+);
+
+/*
+ * Tracepoint for kvm_hv_notify_acked_sint.
+ */
+TRACE_EVENT(kvm_hv_notify_acked_sint,
+ TP_PROTO(int vcpu_id, u32 sint),
+ TP_ARGS(vcpu_id, sint),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, sint)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->sint = sint;
+ ),
+
+ TP_printk("vcpu_id %d sint %u", __entry->vcpu_id, __entry->sint)
+);
+
+/*
+ * Tracepoint for synic_set_irq.
+ */
+TRACE_EVENT(kvm_hv_synic_set_irq,
+ TP_PROTO(int vcpu_id, u32 sint, int vector, int ret),
+ TP_ARGS(vcpu_id, sint, vector, ret),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, sint)
+ __field(int, vector)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->sint = sint;
+ __entry->vector = vector;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("vcpu_id %d sint %u vector %d ret %d",
+ __entry->vcpu_id, __entry->sint, __entry->vector,
+ __entry->ret)
+);
+
+/*
+ * Tracepoint for kvm_hv_synic_send_eoi.
+ */
+TRACE_EVENT(kvm_hv_synic_send_eoi,
+ TP_PROTO(int vcpu_id, int vector),
+ TP_ARGS(vcpu_id, vector),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, vector)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->vector = vector;
+ ),
+
+ TP_printk("vcpu_id %d vector %d", __entry->vcpu_id, __entry->vector)
+);
+
+/*
+ * Tracepoint for synic_set_msr.
+ */
+TRACE_EVENT(kvm_hv_synic_set_msr,
+ TP_PROTO(int vcpu_id, u32 msr, u64 data, bool host),
+ TP_ARGS(vcpu_id, msr, data, host),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, msr)
+ __field(u64, data)
+ __field(bool, host)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->msr = msr;
+ __entry->data = data;
+ __entry->host = host;
+ ),
+
+ TP_printk("vcpu_id %d msr 0x%x data 0x%llx host %d",
+ __entry->vcpu_id, __entry->msr, __entry->data, __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_set_config.
+ */
+TRACE_EVENT(kvm_hv_stimer_set_config,
+ TP_PROTO(int vcpu_id, int timer_index, u64 config, bool host),
+ TP_ARGS(vcpu_id, timer_index, config, host),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, config)
+ __field(bool, host)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->config = config;
+ __entry->host = host;
+ ),
+
+ TP_printk("vcpu_id %d timer %d config 0x%llx host %d",
+ __entry->vcpu_id, __entry->timer_index, __entry->config,
+ __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_set_count.
+ */
+TRACE_EVENT(kvm_hv_stimer_set_count,
+ TP_PROTO(int vcpu_id, int timer_index, u64 count, bool host),
+ TP_ARGS(vcpu_id, timer_index, count, host),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, count)
+ __field(bool, host)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->count = count;
+ __entry->host = host;
+ ),
+
+ TP_printk("vcpu_id %d timer %d count %llu host %d",
+ __entry->vcpu_id, __entry->timer_index, __entry->count,
+ __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_start(periodic timer case).
+ */
+TRACE_EVENT(kvm_hv_stimer_start_periodic,
+ TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 exp_time),
+ TP_ARGS(vcpu_id, timer_index, time_now, exp_time),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, time_now)
+ __field(u64, exp_time)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->time_now = time_now;
+ __entry->exp_time = exp_time;
+ ),
+
+ TP_printk("vcpu_id %d timer %d time_now %llu exp_time %llu",
+ __entry->vcpu_id, __entry->timer_index, __entry->time_now,
+ __entry->exp_time)
+);
+
+/*
+ * Tracepoint for stimer_start(one-shot timer case).
+ */
+TRACE_EVENT(kvm_hv_stimer_start_one_shot,
+ TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 count),
+ TP_ARGS(vcpu_id, timer_index, time_now, count),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, time_now)
+ __field(u64, count)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->time_now = time_now;
+ __entry->count = count;
+ ),
+
+ TP_printk("vcpu_id %d timer %d time_now %llu count %llu",
+ __entry->vcpu_id, __entry->timer_index, __entry->time_now,
+ __entry->count)
+);
+
+/*
+ * Tracepoint for stimer_timer_callback.
+ */
+TRACE_EVENT(kvm_hv_stimer_callback,
+ TP_PROTO(int vcpu_id, int timer_index),
+ TP_ARGS(vcpu_id, timer_index),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ ),
+
+ TP_printk("vcpu_id %d timer %d",
+ __entry->vcpu_id, __entry->timer_index)
+);
+
+/*
+ * Tracepoint for stimer_expiration.
+ */
+TRACE_EVENT(kvm_hv_stimer_expiration,
+ TP_PROTO(int vcpu_id, int timer_index, int direct, int msg_send_result),
+ TP_ARGS(vcpu_id, timer_index, direct, msg_send_result),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(int, direct)
+ __field(int, msg_send_result)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->direct = direct;
+ __entry->msg_send_result = msg_send_result;
+ ),
+
+ TP_printk("vcpu_id %d timer %d direct %d send result %d",
+ __entry->vcpu_id, __entry->timer_index,
+ __entry->direct, __entry->msg_send_result)
+);
+
+/*
+ * Tracepoint for stimer_cleanup.
+ */
+TRACE_EVENT(kvm_hv_stimer_cleanup,
+ TP_PROTO(int vcpu_id, int timer_index),
+ TP_ARGS(vcpu_id, timer_index),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ ),
+
+ TP_printk("vcpu_id %d timer %d",
+ __entry->vcpu_id, __entry->timer_index)
+);
+
+TRACE_EVENT(kvm_apicv_inhibit_changed,
+ TP_PROTO(int reason, bool set, unsigned long inhibits),
+ TP_ARGS(reason, set, inhibits),
+
+ TP_STRUCT__entry(
+ __field(int, reason)
+ __field(bool, set)
+ __field(unsigned long, inhibits)
+ ),
+
+ TP_fast_assign(
+ __entry->reason = reason;
+ __entry->set = set;
+ __entry->inhibits = inhibits;
+ ),
+
+ TP_printk("%s reason=%u, inhibits=0x%lx",
+ __entry->set ? "set" : "cleared",
+ __entry->reason, __entry->inhibits)
+);
+
+TRACE_EVENT(kvm_apicv_accept_irq,
+ TP_PROTO(__u32 apicid, __u16 dm, __u16 tm, __u8 vec),
+ TP_ARGS(apicid, dm, tm, vec),
+
+ TP_STRUCT__entry(
+ __field( __u32, apicid )
+ __field( __u16, dm )
+ __field( __u16, tm )
+ __field( __u8, vec )
+ ),
+
+ TP_fast_assign(
+ __entry->apicid = apicid;
+ __entry->dm = dm;
+ __entry->tm = tm;
+ __entry->vec = vec;
+ ),
+
+ TP_printk("apicid %x vec %u (%s|%s)",
+ __entry->apicid, __entry->vec,
+ __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode),
+ __entry->tm ? "level" : "edge")
+);
+
+/*
+ * Tracepoint for AMD AVIC
+ */
+TRACE_EVENT(kvm_avic_incomplete_ipi,
+ TP_PROTO(u32 vcpu, u32 icrh, u32 icrl, u32 id, u32 index),
+ TP_ARGS(vcpu, icrh, icrl, id, index),
+
+ TP_STRUCT__entry(
+ __field(u32, vcpu)
+ __field(u32, icrh)
+ __field(u32, icrl)
+ __field(u32, id)
+ __field(u32, index)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu = vcpu;
+ __entry->icrh = icrh;
+ __entry->icrl = icrl;
+ __entry->id = id;
+ __entry->index = index;
+ ),
+
+ TP_printk("vcpu=%u, icrh:icrl=%#010x:%08x, id=%u, index=%u",
+ __entry->vcpu, __entry->icrh, __entry->icrl,
+ __entry->id, __entry->index)
+);
+
+TRACE_EVENT(kvm_avic_unaccelerated_access,
+ TP_PROTO(u32 vcpu, u32 offset, bool ft, bool rw, u32 vec),
+ TP_ARGS(vcpu, offset, ft, rw, vec),
+
+ TP_STRUCT__entry(
+ __field(u32, vcpu)
+ __field(u32, offset)
+ __field(bool, ft)
+ __field(bool, rw)
+ __field(u32, vec)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu = vcpu;
+ __entry->offset = offset;
+ __entry->ft = ft;
+ __entry->rw = rw;
+ __entry->vec = vec;
+ ),
+
+ TP_printk("vcpu=%u, offset=%#x(%s), %s, %s, vec=%#x",
+ __entry->vcpu,
+ __entry->offset,
+ __print_symbolic(__entry->offset, kvm_trace_symbol_apic),
+ __entry->ft ? "trap" : "fault",
+ __entry->rw ? "write" : "read",
+ __entry->vec)
+);
+
+TRACE_EVENT(kvm_avic_ga_log,
+ TP_PROTO(u32 vmid, u32 vcpuid),
+ TP_ARGS(vmid, vcpuid),
+
+ TP_STRUCT__entry(
+ __field(u32, vmid)
+ __field(u32, vcpuid)
+ ),
+
+ TP_fast_assign(
+ __entry->vmid = vmid;
+ __entry->vcpuid = vcpuid;
+ ),
+
+ TP_printk("vmid=%u, vcpuid=%u",
+ __entry->vmid, __entry->vcpuid)
+);
+
+TRACE_EVENT(kvm_avic_kick_vcpu_slowpath,
+ TP_PROTO(u32 icrh, u32 icrl, u32 index),
+ TP_ARGS(icrh, icrl, index),
+
+ TP_STRUCT__entry(
+ __field(u32, icrh)
+ __field(u32, icrl)
+ __field(u32, index)
+ ),
+
+ TP_fast_assign(
+ __entry->icrh = icrh;
+ __entry->icrl = icrl;
+ __entry->index = index;
+ ),
+
+ TP_printk("icrh:icrl=%#08x:%08x, index=%u",
+ __entry->icrh, __entry->icrl, __entry->index)
+);
+
+TRACE_EVENT(kvm_avic_doorbell,
+ TP_PROTO(u32 vcpuid, u32 apicid),
+ TP_ARGS(vcpuid, apicid),
+
+ TP_STRUCT__entry(
+ __field(u32, vcpuid)
+ __field(u32, apicid)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpuid = vcpuid;
+ __entry->apicid = apicid;
+ ),
+
+ TP_printk("vcpuid=%u, apicid=%u",
+ __entry->vcpuid, __entry->apicid)
+);
+
+TRACE_EVENT(kvm_hv_timer_state,
+ TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use),
+ TP_ARGS(vcpu_id, hv_timer_in_use),
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(unsigned int, hv_timer_in_use)
+ ),
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->hv_timer_in_use = hv_timer_in_use;
+ ),
+ TP_printk("vcpu_id %x hv_timer %x",
+ __entry->vcpu_id,
+ __entry->hv_timer_in_use)
+);
+
+/*
+ * Tracepoint for kvm_hv_flush_tlb.
+ */
+TRACE_EVENT(kvm_hv_flush_tlb,
+ TP_PROTO(u64 processor_mask, u64 address_space, u64 flags, bool guest_mode),
+ TP_ARGS(processor_mask, address_space, flags, guest_mode),
+
+ TP_STRUCT__entry(
+ __field(u64, processor_mask)
+ __field(u64, address_space)
+ __field(u64, flags)
+ __field(bool, guest_mode)
+ ),
+
+ TP_fast_assign(
+ __entry->processor_mask = processor_mask;
+ __entry->address_space = address_space;
+ __entry->flags = flags;
+ __entry->guest_mode = guest_mode;
+ ),
+
+ TP_printk("processor_mask 0x%llx address_space 0x%llx flags 0x%llx %s",
+ __entry->processor_mask, __entry->address_space,
+ __entry->flags, __entry->guest_mode ? "(L2)" : "")
+);
+
+/*
+ * Tracepoint for kvm_hv_flush_tlb_ex.
+ */
+TRACE_EVENT(kvm_hv_flush_tlb_ex,
+ TP_PROTO(u64 valid_bank_mask, u64 format, u64 address_space, u64 flags, bool guest_mode),
+ TP_ARGS(valid_bank_mask, format, address_space, flags, guest_mode),
+
+ TP_STRUCT__entry(
+ __field(u64, valid_bank_mask)
+ __field(u64, format)
+ __field(u64, address_space)
+ __field(u64, flags)
+ __field(bool, guest_mode)
+ ),
+
+ TP_fast_assign(
+ __entry->valid_bank_mask = valid_bank_mask;
+ __entry->format = format;
+ __entry->address_space = address_space;
+ __entry->flags = flags;
+ __entry->guest_mode = guest_mode;
+ ),
+
+ TP_printk("valid_bank_mask 0x%llx format 0x%llx "
+ "address_space 0x%llx flags 0x%llx %s",
+ __entry->valid_bank_mask, __entry->format,
+ __entry->address_space, __entry->flags,
+ __entry->guest_mode ? "(L2)" : "")
+);
+
+/*
+ * Tracepoints for kvm_hv_send_ipi.
+ */
+TRACE_EVENT(kvm_hv_send_ipi,
+ TP_PROTO(u32 vector, u64 processor_mask),
+ TP_ARGS(vector, processor_mask),
+
+ TP_STRUCT__entry(
+ __field(u32, vector)
+ __field(u64, processor_mask)
+ ),
+
+ TP_fast_assign(
+ __entry->vector = vector;
+ __entry->processor_mask = processor_mask;
+ ),
+
+ TP_printk("vector %x processor_mask 0x%llx",
+ __entry->vector, __entry->processor_mask)
+);
+
+TRACE_EVENT(kvm_hv_send_ipi_ex,
+ TP_PROTO(u32 vector, u64 format, u64 valid_bank_mask),
+ TP_ARGS(vector, format, valid_bank_mask),
+
+ TP_STRUCT__entry(
+ __field(u32, vector)
+ __field(u64, format)
+ __field(u64, valid_bank_mask)
+ ),
+
+ TP_fast_assign(
+ __entry->vector = vector;
+ __entry->format = format;
+ __entry->valid_bank_mask = valid_bank_mask;
+ ),
+
+ TP_printk("vector %x format %llx valid_bank_mask 0x%llx",
+ __entry->vector, __entry->format,
+ __entry->valid_bank_mask)
+);
+
+TRACE_EVENT(kvm_pv_tlb_flush,
+ TP_PROTO(unsigned int vcpu_id, bool need_flush_tlb),
+ TP_ARGS(vcpu_id, need_flush_tlb),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( bool, need_flush_tlb )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->need_flush_tlb = need_flush_tlb;
+ ),
+
+ TP_printk("vcpu %u need_flush_tlb %s", __entry->vcpu_id,
+ __entry->need_flush_tlb ? "true" : "false")
+);
+
+/*
+ * Tracepoint for failed nested VMX VM-Enter.
+ */
+TRACE_EVENT(kvm_nested_vmenter_failed,
+ TP_PROTO(const char *msg, u32 err),
+ TP_ARGS(msg, err),
+
+ TP_STRUCT__entry(
+ __string(msg, msg)
+ __field(u32, err)
+ ),
+
+ TP_fast_assign(
+ __assign_str(msg, msg);
+ __entry->err = err;
+ ),
+
+ TP_printk("%s%s", __get_str(msg), !__entry->err ? "" :
+ __print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS))
+);
+
+/*
+ * Tracepoint for syndbg_set_msr.
+ */
+TRACE_EVENT(kvm_hv_syndbg_set_msr,
+ TP_PROTO(int vcpu_id, u32 vp_index, u32 msr, u64 data),
+ TP_ARGS(vcpu_id, vp_index, msr, data),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, vp_index)
+ __field(u32, msr)
+ __field(u64, data)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->vp_index = vp_index;
+ __entry->msr = msr;
+ __entry->data = data;
+ ),
+
+ TP_printk("vcpu_id %d vp_index %u msr 0x%x data 0x%llx",
+ __entry->vcpu_id, __entry->vp_index, __entry->msr,
+ __entry->data)
+);
+
+/*
+ * Tracepoint for syndbg_get_msr.
+ */
+TRACE_EVENT(kvm_hv_syndbg_get_msr,
+ TP_PROTO(int vcpu_id, u32 vp_index, u32 msr, u64 data),
+ TP_ARGS(vcpu_id, vp_index, msr, data),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, vp_index)
+ __field(u32, msr)
+ __field(u64, data)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->vp_index = vp_index;
+ __entry->msr = msr;
+ __entry->data = data;
+ ),
+
+ TP_printk("vcpu_id %d vp_index %u msr 0x%x data 0x%llx",
+ __entry->vcpu_id, __entry->vp_index, __entry->msr,
+ __entry->data)
+);
+
+/*
+ * Tracepoint for the start of VMGEXIT processing
+ */
+TRACE_EVENT(kvm_vmgexit_enter,
+ TP_PROTO(unsigned int vcpu_id, struct ghcb *ghcb),
+ TP_ARGS(vcpu_id, ghcb),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(u64, exit_reason)
+ __field(u64, info1)
+ __field(u64, info2)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->exit_reason = ghcb->save.sw_exit_code;
+ __entry->info1 = ghcb->save.sw_exit_info_1;
+ __entry->info2 = ghcb->save.sw_exit_info_2;
+ ),
+
+ TP_printk("vcpu %u, exit_reason %llx, exit_info1 %llx, exit_info2 %llx",
+ __entry->vcpu_id, __entry->exit_reason,
+ __entry->info1, __entry->info2)
+);
+
+/*
+ * Tracepoint for the end of VMGEXIT processing
+ */
+TRACE_EVENT(kvm_vmgexit_exit,
+ TP_PROTO(unsigned int vcpu_id, struct ghcb *ghcb),
+ TP_ARGS(vcpu_id, ghcb),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(u64, exit_reason)
+ __field(u64, info1)
+ __field(u64, info2)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->exit_reason = ghcb->save.sw_exit_code;
+ __entry->info1 = ghcb->save.sw_exit_info_1;
+ __entry->info2 = ghcb->save.sw_exit_info_2;
+ ),
+
+ TP_printk("vcpu %u, exit_reason %llx, exit_info1 %llx, exit_info2 %llx",
+ __entry->vcpu_id, __entry->exit_reason,
+ __entry->info1, __entry->info2)
+);
+
+/*
+ * Tracepoint for the start of VMGEXIT MSR protocol processing
+ */
+TRACE_EVENT(kvm_vmgexit_msr_protocol_enter,
+ TP_PROTO(unsigned int vcpu_id, u64 ghcb_gpa),
+ TP_ARGS(vcpu_id, ghcb_gpa),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(u64, ghcb_gpa)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->ghcb_gpa = ghcb_gpa;
+ ),
+
+ TP_printk("vcpu %u, ghcb_gpa %016llx",
+ __entry->vcpu_id, __entry->ghcb_gpa)
+);
+
+/*
+ * Tracepoint for the end of VMGEXIT MSR protocol processing
+ */
+TRACE_EVENT(kvm_vmgexit_msr_protocol_exit,
+ TP_PROTO(unsigned int vcpu_id, u64 ghcb_gpa, int result),
+ TP_ARGS(vcpu_id, ghcb_gpa, result),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(u64, ghcb_gpa)
+ __field(int, result)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->ghcb_gpa = ghcb_gpa;
+ __entry->result = result;
+ ),
+
+ TP_printk("vcpu %u, ghcb_gpa %016llx, result %d",
+ __entry->vcpu_id, __entry->ghcb_gpa, __entry->result)
+);
+
+#endif /* _TRACE_KVM_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../arch/x86/kvm
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/tss.h b/arch/x86/kvm/tss.h
new file mode 100644
index 000000000..3f9150125
--- /dev/null
+++ b/arch/x86/kvm/tss.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TSS_SEGMENT_H
+#define __TSS_SEGMENT_H
+
+struct tss_segment_32 {
+ u32 prev_task_link;
+ u32 esp0;
+ u32 ss0;
+ u32 esp1;
+ u32 ss1;
+ u32 esp2;
+ u32 ss2;
+ u32 cr3;
+ u32 eip;
+ u32 eflags;
+ u32 eax;
+ u32 ecx;
+ u32 edx;
+ u32 ebx;
+ u32 esp;
+ u32 ebp;
+ u32 esi;
+ u32 edi;
+ u32 es;
+ u32 cs;
+ u32 ss;
+ u32 ds;
+ u32 fs;
+ u32 gs;
+ u32 ldt_selector;
+ u16 t;
+ u16 io_map;
+};
+
+struct tss_segment_16 {
+ u16 prev_task_link;
+ u16 sp0;
+ u16 ss0;
+ u16 sp1;
+ u16 ss1;
+ u16 sp2;
+ u16 ss2;
+ u16 ip;
+ u16 flag;
+ u16 ax;
+ u16 cx;
+ u16 dx;
+ u16 bx;
+ u16 sp;
+ u16 bp;
+ u16 si;
+ u16 di;
+ u16 es;
+ u16 cs;
+ u16 ss;
+ u16 ds;
+ u16 ldt;
+};
+
+#endif
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
new file mode 100644
index 000000000..41a4533f9
--- /dev/null
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -0,0 +1,404 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_CAPS_H
+#define __KVM_X86_VMX_CAPS_H
+
+#include <asm/vmx.h>
+
+#include "../lapic.h"
+#include "../x86.h"
+#include "../pmu.h"
+#include "../cpuid.h"
+
+extern bool __read_mostly enable_vpid;
+extern bool __read_mostly flexpriority_enabled;
+extern bool __read_mostly enable_ept;
+extern bool __read_mostly enable_unrestricted_guest;
+extern bool __read_mostly enable_ept_ad_bits;
+extern bool __read_mostly enable_pml;
+extern bool __read_mostly enable_ipiv;
+extern int __read_mostly pt_mode;
+
+#define PT_MODE_SYSTEM 0
+#define PT_MODE_HOST_GUEST 1
+
+#define PMU_CAP_FW_WRITES (1ULL << 13)
+#define PMU_CAP_LBR_FMT 0x3f
+
+struct nested_vmx_msrs {
+ /*
+ * We only store the "true" versions of the VMX capability MSRs. We
+ * generate the "non-true" versions by setting the must-be-1 bits
+ * according to the SDM.
+ */
+ u32 procbased_ctls_low;
+ u32 procbased_ctls_high;
+ u32 secondary_ctls_low;
+ u32 secondary_ctls_high;
+ u32 pinbased_ctls_low;
+ u32 pinbased_ctls_high;
+ u32 exit_ctls_low;
+ u32 exit_ctls_high;
+ u32 entry_ctls_low;
+ u32 entry_ctls_high;
+ u32 misc_low;
+ u32 misc_high;
+ u32 ept_caps;
+ u32 vpid_caps;
+ u64 basic;
+ u64 cr0_fixed0;
+ u64 cr0_fixed1;
+ u64 cr4_fixed0;
+ u64 cr4_fixed1;
+ u64 vmcs_enum;
+ u64 vmfunc_controls;
+};
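+
+/*
+ * Illustrative sketch only (not part of this header): the "non-true"
+ * variant of a capability MSR can be reconstructed from the stored "true"
+ * one by forcing the must-be-1 (default1) bits back on in the low,
+ * allowed-0 half, e.g. for the pin-based controls:
+ *
+ *	u64 low = msrs->pinbased_ctls_low | PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+ *	u64 msr = low | ((u64)msrs->pinbased_ctls_high << 32);
+ */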
+
+struct vmcs_config {
+ int size;
+ u32 basic_cap;
+ u32 revision_id;
+ u32 pin_based_exec_ctrl;
+ u32 cpu_based_exec_ctrl;
+ u32 cpu_based_2nd_exec_ctrl;
+ u64 cpu_based_3rd_exec_ctrl;
+ u32 vmexit_ctrl;
+ u32 vmentry_ctrl;
+ u64 misc;
+ struct nested_vmx_msrs nested;
+};
+extern struct vmcs_config vmcs_config __ro_after_init;
+
+struct vmx_capability {
+ u32 ept;
+ u32 vpid;
+};
+extern struct vmx_capability vmx_capability __ro_after_init;
+
+static inline bool cpu_has_vmx_basic_inout(void)
+{
+ return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
+}
+
+static inline bool cpu_has_virtual_nmis(void)
+{
+ return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS &&
+ vmcs_config.cpu_based_exec_ctrl & CPU_BASED_NMI_WINDOW_EXITING;
+}
+
+static inline bool cpu_has_vmx_preemption_timer(void)
+{
+ return vmcs_config.pin_based_exec_ctrl &
+ PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
+static inline bool cpu_has_vmx_posted_intr(void)
+{
+ return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
+}
+
+static inline bool cpu_has_load_ia32_efer(void)
+{
+ return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER;
+}
+
+static inline bool cpu_has_load_perf_global_ctrl(void)
+{
+ return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+}
+
+static inline bool cpu_has_vmx_mpx(void)
+{
+ return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS;
+}
+
+static inline bool cpu_has_vmx_tpr_shadow(void)
+{
+ return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
+}
+
+static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
+{
+ return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
+}
+
+static inline bool cpu_has_vmx_msr_bitmap(void)
+{
+ return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
+}
+
+static inline bool cpu_has_secondary_exec_ctrls(void)
+{
+ return vmcs_config.cpu_based_exec_ctrl &
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+}
+
+static inline bool cpu_has_tertiary_exec_ctrls(void)
+{
+ return vmcs_config.cpu_based_exec_ctrl &
+ CPU_BASED_ACTIVATE_TERTIARY_CONTROLS;
+}
+
+static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+}
+
+static inline bool cpu_has_vmx_ept(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_EPT;
+}
+
+static inline bool vmx_umip_emulated(void)
+{
+ return !boot_cpu_has(X86_FEATURE_UMIP) &&
+ (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_DESC);
+}
+
+static inline bool cpu_has_vmx_rdtscp(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_RDTSCP;
+}
+
+static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+}
+
+static inline bool cpu_has_vmx_vpid(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_VPID;
+}
+
+static inline bool cpu_has_vmx_wbinvd_exit(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_WBINVD_EXITING;
+}
+
+static inline bool cpu_has_vmx_unrestricted_guest(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_UNRESTRICTED_GUEST;
+}
+
+static inline bool cpu_has_vmx_apic_register_virt(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_APIC_REGISTER_VIRT;
+}
+
+static inline bool cpu_has_vmx_virtual_intr_delivery(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
+}
+
+static inline bool cpu_has_vmx_ple(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+}
+
+static inline bool cpu_has_vmx_rdrand(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_RDRAND_EXITING;
+}
+
+static inline bool cpu_has_vmx_invpcid(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_INVPCID;
+}
+
+static inline bool cpu_has_vmx_vmfunc(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_VMFUNC;
+}
+
+static inline bool cpu_has_vmx_shadow_vmcs(void)
+{
+ /* check if the cpu supports writing r/o exit information fields */
+ if (!(vmcs_config.misc & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
+ return false;
+
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_SHADOW_VMCS;
+}
+
+static inline bool cpu_has_vmx_encls_vmexit(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENCLS_EXITING;
+}
+
+static inline bool cpu_has_vmx_rdseed(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_RDSEED_EXITING;
+}
+
+static inline bool cpu_has_vmx_pml(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
+}
+
+static inline bool cpu_has_vmx_xsaves(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_XSAVES;
+}
+
+static inline bool cpu_has_vmx_waitpkg(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
+}
+
+static inline bool cpu_has_vmx_tsc_scaling(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_TSC_SCALING;
+}
+
+static inline bool cpu_has_vmx_bus_lock_detection(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_BUS_LOCK_DETECTION;
+}
+
+static inline bool cpu_has_vmx_apicv(void)
+{
+ return cpu_has_vmx_apic_register_virt() &&
+ cpu_has_vmx_virtual_intr_delivery() &&
+ cpu_has_vmx_posted_intr();
+}
+
+static inline bool cpu_has_vmx_ipiv(void)
+{
+ return vmcs_config.cpu_based_3rd_exec_ctrl & TERTIARY_EXEC_IPI_VIRT;
+}
+
+static inline bool cpu_has_vmx_flexpriority(void)
+{
+ return cpu_has_vmx_tpr_shadow() &&
+ cpu_has_vmx_virtualize_apic_accesses();
+}
+
+static inline bool cpu_has_vmx_ept_execute_only(void)
+{
+ return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_4levels(void)
+{
+ return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_5levels(void)
+{
+ return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_mt_wb(void)
+{
+ return vmx_capability.ept & VMX_EPTP_WB_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_2m_page(void)
+{
+ return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_1g_page(void)
+{
+ return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
+}
+
+static inline int ept_caps_to_lpage_level(u32 ept_caps)
+{
+ if (ept_caps & VMX_EPT_1GB_PAGE_BIT)
+ return PG_LEVEL_1G;
+ if (ept_caps & VMX_EPT_2MB_PAGE_BIT)
+ return PG_LEVEL_2M;
+ return PG_LEVEL_4K;
+}
+
+static inline bool cpu_has_vmx_ept_ad_bits(void)
+{
+ return vmx_capability.ept & VMX_EPT_AD_BIT;
+}
+
+static inline bool cpu_has_vmx_invept_context(void)
+{
+ return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
+}
+
+static inline bool cpu_has_vmx_invept_global(void)
+{
+ return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
+}
+
+static inline bool cpu_has_vmx_invvpid(void)
+{
+ return vmx_capability.vpid & VMX_VPID_INVVPID_BIT;
+}
+
+static inline bool cpu_has_vmx_invvpid_individual_addr(void)
+{
+ return vmx_capability.vpid & VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT;
+}
+
+static inline bool cpu_has_vmx_invvpid_single(void)
+{
+ return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
+}
+
+static inline bool cpu_has_vmx_invvpid_global(void)
+{
+ return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
+}
+
+static inline bool cpu_has_vmx_intel_pt(void)
+{
+ return (vmcs_config.misc & MSR_IA32_VMX_MISC_INTEL_PT) &&
+ (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) &&
+ (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL);
+}
+
+/*
+ * Processor Trace can operate in one of three modes:
+ * a. system-wide: trace both host/guest and output to host buffer
+ * b. host-only: only trace host and output to host buffer
+ * c. host-guest: trace host and guest simultaneously and output to their
+ * respective buffers
+ *
+ * KVM currently only supports (a) and (c).
+ */
+static inline bool vmx_pt_mode_is_system(void)
+{
+ return pt_mode == PT_MODE_SYSTEM;
+}
+static inline bool vmx_pt_mode_is_host_guest(void)
+{
+ return pt_mode == PT_MODE_HOST_GUEST;
+}
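+
+/*
+ * Illustrative only: callers that context-switch guest RTIT state
+ * typically bail out early when tracing is not in host-guest mode, e.g.
+ * (hypothetical helper body):
+ *
+ *	if (vmx_pt_mode_is_system())
+ *		return;
+ *	// ... save host and load guest RTIT state ...
+ */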
+
+static inline bool vmx_pebs_supported(void)
+{
+ return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept;
+}
+
+static inline bool cpu_has_notify_vmexit(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_NOTIFY_VM_EXITING;
+}
+
+#endif /* __KVM_X86_VMX_CAPS_H */
diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c
new file mode 100644
index 000000000..313b8bb5b
--- /dev/null
+++ b/arch/x86/kvm/vmx/hyperv.c
@@ -0,0 +1,676 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/errno.h>
+#include <linux/smp.h>
+
+#include "../cpuid.h"
+#include "hyperv.h"
+#include "nested.h"
+#include "vmcs.h"
+#include "vmx.h"
+#include "trace.h"
+
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
+
+/*
+ * Enlightened VMCSv1 doesn't support these:
+ *
+ * POSTED_INTR_NV = 0x00000002,
+ * GUEST_INTR_STATUS = 0x00000810,
+ * APIC_ACCESS_ADDR = 0x00002014,
+ * POSTED_INTR_DESC_ADDR = 0x00002016,
+ * EOI_EXIT_BITMAP0 = 0x0000201c,
+ * EOI_EXIT_BITMAP1 = 0x0000201e,
+ * EOI_EXIT_BITMAP2 = 0x00002020,
+ * EOI_EXIT_BITMAP3 = 0x00002022,
+ * GUEST_PML_INDEX = 0x00000812,
+ * PML_ADDRESS = 0x0000200e,
+ * VM_FUNCTION_CONTROL = 0x00002018,
+ * EPTP_LIST_ADDRESS = 0x00002024,
+ * VMREAD_BITMAP = 0x00002026,
+ * VMWRITE_BITMAP = 0x00002028,
+ *
+ * TSC_MULTIPLIER = 0x00002032,
+ * PLE_GAP = 0x00004020,
+ * PLE_WINDOW = 0x00004022,
+ * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
+ *
+ * Currently unsupported in KVM:
+ * GUEST_IA32_RTIT_CTL = 0x00002814,
+ */
+#define EVMCS1_SUPPORTED_PINCTRL \
+ (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \
+ PIN_BASED_EXT_INTR_MASK | \
+ PIN_BASED_NMI_EXITING | \
+ PIN_BASED_VIRTUAL_NMIS)
+
+#define EVMCS1_SUPPORTED_EXEC_CTRL \
+ (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \
+ CPU_BASED_HLT_EXITING | \
+ CPU_BASED_CR3_LOAD_EXITING | \
+ CPU_BASED_CR3_STORE_EXITING | \
+ CPU_BASED_UNCOND_IO_EXITING | \
+ CPU_BASED_MOV_DR_EXITING | \
+ CPU_BASED_USE_TSC_OFFSETTING | \
+ CPU_BASED_MWAIT_EXITING | \
+ CPU_BASED_MONITOR_EXITING | \
+ CPU_BASED_INVLPG_EXITING | \
+ CPU_BASED_RDPMC_EXITING | \
+ CPU_BASED_INTR_WINDOW_EXITING | \
+ CPU_BASED_CR8_LOAD_EXITING | \
+ CPU_BASED_CR8_STORE_EXITING | \
+ CPU_BASED_RDTSC_EXITING | \
+ CPU_BASED_TPR_SHADOW | \
+ CPU_BASED_USE_IO_BITMAPS | \
+ CPU_BASED_MONITOR_TRAP_FLAG | \
+ CPU_BASED_USE_MSR_BITMAPS | \
+ CPU_BASED_NMI_WINDOW_EXITING | \
+ CPU_BASED_PAUSE_EXITING | \
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
+
+#define EVMCS1_SUPPORTED_2NDEXEC \
+ (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \
+ SECONDARY_EXEC_WBINVD_EXITING | \
+ SECONDARY_EXEC_ENABLE_VPID | \
+ SECONDARY_EXEC_ENABLE_EPT | \
+ SECONDARY_EXEC_UNRESTRICTED_GUEST | \
+ SECONDARY_EXEC_DESC | \
+ SECONDARY_EXEC_ENABLE_RDTSCP | \
+ SECONDARY_EXEC_ENABLE_INVPCID | \
+ SECONDARY_EXEC_ENABLE_XSAVES | \
+ SECONDARY_EXEC_RDSEED_EXITING | \
+ SECONDARY_EXEC_RDRAND_EXITING | \
+ SECONDARY_EXEC_TSC_SCALING | \
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \
+ SECONDARY_EXEC_PT_USE_GPA | \
+ SECONDARY_EXEC_PT_CONCEAL_VMX | \
+ SECONDARY_EXEC_BUS_LOCK_DETECTION | \
+ SECONDARY_EXEC_NOTIFY_VM_EXITING | \
+ SECONDARY_EXEC_ENCLS_EXITING)
+
+#define EVMCS1_SUPPORTED_3RDEXEC (0ULL)
+
+#define EVMCS1_SUPPORTED_VMEXIT_CTRL \
+ (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \
+ VM_EXIT_SAVE_DEBUG_CONTROLS | \
+ VM_EXIT_ACK_INTR_ON_EXIT | \
+ VM_EXIT_HOST_ADDR_SPACE_SIZE | \
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_EXIT_SAVE_IA32_PAT | \
+ VM_EXIT_LOAD_IA32_PAT | \
+ VM_EXIT_SAVE_IA32_EFER | \
+ VM_EXIT_LOAD_IA32_EFER | \
+ VM_EXIT_CLEAR_BNDCFGS | \
+ VM_EXIT_PT_CONCEAL_PIP | \
+ VM_EXIT_CLEAR_IA32_RTIT_CTL)
+
+#define EVMCS1_SUPPORTED_VMENTRY_CTRL \
+ (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \
+ VM_ENTRY_LOAD_DEBUG_CONTROLS | \
+ VM_ENTRY_IA32E_MODE | \
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_ENTRY_LOAD_IA32_PAT | \
+ VM_ENTRY_LOAD_IA32_EFER | \
+ VM_ENTRY_LOAD_BNDCFGS | \
+ VM_ENTRY_PT_CONCEAL_PIP | \
+ VM_ENTRY_LOAD_IA32_RTIT_CTL)
+
+#define EVMCS1_SUPPORTED_VMFUNC (0)
+
+#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
+#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \
+ {EVMCS1_OFFSET(name), clean_field}
+
+const struct evmcs_field vmcs_field_to_evmcs_1[] = {
+ /* 64 bit rw */
+ EVMCS1_FIELD(GUEST_RIP, guest_rip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(GUEST_RSP, guest_rsp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR0, host_cr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR3, host_cr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR4, host_cr4,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_RIP, host_rip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP),
+ EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP),
+ EVMCS1_FIELD(MSR_BITMAP, msr_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP),
+ EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(TSC_OFFSET, tsc_offset,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR3, guest_pdptr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR0, guest_cr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR3, guest_cr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR4, guest_cr4,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_DR7, guest_dr7,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(HOST_FS_BASE, host_fs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_GS_BASE, host_gs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_TR_BASE, host_tr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_RSP, host_rsp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(EPT_POINTER, ept_pointer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT),
+ EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(ENCLS_EXITING_BITMAP, encls_exiting_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(TSC_MULTIPLIER, tsc_multiplier,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ /*
+ * Not used by KVM:
+ *
+ * EVMCS1_FIELD(0x00006828, guest_ia32_s_cet,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ * EVMCS1_FIELD(0x0000682A, guest_ssp,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ * EVMCS1_FIELD(0x0000682C, guest_ia32_int_ssp_table_addr,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ * EVMCS1_FIELD(0x00002816, guest_ia32_lbr_ctl,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ * EVMCS1_FIELD(0x00006C18, host_ia32_s_cet,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ * EVMCS1_FIELD(0x00006C1A, host_ssp,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ * EVMCS1_FIELD(0x00006C1C, host_ia32_int_ssp_table_addr,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ */
+
+ /* 64 bit read only */
+ EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ /*
+ * Not defined in KVM:
+ *
+ * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006406, exit_io_instruction_edi,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ */
+ EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+
+ /*
+ * No mask defined in the spec as Hyper-V doesn't currently support
+ * these. Future proof by resetting the whole clean field mask on
+ * access.
+ */
+ EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+
+ /* 32 bit rw */
+ EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC),
+ EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN),
+ EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY),
+ EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE,
+ vm_entry_exception_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+
+ /* 32 bit read only */
+ EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+
+ /* No mask defined in the spec (not used) */
+ EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+
+ /* 16 bit rw */
+ EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_FS_SELECTOR, host_fs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT),
+};
+const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
+
+u64 nested_get_evmptr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (unlikely(kvm_hv_get_assist_page(vcpu)))
+ return EVMPTR_INVALID;
+
+ if (unlikely(!hv_vcpu->vp_assist_page.enlighten_vmentry))
+ return EVMPTR_INVALID;
+
+ return hv_vcpu->vp_assist_page.current_nested_vmcs;
+}
+
+uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
+{
+ /*
+ * vmcs_version represents the range of supported Enlightened VMCS
+ * versions: lower 8 bits is the minimal version, higher 8 bits is the
+ * maximum supported version. KVM supports versions from 1 to
+ * KVM_EVMCS_VERSION.
+ *
+ * Note, do not check whether Hyper-V is fully enabled in guest CPUID;
+ * this helper is used to _get_ the vCPU's supported CPUID.
+ */
+ if (kvm_cpu_cap_get(X86_FEATURE_VMX) &&
+ (!vcpu || to_vmx(vcpu)->nested.enlightened_vmcs_enabled))
+ return (KVM_EVMCS_VERSION << 8) | 1;
+
+ return 0;
+}
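+
+/*
+ * For reference: with KVM_EVMCS_VERSION == 1 the helper above returns
+ * (1 << 8) | 1 == 0x0101, i.e. the minimal and maximum supported eVMCS
+ * versions are both 1.
+ */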
+
+enum evmcs_revision {
+ EVMCSv1_LEGACY,
+ NR_EVMCS_REVISIONS,
+};
+
+enum evmcs_ctrl_type {
+ EVMCS_EXIT_CTRLS,
+ EVMCS_ENTRY_CTRLS,
+ EVMCS_EXEC_CTRL,
+ EVMCS_2NDEXEC,
+ EVMCS_3RDEXEC,
+ EVMCS_PINCTRL,
+ EVMCS_VMFUNC,
+ NR_EVMCS_CTRLS,
+};
+
+static const u32 evmcs_supported_ctrls[NR_EVMCS_CTRLS][NR_EVMCS_REVISIONS] = {
+ [EVMCS_EXIT_CTRLS] = {
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMEXIT_CTRL,
+ },
+ [EVMCS_ENTRY_CTRLS] = {
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMENTRY_CTRL,
+ },
+ [EVMCS_EXEC_CTRL] = {
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_EXEC_CTRL,
+ },
+ [EVMCS_2NDEXEC] = {
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_2NDEXEC & ~SECONDARY_EXEC_TSC_SCALING,
+ },
+ [EVMCS_3RDEXEC] = {
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_3RDEXEC,
+ },
+ [EVMCS_PINCTRL] = {
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_PINCTRL,
+ },
+ [EVMCS_VMFUNC] = {
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMFUNC,
+ },
+};
+
+static u32 evmcs_get_supported_ctls(enum evmcs_ctrl_type ctrl_type)
+{
+ enum evmcs_revision evmcs_rev = EVMCSv1_LEGACY;
+
+ return evmcs_supported_ctrls[ctrl_type][evmcs_rev];
+}
+
+static bool evmcs_has_perf_global_ctrl(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ /*
+ * PERF_GLOBAL_CTRL has a quirk where some Windows guests may fail to
+ * boot if a PV CPUID feature flag is not also set. Treat the fields
+ * as unsupported if the flag is not set in guest CPUID. This should
+ * be called only for guest accesses, and all guest accesses should be
+ * gated on Hyper-V being enabled and initialized.
+ */
+ if (WARN_ON_ONCE(!hv_vcpu))
+ return false;
+
+ return hv_vcpu->cpuid_cache.nested_ebx & HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL;
+}
+
+void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+{
+ u32 ctl_low = (u32)*pdata;
+ u32 ctl_high = (u32)(*pdata >> 32);
+ u32 supported_ctrls;
+
+ /*
+ * Hyper-V 2016 and 2019 try to use these features even when eVMCS
+ * is enabled but has no corresponding fields for them.
+ */
+ switch (msr_index) {
+ case MSR_IA32_VMX_EXIT_CTLS:
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ supported_ctrls = evmcs_get_supported_ctls(EVMCS_EXIT_CTRLS);
+ if (!evmcs_has_perf_global_ctrl(vcpu))
+ supported_ctrls &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ ctl_high &= supported_ctrls;
+ break;
+ case MSR_IA32_VMX_ENTRY_CTLS:
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ supported_ctrls = evmcs_get_supported_ctls(EVMCS_ENTRY_CTRLS);
+ if (!evmcs_has_perf_global_ctrl(vcpu))
+ supported_ctrls &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ ctl_high &= supported_ctrls;
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS:
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ ctl_high &= evmcs_get_supported_ctls(EVMCS_EXEC_CTRL);
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ ctl_high &= evmcs_get_supported_ctls(EVMCS_2NDEXEC);
+ break;
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+ case MSR_IA32_VMX_PINBASED_CTLS:
+ ctl_high &= evmcs_get_supported_ctls(EVMCS_PINCTRL);
+ break;
+ case MSR_IA32_VMX_VMFUNC:
+ ctl_low &= evmcs_get_supported_ctls(EVMCS_VMFUNC);
+ break;
+ }
+
+ *pdata = ctl_low | ((u64)ctl_high << 32);
+}
+
+static bool nested_evmcs_is_valid_controls(enum evmcs_ctrl_type ctrl_type,
+ u32 val)
+{
+ return !(val & ~evmcs_get_supported_ctls(ctrl_type));
+}
+
+int nested_evmcs_check_controls(struct vmcs12 *vmcs12)
+{
+ if (CC(!nested_evmcs_is_valid_controls(EVMCS_PINCTRL,
+ vmcs12->pin_based_vm_exec_control)))
+ return -EINVAL;
+
+ if (CC(!nested_evmcs_is_valid_controls(EVMCS_EXEC_CTRL,
+ vmcs12->cpu_based_vm_exec_control)))
+ return -EINVAL;
+
+ if (CC(!nested_evmcs_is_valid_controls(EVMCS_2NDEXEC,
+ vmcs12->secondary_vm_exec_control)))
+ return -EINVAL;
+
+ if (CC(!nested_evmcs_is_valid_controls(EVMCS_EXIT_CTRLS,
+ vmcs12->vm_exit_controls)))
+ return -EINVAL;
+
+ if (CC(!nested_evmcs_is_valid_controls(EVMCS_ENTRY_CTRLS,
+ vmcs12->vm_entry_controls)))
+ return -EINVAL;
+
+ /*
+ * VM-Func controls are 64-bit, but KVM currently doesn't support any
+ * controls in bits 63:32, i.e. dropping those bits on the consistency
+ * check is intentional.
+ */
+ if (WARN_ON_ONCE(vmcs12->vm_function_control >> 32))
+ return -EINVAL;
+
+ if (CC(!nested_evmcs_is_valid_controls(EVMCS_VMFUNC,
+ vmcs12->vm_function_control)))
+ return -EINVAL;
+
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_HYPERV)
+DEFINE_STATIC_KEY_FALSE(__kvm_is_using_evmcs);
+
+/*
+ * KVM on Hyper-V always uses the latest known eVMCSv1 revision; the
+ * assumption is that if a feature has corresponding fields described in
+ * eVMCS and was exposed in the VMX feature MSRs, KVM is free to use it.
+ * Warn if KVM encounters a feature which has no corresponding eVMCS
+ * field; this likely means that KVM needs to be updated.
+ */
+#define evmcs_check_vmcs_conf(field, ctrl) \
+ do { \
+ typeof(vmcs_conf->field) unsupported; \
+ \
+ unsupported = vmcs_conf->field & ~EVMCS1_SUPPORTED_ ## ctrl; \
+ if (unsupported) { \
+ pr_warn_once(#field " unsupported with eVMCS: 0x%llx\n",\
+ (u64)unsupported); \
+ vmcs_conf->field &= EVMCS1_SUPPORTED_ ## ctrl; \
+ } \
+ } \
+ while (0)
+
+void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
+{
+ evmcs_check_vmcs_conf(cpu_based_exec_ctrl, EXEC_CTRL);
+ evmcs_check_vmcs_conf(pin_based_exec_ctrl, PINCTRL);
+ evmcs_check_vmcs_conf(cpu_based_2nd_exec_ctrl, 2NDEXEC);
+ evmcs_check_vmcs_conf(cpu_based_3rd_exec_ctrl, 3RDEXEC);
+ evmcs_check_vmcs_conf(vmentry_ctrl, VMENTRY_CTRL);
+ evmcs_check_vmcs_conf(vmexit_ctrl, VMEXIT_CTRL);
+}
+#endif
+
+int nested_enable_evmcs(struct kvm_vcpu *vcpu,
+ uint16_t *vmcs_version)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ vmx->nested.enlightened_vmcs_enabled = true;
+
+ if (vmcs_version)
+ *vmcs_version = nested_get_evmcs_version(vcpu);
+
+ return 0;
+}
+
+bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+
+ if (!hv_vcpu || !evmcs)
+ return false;
+
+ if (!evmcs->hv_enlightenments_control.nested_flush_hypercall)
+ return false;
+
+ return hv_vcpu->vp_assist_page.nested_control.features.directhypercall;
+}
+
+void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ nested_vmx_vmexit(vcpu, HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH, 0, 0);
+}
diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h
new file mode 100644
index 000000000..9623fe165
--- /dev/null
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -0,0 +1,200 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_HYPERV_H
+#define __KVM_X86_VMX_HYPERV_H
+
+#include <linux/jump_label.h>
+
+#include <asm/hyperv-tlfs.h>
+#include <asm/mshyperv.h>
+#include <asm/vmx.h>
+
+#include "../hyperv.h"
+
+#include "capabilities.h"
+#include "vmcs.h"
+#include "vmcs12.h"
+
+struct vmcs_config;
+
+#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs))
+
+#define KVM_EVMCS_VERSION 1
+
+struct evmcs_field {
+ u16 offset;
+ u16 clean_field;
+};
+
+extern const struct evmcs_field vmcs_field_to_evmcs_1[];
+extern const unsigned int nr_evmcs_1_fields;
+
+static __always_inline int evmcs_field_offset(unsigned long field,
+ u16 *clean_field)
+{
+ unsigned int index = ROL16(field, 6);
+ const struct evmcs_field *evmcs_field;
+
+ if (unlikely(index >= nr_evmcs_1_fields))
+ return -ENOENT;
+
+ evmcs_field = &vmcs_field_to_evmcs_1[index];
+
+ /*
+ * Use offset=0 to detect holes in eVMCS. This offset belongs to
+ * 'revision_id' but this field has no encoding and is supposed to
+ * be accessed directly.
+ */
+ if (unlikely(!evmcs_field->offset))
+ return -ENOENT;
+
+ if (clean_field)
+ *clean_field = evmcs_field->clean_field;
+
+ return evmcs_field->offset;
+}
+
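+/*
+ * Worked example, not part of the upstream file: the table index above is the
+ * 16-bit VMCS field encoding rotated left by 6 bits, i.e.
+ *
+ *	index = ((field << 6) | (field >> 10)) & 0xffff;
+ *
+ * e.g. encoding 0x681e (GUEST_RIP) yields (0x0780 | 0x001a) = 0x079a.
+ */
+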
+static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs,
+ unsigned long field, u16 offset)
+{
+ /*
+	 * vmcs12_read_any() doesn't care whether the supplied structure
+	 * is 'struct vmcs12' or 'struct hv_enlightened_vmcs' as it takes
+	 * the exact offset of the required field; use it here for
+	 * convenience.
+ */
+ return vmcs12_read_any((void *)evmcs, field, offset);
+}
+
+#if IS_ENABLED(CONFIG_HYPERV)
+
+DECLARE_STATIC_KEY_FALSE(__kvm_is_using_evmcs);
+
+static __always_inline bool kvm_is_using_evmcs(void)
+{
+ return static_branch_unlikely(&__kvm_is_using_evmcs);
+}
+
+static __always_inline int get_evmcs_offset(unsigned long field,
+ u16 *clean_field)
+{
+ int offset = evmcs_field_offset(field, clean_field);
+
+ WARN_ONCE(offset < 0, "accessing unsupported EVMCS field %lx\n", field);
+ return offset;
+}
+
+static __always_inline void evmcs_write64(unsigned long field, u64 value)
+{
+ u16 clean_field;
+ int offset = get_evmcs_offset(field, &clean_field);
+
+ if (offset < 0)
+ return;
+
+ *(u64 *)((char *)current_evmcs + offset) = value;
+
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static __always_inline void evmcs_write32(unsigned long field, u32 value)
+{
+ u16 clean_field;
+ int offset = get_evmcs_offset(field, &clean_field);
+
+ if (offset < 0)
+ return;
+
+ *(u32 *)((char *)current_evmcs + offset) = value;
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static __always_inline void evmcs_write16(unsigned long field, u16 value)
+{
+ u16 clean_field;
+ int offset = get_evmcs_offset(field, &clean_field);
+
+ if (offset < 0)
+ return;
+
+ *(u16 *)((char *)current_evmcs + offset) = value;
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static __always_inline u64 evmcs_read64(unsigned long field)
+{
+ int offset = get_evmcs_offset(field, NULL);
+
+ if (offset < 0)
+ return 0;
+
+ return *(u64 *)((char *)current_evmcs + offset);
+}
+
+static __always_inline u32 evmcs_read32(unsigned long field)
+{
+ int offset = get_evmcs_offset(field, NULL);
+
+ if (offset < 0)
+ return 0;
+
+ return *(u32 *)((char *)current_evmcs + offset);
+}
+
+static __always_inline u16 evmcs_read16(unsigned long field)
+{
+ int offset = get_evmcs_offset(field, NULL);
+
+ if (offset < 0)
+ return 0;
+
+ return *(u16 *)((char *)current_evmcs + offset);
+}
+
+static inline void evmcs_load(u64 phys_addr)
+{
+ struct hv_vp_assist_page *vp_ap =
+ hv_get_vp_assist_page(smp_processor_id());
+
+ if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall)
+ vp_ap->nested_control.features.directhypercall = 1;
+ vp_ap->current_nested_vmcs = phys_addr;
+ vp_ap->enlighten_vmentry = 1;
+}
+
+void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
+#else /* !IS_ENABLED(CONFIG_HYPERV) */
+static __always_inline bool kvm_is_using_evmcs(void) { return false; }
+static __always_inline void evmcs_write64(unsigned long field, u64 value) {}
+static __always_inline void evmcs_write32(unsigned long field, u32 value) {}
+static __always_inline void evmcs_write16(unsigned long field, u16 value) {}
+static __always_inline u64 evmcs_read64(unsigned long field) { return 0; }
+static __always_inline u32 evmcs_read32(unsigned long field) { return 0; }
+static __always_inline u16 evmcs_read16(unsigned long field) { return 0; }
+static inline void evmcs_load(u64 phys_addr) {}
+#endif /* IS_ENABLED(CONFIG_HYPERV) */
+
+#define EVMPTR_INVALID (-1ULL)
+#define EVMPTR_MAP_PENDING (-2ULL)
+
+static inline bool evmptr_is_valid(u64 evmptr)
+{
+ return evmptr != EVMPTR_INVALID && evmptr != EVMPTR_MAP_PENDING;
+}
+
+enum nested_evmptrld_status {
+ EVMPTRLD_DISABLED,
+ EVMPTRLD_SUCCEEDED,
+ EVMPTRLD_VMFAIL,
+ EVMPTRLD_ERROR,
+};
+
+u64 nested_get_evmptr(struct kvm_vcpu *vcpu);
+uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu);
+int nested_enable_evmcs(struct kvm_vcpu *vcpu,
+ uint16_t *vmcs_version);
+void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
+int nested_evmcs_check_controls(struct vmcs12 *vmcs12);
+bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu);
+void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu);
+
+#endif /* __KVM_X86_VMX_HYPERV_H */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
new file mode 100644
index 000000000..c5ec0ef51
--- /dev/null
+++ b/arch/x86/kvm/vmx/nested.c
@@ -0,0 +1,7102 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/objtool.h>
+#include <linux/percpu.h>
+
+#include <asm/debugreg.h>
+#include <asm/mmu_context.h>
+
+#include "cpuid.h"
+#include "hyperv.h"
+#include "mmu.h"
+#include "nested.h"
+#include "pmu.h"
+#include "sgx.h"
+#include "trace.h"
+#include "vmx.h"
+#include "x86.h"
+#include "smm.h"
+
+static bool __read_mostly enable_shadow_vmcs = 1;
+module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
+
+static bool __read_mostly nested_early_check = 0;
+module_param(nested_early_check, bool, S_IRUGO);
+
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
+
+/*
+ * Hyper-V requires all of these, so mark them as supported even though
+ * they are just treated the same as all-context.
+ */
+#define VMX_VPID_EXTENT_SUPPORTED_MASK \
+ (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \
+ VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \
+ VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \
+ VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
+
+#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
+
+enum {
+ VMX_VMREAD_BITMAP,
+ VMX_VMWRITE_BITMAP,
+ VMX_BITMAP_NR
+};
+static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
+
+#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
+#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
+
+struct shadow_vmcs_field {
+ u16 encoding;
+ u16 offset;
+};
+static struct shadow_vmcs_field shadow_read_only_fields[] = {
+#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
+#include "vmcs_shadow_fields.h"
+};
+static int max_shadow_read_only_fields =
+ ARRAY_SIZE(shadow_read_only_fields);
+
+static struct shadow_vmcs_field shadow_read_write_fields[] = {
+#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
+#include "vmcs_shadow_fields.h"
+};
+static int max_shadow_read_write_fields =
+ ARRAY_SIZE(shadow_read_write_fields);
+
+static void init_vmcs_shadow_fields(void)
+{
+ int i, j;
+
+ memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
+ memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
+
+ for (i = j = 0; i < max_shadow_read_only_fields; i++) {
+ struct shadow_vmcs_field entry = shadow_read_only_fields[i];
+ u16 field = entry.encoding;
+
+ if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
+ (i + 1 == max_shadow_read_only_fields ||
+ shadow_read_only_fields[i + 1].encoding != field + 1))
+ pr_err("Missing field from shadow_read_only_field %x\n",
+ field + 1);
+
+ clear_bit(field, vmx_vmread_bitmap);
+ if (field & 1)
+#ifdef CONFIG_X86_64
+ continue;
+#else
+ entry.offset += sizeof(u32);
+#endif
+ shadow_read_only_fields[j++] = entry;
+ }
+ max_shadow_read_only_fields = j;
+
+ for (i = j = 0; i < max_shadow_read_write_fields; i++) {
+ struct shadow_vmcs_field entry = shadow_read_write_fields[i];
+ u16 field = entry.encoding;
+
+ if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
+ (i + 1 == max_shadow_read_write_fields ||
+ shadow_read_write_fields[i + 1].encoding != field + 1))
+ pr_err("Missing field from shadow_read_write_field %x\n",
+ field + 1);
+
+ WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
+ field <= GUEST_TR_AR_BYTES,
+ "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
+
+ /*
+ * PML and the preemption timer can be emulated, but the
+ * processor cannot vmwrite to fields that don't exist
+ * on bare metal.
+ */
+ switch (field) {
+ case GUEST_PML_INDEX:
+ if (!cpu_has_vmx_pml())
+ continue;
+ break;
+ case VMX_PREEMPTION_TIMER_VALUE:
+ if (!cpu_has_vmx_preemption_timer())
+ continue;
+ break;
+ case GUEST_INTR_STATUS:
+ if (!cpu_has_vmx_apicv())
+ continue;
+ break;
+ default:
+ break;
+ }
+
+ clear_bit(field, vmx_vmwrite_bitmap);
+ clear_bit(field, vmx_vmread_bitmap);
+ if (field & 1)
+#ifdef CONFIG_X86_64
+ continue;
+#else
+ entry.offset += sizeof(u32);
+#endif
+ shadow_read_write_fields[j++] = entry;
+ }
+ max_shadow_read_write_fields = j;
+}
+
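+/*
+ * Illustrative note, not part of the upstream file: odd field encodings are
+ * the "high" halves of 64-bit fields (the even encoding of the low half plus
+ * one). The loops above drop them on 64-bit hosts, where the full field is
+ * accessed via the even encoding, and keep them on 32-bit hosts with the
+ * offset bumped by sizeof(u32) so accesses hit the upper four bytes.
+ */
+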
+/*
+ * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
+ * set the success or error code of an emulated VMX instruction (as specified
+ * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
+ * instruction.
+ */
+static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
+{
+ vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
+ & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
+{
+ vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+ & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
+ X86_EFLAGS_SF | X86_EFLAGS_OF))
+ | X86_EFLAGS_CF);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
+ u32 vm_instruction_error)
+{
+ vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+ & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_SF | X86_EFLAGS_OF))
+ | X86_EFLAGS_ZF);
+ get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
+ /*
+ * We don't need to force sync to shadow VMCS because
+ * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
+ * fields and thus must be synced.
+ */
+ if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
+ to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * failValid writes the error number to the current VMCS, which
+ * can't be done if there isn't a current VMCS.
+ */
+ if (vmx->nested.current_vmptr == INVALID_GPA &&
+ !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
+ return nested_vmx_failInvalid(vcpu);
+
+ return nested_vmx_failValid(vcpu, vm_instruction_error);
+}
+
+static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
+{
+	/* TODO: don't simply reset the guest here. */
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator);
+}
+
+static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
+{
+ return fixed_bits_valid(control, low, high);
+}
+
+static inline u64 vmx_control_msr(u32 low, u32 high)
+{
+ return low | ((u64)high << 32);
+}
+
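+/*
+ * Worked example, not part of the upstream file: vmx_control_msr() packs the
+ * split halves of a VMX control MSR, e.g. with assumed values:
+ *
+ *	vmx_control_msr(0x00000016, 0xfff9fffe) == 0xfff9fffe00000016ULL
+ *
+ * Per the SDM, the low half reports the allowed-0 settings (bits that must
+ * be 1) and the high half the allowed-1 settings (bits that may be 1).
+ */
+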
+static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
+{
+ secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
+ vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
+ vmx->nested.need_vmcs12_to_shadow_sync = false;
+}
+
+static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
+ kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
+ vmx->nested.hv_evmcs = NULL;
+ }
+
+ vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
+
+ if (hv_vcpu) {
+ hv_vcpu->nested.pa_page_gpa = INVALID_GPA;
+ hv_vcpu->nested.vm_id = 0;
+ hv_vcpu->nested.vp_id = 0;
+ }
+}
+
+static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
+ struct loaded_vmcs *prev)
+{
+ struct vmcs_host_state *dest, *src;
+
+ if (unlikely(!vmx->guest_state_loaded))
+ return;
+
+ src = &prev->host_state;
+ dest = &vmx->loaded_vmcs->host_state;
+
+ vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
+ dest->ldt_sel = src->ldt_sel;
+#ifdef CONFIG_X86_64
+ dest->ds_sel = src->ds_sel;
+ dest->es_sel = src->es_sel;
+#endif
+}
+
+static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct loaded_vmcs *prev;
+ int cpu;
+
+ if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
+ return;
+
+ cpu = get_cpu();
+ prev = vmx->loaded_vmcs;
+ vmx->loaded_vmcs = vmcs;
+ vmx_vcpu_load_vmcs(vcpu, cpu, prev);
+ vmx_sync_vmcs_host_state(vmx, prev);
+ put_cpu();
+
+ vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;
+
+ /*
+ * All lazily updated registers will be reloaded from VMCS12 on both
+ * vmentry and vmexit.
+ */
+ vcpu->arch.regs_dirty = 0;
+}
+
+/*
+ * Free whatever needs to be freed from vmx->nested when L1 goes down, or
+ * just stops using VMX.
+ */
+static void free_nested(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+
+ if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
+ return;
+
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+
+ vmx->nested.vmxon = false;
+ vmx->nested.smm.vmxon = false;
+ vmx->nested.vmxon_ptr = INVALID_GPA;
+ free_vpid(vmx->nested.vpid02);
+ vmx->nested.posted_intr_nv = -1;
+ vmx->nested.current_vmptr = INVALID_GPA;
+ if (enable_shadow_vmcs) {
+ vmx_disable_shadow_vmcs(vmx);
+ vmcs_clear(vmx->vmcs01.shadow_vmcs);
+ free_vmcs(vmx->vmcs01.shadow_vmcs);
+ vmx->vmcs01.shadow_vmcs = NULL;
+ }
+ kfree(vmx->nested.cached_vmcs12);
+ vmx->nested.cached_vmcs12 = NULL;
+ kfree(vmx->nested.cached_shadow_vmcs12);
+ vmx->nested.cached_shadow_vmcs12 = NULL;
+ /*
+ * Unpin physical memory we referred to in the vmcs02. The APIC access
+ * page's backing page (yeah, confusing) shouldn't actually be accessed,
+ * and if it is written, the contents are irrelevant.
+ */
+ kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false);
+ kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
+ kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
+ vmx->nested.pi_desc = NULL;
+
+ kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+
+ nested_release_evmcs(vcpu);
+
+ free_loaded_vmcs(&vmx->nested.vmcs02);
+}
+
+/*
+ * Ensure that the current vmcs of the logical processor is the
+ * vmcs01 of the vcpu before calling free_nested().
+ */
+void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
+{
+ vcpu_load(vcpu);
+ vmx_leave_nested(vcpu);
+ vcpu_put(vcpu);
+}
+
+#define EPTP_PA_MASK GENMASK_ULL(51, 12)
+
+static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
+{
+ return VALID_PAGE(root_hpa) &&
+ ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
+}
+
+static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
+ gpa_t addr)
+{
+ unsigned long roots = 0;
+ uint i;
+ struct kvm_mmu_root_info *cached_root;
+
+ WARN_ON_ONCE(!mmu_is_nested(vcpu));
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ cached_root = &vcpu->arch.mmu->prev_roots[i];
+
+ if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
+ eptp))
+ roots |= KVM_MMU_ROOT_PREVIOUS(i);
+ }
+ if (roots)
+ kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots);
+}
+
+static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 vm_exit_reason;
+ unsigned long exit_qualification = vcpu->arch.exit_qualification;
+
+ if (vmx->nested.pml_full) {
+ vm_exit_reason = EXIT_REASON_PML_FULL;
+ vmx->nested.pml_full = false;
+ exit_qualification &= INTR_INFO_UNBLOCK_NMI;
+ } else {
+ if (fault->error_code & PFERR_RSVD_MASK)
+ vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
+ else
+ vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
+
+ /*
+ * Although the caller (kvm_inject_emulated_page_fault) would
+ * have already synced the faulting address in the shadow EPT
+ * tables for the current EPTP12, we also need to sync it for
+ * any other cached EPTP02s based on the same EP4TA, since the
+ * TLB associates mappings to the EP4TA rather than the full EPTP.
+ */
+ nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
+ fault->address);
+ }
+
+ nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
+ vmcs12->guest_physical_address = fault->address;
+}
+
+static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
+ int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);
+
+ kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
+ nested_ept_ad_enabled(vcpu),
+ nested_ept_get_eptp(vcpu));
+}
+
+static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+ WARN_ON(mmu_is_nested(vcpu));
+
+ vcpu->arch.mmu = &vcpu->arch.guest_mmu;
+ nested_ept_new_eptp(vcpu);
+ vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
+ vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
+ vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;
+
+ vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
+}
+
+static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.mmu = &vcpu->arch.root_mmu;
+ vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
+}
+
+static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
+ u16 error_code)
+{
+ bool inequality, bit;
+
+ bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
+ inequality =
+ (error_code & vmcs12->page_fault_error_code_mask) !=
+ vmcs12->page_fault_error_code_match;
+ return inequality ^ bit;
+}
+
+static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
+ u32 error_code)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ /*
+ * Drop bits 31:16 of the error code when performing the #PF mask+match
+ * check. All VMCS fields involved are 32 bits, but Intel CPUs never
+ * set bits 31:16 and VMX disallows setting bits 31:16 in the injected
+ * error code. Including the to-be-dropped bits in the check might
+ * result in an "impossible" or missed exit from L1's perspective.
+ */
+ if (vector == PF_VECTOR)
+ return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code);
+
+ return (vmcs12->exception_bitmap & (1u << vector));
+}
+
+static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ return 0;
+
+ if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
+ CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+ return 0;
+
+ if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
+ return 0;
+
+ if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1
 * itself utilizing x2APIC. All MSRs were previously set to be intercepted;
 * only the "disable intercept" case needs to be handled.
+ */
+static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1,
+ unsigned long *msr_bitmap_l0,
+ u32 msr, int type)
+{
+ if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr))
+ vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr);
+
+ if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr))
+ vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr);
+}
+
+static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
+{
+ int msr;
+
+ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+ unsigned word = msr / BITS_PER_LONG;
+
+ msr_bitmap[word] = ~0;
+ msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
+ }
+}
+
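+/*
+ * Illustrative note, not part of the upstream file: an MSR bitmap is one 4K
+ * page; read intercepts for MSRs 0x0-0x1fff live in bytes 0x000-0x3ff and
+ * write intercepts for the same MSRs start at byte offset 0x800. On a 64-bit
+ * host the loop above thus sets:
+ *
+ *	words 32-35   (0x800 / 64 ..)     - reads  of MSRs 0x800-0x8ff
+ *	words 288-291 (32 + 0x800 / 8 ..) - writes of MSRs 0x800-0x8ff
+ */
+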
+#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \
+static inline \
+void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \
+ unsigned long *msr_bitmap_l1, \
+ unsigned long *msr_bitmap_l0, u32 msr) \
+{ \
+ if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \
+ vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \
+ vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \
+ else \
+ vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \
+}
+BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
+BUILD_NVMX_MSR_INTERCEPT_HELPER(write)
+
+static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
+ unsigned long *msr_bitmap_l1,
+ unsigned long *msr_bitmap_l0,
+ u32 msr, int types)
+{
+ if (types & MSR_TYPE_R)
+ nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
+ msr_bitmap_l0, msr);
+ if (types & MSR_TYPE_W)
+ nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
+ msr_bitmap_l0, msr);
+}
+
+/*
+ * Merge L0's and L1's MSR bitmaps; return false to indicate that
+ * the hardware MSR bitmap should not be used.
+ */
+static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int msr;
+ unsigned long *msr_bitmap_l1;
+ unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
+ struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+ struct kvm_host_map *map = &vmx->nested.msr_bitmap_map;
+
+ /* Nothing to do if the MSR bitmap is not in use. */
+ if (!cpu_has_vmx_msr_bitmap() ||
+ !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+ return false;
+
+ /*
+ * MSR bitmap update can be skipped when:
+ * - MSR bitmap for L1 hasn't changed.
+ * - Nested hypervisor (L1) is attempting to launch the same L2 as
+ * before.
+ * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
+ * and tells KVM (L0) there were no changes in MSR bitmap for L2.
+ */
+ if (!vmx->nested.force_msr_bitmap_recalc && evmcs &&
+ evmcs->hv_enlightenments_control.msr_bitmap &&
+ evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
+ return true;
+
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
+ return false;
+
+ msr_bitmap_l1 = (unsigned long *)map->hva;
+
+ /*
+ * To keep the control flow simple, pay eight 8-byte writes (sixteen
+ * 4-byte writes on 32-bit systems) up front to enable intercepts for
+ * the x2APIC MSR range and selectively toggle those relevant to L2.
+ */
+ enable_x2apic_msr_intercepts(msr_bitmap_l0);
+
+ if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
+ if (nested_cpu_has_apic_reg_virt(vmcs12)) {
+ /*
+			 * L0 need not intercept reads for MSRs between 0x800
+			 * and 0x8ff; it just lets the processor take the value
+ * from the virtual-APIC page; take those 256 bits
+ * directly from the L1 bitmap.
+ */
+ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+ unsigned word = msr / BITS_PER_LONG;
+
+ msr_bitmap_l0[word] = msr_bitmap_l1[word];
+ }
+ }
+
+ nested_vmx_disable_intercept_for_x2apic_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ X2APIC_MSR(APIC_TASKPRI),
+ MSR_TYPE_R | MSR_TYPE_W);
+
+ if (nested_cpu_has_vid(vmcs12)) {
+ nested_vmx_disable_intercept_for_x2apic_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ X2APIC_MSR(APIC_EOI),
+ MSR_TYPE_W);
+ nested_vmx_disable_intercept_for_x2apic_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ X2APIC_MSR(APIC_SELF_IPI),
+ MSR_TYPE_W);
+ }
+ }
+
+ /*
+ * Always check vmcs01's bitmap to honor userspace MSR filters and any
+ * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
+ */
+#ifdef CONFIG_X86_64
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_FS_BASE, MSR_TYPE_RW);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_GS_BASE, MSR_TYPE_RW);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+#endif
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_PRED_CMD, MSR_TYPE_W);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_FLUSH_CMD, MSR_TYPE_W);
+
+ kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
+
+ vmx->nested.force_msr_bitmap_recalc = false;
+
+ return true;
+}
+
+static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
+
+ if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
+ vmcs12->vmcs_link_pointer == INVALID_GPA)
+ return;
+
+ if (ghc->gpa != vmcs12->vmcs_link_pointer &&
+ kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
+ vmcs12->vmcs_link_pointer, VMCS12_SIZE))
+ return;
+
+ kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
+ VMCS12_SIZE);
+}
+
+static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
+
+ if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
+ vmcs12->vmcs_link_pointer == INVALID_GPA)
+ return;
+
+ if (ghc->gpa != vmcs12->vmcs_link_pointer &&
+ kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
+ vmcs12->vmcs_link_pointer, VMCS12_SIZE))
+ return;
+
+ kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
+ VMCS12_SIZE);
+}
+
+/*
+ * In nested virtualization, check whether L1 has set
+ * VM_EXIT_ACK_INTR_ON_EXIT.
+ */
+static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
+{
+ return get_vmcs12(vcpu)->vm_exit_controls &
+ VM_EXIT_ACK_INTR_ON_EXIT;
+}
+
+static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+ CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
+ return -EINVAL;
+ else
+ return 0;
+}
+
+static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+ !nested_cpu_has_apic_reg_virt(vmcs12) &&
+ !nested_cpu_has_vid(vmcs12) &&
+ !nested_cpu_has_posted_intr(vmcs12))
+ return 0;
+
+ /*
+ * If virtualize x2apic mode is enabled,
+ * virtualize apic access must be disabled.
+ */
+ if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+ nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
+ return -EINVAL;
+
+ /*
+ * If virtual interrupt delivery is enabled,
+ * we must exit on external interrupts.
+ */
+ if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
+ return -EINVAL;
+
+ /*
+	 * Bits 15:8 should be zero in posted_intr_nv; the descriptor
+	 * address has already been checked in nested_get_vmcs12_pages.
+ *
+ * bits 5:0 of posted_intr_desc_addr should be zero.
+ */
+ if (nested_cpu_has_posted_intr(vmcs12) &&
+ (CC(!nested_cpu_has_vid(vmcs12)) ||
+ CC(!nested_exit_intr_ack_set(vcpu)) ||
+ CC((vmcs12->posted_intr_nv & 0xff00)) ||
+ CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
+ return -EINVAL;
+
+ /* tpr shadow is needed by all apicv features. */
+ if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
+ u32 count, u64 addr)
+{
+ if (count == 0)
+ return 0;
+
+ if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
+ !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (CC(nested_vmx_check_msr_switch(vcpu,
+ vmcs12->vm_exit_msr_load_count,
+ vmcs12->vm_exit_msr_load_addr)) ||
+ CC(nested_vmx_check_msr_switch(vcpu,
+ vmcs12->vm_exit_msr_store_count,
+ vmcs12->vm_exit_msr_store_addr)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (CC(nested_vmx_check_msr_switch(vcpu,
+ vmcs12->vm_entry_msr_load_count,
+ vmcs12->vm_entry_msr_load_addr)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has_pml(vmcs12))
+ return 0;
+
+ if (CC(!nested_cpu_has_ept(vmcs12)) ||
+ CC(!page_address_valid(vcpu, vmcs12->pml_address)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
+ !nested_cpu_has_ept(vmcs12)))
+ return -EINVAL;
+ return 0;
+}
+
+static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
+ !nested_cpu_has_ept(vmcs12)))
+ return -EINVAL;
+ return 0;
+}
+
+static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has_shadow_vmcs(vmcs12))
+ return 0;
+
+ if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
+ CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
+ struct vmx_msr_entry *e)
+{
+ /* x2APIC MSR accesses are not allowed */
+ if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
+ return -EINVAL;
+ if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
+ CC(e->index == MSR_IA32_UCODE_REV))
+ return -EINVAL;
+ if (CC(e->reserved != 0))
+ return -EINVAL;
+ return 0;
+}
+
+static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
+ struct vmx_msr_entry *e)
+{
+ if (CC(e->index == MSR_FS_BASE) ||
+ CC(e->index == MSR_GS_BASE) ||
+ CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
+ nested_vmx_msr_check_common(vcpu, e))
+ return -EINVAL;
+ return 0;
+}
+
+static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
+ struct vmx_msr_entry *e)
+{
+ if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
+ nested_vmx_msr_check_common(vcpu, e))
+ return -EINVAL;
+ return 0;
+}
+
+static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
+ vmx->nested.msrs.misc_high);
+
+ return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
+}
+
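+/*
+ * Worked example, not part of the upstream file: bits 27:25 of IA32_VMX_MISC
+ * report N, and the recommended maximum number of MSRs in an atomic switch
+ * list is 512 * (N + 1); with N == 0, the helper above returns
+ * (0 + 1) * 512 = 512 entries.
+ */
+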
+/*
+ * Load the guest's/host's MSRs at nested entry/exit.
+ * Return 0 for success, the failing entry's (1-based) index on failure.
+ *
+ * One of the failure modes for MSR load/store is when a list exceeds the
+ * virtual hardware's capacity. To maintain compatibility with hardware as much
+ * as possible, process all valid entries before failing rather than prechecking
+ * for a capacity violation.
+ */
+static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+ u32 i;
+ struct vmx_msr_entry e;
+ u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
+
+ for (i = 0; i < count; i++) {
+ if (unlikely(i >= max_msr_list_size))
+ goto fail;
+
+ if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
+ &e, sizeof(e))) {
+ pr_debug_ratelimited(
+ "%s cannot read MSR entry (%u, 0x%08llx)\n",
+ __func__, i, gpa + i * sizeof(e));
+ goto fail;
+ }
+ if (nested_vmx_load_msr_check(vcpu, &e)) {
+ pr_debug_ratelimited(
+ "%s check failed (%u, 0x%x, 0x%x)\n",
+ __func__, i, e.index, e.reserved);
+ goto fail;
+ }
+ if (kvm_set_msr(vcpu, e.index, e.value)) {
+ pr_debug_ratelimited(
+ "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
+ __func__, i, e.index, e.value);
+ goto fail;
+ }
+ }
+ return 0;
+fail:
+ /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
+ return i + 1;
+}
+
+static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
+ u32 msr_index,
+ u64 *data)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * If the L0 hypervisor stored a more accurate value for the TSC that
+ * does not include the time taken for emulation of the L2->L1
+ * VM-exit in L0, use the more accurate value.
+ */
+ if (msr_index == MSR_IA32_TSC) {
+ int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
+ MSR_IA32_TSC);
+
+ if (i >= 0) {
+ u64 val = vmx->msr_autostore.guest.val[i].value;
+
+ *data = kvm_read_l1_tsc(vcpu, val);
+ return true;
+ }
+ }
+
+ if (kvm_get_msr(vcpu, msr_index, data)) {
+ pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
+ msr_index);
+ return false;
+ }
+ return true;
+}
+
+static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
+ struct vmx_msr_entry *e)
+{
+ if (kvm_vcpu_read_guest(vcpu,
+ gpa + i * sizeof(*e),
+ e, 2 * sizeof(u32))) {
+ pr_debug_ratelimited(
+ "%s cannot read MSR entry (%u, 0x%08llx)\n",
+ __func__, i, gpa + i * sizeof(*e));
+ return false;
+ }
+ if (nested_vmx_store_msr_check(vcpu, e)) {
+ pr_debug_ratelimited(
+ "%s check failed (%u, 0x%x, 0x%x)\n",
+ __func__, i, e->index, e->reserved);
+ return false;
+ }
+ return true;
+}
+
+static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+ u64 data;
+ u32 i;
+ struct vmx_msr_entry e;
+ u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
+
+ for (i = 0; i < count; i++) {
+ if (unlikely(i >= max_msr_list_size))
+ return -EINVAL;
+
+ if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
+ return -EINVAL;
+
+ if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
+ return -EINVAL;
+
+ if (kvm_vcpu_write_guest(vcpu,
+ gpa + i * sizeof(e) +
+ offsetof(struct vmx_msr_entry, value),
+ &data, sizeof(data))) {
+ pr_debug_ratelimited(
+ "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
+ __func__, i, e.index, data);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ u32 count = vmcs12->vm_exit_msr_store_count;
+ u64 gpa = vmcs12->vm_exit_msr_store_addr;
+ struct vmx_msr_entry e;
+ u32 i;
+
+ for (i = 0; i < count; i++) {
+ if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
+ return false;
+
+ if (e.index == msr_index)
+ return true;
+ }
+ return false;
+}
+
+static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
+ u32 msr_index)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
+ bool in_vmcs12_store_list;
+ int msr_autostore_slot;
+ bool in_autostore_list;
+ int last;
+
+ msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
+ in_autostore_list = msr_autostore_slot >= 0;
+ in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
+
+ if (in_vmcs12_store_list && !in_autostore_list) {
+ if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
+ /*
+ * Emulated VMEntry does not fail here. Instead a less
+ * accurate value will be returned by
+ * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
+ * instead of reading the value from the vmcs02 VMExit
+ * MSR-store area.
+ */
+ pr_warn_ratelimited(
+ "Not enough msr entries in msr_autostore. Can't add msr %x\n",
+ msr_index);
+ return;
+ }
+ last = autostore->nr++;
+ autostore->val[last].index = msr_index;
+ } else if (!in_vmcs12_store_list && in_autostore_list) {
+ last = --autostore->nr;
+ autostore->val[msr_autostore_slot] = autostore->val[last];
+ }
+}
+
+/*
+ * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are
+ * emulating VM-Entry into a guest with EPT enabled. On failure, the expected
+ * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
+ * @entry_failure_code.
+ */
+static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
+ bool nested_ept, bool reload_pdptrs,
+ enum vm_entry_failure_code *entry_failure_code)
+{
+ if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
+ return -EINVAL;
+ }
+
+ /*
+ * If PAE paging and EPT are both on, CR3 is not used by the CPU and
+ * must not be dereferenced.
+ */
+ if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
+ CC(!load_pdptrs(vcpu, cr3))) {
+ *entry_failure_code = ENTRY_FAIL_PDPTE;
+ return -EINVAL;
+ }
+
+ vcpu->arch.cr3 = cr3;
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+
+ /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
+ kvm_init_mmu(vcpu);
+
+ if (!nested_ept)
+ kvm_mmu_new_pgd(vcpu, cr3);
+
+ return 0;
+}
+
+/*
+ * Returns true if KVM is able to configure the CPU to tag TLB entries
+ * populated by L2 differently than TLB entries populated
+ * by L1.
+ *
+ * If L0 uses EPT, L1 and L2 run with different EPTP because
+ * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
+ * are tagged with different EPTPs.
+ *
+ * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
+ * with different VPIDs (L1 entries are tagged with vmx->vpid
+ * while L2 entries are tagged with vmx->nested.vpid02).
+ */
+static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ return enable_ept ||
+ (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
+}
+
+static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12,
+ bool is_vmenter)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or
+ * L2's VP_ID upon request from the guest. Make sure we check for
+ * pending entries in the right FIFO upon L1/L2 transition as these
+ * requests are put by other vCPUs asynchronously.
+ */
+ if (to_hv_vcpu(vcpu) && enable_ept)
+ kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu);
+
+ /*
+ * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
+ * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
+ * full TLB flush from the guest's perspective. This is required even
+ * if VPID is disabled in the host as KVM may need to synchronize the
+ * MMU in response to the guest TLB flush.
+ *
+ * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
+ * EPT is a special snowflake, as guest-physical mappings aren't
+ * flushed on VPID invalidations, including VM-Enter or VM-Exit with
+ * VPID disabled. As a result, KVM _never_ needs to sync nEPT
+ * entries on VM-Enter because L1 can't rely on VM-Enter to flush
+ * those mappings.
+ */
+ if (!nested_cpu_has_vpid(vmcs12)) {
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ return;
+ }
+
+ /* L2 should never have a VPID if VPID is disabled. */
+ WARN_ON(!enable_vpid);
+
+ /*
+ * VPID is enabled and in use by vmcs12. If vpid12 is changing, then
+ * emulate a guest TLB flush as KVM does not track vpid12 history nor
+ * is the VPID incorporated into the MMU context. I.e. KVM must assume
+ * that the new vpid12 has never been used and thus represents a new
+ * guest ASID that cannot have entries in the TLB.
+ */
+ if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
+ vmx->nested.last_vpid = vmcs12->virtual_processor_id;
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ return;
+ }
+
+ /*
+	 * If VPID is enabled, used by vmcs12, and vpid12 is not changing but
+ * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
+ * KVM was unable to allocate a VPID for L2, flush the current context
+ * as the effective ASID is common to both L1 and L2.
+ */
+ if (!nested_has_guest_tlb_tag(vcpu))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+}
+
+static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
+{
+ superset &= mask;
+ subset &= mask;
+
+ return (superset | subset) == superset;
+}
+
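+/*
+ * Worked example, not part of the upstream file: is_bitwise_subset() asks
+ * whether, within @mask, @subset sets no bits that @superset lacks:
+ *
+ *	is_bitwise_subset(0xc, 0x4, -1ull) -> (0xc | 0x4) == 0xc -> true
+ *	is_bitwise_subset(0xc, 0x5, -1ull) -> (0xc | 0x5) == 0xd -> false
+ */
+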
+static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
+{
+ const u64 feature_and_reserved =
+ /* feature (except bit 48; see below) */
+ BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
+ /* reserved */
+ BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
+ u64 vmx_basic = vmcs_config.nested.basic;
+
+ if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
+ return -EINVAL;
+
+ /*
+ * KVM does not emulate a version of VMX that constrains physical
+ * addresses of VMX structures (e.g. VMCS) to 32-bits.
+ */
+ if (data & BIT_ULL(48))
+ return -EINVAL;
+
+ if (vmx_basic_vmcs_revision_id(vmx_basic) !=
+ vmx_basic_vmcs_revision_id(data))
+ return -EINVAL;
+
+ if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
+ return -EINVAL;
+
+ vmx->nested.msrs.basic = data;
+ return 0;
+}
+
+static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
+ u32 **low, u32 **high)
+{
+ switch (msr_index) {
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+ *low = &msrs->pinbased_ctls_low;
+ *high = &msrs->pinbased_ctls_high;
+ break;
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ *low = &msrs->procbased_ctls_low;
+ *high = &msrs->procbased_ctls_high;
+ break;
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ *low = &msrs->exit_ctls_low;
+ *high = &msrs->exit_ctls_high;
+ break;
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ *low = &msrs->entry_ctls_low;
+ *high = &msrs->entry_ctls_high;
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ *low = &msrs->secondary_ctls_low;
+ *high = &msrs->secondary_ctls_high;
+ break;
+ default:
+ BUG();
+ }
+}
+
+static int
+vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+{
+ u32 *lowp, *highp;
+ u64 supported;
+
+ vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);
+
+ supported = vmx_control_msr(*lowp, *highp);
+
+ /* Check must-be-1 bits are still 1. */
+ if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
+ return -EINVAL;
+
+ /* Check must-be-0 bits are still 0. */
+ if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
+ return -EINVAL;
+
+ vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
+ *lowp = data;
+ *highp = data >> 32;
+ return 0;
+}
+
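+/*
+ * Illustrative sketch, not part of the upstream file: the two subset checks
+ * above only let userspace *tighten* a control MSR. E.g. if KVM reports
+ * low = 0x16 and high = 0x7e, then restoring high = 0x5e is accepted (a
+ * may-be-1 bit was dropped), while low = 0x12 (a must-be-1 bit cleared) or
+ * high = 0xfe (a new may-be-1 bit added) is rejected.
+ */
+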
+static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
+{
+ const u64 feature_and_reserved_bits =
+ /* feature */
+ BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
+ BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
+ /* reserved */
+ GENMASK_ULL(13, 9) | BIT_ULL(31);
+ u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
+ vmcs_config.nested.misc_high);
+
+ if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
+ return -EINVAL;
+
+ if ((vmx->nested.msrs.pinbased_ctls_high &
+ PIN_BASED_VMX_PREEMPTION_TIMER) &&
+ vmx_misc_preemption_timer_rate(data) !=
+ vmx_misc_preemption_timer_rate(vmx_misc))
+ return -EINVAL;
+
+ if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
+ return -EINVAL;
+
+ if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
+ return -EINVAL;
+
+ if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
+ return -EINVAL;
+
+ vmx->nested.msrs.misc_low = data;
+ vmx->nested.msrs.misc_high = data >> 32;
+
+ return 0;
+}
+
+static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
+{
+ u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
+ vmcs_config.nested.vpid_caps);
+
+ /* Every bit is either reserved or a feature bit. */
+ if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
+ return -EINVAL;
+
+ vmx->nested.msrs.ept_caps = data;
+ vmx->nested.msrs.vpid_caps = data >> 32;
+ return 0;
+}
+
+static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
+{
+ switch (msr_index) {
+ case MSR_IA32_VMX_CR0_FIXED0:
+ return &msrs->cr0_fixed0;
+ case MSR_IA32_VMX_CR4_FIXED0:
+ return &msrs->cr4_fixed0;
+ default:
+ BUG();
+ }
+}
+
+static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+{
+ const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);
+
+ /*
+	 * 1-bits (which indicate the bits that must be 1 during VMX operation)
+ * must be 1 in the restored value.
+ */
+ if (!is_bitwise_subset(data, *msr, -1ULL))
+ return -EINVAL;
+
+ *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data;
+ return 0;
+}
+
+/*
+ * Called when userspace is restoring VMX MSRs.
+ *
+ * Returns 0 on success, non-0 otherwise.
+ */
+int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * Don't allow changes to the VMX capability MSRs while the vCPU
+ * is in VMX operation.
+ */
+ if (vmx->nested.vmxon)
+ return -EBUSY;
+
+ switch (msr_index) {
+ case MSR_IA32_VMX_BASIC:
+ return vmx_restore_vmx_basic(vmx, data);
+ case MSR_IA32_VMX_PINBASED_CTLS:
+ case MSR_IA32_VMX_PROCBASED_CTLS:
+ case MSR_IA32_VMX_EXIT_CTLS:
+ case MSR_IA32_VMX_ENTRY_CTLS:
+ /*
+ * The "non-true" VMX capability MSRs are generated from the
+ * "true" MSRs, so we do not support restoring them directly.
+ *
+ * If userspace wants to emulate VMX_BASIC[55]=0, userspace
+ * should restore the "true" MSRs with the must-be-1 bits
+ * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
+ * DEFAULT SETTINGS".
+ */
+ return -EINVAL;
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ return vmx_restore_control_msr(vmx, msr_index, data);
+ case MSR_IA32_VMX_MISC:
+ return vmx_restore_vmx_misc(vmx, data);
+ case MSR_IA32_VMX_CR0_FIXED0:
+ case MSR_IA32_VMX_CR4_FIXED0:
+ return vmx_restore_fixed0_msr(vmx, msr_index, data);
+ case MSR_IA32_VMX_CR0_FIXED1:
+ case MSR_IA32_VMX_CR4_FIXED1:
+ /*
+ * These MSRs are generated based on the vCPU's CPUID, so we
+ * do not support restoring them directly.
+ */
+ return -EINVAL;
+ case MSR_IA32_VMX_EPT_VPID_CAP:
+ return vmx_restore_vmx_ept_vpid_cap(vmx, data);
+ case MSR_IA32_VMX_VMCS_ENUM:
+ vmx->nested.msrs.vmcs_enum = data;
+ return 0;
+ case MSR_IA32_VMX_VMFUNC:
+ if (data & ~vmcs_config.nested.vmfunc_controls)
+ return -EINVAL;
+ vmx->nested.msrs.vmfunc_controls = data;
+ return 0;
+ default:
+ /*
+ * The rest of the VMX capability MSRs do not support restore.
+ */
+ return -EINVAL;
+ }
+}
+
+/* Returns 0 on success, non-0 otherwise. */
+int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
+{
+ switch (msr_index) {
+ case MSR_IA32_VMX_BASIC:
+ *pdata = msrs->basic;
+ break;
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+ case MSR_IA32_VMX_PINBASED_CTLS:
+ *pdata = vmx_control_msr(
+ msrs->pinbased_ctls_low,
+ msrs->pinbased_ctls_high);
+ if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
+ *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+ break;
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ case MSR_IA32_VMX_PROCBASED_CTLS:
+ *pdata = vmx_control_msr(
+ msrs->procbased_ctls_low,
+ msrs->procbased_ctls_high);
+ if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
+ *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+ break;
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ case MSR_IA32_VMX_EXIT_CTLS:
+ *pdata = vmx_control_msr(
+ msrs->exit_ctls_low,
+ msrs->exit_ctls_high);
+ if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
+ *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+ break;
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ case MSR_IA32_VMX_ENTRY_CTLS:
+ *pdata = vmx_control_msr(
+ msrs->entry_ctls_low,
+ msrs->entry_ctls_high);
+ if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
+ *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
+ break;
+ case MSR_IA32_VMX_MISC:
+ *pdata = vmx_control_msr(
+ msrs->misc_low,
+ msrs->misc_high);
+ break;
+ case MSR_IA32_VMX_CR0_FIXED0:
+ *pdata = msrs->cr0_fixed0;
+ break;
+ case MSR_IA32_VMX_CR0_FIXED1:
+ *pdata = msrs->cr0_fixed1;
+ break;
+ case MSR_IA32_VMX_CR4_FIXED0:
+ *pdata = msrs->cr4_fixed0;
+ break;
+ case MSR_IA32_VMX_CR4_FIXED1:
+ *pdata = msrs->cr4_fixed1;
+ break;
+ case MSR_IA32_VMX_VMCS_ENUM:
+ *pdata = msrs->vmcs_enum;
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ *pdata = vmx_control_msr(
+ msrs->secondary_ctls_low,
+ msrs->secondary_ctls_high);
+ break;
+ case MSR_IA32_VMX_EPT_VPID_CAP:
+ *pdata = msrs->ept_caps |
+ ((u64)msrs->vpid_caps << 32);
+ break;
+ case MSR_IA32_VMX_VMFUNC:
+ *pdata = msrs->vmfunc_controls;
+ break;
+ default:
+ return 1;
+ }
+
+ return 0;
+}
+
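+/*
+ * Illustrative note, not part of the upstream file: the non-TRUE control MSRs
+ * above are derived from the TRUE variants by OR-ing the default1 class bits
+ * back into the low half (e.g. PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR is
+ * assumed to be 0x16), per the SDM's "default settings" definition.
+ */
+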
+/*
+ * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
+ * been modified by the L1 guest. Note, "writable" in this context means
+ * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
+ * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
+ * VM-exit information fields (which are actually writable if the vCPU is
+ * configured to support "VMWRITE to any supported field in the VMCS").
+ */
+static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
+{
+ struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+ struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
+ struct shadow_vmcs_field field;
+ unsigned long val;
+ int i;
+
+ if (WARN_ON(!shadow_vmcs))
+ return;
+
+ preempt_disable();
+
+ vmcs_load(shadow_vmcs);
+
+ for (i = 0; i < max_shadow_read_write_fields; i++) {
+ field = shadow_read_write_fields[i];
+ val = __vmcs_readl(field.encoding);
+ vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
+ }
+
+ vmcs_clear(shadow_vmcs);
+ vmcs_load(vmx->loaded_vmcs->vmcs);
+
+ preempt_enable();
+}
+
+static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
+{
+ const struct shadow_vmcs_field *fields[] = {
+ shadow_read_write_fields,
+ shadow_read_only_fields
+ };
+ const int max_fields[] = {
+ max_shadow_read_write_fields,
+ max_shadow_read_only_fields
+ };
+ struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+ struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
+ struct shadow_vmcs_field field;
+ unsigned long val;
+ int i, q;
+
+ if (WARN_ON(!shadow_vmcs))
+ return;
+
+ vmcs_load(shadow_vmcs);
+
+ for (q = 0; q < ARRAY_SIZE(fields); q++) {
+ for (i = 0; i < max_fields[q]; i++) {
+ field = fields[q][i];
+ val = vmcs12_read_any(vmcs12, field.encoding,
+ field.offset);
+ __vmcs_writel(field.encoding, val);
+ }
+ }
+
+ vmcs_clear(shadow_vmcs);
+ vmcs_load(vmx->loaded_vmcs->vmcs);
+}
+
+static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
+{
+ struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
+ struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu);
+
+ /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
+ vmcs12->tpr_threshold = evmcs->tpr_threshold;
+ vmcs12->guest_rip = evmcs->guest_rip;
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) {
+ hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page;
+ hv_vcpu->nested.vm_id = evmcs->hv_vm_id;
+ hv_vcpu->nested.vp_id = evmcs->hv_vp_id;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
+ vmcs12->guest_rsp = evmcs->guest_rsp;
+ vmcs12->guest_rflags = evmcs->guest_rflags;
+ vmcs12->guest_interruptibility_info =
+ evmcs->guest_interruptibility_info;
+ /*
+ * Not present in struct vmcs12:
+ * vmcs12->guest_ssp = evmcs->guest_ssp;
+ */
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
+ vmcs12->cpu_based_vm_exec_control =
+ evmcs->cpu_based_vm_exec_control;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
+ vmcs12->exception_bitmap = evmcs->exception_bitmap;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
+ vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
+ vmcs12->vm_entry_intr_info_field =
+ evmcs->vm_entry_intr_info_field;
+ vmcs12->vm_entry_exception_error_code =
+ evmcs->vm_entry_exception_error_code;
+ vmcs12->vm_entry_instruction_len =
+ evmcs->vm_entry_instruction_len;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
+ vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
+ vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
+ vmcs12->host_cr0 = evmcs->host_cr0;
+ vmcs12->host_cr3 = evmcs->host_cr3;
+ vmcs12->host_cr4 = evmcs->host_cr4;
+ vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
+ vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
+ vmcs12->host_rip = evmcs->host_rip;
+ vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
+ vmcs12->host_es_selector = evmcs->host_es_selector;
+ vmcs12->host_cs_selector = evmcs->host_cs_selector;
+ vmcs12->host_ss_selector = evmcs->host_ss_selector;
+ vmcs12->host_ds_selector = evmcs->host_ds_selector;
+ vmcs12->host_fs_selector = evmcs->host_fs_selector;
+ vmcs12->host_gs_selector = evmcs->host_gs_selector;
+ vmcs12->host_tr_selector = evmcs->host_tr_selector;
+ vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl;
+ /*
+ * Not present in struct vmcs12:
+ * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet;
+ * vmcs12->host_ssp = evmcs->host_ssp;
+ * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr;
+ */
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
+ vmcs12->pin_based_vm_exec_control =
+ evmcs->pin_based_vm_exec_control;
+ vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
+ vmcs12->secondary_vm_exec_control =
+ evmcs->secondary_vm_exec_control;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
+ vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
+ vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
+ vmcs12->msr_bitmap = evmcs->msr_bitmap;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
+ vmcs12->guest_es_base = evmcs->guest_es_base;
+ vmcs12->guest_cs_base = evmcs->guest_cs_base;
+ vmcs12->guest_ss_base = evmcs->guest_ss_base;
+ vmcs12->guest_ds_base = evmcs->guest_ds_base;
+ vmcs12->guest_fs_base = evmcs->guest_fs_base;
+ vmcs12->guest_gs_base = evmcs->guest_gs_base;
+ vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
+ vmcs12->guest_tr_base = evmcs->guest_tr_base;
+ vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
+ vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
+ vmcs12->guest_es_limit = evmcs->guest_es_limit;
+ vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
+ vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
+ vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
+ vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
+ vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
+ vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
+ vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
+ vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
+ vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
+ vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
+ vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
+ vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
+ vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
+ vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
+ vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
+ vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
+ vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
+ vmcs12->guest_es_selector = evmcs->guest_es_selector;
+ vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
+ vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
+ vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
+ vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
+ vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
+ vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
+ vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
+ vmcs12->tsc_offset = evmcs->tsc_offset;
+ vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
+ vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
+ vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap;
+ vmcs12->tsc_multiplier = evmcs->tsc_multiplier;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
+ vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
+ vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
+ vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
+ vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
+ vmcs12->guest_cr0 = evmcs->guest_cr0;
+ vmcs12->guest_cr3 = evmcs->guest_cr3;
+ vmcs12->guest_cr4 = evmcs->guest_cr4;
+ vmcs12->guest_dr7 = evmcs->guest_dr7;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
+ vmcs12->host_fs_base = evmcs->host_fs_base;
+ vmcs12->host_gs_base = evmcs->host_gs_base;
+ vmcs12->host_tr_base = evmcs->host_tr_base;
+ vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
+ vmcs12->host_idtr_base = evmcs->host_idtr_base;
+ vmcs12->host_rsp = evmcs->host_rsp;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
+ vmcs12->ept_pointer = evmcs->ept_pointer;
+ vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
+ }
+
+ if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
+ vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
+ vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
+ vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
+ vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
+ vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
+ vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
+ vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
+ vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
+ vmcs12->guest_pending_dbg_exceptions =
+ evmcs->guest_pending_dbg_exceptions;
+ vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
+ vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
+ vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
+ vmcs12->guest_activity_state = evmcs->guest_activity_state;
+ vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
+ vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl;
+ /*
+ * Not present in struct vmcs12:
+ * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet;
+ * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl;
+ * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr;
+ */
+ }
+
+ /*
+ * Not used?
+ * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
+ * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
+ * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
+ * vmcs12->page_fault_error_code_mask =
+ * evmcs->page_fault_error_code_mask;
+ * vmcs12->page_fault_error_code_match =
+ * evmcs->page_fault_error_code_match;
+ * vmcs12->cr3_target_count = evmcs->cr3_target_count;
+ * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
+ * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
+ * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
+ */
+
+ /*
+ * Read only fields:
+ * vmcs12->guest_physical_address = evmcs->guest_physical_address;
+ * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
+ * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
+ * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
+ * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
+ * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
+ * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
+ * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
+ * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
+ * vmcs12->exit_qualification = evmcs->exit_qualification;
+ * vmcs12->guest_linear_address = evmcs->guest_linear_address;
+ *
+ * Not present in struct vmcs12:
+ * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
+ * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
+ * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
+ * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
+ */
+
+}
+
+static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
+{
+ struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
+ struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+
+ /*
+ * Should not be changed by KVM:
+ *
+ * evmcs->host_es_selector = vmcs12->host_es_selector;
+ * evmcs->host_cs_selector = vmcs12->host_cs_selector;
+ * evmcs->host_ss_selector = vmcs12->host_ss_selector;
+ * evmcs->host_ds_selector = vmcs12->host_ds_selector;
+ * evmcs->host_fs_selector = vmcs12->host_fs_selector;
+ * evmcs->host_gs_selector = vmcs12->host_gs_selector;
+ * evmcs->host_tr_selector = vmcs12->host_tr_selector;
+ * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
+ * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
+ * evmcs->host_cr0 = vmcs12->host_cr0;
+ * evmcs->host_cr3 = vmcs12->host_cr3;
+ * evmcs->host_cr4 = vmcs12->host_cr4;
+ * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
+ * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
+ * evmcs->host_rip = vmcs12->host_rip;
+ * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
+ * evmcs->host_fs_base = vmcs12->host_fs_base;
+ * evmcs->host_gs_base = vmcs12->host_gs_base;
+ * evmcs->host_tr_base = vmcs12->host_tr_base;
+ * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
+ * evmcs->host_idtr_base = vmcs12->host_idtr_base;
+ * evmcs->host_rsp = vmcs12->host_rsp;
+ * sync_vmcs02_to_vmcs12() doesn't read these:
+ * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
+ * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
+ * evmcs->msr_bitmap = vmcs12->msr_bitmap;
+ * evmcs->ept_pointer = vmcs12->ept_pointer;
+ * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
+ * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
+ * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
+ * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
+ * evmcs->tpr_threshold = vmcs12->tpr_threshold;
+ * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
+ * evmcs->exception_bitmap = vmcs12->exception_bitmap;
+ * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
+ * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
+ * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
+ * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
+ * evmcs->page_fault_error_code_mask =
+ * vmcs12->page_fault_error_code_mask;
+ * evmcs->page_fault_error_code_match =
+ * vmcs12->page_fault_error_code_match;
+ * evmcs->cr3_target_count = vmcs12->cr3_target_count;
+ * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
+ * evmcs->tsc_offset = vmcs12->tsc_offset;
+ * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
+ * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
+ * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
+ * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
+ * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
+ * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
+ * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
+ * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
+ * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl;
+ * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl;
+ * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap;
+ * evmcs->tsc_multiplier = vmcs12->tsc_multiplier;
+ *
+ * Not present in struct vmcs12:
+ * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
+ * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
+ * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
+ * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
+ * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet;
+ * evmcs->host_ssp = vmcs12->host_ssp;
+ * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr;
+ * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet;
+ * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl;
+ * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr;
+ * evmcs->guest_ssp = vmcs12->guest_ssp;
+ */
+
+ evmcs->guest_es_selector = vmcs12->guest_es_selector;
+ evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
+ evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
+ evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
+ evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
+ evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
+ evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
+ evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
+
+ evmcs->guest_es_limit = vmcs12->guest_es_limit;
+ evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
+ evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
+ evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
+ evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
+ evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
+ evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
+ evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
+ evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
+ evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
+
+ evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
+ evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
+ evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
+ evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
+ evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
+ evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
+ evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
+ evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
+
+ evmcs->guest_es_base = vmcs12->guest_es_base;
+ evmcs->guest_cs_base = vmcs12->guest_cs_base;
+ evmcs->guest_ss_base = vmcs12->guest_ss_base;
+ evmcs->guest_ds_base = vmcs12->guest_ds_base;
+ evmcs->guest_fs_base = vmcs12->guest_fs_base;
+ evmcs->guest_gs_base = vmcs12->guest_gs_base;
+ evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
+ evmcs->guest_tr_base = vmcs12->guest_tr_base;
+ evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
+ evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
+
+ evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
+ evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
+
+ evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
+ evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
+ evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
+ evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
+
+ evmcs->guest_pending_dbg_exceptions =
+ vmcs12->guest_pending_dbg_exceptions;
+ evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
+ evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
+
+ evmcs->guest_activity_state = vmcs12->guest_activity_state;
+ evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
+
+ evmcs->guest_cr0 = vmcs12->guest_cr0;
+ evmcs->guest_cr3 = vmcs12->guest_cr3;
+ evmcs->guest_cr4 = vmcs12->guest_cr4;
+ evmcs->guest_dr7 = vmcs12->guest_dr7;
+
+ evmcs->guest_physical_address = vmcs12->guest_physical_address;
+
+ evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
+ evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
+ evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
+ evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
+ evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
+ evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
+ evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
+ evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
+
+ evmcs->exit_qualification = vmcs12->exit_qualification;
+
+ evmcs->guest_linear_address = vmcs12->guest_linear_address;
+ evmcs->guest_rsp = vmcs12->guest_rsp;
+ evmcs->guest_rflags = vmcs12->guest_rflags;
+
+ evmcs->guest_interruptibility_info =
+ vmcs12->guest_interruptibility_info;
+ evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
+ evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
+ evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
+ evmcs->vm_entry_exception_error_code =
+ vmcs12->vm_entry_exception_error_code;
+ evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
+
+ evmcs->guest_rip = vmcs12->guest_rip;
+
+ evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
+
+}
+
+/*
+ * This is the equivalent of the nested hypervisor executing the vmptrld
+ * instruction.
+ */
+static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
+ struct kvm_vcpu *vcpu, bool from_launch)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool evmcs_gpa_changed = false;
+ u64 evmcs_gpa;
+
+ if (likely(!guest_cpuid_has_evmcs(vcpu)))
+ return EVMPTRLD_DISABLED;
+
+ evmcs_gpa = nested_get_evmptr(vcpu);
+ if (!evmptr_is_valid(evmcs_gpa)) {
+ nested_release_evmcs(vcpu);
+ return EVMPTRLD_DISABLED;
+ }
+
+ if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
+ vmx->nested.current_vmptr = INVALID_GPA;
+
+ nested_release_evmcs(vcpu);
+
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
+ &vmx->nested.hv_evmcs_map))
+ return EVMPTRLD_ERROR;
+
+ vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
+
+		/*
+		 * Currently, KVM only supports eVMCS version 1
+		 * (== KVM_EVMCS_VERSION) and thus expects the guest to set the
+		 * first u32 field of the eVMCS, which specifies the eVMCS
+		 * VersionNumber, to that value.
+		 *
+		 * The guest should discover the eVMCS versions supported by
+		 * the host by examining CPUID.0x4000000A.EAX[0:15]. The host
+		 * userspace VMM is expected to set this CPUID leaf according
+		 * to the value returned in vmcs_version from
+		 * nested_enable_evmcs().
+		 *
+		 * However, it turns out that Microsoft Hyper-V fails to comply
+		 * with its own invented interface: when Hyper-V uses eVMCS, it
+		 * just sets the first u32 field of the eVMCS to the
+		 * revision_id specified in MSR_IA32_VMX_BASIC instead of an
+		 * eVMCS version number, i.e. one of the supported versions
+		 * specified in CPUID.0x4000000A.EAX[0:15].
+		 *
+		 * To work around this Hyper-V bug, accept either a supported
+		 * eVMCS version or the VMCS12 revision_id as valid values for
+		 * the first u32 field of the eVMCS.
+		 */
+ if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
+ (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
+ nested_release_evmcs(vcpu);
+ return EVMPTRLD_VMFAIL;
+ }
+
+ vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
+
+ evmcs_gpa_changed = true;
+		/*
+		 * Unlike a normal vmcs12, an enlightened vmcs12 is not fully
+		 * reloaded from the guest's memory (read-only fields, fields
+		 * not present in struct hv_enlightened_vmcs, ...). Make sure
+		 * there are no leftovers.
+		 */
+ if (from_launch) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ memset(vmcs12, 0, sizeof(*vmcs12));
+ vmcs12->hdr.revision_id = VMCS12_REVISION;
+ }
+
+ }
+
+	/*
+	 * Clean-fields data can't be used on VMLAUNCH or when switching
+	 * between different L2 guests, as KVM keeps a single VMCS12 per L1.
+	 */
+ if (from_launch || evmcs_gpa_changed) {
+ vmx->nested.hv_evmcs->hv_clean_fields &=
+ ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+
+ vmx->nested.force_msr_bitmap_recalc = true;
+ }
+
+ return EVMPTRLD_SUCCEEDED;
+}
+
+void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
+ copy_vmcs12_to_enlightened(vmx);
+ else
+ copy_vmcs12_to_shadow(vmx);
+
+ vmx->nested.need_vmcs12_to_shadow_sync = false;
+}
+
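+/*
+ * hrtimer callback for the emulated VMX-preemption timer: flag the expiry
+ * and kick the vCPU so the pending timer is serviced as a nested VM-Exit.
+ */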
+static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
+{
+ struct vcpu_vmx *vmx =
+ container_of(timer, struct vcpu_vmx, nested.preemption_timer);
+
+ vmx->nested.preemption_timer_expired = true;
+ kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
+ kvm_vcpu_kick(&vmx->vcpu);
+
+ return HRTIMER_NORESTART;
+}
+
+static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >>
+ VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+
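+	/*
+	 * Latch the absolute deadline (in scaled L1 TSC units) on first use
+	 * so that saving/restoring nested state doesn't restart the
+	 * countdown.
+	 */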
+ if (!vmx->nested.has_preemption_timer_deadline) {
+ vmx->nested.preemption_timer_deadline =
+ vmcs12->vmx_preemption_timer_value + l1_scaled_tsc;
+ vmx->nested.has_preemption_timer_deadline = true;
+ }
+ return vmx->nested.preemption_timer_deadline - l1_scaled_tsc;
+}
+
+static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
+ u64 preemption_timeout)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * A timer value of zero is architecturally guaranteed to cause
+ * a VMExit prior to executing any instructions in the guest.
+ */
+ if (preemption_timeout == 0) {
+ vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
+ return;
+ }
+
+ if (vcpu->arch.virtual_tsc_khz == 0)
+ return;
+
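+	/*
+	 * Convert the timer value from guest TSC units to nanoseconds. The
+	 * emulated timer ticks at the TSC rate shifted right by
+	 * VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE, e.g. with a rate shift of
+	 * 5 and virtual_tsc_khz == 2000000 (2 GHz), a raw value of 1000
+	 * yields (1000 << 5) * 1000000 / 2000000 = 16000 ns.
+	 */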
+ preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+ preemption_timeout *= 1000000;
+ do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
+ hrtimer_start(&vmx->nested.preemption_timer,
+ ktime_add_ns(ktime_get(), preemption_timeout),
+ HRTIMER_MODE_ABS_PINNED);
+}
+
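+/*
+ * Compute the EFER value L2 will run with: either the value L1 asked to
+ * load on VM-Entry, or L1's current EFER with LMA/LME forced to match the
+ * "IA-32e mode guest" VM-Entry control.
+ */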
+static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+{
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
+ return vmcs12->guest_ia32_efer;
+ else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
+ return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
+ else
+ return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
+}
+
+static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
+{
+ struct kvm *kvm = vmx->vcpu.kvm;
+
+ /*
+ * If vmcs02 hasn't been initialized, set the constant vmcs02 state
+ * according to L0's settings (vmcs12 is irrelevant here). Host
+ * fields that come from L0 and are not constant, e.g. HOST_CR3,
+ * will be set as needed prior to VMLAUNCH/VMRESUME.
+ */
+ if (vmx->nested.vmcs02_initialized)
+ return;
+ vmx->nested.vmcs02_initialized = true;
+
+	/*
+	 * We don't care what the EPTP value is; we just need to guarantee
+	 * it's valid so that we don't get a false positive when doing early
+	 * consistency checks.
+	 */
+ if (enable_ept && nested_early_check)
+ vmcs_write64(EPT_POINTER,
+ construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));
+
+ /* All VMFUNCs are currently emulated through L0 vmexits. */
+ if (cpu_has_vmx_vmfunc())
+ vmcs_write64(VM_FUNCTION_CONTROL, 0);
+
+ if (cpu_has_vmx_posted_intr())
+ vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
+
+ /*
+ * PML is emulated for L2, but never enabled in hardware as the MMU
+ * handles A/D emulation. Disabling PML for L2 also avoids having to
+ * deal with filtering out L2 GPAs from the buffer.
+ */
+ if (enable_pml) {
+ vmcs_write64(PML_ADDRESS, 0);
+ vmcs_write16(GUEST_PML_INDEX, -1);
+ }
+
+ if (cpu_has_vmx_encls_vmexit())
+ vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA);
+
+ if (kvm_notify_vmexit_enabled(kvm))
+ vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
+
+ /*
+ * Set the MSR load/store lists to match L0's settings. Only the
+ * addresses are constant (for vmcs02), the counts can change based
+ * on L2's behavior, e.g. switching to/from long mode.
+ */
+ vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
+ vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
+ vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
+
+ vmx_set_constant_host_state(vmx);
+}
+
+static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
+ struct vmcs12 *vmcs12)
+{
+ prepare_vmcs02_constant_state(vmx);
+
+ vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
+
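+	/*
+	 * Tag L2's TLB entries with a dedicated VPID (vpid02) when L1 enables
+	 * VPID, keeping them separate from L1's; otherwise reuse L1's VPID,
+	 * in which case nested transitions flush the TLB.
+	 */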
+ if (enable_vpid) {
+ if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
+ else
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+ }
+}
+
+static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
+ struct vmcs12 *vmcs12)
+{
+ u32 exec_control;
+ u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
+
+ if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
+ prepare_vmcs02_early_rare(vmx, vmcs12);
+
+ /*
+ * PIN CONTROLS
+ */
+ exec_control = __pin_controls_get(vmcs01);
+ exec_control |= (vmcs12->pin_based_vm_exec_control &
+ ~PIN_BASED_VMX_PREEMPTION_TIMER);
+
+	/* The posted-interrupt settings are taken only from vmcs12. */
+ vmx->nested.pi_pending = false;
+ if (nested_cpu_has_posted_intr(vmcs12))
+ vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
+ else
+ exec_control &= ~PIN_BASED_POSTED_INTR;
+ pin_controls_set(vmx, exec_control);
+
+ /*
+ * EXEC CONTROLS
+ */
+ exec_control = __exec_controls_get(vmcs01); /* L0's desires */
+ exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
+ exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
+ exec_control &= ~CPU_BASED_TPR_SHADOW;
+ exec_control |= vmcs12->cpu_based_vm_exec_control;
+
+ vmx->nested.l1_tpr_threshold = -1;
+ if (exec_control & CPU_BASED_TPR_SHADOW)
+ vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
+#ifdef CONFIG_X86_64
+ else
+ exec_control |= CPU_BASED_CR8_LOAD_EXITING |
+ CPU_BASED_CR8_STORE_EXITING;
+#endif
+
+	/*
+	 * A vmexit (to either the L1 hypervisor or L0 userspace) is always
+	 * needed for I/O port accesses.
+	 */
+ exec_control |= CPU_BASED_UNCOND_IO_EXITING;
+ exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
+
+ /*
+ * This bit will be computed in nested_get_vmcs12_pages, because
+ * we do not have access to L1's MSR bitmap yet. For now, keep
+ * the same bit as before, hoping to avoid multiple VMWRITEs that
+ * only set/clear this bit.
+ */
+ exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
+ exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
+
+ exec_controls_set(vmx, exec_control);
+
+ /*
+ * SECONDARY EXEC CONTROLS
+ */
+ if (cpu_has_secondary_exec_ctrls()) {
+ exec_control = __secondary_exec_controls_get(vmcs01);
+
+ /* Take the following fields only from vmcs12 */
+ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_ENABLE_INVPCID |
+ SECONDARY_EXEC_ENABLE_RDTSCP |
+ SECONDARY_EXEC_ENABLE_XSAVES |
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_ENABLE_VMFUNC |
+ SECONDARY_EXEC_DESC);
+
+ if (nested_cpu_has(vmcs12,
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
+ exec_control |= vmcs12->secondary_vm_exec_control;
+
+ /* PML is emulated and never enabled in hardware for L2. */
+ exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
+ /* VMCS shadowing for L2 is emulated for now */
+ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
+
+ /*
+ * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
+ * will not have to rewrite the controls just for this bit.
+ */
+ if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP))
+ exec_control |= SECONDARY_EXEC_DESC;
+
+ if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
+ vmcs_write16(GUEST_INTR_STATUS,
+ vmcs12->guest_intr_status);
+
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
+ exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+
+ if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
+ vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);
+
+ secondary_exec_controls_set(vmx, exec_control);
+ }
+
+ /*
+ * ENTRY CONTROLS
+ *
+ * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
+ * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
+ * on the related bits (if supported by the CPU) in the hope that
+ * we can avoid VMWrites during vmx_set_efer().
+ *
+ * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is
+ * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to
+ * do the same for L2.
+ */
+ exec_control = __vm_entry_controls_get(vmcs01);
+ exec_control |= (vmcs12->vm_entry_controls &
+ ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
+ exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
+ if (cpu_has_load_ia32_efer()) {
+ if (guest_efer & EFER_LMA)
+ exec_control |= VM_ENTRY_IA32E_MODE;
+ if (guest_efer != host_efer)
+ exec_control |= VM_ENTRY_LOAD_IA32_EFER;
+ }
+ vm_entry_controls_set(vmx, exec_control);
+
+ /*
+ * EXIT CONTROLS
+ *
+ * L2->L1 exit controls are emulated - the hardware exit is to L0 so
+ * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
+ * bits may be modified by vmx_set_efer() in prepare_vmcs02().
+ */
+ exec_control = __vm_exit_controls_get(vmcs01);
+ if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
+ exec_control |= VM_EXIT_LOAD_IA32_EFER;
+ else
+ exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
+ vm_exit_controls_set(vmx, exec_control);
+
+ /*
+ * Interrupt/Exception Fields
+ */
+ if (vmx->nested.nested_run_pending) {
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ vmcs12->vm_entry_intr_info_field);
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+ vmcs12->vm_entry_exception_error_code);
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmcs12->vm_entry_instruction_len);
+ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+ vmcs12->guest_interruptibility_info);
+ vmx->loaded_vmcs->nmi_known_unmasked =
+ !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
+ } else {
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
+ }
+}
+
+static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+{
+ struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
+
+ if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
+ vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
+ vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
+ vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
+ vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
+ vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
+ vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
+ vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
+ vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
+ vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
+ vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
+ vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
+ vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
+ vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
+ vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
+ vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
+ vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
+ vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
+ vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
+ vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
+ vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
+ vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
+ vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
+ vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
+ vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
+ vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
+ vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
+ vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
+ vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
+ vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
+ vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
+ vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
+ vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
+ vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
+ vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
+ vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
+ vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
+
+ vmx->segment_cache.bitmask = 0;
+ }
+
+ if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
+ vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+ vmcs12->guest_pending_dbg_exceptions);
+ vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
+ vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
+
+		/*
+		 * L1 may access L2's PDPTRs, so save them in order to
+		 * construct vmcs12.
+		 */
+ if (enable_ept) {
+ vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+ vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+ vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+ vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+ }
+
+ if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+ vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+ }
+
+ if (nested_cpu_has_xsaves(vmcs12))
+ vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
+
+	/*
+	 * Whether page faults are trapped is determined by a combination of
+	 * three settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If
+	 * L0 doesn't care about page faults then we should set all of these
+	 * to L1's desires. However, if L0 does care about (some) page faults,
+	 * it is not easy (if at all possible?) to merge L0's and L1's
+	 * desires, so we simply ask to exit on each and every L2 page fault.
+	 * This is done by setting MASK=MATCH=0 and (see below) EB.PF=1.
+	 * Note that below we don't need special code to set EB.PF beyond the
+	 * "or"ing of the EBs of vmcs01 and vmcs12: when enable_ept, vmcs01's
+	 * EB.PF is 0, so the "or" takes vmcs12's value, and when !enable_ept,
+	 * EB.PF is 1, so the "or" is always 1.
+	 */
+ if (vmx_need_pf_intercept(&vmx->vcpu)) {
+ /*
+ * TODO: if both L0 and L1 need the same MASK and MATCH,
+ * go ahead and use it?
+ */
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
+ } else {
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
+ }
+
+ if (cpu_has_vmx_apicv()) {
+ vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
+ vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
+ vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
+ vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
+ }
+
+ /*
+ * Make sure the msr_autostore list is up to date before we set the
+ * count in the vmcs02.
+ */
+ prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
+
+ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+
+ set_cr4_guest_host_mask(vmx);
+}
+
+/*
+ * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
+ * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
+ * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
+ * guest in a way that is appropriate both to L1's requests and to our needs.
+ * In addition to modifying the active vmcs (which is vmcs02), this function
+ * also has necessary side effects, like setting various vcpu->arch fields.
+ * Returns 0 on success and -EINVAL on failure, in which case the failing
+ * VM-entry failure code is assigned to entry_failure_code.
+ */
+static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ bool from_vmentry,
+ enum vm_entry_failure_code *entry_failure_code)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool load_guest_pdptrs_vmcs12 = false;
+
+ if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
+ prepare_vmcs02_rare(vmx, vmcs12);
+ vmx->nested.dirty_vmcs12 = false;
+
+ load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) ||
+ !(vmx->nested.hv_evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+ }
+
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
+ kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+ } else {
+ kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
+ }
+ if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
+ vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs);
+ vmx_set_rflags(vcpu, vmcs12->guest_rflags);
+
+	/*
+	 * EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
+	 * bitwise-or of what L1 wants to trap for L2 and what we want to
+	 * trap. Note that CR0.TS also needs updating; we do this later.
+	 */
+ vmx_update_exception_bitmap(vcpu);
+ vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
+ vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
+ vcpu->arch.pat = vmcs12->guest_ia32_pat;
+ } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+ vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
+ }
+
+ vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
+ vcpu->arch.l1_tsc_offset,
+ vmx_get_l2_tsc_offset(vcpu),
+ vmx_get_l2_tsc_multiplier(vcpu));
+
+ vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
+ vcpu->arch.l1_tsc_scaling_ratio,
+ vmx_get_l2_tsc_multiplier(vcpu));
+
+ vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+ if (kvm_caps.has_tsc_control)
+ vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
+
+ nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
+
+ if (nested_cpu_has_ept(vmcs12))
+ nested_ept_init_mmu_context(vcpu);
+
+ /*
+ * Override the CR0/CR4 read shadows after setting the effective guest
+ * CR0/CR4. The common helpers also set the shadows, but they don't
+ * account for vmcs12's cr0/4_guest_host_mask.
+ */
+ vmx_set_cr0(vcpu, vmcs12->guest_cr0);
+ vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
+
+ vmx_set_cr4(vcpu, vmcs12->guest_cr4);
+ vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
+
+ vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
+ /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
+ vmx_set_efer(vcpu, vcpu->arch.efer);
+
+	/*
+	 * Guest state being invalid while unrestricted guest is disabled
+	 * means L1 attempted VMEntry to L2 with invalid state: fail the
+	 * VMEntry.
+	 *
+	 * However, when force loading the guest state (SMM exit or loading
+	 * nested state after migration), it is possible to have invalid
+	 * guest state now; it will be fixed later by restoring the L2
+	 * register state.
+	 */
+ if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
+ return -EINVAL;
+ }
+
+	/* Load the guest's CR3, mapped by either EPT or shadow page tables. */
+ if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
+ from_vmentry, entry_failure_code))
+ return -EINVAL;
+
+ /*
+ * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12
+ * on nested VM-Exit, which can occur without actually running L2 and
+ * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
+ * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
+ * transition to HLT instead of running L2.
+ */
+ if (enable_ept)
+ vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
+
+ /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
+ if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
+ is_pae_paging(vcpu)) {
+ vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+ vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+ vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+ vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+ }
+
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+ kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
+ WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
+ vmcs12->guest_ia32_perf_global_ctrl))) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
+ return -EINVAL;
+ }
+
+ kvm_rsp_write(vcpu, vmcs12->guest_rsp);
+ kvm_rip_write(vcpu, vmcs12->guest_rip);
+
+	/*
+	 * It was observed that genuine Hyper-V running in L1 doesn't reset
+	 * 'hv_clean_fields' by itself; it only sets the corresponding dirty
+	 * bits when it changes a field in the eVMCS. Mark all fields as
+	 * clean here.
+	 */
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
+ vmx->nested.hv_evmcs->hv_clean_fields |=
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+
+ return 0;
+}
+
+static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
+{
+ if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
+ nested_cpu_has_virtual_nmis(vmcs12)))
+ return -EINVAL;
+
+ if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
+ nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
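+	/*
+	 * EPTP layout: bits 2:0 memory type, bits 5:3 page-walk length minus
+	 * 1, bit 6 A/D enable, bits 11:7 reserved, and the physical address
+	 * of the EPT root above that.
+	 */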
+ /* Check for memory type validity */
+ switch (new_eptp & VMX_EPTP_MT_MASK) {
+ case VMX_EPTP_MT_UC:
+ if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
+ return false;
+ break;
+ case VMX_EPTP_MT_WB:
+ if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+	/* Check page-walk level validity. */
+ switch (new_eptp & VMX_EPTP_PWL_MASK) {
+ case VMX_EPTP_PWL_5:
+ if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT)))
+ return false;
+ break;
+ case VMX_EPTP_PWL_4:
+ if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT)))
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+	/* Reserved bits (11:7 and those above MAXPHYADDR) must not be set */
+ if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f)))
+ return false;
+
+ /* AD, if set, should be supported */
+ if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) {
+ if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Checks related to VM-Execution Control Fields
+ */
+static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
+ vmx->nested.msrs.pinbased_ctls_low,
+ vmx->nested.msrs.pinbased_ctls_high)) ||
+ CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
+ vmx->nested.msrs.procbased_ctls_low,
+ vmx->nested.msrs.procbased_ctls_high)))
+ return -EINVAL;
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
+ CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control,
+ vmx->nested.msrs.secondary_ctls_low,
+ vmx->nested.msrs.secondary_ctls_high)))
+ return -EINVAL;
+
+ if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) ||
+ nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
+ nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
+ nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
+ nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
+ nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
+ nested_vmx_check_nmi_controls(vmcs12) ||
+ nested_vmx_check_pml_controls(vcpu, vmcs12) ||
+ nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
+ nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
+ nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
+ CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
+ return -EINVAL;
+
+ if (!nested_cpu_has_preemption_timer(vmcs12) &&
+ nested_cpu_has_save_preemption_timer(vmcs12))
+ return -EINVAL;
+
+ if (nested_cpu_has_ept(vmcs12) &&
+ CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer)))
+ return -EINVAL;
+
+ if (nested_cpu_has_vmfunc(vmcs12)) {
+ if (CC(vmcs12->vm_function_control &
+ ~vmx->nested.msrs.vmfunc_controls))
+ return -EINVAL;
+
+ if (nested_cpu_has_eptp_switching(vmcs12)) {
+ if (CC(!nested_cpu_has_ept(vmcs12)) ||
+ CC(!page_address_valid(vcpu, vmcs12->eptp_list_address)))
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Checks related to VM-Exit Control Fields
+ */
+static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (CC(!vmx_control_verify(vmcs12->vm_exit_controls,
+ vmx->nested.msrs.exit_ctls_low,
+ vmx->nested.msrs.exit_ctls_high)) ||
+ CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)))
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Checks related to VM-Entry Control Fields
+ */
+static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (CC(!vmx_control_verify(vmcs12->vm_entry_controls,
+ vmx->nested.msrs.entry_ctls_low,
+ vmx->nested.msrs.entry_ctls_high)))
+ return -EINVAL;
+
+ /*
+ * From the Intel SDM, volume 3:
+ * Fields relevant to VM-entry event injection must be set properly.
+ * These fields are the VM-entry interruption-information field, the
+ * VM-entry exception error code, and the VM-entry instruction length.
+ */
+ if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
+ u32 intr_info = vmcs12->vm_entry_intr_info_field;
+ u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
+ u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
+ bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
+ bool should_have_error_code;
+ bool urg = nested_cpu_has2(vmcs12,
+ SECONDARY_EXEC_UNRESTRICTED_GUEST);
+ bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
+
+ /* VM-entry interruption-info field: interruption type */
+ if (CC(intr_type == INTR_TYPE_RESERVED) ||
+ CC(intr_type == INTR_TYPE_OTHER_EVENT &&
+ !nested_cpu_supports_monitor_trap_flag(vcpu)))
+ return -EINVAL;
+
+ /* VM-entry interruption-info field: vector */
+ if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
+ CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
+ CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
+ return -EINVAL;
+
+ /* VM-entry interruption-info field: deliver error code */
+ should_have_error_code =
+ intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
+ x86_exception_has_error_code(vector);
+ if (CC(has_error_code != should_have_error_code))
+ return -EINVAL;
+
+ /* VM-entry exception error code */
+ if (CC(has_error_code &&
+ vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
+ return -EINVAL;
+
+ /* VM-entry interruption-info field: reserved bits */
+ if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
+ return -EINVAL;
+
+ /* VM-entry instruction length */
+ switch (intr_type) {
+ case INTR_TYPE_SOFT_EXCEPTION:
+ case INTR_TYPE_SOFT_INTR:
+ case INTR_TYPE_PRIV_SW_EXCEPTION:
+ if (CC(vmcs12->vm_entry_instruction_len > 15) ||
+ CC(vmcs12->vm_entry_instruction_len == 0 &&
+ CC(!nested_cpu_has_zero_length_injection(vcpu))))
+ return -EINVAL;
+ }
+ }
+
+ if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
+ nested_check_vm_exit_controls(vcpu, vmcs12) ||
+ nested_check_vm_entry_controls(vcpu, vmcs12))
+ return -EINVAL;
+
+ if (guest_cpuid_has_evmcs(vcpu))
+ return nested_evmcs_check_controls(vmcs12);
+
+ return 0;
+}
+
+static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+#ifdef CONFIG_X86_64
+ if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) !=
+ !!(vcpu->arch.efer & EFER_LMA)))
+ return -EINVAL;
+#endif
+ return 0;
+}
+
+static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE);
+
+ if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
+ CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
+ CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3)))
+ return -EINVAL;
+
+ if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
+ CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
+ return -EINVAL;
+
+ if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
+ CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
+ return -EINVAL;
+
+ if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+ CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
+ vmcs12->host_ia32_perf_global_ctrl)))
+ return -EINVAL;
+
+ if (ia32e) {
+ if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
+ return -EINVAL;
+ } else {
+ if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
+ CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
+ CC((vmcs12->host_rip) >> 32))
+ return -EINVAL;
+ }
+
+ if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
+ CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
+ CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
+ CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
+ CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
+ CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
+ CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
+ CC(vmcs12->host_cs_selector == 0) ||
+ CC(vmcs12->host_tr_selector == 0) ||
+ CC(vmcs12->host_ss_selector == 0 && !ia32e))
+ return -EINVAL;
+
+ if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
+ CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
+ CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
+ CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
+ CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
+ CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
+ return -EINVAL;
+
+ /*
+ * If the load IA32_EFER VM-exit control is 1, bits reserved in the
+ * IA32_EFER MSR must be 0 in the field for that register. In addition,
+ * the values of the LMA and LME bits in the field must each be that of
+ * the host address-space size VM-exit control.
+ */
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
+ if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
+ CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
+ CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
+ struct vmcs_hdr hdr;
+
+ if (vmcs12->vmcs_link_pointer == INVALID_GPA)
+ return 0;
+
+ if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
+ return -EINVAL;
+
+ if (ghc->gpa != vmcs12->vmcs_link_pointer &&
+ CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
+ vmcs12->vmcs_link_pointer, VMCS12_SIZE)))
+ return -EINVAL;
+
+ if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
+ offsetof(struct vmcs12, hdr),
+ sizeof(hdr))))
+ return -EINVAL;
+
+ if (CC(hdr.revision_id != VMCS12_REVISION) ||
+ CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Checks related to Guest Non-register State
+ */
+static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
+{
+ if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
+ vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT &&
+ vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12,
+ enum vm_entry_failure_code *entry_failure_code)
+{
+ bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE);
+
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
+
+ if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
+ CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
+ return -EINVAL;
+
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
+ CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
+ return -EINVAL;
+
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
+ CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
+ return -EINVAL;
+
+ if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
+ *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR;
+ return -EINVAL;
+ }
+
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+ CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
+ vmcs12->guest_ia32_perf_global_ctrl)))
+ return -EINVAL;
+
+ if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG))
+ return -EINVAL;
+
+ if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) ||
+ CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG)))
+ return -EINVAL;
+
+ /*
+ * If the load IA32_EFER VM-entry control is 1, the following checks
+ * are performed on the field for the IA32_EFER MSR:
+ * - Bits reserved in the IA32_EFER MSR must be 0.
+ * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
+ * the IA-32e mode guest VM-exit control. It must also be identical
+ * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
+ * CR0.PG) is 1.
+ */
+ if (to_vmx(vcpu)->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
+ if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
+ CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
+ CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
+ ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
+ return -EINVAL;
+ }
+
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
+ (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
+ CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
+ return -EINVAL;
+
+ if (nested_check_guest_non_reg_state(vmcs12))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long cr3, cr4;
+ bool vm_fail;
+
+ if (!nested_early_check)
+ return 0;
+
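+	/*
+	 * Temporarily zero the MSR autoload counts so the forced-to-fail
+	 * "probe" VMEntry below doesn't process the MSR load lists; the
+	 * counts are restored once the probe completes.
+	 */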
+ if (vmx->msr_autoload.host.nr)
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
+ if (vmx->msr_autoload.guest.nr)
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
+
+ preempt_disable();
+
+ vmx_prepare_switch_to_guest(vcpu);
+
+ /*
+ * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
+ * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
+ * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
+ * there is no need to preserve other bits or save/restore the field.
+ */
+ vmcs_writel(GUEST_RFLAGS, 0);
+
+ cr3 = __get_current_cr3_fast();
+ if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
+ vmcs_writel(HOST_CR3, cr3);
+ vmx->loaded_vmcs->host_state.cr3 = cr3;
+ }
+
+ cr4 = cr4_read_shadow();
+ if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
+ vmcs_writel(HOST_CR4, cr4);
+ vmx->loaded_vmcs->host_state.cr4 = cr4;
+ }
+
+ vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
+ __vmx_vcpu_run_flags(vmx));
+
+ if (vmx->msr_autoload.host.nr)
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+ if (vmx->msr_autoload.guest.nr)
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+
+ if (vm_fail) {
+ u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);
+
+ preempt_enable();
+
+ trace_kvm_nested_vmenter_failed(
+ "early hardware check VM-instruction error: ", error);
+ WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ return 1;
+ }
+
+ /*
+ * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
+ */
+ if (hw_breakpoint_active())
+ set_debugreg(__this_cpu_read(cpu_dr7), 7);
+ local_irq_enable();
+ preempt_enable();
+
+ /*
+ * A non-failing VMEntry means we somehow entered guest mode with
+ * an illegal RIP, and that's just the tip of the iceberg. There
+ * is no telling what memory has been modified or what state has
+ * been exposed to unknown code. Hitting this all but guarantees
+ * a (very critical) hardware issue.
+ */
+ WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
+ VMX_EXIT_REASONS_FAILED_VMENTRY));
+
+ return 0;
+}
+
+static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	/*
+	 * The hv_evmcs may end up not being mapped after migration (when
+	 * L2 was running); map it here to make sure vmcs12 changes are
+	 * properly reflected.
+	 */
+ if (guest_cpuid_has_evmcs(vcpu) &&
+ vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
+ enum nested_evmptrld_status evmptrld_status =
+ nested_vmx_handle_enlightened_vmptrld(vcpu, false);
+
+ if (evmptrld_status == EVMPTRLD_VMFAIL ||
+ evmptrld_status == EVMPTRLD_ERROR)
+ return false;
+
+		/*
+		 * Post migration, the VMCS12 always provides the most
+		 * up-to-date information; copy it to the eVMCS upon entry.
+		 */
+ vmx->nested.need_vmcs12_to_shadow_sync = true;
+ }
+
+ return true;
+}
+
+static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_host_map *map;
+
+ if (!vcpu->arch.pdptrs_from_userspace &&
+ !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
+		/*
+		 * Reload the guest's PDPTRs since after a migration the guest
+		 * CR3 might be restored prior to setting the nested state,
+		 * which can lead to loading the wrong PDPTRs.
+		 */
+ if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
+ return false;
+ }
+
+ if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+ map = &vmx->nested.apic_access_page_map;
+
+ if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) {
+ vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn));
+ } else {
+ pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n",
+ __func__);
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return false;
+ }
+ }
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
+ map = &vmx->nested.virtual_apic_map;
+
+ if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
+ } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
+ nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
+ !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+ /*
+ * The processor will never use the TPR shadow, simply
+ * clear the bit from the execution control. Such a
+ * configuration is useless, but it happens in tests.
+ * For any other configuration, failing the vm entry is
+ * _not_ what the processor does but it's basically the
+ * only possibility we have.
+ */
+ exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
+ } else {
+ /*
+ * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
+ * force VM-Entry to fail.
+ */
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA);
+ }
+ }
+
+ if (nested_cpu_has_posted_intr(vmcs12)) {
+ map = &vmx->nested.pi_desc_map;
+
+ if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
+ vmx->nested.pi_desc =
+ (struct pi_desc *)(((void *)map->hva) +
+ offset_in_page(vmcs12->posted_intr_desc_addr));
+ vmcs_write64(POSTED_INTR_DESC_ADDR,
+ pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
+ } else {
+ /*
+ * Defer the KVM_INTERNAL_EXIT until KVM tries to
+ * access the contents of the VMCS12 posted interrupt
+ * descriptor. (Note that KVM may do this when it
+ * should not, per the architectural specification.)
+ */
+ vmx->nested.pi_desc = NULL;
+ pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
+ }
+ }
+ if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
+ exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
+ else
+ exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
+
+ return true;
+}
+
+static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * Note: nested_get_evmcs_page() also updates the 'vp_assist_page'
+	 * copy in 'struct kvm_vcpu_hv' in case the eVMCS is in use; this is
+	 * mandatory for nested_evmcs_l2_tlb_flush_enabled() to work
+	 * correctly post migration.
+	 */
+ if (!nested_get_evmcs_page(vcpu)) {
+ pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
+ __func__);
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+
+ return false;
+ }
+
+ if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
+ return false;
+
+ return true;
+}
+
+static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ struct vmcs12 *vmcs12;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ gpa_t dst;
+
+ if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
+ return 0;
+
+ if (WARN_ON_ONCE(vmx->nested.pml_full))
+ return 1;
+
+ /*
+ * Check if PML is enabled for the nested guest. Whether eptp bit 6 is
+ * set is already checked as part of A/D emulation.
+ */
+ vmcs12 = get_vmcs12(vcpu);
+ if (!nested_cpu_has_pml(vmcs12))
+ return 0;
+
+ if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
+ vmx->nested.pml_full = true;
+ return 1;
+ }
+
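+	/*
+	 * Mirror hardware PML behavior: log the 4KiB-aligned GPA at the slot
+	 * selected by the current index, then decrement the index (hardware
+	 * fills the buffer from the last entry down to 0).
+	 */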
+ gpa &= ~0xFFFull;
+ dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
+
+ if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
+ offset_in_page(dst), sizeof(gpa)))
+ return 0;
+
+ vmcs12->guest_pml_index--;
+
+ return 0;
+}
+
+/*
+ * Intel's VMX Instruction Reference specifies a common set of prerequisites
+ * for running VMX instructions (except VMXON, whose prerequisites are
+ * slightly different). It also specifies what exception to inject otherwise.
+ * Note that many of these exceptions have priority over VM exits, so they
+ * don't have to be checked again here.
+ */
+static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
+{
+ if (!to_vmx(vcpu)->nested.vmxon) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 0;
+ }
+
+ if (vmx_get_cpl(vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return 0;
+ }
+
+ return 1;
+}
+
+static bool vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
+{
+ u8 rvi = vmx_get_rvi();
+ u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
+
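+ /*
+ * A virtual interrupt is deliverable if the priority class (bits 7:4)
+ * of RVI is greater than that of the virtual PPR.
+ */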
+ return ((rvi & 0xf0) > (vppr & 0xf0));
+}
+
+static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12);
+
+/*
+ * If from_vmentry is false, this is being called from state restore (either RSM
+ * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
+ *
+ * Returns:
+ * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode
+ * NVMX_VMENTRY_VMFAIL: Consistency check VMFail
+ * NVMX_VMENTRY_VMEXIT: Consistency check VMExit
+ * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error
+ */
+enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
+ bool from_vmentry)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ enum vm_entry_failure_code entry_failure_code;
+ bool evaluate_pending_interrupts;
+ union vmx_exit_reason exit_reason = {
+ .basic = EXIT_REASON_INVALID_STATE,
+ .failed_vmentry = 1,
+ };
+ u32 failed_index;
+
+ trace_kvm_nested_vmenter(kvm_rip_read(vcpu),
+ vmx->nested.current_vmptr,
+ vmcs12->guest_rip,
+ vmcs12->guest_intr_status,
+ vmcs12->vm_entry_intr_info_field,
+ vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT,
+ vmcs12->ept_pointer,
+ vmcs12->guest_cr3,
+ KVM_ISA_VMX);
+
+ kvm_service_local_tlb_flush_requests(vcpu);
+
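+ /*
+ * Note whether L1 has events that may need re-evaluation after this
+ * VM-Enter, as entering L2 can unblock IRQs, NMIs, INIT and SIPI (see
+ * the KVM_REQ_EVENT request further down).
+ */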
+ evaluate_pending_interrupts = exec_controls_get(vmx) &
+ (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
+ if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
+ evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
+ if (!evaluate_pending_interrupts)
+ evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu);
+
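+ /*
+ * Snapshot DEBUGCTL and BNDCFGS from vmcs01; when vmcs12 doesn't load
+ * its own values, prepare_vmcs02() propagates the snapshots into
+ * vmcs02 so that L2 starts with L1's values.
+ */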
+ if (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
+ vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ if (kvm_mpx_supported() &&
+ (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
+ vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+
+ /*
+ * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
+ * nested early checks are disabled. In the event of a "late" VM-Fail,
+ * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
+ * software model to the pre-VMEntry host state. When EPT is disabled,
+ * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
+ * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing
+ * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
+ * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested
+ * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
+ * guaranteed to be overwritten with a shadow CR3 prior to re-entering
+ * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
+ * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
+ * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
+ * path would need to manually save/restore vmcs01.GUEST_CR3.
+ */
+ if (!enable_ept && !nested_early_check)
+ vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
+
+ vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
+
+ prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);
+
+ if (from_vmentry) {
+ if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+ return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
+ }
+
+ if (nested_vmx_check_vmentry_hw(vcpu)) {
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+ return NVMX_VMENTRY_VMFAIL;
+ }
+
+ if (nested_vmx_check_guest_state(vcpu, vmcs12,
+ &entry_failure_code)) {
+ exit_reason.basic = EXIT_REASON_INVALID_STATE;
+ vmcs12->exit_qualification = entry_failure_code;
+ goto vmentry_fail_vmexit;
+ }
+ }
+
+ enter_guest_mode(vcpu);
+
+ if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) {
+ exit_reason.basic = EXIT_REASON_INVALID_STATE;
+ vmcs12->exit_qualification = entry_failure_code;
+ goto vmentry_fail_vmexit_guest_mode;
+ }
+
+ if (from_vmentry) {
+ failed_index = nested_vmx_load_msr(vcpu,
+ vmcs12->vm_entry_msr_load_addr,
+ vmcs12->vm_entry_msr_load_count);
+ if (failed_index) {
+ exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
+ vmcs12->exit_qualification = failed_index;
+ goto vmentry_fail_vmexit_guest_mode;
+ }
+ } else {
+ /*
+ * The MMU is not initialized to point at the right entities yet and
+ * "get pages" would need to read data from the guest (i.e. we will
+ * need to perform gpa to hpa translation). Request a call
+ * to nested_get_vmcs12_pages before the next VM-entry. The MSRs
+ * have already been set at vmentry time and should not be reset.
+ */
+ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+ }
+
+ /*
+ * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI
+ * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can
+ * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit
+ * unconditionally.
+ */
+ if (unlikely(evaluate_pending_interrupts))
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
+ * Do not start the preemption timer hrtimer until after we know
+ * we are successful, so that only nested_vmx_vmexit needs to cancel
+ * the timer.
+ */
+ vmx->nested.preemption_timer_expired = false;
+ if (nested_cpu_has_preemption_timer(vmcs12)) {
+ u64 timer_value = vmx_calc_preemption_timer_value(vcpu);
+ vmx_start_preemption_timer(vcpu, timer_value);
+ }
+
+ /*
+ * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
+ * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
+ * returned as far as L1 is concerned. It will only return (and set
+ * the success flag) when L2 exits (see nested_vmx_vmexit()).
+ */
+ return NVMX_VMENTRY_SUCCESS;
+
+ /*
+ * A failed consistency check that leads to a VMExit during L1's
+ * VMEnter to L2 is a variation of a normal VMExit, as explained in
+ * 26.7 "VM-entry failures during or after loading guest state".
+ */
+vmentry_fail_vmexit_guest_mode:
+ if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
+ vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
+ leave_guest_mode(vcpu);
+
+vmentry_fail_vmexit:
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+
+ if (!from_vmentry)
+ return NVMX_VMENTRY_VMEXIT;
+
+ load_vmcs12_host_state(vcpu, vmcs12);
+ vmcs12->vm_exit_reason = exit_reason.full;
+ if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
+ vmx->nested.need_vmcs12_to_shadow_sync = true;
+ return NVMX_VMENTRY_VMEXIT;
+}
+
+/*
+ * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
+ * for running an L2 nested guest.
+ */
+static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
+{
+ struct vmcs12 *vmcs12;
+ enum nvmx_vmentry_status status;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
+ enum nested_evmptrld_status evmptrld_status;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
+ if (evmptrld_status == EVMPTRLD_ERROR) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+
+ if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
+ return nested_vmx_failInvalid(vcpu);
+
+ if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) &&
+ vmx->nested.current_vmptr == INVALID_GPA))
+ return nested_vmx_failInvalid(vcpu);
+
+ vmcs12 = get_vmcs12(vcpu);
+
+ /*
+ * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
+ * that there *is* a valid VMCS pointer, RFLAGS.CF is set
+ * rather than RFLAGS.ZF, and no error number is stored to the
+ * VM-instruction error field.
+ */
+ if (CC(vmcs12->hdr.shadow_vmcs))
+ return nested_vmx_failInvalid(vcpu);
+
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
+ copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields);
+ /* Enlightened VMCS doesn't have launch state */
+ vmcs12->launch_state = !launch;
+ } else if (enable_shadow_vmcs) {
+ copy_shadow_to_vmcs12(vmx);
+ }
+
+ /*
+ * The nested entry process starts with enforcing various prerequisites
+ * on vmcs12 as required by the Intel SDM, and to act appropriately when
+ * they fail: as the SDM explains, some conditions should cause the
+ * instruction to fail, while others will cause the instruction to seem
+ * to succeed, but return an EXIT_REASON_INVALID_STATE.
+ *
+ * To speed up the normal (success) code path, we should avoid checking
+ * for misconfigurations which will anyway be caught by the processor
+ * when using the merged vmcs02.
+ */
+ if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
+
+ if (CC(vmcs12->launch_state == launch))
+ return nested_vmx_fail(vcpu,
+ launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
+ : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
+
+ if (nested_vmx_check_controls(vcpu, vmcs12))
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+
+ if (nested_vmx_check_address_space_size(vcpu, vmcs12))
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
+
+ if (nested_vmx_check_host_state(vcpu, vmcs12))
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
+
+ /*
+ * We're finally done with prerequisite checking, and can start with
+ * the nested entry.
+ */
+ vmx->nested.nested_run_pending = 1;
+ vmx->nested.has_preemption_timer_deadline = false;
+ status = nested_vmx_enter_non_root_mode(vcpu, true);
+ if (unlikely(status != NVMX_VMENTRY_SUCCESS))
+ goto vmentry_failed;
+
+ /* Emulate processing of posted interrupts on VM-Enter. */
+ if (nested_cpu_has_posted_intr(vmcs12) &&
+ kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) {
+ vmx->nested.pi_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv);
+ }
+
+ /* Hide L1D cache contents from the nested guest. */
+ vmx->vcpu.arch.l1tf_flush_l1d = true;
+
+ /*
+ * Must happen outside of nested_vmx_enter_non_root_mode() as it will
+ * also be used as part of restoring nVMX state for
+ * snapshot restore (migration).
+ *
+ * In this flow, it is assumed that the vmcs12 cache was
+ * transferred as part of the captured nVMX state and should
+ * therefore not be read from guest memory (which may not
+ * exist on destination host yet).
+ */
+ nested_cache_shadow_vmcs12(vcpu, vmcs12);
+
+ switch (vmcs12->guest_activity_state) {
+ case GUEST_ACTIVITY_HLT:
+ /*
+ * If we're entering a halted L2 vcpu and the L2 vcpu won't be
+ * awakened by event injection or by an NMI-window VM-exit or
+ * by an interrupt-window VM-exit, halt the vcpu.
+ */
+ if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
+ !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) &&
+ !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
+ (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
+ vmx->nested.nested_run_pending = 0;
+ return kvm_emulate_halt_noskip(vcpu);
+ }
+ break;
+ case GUEST_ACTIVITY_WAIT_SIPI:
+ vmx->nested.nested_run_pending = 0;
+ vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+ break;
+ default:
+ break;
+ }
+
+ return 1;
+
+vmentry_failed:
+ vmx->nested.nested_run_pending = 0;
+ if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
+ return 0;
+ if (status == NVMX_VMENTRY_VMEXIT)
+ return 1;
+ WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+}
+
+/*
+ * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
+ * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
+ * This function returns the new value we should put in vmcs12.guest_cr0.
+ * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
+ * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
+ * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
+ * didn't trap the bit, because if L1 did, so would L0).
+ * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
+ * been modified by L2, and L1 knows it. So just leave the old value of
+ * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
+ * isn't relevant, because if L0 traps this bit it can set it to anything.
+ * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
+ * changed these bits, and therefore they need to be updated, but L0
+ * didn't necessarily allow them to be changed in GUEST_CR0 - and rather
+ * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
+ */
+static inline unsigned long
+vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ return
+ /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
+ /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
+ /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
+ vcpu->arch.cr0_guest_owned_bits));
+}
+
+static inline unsigned long
+vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ return
+ /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
+ /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
+ /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
+ vcpu->arch.cr4_guest_owned_bits));
+}
+
+static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12,
+ u32 vm_exit_reason, u32 exit_intr_info)
+{
+ u32 idt_vectoring;
+ unsigned int nr;
+
+ /*
+ * Per the SDM, VM-Exits due to double and triple faults are never
+ * considered to occur during event delivery, even if the double/triple
+ * fault is the result of an escalating vectoring issue.
+ *
+ * Note, the SDM qualifies the double fault behavior with "The original
+ * event results in a double-fault exception". It's unclear why the
+ * qualification exists since exits due to double fault can occur only
+ * while vectoring a different exception (injected events are never
+ * subject to interception), i.e. there's _always_ an original event.
+ *
+ * The SDM also uses NMI as a confusing example for the "original event
+ * causes the VM exit directly" clause. NMI isn't special in any way,
+ * the same rule applies to all events that cause an exit directly.
+ * NMI is an odd choice for the example because NMIs can only occur on
+ * instruction boundaries, i.e. they _can't_ occur during vectoring.
+ */
+ if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT ||
+ ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI &&
+ is_double_fault(exit_intr_info))) {
+ vmcs12->idt_vectoring_info_field = 0;
+ } else if (vcpu->arch.exception.injected) {
+ nr = vcpu->arch.exception.vector;
+ idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
+
+ if (kvm_exception_is_soft(nr)) {
+ vmcs12->vm_exit_instruction_len =
+ vcpu->arch.event_exit_inst_len;
+ idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
+ } else
+ idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
+
+ if (vcpu->arch.exception.has_error_code) {
+ idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
+ vmcs12->idt_vectoring_error_code =
+ vcpu->arch.exception.error_code;
+ }
+
+ vmcs12->idt_vectoring_info_field = idt_vectoring;
+ } else if (vcpu->arch.nmi_injected) {
+ vmcs12->idt_vectoring_info_field =
+ INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
+ } else if (vcpu->arch.interrupt.injected) {
+ nr = vcpu->arch.interrupt.nr;
+ idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
+
+ if (vcpu->arch.interrupt.soft) {
+ idt_vectoring |= INTR_TYPE_SOFT_INTR;
+ vmcs12->vm_entry_instruction_len =
+ vcpu->arch.event_exit_inst_len;
+ } else
+ idt_vectoring |= INTR_TYPE_EXT_INTR;
+
+ vmcs12->idt_vectoring_info_field = idt_vectoring;
+ } else {
+ vmcs12->idt_vectoring_info_field = 0;
+ }
+}
+
+void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ gfn_t gfn;
+
+ /*
+ * Don't need to mark the APIC access page dirty; it is never
+ * written to by the CPU during APIC virtualization.
+ */
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
+ gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ }
+
+ if (nested_cpu_has_posted_intr(vmcs12)) {
+ gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ }
+}
+
+static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int max_irr;
+ void *vapic_page;
+ u16 status;
+
+ if (!vmx->nested.pi_pending)
+ return 0;
+
+ if (!vmx->nested.pi_desc)
+ goto mmio_needed;
+
+ vmx->nested.pi_pending = false;
+
+ if (!pi_test_and_clear_on(vmx->nested.pi_desc))
+ return 0;
+
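+ /*
+ * Scan the 256-bit PIR for the highest-priority pending vector;
+ * find_last_bit() returns 256 if no bits are set.
+ */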
+ max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
+ if (max_irr != 256) {
+ vapic_page = vmx->nested.virtual_apic_map.hva;
+ if (!vapic_page)
+ goto mmio_needed;
+
+ __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
+ vapic_page, &max_irr);
+ status = vmcs_read16(GUEST_INTR_STATUS);
+ if ((u8)max_irr > ((u8)status & 0xff)) {
+ status &= ~0xff;
+ status |= (u8)max_irr;
+ vmcs_write16(GUEST_INTR_STATUS, status);
+ }
+ }
+
+ nested_mark_vmcs12_pages_dirty(vcpu);
+ return 0;
+
+mmio_needed:
+ kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL);
+ return -ENXIO;
+}
+
+static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
+ u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned long exit_qual;
+
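+ /*
+ * The exit qualification carries the faulting address for #PF and a
+ * DR6-like value for #DB; synthesize it from CR2/DR6 when the
+ * exception has no explicit payload.
+ */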
+ if (ex->has_payload) {
+ exit_qual = ex->payload;
+ } else if (ex->vector == PF_VECTOR) {
+ exit_qual = vcpu->arch.cr2;
+ } else if (ex->vector == DB_VECTOR) {
+ exit_qual = vcpu->arch.dr6;
+ exit_qual &= ~DR6_BT;
+ exit_qual ^= DR6_ACTIVE_LOW;
+ } else {
+ exit_qual = 0;
+ }
+
+ /*
+ * Unlike AMD's Paged Real Mode, which reports an error code on #PF
+ * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the
+ * "has error code" flags on VM-Exit if the CPU is in Real Mode.
+ */
+ if (ex->has_error_code && is_protmode(vcpu)) {
+ /*
+ * Intel CPUs do not generate error codes with bits 31:16 set,
+ * and more importantly VMX disallows setting bits 31:16 in the
+ * injected error code for VM-Entry. Drop the bits to mimic
+ * hardware and avoid inducing failure on nested VM-Entry if L1
+ * chooses to inject the exception back to L2. AMD CPUs _do_
+ * generate "full" 32-bit error codes, so KVM allows userspace
+ * to inject exception error codes with bits 31:16 set.
+ */
+ vmcs12->vm_exit_intr_error_code = (u16)ex->error_code;
+ intr_info |= INTR_INFO_DELIVER_CODE_MASK;
+ }
+
+ if (kvm_exception_is_soft(ex->vector))
+ intr_info |= INTR_TYPE_SOFT_EXCEPTION;
+ else
+ intr_info |= INTR_TYPE_HARD_EXCEPTION;
+
+ if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
+ vmx_get_nmi_mask(vcpu))
+ intr_info |= INTR_INFO_UNBLOCK_NMI;
+
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
+}
+
+/*
+ * Returns the pending #DB payload if a debug trap is (likely) pending
+ * delivery, and zero otherwise. Infer the class
+ * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6).
+ * Using the payload is flawed because code breakpoints (fault-like) and data
+ * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e.
+ * this will return false positives if a to-be-injected code breakpoint #DB is
+ * pending (from KVM's perspective, but not "pending" across an instruction
+ * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it
+ * too is trap-like.
+ *
+ * KVM "works" despite these flaws as ICEBP isn't currently supported by the
+ * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the
+ * #DB has already happened), and MTF isn't marked pending on code breakpoints
+ * from the emulator (because such #DBs are fault-like and thus don't trigger
+ * actions that fire on instruction retire).
+ */
+static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex)
+{
+ if (!ex->pending || ex->vector != DB_VECTOR)
+ return 0;
+
+ /* General Detect #DBs are always fault-like. */
+ return ex->payload & ~DR6_BD;
+}
+
+/*
+ * Returns true if there's a pending #DB exception that is lower priority than
+ * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by
+ * KVM, but could theoretically be injected by userspace. Note, this code is
+ * imperfect, see above.
+ */
+static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex)
+{
+ return vmx_get_pending_dbg_trap(ex) & ~DR6_BT;
+}
+
+/*
+ * Certain VM-exits set the 'pending debug exceptions' field to indicate a
+ * recognized #DB (data or single-step) that has yet to be delivered. Since KVM
+ * represents these debug traps with a payload that is said to be compatible
+ * with the 'pending debug exceptions' field, write the payload to the VMCS
+ * field if a VM-exit is delivered before the debug trap.
+ */
+static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
+{
+ unsigned long pending_dbg;
+
+ pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception);
+ if (pending_dbg)
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg);
+}
+
+static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
+{
+ return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
+ to_vmx(vcpu)->nested.preemption_timer_expired;
+}
+
+static bool vmx_has_nested_events(struct kvm_vcpu *vcpu)
+{
+ return nested_vmx_preemption_timer_pending(vcpu) ||
+ to_vmx(vcpu)->nested.mtf_pending;
+}
+
+/*
+ * Per the Intel SDM's table "Priority Among Concurrent Events", with minor
+ * edits to fill in missing examples, e.g. #DB due to split-lock accesses,
+ * and less minor edits to splice in the priority of VMX Non-Root specific
+ * events, e.g. MTF and NMI/INTR-window exiting.
+ *
+ * 1 Hardware Reset and Machine Checks
+ * - RESET
+ * - Machine Check
+ *
+ * 2 Trap on Task Switch
+ * - T flag in TSS is set (on task switch)
+ *
+ * 3 External Hardware Interventions
+ * - FLUSH
+ * - STOPCLK
+ * - SMI
+ * - INIT
+ *
+ * 3.5 Monitor Trap Flag (MTF) VM-exit[1]
+ *
+ * 4 Traps on Previous Instruction
+ * - Breakpoints
+ * - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O
+ * breakpoint, or #DB due to a split-lock access)
+ *
+ * 4.3 VMX-preemption timer expired VM-exit[2]
+ *
+ * 4.6 NMI-window exiting VM-exit[3]
+ *
+ * 5 Nonmaskable Interrupts (NMI)
+ *
+ * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery[4]
+ *
+ * 6 Maskable Hardware Interrupts
+ *
+ * 7 Code Breakpoint Fault
+ *
+ * 8 Faults from Fetching Next Instruction
+ * - Code-Segment Limit Violation
+ * - Code Page Fault
+ * - Control protection exception (missing ENDBRANCH at target of indirect
+ * call or jump)
+ *
+ * 9 Faults from Decoding Next Instruction
+ * - Instruction length > 15 bytes
+ * - Invalid Opcode
+ * - Coprocessor Not Available
+ *
+ *10 Faults on Executing Instruction
+ * - Overflow
+ * - Bound error
+ * - Invalid TSS
+ * - Segment Not Present
+ * - Stack fault
+ * - General Protection
+ * - Data Page Fault
+ * - Alignment Check
+ * - x86 FPU Floating-point exception
+ * - SIMD floating-point exception
+ * - Virtualization exception
+ * - Control protection exception
+ *
+ * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs),
+ * INIT signals, and higher priority events take priority over MTF VM exits.
+ * MTF VM exits take priority over debug-trap exceptions and lower priority
+ * events.
+ *
+ * [2] Debug-trap exceptions and higher priority events take priority over VM exits
+ * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption
+ * timer take priority over VM exits caused by the "NMI-window exiting"
+ * VM-execution control and lower priority events.
+ *
+ * [3] Debug-trap exceptions and higher priority events take priority over VM exits
+ * caused by "NMI-window exiting". VM exits caused by this control take
+ * priority over non-maskable interrupts (NMIs) and lower priority events.
+ *
+ * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to
+ * the 1-setting of the "interrupt-window exiting" VM-execution control. Thus,
+ * non-maskable interrupts (NMIs) and higher priority events take priority over
+ * delivery of a virtual interrupt; delivery of a virtual interrupt takes
+ * priority over external interrupts and lower priority events.
+ */
+static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ /*
+ * Only a pending nested run blocks a pending exception. If there is a
+ * previously injected event, the pending exception occurred while said
+ * event was being delivered and thus needs to be handled.
+ */
+ bool block_nested_exceptions = vmx->nested.nested_run_pending;
+ /*
+ * New events (not exceptions) are only recognized at instruction
+ * boundaries. If an event needs reinjection, then KVM is handling a
+ * VM-Exit that occurred _during_ instruction execution; new events are
+ * blocked until the instruction completes.
+ */
+ bool block_nested_events = block_nested_exceptions ||
+ kvm_event_needs_reinjection(vcpu);
+
+ if (lapic_in_kernel(vcpu) &&
+ test_bit(KVM_APIC_INIT, &apic->pending_events)) {
+ if (block_nested_events)
+ return -EBUSY;
+ nested_vmx_update_pending_dbg(vcpu);
+ clear_bit(KVM_APIC_INIT, &apic->pending_events);
+ if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
+ nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
+
+ /* MTF is discarded if the vCPU is in WFS. */
+ vmx->nested.mtf_pending = false;
+ return 0;
+ }
+
+ if (lapic_in_kernel(vcpu) &&
+ test_bit(KVM_APIC_SIPI, &apic->pending_events)) {
+ if (block_nested_events)
+ return -EBUSY;
+
+ clear_bit(KVM_APIC_SIPI, &apic->pending_events);
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+ nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
+ apic->sipi_vector & 0xFFUL);
+ return 0;
+ }
+ /* Fallthrough, the SIPI is completely ignored. */
+ }
+
+ /*
+ * Process exceptions that are higher priority than Monitor Trap Flag:
+ * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but
+ * could theoretically come in from userspace), and ICEBP (INT1).
+ *
+ * TODO: SMIs have higher priority than MTF and trap-like #DBs (except
+ * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF
+ * across SMI/RSM as it should; that needs to be addressed in order to
+ * prioritize SMI over MTF and trap-like #DBs.
+ */
+ if (vcpu->arch.exception_vmexit.pending &&
+ !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) {
+ if (block_nested_exceptions)
+ return -EBUSY;
+
+ nested_vmx_inject_exception_vmexit(vcpu);
+ return 0;
+ }
+
+ if (vcpu->arch.exception.pending &&
+ !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) {
+ if (block_nested_exceptions)
+ return -EBUSY;
+ goto no_vmexit;
+ }
+
+ if (vmx->nested.mtf_pending) {
+ if (block_nested_events)
+ return -EBUSY;
+ nested_vmx_update_pending_dbg(vcpu);
+ nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
+ return 0;
+ }
+
+ if (vcpu->arch.exception_vmexit.pending) {
+ if (block_nested_exceptions)
+ return -EBUSY;
+
+ nested_vmx_inject_exception_vmexit(vcpu);
+ return 0;
+ }
+
+ if (vcpu->arch.exception.pending) {
+ if (block_nested_exceptions)
+ return -EBUSY;
+ goto no_vmexit;
+ }
+
+ if (nested_vmx_preemption_timer_pending(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
+ return 0;
+ }
+
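+ /*
+ * SMIs are handled entirely by L0; KVM doesn't support the VMX
+ * dual-monitor treatment, so there is never an SMI VM-Exit to
+ * synthesize for L1.
+ */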
+ if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ goto no_vmexit;
+ }
+
+ if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ if (!nested_exit_on_nmi(vcpu))
+ goto no_vmexit;
+
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
+ NMI_VECTOR | INTR_TYPE_NMI_INTR |
+ INTR_INFO_VALID_MASK, 0);
+ /*
+ * The NMI-triggered VM exit counts as injection:
+ * clear this one and block further NMIs.
+ */
+ vcpu->arch.nmi_pending = 0;
+ vmx_set_nmi_mask(vcpu, true);
+ return 0;
+ }
+
+ if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ if (!nested_exit_on_intr(vcpu))
+ goto no_vmexit;
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+ return 0;
+ }
+
+no_vmexit:
+ return vmx_complete_nested_posted_interrupt(vcpu);
+}
+
+static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
+{
+ ktime_t remaining =
+ hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
+ u64 value;
+
+ if (ktime_to_ns(remaining) <= 0)
+ return 0;
+
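+ /*
+ * Convert the remaining ns to TSC ticks, then scale down to VMX
+ * preemption timer units (the emulated timer ticks at the TSC rate
+ * divided by 2^VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE).
+ */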
+ value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
+ do_div(value, 1000000);
+ return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+}
+
+static bool is_vmcs12_ext_field(unsigned long field)
+{
+ switch (field) {
+ case GUEST_ES_SELECTOR:
+ case GUEST_CS_SELECTOR:
+ case GUEST_SS_SELECTOR:
+ case GUEST_DS_SELECTOR:
+ case GUEST_FS_SELECTOR:
+ case GUEST_GS_SELECTOR:
+ case GUEST_LDTR_SELECTOR:
+ case GUEST_TR_SELECTOR:
+ case GUEST_ES_LIMIT:
+ case GUEST_CS_LIMIT:
+ case GUEST_SS_LIMIT:
+ case GUEST_DS_LIMIT:
+ case GUEST_FS_LIMIT:
+ case GUEST_GS_LIMIT:
+ case GUEST_LDTR_LIMIT:
+ case GUEST_TR_LIMIT:
+ case GUEST_GDTR_LIMIT:
+ case GUEST_IDTR_LIMIT:
+ case GUEST_ES_AR_BYTES:
+ case GUEST_DS_AR_BYTES:
+ case GUEST_FS_AR_BYTES:
+ case GUEST_GS_AR_BYTES:
+ case GUEST_LDTR_AR_BYTES:
+ case GUEST_TR_AR_BYTES:
+ case GUEST_ES_BASE:
+ case GUEST_CS_BASE:
+ case GUEST_SS_BASE:
+ case GUEST_DS_BASE:
+ case GUEST_FS_BASE:
+ case GUEST_GS_BASE:
+ case GUEST_LDTR_BASE:
+ case GUEST_TR_BASE:
+ case GUEST_GDTR_BASE:
+ case GUEST_IDTR_BASE:
+ case GUEST_PENDING_DBG_EXCEPTIONS:
+ case GUEST_BNDCFGS:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+ vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+ vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+ vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+ vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+ vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+ vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+ vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+ vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+ vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+ vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+ vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+ vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+ vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+ vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+ vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+ vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+ vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+ vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+ vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+ vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+ vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+ vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+ vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+ vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+ vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+ vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+ vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+ vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+ vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+ vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+ vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+ vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+ vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+ vmcs12->guest_pending_dbg_exceptions =
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+
+ vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
+}
+
+static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int cpu;
+
+ if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
+ return;
+
+ WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
+
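+ /*
+ * Temporarily switch to vmcs02 so that its rarely-accessed fields can
+ * be read with VMREAD, then switch back to vmcs01.
+ */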
+ cpu = get_cpu();
+ vmx->loaded_vmcs = &vmx->nested.vmcs02;
+ vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01);
+
+ sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+
+ vmx->loaded_vmcs = &vmx->vmcs01;
+ vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02);
+ put_cpu();
+}
+
+/*
+ * Update the guest state fields of vmcs12 to reflect changes that
+ * occurred while L2 was running. (The "IA-32e mode guest" bit of the
+ * VM-entry controls is also updated, since this is really a guest
+ * state bit.)
+ */
+static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
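+ /*
+ * When an enlightened VMCS is in use the rarely-accessed fields must
+ * be synced right away; otherwise the expensive sync is deferred
+ * until the fields are actually needed.
+ */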
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
+ sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+
+ vmx->nested.need_sync_vmcs02_to_vmcs12_rare =
+ !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr);
+
+ vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
+ vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
+
+ vmcs12->guest_rsp = kvm_rsp_read(vcpu);
+ vmcs12->guest_rip = kvm_rip_read(vcpu);
+ vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+
+ vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+ vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+
+ vmcs12->guest_interruptibility_info =
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+
+ if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+ vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
+ else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
+ vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI;
+ else
+ vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
+
+ if (nested_cpu_has_preemption_timer(vmcs12) &&
+ vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER &&
+ !vmx->nested.nested_run_pending)
+ vmcs12->vmx_preemption_timer_value =
+ vmx_get_preemption_timer_value(vcpu);
+
+ /*
+ * In some cases (usually, nested EPT), L2 is allowed to change its
+ * own CR3 without exiting. If it has changed it, we must keep it.
+ * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
+ * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
+ *
+ * Additionally, restore L2's PDPTR to vmcs12.
+ */
+ if (enable_ept) {
+ vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
+ if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
+ vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+ vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+ vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+ vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+ }
+ }
+
+ vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+
+ if (nested_cpu_has_vid(vmcs12))
+ vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
+
+ vmcs12->vm_entry_controls =
+ (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
+ (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
+
+ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
+ kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
+
+ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
+ vmcs12->guest_ia32_efer = vcpu->arch.efer;
+}
+
+/*
+ * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
+ * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
+ * and this function updates it to reflect the changes to the guest state while
+ * L2 was running (and perhaps made some exits which were handled directly by L0
+ * without going back to L1), and to reflect the exit reason.
+ * Note that we do not have to copy all VMCS fields here, just those that
+ * could have been changed by the L2 guest or the exit - i.e., the guest-state and
+ * exit-information fields only. Other fields are modified by L1 with VMWRITE,
+ * which already writes to vmcs12 directly.
+ */
+static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ u32 vm_exit_reason, u32 exit_intr_info,
+ unsigned long exit_qualification)
+{
+ /* update exit information fields: */
+ vmcs12->vm_exit_reason = vm_exit_reason;
+ if (to_vmx(vcpu)->exit_reason.enclave_mode)
+ vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
+ vmcs12->exit_qualification = exit_qualification;
+
+ /*
+ * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched
+ * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other
+ * exit info fields are unmodified.
+ */
+ if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
+ vmcs12->launch_state = 1;
+
+ /*
+ * vm_entry_intr_info_field is cleared on exit. Emulate this
+ * instead of reading the real value.
+ */
+ vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
+
+ /*
+ * Transfer the event that L0 or L1 may have wanted to inject into
+ * L2 to IDT_VECTORING_INFO_FIELD.
+ */
+ vmcs12_save_pending_event(vcpu, vmcs12,
+ vm_exit_reason, exit_intr_info);
+
+ vmcs12->vm_exit_intr_info = exit_intr_info;
+ vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+
+ /*
+ * According to spec, there's no need to store the guest's
+ * MSRs if the exit is due to a VM-entry failure that occurs
+ * during or after loading the guest state. Since this exit
+ * does not fall in that category, we need to save the MSRs.
+ */
+ if (nested_vmx_store_msr(vcpu,
+ vmcs12->vm_exit_msr_store_addr,
+ vmcs12->vm_exit_msr_store_count))
+ nested_vmx_abort(vcpu,
+ VMX_ABORT_SAVE_GUEST_MSR_FAIL);
+ }
+}
+
+/*
+ * Part of what we need to do when the nested L2 guest exits and we want to
+ * run its L1 parent is to reset L1's guest state to the host state specified
+ * in vmcs12.
+ * This function is to be called not only on normal nested exit, but also on
+ * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
+ * Failures During or After Loading Guest State").
+ * This function should be called when the active VMCS is L1's (vmcs01).
+ */
+static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ enum vm_entry_failure_code ignored;
+ struct kvm_segment seg;
+
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
+ vcpu->arch.efer = vmcs12->host_ia32_efer;
+ else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
+ vcpu->arch.efer |= (EFER_LMA | EFER_LME);
+ else
+ vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
+ vmx_set_efer(vcpu, vcpu->arch.efer);
+
+ kvm_rsp_write(vcpu, vmcs12->host_rsp);
+ kvm_rip_write(vcpu, vmcs12->host_rip);
+ vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
+ vmx_set_interrupt_shadow(vcpu, 0);
+
+ /*
+ * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
+ * actually changed, because vmx_set_cr0 refers to efer set above.
+ *
+ * CR0_GUEST_HOST_MASK is already set in the original vmcs01
+ * (KVM doesn't change it).
+ */
+ vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
+ vmx_set_cr0(vcpu, vmcs12->host_cr0);
+
+ /* Same as above - no reason to call set_cr4_guest_host_mask(). */
+ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
+ vmx_set_cr4(vcpu, vmcs12->host_cr4);
+
+ nested_ept_uninit_mmu_context(vcpu);
+
+ /*
+ * Only PDPTE load can fail as the value of cr3 was checked on entry and
+ * couldn't have changed.
+ */
+ if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored))
+ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
+
+ nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
+
+ vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
+ vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
+ vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
+ vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
+ vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
+ vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
+ vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
+
+ /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
+ if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
+ vmcs_write64(GUEST_BNDCFGS, 0);
+
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
+ vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
+ vcpu->arch.pat = vmcs12->host_ia32_pat;
+ }
+ if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+ kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)))
+ WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
+ vmcs12->host_ia32_perf_global_ctrl));
+
+ /*
+ * Set L1 segment info according to Intel SDM 27.5.2, "Loading Host
+ * Segment and Descriptor-Table Registers".
+ */
+ seg = (struct kvm_segment) {
+ .base = 0,
+ .limit = 0xFFFFFFFF,
+ .selector = vmcs12->host_cs_selector,
+ .type = 11,
+ .present = 1,
+ .s = 1,
+ .g = 1
+ };
+ if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
+ seg.l = 1;
+ else
+ seg.db = 1;
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
+ seg = (struct kvm_segment) {
+ .base = 0,
+ .limit = 0xFFFFFFFF,
+ .type = 3,
+ .present = 1,
+ .s = 1,
+ .db = 1,
+ .g = 1
+ };
+ seg.selector = vmcs12->host_ds_selector;
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
+ seg.selector = vmcs12->host_es_selector;
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
+ seg.selector = vmcs12->host_ss_selector;
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
+ seg.selector = vmcs12->host_fs_selector;
+ seg.base = vmcs12->host_fs_base;
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
+ seg.selector = vmcs12->host_gs_selector;
+ seg.base = vmcs12->host_gs_base;
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
+ seg = (struct kvm_segment) {
+ .base = vmcs12->host_tr_base,
+ .limit = 0x67,
+ .selector = vmcs12->host_tr_selector,
+ .type = 11,
+ .present = 1
+ };
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
+
+ memset(&seg, 0, sizeof(seg));
+ seg.unusable = 1;
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
+
+ kvm_set_dr(vcpu, 7, 0x400);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+ if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
+ vmcs12->vm_exit_msr_load_count))
+ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
+
+ to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
+}
+
+static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
+{
+ struct vmx_uret_msr *efer_msr;
+ unsigned int i;
+
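+ /*
+ * Reconstruct L1's EFER, which may live in the VMCS itself, in the
+ * MSR autoload list, or in the user-return MSR slot, depending on how
+ * vmcs01 was configured; fall back to the host's EFER.
+ */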
+ if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
+ return vmcs_read64(GUEST_IA32_EFER);
+
+ if (cpu_has_load_ia32_efer())
+ return host_efer;
+
+ for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
+ if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
+ return vmx->msr_autoload.guest.val[i].value;
+ }
+
+ efer_msr = vmx_find_uret_msr(vmx, MSR_EFER);
+ if (efer_msr)
+ return efer_msr->data;
+
+ return host_efer;
+}
+
+static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmx_msr_entry g, h;
+ gpa_t gpa;
+ u32 i, j;
+
+ vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
+
+ if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
+ /*
+ * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
+ * as vmcs01.GUEST_DR7 contains a userspace defined value
+ * and vcpu->arch.dr7 is not squirreled away before the
+ * nested VMENTER (not worth adding a variable in nested_vmx).
+ */
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ kvm_set_dr(vcpu, 7, DR7_FIXED_1);
+ else
+ WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
+ }
+
+ /*
+ * Note that calling vmx_set_{efer,cr0,cr4} is important as they
+ * handle a variety of side effects to KVM's software model.
+ */
+ vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
+
+ vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
+ vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
+
+ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
+ vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
+
+ nested_ept_uninit_mmu_context(vcpu);
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
+
+ /*
+ * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
+ * from vmcs01 (if necessary). The PDPTRs are not loaded on
+ * VMFail; like everything else, we just need to ensure our
+ * software model is up-to-date.
+ */
+ if (enable_ept && is_pae_paging(vcpu))
+ ept_save_pdptrs(vcpu);
+
+ kvm_mmu_reset_context(vcpu);
+
+ /*
+ * This nasty bit of open coding is a compromise between blindly
+ * loading L1's MSRs using the exit load lists (incorrect emulation
+ * of VMFail), leaving the nested VM's MSRs in the software model
+ * (incorrect behavior) and snapshotting the modified MSRs (too
+ * expensive since the lists are unbounded by hardware). For each
+ * MSR that was (prematurely) loaded from the nested VMEntry load
+ * list, reload it from the exit load list if it exists and differs
+ * from the guest value. The intent is to stuff host state as
+ * silently as possible, not to fully process the exit load list.
+ */
+ for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
+ gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
+ if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
+ pr_debug_ratelimited(
+ "%s read MSR index failed (%u, 0x%08llx)\n",
+ __func__, i, gpa);
+ goto vmabort;
+ }
+
+ for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
+ gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
+ if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
+ pr_debug_ratelimited(
+ "%s read MSR failed (%u, 0x%08llx)\n",
+ __func__, j, gpa);
+ goto vmabort;
+ }
+ if (h.index != g.index)
+ continue;
+ if (h.value == g.value)
+ break;
+
+ if (nested_vmx_load_msr_check(vcpu, &h)) {
+ pr_debug_ratelimited(
+ "%s check failed (%u, 0x%x, 0x%x)\n",
+ __func__, j, h.index, h.reserved);
+ goto vmabort;
+ }
+
+ if (kvm_set_msr(vcpu, h.index, h.value)) {
+ pr_debug_ratelimited(
+ "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
+ __func__, j, h.index, h.value);
+ goto vmabort;
+ }
+ }
+ }
+
+ return;
+
+vmabort:
+ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
+}
+
+/*
+ * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
+ * and modify vmcs12 to make it see what it would expect to see there if
+ * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
+ */
+void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
+ u32 exit_intr_info, unsigned long exit_qualification)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ /* Pending MTF traps are discarded on VM-Exit. */
+ vmx->nested.mtf_pending = false;
+
+ /* trying to cancel vmlaunch/vmresume is a bug */
+ WARN_ON_ONCE(vmx->nested.nested_run_pending);
+
+ if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
+ /*
+ * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
+ * the Enlightened VMCS after migration, and we still need to
+ * do that when something is forcing an L2->L1 exit prior to
+ * the first L2 run.
+ */
+ (void)nested_get_evmcs_page(vcpu);
+ }
+
+ /* Service pending TLB flush requests for L2 before switching to L1. */
+ kvm_service_local_tlb_flush_requests(vcpu);
+
+ /*
+ * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
+ * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are
+ * up-to-date before switching to L1.
+ */
+ if (enable_ept && is_pae_paging(vcpu))
+ vmx_ept_load_pdptrs(vcpu);
+
+ leave_guest_mode(vcpu);
+
+ if (nested_cpu_has_preemption_timer(vmcs12))
+ hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) {
+ vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset;
+ if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
+ vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
+ }
+
+ if (likely(!vmx->fail)) {
+ sync_vmcs02_to_vmcs12(vcpu, vmcs12);
+
+ if (vm_exit_reason != -1)
+ prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
+ exit_intr_info, exit_qualification);
+
+ /*
+ * Must happen outside of sync_vmcs02_to_vmcs12() as it will
+ * also be used to capture vmcs12 cache as part of
+ * capturing nVMX state for snapshot (migration).
+ *
+ * Otherwise, this flush will dirty guest memory at a
+ * point it is already assumed by user-space to be
+ * immutable.
+ */
+ nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
+ } else {
+ /*
+ * The only expected VM-instruction error is "VM entry with
+ * invalid control field(s)." Anything else indicates a
+ * problem with L0. And we should never get here with a
+ * VMFail of any type if early consistency checks are enabled.
+ */
+ WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
+ VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ WARN_ON_ONCE(nested_early_check);
+ }
+
+ /*
+ * Drop events/exceptions that were queued for re-injection to L2
+ * (picked up via vmx_complete_interrupts()), as well as exceptions
+ * that were pending for L2. Note, this must NOT be hoisted above
+ * prepare_vmcs12(), events/exceptions queued for re-injection need to
+ * be captured in vmcs12 (see vmcs12_save_pending_event()).
+ */
+ vcpu->arch.nmi_injected = false;
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
+
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+
+ /*
+ * If IBRS is advertised to the vCPU, KVM must flush the indirect
+ * branch predictors when transitioning from L2 to L1, as L1 expects
+ * hardware (KVM in this case) to provide separate predictor modes.
+ * Bare metal isolates VMX root (host) from VMX non-root (guest), but
+ * doesn't isolate different VMCSs, i.e. in this case, doesn't provide
+ * separate modes for L2 vs L1.
+ */
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ indirect_branch_prediction_barrier();
+
+ /* Update any VMCS fields that might have changed while L2 ran */
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+ vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+ if (kvm_caps.has_tsc_control)
+ vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
+
+ if (vmx->nested.l1_tpr_threshold != -1)
+ vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
+
+ if (vmx->nested.change_vmcs01_virtual_apic_mode) {
+ vmx->nested.change_vmcs01_virtual_apic_mode = false;
+ vmx_set_virtual_apic_mode(vcpu);
+ }
+
+ if (vmx->nested.update_vmcs01_cpu_dirty_logging) {
+ vmx->nested.update_vmcs01_cpu_dirty_logging = false;
+ vmx_update_cpu_dirty_logging(vcpu);
+ }
+
+ /* Unpin physical memory we referred to in vmcs02 */
+ kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false);
+ kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
+ kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
+ vmx->nested.pi_desc = NULL;
+
+ if (vmx->nested.reload_vmcs01_apic_access_page) {
+ vmx->nested.reload_vmcs01_apic_access_page = false;
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+ }
+
+ if (vmx->nested.update_vmcs01_apicv_status) {
+ vmx->nested.update_vmcs01_apicv_status = false;
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ }
+
+ if ((vm_exit_reason != -1) &&
+ (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)))
+ vmx->nested.need_vmcs12_to_shadow_sync = true;
+
+ /* in case we halted in L2 */
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+ if (likely(!vmx->fail)) {
+ if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+ nested_exit_intr_ack_set(vcpu)) {
+ int irq = kvm_cpu_get_interrupt(vcpu);
+ WARN_ON(irq < 0);
+ vmcs12->vm_exit_intr_info = irq |
+ INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
+ }
+
+ if (vm_exit_reason != -1)
+ trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
+ vmcs12->exit_qualification,
+ vmcs12->idt_vectoring_info_field,
+ vmcs12->vm_exit_intr_info,
+ vmcs12->vm_exit_intr_error_code,
+ KVM_ISA_VMX);
+
+ load_vmcs12_host_state(vcpu, vmcs12);
+
+ return;
+ }
+
+ /*
+ * After an early L2 VM-entry failure, we're now back
+ * in L1 which thinks it just finished a VMLAUNCH or
+ * VMRESUME instruction, so we need to set the failure
+ * flag and the VM-instruction error field of the VMCS
+ * accordingly, and skip the emulated instruction.
+ */
+ (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+
+ /*
+ * Restore L1's host state to KVM's software model. We're here
+ * because a consistency check was caught by hardware, which
+ * means some amount of guest state has been propagated to KVM's
+ * model and needs to be unwound to the host's state.
+ */
+ nested_vmx_restore_host_state(vcpu);
+
+ vmx->fail = 0;
+}
+
+static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
+{
+ kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
+}
+
+/*
+ * Decode the memory-address operand of a vmx instruction, as recorded on an
+ * exit caused by such an instruction (run by a guest hypervisor).
+ * On success, returns 0. When the operand is invalid, returns 1 and throws
+ * #UD, #GP, or #SS.
+ */
+int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
+ u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
+{
+ gva_t off;
+ bool exn;
+ struct kvm_segment s;
+
+ /*
+ * According to Vol. 3B, "Information for VM Exits Due to Instruction
+ * Execution", on an exit, vmx_instruction_info holds most of the
+ * addressing components of the operand. Only the displacement part
+ * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
+ * For how an actual address is calculated from all these components,
+ * refer to Vol. 1, "Operand Addressing".
+ */
+ int scaling = vmx_instruction_info & 3;
+ int addr_size = (vmx_instruction_info >> 7) & 7;
+ bool is_reg = vmx_instruction_info & (1u << 10);
+ int seg_reg = (vmx_instruction_info >> 15) & 7;
+ int index_reg = (vmx_instruction_info >> 18) & 0xf;
+ bool index_is_valid = !(vmx_instruction_info & (1u << 22));
+ int base_reg = (vmx_instruction_info >> 23) & 0xf;
+ bool base_is_valid = !(vmx_instruction_info & (1u << 27));
+
+ if (is_reg) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ /* Addr = segment_base + offset */
+ /* offset = base + [index * scale] + displacement */
+ off = exit_qualification; /* holds the displacement */
+ if (addr_size == 1)
+ off = (gva_t)sign_extend64(off, 31);
+ else if (addr_size == 0)
+ off = (gva_t)sign_extend64(off, 15);
+ if (base_is_valid)
+ off += kvm_register_read(vcpu, base_reg);
+ if (index_is_valid)
+ off += kvm_register_read(vcpu, index_reg) << scaling;
+ vmx_get_segment(vcpu, &s, seg_reg);
+
+ /*
+ * The effective address, i.e. @off, of a memory operand is truncated
+ * based on the address size of the instruction. Note that this is
+ * the *effective address*, i.e. the address prior to accounting for
+ * the segment's base.
+ */
+ if (addr_size == 1) /* 32 bit */
+ off &= 0xffffffff;
+ else if (addr_size == 0) /* 16 bit */
+ off &= 0xffff;
+
+ /* Checks for #GP/#SS exceptions. */
+ exn = false;
+ if (is_long_mode(vcpu)) {
+ /*
+ * The virtual/linear address is never truncated in 64-bit
+ * mode, e.g. a 32-bit address size can yield a 64-bit virtual
+ * address when using FS/GS with a non-zero base.
+ */
+ if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
+ *ret = s.base + off;
+ else
+ *ret = off;
+
+ /* Long mode: #GP(0)/#SS(0) if the memory address is in a
+ * non-canonical form. This is the only check on the memory
+ * destination for long mode!
+ */
+ exn = is_noncanonical_address(*ret, vcpu);
+ } else {
+ /*
+ * When not in long mode, the virtual/linear address is
+ * unconditionally truncated to 32 bits regardless of the
+ * address size.
+ */
+ *ret = (s.base + off) & 0xffffffff;
+
+ /* Protected mode: apply checks for segment validity in the
+ * following order:
+ * - segment type check (#GP(0) may be thrown)
+ * - usability check (#GP(0)/#SS(0))
+ * - limit check (#GP(0)/#SS(0))
+ */
+ if (wr)
+ /* #GP(0) if the destination operand is located in a
+ * read-only data segment or any code segment.
+ */
+ exn = ((s.type & 0xa) == 0 || (s.type & 8));
+ else
+ /* #GP(0) if the source operand is located in an
+ * execute-only code segment
+ */
+ exn = ((s.type & 0xa) == 8);
+ if (exn) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return 1;
+ }
+ /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. */
+ exn = (s.unusable != 0);
+
+ /*
+ * Protected mode: #GP(0)/#SS(0) if the memory operand is
+ * outside the segment limit. All CPUs that support VMX ignore
+ * limit checks for flat segments, i.e. segments with base==0,
+ * limit==0xffffffff and of type expand-up data or code.
+ */
+ if (!(s.base == 0 && s.limit == 0xffffffff &&
+ ((s.type & 8) || !(s.type & 4))))
+ exn = exn || ((u64)off + len - 1 > s.limit);
+ }
+ if (exn) {
+ kvm_queue_exception_e(vcpu,
+ seg_reg == VCPU_SREG_SS ?
+ SS_VECTOR : GP_VECTOR,
+ 0);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,
+ int *ret)
+{
+ gva_t gva;
+ struct x86_exception e;
+ int r;
+
+ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
+ vmcs_read32(VMX_INSTRUCTION_INFO), false,
+ sizeof(*vmpointer), &gva)) {
+ *ret = 1;
+ return -EINVAL;
+ }
+
+ r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e);
+ if (r != X86EMUL_CONTINUE) {
+ *ret = kvm_handle_memory_failure(vcpu, r, &e);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Allocate a shadow VMCS and associate it with the currently loaded
+ * VMCS, unless such a shadow VMCS already exists. The newly allocated
+ * VMCS is also VMCLEARed, so that it is ready for use.
+ */
+static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
+
+ /*
+ * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it
+ * when L1 executes VMXOFF or the vCPU is forced out of nested
+ * operation. VMXON faults if the CPU is already post-VMXON, so it
+ * should be impossible to already have an allocated shadow VMCS. KVM
+ * doesn't support virtualization of VMCS shadowing, so vmcs01 should
+ * always be the loaded VMCS.
+ */
+ if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs))
+ return loaded_vmcs->shadow_vmcs;
+
+ loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
+ if (loaded_vmcs->shadow_vmcs)
+ vmcs_clear(loaded_vmcs->shadow_vmcs);
+
+ return loaded_vmcs->shadow_vmcs;
+}
+
+static int enter_vmx_operation(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int r;
+
+ r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
+ if (r < 0)
+ goto out_vmcs02;
+
+ vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
+ if (!vmx->nested.cached_vmcs12)
+ goto out_cached_vmcs12;
+
+ vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA;
+ vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
+ if (!vmx->nested.cached_shadow_vmcs12)
+ goto out_cached_shadow_vmcs12;
+
+ if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
+ goto out_shadow_vmcs;
+
+ hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED);
+ vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
+
+ vmx->nested.vpid02 = allocate_vpid();
+
+ vmx->nested.vmcs02_initialized = false;
+ vmx->nested.vmxon = true;
+
+ if (vmx_pt_mode_is_host_guest()) {
+ vmx->pt_desc.guest.ctl = 0;
+ pt_update_intercept_for_msr(vcpu);
+ }
+
+ return 0;
+
+out_shadow_vmcs:
+ kfree(vmx->nested.cached_shadow_vmcs12);
+
+out_cached_shadow_vmcs12:
+ kfree(vmx->nested.cached_vmcs12);
+
+out_cached_vmcs12:
+ free_loaded_vmcs(&vmx->nested.vmcs02);
+
+out_vmcs02:
+ return -ENOMEM;
+}
+
+/* Emulate the VMXON instruction. */
+static int handle_vmxon(struct kvm_vcpu *vcpu)
+{
+ int ret;
+ gpa_t vmptr;
+ u32 revision;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED
+ | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
+
+ /*
+ * Manually perform the CR4.VMXE check; KVM must force CR4.VMXE=1 to enter
+ * the guest and so cannot rely on hardware to perform the check,
+ * which has higher priority than VM-Exit (see Intel SDM's pseudocode
+ * for VMXON).
+ *
+ * Rely on hardware for the other pre-VM-Exit checks: CR0.PE=1, !VM86
+ * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't
+ * force any of the relevant guest state. For a restricted guest, KVM
+ * does force CR0.PE=1, but only to also force VM86 in order to emulate
+ * Real Mode, and so there's no need to check CR0.PE manually.
+ */
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ /*
+ * The CPL is checked for "not in VMX operation" and for "in VMX root",
+ * and has higher priority than the VM-Fail due to being post-VMXON,
+ * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root,
+ * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits
+ * from L2 to L1, i.e. there's no need to check for the vCPU being in
+ * VMX non-root.
+ *
+ * Forwarding the VM-Exit unconditionally, i.e. without performing the
+ * #UD checks (see above), is functionally ok because KVM doesn't allow
+ * L1 to run L2 without CR4.VMXE=1, and because KVM never modifies L2's
+ * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are
+ * missed by hardware due to shadowing CR0 and/or CR4.
+ */
+ if (vmx_get_cpl(vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ if (vmx->nested.vmxon)
+ return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
+
+ /*
+ * Invalid CR0/CR4 generates #GP. These checks are performed if and
+ * only if the vCPU isn't already in VMX operation, i.e. effectively
+ * have lower priority than the VM-Fail above.
+ */
+ if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
+ !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
+ != VMXON_NEEDED_FEATURES) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret))
+ return ret;
+
+ /*
+ * SDM 3: 24.11.5
+ * The first 4 bytes of the VMXON region contain the supported
+ * VMCS revision identifier.
+ *
+ * Note: IA32_VMX_BASIC[48] will never be 1 for the nested case,
+ * as that would limit the physical address width to 32 bits.
+ */
+ if (!page_address_valid(vcpu, vmptr))
+ return nested_vmx_failInvalid(vcpu);
+
+ if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
+ revision != VMCS12_REVISION)
+ return nested_vmx_failInvalid(vcpu);
+
+ vmx->nested.vmxon_ptr = vmptr;
+ ret = enter_vmx_operation(vcpu);
+ if (ret)
+ return ret;
+
+ return nested_vmx_succeed(vcpu);
+}
+
+static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (vmx->nested.current_vmptr == INVALID_GPA)
+ return;
+
+ copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
+
+ if (enable_shadow_vmcs) {
+ /*
+ * Copy to memory all shadowed fields in case they were modified.
+ */
+ copy_shadow_to_vmcs12(vmx);
+ vmx_disable_shadow_vmcs(vmx);
+ }
+ vmx->nested.posted_intr_nv = -1;
+
+ /* Flush VMCS12 to guest memory */
+ kvm_vcpu_write_guest_page(vcpu,
+ vmx->nested.current_vmptr >> PAGE_SHIFT,
+ vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
+
+ kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+
+ vmx->nested.current_vmptr = INVALID_GPA;
+}
+
+/* Emulate the VMXOFF instruction */
+static int handle_vmxoff(struct kvm_vcpu *vcpu)
+{
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ free_nested(vcpu);
+
+ if (kvm_apic_has_pending_init_or_sipi(vcpu))
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ return nested_vmx_succeed(vcpu);
+}
+
+/* Emulate the VMCLEAR instruction */
+static int handle_vmclear(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 zero = 0;
+ gpa_t vmptr;
+ int r;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
+ return r;
+
+ if (!page_address_valid(vcpu, vmptr))
+ return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
+
+ if (vmptr == vmx->nested.vmxon_ptr)
+ return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
+
+ /*
+ * When Enlightened VMEntry is enabled on the calling CPU we treat the
+ * memory area pointed to by vmptr as an Enlightened VMCS (as there's no good
+ * way to distinguish it from VMCS12) and we must not corrupt it by
+ * writing to the non-existent 'launch_state' field. The area doesn't
+ * have to be the currently active EVMCS on the calling CPU and there's
+ * nothing KVM has to do to transition it from 'active' to 'non-active'
+ * state. It is possible that the area will stay mapped as
+ * vmx->nested.hv_evmcs but this shouldn't be a problem.
+ */
+ if (likely(!guest_cpuid_has_evmcs(vcpu) ||
+ !evmptr_is_valid(nested_get_evmptr(vcpu)))) {
+ if (vmptr == vmx->nested.current_vmptr)
+ nested_release_vmcs12(vcpu);
+
+ /*
+ * Silently ignore memory errors on VMCLEAR; Intel's pseudocode
+ * for VMCLEAR includes an "ensure that data for VMCS referenced
+ * by the operand is in memory" clause that guards writes to
+ * memory, i.e. doing nothing for I/O is architecturally valid.
+ *
+ * FIXME: Suppress failures if and only if no memslot is found,
+ * i.e. exit to userspace if __copy_to_user() fails.
+ */
+ (void)kvm_vcpu_write_guest(vcpu,
+ vmptr + offsetof(struct vmcs12,
+ launch_state),
+ &zero, sizeof(zero));
+ } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) {
+ nested_release_evmcs(vcpu);
+ }
+
+ return nested_vmx_succeed(vcpu);
+}
+
+/* Emulate the VMLAUNCH instruction */
+static int handle_vmlaunch(struct kvm_vcpu *vcpu)
+{
+ return nested_vmx_run(vcpu, true);
+}
+
+/* Emulate the VMRESUME instruction */
+static int handle_vmresume(struct kvm_vcpu *vcpu)
+{
+ return nested_vmx_run(vcpu, false);
+}
+
+static int handle_vmread(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
+ : get_vmcs12(vcpu);
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
+ u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct x86_exception e;
+ unsigned long field;
+ u64 value;
+ gva_t gva = 0;
+ short offset;
+ int len, r;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ /* Decode instruction info and find the field to read */
+ field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
+
+ if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
+ /*
+ * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
+ * any VMREAD sets the ALU flags for VMfailInvalid.
+ */
+ if (vmx->nested.current_vmptr == INVALID_GPA ||
+ (is_guest_mode(vcpu) &&
+ get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
+ return nested_vmx_failInvalid(vcpu);
+
+ offset = get_vmcs12_field_offset(field);
+ if (offset < 0)
+ return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+
+ if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
+ copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+
+ /* Read the field, zero-extended to a u64 value */
+ value = vmcs12_read_any(vmcs12, field, offset);
+ } else {
+ /*
+ * Hyper-V TLFS (as of 6.0b) explicitly states that, while an
+ * enlightened VMCS is active, VMREAD/VMWRITE instructions are
+ * unsupported. Unfortunately, certain versions of Windows 11
+ * don't comply with this requirement which is not enforced in
+ * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a
+ * workaround, as misbehaving guests will panic on VM-Fail.
+ * Note, enlightened VMCS is incompatible with shadow VMCS so
+ * all VMREADs from L2 should go to L1.
+ */
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+ return nested_vmx_failInvalid(vcpu);
+
+ offset = evmcs_field_offset(field, NULL);
+ if (offset < 0)
+ return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+
+ /* Read the field, zero-extended to a u64 value */
+ value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset);
+ }
+
+ /*
+ * Now copy part of this value to register or memory, as requested.
+ * Note that the number of bits actually copied is 32 or 64 depending
+ * on the guest's mode (32 or 64 bit), not on the given field's length.
+ */
+ if (instr_info & BIT(10)) {
+ kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value);
+ } else {
+ len = is_64_bit_mode(vcpu) ? 8 : 4;
+ if (get_vmx_mem_address(vcpu, exit_qualification,
+ instr_info, true, len, &gva))
+ return 1;
+ /* _system ok, nested_vmx_check_permission has verified cpl=0 */
+ r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e);
+ if (r != X86EMUL_CONTINUE)
+ return kvm_handle_memory_failure(vcpu, r, &e);
+ }
+
+ return nested_vmx_succeed(vcpu);
+}
+
+static bool is_shadow_field_rw(unsigned long field)
+{
+ switch (field) {
+#define SHADOW_FIELD_RW(x, y) case x:
+#include "vmcs_shadow_fields.h"
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+static bool is_shadow_field_ro(unsigned long field)
+{
+ switch (field) {
+#define SHADOW_FIELD_RO(x, y) case x:
+#include "vmcs_shadow_fields.h"
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+static int handle_vmwrite(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
+ : get_vmcs12(vcpu);
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
+ u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct x86_exception e;
+ unsigned long field;
+ short offset;
+ gva_t gva;
+ int len, r;
+
+ /*
+ * The value to write might be 32 or 64 bits, depending on L1's long
+ * mode, and eventually we need to write that into a field of several
+ * possible lengths. The code below first zero-extends the value to 64
+ * bit (value), and then copies only the appropriate number of
+ * bits into the vmcs12 field.
+ */
+ u64 value = 0;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ /*
+ * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
+ * any VMWRITE sets the ALU flags for VMfailInvalid.
+ */
+ if (vmx->nested.current_vmptr == INVALID_GPA ||
+ (is_guest_mode(vcpu) &&
+ get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
+ return nested_vmx_failInvalid(vcpu);
+
+ if (instr_info & BIT(10))
+ value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf));
+ else {
+ len = is_64_bit_mode(vcpu) ? 8 : 4;
+ if (get_vmx_mem_address(vcpu, exit_qualification,
+ instr_info, false, len, &gva))
+ return 1;
+ r = kvm_read_guest_virt(vcpu, gva, &value, len, &e);
+ if (r != X86EMUL_CONTINUE)
+ return kvm_handle_memory_failure(vcpu, r, &e);
+ }
+
+ field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
+
+ offset = get_vmcs12_field_offset(field);
+ if (offset < 0)
+ return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+
+ /*
+ * If the vCPU supports "VMWRITE to any supported field in the
+ * VMCS," then the "read-only" fields are actually read/write.
+ */
+ if (vmcs_field_readonly(field) &&
+ !nested_cpu_has_vmwrite_any_field(vcpu))
+ return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
+
+ /*
+ * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
+ * vmcs12, else we may clobber a field or consume a stale value.
+ */
+ if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field))
+ copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+
+ /*
+ * Some Intel CPUs intentionally drop the reserved bits of the AR byte
+ * fields on VMWRITE. Emulate this behavior to ensure consistent KVM
+ * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
+ * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
+ * from L1 will return a different value than VMREAD from L2 (L1 sees
+ * the stripped down value, L2 sees the full value as stored by KVM).
+ */
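+ /*
+ * A sketch of the mask: 0x1f0ff keeps AR bits 7:0 (segment type, S,
+ * DPL and P) plus bits 16:12 (AVL, L, D/B, G and unusable), dropping
+ * only the reserved bits 11:8.
+ */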
+ if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
+ value &= 0x1f0ff;
+
+ vmcs12_write_any(vmcs12, field, offset, value);
+
+ /*
+ * Do not track vmcs12 dirty-state if in guest-mode as we actually
+ * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated
+ * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
+ * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
+ */
+ if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
+ /*
+ * L1 can read these fields without exiting, ensure the
+ * shadow VMCS is up-to-date.
+ */
+ if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
+ preempt_disable();
+ vmcs_load(vmx->vmcs01.shadow_vmcs);
+
+ __vmcs_writel(field, value);
+
+ vmcs_clear(vmx->vmcs01.shadow_vmcs);
+ vmcs_load(vmx->loaded_vmcs->vmcs);
+ preempt_enable();
+ }
+ vmx->nested.dirty_vmcs12 = true;
+ }
+
+ return nested_vmx_succeed(vcpu);
+}
+
+static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
+{
+ vmx->nested.current_vmptr = vmptr;
+ if (enable_shadow_vmcs) {
+ secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
+ vmcs_write64(VMCS_LINK_POINTER,
+ __pa(vmx->vmcs01.shadow_vmcs));
+ vmx->nested.need_vmcs12_to_shadow_sync = true;
+ }
+ vmx->nested.dirty_vmcs12 = true;
+ vmx->nested.force_msr_bitmap_recalc = true;
+}
+
+/* Emulate the VMPTRLD instruction */
+static int handle_vmptrld(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ gpa_t vmptr;
+ int r;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
+ return r;
+
+ if (!page_address_valid(vcpu, vmptr))
+ return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
+
+ if (vmptr == vmx->nested.vmxon_ptr)
+ return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
+
+ /* Forbid normal VMPTRLD if Enlightened version was used */
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
+ return 1;
+
+ if (vmx->nested.current_vmptr != vmptr) {
+ struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache;
+ struct vmcs_hdr hdr;
+
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
+ /*
+ * Reads from an unbacked page return all 1s,
+ * which means that the 32 bits located at the
+ * given physical address won't match the required
+ * VMCS12_REVISION identifier.
+ */
+ return nested_vmx_fail(vcpu,
+ VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
+ }
+
+ if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
+ offsetof(struct vmcs12, hdr),
+ sizeof(hdr))) {
+ return nested_vmx_fail(vcpu,
+ VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
+ }
+
+ if (hdr.revision_id != VMCS12_REVISION ||
+ (hdr.shadow_vmcs &&
+ !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
+ return nested_vmx_fail(vcpu,
+ VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
+ }
+
+ nested_release_vmcs12(vcpu);
+
+ /*
+ * Load VMCS12 from guest memory since it is not already
+ * cached.
+ */
+ if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12,
+ VMCS12_SIZE)) {
+ return nested_vmx_fail(vcpu,
+ VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
+ }
+
+ set_current_vmptr(vmx, vmptr);
+ }
+
+ return nested_vmx_succeed(vcpu);
+}
+
+/* Emulate the VMPTRST instruction */
+static int handle_vmptrst(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qual = vmx_get_exit_qual(vcpu);
+ u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
+ struct x86_exception e;
+ gva_t gva;
+ int r;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr)))
+ return 1;
+
+ if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
+ true, sizeof(gpa_t), &gva))
+ return 1;
+ /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
+ r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
+ sizeof(gpa_t), &e);
+ if (r != X86EMUL_CONTINUE)
+ return kvm_handle_memory_failure(vcpu, r, &e);
+
+ return nested_vmx_succeed(vcpu);
+}
+
+/* Emulate the INVEPT instruction */
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 vmx_instruction_info, types;
+ unsigned long type, roots_to_free;
+ struct kvm_mmu *mmu;
+ gva_t gva;
+ struct x86_exception e;
+ struct {
+ u64 eptp, gpa;
+ } operand;
+ int i, r, gpr_index;
+
+ if (!(vmx->nested.msrs.secondary_ctls_high &
+ SECONDARY_EXEC_ENABLE_EPT) ||
+ !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
+ type = kvm_register_read(vcpu, gpr_index);
+
+ types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
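+ /*
+ * After the shift, bit N advertises INVEPT type N; the & 6 keeps only
+ * the single-context (1) and global (2) types handled below, so any
+ * other requested type fails the supported-types check.
+ */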
+
+ if (type >= 32 || !(types & (1 << type)))
+ return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+
+ /* According to the Intel VMX instruction reference, the memory
+ * operand is read even if it isn't needed (e.g., for type==global)
+ */
+ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
+ vmx_instruction_info, false, sizeof(operand), &gva))
+ return 1;
+ r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
+ if (r != X86EMUL_CONTINUE)
+ return kvm_handle_memory_failure(vcpu, r, &e);
+
+ /*
+ * Nested EPT roots are always held through guest_mmu,
+ * not root_mmu.
+ */
+ mmu = &vcpu->arch.guest_mmu;
+
+ switch (type) {
+ case VMX_EPT_EXTENT_CONTEXT:
+ if (!nested_vmx_check_eptp(vcpu, operand.eptp))
+ return nested_vmx_fail(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+
+ roots_to_free = 0;
+ if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd,
+ operand.eptp))
+ roots_to_free |= KVM_MMU_ROOT_CURRENT;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
+ mmu->prev_roots[i].pgd,
+ operand.eptp))
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+ }
+ break;
+ case VMX_EPT_EXTENT_GLOBAL:
+ roots_to_free = KVM_MMU_ROOTS_ALL;
+ break;
+ default:
+ BUG();
+ break;
+ }
+
+ if (roots_to_free)
+ kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
+
+ return nested_vmx_succeed(vcpu);
+}
+
+static int handle_invvpid(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 vmx_instruction_info;
+ unsigned long type, types;
+ gva_t gva;
+ struct x86_exception e;
+ struct {
+ u64 vpid;
+ u64 gla;
+ } operand;
+ u16 vpid02;
+ int r, gpr_index;
+
+ if (!(vmx->nested.msrs.secondary_ctls_high &
+ SECONDARY_EXEC_ENABLE_VPID) ||
+ !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
+ type = kvm_register_read(vcpu, gpr_index);
+
+ types = (vmx->nested.msrs.vpid_caps &
+ VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
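+ /*
+ * The >> 8 lines the VMX_VPID_EXTENT_* support bits up with the
+ * INVVPID type numbers used below (0 = individual address,
+ * 1 = single-context, 2 = all-context, 3 = single-context retaining
+ * globals), assuming the usual bits 8..11 placement of those
+ * capability bits.
+ */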
+
+ if (type >= 32 || !(types & (1 << type)))
+ return nested_vmx_fail(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+
+ /* According to the Intel VMX instruction reference, the memory
+ * operand is read even if it isn't needed (e.g., for type==global)
+ */
+ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
+ vmx_instruction_info, false, sizeof(operand), &gva))
+ return 1;
+ r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
+ if (r != X86EMUL_CONTINUE)
+ return kvm_handle_memory_failure(vcpu, r, &e);
+
+ if (operand.vpid >> 16)
+ return nested_vmx_fail(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+
+ vpid02 = nested_get_vpid02(vcpu);
+ switch (type) {
+ case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
+ if (!operand.vpid ||
+ is_noncanonical_address(operand.gla, vcpu))
+ return nested_vmx_fail(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ vpid_sync_vcpu_addr(vpid02, operand.gla);
+ break;
+ case VMX_VPID_EXTENT_SINGLE_CONTEXT:
+ case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
+ if (!operand.vpid)
+ return nested_vmx_fail(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ vpid_sync_context(vpid02);
+ break;
+ case VMX_VPID_EXTENT_ALL_CONTEXT:
+ vpid_sync_context(vpid02);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ /*
+ * Sync the shadow page tables if EPT is disabled, L1 is invalidating
+ * linear mappings for L2 (tagged with L2's VPID). Free all guest
+ * roots as VPIDs are not tracked in the MMU role.
+ *
+ * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share
+ * an MMU when EPT is disabled.
+ *
+ * TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR.
+ */
+ if (!enable_ept)
+ kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu);
+
+ return nested_vmx_succeed(vcpu);
+}
+
+static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ u32 index = kvm_rcx_read(vcpu);
+ u64 new_eptp;
+
+ if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))
+ return 1;
+ if (index >= VMFUNC_EPTP_ENTRIES)
+ return 1;
+
+ if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
+ &new_eptp, index * 8, 8))
+ return 1;
+
+ /*
+ * If the (L2) guest does a vmfunc to the currently
+ * active EPT pointer, we don't have to do anything else.
+ */
+ if (vmcs12->ept_pointer != new_eptp) {
+ if (!nested_vmx_check_eptp(vcpu, new_eptp))
+ return 1;
+
+ vmcs12->ept_pointer = new_eptp;
+ nested_ept_new_eptp(vcpu);
+
+ if (!nested_cpu_has_vpid(vmcs12))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ }
+
+ return 0;
+}
+
+static int handle_vmfunc(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12;
+ u32 function = kvm_rax_read(vcpu);
+
+ /*
+ * VMFUNC should never execute cleanly while L1 is active; KVM supports
+ * VMFUNC for nested VMs, but not for L1.
+ */
+ if (WARN_ON_ONCE(!is_guest_mode(vcpu))) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ vmcs12 = get_vmcs12(vcpu);
+
+ /*
+ * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC
+ * is enabled in vmcs02 if and only if it's enabled in vmcs12.
+ */
+ if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (!(vmcs12->vm_function_control & BIT_ULL(function)))
+ goto fail;
+
+ switch (function) {
+ case 0:
+ if (nested_vmx_eptp_switching(vcpu, vmcs12))
+ goto fail;
+ break;
+ default:
+ goto fail;
+ }
+ return kvm_skip_emulated_instruction(vcpu);
+
+fail:
+ /*
+ * This is effectively a reflected VM-Exit, as opposed to a synthesized
+ * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode
+ * EXIT_REASON_VMFUNC as the exit reason.
+ */
+ nested_vmx_vmexit(vcpu, vmx->exit_reason.full,
+ vmx_get_intr_info(vcpu),
+ vmx_get_exit_qual(vcpu));
+ return 1;
+}
+
+/*
+ * Return true if an IO instruction with the specified port and size should cause
+ * a VM-exit into L1.
+ */
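+/*
+ * As a worked example (numbers purely illustrative): a 2-byte access to
+ * port 0x3f8 tests bit 0 of byte 0x3f8 / 8 = 0x7f in io_bitmap_a, then
+ * bit 1 of the same byte for port 0x3f9.
+ */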
+bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
+ int size)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ gpa_t bitmap, last_bitmap;
+ u8 b;
+
+ last_bitmap = INVALID_GPA;
+ b = -1;
+
+ while (size > 0) {
+ if (port < 0x8000)
+ bitmap = vmcs12->io_bitmap_a;
+ else if (port < 0x10000)
+ bitmap = vmcs12->io_bitmap_b;
+ else
+ return true;
+ bitmap += (port & 0x7fff) / 8;
+
+ if (last_bitmap != bitmap)
+ if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
+ return true;
+ if (b & (1 << (port & 7)))
+ return true;
+
+ port++;
+ size--;
+ last_bitmap = bitmap;
+ }
+
+ return false;
+}
+
+static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ unsigned long exit_qualification;
+ unsigned short port;
+ int size;
+
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
+
+ exit_qualification = vmx_get_exit_qual(vcpu);
+
+ port = exit_qualification >> 16;
+ size = (exit_qualification & 7) + 1;
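+ /*
+ * e.g. (illustrative) exit_qualification 0x03f80001 decodes to
+ * port 0x3f8 and size (1 + 1) = 2 bytes.
+ */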
+
+ return nested_vmx_check_io_bitmaps(vcpu, port, size);
+}
+
+/*
+ * Return true if we should exit from L2 to L1 to handle an MSR access,
+ * rather than handle it ourselves in L0. I.e., check whether L1 expressed
+ * disinterest in the current event (read or write a specific MSR) by using an
+ * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
+ */
+static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12,
+ union vmx_exit_reason exit_reason)
+{
+ u32 msr_index = kvm_rcx_read(vcpu);
+ gpa_t bitmap;
+
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+ return true;
+
+ /*
+ * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
+ * for the four combinations of read/write and low/high MSR numbers.
+ * First we need to figure out which of the four to use:
+ */
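+ /*
+ * A worked example, purely for illustration: a WRMSR to MSR 0xc0000082
+ * (MSR_LSTAR) lands in the high-write bitmap at offset 2048 + 1024 =
+ * 3072, and the bit tested below is bit 2 of byte 0x10 within it
+ * (msr_index becomes 0x82 after the subtraction).
+ */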
+ bitmap = vmcs12->msr_bitmap;
+ if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
+ bitmap += 2048;
+ if (msr_index >= 0xc0000000) {
+ msr_index -= 0xc0000000;
+ bitmap += 1024;
+ }
+
+ /* Then read the msr_index'th bit from this bitmap: */
+ if (msr_index < 1024*8) {
+ unsigned char b;
+ if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
+ return true;
+ return 1 & (b >> (msr_index & 7));
+ } else
+ return true; /* let L1 handle the wrong parameter */
+}
+
+/*
+ * Return true if we should exit from L2 to L1 to handle a CR access exit,
+ * rather than handle it ourselves in L0. I.e., check if L1 wanted to
+ * intercept (via guest_host_mask etc.) the current event.
+ */
+static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
+ int cr = exit_qualification & 15;
+ int reg;
+ unsigned long val;
+
+ switch ((exit_qualification >> 4) & 3) {
+ case 0: /* mov to cr */
+ reg = (exit_qualification >> 8) & 15;
+ val = kvm_register_read(vcpu, reg);
+ switch (cr) {
+ case 0:
+ if (vmcs12->cr0_guest_host_mask &
+ (val ^ vmcs12->cr0_read_shadow))
+ return true;
+ break;
+ case 3:
+ if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
+ return true;
+ break;
+ case 4:
+ if (vmcs12->cr4_guest_host_mask &
+ (vmcs12->cr4_read_shadow ^ val))
+ return true;
+ break;
+ case 8:
+ if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
+ return true;
+ break;
+ }
+ break;
+ case 2: /* clts */
+ if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
+ (vmcs12->cr0_read_shadow & X86_CR0_TS))
+ return true;
+ break;
+ case 1: /* mov from cr */
+ switch (cr) {
+ case 3:
+ if (vmcs12->cpu_based_vm_exec_control &
+ CPU_BASED_CR3_STORE_EXITING)
+ return true;
+ break;
+ case 8:
+ if (vmcs12->cpu_based_vm_exec_control &
+ CPU_BASED_CR8_STORE_EXITING)
+ return true;
+ break;
+ }
+ break;
+ case 3: /* lmsw */
+ /*
+ * lmsw can change bits 1..3 of cr0, and only set bit 0 of
+ * cr0. Other attempted changes are ignored, with no exit.
+ */
+ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
+ if (vmcs12->cr0_guest_host_mask & 0xe &
+ (val ^ vmcs12->cr0_read_shadow))
+ return true;
+ if ((vmcs12->cr0_guest_host_mask & 0x1) &&
+ !(vmcs12->cr0_read_shadow & 0x1) &&
+ (val & 0x1))
+ return true;
+ break;
+ }
+ return false;
+}
+
+static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ u32 encls_leaf;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
+ !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
+ return false;
+
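+ /*
+ * Leaves above 62 all collapse onto bit 63, which (per our reading of
+ * the SDM) is the catch-all intercept for out-of-range ENCLS leaves.
+ */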
+ encls_leaf = kvm_rax_read(vcpu);
+ if (encls_leaf > 62)
+ encls_leaf = 63;
+ return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf);
+}
+
+static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12, gpa_t bitmap)
+{
+ u32 vmx_instruction_info;
+ unsigned long field;
+ u8 b;
+
+ if (!nested_cpu_has_shadow_vmcs(vmcs12))
+ return true;
+
+ /* Decode instruction info and find the field to access */
+ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+
+ /* Out-of-range fields always cause a VM exit from L2 to L1 */
+ if (field >> 15)
+ return true;
+
+ if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
+ return true;
+
+ return 1 & (b >> (field & 7));
+}
+
+static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
+{
+ u32 entry_intr_info = vmcs12->vm_entry_intr_info_field;
+
+ if (nested_cpu_has_mtf(vmcs12))
+ return true;
+
+ /*
+ * An MTF VM-exit may be injected into the guest by setting the
+ * interruption-type to 7 (other event) and the vector field to 0. Such
+ * is the case regardless of the 'monitor trap flag' VM-execution
+ * control.
+ */
+ return entry_intr_info == (INTR_INFO_VALID_MASK
+ | INTR_TYPE_OTHER_EVENT);
+}
+
+/*
+ * Return true if L0 wants to handle an exit from L2 regardless of whether or not
+ * L1 wants the exit. Only call this when in is_guest_mode (L2).
+ */
+static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
+ union vmx_exit_reason exit_reason)
+{
+ u32 intr_info;
+
+ switch ((u16)exit_reason.basic) {
+ case EXIT_REASON_EXCEPTION_NMI:
+ intr_info = vmx_get_intr_info(vcpu);
+ if (is_nmi(intr_info))
+ return true;
+ else if (is_page_fault(intr_info))
+ return vcpu->arch.apf.host_apf_flags ||
+ vmx_need_pf_intercept(vcpu);
+ else if (is_debug(intr_info) &&
+ vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+ return true;
+ else if (is_breakpoint(intr_info) &&
+ vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+ return true;
+ else if (is_alignment_check(intr_info) &&
+ !vmx_guest_inject_ac(vcpu))
+ return true;
+ return false;
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ return true;
+ case EXIT_REASON_MCE_DURING_VMENTRY:
+ return true;
+ case EXIT_REASON_EPT_VIOLATION:
+ /*
+ * L0 always deals with the EPT violation. If nested EPT is
+ * used, and the nested mmu code discovers that the address is
+ * missing in the guest EPT table (EPT12), the EPT violation
+ * will be injected with nested_ept_inject_page_fault()
+ */
+ return true;
+ case EXIT_REASON_EPT_MISCONFIG:
+ /*
+ * L2 never directly uses L1's EPT, but rather L0's own EPT
+ * table (shadow on EPT) or a merged EPT table that L0 built
+ * (EPT on EPT). So any problems with the structure of the
+ * table are L0's fault.
+ */
+ return true;
+ case EXIT_REASON_PREEMPTION_TIMER:
+ return true;
+ case EXIT_REASON_PML_FULL:
+ /*
+ * PML is emulated for an L1 VMM and should never be enabled in
+ * vmcs02, always "handle" PML_FULL by exiting to userspace.
+ */
+ return true;
+ case EXIT_REASON_VMFUNC:
+ /* VM functions are emulated through L2->L0 vmexits. */
+ return true;
+ case EXIT_REASON_BUS_LOCK:
+ /*
+ * At present, bus lock VM exit is never exposed to L1.
+ * Handle L2's bus locks in L0 directly.
+ */
+ return true;
+ case EXIT_REASON_VMCALL:
+ /* Hyper-V L2 TLB flush hypercall is handled by L0 */
+ return guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
+ nested_evmcs_l2_tlb_flush_enabled(vcpu) &&
+ kvm_hv_is_tlb_flush_hcall(vcpu);
+ default:
+ break;
+ }
+ return false;
+}
+
+/*
+ * Return true if L1 wants to intercept an exit from L2. Only call this when in
+ * is_guest_mode (L2).
+ */
+static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
+ union vmx_exit_reason exit_reason)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ u32 intr_info;
+
+ switch ((u16)exit_reason.basic) {
+ case EXIT_REASON_EXCEPTION_NMI:
+ intr_info = vmx_get_intr_info(vcpu);
+ if (is_nmi(intr_info))
+ return true;
+ else if (is_page_fault(intr_info))
+ return true;
+ return vmcs12->exception_bitmap &
+ (1u << (intr_info & INTR_INFO_VECTOR_MASK));
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ return nested_exit_on_intr(vcpu);
+ case EXIT_REASON_TRIPLE_FAULT:
+ return true;
+ case EXIT_REASON_INTERRUPT_WINDOW:
+ return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING);
+ case EXIT_REASON_NMI_WINDOW:
+ return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING);
+ case EXIT_REASON_TASK_SWITCH:
+ return true;
+ case EXIT_REASON_CPUID:
+ return true;
+ case EXIT_REASON_HLT:
+ return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
+ case EXIT_REASON_INVD:
+ return true;
+ case EXIT_REASON_INVLPG:
+ return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
+ case EXIT_REASON_RDPMC:
+ return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
+ case EXIT_REASON_RDRAND:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
+ case EXIT_REASON_RDSEED:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
+ case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
+ return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
+ case EXIT_REASON_VMREAD:
+ return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
+ vmcs12->vmread_bitmap);
+ case EXIT_REASON_VMWRITE:
+ return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
+ vmcs12->vmwrite_bitmap);
+ case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
+ case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
+ case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
+ case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
+ case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
+ /*
+ * VMX instructions trap unconditionally. This allows L1 to
+ * emulate them for its L2 guest, i.e., allows 3-level nesting!
+ */
+ return true;
+ case EXIT_REASON_CR_ACCESS:
+ return nested_vmx_exit_handled_cr(vcpu, vmcs12);
+ case EXIT_REASON_DR_ACCESS:
+ return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
+ case EXIT_REASON_IO_INSTRUCTION:
+ return nested_vmx_exit_handled_io(vcpu, vmcs12);
+ case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
+ case EXIT_REASON_MSR_READ:
+ case EXIT_REASON_MSR_WRITE:
+ return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
+ case EXIT_REASON_INVALID_STATE:
+ return true;
+ case EXIT_REASON_MWAIT_INSTRUCTION:
+ return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
+ case EXIT_REASON_MONITOR_TRAP_FLAG:
+ return nested_vmx_exit_handled_mtf(vmcs12);
+ case EXIT_REASON_MONITOR_INSTRUCTION:
+ return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
+ case EXIT_REASON_PAUSE_INSTRUCTION:
+ return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
+ nested_cpu_has2(vmcs12,
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING);
+ case EXIT_REASON_MCE_DURING_VMENTRY:
+ return true;
+ case EXIT_REASON_TPR_BELOW_THRESHOLD:
+ return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
+ case EXIT_REASON_APIC_ACCESS:
+ case EXIT_REASON_APIC_WRITE:
+ case EXIT_REASON_EOI_INDUCED:
+ /*
+ * The controls for "virtualize APIC accesses," "APIC-
+ * register virtualization," and "virtual-interrupt
+ * delivery" only come from vmcs12.
+ */
+ return true;
+ case EXIT_REASON_INVPCID:
+ return
+ nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
+ nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
+ case EXIT_REASON_WBINVD:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
+ case EXIT_REASON_XSETBV:
+ return true;
+ case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
+ /*
+ * This should never happen, since it is not possible to
+ * set XSS to a non-zero value in either L1 or L2.
+ * If it were, XSS would have to be checked against
+ * the XSS exit bitmap in vmcs12.
+ */
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES);
+ case EXIT_REASON_UMWAIT:
+ case EXIT_REASON_TPAUSE:
+ return nested_cpu_has2(vmcs12,
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
+ case EXIT_REASON_ENCLS:
+ return nested_vmx_exit_handled_encls(vcpu, vmcs12);
+ case EXIT_REASON_NOTIFY:
+ /* Notify VM exit is not exposed to L1 */
+ return false;
+ default:
+ return true;
+ }
+}
+
+/*
+ * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was
+ * reflected into L1.
+ */
+bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ union vmx_exit_reason exit_reason = vmx->exit_reason;
+ unsigned long exit_qual;
+ u32 exit_intr_info;
+
+ WARN_ON_ONCE(vmx->nested.nested_run_pending);
+
+ /*
+ * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
+ * has already loaded L2's state.
+ */
+ if (unlikely(vmx->fail)) {
+ trace_kvm_nested_vmenter_failed(
+ "hardware VM-instruction error: ",
+ vmcs_read32(VM_INSTRUCTION_ERROR));
+ exit_intr_info = 0;
+ exit_qual = 0;
+ goto reflect_vmexit;
+ }
+
+ trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX);
+
+ /* If L0 (KVM) wants the exit, it trumps L1's desires. */
+ if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
+ return false;
+
+ /* If L1 doesn't want the exit, handle it in L0. */
+ if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
+ return false;
+
+ /*
+ * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For
+ * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would
+ * need to be synthesized by querying the in-kernel LAPIC, but external
+ * interrupts are never reflected to L1 so it's a non-issue.
+ */
+ exit_intr_info = vmx_get_intr_info(vcpu);
+ if (is_exception_with_error_code(exit_intr_info)) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ vmcs12->vm_exit_intr_error_code =
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ }
+ exit_qual = vmx_get_exit_qual(vcpu);
+
+reflect_vmexit:
+ nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
+ return true;
+}
+
+static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ u32 user_data_size)
+{
+ struct vcpu_vmx *vmx;
+ struct vmcs12 *vmcs12;
+ struct kvm_nested_state kvm_state = {
+ .flags = 0,
+ .format = KVM_STATE_NESTED_FORMAT_VMX,
+ .size = sizeof(kvm_state),
+ .hdr.vmx.flags = 0,
+ .hdr.vmx.vmxon_pa = INVALID_GPA,
+ .hdr.vmx.vmcs12_pa = INVALID_GPA,
+ .hdr.vmx.preemption_timer_deadline = 0,
+ };
+ struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
+ &user_kvm_nested_state->data.vmx[0];
+
+ if (!vcpu)
+ return kvm_state.size + sizeof(*user_vmx_nested_state);
+
+ vmx = to_vmx(vcpu);
+ vmcs12 = get_vmcs12(vcpu);
+
+ if (guest_can_use(vcpu, X86_FEATURE_VMX) &&
+ (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
+ kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
+ kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;
+
+ if (vmx_has_valid_vmcs12(vcpu)) {
+ kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
+
+ /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
+ if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
+ kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
+
+ if (is_guest_mode(vcpu) &&
+ nested_cpu_has_shadow_vmcs(vmcs12) &&
+ vmcs12->vmcs_link_pointer != INVALID_GPA)
+ kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
+ }
+
+ if (vmx->nested.smm.vmxon)
+ kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
+
+ if (vmx->nested.smm.guest_mode)
+ kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
+
+ if (is_guest_mode(vcpu)) {
+ kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
+
+ if (vmx->nested.nested_run_pending)
+ kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
+
+ if (vmx->nested.mtf_pending)
+ kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;
+
+ if (nested_cpu_has_preemption_timer(vmcs12) &&
+ vmx->nested.has_preemption_timer_deadline) {
+ kvm_state.hdr.vmx.flags |=
+ KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE;
+ kvm_state.hdr.vmx.preemption_timer_deadline =
+ vmx->nested.preemption_timer_deadline;
+ }
+ }
+ }
+
+ if (user_data_size < kvm_state.size)
+ goto out;
+
+ if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
+ return -EFAULT;
+
+ if (!vmx_has_valid_vmcs12(vcpu))
+ goto out;
+
+ /*
+ * When running L2, the authoritative vmcs12 state is in the
+ * vmcs02. When running L1, the authoritative vmcs12 state is
+ * in the shadow or enlightened vmcs linked to vmcs01, unless
+ * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
+ * vmcs12 state is in the vmcs12 already.
+ */
+ if (is_guest_mode(vcpu)) {
+ sync_vmcs02_to_vmcs12(vcpu, vmcs12);
+ sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+ } else {
+ copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
+ if (!vmx->nested.need_vmcs12_to_shadow_sync) {
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
+ /*
+ * The L1 hypervisor is not obliged to keep the eVMCS
+ * clean-fields data up-to-date while not in guest
+ * mode; 'hv_clean_fields' is only guaranteed to be
+ * valid at vmentry, so ignore it here and do a full
+ * copy.
+ */
+ copy_enlightened_to_vmcs12(vmx, 0);
+ else if (enable_shadow_vmcs)
+ copy_shadow_to_vmcs12(vmx);
+ }
+ }
+
+ BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
+ BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);
+
+ /*
+ * Copy over the full allocated size of vmcs12 rather than just the size
+ * of the struct.
+ */
+ if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
+ return -EFAULT;
+
+ if (nested_cpu_has_shadow_vmcs(vmcs12) &&
+ vmcs12->vmcs_link_pointer != INVALID_GPA) {
+ if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
+ get_shadow_vmcs12(vcpu), VMCS12_SIZE))
+ return -EFAULT;
+ }
+out:
+ return kvm_state.size;
+}
+
+void vmx_leave_nested(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu)) {
+ to_vmx(vcpu)->nested.nested_run_pending = 0;
+ nested_vmx_vmexit(vcpu, -1, 0, 0);
+ }
+ free_nested(vcpu);
+}
+
+static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ struct kvm_nested_state *kvm_state)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12;
+ enum vm_entry_failure_code ignored;
+ struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
+ &user_kvm_nested_state->data.vmx[0];
+ int ret;
+
+ if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
+ return -EINVAL;
+
+ if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) {
+ if (kvm_state->hdr.vmx.smm.flags)
+ return -EINVAL;
+
+ if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)
+ return -EINVAL;
+
+ /*
+ * KVM_STATE_NESTED_EVMCS used to signal that KVM should
+ * enable the eVMCS capability on the vCPU. However, the code
+ * has since been changed such that the flag signals that vmcs12
+ * should be copied into the eVMCS in guest memory.
+ *
+ * To preserve backwards compatibility, allow userspace
+ * to set this flag even when there is no VMXON region.
+ */
+ if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
+ return -EINVAL;
+ } else {
+ if (!guest_can_use(vcpu, X86_FEATURE_VMX))
+ return -EINVAL;
+
+ if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
+ return -EINVAL;
+ }
+
+ if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
+ (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
+ return -EINVAL;
+
+ if (kvm_state->hdr.vmx.smm.flags &
+ ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
+ return -EINVAL;
+
+ if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
+ return -EINVAL;
+
+ /*
+ * SMM temporarily disables VMX, so we cannot be in guest mode,
+ * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
+ * must be zero.
+ */
+ if (is_smm(vcpu) ?
+ (kvm_state->flags &
+ (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
+ : kvm_state->hdr.vmx.smm.flags)
+ return -EINVAL;
+
+ if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
+ !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
+ return -EINVAL;
+
+ if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
+ (!guest_can_use(vcpu, X86_FEATURE_VMX) ||
+ !vmx->nested.enlightened_vmcs_enabled))
+ return -EINVAL;
+
+ vmx_leave_nested(vcpu);
+
+ if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA)
+ return 0;
+
+ vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
+ ret = enter_vmx_operation(vcpu);
+ if (ret)
+ return ret;
+
+ /* Empty 'VMXON' state is permitted if no VMCS loaded */
+ if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
+ /* See vmx_has_valid_vmcs12. */
+ if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
+ (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
+ (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA))
+ return -EINVAL;
+ else
+ return 0;
+ }
+
+ if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) {
+ if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
+ !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
+ return -EINVAL;
+
+ set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
+ } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
+ /*
+ * nested_vmx_handle_enlightened_vmptrld() cannot be called
+ * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
+ * restored yet. EVMCS will be mapped from
+ * nested_get_vmcs12_pages().
+ */
+ vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
+ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+ } else {
+ return -EINVAL;
+ }
+
+ if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
+ vmx->nested.smm.vmxon = true;
+ vmx->nested.vmxon = false;
+
+ if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
+ vmx->nested.smm.guest_mode = true;
+ }
+
+ vmcs12 = get_vmcs12(vcpu);
+ if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
+ return -EFAULT;
+
+ if (vmcs12->hdr.revision_id != VMCS12_REVISION)
+ return -EINVAL;
+
+ if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
+ return 0;
+
+ vmx->nested.nested_run_pending =
+ !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
+
+ vmx->nested.mtf_pending =
+ !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);
+
+ ret = -EINVAL;
+ if (nested_cpu_has_shadow_vmcs(vmcs12) &&
+ vmcs12->vmcs_link_pointer != INVALID_GPA) {
+ struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
+
+ if (kvm_state->size <
+ sizeof(*kvm_state) +
+ sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
+ goto error_guest_mode;
+
+ if (copy_from_user(shadow_vmcs12,
+ user_vmx_nested_state->shadow_vmcs12,
+ sizeof(*shadow_vmcs12))) {
+ ret = -EFAULT;
+ goto error_guest_mode;
+ }
+
+ if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
+ !shadow_vmcs12->hdr.shadow_vmcs)
+ goto error_guest_mode;
+ }
+
+ vmx->nested.has_preemption_timer_deadline = false;
+ if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
+ vmx->nested.has_preemption_timer_deadline = true;
+ vmx->nested.preemption_timer_deadline =
+ kvm_state->hdr.vmx.preemption_timer_deadline;
+ }
+
+ if (nested_vmx_check_controls(vcpu, vmcs12) ||
+ nested_vmx_check_host_state(vcpu, vmcs12) ||
+ nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
+ goto error_guest_mode;
+
+ vmx->nested.dirty_vmcs12 = true;
+ vmx->nested.force_msr_bitmap_recalc = true;
+ ret = nested_vmx_enter_non_root_mode(vcpu, false);
+ if (ret)
+ goto error_guest_mode;
+
+ if (vmx->nested.mtf_pending)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ return 0;
+
+error_guest_mode:
+ vmx->nested.nested_run_pending = 0;
+ return ret;
+}
+
+void nested_vmx_set_vmcs_shadowing_bitmap(void)
+{
+ if (enable_shadow_vmcs) {
+ vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
+ vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
+ }
+}
+
+/*
+ * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo
+ * that madness to get the encoding for comparison.
+ */
+#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))
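+/*
+ * e.g. (illustrative): vmcs12 index 0x0001 rotates back to encoding
+ * 0x0400, and rotating 0x0400 left by 6 with 16-bit wraparound returns
+ * index 0x0001.
+ */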
+
+static u64 nested_vmx_calc_vmcs_enum_msr(void)
+{
+ /*
+ * Note, these are the so-called "index" of the VMCS field encoding, not
+ * the index into vmcs12.
+ */
+ unsigned int max_idx, idx;
+ int i;
+
+ /*
+ * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
+ * vmcs12, regardless of whether or not the associated feature is
+ * exposed to L1. Simply find the field with the highest index.
+ */
+ max_idx = 0;
+ for (i = 0; i < nr_vmcs12_fields; i++) {
+ /* The vmcs12 table is very, very sparsely populated. */
+ if (!vmcs12_field_offsets[i])
+ continue;
+
+ idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
+ if (idx > max_idx)
+ max_idx = idx;
+ }
+
+ return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
+}
+
+static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
+{
+ msrs->pinbased_ctls_low =
+ PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+
+ msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl;
+ msrs->pinbased_ctls_high &=
+ PIN_BASED_EXT_INTR_MASK |
+ PIN_BASED_NMI_EXITING |
+ PIN_BASED_VIRTUAL_NMIS |
+ (enable_apicv ? PIN_BASED_POSTED_INTR : 0);
+ msrs->pinbased_ctls_high |=
+ PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+ PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
+static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
+{
+ msrs->exit_ctls_low =
+ VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+
+ msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl;
+ msrs->exit_ctls_high &=
+#ifdef CONFIG_X86_64
+ VM_EXIT_HOST_ADDR_SPACE_SIZE |
+#endif
+ VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
+ VM_EXIT_CLEAR_BNDCFGS;
+ msrs->exit_ctls_high |=
+ VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+ VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT |
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+
+ /* We support free control of debug control saving. */
+ msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
+}
+
+static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
+{
+ msrs->entry_ctls_low =
+ VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
+
+ msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl;
+ msrs->entry_ctls_high &=
+#ifdef CONFIG_X86_64
+ VM_ENTRY_IA32E_MODE |
+#endif
+ VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
+ msrs->entry_ctls_high |=
+ (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER |
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
+
+ /* We support free control of debug control loading. */
+ msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
+}
+
+static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
+{
+ msrs->procbased_ctls_low =
+ CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+
+ msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl;
+ msrs->procbased_ctls_high &=
+ CPU_BASED_INTR_WINDOW_EXITING |
+ CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
+ CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
+ CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING |
+#ifdef CONFIG_X86_64
+ CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
+#endif
+ CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
+ CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
+ CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
+ CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
+ CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+ /*
+ * We can allow some features even when not supported by the
+ * hardware. For example, L1 can specify an MSR bitmap - and we
+ * can use it to avoid exits to L1 - even when L0 runs L2
+ * without MSR bitmaps.
+ */
+ msrs->procbased_ctls_high |=
+ CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+ CPU_BASED_USE_MSR_BITMAPS;
+
+ /* We support free control of CR3 access interception. */
+ msrs->procbased_ctls_low &=
+ ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
+}
+
+static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
+ struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
+{
+ msrs->secondary_ctls_low = 0;
+
+ msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
+ msrs->secondary_ctls_high &=
+ SECONDARY_EXEC_DESC |
+ SECONDARY_EXEC_ENABLE_RDTSCP |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_WBINVD_EXITING |
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
+ SECONDARY_EXEC_RDRAND_EXITING |
+ SECONDARY_EXEC_ENABLE_INVPCID |
+ SECONDARY_EXEC_ENABLE_VMFUNC |
+ SECONDARY_EXEC_RDSEED_EXITING |
+ SECONDARY_EXEC_ENABLE_XSAVES |
+ SECONDARY_EXEC_TSC_SCALING |
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
+
+ /*
+ * We can emulate "VMCS shadowing," even if the hardware
+ * doesn't support it.
+ */
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_SHADOW_VMCS;
+
+ if (enable_ept) {
+ /* nested EPT: emulate EPT also for L1 */
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_EPT;
+ msrs->ept_caps =
+ VMX_EPT_PAGE_WALK_4_BIT |
+ VMX_EPT_PAGE_WALK_5_BIT |
+ VMX_EPTP_WB_BIT |
+ VMX_EPT_INVEPT_BIT |
+ VMX_EPT_EXECUTE_ONLY_BIT;
+
+ msrs->ept_caps &= ept_caps;
+ msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
+ VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
+ VMX_EPT_1GB_PAGE_BIT;
+ if (enable_ept_ad_bits) {
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_PML;
+ msrs->ept_caps |= VMX_EPT_AD_BIT;
+ }
+
+ /*
+ * Advertise EPTP switching irrespective of hardware support;
+ * KVM emulates it in software so long as VMFUNC is supported.
+ */
+ if (cpu_has_vmx_vmfunc())
+ msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING;
+ }
+
+ /*
+ * Old versions of KVM use the single-context version without
+ * checking for support, so declare that it is supported even
+ * though it is treated as global context. The alternative is
+ * not failing the single-context invvpid, and it is worse.
+ */
+ if (enable_vpid) {
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_VPID;
+ msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
+ VMX_VPID_EXTENT_SUPPORTED_MASK;
+ }
+
+ if (enable_unrestricted_guest)
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_UNRESTRICTED_GUEST;
+
+ if (flexpriority_enabled)
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+
+ if (enable_sgx)
+ msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
+}
+
+static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
+{
+ msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
+ msrs->misc_low |=
+ MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
+ VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
+ VMX_MISC_ACTIVITY_HLT |
+ VMX_MISC_ACTIVITY_WAIT_SIPI;
+ msrs->misc_high = 0;
+}
+
+static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
+{
+ /*
+ * This MSR reports some information about VMX support. We
+ * should return information about the VMX we emulate for the
+ * guest, and the VMCS structure we give it - not about the
+ * VMX support of the underlying hardware.
+ */
+ msrs->basic =
+ VMCS12_REVISION |
+ VMX_BASIC_TRUE_CTLS |
+ ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
+ (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+
+ if (cpu_has_vmx_basic_inout())
+ msrs->basic |= VMX_BASIC_INOUT;
+}
+
+static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
+{
+ /*
+ * These MSRs specify bits which the guest must keep fixed on
+ * while L1 is in VMXON mode (in L1's root mode, or running an L2).
+ * We picked the standard core2 setting.
+ */
+#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
+#define VMXON_CR4_ALWAYSON X86_CR4_VMXE
+ msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
+ msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
+
+ /* These MSRs specify bits which the guest must keep fixed off. */
+ rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
+ rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
+
+ if (vmx_umip_emulated())
+ msrs->cr4_fixed1 |= X86_CR4_UMIP;
+}
+
+/*
+ * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
+ * returned for the various VMX controls MSRs when nested VMX is enabled.
+ * The same values should also be used to verify that vmcs12 control fields are
+ * valid during nested entry from L1 to L2.
+ * Each of these control MSRs has a low and a high 32-bit half: a low bit is on
+ * if the corresponding bit in the (32-bit) control field *must* be on, and a
+ * bit in the high half is on if the corresponding bit in the control field
+ * may be on. See also vmx_control_verify().
+ */
+void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
+{
+ struct nested_vmx_msrs *msrs = &vmcs_conf->nested;
+
+ /*
+ * Note that as a general rule, the high half of the MSRs (bits in
+ * the control fields which may be 1) should be initialized by the
+ * intersection of the underlying hardware's MSR (i.e., features which
+ * can be supported) and the list of features we want to expose -
+ * because they are known to be properly supported in our code.
+ * Also, usually, the low half of the MSRs (bits which must be 1) can
+ * be set to 0, meaning that L1 may turn off any of these bits. The
+	 * reason is that if one of these bits is necessary, it will appear
+	 * in vmcs01, and prepare_vmcs02, which bitwise-ORs the control
+	 * fields of vmcs01 and vmcs12, will keep it set in vmcs02 - and
+	 * nested_vmx_l1_wants_exit() will not pass related exits to L1.
+ * These rules have exceptions below.
+ */
+ nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs);
+
+ nested_vmx_setup_exit_ctls(vmcs_conf, msrs);
+
+ nested_vmx_setup_entry_ctls(vmcs_conf, msrs);
+
+ nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs);
+
+ nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs);
+
+ nested_vmx_setup_misc_data(vmcs_conf, msrs);
+
+ nested_vmx_setup_basic(msrs);
+
+ nested_vmx_setup_cr_fixed(msrs);
+
+ msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
+}
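+
+/*
+ * Illustrative sketch (not part of the upstream source): per the
+ * low/high-half encoding described above, a 32-bit control-field value
+ * is valid iff every "must be 1" (low) bit is set and no bit outside
+ * the "may be 1" (high) mask is set, which is what vmx_control_verify()
+ * checks:
+ *
+ *	static inline bool control_is_valid(u32 control, u32 low, u32 high)
+ *	{
+ *		return (control & low) == low && !(control & ~high);
+ *	}
+ */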
+
+void nested_vmx_hardware_unsetup(void)
+{
+ int i;
+
+ if (enable_shadow_vmcs) {
+ for (i = 0; i < VMX_BITMAP_NR; i++)
+ free_page((unsigned long)vmx_bitmap[i]);
+ }
+}
+
+__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
+{
+ int i;
+
+ if (!cpu_has_vmx_shadow_vmcs())
+ enable_shadow_vmcs = 0;
+ if (enable_shadow_vmcs) {
+ for (i = 0; i < VMX_BITMAP_NR; i++) {
+ /*
+ * The vmx_bitmap is not tied to a VM and so should
+ * not be charged to a memcg.
+ */
+ vmx_bitmap[i] = (unsigned long *)
+ __get_free_page(GFP_KERNEL);
+ if (!vmx_bitmap[i]) {
+ nested_vmx_hardware_unsetup();
+ return -ENOMEM;
+ }
+ }
+
+ init_vmcs_shadow_fields();
+ }
+
+ exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear;
+ exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch;
+ exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld;
+ exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst;
+ exit_handlers[EXIT_REASON_VMREAD] = handle_vmread;
+ exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
+ exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite;
+ exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff;
+ exit_handlers[EXIT_REASON_VMON] = handle_vmxon;
+ exit_handlers[EXIT_REASON_INVEPT] = handle_invept;
+ exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
+ exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;
+
+ return 0;
+}
+
+struct kvm_x86_nested_ops vmx_nested_ops = {
+ .leave_nested = vmx_leave_nested,
+ .is_exception_vmexit = nested_vmx_is_exception_vmexit,
+ .check_events = vmx_check_nested_events,
+ .has_events = vmx_has_nested_events,
+ .triple_fault = nested_vmx_triple_fault,
+ .get_state = vmx_get_nested_state,
+ .set_state = vmx_set_nested_state,
+ .get_nested_state_pages = vmx_get_nested_state_pages,
+ .write_log_dirty = nested_vmx_write_pml_buffer,
+ .enable_evmcs = nested_enable_evmcs,
+ .get_evmcs_version = nested_get_evmcs_version,
+ .hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
+};
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
new file mode 100644
index 000000000..b4b9d5143
--- /dev/null
+++ b/arch/x86/kvm/vmx/nested.h
@@ -0,0 +1,293 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_NESTED_H
+#define __KVM_X86_VMX_NESTED_H
+
+#include "kvm_cache_regs.h"
+#include "vmcs12.h"
+#include "vmx.h"
+
+/*
+ * Status returned by nested_vmx_enter_non_root_mode():
+ */
+enum nvmx_vmentry_status {
+ NVMX_VMENTRY_SUCCESS, /* Entered VMX non-root mode */
+ NVMX_VMENTRY_VMFAIL, /* Consistency check VMFail */
+ NVMX_VMENTRY_VMEXIT, /* Consistency check VMExit */
+ NVMX_VMENTRY_KVM_INTERNAL_ERROR,/* KVM internal error */
+};
+
+void vmx_leave_nested(struct kvm_vcpu *vcpu);
+void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps);
+void nested_vmx_hardware_unsetup(void);
+__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *));
+void nested_vmx_set_vmcs_shadowing_bitmap(void);
+void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu);
+enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
+ bool from_vmentry);
+bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu);
+void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
+ u32 exit_intr_info, unsigned long exit_qualification);
+void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu);
+int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
+int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
+ u32 vmx_instruction_info, bool wr, int len, gva_t *ret);
+void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu);
+bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
+ int size);
+
+static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.cached_vmcs12;
+}
+
+static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
+}
+
+/*
+ * Note: the same condition is checked against the state provided by userspace
+ * in vmx_set_nested_state; if it is satisfied, the nested state must include
+ * the VMCS12.
+ */
+static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
+ return vmx->nested.current_vmptr != -1ull ||
+ vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID;
+}
+
+static inline u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
+}
+
+static inline unsigned long nested_ept_get_eptp(struct kvm_vcpu *vcpu)
+{
+ /* return the page table to be shadowed - in our case, EPT12 */
+ return get_vmcs12(vcpu)->ept_pointer;
+}
+
+static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
+{
+ return nested_ept_get_eptp(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
+}
+
+/*
+ * Return the cr0/4 value that a nested guest would read. This is a combination
+ * of L1's "real" cr0 used to run the guest (guest_cr0), and the bits shadowed
+ * by the L1 hypervisor (cr0_read_shadow). KVM must emulate CPU behavior as
+ * the value+mask loaded into vmcs02 may not match the vmcs12 fields.
+ */
+static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
+{
+ return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
+ (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
+}
+static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
+{
+ return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
+ (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
+}
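+
+/*
+ * Worked example (hypothetical values): with cr0_guest_host_mask =
+ * X86_CR0_TS, guest_cr0 = X86_CR0_PE | X86_CR0_PG and cr0_read_shadow =
+ * X86_CR0_TS, an L2 read of CR0 observes PE | PG | TS: the TS bit comes
+ * from the read shadow because L1 owns it, while all other bits come
+ * from the real guest_cr0.
+ */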
+
+static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
+{
+ return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low);
+}
+
+/*
+ * Do the virtual VMX capability MSRs specify that L1 can use VMWRITE
+ * to modify any valid field of the VMCS, or are the VM-exit
+ * information fields read-only?
+ */
+static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.msrs.misc_low &
+ MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
+}
+
+static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.msrs.misc_low & VMX_MISC_ZERO_LEN_INS;
+}
+
+static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.msrs.procbased_ctls_high &
+ CPU_BASED_MONITOR_TRAP_FLAG;
+}
+
+static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
+ SECONDARY_EXEC_SHADOW_VMCS;
+}
+
+static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
+{
+ return vmcs12->cpu_based_vm_exec_control & bit;
+}
+
+static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
+{
+ return (vmcs12->cpu_based_vm_exec_control &
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
+ (vmcs12->secondary_vm_exec_control & bit);
+}
+
+static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control &
+ PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
+static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING;
+}
+
+static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
+}
+
+static inline int nested_cpu_has_mtf(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
+}
+
+static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
+}
+
+static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES);
+}
+
+static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML);
+}
+
+static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
+}
+
+static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
+}
+
+static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
+}
+
+static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+}
+
+static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
+}
+
+static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
+}
+
+static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has_vmfunc(vmcs12) &&
+ (vmcs12->vm_function_control &
+ VMX_VMFUNC_EPTP_SWITCHING);
+}
+
+static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS);
+}
+
+static inline bool nested_cpu_has_save_preemption_timer(struct vmcs12 *vmcs12)
+{
+ return vmcs12->vm_exit_controls &
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
+}
+
+static inline bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
+{
+ return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
+}
+
+/*
+ * In nested virtualization, check if L1 asked to exit on external interrupts.
+ * For most existing hypervisors, this will always return true.
+ */
+static inline bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
+{
+ return get_vmcs12(vcpu)->pin_based_vm_exec_control &
+ PIN_BASED_EXT_INTR_MASK;
+}
+
+static inline bool nested_cpu_has_encls_exit(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING);
+}
+
+/*
+ * if fixed0[i] == 1: val[i] must be 1
+ * if fixed1[i] == 0: val[i] must be 0
+ */
+static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1)
+{
+ return ((val & fixed1) | fixed0) == val;
+}
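+
+/*
+ * Quick illustration: with fixed0 = X86_CR0_PE and fixed1 = ~0ull,
+ * val = 0 fails the check because ((0 & ~0ull) | PE) == PE != 0,
+ * i.e. the guest attempted to clear a bit that must stay set.
+ */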
+
+static inline bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
+ SECONDARY_EXEC_UNRESTRICTED_GUEST &&
+ nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
+ fixed0 &= ~(X86_CR0_PE | X86_CR0_PG);
+
+ return fixed_bits_valid(val, fixed0, fixed1);
+}
+
+static inline bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
+
+ return fixed_bits_valid(val, fixed0, fixed1);
+}
+
+static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1;
+
+ return fixed_bits_valid(val, fixed0, fixed1) &&
+ __kvm_is_valid_cr4(vcpu, val);
+}
+
+/* No difference in the restrictions on guest and host CR4 in VMX operation. */
+#define nested_guest_cr4_valid nested_cr4_valid
+#define nested_host_cr4_valid nested_cr4_valid
+
+extern struct kvm_x86_nested_ops vmx_nested_ops;
+
+#endif /* __KVM_X86_VMX_NESTED_H */
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
new file mode 100644
index 000000000..90c1f7f07
--- /dev/null
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -0,0 +1,788 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM PMU support for Intel CPUs
+ *
+ * Copyright 2011 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Avi Kivity <avi@redhat.com>
+ * Gleb Natapov <gleb@redhat.com>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include <asm/perf_event.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "nested.h"
+#include "pmu.h"
+
+#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
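+
+/*
+ * Aside: the full-width counter aliases (MSR_IA32_PMC0 = 0x4c1, ...) sit
+ * 0x400 above the legacy counters (MSR_IA32_PERFCTR0 = 0xc1, ...), so the
+ * difference is a single bit, and "msr & MSR_PMC_FULL_WIDTH_BIT" below
+ * distinguishes the two MSR ranges.
+ */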
+
+enum intel_pmu_architectural_events {
+ /*
+ * The order of the architectural events matters as support for each
+ * event is enumerated via CPUID using the index of the event.
+ */
+ INTEL_ARCH_CPU_CYCLES,
+ INTEL_ARCH_INSTRUCTIONS_RETIRED,
+ INTEL_ARCH_REFERENCE_CYCLES,
+ INTEL_ARCH_LLC_REFERENCES,
+ INTEL_ARCH_LLC_MISSES,
+ INTEL_ARCH_BRANCHES_RETIRED,
+ INTEL_ARCH_BRANCHES_MISPREDICTED,
+
+ NR_REAL_INTEL_ARCH_EVENTS,
+
+ /*
+ * Pseudo-architectural event used to implement IA32_FIXED_CTR2, a.k.a.
+ * TSC reference cycles. The architectural reference cycles event may
+ * or may not actually use the TSC as the reference, e.g. might use the
+ * core crystal clock or the bus clock (yeah, "architectural").
+ */
+ PSEUDO_ARCH_REFERENCE_CYCLES = NR_REAL_INTEL_ARCH_EVENTS,
+ NR_INTEL_ARCH_EVENTS,
+};
+
+static struct {
+ u8 eventsel;
+ u8 unit_mask;
+} const intel_arch_events[] = {
+ [INTEL_ARCH_CPU_CYCLES] = { 0x3c, 0x00 },
+ [INTEL_ARCH_INSTRUCTIONS_RETIRED] = { 0xc0, 0x00 },
+ [INTEL_ARCH_REFERENCE_CYCLES] = { 0x3c, 0x01 },
+ [INTEL_ARCH_LLC_REFERENCES] = { 0x2e, 0x4f },
+ [INTEL_ARCH_LLC_MISSES] = { 0x2e, 0x41 },
+ [INTEL_ARCH_BRANCHES_RETIRED] = { 0xc4, 0x00 },
+ [INTEL_ARCH_BRANCHES_MISPREDICTED] = { 0xc5, 0x00 },
+ [PSEUDO_ARCH_REFERENCE_CYCLES] = { 0x00, 0x03 },
+};
+
+/* mapping between fixed pmc index and intel_arch_events array */
+static int fixed_pmc_events[] = {
+ [0] = INTEL_ARCH_INSTRUCTIONS_RETIRED,
+ [1] = INTEL_ARCH_CPU_CYCLES,
+ [2] = PSEUDO_ARCH_REFERENCE_CYCLES,
+};
+
+static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
+{
+ struct kvm_pmc *pmc;
+ u8 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl;
+ int i;
+
+ pmu->fixed_ctr_ctrl = data;
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+ u8 new_ctrl = fixed_ctrl_field(data, i);
+ u8 old_ctrl = fixed_ctrl_field(old_fixed_ctr_ctrl, i);
+
+ if (old_ctrl == new_ctrl)
+ continue;
+
+ pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
+
+ __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use);
+ kvm_pmu_request_counter_reprogram(pmc);
+ }
+}
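+
+/*
+ * For reference: IA32_FIXED_CTR_CTRL packs one 4-bit control field per
+ * fixed counter (enable-OS, enable-USR, AnyThread, enable-PMI), so the
+ * fixed_ctrl_field() helper used above effectively computes
+ * (data >> (i * 4)) & 0xf.
+ */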
+
+static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
+{
+ if (pmc_idx < INTEL_PMC_IDX_FIXED) {
+ return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx,
+ MSR_P6_EVNTSEL0);
+ } else {
+ u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED;
+
+ return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0);
+ }
+}
+
+static bool intel_hw_event_available(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+ u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
+ u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
+ int i;
+
+ BUILD_BUG_ON(ARRAY_SIZE(intel_arch_events) != NR_INTEL_ARCH_EVENTS);
+
+ /*
+ * Disallow events reported as unavailable in guest CPUID. Note, this
+ * doesn't apply to pseudo-architectural events.
+ */
+ for (i = 0; i < NR_REAL_INTEL_ARCH_EVENTS; i++) {
+ if (intel_arch_events[i].eventsel != event_select ||
+ intel_arch_events[i].unit_mask != unit_mask)
+ continue;
+
+ return pmu->available_event_types & BIT(i);
+ }
+
+ return true;
+}
+
+static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ bool fixed = idx & (1u << 30);
+
+ idx &= ~(3u << 30);
+
+ return fixed ? idx < pmu->nr_arch_fixed_counters
+ : idx < pmu->nr_arch_gp_counters;
+}
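+
+/*
+ * Example (illustrative): RDPMC with ECX = (1u << 30) | 1 targets fixed
+ * counter 1, while ECX = 0 targets GP counter 0; bit 30 selects the
+ * fixed-counter range, and bits 31:30 are masked off before the bounds
+ * check.
+ */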
+
+static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
+ unsigned int idx, u64 *mask)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ bool fixed = idx & (1u << 30);
+ struct kvm_pmc *counters;
+ unsigned int num_counters;
+
+ idx &= ~(3u << 30);
+ if (fixed) {
+ counters = pmu->fixed_counters;
+ num_counters = pmu->nr_arch_fixed_counters;
+ } else {
+ counters = pmu->gp_counters;
+ num_counters = pmu->nr_arch_gp_counters;
+ }
+ if (idx >= num_counters)
+ return NULL;
+ *mask &= pmu->counter_bitmask[fixed ? KVM_PMC_FIXED : KVM_PMC_GP];
+ return &counters[array_index_nospec(idx, num_counters)];
+}
+
+static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu)
+{
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+ return 0;
+
+ return vcpu->arch.perf_capabilities;
+}
+
+static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu)
+{
+ return (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_FW_WRITES) != 0;
+}
+
+static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr)
+{
+ if (!fw_writes_is_enabled(pmu_to_vcpu(pmu)))
+ return NULL;
+
+ return get_gp_pmc(pmu, msr, MSR_IA32_PMC0);
+}
+
+static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index)
+{
+ struct x86_pmu_lbr *records = vcpu_to_lbr_records(vcpu);
+ bool ret = false;
+
+ if (!intel_pmu_lbr_is_enabled(vcpu))
+ return ret;
+
+ ret = (index == MSR_LBR_SELECT) || (index == MSR_LBR_TOS) ||
+ (index >= records->from && index < records->from + records->nr) ||
+ (index >= records->to && index < records->to + records->nr);
+
+ if (!ret && records->info)
+ ret = (index >= records->info && index < records->info + records->nr);
+
+ return ret;
+}
+
+static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ u64 perf_capabilities;
+ int ret;
+
+ switch (msr) {
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ return kvm_pmu_has_perf_global_ctrl(pmu);
+ case MSR_IA32_PEBS_ENABLE:
+ ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT;
+ break;
+ case MSR_IA32_DS_AREA:
+ ret = guest_cpuid_has(vcpu, X86_FEATURE_DS);
+ break;
+ case MSR_PEBS_DATA_CFG:
+ perf_capabilities = vcpu_get_perf_capabilities(vcpu);
+ ret = (perf_capabilities & PERF_CAP_PEBS_BASELINE) &&
+ ((perf_capabilities & PERF_CAP_PEBS_FORMAT) > 3);
+ break;
+ default:
+ ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
+ get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
+ get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr) ||
+ intel_pmu_is_valid_lbr_msr(vcpu, msr);
+ break;
+ }
+
+ return ret;
+}
+
+static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+
+ pmc = get_fixed_pmc(pmu, msr);
+ pmc = pmc ? pmc : get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0);
+ pmc = pmc ? pmc : get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0);
+
+ return pmc;
+}
+
+static inline void intel_pmu_release_guest_lbr_event(struct kvm_vcpu *vcpu)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ if (lbr_desc->event) {
+ perf_event_release_kernel(lbr_desc->event);
+ lbr_desc->event = NULL;
+ vcpu_to_pmu(vcpu)->event_count--;
+ }
+}
+
+int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct perf_event *event;
+
+ /*
+	 * The perf_event_attr is constructed with the minimum settings needed:
+	 * - set 'pinned = true' to make it task pinned so that if another
+	 *   cpu pinned event reclaims LBR, the event->oncpu will be set to -1;
+	 * - set '.exclude_host = true' to record guest branches behavior;
+	 *
+	 * - set '.config = INTEL_FIXED_VLBR_EVENT' to indicate that host perf
+	 *   should schedule the event with a fake (virtual) counter rather
+	 *   than a real HW counter;
+	 *   check is_guest_lbr_event() and __intel_get_event_constraints();
+	 *
+	 * - set 'sample_type = PERF_SAMPLE_BRANCH_STACK' and
+	 *   'branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
+	 *   PERF_SAMPLE_BRANCH_USER' to configure it as a LBR callstack
+	 *   event, which helps KVM save/restore guest LBR records
+	 *   across host context switches and avoids considerable overhead;
+	 *   check branch_user_callstack() and intel_pmu_lbr_sched_task();
+ */
+ struct perf_event_attr attr = {
+ .type = PERF_TYPE_RAW,
+ .size = sizeof(attr),
+ .config = INTEL_FIXED_VLBR_EVENT,
+ .sample_type = PERF_SAMPLE_BRANCH_STACK,
+ .pinned = true,
+ .exclude_host = true,
+ .branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
+ PERF_SAMPLE_BRANCH_USER,
+ };
+
+ if (unlikely(lbr_desc->event)) {
+ __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use);
+ return 0;
+ }
+
+ event = perf_event_create_kernel_counter(&attr, -1,
+ current, NULL, NULL);
+ if (IS_ERR(event)) {
+ pr_debug_ratelimited("%s: failed %ld\n",
+ __func__, PTR_ERR(event));
+ return PTR_ERR(event);
+ }
+ lbr_desc->event = event;
+ pmu->event_count++;
+ __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use);
+ return 0;
+}
+
+/*
+ * It's safe to access LBR MSRs from the guest when they have not
+ * been passed through, since the host will restore or reset the
+ * LBR MSR records when the guest LBR event is scheduled in.
+ */
+static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu,
+ struct msr_data *msr_info, bool read)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+ u32 index = msr_info->index;
+
+ if (!intel_pmu_is_valid_lbr_msr(vcpu, index))
+ return false;
+
+ if (!lbr_desc->event && intel_pmu_create_guest_lbr_event(vcpu) < 0)
+ goto dummy;
+
+ /*
+	 * Disable IRQs to ensure the LBR feature doesn't get reclaimed by the
+	 * host at the time the value is read from the MSR, which avoids the
+	 * host LBR value being leaked to the guest. If LBR has been reclaimed,
+ * return 0 on guest reads.
+ */
+ local_irq_disable();
+ if (lbr_desc->event->state == PERF_EVENT_STATE_ACTIVE) {
+ if (read)
+ rdmsrl(index, msr_info->data);
+ else
+ wrmsrl(index, msr_info->data);
+ __set_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use);
+ local_irq_enable();
+ return true;
+ }
+ clear_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use);
+ local_irq_enable();
+
+dummy:
+ if (read)
+ msr_info->data = 0;
+ return true;
+}
+
+static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u32 msr = msr_info->index;
+
+ switch (msr) {
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ msr_info->data = pmu->fixed_ctr_ctrl;
+ break;
+ case MSR_IA32_PEBS_ENABLE:
+ msr_info->data = pmu->pebs_enable;
+ break;
+ case MSR_IA32_DS_AREA:
+ msr_info->data = pmu->ds_area;
+ break;
+ case MSR_PEBS_DATA_CFG:
+ msr_info->data = pmu->pebs_data_cfg;
+ break;
+ default:
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
+ (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
+ u64 val = pmc_read_counter(pmc);
+ msr_info->data =
+ val & pmu->counter_bitmask[KVM_PMC_GP];
+ break;
+ } else if ((pmc = get_fixed_pmc(pmu, msr))) {
+ u64 val = pmc_read_counter(pmc);
+ msr_info->data =
+ val & pmu->counter_bitmask[KVM_PMC_FIXED];
+ break;
+ } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
+ msr_info->data = pmc->eventsel;
+ break;
+ } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true)) {
+ break;
+ }
+ return 1;
+ }
+
+ return 0;
+}
+
+static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+ u64 reserved_bits, diff;
+
+ switch (msr) {
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ if (data & pmu->fixed_ctr_ctrl_mask)
+ return 1;
+
+ if (pmu->fixed_ctr_ctrl != data)
+ reprogram_fixed_counters(pmu, data);
+ break;
+ case MSR_IA32_PEBS_ENABLE:
+ if (data & pmu->pebs_enable_mask)
+ return 1;
+
+ if (pmu->pebs_enable != data) {
+ diff = pmu->pebs_enable ^ data;
+ pmu->pebs_enable = data;
+ reprogram_counters(pmu, diff);
+ }
+ break;
+ case MSR_IA32_DS_AREA:
+ if (is_noncanonical_address(data, vcpu))
+ return 1;
+
+ pmu->ds_area = data;
+ break;
+ case MSR_PEBS_DATA_CFG:
+ if (data & pmu->pebs_data_cfg_mask)
+ return 1;
+
+ pmu->pebs_data_cfg = data;
+ break;
+ default:
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
+ (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
+ if ((msr & MSR_PMC_FULL_WIDTH_BIT) &&
+ (data & ~pmu->counter_bitmask[KVM_PMC_GP]))
+ return 1;
+
+ if (!msr_info->host_initiated &&
+ !(msr & MSR_PMC_FULL_WIDTH_BIT))
+ data = (s64)(s32)data;
+ pmc_write_counter(pmc, data);
+ pmc_update_sample_period(pmc);
+ break;
+ } else if ((pmc = get_fixed_pmc(pmu, msr))) {
+ pmc_write_counter(pmc, data);
+ pmc_update_sample_period(pmc);
+ break;
+ } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
+ reserved_bits = pmu->reserved_bits;
+ if ((pmc->idx == 2) &&
+ (pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED))
+ reserved_bits ^= HSW_IN_TX_CHECKPOINTED;
+ if (data & reserved_bits)
+ return 1;
+
+ if (data != pmc->eventsel) {
+ pmc->eventsel = data;
+ kvm_pmu_request_counter_reprogram(pmc);
+ }
+ break;
+ } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) {
+ break;
+ }
+ /* Not a known PMU MSR. */
+ return 1;
+ }
+
+ return 0;
+}
+
+static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu)
+{
+ int i;
+
+ BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_events) != KVM_PMC_MAX_FIXED);
+
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+ int index = array_index_nospec(i, KVM_PMC_MAX_FIXED);
+ struct kvm_pmc *pmc = &pmu->fixed_counters[index];
+ u32 event = fixed_pmc_events[index];
+
+ pmc->eventsel = (intel_arch_events[event].unit_mask << 8) |
+ intel_arch_events[event].eventsel;
+ }
+}
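+
+/*
+ * Worked example: fixed counter 0 maps to INTEL_ARCH_INSTRUCTIONS_RETIRED,
+ * i.e. { 0xc0, 0x00 } in intel_arch_events[], so its synthesized eventsel
+ * is (0x00 << 8) | 0xc0 = 0xc0.
+ */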
+
+static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+ struct kvm_cpuid_entry2 *entry;
+ union cpuid10_eax eax;
+ union cpuid10_edx edx;
+ u64 perf_capabilities;
+ u64 counter_mask;
+ int i;
+
+ pmu->nr_arch_gp_counters = 0;
+ pmu->nr_arch_fixed_counters = 0;
+ pmu->counter_bitmask[KVM_PMC_GP] = 0;
+ pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+ pmu->version = 0;
+ pmu->reserved_bits = 0xffffffff00200000ull;
+ pmu->raw_event_mask = X86_RAW_EVENT_MASK;
+ pmu->global_ctrl_mask = ~0ull;
+ pmu->global_status_mask = ~0ull;
+ pmu->fixed_ctr_ctrl_mask = ~0ull;
+ pmu->pebs_enable_mask = ~0ull;
+ pmu->pebs_data_cfg_mask = ~0ull;
+
+ memset(&lbr_desc->records, 0, sizeof(lbr_desc->records));
+
+ /*
+ * Setting passthrough of LBR MSRs is done only in the VM-Entry loop,
+ * and PMU refresh is disallowed after the vCPU has run, i.e. this code
+ * should never be reached while KVM is passing through MSRs.
+ */
+ if (KVM_BUG_ON(lbr_desc->msr_passthrough, vcpu->kvm))
+ return;
+
+ entry = kvm_find_cpuid_entry(vcpu, 0xa);
+ if (!entry || !vcpu->kvm->arch.enable_pmu)
+ return;
+ eax.full = entry->eax;
+ edx.full = entry->edx;
+
+ pmu->version = eax.split.version_id;
+ if (!pmu->version)
+ return;
+
+ pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
+ kvm_pmu_cap.num_counters_gp);
+ eax.split.bit_width = min_t(int, eax.split.bit_width,
+ kvm_pmu_cap.bit_width_gp);
+ pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1;
+ eax.split.mask_length = min_t(int, eax.split.mask_length,
+ kvm_pmu_cap.events_mask_len);
+ pmu->available_event_types = ~entry->ebx &
+ ((1ull << eax.split.mask_length) - 1);
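+
+	/*
+	 * For reference, CPUID.0AH:EAX packs: bits 7:0 version ID, bits 15:8
+	 * number of GP counters, bits 23:16 GP counter bit width, and bits
+	 * 31:24 the length of the EBX event-availability mask consumed above.
+	 */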
+
+ if (pmu->version == 1) {
+ pmu->nr_arch_fixed_counters = 0;
+ } else {
+ pmu->nr_arch_fixed_counters = min_t(int, edx.split.num_counters_fixed,
+ kvm_pmu_cap.num_counters_fixed);
+ edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed,
+ kvm_pmu_cap.bit_width_fixed);
+ pmu->counter_bitmask[KVM_PMC_FIXED] =
+ ((u64)1 << edx.split.bit_width_fixed) - 1;
+ setup_fixed_pmc_eventsel(pmu);
+ }
+
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
+ pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4));
+ counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
+ (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED));
+ pmu->global_ctrl_mask = counter_mask;
+
+ /*
+ * GLOBAL_STATUS and GLOBAL_OVF_CONTROL (a.k.a. GLOBAL_STATUS_RESET)
+ * share reserved bit definitions. The kernel just happens to use
+ * OVF_CTRL for the names.
+ */
+ pmu->global_status_mask = pmu->global_ctrl_mask
+ & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF |
+ MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD);
+ if (vmx_pt_mode_is_host_guest())
+ pmu->global_status_mask &=
+ ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI;
+
+ entry = kvm_find_cpuid_entry_index(vcpu, 7, 0);
+ if (entry &&
+ (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
+ (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) {
+ pmu->reserved_bits ^= HSW_IN_TX;
+ pmu->raw_event_mask |= (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED);
+ }
+
+ bitmap_set(pmu->all_valid_pmc_idx,
+ 0, pmu->nr_arch_gp_counters);
+ bitmap_set(pmu->all_valid_pmc_idx,
+ INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters);
+
+ perf_capabilities = vcpu_get_perf_capabilities(vcpu);
+ if (cpuid_model_is_consistent(vcpu) &&
+ (perf_capabilities & PMU_CAP_LBR_FMT))
+ x86_perf_get_lbr(&lbr_desc->records);
+ else
+ lbr_desc->records.nr = 0;
+
+ if (lbr_desc->records.nr)
+ bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1);
+
+ if (perf_capabilities & PERF_CAP_PEBS_FORMAT) {
+ if (perf_capabilities & PERF_CAP_PEBS_BASELINE) {
+ pmu->pebs_enable_mask = counter_mask;
+ pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE;
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+ pmu->fixed_ctr_ctrl_mask &=
+ ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4));
+ }
+ pmu->pebs_data_cfg_mask = ~0xff00000full;
+ } else {
+ pmu->pebs_enable_mask =
+ ~((1ull << pmu->nr_arch_gp_counters) - 1);
+ }
+ }
+}
+
+static void intel_pmu_init(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) {
+ pmu->gp_counters[i].type = KVM_PMC_GP;
+ pmu->gp_counters[i].vcpu = vcpu;
+ pmu->gp_counters[i].idx = i;
+ pmu->gp_counters[i].current_config = 0;
+ }
+
+ for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
+ pmu->fixed_counters[i].type = KVM_PMC_FIXED;
+ pmu->fixed_counters[i].vcpu = vcpu;
+ pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
+ pmu->fixed_counters[i].current_config = 0;
+ }
+
+ lbr_desc->records.nr = 0;
+ lbr_desc->event = NULL;
+ lbr_desc->msr_passthrough = false;
+}
+
+static void intel_pmu_reset(struct kvm_vcpu *vcpu)
+{
+ intel_pmu_release_guest_lbr_event(vcpu);
+}
+
+/*
+ * Emulate LBR_On_PMI behavior for 1 < pmu.version < 4.
+ *
+ * If Freeze_LBR_On_PMI = 1, the LBR stack is frozen on a PMI, and
+ * KVM emulates this by clearing the LBR bit (bit 0) in IA32_DEBUGCTL.
+ *
+ * The guest needs to re-enable LBR to resume branch recording.
+ */
+static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
+{
+ u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+
+ if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
+ data &= ~DEBUGCTLMSR_LBR;
+ vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+ }
+}
+
+static void intel_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
+{
+ u8 version = vcpu_to_pmu(vcpu)->version;
+
+ if (!intel_pmu_lbr_is_enabled(vcpu))
+ return;
+
+ if (version > 1 && version < 4)
+ intel_pmu_legacy_freezing_lbrs_on_pmi(vcpu);
+}
+
+static void vmx_update_intercept_for_lbr_msrs(struct kvm_vcpu *vcpu, bool set)
+{
+ struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu);
+ int i;
+
+ for (i = 0; i < lbr->nr; i++) {
+ vmx_set_intercept_for_msr(vcpu, lbr->from + i, MSR_TYPE_RW, set);
+ vmx_set_intercept_for_msr(vcpu, lbr->to + i, MSR_TYPE_RW, set);
+ if (lbr->info)
+ vmx_set_intercept_for_msr(vcpu, lbr->info + i, MSR_TYPE_RW, set);
+ }
+
+ vmx_set_intercept_for_msr(vcpu, MSR_LBR_SELECT, MSR_TYPE_RW, set);
+ vmx_set_intercept_for_msr(vcpu, MSR_LBR_TOS, MSR_TYPE_RW, set);
+}
+
+static inline void vmx_disable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ if (!lbr_desc->msr_passthrough)
+ return;
+
+ vmx_update_intercept_for_lbr_msrs(vcpu, true);
+ lbr_desc->msr_passthrough = false;
+}
+
+static inline void vmx_enable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ if (lbr_desc->msr_passthrough)
+ return;
+
+ vmx_update_intercept_for_lbr_msrs(vcpu, false);
+ lbr_desc->msr_passthrough = true;
+}
+
+/*
+ * Higher priority host perf events (e.g. cpu pinned) could reclaim the
+ * PMU resources (e.g. LBR) that were assigned to the guest. This is
+ * usually done via IPI calls (more details in perf_install_in_context).
+ *
+ * Before entering non-root mode (with IRQs disabled here), double-check
+ * that the PMU features enabled for the guest have not been reclaimed
+ * by higher priority host events. Otherwise, disallow the vCPU's access
+ * to the reclaimed features.
+ */
+void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ if (!lbr_desc->event) {
+ vmx_disable_lbr_msrs_passthrough(vcpu);
+ if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
+ goto warn;
+ if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use))
+ goto warn;
+ return;
+ }
+
+ if (lbr_desc->event->state < PERF_EVENT_STATE_ACTIVE) {
+ vmx_disable_lbr_msrs_passthrough(vcpu);
+ __clear_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use);
+ goto warn;
+	} else {
+		vmx_enable_lbr_msrs_passthrough(vcpu);
+	}
+
+ return;
+
+warn:
+ pr_warn_ratelimited("vcpu-%d: fail to passthrough LBR.\n", vcpu->vcpu_id);
+}
+
+static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
+{
+ if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
+ intel_pmu_release_guest_lbr_event(vcpu);
+}
+
+void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu)
+{
+ struct kvm_pmc *pmc = NULL;
+ int bit, hw_idx;
+
+ for_each_set_bit(bit, (unsigned long *)&pmu->global_ctrl,
+ X86_PMC_IDX_MAX) {
+ pmc = intel_pmc_idx_to_pmc(pmu, bit);
+
+ if (!pmc || !pmc_speculative_in_use(pmc) ||
+ !pmc_is_globally_enabled(pmc) || !pmc->perf_event)
+ continue;
+
+ /*
+ * A negative index indicates the event isn't mapped to a
+ * physical counter in the host, e.g. due to contention.
+ */
+ hw_idx = pmc->perf_event->hw.idx;
+ if (hw_idx != pmc->idx && hw_idx > -1)
+ pmu->host_cross_mapped_mask |= BIT_ULL(hw_idx);
+ }
+}
+
+struct kvm_pmu_ops intel_pmu_ops __initdata = {
+ .hw_event_available = intel_hw_event_available,
+ .pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
+ .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc,
+ .msr_idx_to_pmc = intel_msr_idx_to_pmc,
+ .is_valid_rdpmc_ecx = intel_is_valid_rdpmc_ecx,
+ .is_valid_msr = intel_is_valid_msr,
+ .get_msr = intel_pmu_get_msr,
+ .set_msr = intel_pmu_set_msr,
+ .refresh = intel_pmu_refresh,
+ .init = intel_pmu_init,
+ .reset = intel_pmu_reset,
+ .deliver_pmi = intel_pmu_deliver_pmi,
+ .cleanup = intel_pmu_cleanup,
+ .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT,
+ .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC,
+ .MIN_NR_GP_COUNTERS = 1,
+};
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
new file mode 100644
index 000000000..af662312f
--- /dev/null
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -0,0 +1,353 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+
+#include <asm/irq_remapping.h>
+#include <asm/cpu.h>
+
+#include "lapic.h"
+#include "irq.h"
+#include "posted_intr.h"
+#include "trace.h"
+#include "vmx.h"
+
+/*
+ * Maintain a per-CPU list of vCPUs that need to be awakened by wakeup_handler()
+ * when a WAKEUP_VECTOR interrupt is posted.  vCPUs are added to the list when
+ * the vCPU is scheduled out and is blocking (e.g. in HLT) with IRQs enabled.
+ * The vCPU's posted interrupt descriptor is updated at the same time to set its
+ * notification vector to WAKEUP_VECTOR, so that posted interrupts from devices
+ * wake the target vCPUs.  vCPUs are removed from the list and the notification
+ * vector is reset when the vCPU is scheduled in.
+ */
+static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu);
+/*
+ * Protect the per-CPU list with a per-CPU spinlock to handle task migration.
+ * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the
+ * ->sched_in() path will need to take the vCPU off the list of the _previous_
+ * CPU. IRQs must be disabled when taking this lock, otherwise deadlock will
+ * occur if a wakeup IRQ arrives and attempts to acquire the lock.
+ */
+static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock);
+
+static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
+{
+ return &(to_vmx(vcpu)->pi_desc);
+}
+
+static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new)
+{
+ /*
+ * PID.ON can be set at any time by a different vCPU or by hardware,
+ * e.g. a device. PID.control must be written atomically, and the
+	 * update must be retried with a fresh snapshot if an ON change causes
+ * the cmpxchg to fail.
+ */
+ if (!try_cmpxchg64(&pi_desc->control, pold, new))
+ return -EBUSY;
+
+ return 0;
+}
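+
+/*
+ * Typical usage pattern (as in vmx_vcpu_pi_load() below): snapshot the
+ * control word, derive the new value, and retry until the cmpxchg
+ * succeeds:
+ *
+ *	old.control = READ_ONCE(pi_desc->control);
+ *	do {
+ *		new.control = old.control;
+ *		... modify new ...
+ *	} while (pi_try_set_control(pi_desc, &old.control, new.control));
+ */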
+
+void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct pi_desc old, new;
+ unsigned long flags;
+ unsigned int dest;
+
+ /*
+ * To simplify hot-plug and dynamic toggling of APICv, keep PI.NDST and
+ * PI.SN up-to-date even if there is no assigned device or if APICv is
+	 * deactivated due to a dynamic inhibit bit, e.g. for Hyper-V's SynIC.
+ */
+ if (!enable_apicv || !lapic_in_kernel(vcpu))
+ return;
+
+ /*
+ * If the vCPU wasn't on the wakeup list and wasn't migrated, then the
+ * full update can be skipped as neither the vector nor the destination
+ * needs to be changed.
+ */
+ if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
+ /*
+ * Clear SN if it was set due to being preempted. Again, do
+ * this even if there is no assigned device for simplicity.
+ */
+ if (pi_test_and_clear_sn(pi_desc))
+ goto after_clear_sn;
+ return;
+ }
+
+ local_irq_save(flags);
+
+ /*
+ * If the vCPU was waiting for wakeup, remove the vCPU from the wakeup
+ * list of the _previous_ pCPU, which will not be the same as the
+ * current pCPU if the task was migrated.
+ */
+ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
+ raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+ list_del(&vmx->pi_wakeup_list);
+ raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+ }
+
+ dest = cpu_physical_id(cpu);
+ if (!x2apic_mode)
+ dest = (dest << 8) & 0xFF00;
+
+ old.control = READ_ONCE(pi_desc->control);
+ do {
+ new.control = old.control;
+
+ /*
+ * Clear SN (as above) and refresh the destination APIC ID to
+ * handle task migration (@cpu != vcpu->cpu).
+ */
+ new.ndst = dest;
+ new.sn = 0;
+
+ /*
+ * Restore the notification vector; in the blocking case, the
+ * descriptor was modified on "put" to use the wakeup vector.
+ */
+ new.nv = POSTED_INTR_VECTOR;
+ } while (pi_try_set_control(pi_desc, &old.control, new.control));
+
+ local_irq_restore(flags);
+
+after_clear_sn:
+
+ /*
+ * Clear SN before reading the bitmap. The VT-d firmware
+ * writes the bitmap and reads SN atomically (5.2.3 in the
+	 * spec), so on its side there is no explicit memory barrier that
+	 * pairs with this one; KVM still needs a barrier here to order the
+	 * SN clear before the PIR read.
+ */
+ smp_mb__after_atomic();
+
+ if (!pi_is_pir_empty(pi_desc))
+ pi_set_on(pi_desc);
+}
+
+static bool vmx_can_use_vtd_pi(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm) && enable_apicv &&
+ kvm_arch_has_assigned_device(kvm) &&
+ irq_remapping_cap(IRQ_POSTING_CAP);
+}
+
+/*
+ * Put the vCPU on this pCPU's list of vCPUs that need to be awakened and set
+ * WAKEUP as the notification vector in the PI descriptor.
+ */
+static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct pi_desc old, new;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+ list_add_tail(&vmx->pi_wakeup_list,
+ &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
+ raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+
+ WARN(pi_desc->sn, "PI descriptor SN field set before blocking");
+
+ old.control = READ_ONCE(pi_desc->control);
+ do {
+ /* set 'NV' to 'wakeup vector' */
+ new.control = old.control;
+ new.nv = POSTED_INTR_WAKEUP_VECTOR;
+ } while (pi_try_set_control(pi_desc, &old.control, new.control));
+
+ /*
+ * Send a wakeup IPI to this CPU if an interrupt may have been posted
+ * before the notification vector was updated, in which case the IRQ
+ * will arrive on the non-wakeup vector. An IPI is needed as calling
+ * try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not
+ * enabled until it is safe to call try_to_wake_up() on the task being
+ * scheduled out).
+ */
+ if (pi_test_on(&new))
+ __apic_send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);
+
+ local_irq_restore(flags);
+}
+
+static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu)
+{
+ /*
+ * The default posted interrupt vector does nothing when
+ * invoked outside guest mode. Return whether a blocked vCPU
+ * can be the target of posted interrupts, as is the case when
+ * using either IPI virtualization or VT-d PI, so that the
+ * notification vector is switched to the one that calls
+ * back to the pi_wakeup_handler() function.
+ */
+ return vmx_can_use_ipiv(vcpu) || vmx_can_use_vtd_pi(vcpu->kvm);
+}
+
+void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!vmx_needs_pi_wakeup(vcpu))
+ return;
+
+ if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu))
+ pi_enable_wakeup_handler(vcpu);
+
+ /*
+ * Set SN when the vCPU is preempted. Note, the vCPU can both be seen
+ * as blocking and preempted, e.g. if it's preempted between setting
+ * its wait state and manually scheduling out.
+ */
+ if (vcpu->preempted)
+ pi_set_sn(pi_desc);
+}
+
+/*
+ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
+ */
+void pi_wakeup_handler(void)
+{
+ int cpu = smp_processor_id();
+ struct list_head *wakeup_list = &per_cpu(wakeup_vcpus_on_cpu, cpu);
+ raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, cpu);
+ struct vcpu_vmx *vmx;
+
+ raw_spin_lock(spinlock);
+ list_for_each_entry(vmx, wakeup_list, pi_wakeup_list) {
+
+ if (pi_test_on(&vmx->pi_desc))
+ kvm_vcpu_wake_up(&vmx->vcpu);
+ }
+ raw_spin_unlock(spinlock);
+}
+
+void __init pi_init_cpu(int cpu)
+{
+ INIT_LIST_HEAD(&per_cpu(wakeup_vcpus_on_cpu, cpu));
+ raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
+}
+
+bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ return pi_test_on(pi_desc) ||
+ (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
+}
+
+
+/*
+ * Bail out of the block loop if the VM has an assigned
+ * device, but the blocking vCPU didn't reconfigure the
+ * PI.NV to the wakeup vector, i.e. the assigned device
+ * came along after the initial check in vmx_vcpu_pi_put().
+ */
+void vmx_pi_start_assignment(struct kvm *kvm)
+{
+ if (!irq_remapping_cap(IRQ_POSTING_CAP))
+ return;
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK);
+}
+
+/*
+ * vmx_pi_update_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ struct kvm_kernel_irq_routing_entry *e;
+ struct kvm_irq_routing_table *irq_rt;
+ struct kvm_lapic_irq irq;
+ struct kvm_vcpu *vcpu;
+ struct vcpu_data vcpu_info;
+ int idx, ret = 0;
+
+ if (!vmx_can_use_vtd_pi(kvm))
+ return 0;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ if (guest_irq >= irq_rt->nr_rt_entries ||
+ hlist_empty(&irq_rt->map[guest_irq])) {
+ pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
+ guest_irq, irq_rt->nr_rt_entries);
+ goto out;
+ }
+
+ hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+ if (e->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+ /*
+ * VT-d PI cannot support posting multicast/broadcast
+ * interrupts to a vCPU, we still use interrupt remapping
+		 * for these kinds of interrupts.
+ *
+ * For lowest-priority interrupts, we only support
+ * those with single CPU as the destination, e.g. user
+ * configures the interrupts via /proc/irq or uses
+ * irqbalance to make the interrupts single-CPU.
+ *
+		 * We will support full lowest-priority interrupts later.
+ *
+ * In addition, we can only inject generic interrupts using
+ * the PI mechanism, refuse to route others through it.
+ */
+
+ kvm_set_msi_irq(kvm, e, &irq);
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
+ !kvm_irq_is_postable(&irq)) {
+ /*
+ * Make sure the IRTE is in remapped mode if
+ * we don't handle it in posted mode.
+ */
+ ret = irq_set_vcpu_affinity(host_irq, NULL);
+ if (ret < 0) {
+ printk(KERN_INFO
+ "failed to back to remapped mode, irq: %u\n",
+ host_irq);
+ goto out;
+ }
+
+ continue;
+ }
+
+ vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
+ vcpu_info.vector = irq.vector;
+
+ trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
+ vcpu_info.vector, vcpu_info.pi_desc_addr, set);
+
+ if (set)
+ ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
+ else
+ ret = irq_set_vcpu_affinity(host_irq, NULL);
+
+ if (ret < 0) {
+ printk(KERN_INFO "%s: failed to update PI IRTE\n",
+ __func__);
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+ return ret;
+}
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
new file mode 100644
index 000000000..269920765
--- /dev/null
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_POSTED_INTR_H
+#define __KVM_X86_VMX_POSTED_INTR_H
+
+#define POSTED_INTR_ON 0
+#define POSTED_INTR_SN 1
+
+#define PID_TABLE_ENTRY_VALID 1
+
+/* Posted-Interrupt Descriptor */
+struct pi_desc {
+ u32 pir[8]; /* Posted interrupt requested */
+ union {
+ struct {
+ /* bit 256 - Outstanding Notification */
+ u16 on : 1,
+ /* bit 257 - Suppress Notification */
+ sn : 1,
+ /* bit 271:258 - Reserved */
+ rsvd_1 : 14;
+ /* bit 279:272 - Notification Vector */
+ u8 nv;
+ /* bit 287:280 - Reserved */
+ u8 rsvd_2;
+ /* bit 319:288 - Notification Destination */
+ u32 ndst;
+ };
+ u64 control;
+ };
+ u32 rsvd[6];
+} __aligned(64);
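+
+/*
+ * Note: the helpers below operate on &pi_desc->control, so POSTED_INTR_ON
+ * (0) and POSTED_INTR_SN (1) are bit positions within the 64-bit control
+ * word, i.e. descriptor bits 256 and 257 in the layout above.
+ */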
+
+static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
+{
+ return test_and_set_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
+{
+ return test_and_clear_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
+{
+ return test_and_clear_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
+{
+ return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
+}
+
+static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
+{
+ return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
+}
+
+static inline void pi_set_sn(struct pi_desc *pi_desc)
+{
+ set_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_set_on(struct pi_desc *pi_desc)
+{
+ set_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_clear_on(struct pi_desc *pi_desc)
+{
+ clear_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_clear_sn(struct pi_desc *pi_desc)
+{
+ clear_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_on(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_sn(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
+void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
+void pi_wakeup_handler(void);
+void __init pi_init_cpu(int cpu);
+bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
+int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set);
+void vmx_pi_start_assignment(struct kvm *kvm);
+
+#endif /* __KVM_X86_VMX_POSTED_INTR_H */
diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h
new file mode 100644
index 000000000..edc3f16cc
--- /dev/null
+++ b/arch/x86/kvm/vmx/run_flags.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_RUN_FLAGS_H
+#define __KVM_X86_VMX_RUN_FLAGS_H
+
+#define VMX_RUN_VMRESUME (1 << 0)
+#define VMX_RUN_SAVE_SPEC_CTRL (1 << 1)
+
+#endif /* __KVM_X86_VMX_RUN_FLAGS_H */
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
new file mode 100644
index 000000000..3e822e582
--- /dev/null
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -0,0 +1,509 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2021 Intel Corporation. */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <asm/sgx.h>
+
+#include "cpuid.h"
+#include "kvm_cache_regs.h"
+#include "nested.h"
+#include "sgx.h"
+#include "vmx.h"
+#include "x86.h"
+
+bool __read_mostly enable_sgx = 1;
+module_param_named(sgx, enable_sgx, bool, 0444);
+
+/* Initial value of guest's virtual SGX_LEPUBKEYHASHn MSRs */
+static u64 sgx_pubkey_hash[4] __ro_after_init;
+
+/*
+ * ENCLS's memory operands use a fixed segment (DS) and a fixed
+ * address size based on the mode. Related prefixes are ignored.
+ */
+static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset,
+ int size, int alignment, gva_t *gva)
+{
+ struct kvm_segment s;
+ bool fault;
+
+ /* Skip vmcs.GUEST_DS retrieval for 64-bit mode to avoid VMREADs. */
+ *gva = offset;
+ if (!is_64_bit_mode(vcpu)) {
+ vmx_get_segment(vcpu, &s, VCPU_SREG_DS);
+ *gva += s.base;
+ }
+
+ if (!IS_ALIGNED(*gva, alignment)) {
+ fault = true;
+ } else if (likely(is_64_bit_mode(vcpu))) {
+ fault = is_noncanonical_address(*gva, vcpu);
+ } else {
+ *gva &= 0xffffffff;
+ fault = (s.unusable) ||
+ (s.type != 2 && s.type != 3) ||
+ (*gva > s.limit) ||
+ ((s.base != 0 || s.limit != 0xffffffff) &&
+ (((u64)*gva + size - 1) > s.limit + 1));
+ }
+ if (fault)
+ kvm_inject_gp(vcpu, 0);
+ return fault ? -EINVAL : 0;
+}
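+
+/*
+ * Aside: segment types 2 and 3 checked above are the expand-up,
+ * read/write data segment types (not-accessed and accessed), i.e. in
+ * 32-bit mode an ENCLS memory operand must live in an ordinary writable
+ * data segment.
+ */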
+
+static void sgx_handle_emulation_failure(struct kvm_vcpu *vcpu, u64 addr,
+ unsigned int size)
+{
+ uint64_t data[2] = { addr, size };
+
+ __kvm_prepare_emulation_failure_exit(vcpu, data, ARRAY_SIZE(data));
+}
+
+static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data,
+ unsigned int size)
+{
+ if (__copy_from_user(data, (void __user *)hva, size)) {
+ sgx_handle_emulation_failure(vcpu, hva, size);
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write,
+ gpa_t *gpa)
+{
+ struct x86_exception ex;
+
+ if (write)
+ *gpa = kvm_mmu_gva_to_gpa_write(vcpu, gva, &ex);
+ else
+ *gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, &ex);
+
+ if (*gpa == INVALID_GPA) {
+ kvm_inject_emulated_page_fault(vcpu, &ex);
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int sgx_gpa_to_hva(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long *hva)
+{
+ *hva = kvm_vcpu_gfn_to_hva(vcpu, PFN_DOWN(gpa));
+ if (kvm_is_error_hva(*hva)) {
+ sgx_handle_emulation_failure(vcpu, gpa, 1);
+ return -EFAULT;
+ }
+
+ *hva |= gpa & ~PAGE_MASK;
+
+ return 0;
+}
+
+static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr)
+{
+ struct x86_exception ex;
+
+ /*
+ * A non-EPCM #PF indicates a bad userspace HVA. This *should* check
+ * for PFEC.SGX and not assume any #PF on SGX2 originated in the EPC,
+ * but the error code isn't (yet) plumbed through the ENCLS helpers.
+ */
+ if (trapnr == PF_VECTOR && !boot_cpu_has(X86_FEATURE_SGX2)) {
+ kvm_prepare_emulation_failure_exit(vcpu);
+ return 0;
+ }
+
+ /*
+ * If the guest thinks it's running on SGX2 hardware, inject an SGX
+ * #PF if the fault matches an EPCM fault signature (#GP on SGX1,
+ * #PF on SGX2). The assumption is that EPCM faults are much more
+ * likely than a bad userspace address.
+ */
+ if ((trapnr == PF_VECTOR || !boot_cpu_has(X86_FEATURE_SGX2)) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) {
+ memset(&ex, 0, sizeof(ex));
+ ex.vector = PF_VECTOR;
+ ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK |
+ PFERR_SGX_MASK;
+ ex.address = gva;
+ ex.error_code_valid = true;
+ ex.nested_page_fault = false;
+ kvm_inject_emulated_page_fault(vcpu, &ex);
+ } else {
+ kvm_inject_gp(vcpu, 0);
+ }
+ return 1;
+}
+
+static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
+ struct sgx_pageinfo *pageinfo,
+ unsigned long secs_hva,
+ gva_t secs_gva)
+{
+ struct sgx_secs *contents = (struct sgx_secs *)pageinfo->contents;
+ struct kvm_cpuid_entry2 *sgx_12_0, *sgx_12_1;
+ u64 attributes, xfrm, size;
+ u32 miscselect;
+ u8 max_size_log2;
+ int trapnr, ret;
+
+ sgx_12_0 = kvm_find_cpuid_entry_index(vcpu, 0x12, 0);
+ sgx_12_1 = kvm_find_cpuid_entry_index(vcpu, 0x12, 1);
+ if (!sgx_12_0 || !sgx_12_1) {
+ kvm_prepare_emulation_failure_exit(vcpu);
+ return 0;
+ }
+
+ miscselect = contents->miscselect;
+ attributes = contents->attributes;
+ xfrm = contents->xfrm;
+ size = contents->size;
+
+ /* Enforce restriction of access to the PROVISIONKEY. */
+ if (!vcpu->kvm->arch.sgx_provisioning_allowed &&
+ (attributes & SGX_ATTR_PROVISIONKEY)) {
+ if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY)
+ pr_warn_once("SGX PROVISIONKEY advertised but not allowed\n");
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ /*
+ * Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. Note
+ * that the allowed XFRM (XFeature Request Mask) isn't strictly bound
+ * by the supported XCR0. FP+SSE *must* be set in XFRM, even if XSAVE
+ * is unsupported, i.e. even if XCR0 itself is completely unsupported.
+ */
+ if ((u32)miscselect & ~sgx_12_0->ebx ||
+ (u32)attributes & ~sgx_12_1->eax ||
+ (u32)(attributes >> 32) & ~sgx_12_1->ebx ||
+ (u32)xfrm & ~sgx_12_1->ecx ||
+ (u32)(xfrm >> 32) & ~sgx_12_1->edx ||
+ xfrm & ~(vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE) ||
+ (xfrm & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ /* Enforce CPUID restriction on max enclave size. */
+ max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 :
+ sgx_12_0->edx;
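+
+	/*
+	 * Illustrative example: were CPUID.0x12.0x0:EDX to read 0x2424,
+	 * max_size_log2 would be 36 for a MODE64BIT enclave, i.e. SECS.SIZE
+	 * must be strictly less than 2^36 bytes (64 GiB).
+	 */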
+ if (size >= BIT_ULL(max_size_log2)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ /*
+ * sgx_virt_ecreate() returns:
+ * 1) 0: ECREATE was successful
+ * 2) -EFAULT: ECREATE was run but faulted, and trapnr was set to the
+ * exception number.
+ * 3) -EINVAL: access_ok() on @secs_hva failed. This should never
+ * happen as KVM checks host addresses at memslot creation.
+ * sgx_virt_ecreate() has already warned in this case.
+ */
+ ret = sgx_virt_ecreate(pageinfo, (void __user *)secs_hva, &trapnr);
+ if (!ret)
+ return kvm_skip_emulated_instruction(vcpu);
+ if (ret == -EFAULT)
+ return sgx_inject_fault(vcpu, secs_gva, trapnr);
+
+ return ret;
+}
+
+static int handle_encls_ecreate(struct kvm_vcpu *vcpu)
+{
+ gva_t pageinfo_gva, secs_gva;
+ gva_t metadata_gva, contents_gva;
+ gpa_t metadata_gpa, contents_gpa, secs_gpa;
+ unsigned long metadata_hva, contents_hva, secs_hva;
+ struct sgx_pageinfo pageinfo;
+ struct sgx_secs *contents;
+ struct x86_exception ex;
+ int r;
+
+ if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 32, 32, &pageinfo_gva) ||
+ sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva))
+ return 1;
+
+ /*
+	 * Copy the PAGEINFO to local memory; its pointers need to be
+	 * translated, i.e. we need to do a deep copy/translate.
+ */
+ r = kvm_read_guest_virt(vcpu, pageinfo_gva, &pageinfo,
+ sizeof(pageinfo), &ex);
+ if (r == X86EMUL_PROPAGATE_FAULT) {
+ kvm_inject_emulated_page_fault(vcpu, &ex);
+ return 1;
+ } else if (r != X86EMUL_CONTINUE) {
+ sgx_handle_emulation_failure(vcpu, pageinfo_gva,
+ sizeof(pageinfo));
+ return 0;
+ }
+
+ if (sgx_get_encls_gva(vcpu, pageinfo.metadata, 64, 64, &metadata_gva) ||
+ sgx_get_encls_gva(vcpu, pageinfo.contents, 4096, 4096,
+ &contents_gva))
+ return 1;
+
+ /*
+ * Translate the SECINFO, SOURCE and SECS pointers from GVA to GPA.
+ * Resume the guest on failure to inject a #PF.
+ */
+ if (sgx_gva_to_gpa(vcpu, metadata_gva, false, &metadata_gpa) ||
+ sgx_gva_to_gpa(vcpu, contents_gva, false, &contents_gpa) ||
+ sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa))
+ return 1;
+
+ /*
+ * ...and then to HVA. The order of accesses isn't architectural, i.e.
+ * KVM doesn't have to fully process one address at a time. Exit to
+ * userspace if a GPA is invalid.
+ */
+ if (sgx_gpa_to_hva(vcpu, metadata_gpa, &metadata_hva) ||
+ sgx_gpa_to_hva(vcpu, contents_gpa, &contents_hva) ||
+ sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva))
+ return 0;
+
+ /*
+ * Copy contents into kernel memory to prevent TOCTOU attack. E.g. the
+ * guest could do ECREATE w/ SECS.SGX_ATTR_PROVISIONKEY=0, and
+ * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to
+ * enforce restriction of access to the PROVISIONKEY.
+ */
+ contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT);
+ if (!contents)
+ return -ENOMEM;
+
+ /* Exit to userspace if copying from a host userspace address fails. */
+ if (sgx_read_hva(vcpu, contents_hva, (void *)contents, PAGE_SIZE)) {
+ free_page((unsigned long)contents);
+ return 0;
+ }
+
+ pageinfo.metadata = metadata_hva;
+ pageinfo.contents = (u64)contents;
+
+ r = __handle_encls_ecreate(vcpu, &pageinfo, secs_hva, secs_gva);
+
+ free_page((unsigned long)contents);
+
+ return r;
+}
+
+static int handle_encls_einit(struct kvm_vcpu *vcpu)
+{
+ unsigned long sig_hva, secs_hva, token_hva, rflags;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ gva_t sig_gva, secs_gva, token_gva;
+ gpa_t sig_gpa, secs_gpa, token_gpa;
+ int ret, trapnr;
+
+ if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 1808, 4096, &sig_gva) ||
+ sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva) ||
+ sgx_get_encls_gva(vcpu, kvm_rdx_read(vcpu), 304, 512, &token_gva))
+ return 1;
+
+ /*
+ * Translate the SIGSTRUCT, SECS and TOKEN pointers from GVA to GPA.
+ * Resume the guest on failure to inject a #PF.
+ */
+ if (sgx_gva_to_gpa(vcpu, sig_gva, false, &sig_gpa) ||
+ sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa) ||
+ sgx_gva_to_gpa(vcpu, token_gva, false, &token_gpa))
+ return 1;
+
+ /*
+ * ...and then to HVA. The order of accesses isn't architectural, i.e.
+ * KVM doesn't have to fully process one address at a time. Exit to
+ * userspace if a GPA is invalid. Note, all structures are aligned and
+ * cannot split pages.
+ */
+ if (sgx_gpa_to_hva(vcpu, sig_gpa, &sig_hva) ||
+ sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva) ||
+ sgx_gpa_to_hva(vcpu, token_gpa, &token_hva))
+ return 0;
+
+ ret = sgx_virt_einit((void __user *)sig_hva, (void __user *)token_hva,
+ (void __user *)secs_hva,
+ vmx->msr_ia32_sgxlepubkeyhash, &trapnr);
+
+ if (ret == -EFAULT)
+ return sgx_inject_fault(vcpu, secs_gva, trapnr);
+
+ /*
+ * sgx_virt_einit() returns -EINVAL when access_ok() fails on @sig_hva,
+ * @token_hva or @secs_hva. This should never happen as KVM checks host
+ * addresses at memslot creation. sgx_virt_einit() has already warned
+ * in this case, so just return.
+ */
+ if (ret < 0)
+ return ret;
+
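+	/*
+	 * ENCLS[EINIT] signals failure by setting ZF and returning an SGX
+	 * error code in RAX; mirror that into the guest: clear the
+	 * arithmetic flags, set ZF iff @ret is a non-zero error code, and
+	 * write @ret to guest RAX.
+	 */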
+ rflags = vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF |
+ X86_EFLAGS_AF | X86_EFLAGS_SF |
+ X86_EFLAGS_OF);
+ if (ret)
+ rflags |= X86_EFLAGS_ZF;
+ else
+ rflags &= ~X86_EFLAGS_ZF;
+ vmx_set_rflags(vcpu, rflags);
+
+ kvm_rax_write(vcpu, ret);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf)
+{
+ /*
+ * ENCLS generates a #UD if SGX1 isn't supported, i.e. this point will
+	 * be reached if and only if the SGX1 leaves are enabled.
+ */
+ if (leaf >= ECREATE && leaf <= ETRACK)
+ return true;
+
+ if (leaf >= EAUG && leaf <= EMODT)
+ return guest_cpuid_has(vcpu, X86_FEATURE_SGX2);
+
+ return false;
+}
+
+static inline bool sgx_enabled_in_guest_bios(struct kvm_vcpu *vcpu)
+{
+ const u64 bits = FEAT_CTL_SGX_ENABLED | FEAT_CTL_LOCKED;
+
+ return (to_vmx(vcpu)->msr_ia32_feature_control & bits) == bits;
+}
+
+int handle_encls(struct kvm_vcpu *vcpu)
+{
+ u32 leaf = (u32)kvm_rax_read(vcpu);
+
+ if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
+ !guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ } else if (!encls_leaf_enabled_in_guest(vcpu, leaf) ||
+ !sgx_enabled_in_guest_bios(vcpu) || !is_paging(vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ } else {
+ if (leaf == ECREATE)
+ return handle_encls_ecreate(vcpu);
+ if (leaf == EINIT)
+ return handle_encls_einit(vcpu);
+ WARN_ONCE(1, "unexpected exit on ENCLS[%u]", leaf);
+ vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+ vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS;
+ return 0;
+ }
+ return 1;
+}
+
+void setup_default_sgx_lepubkeyhash(void)
+{
+ /*
+ * Use Intel's default value for Skylake hardware if Launch Control is
+ * not supported, i.e. Intel's hash is hardcoded into silicon, or if
+ * Launch Control is supported and enabled, i.e. mimic the reset value
+ * and let the guest write the MSRs at will. If Launch Control is
+ * supported but disabled, then use the current MSR values as the hash
+ * MSRs exist but are read-only (locked and not writable).
+ */
+ if (!enable_sgx || boot_cpu_has(X86_FEATURE_SGX_LC) ||
+ rdmsrl_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) {
+ sgx_pubkey_hash[0] = 0xa6053e051270b7acULL;
+ sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL;
+ sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL;
+ sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL;
+ } else {
+ /* MSR_IA32_SGXLEPUBKEYHASH0 is read above */
+ rdmsrl(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]);
+ rdmsrl(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]);
+ rdmsrl(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]);
+ }
+}
+
+void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ memcpy(vmx->msr_ia32_sgxlepubkeyhash, sgx_pubkey_hash,
+ sizeof(sgx_pubkey_hash));
+}
+
+/*
+ * ECREATE must be intercepted to enforce MISCSELECT, ATTRIBUTES and XFRM
+ * restrictions if the guest's allowed-1 settings diverge from hardware.
+ */
+static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *guest_cpuid;
+ u32 eax, ebx, ecx, edx;
+
+ if (!vcpu->kvm->arch.sgx_provisioning_allowed)
+ return true;
+
+ guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 0);
+ if (!guest_cpuid)
+ return true;
+
+ cpuid_count(0x12, 0, &eax, &ebx, &ecx, &edx);
+ if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx)
+ return true;
+
+ guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 1);
+ if (!guest_cpuid)
+ return true;
+
+ cpuid_count(0x12, 1, &eax, &ebx, &ecx, &edx);
+ if (guest_cpuid->eax != eax || guest_cpuid->ebx != ebx ||
+ guest_cpuid->ecx != ecx || guest_cpuid->edx != edx)
+ return true;
+
+ return false;
+}
+
+void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ /*
+ * There is no software enable bit for SGX that is virtualized by
+ * hardware, e.g. there's no CR4.SGXE, so when SGX is disabled in the
+ * guest (either by the host or by the guest's BIOS) but enabled in the
+	 * host, trap all ENCLS leaves and inject #UD/#GP as needed to emulate
+ * the expected system behavior for ENCLS.
+ */
+ u64 bitmap = -1ull;
+
+ /* Nothing to do if hardware doesn't support SGX */
+ if (!cpu_has_vmx_encls_vmexit())
+ return;
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SGX) &&
+ sgx_enabled_in_guest_bios(vcpu)) {
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) {
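+		/*
+		 * A set bit in the bitmap traps the corresponding ENCLS leaf.
+		 * Clear ECREATE..ETRACK so the SGX1 leaves execute in the
+		 * guest, then selectively re-trap ECREATE and EINIT below as
+		 * needed.
+		 */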
+ bitmap &= ~GENMASK_ULL(ETRACK, ECREATE);
+ if (sgx_intercept_encls_ecreate(vcpu))
+ bitmap |= (1 << ECREATE);
+ }
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SGX2))
+ bitmap &= ~GENMASK_ULL(EMODT, EAUG);
+
+ /*
+ * Trap and execute EINIT if launch control is enabled in the
+ * host using the guest's values for launch control MSRs, even
+ * if the guest's values are fixed to hardware default values.
+ * The MSRs are not loaded/saved on VM-Enter/VM-Exit as writing
+ * the MSRs is extraordinarily expensive.
+ */
+ if (boot_cpu_has(X86_FEATURE_SGX_LC))
+ bitmap |= (1 << EINIT);
+
+ if (!vmcs12 && is_guest_mode(vcpu))
+ vmcs12 = get_vmcs12(vcpu);
+ if (vmcs12 && nested_cpu_has_encls_exit(vmcs12))
+ bitmap |= vmcs12->encls_exiting_bitmap;
+ }
+ vmcs_write64(ENCLS_EXITING_BITMAP, bitmap);
+}
diff --git a/arch/x86/kvm/vmx/sgx.h b/arch/x86/kvm/vmx/sgx.h
new file mode 100644
index 000000000..a400888b3
--- /dev/null
+++ b/arch/x86/kvm/vmx/sgx.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_SGX_H
+#define __KVM_X86_SGX_H
+
+#include <linux/kvm_host.h>
+
+#include "capabilities.h"
+#include "vmx_ops.h"
+
+#ifdef CONFIG_X86_SGX_KVM
+extern bool __read_mostly enable_sgx;
+
+int handle_encls(struct kvm_vcpu *vcpu);
+
+void setup_default_sgx_lepubkeyhash(void);
+void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu);
+
+void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12);
+#else
+#define enable_sgx 0
+
+static inline void setup_default_sgx_lepubkeyhash(void) { }
+static inline void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) { }
+
+static inline void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ /* Nothing to do if hardware doesn't support SGX */
+ if (cpu_has_vmx_encls_vmexit())
+ vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+}
+#endif
+
+#endif /* __KVM_X86_SGX_H */
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
new file mode 100644
index 000000000..7c1996b43
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -0,0 +1,193 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_VMCS_H
+#define __KVM_X86_VMX_VMCS_H
+
+#include <linux/ktime.h>
+#include <linux/list.h>
+#include <linux/nospec.h>
+
+#include <asm/kvm.h>
+#include <asm/vmx.h>
+
+#include "capabilities.h"
+
+#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
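+
+/*
+ * A VMCS field encoding is a 16-bit value: bit 0 selects the high half of a
+ * 64-bit field, bits 9:1 are the field's index, and bits 14:13 its width
+ * (see the helpers at the end of this file). ROL16(field, 6) rotates the
+ * width/type bits into the low bits, giving the compact value that vmcs12.c
+ * uses as a field-offset table index.
+ */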
+
+struct vmcs_hdr {
+ u32 revision_id:31;
+ u32 shadow_vmcs:1;
+};
+
+struct vmcs {
+ struct vmcs_hdr hdr;
+ u32 abort;
+ char data[];
+};
+
+DECLARE_PER_CPU(struct vmcs *, current_vmcs);
+
+/*
+ * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT
+ * and whose values change infrequently, but are not constant. I.e. this is
+ * used as a write-through cache of the corresponding VMCS fields.
+ */
+struct vmcs_host_state {
+ unsigned long cr3; /* May not match real cr3 */
+ unsigned long cr4; /* May not match real cr4 */
+ unsigned long gs_base;
+ unsigned long fs_base;
+ unsigned long rsp;
+
+ u16 fs_sel, gs_sel, ldt_sel;
+#ifdef CONFIG_X86_64
+ u16 ds_sel, es_sel;
+#endif
+};
+
+struct vmcs_controls_shadow {
+ u32 vm_entry;
+ u32 vm_exit;
+ u32 pin;
+ u32 exec;
+ u32 secondary_exec;
+ u64 tertiary_exec;
+};
+
+/*
+ * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
+ * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
+ * loaded on this CPU (so we can clear them if the CPU goes down).
+ */
+struct loaded_vmcs {
+ struct vmcs *vmcs;
+ struct vmcs *shadow_vmcs;
+ int cpu;
+ bool launched;
+ bool nmi_known_unmasked;
+ bool hv_timer_soft_disabled;
+ /* Support for vnmi-less CPUs */
+ int soft_vnmi_blocked;
+ ktime_t entry_time;
+ s64 vnmi_blocked_time;
+ unsigned long *msr_bitmap;
+ struct list_head loaded_vmcss_on_cpu_link;
+ struct vmcs_host_state host_state;
+ struct vmcs_controls_shadow controls_shadow;
+};
+
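+/*
+ * VM-exit interruption information layout: bits 7:0 hold the vector,
+ * bits 10:8 the event type, bit 11 the error-code-valid flag, and bit 31
+ * the valid bit; the masks below test combinations thereof.
+ */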
+static __always_inline bool is_intr_type(u32 intr_info, u32 type)
+{
+ const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK;
+
+ return (intr_info & mask) == (INTR_INFO_VALID_MASK | type);
+}
+
+static inline bool is_intr_type_n(u32 intr_info, u32 type, u8 vector)
+{
+ const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK |
+ INTR_INFO_VECTOR_MASK;
+
+ return (intr_info & mask) == (INTR_INFO_VALID_MASK | type | vector);
+}
+
+static inline bool is_exception_n(u32 intr_info, u8 vector)
+{
+ return is_intr_type_n(intr_info, INTR_TYPE_HARD_EXCEPTION, vector);
+}
+
+static inline bool is_debug(u32 intr_info)
+{
+ return is_exception_n(intr_info, DB_VECTOR);
+}
+
+static inline bool is_breakpoint(u32 intr_info)
+{
+ return is_exception_n(intr_info, BP_VECTOR);
+}
+
+static inline bool is_double_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, DF_VECTOR);
+}
+
+static inline bool is_page_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, PF_VECTOR);
+}
+
+static inline bool is_invalid_opcode(u32 intr_info)
+{
+ return is_exception_n(intr_info, UD_VECTOR);
+}
+
+static inline bool is_gp_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, GP_VECTOR);
+}
+
+static inline bool is_alignment_check(u32 intr_info)
+{
+ return is_exception_n(intr_info, AC_VECTOR);
+}
+
+static inline bool is_machine_check(u32 intr_info)
+{
+ return is_exception_n(intr_info, MC_VECTOR);
+}
+
+static inline bool is_nm_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, NM_VECTOR);
+}
+
+/* Undocumented: icebp/int1 */
+static inline bool is_icebp(u32 intr_info)
+{
+ return is_intr_type(intr_info, INTR_TYPE_PRIV_SW_EXCEPTION);
+}
+
+static __always_inline bool is_nmi(u32 intr_info)
+{
+ return is_intr_type(intr_info, INTR_TYPE_NMI_INTR);
+}
+
+static inline bool is_external_intr(u32 intr_info)
+{
+ return is_intr_type(intr_info, INTR_TYPE_EXT_INTR);
+}
+
+static inline bool is_exception_with_error_code(u32 intr_info)
+{
+ const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK;
+
+ return (intr_info & mask) == mask;
+}
+
+enum vmcs_field_width {
+ VMCS_FIELD_WIDTH_U16 = 0,
+ VMCS_FIELD_WIDTH_U64 = 1,
+ VMCS_FIELD_WIDTH_U32 = 2,
+ VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3
+};
+
+static inline int vmcs_field_width(unsigned long field)
+{
+ if (0x1 & field) /* the *_HIGH fields are all 32 bit */
+ return VMCS_FIELD_WIDTH_U32;
+ return (field >> 13) & 0x3;
+}
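+
+/*
+ * Example decodings: GUEST_ES_SELECTOR (0x0800) is a 16-bit field since
+ * (0x0800 >> 13) & 0x3 == 0, while VM_EXIT_REASON (0x4402) is a 32-bit
+ * field since (0x4402 >> 13) & 0x3 == 2.
+ */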
+
+static inline int vmcs_field_readonly(unsigned long field)
+{
+ return (((field >> 10) & 0x3) == 1);
+}
+
+#define VMCS_FIELD_INDEX_SHIFT (1)
+#define VMCS_FIELD_INDEX_MASK GENMASK(9, 1)
+
+static inline unsigned int vmcs_field_index(unsigned long field)
+{
+ return (field & VMCS_FIELD_INDEX_MASK) >> VMCS_FIELD_INDEX_SHIFT;
+}
+
+#endif /* __KVM_X86_VMX_VMCS_H */
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
new file mode 100644
index 000000000..106a72c92
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "vmcs12.h"
+
+#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
+#define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name)
+#define FIELD64(number, name) \
+ FIELD(number, name), \
+ [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
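+
+/*
+ * FIELD64() additionally maps the companion *_HIGH encoding (bit 0 set) to
+ * the upper dword of the field, i.e. offset + sizeof(u32), so 32-bit
+ * VMREADs/VMWRITEs of the high half of a 64-bit field land on the right
+ * bytes.
+ */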
+
+const unsigned short vmcs12_field_offsets[] = {
+ FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
+ FIELD(POSTED_INTR_NV, posted_intr_nv),
+ FIELD(GUEST_ES_SELECTOR, guest_es_selector),
+ FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
+ FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
+ FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
+ FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
+ FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
+ FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
+ FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
+ FIELD(GUEST_INTR_STATUS, guest_intr_status),
+ FIELD(GUEST_PML_INDEX, guest_pml_index),
+ FIELD(HOST_ES_SELECTOR, host_es_selector),
+ FIELD(HOST_CS_SELECTOR, host_cs_selector),
+ FIELD(HOST_SS_SELECTOR, host_ss_selector),
+ FIELD(HOST_DS_SELECTOR, host_ds_selector),
+ FIELD(HOST_FS_SELECTOR, host_fs_selector),
+ FIELD(HOST_GS_SELECTOR, host_gs_selector),
+ FIELD(HOST_TR_SELECTOR, host_tr_selector),
+ FIELD64(IO_BITMAP_A, io_bitmap_a),
+ FIELD64(IO_BITMAP_B, io_bitmap_b),
+ FIELD64(MSR_BITMAP, msr_bitmap),
+ FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
+ FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
+ FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
+ FIELD64(PML_ADDRESS, pml_address),
+ FIELD64(TSC_OFFSET, tsc_offset),
+ FIELD64(TSC_MULTIPLIER, tsc_multiplier),
+ FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
+ FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
+ FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
+ FIELD64(VM_FUNCTION_CONTROL, vm_function_control),
+ FIELD64(EPT_POINTER, ept_pointer),
+ FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
+ FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
+ FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
+ FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
+ FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
+ FIELD64(VMREAD_BITMAP, vmread_bitmap),
+ FIELD64(VMWRITE_BITMAP, vmwrite_bitmap),
+ FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
+ FIELD64(ENCLS_EXITING_BITMAP, encls_exiting_bitmap),
+ FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
+ FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
+ FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
+ FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
+ FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
+ FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
+ FIELD64(GUEST_PDPTR0, guest_pdptr0),
+ FIELD64(GUEST_PDPTR1, guest_pdptr1),
+ FIELD64(GUEST_PDPTR2, guest_pdptr2),
+ FIELD64(GUEST_PDPTR3, guest_pdptr3),
+ FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
+ FIELD64(HOST_IA32_PAT, host_ia32_pat),
+ FIELD64(HOST_IA32_EFER, host_ia32_efer),
+ FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
+ FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
+ FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
+ FIELD(EXCEPTION_BITMAP, exception_bitmap),
+ FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
+ FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
+ FIELD(CR3_TARGET_COUNT, cr3_target_count),
+ FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
+ FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
+ FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
+ FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
+ FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
+ FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
+ FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
+ FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
+ FIELD(TPR_THRESHOLD, tpr_threshold),
+ FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
+ FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
+ FIELD(VM_EXIT_REASON, vm_exit_reason),
+ FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
+ FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
+ FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
+ FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
+ FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
+ FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
+ FIELD(GUEST_ES_LIMIT, guest_es_limit),
+ FIELD(GUEST_CS_LIMIT, guest_cs_limit),
+ FIELD(GUEST_SS_LIMIT, guest_ss_limit),
+ FIELD(GUEST_DS_LIMIT, guest_ds_limit),
+ FIELD(GUEST_FS_LIMIT, guest_fs_limit),
+ FIELD(GUEST_GS_LIMIT, guest_gs_limit),
+ FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
+ FIELD(GUEST_TR_LIMIT, guest_tr_limit),
+ FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
+ FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
+ FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
+ FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
+ FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
+ FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
+ FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
+ FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
+ FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
+ FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
+ FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
+ FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
+ FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
+ FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
+ FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
+ FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
+ FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
+ FIELD(CR0_READ_SHADOW, cr0_read_shadow),
+ FIELD(CR4_READ_SHADOW, cr4_read_shadow),
+ FIELD(EXIT_QUALIFICATION, exit_qualification),
+ FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
+ FIELD(GUEST_CR0, guest_cr0),
+ FIELD(GUEST_CR3, guest_cr3),
+ FIELD(GUEST_CR4, guest_cr4),
+ FIELD(GUEST_ES_BASE, guest_es_base),
+ FIELD(GUEST_CS_BASE, guest_cs_base),
+ FIELD(GUEST_SS_BASE, guest_ss_base),
+ FIELD(GUEST_DS_BASE, guest_ds_base),
+ FIELD(GUEST_FS_BASE, guest_fs_base),
+ FIELD(GUEST_GS_BASE, guest_gs_base),
+ FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
+ FIELD(GUEST_TR_BASE, guest_tr_base),
+ FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
+ FIELD(GUEST_IDTR_BASE, guest_idtr_base),
+ FIELD(GUEST_DR7, guest_dr7),
+ FIELD(GUEST_RSP, guest_rsp),
+ FIELD(GUEST_RIP, guest_rip),
+ FIELD(GUEST_RFLAGS, guest_rflags),
+ FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
+ FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
+ FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
+ FIELD(HOST_CR0, host_cr0),
+ FIELD(HOST_CR3, host_cr3),
+ FIELD(HOST_CR4, host_cr4),
+ FIELD(HOST_FS_BASE, host_fs_base),
+ FIELD(HOST_GS_BASE, host_gs_base),
+ FIELD(HOST_TR_BASE, host_tr_base),
+ FIELD(HOST_GDTR_BASE, host_gdtr_base),
+ FIELD(HOST_IDTR_BASE, host_idtr_base),
+ FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
+ FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
+ FIELD(HOST_RSP, host_rsp),
+ FIELD(HOST_RIP, host_rip),
+};
+const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs12_field_offsets);
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
new file mode 100644
index 000000000..019360134
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -0,0 +1,429 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_VMCS12_H
+#define __KVM_X86_VMX_VMCS12_H
+
+#include <linux/build_bug.h>
+
+#include "vmcs.h"
+
+/*
+ * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
+ * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
+ * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
+ * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
+ * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
+ * More than one of these structures may exist, if L1 runs multiple L2 guests.
+ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
+ * underlying hardware which will be used to run L2.
+ * This structure is packed to ensure that its layout is identical across
+ * machines (necessary for live migration).
+ *
+ * IMPORTANT: Changing the layout of existing fields in this structure
+ * will break save/restore compatibility with older kvm releases. When
+ * adding new fields, either use space in the reserved padding* arrays
+ * or add the new fields to the end of the structure.
+ */
+typedef u64 natural_width;
+struct __packed vmcs12 {
+ /* According to the Intel spec, a VMCS region must start with the
+ * following two fields. Then follow implementation-specific data.
+ */
+ struct vmcs_hdr hdr;
+ u32 abort;
+
+ u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
+ u32 padding[7]; /* room for future expansion */
+
+ u64 io_bitmap_a;
+ u64 io_bitmap_b;
+ u64 msr_bitmap;
+ u64 vm_exit_msr_store_addr;
+ u64 vm_exit_msr_load_addr;
+ u64 vm_entry_msr_load_addr;
+ u64 tsc_offset;
+ u64 virtual_apic_page_addr;
+ u64 apic_access_addr;
+ u64 posted_intr_desc_addr;
+ u64 ept_pointer;
+ u64 eoi_exit_bitmap0;
+ u64 eoi_exit_bitmap1;
+ u64 eoi_exit_bitmap2;
+ u64 eoi_exit_bitmap3;
+ u64 xss_exit_bitmap;
+ u64 guest_physical_address;
+ u64 vmcs_link_pointer;
+ u64 guest_ia32_debugctl;
+ u64 guest_ia32_pat;
+ u64 guest_ia32_efer;
+ u64 guest_ia32_perf_global_ctrl;
+ u64 guest_pdptr0;
+ u64 guest_pdptr1;
+ u64 guest_pdptr2;
+ u64 guest_pdptr3;
+ u64 guest_bndcfgs;
+ u64 host_ia32_pat;
+ u64 host_ia32_efer;
+ u64 host_ia32_perf_global_ctrl;
+ u64 vmread_bitmap;
+ u64 vmwrite_bitmap;
+ u64 vm_function_control;
+ u64 eptp_list_address;
+ u64 pml_address;
+ u64 encls_exiting_bitmap;
+ u64 tsc_multiplier;
+ u64 padding64[1]; /* room for future expansion */
+ /*
+ * To allow migration of L1 (complete with its L2 guests) between
+ * machines of different natural widths (32 or 64 bit), we cannot have
+ * unsigned long fields with no explicit size. We use u64 (aliased
+ * natural_width) instead. Luckily, x86 is little-endian.
+ */
+ natural_width cr0_guest_host_mask;
+ natural_width cr4_guest_host_mask;
+ natural_width cr0_read_shadow;
+ natural_width cr4_read_shadow;
+ natural_width dead_space[4]; /* Last remnants of cr3_target_value[0-3]. */
+ natural_width exit_qualification;
+ natural_width guest_linear_address;
+ natural_width guest_cr0;
+ natural_width guest_cr3;
+ natural_width guest_cr4;
+ natural_width guest_es_base;
+ natural_width guest_cs_base;
+ natural_width guest_ss_base;
+ natural_width guest_ds_base;
+ natural_width guest_fs_base;
+ natural_width guest_gs_base;
+ natural_width guest_ldtr_base;
+ natural_width guest_tr_base;
+ natural_width guest_gdtr_base;
+ natural_width guest_idtr_base;
+ natural_width guest_dr7;
+ natural_width guest_rsp;
+ natural_width guest_rip;
+ natural_width guest_rflags;
+ natural_width guest_pending_dbg_exceptions;
+ natural_width guest_sysenter_esp;
+ natural_width guest_sysenter_eip;
+ natural_width host_cr0;
+ natural_width host_cr3;
+ natural_width host_cr4;
+ natural_width host_fs_base;
+ natural_width host_gs_base;
+ natural_width host_tr_base;
+ natural_width host_gdtr_base;
+ natural_width host_idtr_base;
+ natural_width host_ia32_sysenter_esp;
+ natural_width host_ia32_sysenter_eip;
+ natural_width host_rsp;
+ natural_width host_rip;
+ natural_width paddingl[8]; /* room for future expansion */
+ u32 pin_based_vm_exec_control;
+ u32 cpu_based_vm_exec_control;
+ u32 exception_bitmap;
+ u32 page_fault_error_code_mask;
+ u32 page_fault_error_code_match;
+ u32 cr3_target_count;
+ u32 vm_exit_controls;
+ u32 vm_exit_msr_store_count;
+ u32 vm_exit_msr_load_count;
+ u32 vm_entry_controls;
+ u32 vm_entry_msr_load_count;
+ u32 vm_entry_intr_info_field;
+ u32 vm_entry_exception_error_code;
+ u32 vm_entry_instruction_len;
+ u32 tpr_threshold;
+ u32 secondary_vm_exec_control;
+ u32 vm_instruction_error;
+ u32 vm_exit_reason;
+ u32 vm_exit_intr_info;
+ u32 vm_exit_intr_error_code;
+ u32 idt_vectoring_info_field;
+ u32 idt_vectoring_error_code;
+ u32 vm_exit_instruction_len;
+ u32 vmx_instruction_info;
+ u32 guest_es_limit;
+ u32 guest_cs_limit;
+ u32 guest_ss_limit;
+ u32 guest_ds_limit;
+ u32 guest_fs_limit;
+ u32 guest_gs_limit;
+ u32 guest_ldtr_limit;
+ u32 guest_tr_limit;
+ u32 guest_gdtr_limit;
+ u32 guest_idtr_limit;
+ u32 guest_es_ar_bytes;
+ u32 guest_cs_ar_bytes;
+ u32 guest_ss_ar_bytes;
+ u32 guest_ds_ar_bytes;
+ u32 guest_fs_ar_bytes;
+ u32 guest_gs_ar_bytes;
+ u32 guest_ldtr_ar_bytes;
+ u32 guest_tr_ar_bytes;
+ u32 guest_interruptibility_info;
+ u32 guest_activity_state;
+ u32 guest_sysenter_cs;
+ u32 host_ia32_sysenter_cs;
+ u32 vmx_preemption_timer_value;
+ u32 padding32[7]; /* room for future expansion */
+ u16 virtual_processor_id;
+ u16 posted_intr_nv;
+ u16 guest_es_selector;
+ u16 guest_cs_selector;
+ u16 guest_ss_selector;
+ u16 guest_ds_selector;
+ u16 guest_fs_selector;
+ u16 guest_gs_selector;
+ u16 guest_ldtr_selector;
+ u16 guest_tr_selector;
+ u16 guest_intr_status;
+ u16 host_es_selector;
+ u16 host_cs_selector;
+ u16 host_ss_selector;
+ u16 host_ds_selector;
+ u16 host_fs_selector;
+ u16 host_gs_selector;
+ u16 host_tr_selector;
+ u16 guest_pml_index;
+};
+
+/*
+ * VMCS12_REVISION is an arbitrary id that should be changed if the content or
+ * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
+ * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
+ *
+ * IMPORTANT: Changing this value will break save/restore compatibility with
+ * older kvm releases.
+ */
+#define VMCS12_REVISION 0x11e57ed0
+
+/*
+ * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
+ * and any VMCS region. Although only sizeof(struct vmcs12) bytes are used by the
+ * current implementation, 4K are reserved to avoid future complications and
+ * to preserve userspace ABI.
+ */
+#define VMCS12_SIZE KVM_STATE_NESTED_VMX_VMCS_SIZE
+
+/*
+ * For save/restore compatibility, the vmcs12 field offsets must not change.
+ */
+#define CHECK_OFFSET(field, loc) \
+ ASSERT_STRUCT_OFFSET(struct vmcs12, field, loc)
+
+static inline void vmx_check_vmcs12_offsets(void)
+{
+ CHECK_OFFSET(hdr, 0);
+ CHECK_OFFSET(abort, 4);
+ CHECK_OFFSET(launch_state, 8);
+ CHECK_OFFSET(io_bitmap_a, 40);
+ CHECK_OFFSET(io_bitmap_b, 48);
+ CHECK_OFFSET(msr_bitmap, 56);
+ CHECK_OFFSET(vm_exit_msr_store_addr, 64);
+ CHECK_OFFSET(vm_exit_msr_load_addr, 72);
+ CHECK_OFFSET(vm_entry_msr_load_addr, 80);
+ CHECK_OFFSET(tsc_offset, 88);
+ CHECK_OFFSET(virtual_apic_page_addr, 96);
+ CHECK_OFFSET(apic_access_addr, 104);
+ CHECK_OFFSET(posted_intr_desc_addr, 112);
+ CHECK_OFFSET(ept_pointer, 120);
+ CHECK_OFFSET(eoi_exit_bitmap0, 128);
+ CHECK_OFFSET(eoi_exit_bitmap1, 136);
+ CHECK_OFFSET(eoi_exit_bitmap2, 144);
+ CHECK_OFFSET(eoi_exit_bitmap3, 152);
+ CHECK_OFFSET(xss_exit_bitmap, 160);
+ CHECK_OFFSET(guest_physical_address, 168);
+ CHECK_OFFSET(vmcs_link_pointer, 176);
+ CHECK_OFFSET(guest_ia32_debugctl, 184);
+ CHECK_OFFSET(guest_ia32_pat, 192);
+ CHECK_OFFSET(guest_ia32_efer, 200);
+ CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208);
+ CHECK_OFFSET(guest_pdptr0, 216);
+ CHECK_OFFSET(guest_pdptr1, 224);
+ CHECK_OFFSET(guest_pdptr2, 232);
+ CHECK_OFFSET(guest_pdptr3, 240);
+ CHECK_OFFSET(guest_bndcfgs, 248);
+ CHECK_OFFSET(host_ia32_pat, 256);
+ CHECK_OFFSET(host_ia32_efer, 264);
+ CHECK_OFFSET(host_ia32_perf_global_ctrl, 272);
+ CHECK_OFFSET(vmread_bitmap, 280);
+ CHECK_OFFSET(vmwrite_bitmap, 288);
+ CHECK_OFFSET(vm_function_control, 296);
+ CHECK_OFFSET(eptp_list_address, 304);
+ CHECK_OFFSET(pml_address, 312);
+ CHECK_OFFSET(encls_exiting_bitmap, 320);
+ CHECK_OFFSET(tsc_multiplier, 328);
+ CHECK_OFFSET(cr0_guest_host_mask, 344);
+ CHECK_OFFSET(cr4_guest_host_mask, 352);
+ CHECK_OFFSET(cr0_read_shadow, 360);
+ CHECK_OFFSET(cr4_read_shadow, 368);
+ CHECK_OFFSET(dead_space, 376);
+ CHECK_OFFSET(exit_qualification, 408);
+ CHECK_OFFSET(guest_linear_address, 416);
+ CHECK_OFFSET(guest_cr0, 424);
+ CHECK_OFFSET(guest_cr3, 432);
+ CHECK_OFFSET(guest_cr4, 440);
+ CHECK_OFFSET(guest_es_base, 448);
+ CHECK_OFFSET(guest_cs_base, 456);
+ CHECK_OFFSET(guest_ss_base, 464);
+ CHECK_OFFSET(guest_ds_base, 472);
+ CHECK_OFFSET(guest_fs_base, 480);
+ CHECK_OFFSET(guest_gs_base, 488);
+ CHECK_OFFSET(guest_ldtr_base, 496);
+ CHECK_OFFSET(guest_tr_base, 504);
+ CHECK_OFFSET(guest_gdtr_base, 512);
+ CHECK_OFFSET(guest_idtr_base, 520);
+ CHECK_OFFSET(guest_dr7, 528);
+ CHECK_OFFSET(guest_rsp, 536);
+ CHECK_OFFSET(guest_rip, 544);
+ CHECK_OFFSET(guest_rflags, 552);
+ CHECK_OFFSET(guest_pending_dbg_exceptions, 560);
+ CHECK_OFFSET(guest_sysenter_esp, 568);
+ CHECK_OFFSET(guest_sysenter_eip, 576);
+ CHECK_OFFSET(host_cr0, 584);
+ CHECK_OFFSET(host_cr3, 592);
+ CHECK_OFFSET(host_cr4, 600);
+ CHECK_OFFSET(host_fs_base, 608);
+ CHECK_OFFSET(host_gs_base, 616);
+ CHECK_OFFSET(host_tr_base, 624);
+ CHECK_OFFSET(host_gdtr_base, 632);
+ CHECK_OFFSET(host_idtr_base, 640);
+ CHECK_OFFSET(host_ia32_sysenter_esp, 648);
+ CHECK_OFFSET(host_ia32_sysenter_eip, 656);
+ CHECK_OFFSET(host_rsp, 664);
+ CHECK_OFFSET(host_rip, 672);
+ CHECK_OFFSET(pin_based_vm_exec_control, 744);
+ CHECK_OFFSET(cpu_based_vm_exec_control, 748);
+ CHECK_OFFSET(exception_bitmap, 752);
+ CHECK_OFFSET(page_fault_error_code_mask, 756);
+ CHECK_OFFSET(page_fault_error_code_match, 760);
+ CHECK_OFFSET(cr3_target_count, 764);
+ CHECK_OFFSET(vm_exit_controls, 768);
+ CHECK_OFFSET(vm_exit_msr_store_count, 772);
+ CHECK_OFFSET(vm_exit_msr_load_count, 776);
+ CHECK_OFFSET(vm_entry_controls, 780);
+ CHECK_OFFSET(vm_entry_msr_load_count, 784);
+ CHECK_OFFSET(vm_entry_intr_info_field, 788);
+ CHECK_OFFSET(vm_entry_exception_error_code, 792);
+ CHECK_OFFSET(vm_entry_instruction_len, 796);
+ CHECK_OFFSET(tpr_threshold, 800);
+ CHECK_OFFSET(secondary_vm_exec_control, 804);
+ CHECK_OFFSET(vm_instruction_error, 808);
+ CHECK_OFFSET(vm_exit_reason, 812);
+ CHECK_OFFSET(vm_exit_intr_info, 816);
+ CHECK_OFFSET(vm_exit_intr_error_code, 820);
+ CHECK_OFFSET(idt_vectoring_info_field, 824);
+ CHECK_OFFSET(idt_vectoring_error_code, 828);
+ CHECK_OFFSET(vm_exit_instruction_len, 832);
+ CHECK_OFFSET(vmx_instruction_info, 836);
+ CHECK_OFFSET(guest_es_limit, 840);
+ CHECK_OFFSET(guest_cs_limit, 844);
+ CHECK_OFFSET(guest_ss_limit, 848);
+ CHECK_OFFSET(guest_ds_limit, 852);
+ CHECK_OFFSET(guest_fs_limit, 856);
+ CHECK_OFFSET(guest_gs_limit, 860);
+ CHECK_OFFSET(guest_ldtr_limit, 864);
+ CHECK_OFFSET(guest_tr_limit, 868);
+ CHECK_OFFSET(guest_gdtr_limit, 872);
+ CHECK_OFFSET(guest_idtr_limit, 876);
+ CHECK_OFFSET(guest_es_ar_bytes, 880);
+ CHECK_OFFSET(guest_cs_ar_bytes, 884);
+ CHECK_OFFSET(guest_ss_ar_bytes, 888);
+ CHECK_OFFSET(guest_ds_ar_bytes, 892);
+ CHECK_OFFSET(guest_fs_ar_bytes, 896);
+ CHECK_OFFSET(guest_gs_ar_bytes, 900);
+ CHECK_OFFSET(guest_ldtr_ar_bytes, 904);
+ CHECK_OFFSET(guest_tr_ar_bytes, 908);
+ CHECK_OFFSET(guest_interruptibility_info, 912);
+ CHECK_OFFSET(guest_activity_state, 916);
+ CHECK_OFFSET(guest_sysenter_cs, 920);
+ CHECK_OFFSET(host_ia32_sysenter_cs, 924);
+ CHECK_OFFSET(vmx_preemption_timer_value, 928);
+ CHECK_OFFSET(virtual_processor_id, 960);
+ CHECK_OFFSET(posted_intr_nv, 962);
+ CHECK_OFFSET(guest_es_selector, 964);
+ CHECK_OFFSET(guest_cs_selector, 966);
+ CHECK_OFFSET(guest_ss_selector, 968);
+ CHECK_OFFSET(guest_ds_selector, 970);
+ CHECK_OFFSET(guest_fs_selector, 972);
+ CHECK_OFFSET(guest_gs_selector, 974);
+ CHECK_OFFSET(guest_ldtr_selector, 976);
+ CHECK_OFFSET(guest_tr_selector, 978);
+ CHECK_OFFSET(guest_intr_status, 980);
+ CHECK_OFFSET(host_es_selector, 982);
+ CHECK_OFFSET(host_cs_selector, 984);
+ CHECK_OFFSET(host_ss_selector, 986);
+ CHECK_OFFSET(host_ds_selector, 988);
+ CHECK_OFFSET(host_fs_selector, 990);
+ CHECK_OFFSET(host_gs_selector, 992);
+ CHECK_OFFSET(host_tr_selector, 994);
+ CHECK_OFFSET(guest_pml_index, 996);
+}
+
+extern const unsigned short vmcs12_field_offsets[];
+extern const unsigned int nr_vmcs12_fields;
+
+static inline short get_vmcs12_field_offset(unsigned long field)
+{
+ unsigned short offset;
+ unsigned int index;
+
+ if (field >> 15)
+ return -ENOENT;
+
+ index = ROL16(field, 6);
+ if (index >= nr_vmcs12_fields)
+ return -ENOENT;
+
+ index = array_index_nospec(index, nr_vmcs12_fields);
+ offset = vmcs12_field_offsets[index];
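+
+	/*
+	 * Offset 0 is the vmcs12 header, which is never exposed as a field,
+	 * so an all-zeros table entry doubles as "no such field".
+	 */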
+ if (offset == 0)
+ return -ENOENT;
+ return offset;
+}
+
+static inline u64 vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field,
+ u16 offset)
+{
+ char *p = (char *)vmcs12 + offset;
+
+ switch (vmcs_field_width(field)) {
+ case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
+ return *((natural_width *)p);
+ case VMCS_FIELD_WIDTH_U16:
+ return *((u16 *)p);
+ case VMCS_FIELD_WIDTH_U32:
+ return *((u32 *)p);
+ case VMCS_FIELD_WIDTH_U64:
+ return *((u64 *)p);
+ default:
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+}
+
+static inline void vmcs12_write_any(struct vmcs12 *vmcs12, unsigned long field,
+ u16 offset, u64 field_value)
+{
+ char *p = (char *)vmcs12 + offset;
+
+ switch (vmcs_field_width(field)) {
+ case VMCS_FIELD_WIDTH_U16:
+ *(u16 *)p = field_value;
+ break;
+ case VMCS_FIELD_WIDTH_U32:
+ *(u32 *)p = field_value;
+ break;
+ case VMCS_FIELD_WIDTH_U64:
+ *(u64 *)p = field_value;
+ break;
+ case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
+ *(natural_width *)p = field_value;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
+#endif /* __KVM_X86_VMX_VMCS12_H */
diff --git a/arch/x86/kvm/vmx/vmcs_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
new file mode 100644
index 000000000..cad128d16
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
@@ -0,0 +1,79 @@
+#if !defined(SHADOW_FIELD_RO) && !defined(SHADOW_FIELD_RW)
+BUILD_BUG_ON(1)
+#endif
+
+#ifndef SHADOW_FIELD_RO
+#define SHADOW_FIELD_RO(x, y)
+#endif
+#ifndef SHADOW_FIELD_RW
+#define SHADOW_FIELD_RW(x, y)
+#endif
+
+/*
+ * We do NOT shadow fields that are modified when L0
+ * traps and emulates any vmx instruction (e.g. VMPTRLD,
+ * VMXON...) executed by L1.
+ * For example, VM_INSTRUCTION_ERROR is read
+ * by L1 if a vmx instruction fails (part of the error path).
+ * Note that the code assumes this behavior. If for some reason
+ * we start shadowing these fields then we need to
+ * force a shadow sync when L0 emulates vmx instructions
+ * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
+ * by nested_vmx_failValid)
+ *
+ * When adding or removing fields here, note that shadowed
+ * fields must always be synced by prepare_vmcs02, not just
+ * prepare_vmcs02_rare.
+ */
+
+/*
+ * Keeping the fields ordered by size is an attempt at improving
+ * branch prediction in vmcs12_read_any and vmcs12_write_any.
+ */
+
+/* 16-bits */
+SHADOW_FIELD_RW(GUEST_INTR_STATUS, guest_intr_status)
+SHADOW_FIELD_RW(GUEST_PML_INDEX, guest_pml_index)
+SHADOW_FIELD_RW(HOST_FS_SELECTOR, host_fs_selector)
+SHADOW_FIELD_RW(HOST_GS_SELECTOR, host_gs_selector)
+
+/* 32-bits */
+SHADOW_FIELD_RO(VM_EXIT_REASON, vm_exit_reason)
+SHADOW_FIELD_RO(VM_EXIT_INTR_INFO, vm_exit_intr_info)
+SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len)
+SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field)
+SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code)
+SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code)
+SHADOW_FIELD_RO(GUEST_CS_AR_BYTES, guest_cs_ar_bytes)
+SHADOW_FIELD_RO(GUEST_SS_AR_BYTES, guest_ss_ar_bytes)
+SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control)
+SHADOW_FIELD_RW(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control)
+SHADOW_FIELD_RW(EXCEPTION_BITMAP, exception_bitmap)
+SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code)
+SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field)
+SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len)
+SHADOW_FIELD_RW(TPR_THRESHOLD, tpr_threshold)
+SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info)
+SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value)
+
+/* Natural width */
+SHADOW_FIELD_RO(EXIT_QUALIFICATION, exit_qualification)
+SHADOW_FIELD_RO(GUEST_LINEAR_ADDRESS, guest_linear_address)
+SHADOW_FIELD_RW(GUEST_RIP, guest_rip)
+SHADOW_FIELD_RW(GUEST_RSP, guest_rsp)
+SHADOW_FIELD_RW(GUEST_CR0, guest_cr0)
+SHADOW_FIELD_RW(GUEST_CR3, guest_cr3)
+SHADOW_FIELD_RW(GUEST_CR4, guest_cr4)
+SHADOW_FIELD_RW(GUEST_RFLAGS, guest_rflags)
+SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK, cr0_guest_host_mask)
+SHADOW_FIELD_RW(CR0_READ_SHADOW, cr0_read_shadow)
+SHADOW_FIELD_RW(CR4_READ_SHADOW, cr4_read_shadow)
+SHADOW_FIELD_RW(HOST_FS_BASE, host_fs_base)
+SHADOW_FIELD_RW(HOST_GS_BASE, host_gs_base)
+
+/* 64-bit */
+SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS, guest_physical_address)
+SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH, guest_physical_address)
+
+#undef SHADOW_FIELD_RO
+#undef SHADOW_FIELD_RW
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
new file mode 100644
index 000000000..be275a041
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -0,0 +1,362 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm/bitsperlong.h>
+#include <asm/kvm_vcpu_regs.h>
+#include <asm/nospec-branch.h>
+#include <asm/percpu.h>
+#include <asm/segment.h>
+#include "kvm-asm-offsets.h"
+#include "run_flags.h"
+
+#define WORD_SIZE (BITS_PER_LONG / 8)
+
+#define VCPU_RAX __VCPU_REGS_RAX * WORD_SIZE
+#define VCPU_RCX __VCPU_REGS_RCX * WORD_SIZE
+#define VCPU_RDX __VCPU_REGS_RDX * WORD_SIZE
+#define VCPU_RBX __VCPU_REGS_RBX * WORD_SIZE
+/* Intentionally omit RSP as it's context switched by hardware */
+#define VCPU_RBP __VCPU_REGS_RBP * WORD_SIZE
+#define VCPU_RSI __VCPU_REGS_RSI * WORD_SIZE
+#define VCPU_RDI __VCPU_REGS_RDI * WORD_SIZE
+
+#ifdef CONFIG_X86_64
+#define VCPU_R8 __VCPU_REGS_R8 * WORD_SIZE
+#define VCPU_R9 __VCPU_REGS_R9 * WORD_SIZE
+#define VCPU_R10 __VCPU_REGS_R10 * WORD_SIZE
+#define VCPU_R11 __VCPU_REGS_R11 * WORD_SIZE
+#define VCPU_R12 __VCPU_REGS_R12 * WORD_SIZE
+#define VCPU_R13 __VCPU_REGS_R13 * WORD_SIZE
+#define VCPU_R14 __VCPU_REGS_R14 * WORD_SIZE
+#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE
+#endif
+
+.macro VMX_DO_EVENT_IRQOFF call_insn call_target
+ /*
+	 * Unconditionally create a stack frame; getting the correct RSP on the
+	 * stack (for x86-64) would take two instructions anyway, and RBP can
+ * be used to restore RSP to make objtool happy (see below).
+ */
+ push %_ASM_BP
+ mov %_ASM_SP, %_ASM_BP
+
+#ifdef CONFIG_X86_64
+ /*
+ * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
+ * creating the synthetic interrupt stack frame for the IRQ/NMI.
+ */
+ and $-16, %rsp
+ push $__KERNEL_DS
+ push %rbp
+#endif
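+	/*
+	 * Together with \call_insn below, this builds the frame the CPU would
+	 * push when delivering a real IRQ/NMI: (SS and RSP on 64-bit,) RFLAGS,
+	 * CS, and the return RIP supplied by the call itself.
+	 */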
+ pushf
+ push $__KERNEL_CS
+ \call_insn \call_target
+
+ /*
+ * "Restore" RSP from RBP, even though IRET has already unwound RSP to
+ * the correct value. objtool doesn't know the callee will IRET and,
+ * without the explicit restore, thinks the stack is getting walloped.
+ * Using an unwind hint is problematic due to x86-64's dynamic alignment.
+ */
+ mov %_ASM_BP, %_ASM_SP
+ pop %_ASM_BP
+ RET
+.endm
+
+.section .noinstr.text, "ax"
+
+/**
+ * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode
+ * @vmx: struct vcpu_vmx *
+ * @regs: unsigned long * (to guest registers)
+ * @flags: VMX_RUN_VMRESUME: use VMRESUME instead of VMLAUNCH
+ * VMX_RUN_SAVE_SPEC_CTRL: save guest SPEC_CTRL into vmx->spec_ctrl
+ *
+ * Returns:
+ * 0 on VM-Exit, 1 on VM-Fail
+ */
+SYM_FUNC_START(__vmx_vcpu_run)
+ push %_ASM_BP
+ mov %_ASM_SP, %_ASM_BP
+#ifdef CONFIG_X86_64
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+#else
+ push %edi
+ push %esi
+#endif
+ push %_ASM_BX
+
+ /* Save @vmx for SPEC_CTRL handling */
+ push %_ASM_ARG1
+
+ /* Save @flags for SPEC_CTRL handling */
+ push %_ASM_ARG3
+
+ /*
+ * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and
+ * @regs is needed after VM-Exit to save the guest's register values.
+ */
+ push %_ASM_ARG2
+
+ /* Copy @flags to EBX, _ASM_ARG3 is volatile. */
+ mov %_ASM_ARG3L, %ebx
+
+ lea (%_ASM_SP), %_ASM_ARG2
+ call vmx_update_host_rsp
+
+ ALTERNATIVE "jmp .Lspec_ctrl_done", "", X86_FEATURE_MSR_SPEC_CTRL
+
+ /*
+ * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the
+ * host's, write the MSR.
+ *
+ * IMPORTANT: To avoid RSB underflow attacks and any other nastiness,
+ * there must not be any returns or indirect branches between this code
+ * and vmentry.
+ */
+ mov 2*WORD_SIZE(%_ASM_SP), %_ASM_DI
+ movl VMX_spec_ctrl(%_ASM_DI), %edi
+ movl PER_CPU_VAR(x86_spec_ctrl_current), %esi
+ cmp %edi, %esi
+ je .Lspec_ctrl_done
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+ xor %edx, %edx
+ mov %edi, %eax
+ wrmsr
+
+.Lspec_ctrl_done:
+
+ /*
+ * Since vmentry is serializing on affected CPUs, there's no need for
+ * an LFENCE to stop speculation from skipping the wrmsr.
+ */
+
+ /* Load @regs to RAX. */
+ mov (%_ASM_SP), %_ASM_AX
+
+ /* Check if vmlaunch or vmresume is needed */
+ test $VMX_RUN_VMRESUME, %ebx
+
+ /* Load guest registers. Don't clobber flags. */
+ mov VCPU_RCX(%_ASM_AX), %_ASM_CX
+ mov VCPU_RDX(%_ASM_AX), %_ASM_DX
+ mov VCPU_RBX(%_ASM_AX), %_ASM_BX
+ mov VCPU_RBP(%_ASM_AX), %_ASM_BP
+ mov VCPU_RSI(%_ASM_AX), %_ASM_SI
+ mov VCPU_RDI(%_ASM_AX), %_ASM_DI
+#ifdef CONFIG_X86_64
+ mov VCPU_R8 (%_ASM_AX), %r8
+ mov VCPU_R9 (%_ASM_AX), %r9
+ mov VCPU_R10(%_ASM_AX), %r10
+ mov VCPU_R11(%_ASM_AX), %r11
+ mov VCPU_R12(%_ASM_AX), %r12
+ mov VCPU_R13(%_ASM_AX), %r13
+ mov VCPU_R14(%_ASM_AX), %r14
+ mov VCPU_R15(%_ASM_AX), %r15
+#endif
+ /* Load guest RAX. This kills the @regs pointer! */
+ mov VCPU_RAX(%_ASM_AX), %_ASM_AX
+
+ /* Check EFLAGS.ZF from 'test VMX_RUN_VMRESUME' above */
+ jz .Lvmlaunch
+
+ /*
+ * After a successful VMRESUME/VMLAUNCH, control flow "magically"
+ * resumes below at 'vmx_vmexit' due to the VMCS HOST_RIP setting.
+ * So this isn't a typical function and objtool needs to be told to
+ * save the unwind state here and restore it below.
+ */
+ UNWIND_HINT_SAVE
+
+/*
+ * If VMRESUME/VMLAUNCH and corresponding vmexit succeed, execution resumes at
+ * the 'vmx_vmexit' label below.
+ */
+.Lvmresume:
+ vmresume
+ jmp .Lvmfail
+
+.Lvmlaunch:
+ vmlaunch
+ jmp .Lvmfail
+
+ _ASM_EXTABLE(.Lvmresume, .Lfixup)
+ _ASM_EXTABLE(.Lvmlaunch, .Lfixup)
+
+SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL)
+
+ /* Restore unwind state from before the VMRESUME/VMLAUNCH. */
+ UNWIND_HINT_RESTORE
+ ENDBR
+
+ /* Temporarily save guest's RAX. */
+ push %_ASM_AX
+
+ /* Reload @regs to RAX. */
+ mov WORD_SIZE(%_ASM_SP), %_ASM_AX
+
+ /* Save all guest registers, including RAX from the stack */
+ pop VCPU_RAX(%_ASM_AX)
+ mov %_ASM_CX, VCPU_RCX(%_ASM_AX)
+ mov %_ASM_DX, VCPU_RDX(%_ASM_AX)
+ mov %_ASM_BX, VCPU_RBX(%_ASM_AX)
+ mov %_ASM_BP, VCPU_RBP(%_ASM_AX)
+ mov %_ASM_SI, VCPU_RSI(%_ASM_AX)
+ mov %_ASM_DI, VCPU_RDI(%_ASM_AX)
+#ifdef CONFIG_X86_64
+ mov %r8, VCPU_R8 (%_ASM_AX)
+ mov %r9, VCPU_R9 (%_ASM_AX)
+ mov %r10, VCPU_R10(%_ASM_AX)
+ mov %r11, VCPU_R11(%_ASM_AX)
+ mov %r12, VCPU_R12(%_ASM_AX)
+ mov %r13, VCPU_R13(%_ASM_AX)
+ mov %r14, VCPU_R14(%_ASM_AX)
+ mov %r15, VCPU_R15(%_ASM_AX)
+#endif
+
+ /* Clear return value to indicate VM-Exit (as opposed to VM-Fail). */
+ xor %ebx, %ebx
+
+.Lclear_regs:
+	/* Discard @regs. The register is irrelevant; it just can't be RBX. */
+ pop %_ASM_AX
+
+ /*
+ * Clear all general purpose registers except RSP and RBX to prevent
+ * speculative use of the guest's values, even those that are reloaded
+ * via the stack. In theory, an L1 cache miss when restoring registers
+ * could lead to speculative execution with the guest's values.
+ * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially
+ * free. RSP and RBX are exempt as RSP is restored by hardware during
+ * VM-Exit and RBX is explicitly loaded with 0 or 1 to hold the return
+ * value.
+ */
+ xor %eax, %eax
+ xor %ecx, %ecx
+ xor %edx, %edx
+ xor %ebp, %ebp
+ xor %esi, %esi
+ xor %edi, %edi
+#ifdef CONFIG_X86_64
+ xor %r8d, %r8d
+ xor %r9d, %r9d
+ xor %r10d, %r10d
+ xor %r11d, %r11d
+ xor %r12d, %r12d
+ xor %r13d, %r13d
+ xor %r14d, %r14d
+ xor %r15d, %r15d
+#endif
+
+ /*
+ * IMPORTANT: RSB filling and SPEC_CTRL handling must be done before
+ * the first unbalanced RET after vmexit!
+ *
+ * For retpoline or IBRS, RSB filling is needed to prevent poisoned RSB
+ * entries and (in some cases) RSB underflow.
+ *
+ * eIBRS has its own protection against poisoned RSB, so it doesn't
+ * need the RSB filling sequence. But it does need to be enabled, and a
+ * single call to retire, before the first unbalanced RET.
+ */
+
+ FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT,\
+ X86_FEATURE_RSB_VMEXIT_LITE
+
+ pop %_ASM_ARG2 /* @flags */
+ pop %_ASM_ARG1 /* @vmx */
+
+ call vmx_spec_ctrl_restore_host
+
+ /* Put return value in AX */
+ mov %_ASM_BX, %_ASM_AX
+
+ pop %_ASM_BX
+#ifdef CONFIG_X86_64
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+#else
+ pop %esi
+ pop %edi
+#endif
+ pop %_ASM_BP
+ RET
+
+.Lfixup:
+ cmpb $0, kvm_rebooting
+ jne .Lvmfail
+ ud2
+.Lvmfail:
+ /* VM-Fail: set return value to 1 */
+ mov $1, %_ASM_BX
+ jmp .Lclear_regs
+
+SYM_FUNC_END(__vmx_vcpu_run)
+
+SYM_FUNC_START(vmx_do_nmi_irqoff)
+ VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx
+SYM_FUNC_END(vmx_do_nmi_irqoff)
+
+#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+
+/**
+ * vmread_error_trampoline - Trampoline from inline asm to vmread_error()
+ * @field: VMCS field encoding that failed
+ * @fault: %true if the VMREAD faulted, %false if it failed
+ *
+ * Save and restore volatile registers across a call to vmread_error(). Note,
+ * all parameters are passed on the stack.
+ */
+SYM_FUNC_START(vmread_error_trampoline)
+ push %_ASM_BP
+ mov %_ASM_SP, %_ASM_BP
+
+ push %_ASM_AX
+ push %_ASM_CX
+ push %_ASM_DX
+#ifdef CONFIG_X86_64
+ push %rdi
+ push %rsi
+ push %r8
+ push %r9
+ push %r10
+ push %r11
+#endif
+
+ /* Load @field and @fault to arg1 and arg2 respectively. */
+ mov 3*WORD_SIZE(%_ASM_BP), %_ASM_ARG2
+ mov 2*WORD_SIZE(%_ASM_BP), %_ASM_ARG1
+
+ call vmread_error_trampoline2
+
+ /* Zero out @fault, which will be popped into the result register. */
+ _ASM_MOV $0, 3*WORD_SIZE(%_ASM_BP)
+
+#ifdef CONFIG_X86_64
+ pop %r11
+ pop %r10
+ pop %r9
+ pop %r8
+ pop %rsi
+ pop %rdi
+#endif
+ pop %_ASM_DX
+ pop %_ASM_CX
+ pop %_ASM_AX
+ pop %_ASM_BP
+
+ RET
+SYM_FUNC_END(vmread_error_trampoline)
+#endif
+
+.section .text, "ax"
+
+SYM_FUNC_START(vmx_do_interrupt_irqoff)
+ VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
+SYM_FUNC_END(vmx_do_interrupt_irqoff)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
new file mode 100644
index 000000000..9bba53525
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -0,0 +1,8712 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/highmem.h>
+#include <linux/hrtimer.h>
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/mod_devicetable.h>
+#include <linux/mm.h>
+#include <linux/objtool.h>
+#include <linux/sched.h>
+#include <linux/sched/smt.h>
+#include <linux/slab.h>
+#include <linux/tboot.h>
+#include <linux/trace_events.h>
+#include <linux/entry-kvm.h>
+
+#include <asm/apic.h>
+#include <asm/asm.h>
+#include <asm/cpu.h>
+#include <asm/cpu_device_id.h>
+#include <asm/debugreg.h>
+#include <asm/desc.h>
+#include <asm/fpu/api.h>
+#include <asm/fpu/xstate.h>
+#include <asm/idtentry.h>
+#include <asm/io.h>
+#include <asm/irq_remapping.h>
+#include <asm/reboot.h>
+#include <asm/perf_event.h>
+#include <asm/mmu_context.h>
+#include <asm/mshyperv.h>
+#include <asm/mwait.h>
+#include <asm/spec-ctrl.h>
+#include <asm/vmx.h>
+
+#include "capabilities.h"
+#include "cpuid.h"
+#include "hyperv.h"
+#include "kvm_onhyperv.h"
+#include "irq.h"
+#include "kvm_cache_regs.h"
+#include "lapic.h"
+#include "mmu.h"
+#include "nested.h"
+#include "pmu.h"
+#include "sgx.h"
+#include "trace.h"
+#include "vmcs.h"
+#include "vmcs12.h"
+#include "vmx.h"
+#include "x86.h"
+#include "smm.h"
+
+MODULE_AUTHOR("Qumranet");
+MODULE_LICENSE("GPL");
+
+#ifdef MODULE
+static const struct x86_cpu_id vmx_cpu_id[] = {
+ X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
+#endif
+
+bool __read_mostly enable_vpid = 1;
+module_param_named(vpid, enable_vpid, bool, 0444);
+
+static bool __read_mostly enable_vnmi = 1;
+module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
+
+bool __read_mostly flexpriority_enabled = 1;
+module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
+
+bool __read_mostly enable_ept = 1;
+module_param_named(ept, enable_ept, bool, S_IRUGO);
+
+bool __read_mostly enable_unrestricted_guest = 1;
+module_param_named(unrestricted_guest,
+ enable_unrestricted_guest, bool, S_IRUGO);
+
+bool __read_mostly enable_ept_ad_bits = 1;
+module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
+
+static bool __read_mostly emulate_invalid_guest_state = true;
+module_param(emulate_invalid_guest_state, bool, S_IRUGO);
+
+static bool __read_mostly fasteoi = 1;
+module_param(fasteoi, bool, S_IRUGO);
+
+module_param(enable_apicv, bool, S_IRUGO);
+
+bool __read_mostly enable_ipiv = true;
+module_param(enable_ipiv, bool, 0444);
+
+/*
+ * If nested=1, nested virtualization is supported, i.e., guests may use
+ * VMX and act as hypervisors for their own guests. If nested=0, guests may not
+ * use VMX instructions.
+ */
+static bool __read_mostly nested = 1;
+module_param(nested, bool, S_IRUGO);
+
+bool __read_mostly enable_pml = 1;
+module_param_named(pml, enable_pml, bool, S_IRUGO);
+
+static bool __read_mostly error_on_inconsistent_vmcs_config = true;
+module_param(error_on_inconsistent_vmcs_config, bool, 0444);
+
+static bool __read_mostly dump_invalid_vmcs = 0;
+module_param(dump_invalid_vmcs, bool, 0644);
+
+#define MSR_BITMAP_MODE_X2APIC 1
+#define MSR_BITMAP_MODE_X2APIC_APICV 2
+
+#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
+
+/* Guest_tsc -> host_tsc conversion requires 64-bit division. */
+static int __read_mostly cpu_preemption_timer_multi;
+static bool __read_mostly enable_preemption_timer = 1;
+#ifdef CONFIG_X86_64
+module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
+#endif
+
+extern bool __read_mostly allow_smaller_maxphyaddr;
+module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
+
+#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
+#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
+#define KVM_VM_CR0_ALWAYS_ON \
+ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
+
+#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
+#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
+#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
+
+#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
+
+#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
+ RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
+ RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
+ RTIT_STATUS_BYTECNT))
+
+/*
+ * List of MSRs that can be directly passed to the guest.
+ * In addition to these, x2apic and PT MSRs are handled specially.
+ */
+static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
+ MSR_IA32_SPEC_CTRL,
+ MSR_IA32_PRED_CMD,
+ MSR_IA32_FLUSH_CMD,
+ MSR_IA32_TSC,
+#ifdef CONFIG_X86_64
+ MSR_FS_BASE,
+ MSR_GS_BASE,
+ MSR_KERNEL_GS_BASE,
+ MSR_IA32_XFD,
+ MSR_IA32_XFD_ERR,
+#endif
+ MSR_IA32_SYSENTER_CS,
+ MSR_IA32_SYSENTER_ESP,
+ MSR_IA32_SYSENTER_EIP,
+ MSR_CORE_C1_RES,
+ MSR_CORE_C3_RESIDENCY,
+ MSR_CORE_C6_RESIDENCY,
+ MSR_CORE_C7_RESIDENCY,
+};
+
+/*
+ * These 2 parameters are used to configure the controls for Pause-Loop
+ * Exiting:
+ * ple_gap: upper bound on the amount of time between two successive
+ * executions of PAUSE in a loop. Also indicates whether PLE is enabled.
+ * According to tests, this time is usually smaller than 128 cycles.
+ * ple_window: upper bound on the amount of time a guest is allowed to execute
+ * in a PAUSE loop. Tests indicate that most spinlocks are held for
+ * less than 2^12 cycles.
+ * Time is measured based on a counter that runs at the same rate as the TSC;
+ * refer to SDM volume 3b sections 21.6.13 & 22.1.3.
+ */
+static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
+module_param(ple_gap, uint, 0444);
+
+static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
+module_param(ple_window, uint, 0444);
+
+/* Default doubles per-vcpu window every exit. */
+static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
+module_param(ple_window_grow, uint, 0444);
+
+/* Default resets per-vcpu window every exit to ple_window. */
+static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
+module_param(ple_window_shrink, uint, 0444);
+
+/* Default is to compute the maximum so we can never overflow. */
+static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
+module_param(ple_window_max, uint, 0444);
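+
+/*
+ * Illustrative example of the dynamic window (based on the comments above):
+ * with ple_window_grow = 2, every PAUSE-loop exit doubles the vCPU's window
+ * (e.g. 4096 -> 8192), capped at ple_window_max, while a shrink value of 0
+ * resets the window back to ple_window.
+ */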
+
+/* Default is SYSTEM mode; set to 1 for host-guest mode. */
+int __read_mostly pt_mode = PT_MODE_SYSTEM;
+module_param(pt_mode, int, S_IRUGO);
+
+static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
+static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
+static DEFINE_MUTEX(vmx_l1d_flush_mutex);
+
+/* Storage for pre module init parameter parsing */
+static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
+
+static const struct {
+ const char *option;
+ bool for_parse;
+} vmentry_l1d_param[] = {
+ [VMENTER_L1D_FLUSH_AUTO] = {"auto", true},
+ [VMENTER_L1D_FLUSH_NEVER] = {"never", true},
+ [VMENTER_L1D_FLUSH_COND] = {"cond", true},
+ [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true},
+ [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
+ [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
+};
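+
+/*
+ * Note: the entries with for_parse == false ("EPT disabled", "not required")
+ * are states that can only be reported by vmentry_l1d_flush_get(); they are
+ * rejected by vmentry_l1d_flush_parse() and so cannot be set by the user.
+ */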
+
+#define L1D_CACHE_ORDER 4
+static void *vmx_l1d_flush_pages;
+
+static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
+{
+ struct page *page;
+ unsigned int i;
+
+ if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
+ return 0;
+ }
+
+ if (!enable_ept) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
+ return 0;
+ }
+
+ if (host_arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
+ return 0;
+ }
+
+	/* If set to auto, use the default L1TF mitigation method. */
+ if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
+ switch (l1tf_mitigation) {
+ case L1TF_MITIGATION_OFF:
+ l1tf = VMENTER_L1D_FLUSH_NEVER;
+ break;
+ case L1TF_MITIGATION_FLUSH_NOWARN:
+ case L1TF_MITIGATION_FLUSH:
+ case L1TF_MITIGATION_FLUSH_NOSMT:
+ l1tf = VMENTER_L1D_FLUSH_COND;
+ break;
+ case L1TF_MITIGATION_FULL:
+ case L1TF_MITIGATION_FULL_FORCE:
+ l1tf = VMENTER_L1D_FLUSH_ALWAYS;
+ break;
+ }
+ } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
+ l1tf = VMENTER_L1D_FLUSH_ALWAYS;
+ }
+
+ if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
+ !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ /*
+ * This allocation for vmx_l1d_flush_pages is not tied to a VM
+ * lifetime and so should not be charged to a memcg.
+ */
+ page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
+ if (!page)
+ return -ENOMEM;
+ vmx_l1d_flush_pages = page_address(page);
+
+ /*
+ * Initialize each page with a different pattern in
+ * order to protect against KSM in the nested
+ * virtualization case.
+ */
+ for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
+ memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
+ PAGE_SIZE);
+ }
+ }
+
+ l1tf_vmx_mitigation = l1tf;
+
+ if (l1tf != VMENTER_L1D_FLUSH_NEVER)
+ static_branch_enable(&vmx_l1d_should_flush);
+ else
+ static_branch_disable(&vmx_l1d_should_flush);
+
+ if (l1tf == VMENTER_L1D_FLUSH_COND)
+ static_branch_enable(&vmx_l1d_flush_cond);
+ else
+ static_branch_disable(&vmx_l1d_flush_cond);
+ return 0;
+}
+
+static int vmentry_l1d_flush_parse(const char *s)
+{
+ unsigned int i;
+
+ if (s) {
+ for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
+ if (vmentry_l1d_param[i].for_parse &&
+ sysfs_streq(s, vmentry_l1d_param[i].option))
+ return i;
+ }
+ }
+ return -EINVAL;
+}
+
+static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
+{
+ int l1tf, ret;
+
+ l1tf = vmentry_l1d_flush_parse(s);
+ if (l1tf < 0)
+ return l1tf;
+
+ if (!boot_cpu_has(X86_BUG_L1TF))
+ return 0;
+
+	/*
+	 * Has vmx_init() run already? If not then this is the pre-init
+	 * parameter parsing. In that case just store the value and let
+	 * vmx_init() do the proper setup after enable_ept has been
+	 * established.
+	 */
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
+ vmentry_l1d_flush_param = l1tf;
+ return 0;
+ }
+
+ mutex_lock(&vmx_l1d_flush_mutex);
+ ret = vmx_setup_l1d_flush(l1tf);
+ mutex_unlock(&vmx_l1d_flush_mutex);
+ return ret;
+}
+
+static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
+{
+ if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
+ return sysfs_emit(s, "???\n");
+
+ return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
+}
+
+static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
+{
+ u64 msr;
+
+ if (!vmx->disable_fb_clear)
+ return;
+
+ msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL);
+ msr |= FB_CLEAR_DIS;
+ native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
+ /* Cache the MSR value to avoid reading it later */
+ vmx->msr_ia32_mcu_opt_ctrl = msr;
+}
+
+static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
+{
+ if (!vmx->disable_fb_clear)
+ return;
+
+ vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
+ native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
+}
+
+static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
+{
+ vmx->disable_fb_clear = (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
+ !boot_cpu_has_bug(X86_BUG_MDS) &&
+ !boot_cpu_has_bug(X86_BUG_TAA);
+
+	/*
+	 * If the guest will not execute VERW, there is no need to set
+	 * FB_CLEAR_DIS at VM-Entry. Skip the MSR read/write when the guest
+	 * has no reason to execute VERW.
+	 */
+ if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
+ ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
+ vmx->disable_fb_clear = false;
+}
+
+static const struct kernel_param_ops vmentry_l1d_flush_ops = {
+ .set = vmentry_l1d_flush_set,
+ .get = vmentry_l1d_flush_get,
+};
+module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
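+
+/*
+ * Illustrative usage (not part of this file): since the parameter above is
+ * registered with mode 0644, it can be changed at runtime through sysfs,
+ * e.g.:
+ *
+ *   # echo cond > /sys/module/kvm_intel/parameters/vmentry_l1d_flush
+ *   # cat /sys/module/kvm_intel/parameters/vmentry_l1d_flush
+ *
+ * which funnels through vmentry_l1d_flush_set()/_get() above.
+ */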
+
+static u32 vmx_segment_access_rights(struct kvm_segment *var);
+
+void vmx_vmexit(void);
+
+#define vmx_insn_failed(fmt...) \
+do { \
+ WARN_ONCE(1, fmt); \
+ pr_warn_ratelimited(fmt); \
+} while (0)
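+
+/*
+ * Design note: WARN_ONCE() captures a one-time stack trace for the first
+ * failed VMX instruction, while pr_warn_ratelimited() keeps (rate-limited)
+ * logging for any subsequent failures.
+ */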
+
+noinline void vmread_error(unsigned long field)
+{
+ vmx_insn_failed("vmread failed: field=%lx\n", field);
+}
+
+#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+noinstr void vmread_error_trampoline2(unsigned long field, bool fault)
+{
+ if (fault) {
+ kvm_spurious_fault();
+ } else {
+ instrumentation_begin();
+ vmread_error(field);
+ instrumentation_end();
+ }
+}
+#endif
+
+noinline void vmwrite_error(unsigned long field, unsigned long value)
+{
+ vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
+ field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
+}
+
+noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
+{
+ vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
+ vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
+}
+
+noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
+{
+ vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
+ vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
+}
+
+noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
+{
+ vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
+ ext, vpid, gva);
+}
+
+noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
+{
+ vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
+ ext, eptp, gpa);
+}
+
+static DEFINE_PER_CPU(struct vmcs *, vmxarea);
+DEFINE_PER_CPU(struct vmcs *, current_vmcs);
+/*
+ * We maintain a per-CPU linked list of the VMCSs loaded on that CPU. This is
+ * needed when a CPU is brought down and we need to VMCLEAR all VMCSs loaded
+ * on it.
+ */
+static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
+
+static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
+static DEFINE_SPINLOCK(vmx_vpid_lock);
+
+struct vmcs_config vmcs_config __ro_after_init;
+struct vmx_capability vmx_capability __ro_after_init;
+
+#define VMX_SEGMENT_FIELD(seg) \
+ [VCPU_SREG_##seg] = { \
+ .selector = GUEST_##seg##_SELECTOR, \
+ .base = GUEST_##seg##_BASE, \
+ .limit = GUEST_##seg##_LIMIT, \
+ .ar_bytes = GUEST_##seg##_AR_BYTES, \
+ }
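+
+/*
+ * For example, VMX_SEGMENT_FIELD(CS) expands to:
+ *
+ *	[VCPU_SREG_CS] = {
+ *		.selector = GUEST_CS_SELECTOR,
+ *		.base = GUEST_CS_BASE,
+ *		.limit = GUEST_CS_LIMIT,
+ *		.ar_bytes = GUEST_CS_AR_BYTES,
+ *	}
+ */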
+
+static const struct kvm_vmx_segment_field {
+ unsigned selector;
+ unsigned base;
+ unsigned limit;
+ unsigned ar_bytes;
+} kvm_vmx_segment_fields[] = {
+ VMX_SEGMENT_FIELD(CS),
+ VMX_SEGMENT_FIELD(DS),
+ VMX_SEGMENT_FIELD(ES),
+ VMX_SEGMENT_FIELD(FS),
+ VMX_SEGMENT_FIELD(GS),
+ VMX_SEGMENT_FIELD(SS),
+ VMX_SEGMENT_FIELD(TR),
+ VMX_SEGMENT_FIELD(LDTR),
+};
+
+static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
+{
+ vmx->segment_cache.bitmask = 0;
+}
+
+static unsigned long host_idt_base;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+static struct kvm_x86_ops vmx_x86_ops __initdata;
+
+static bool __read_mostly enlightened_vmcs = true;
+module_param(enlightened_vmcs, bool, 0444);
+
+static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ struct hv_enlightened_vmcs *evmcs;
+ struct hv_partition_assist_pg **p_hv_pa_pg =
+ &to_kvm_hv(vcpu->kvm)->hv_pa_pg;
+	/*
+	 * Synthetic VM-exit is not enabled in current code, so all eVMCSs
+	 * in a single VM share the same assist page.
+	 */
+ if (!*p_hv_pa_pg)
+ *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
+
+ if (!*p_hv_pa_pg)
+ return -ENOMEM;
+
+ evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
+
+ evmcs->partition_assist_page =
+ __pa(*p_hv_pa_pg);
+ evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
+ evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
+
+ return 0;
+}
+
+static __init void hv_init_evmcs(void)
+{
+ int cpu;
+
+ if (!enlightened_vmcs)
+ return;
+
+	/*
+	 * Use the enlightened VMCS only if Hyper-V marks it as recommended
+	 * and the host supports eVMCS v1 or above.
+	 */
+ if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
+ (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
+ KVM_EVMCS_VERSION) {
+
+ /* Check that we have assist pages on all online CPUs */
+ for_each_online_cpu(cpu) {
+ if (!hv_get_vp_assist_page(cpu)) {
+ enlightened_vmcs = false;
+ break;
+ }
+ }
+
+ if (enlightened_vmcs) {
+ pr_info("Using Hyper-V Enlightened VMCS\n");
+ static_branch_enable(&__kvm_is_using_evmcs);
+ }
+
+ if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
+ vmx_x86_ops.enable_l2_tlb_flush
+ = hv_enable_l2_tlb_flush;
+
+ } else {
+ enlightened_vmcs = false;
+ }
+}
+
+static void hv_reset_evmcs(void)
+{
+ struct hv_vp_assist_page *vp_ap;
+
+ if (!kvm_is_using_evmcs())
+ return;
+
+	/*
+	 * KVM should enable eVMCS if and only if all CPUs have a VP assist
+	 * page, and should reject CPU onlining if eVMCS is enabled but the
+	 * CPU doesn't have a VP assist page allocated.
+	 */
+ vp_ap = hv_get_vp_assist_page(smp_processor_id());
+ if (WARN_ON_ONCE(!vp_ap))
+ return;
+
+ /*
+ * Reset everything to support using non-enlightened VMCS access later
+ * (e.g. when we reload the module with enlightened_vmcs=0)
+ */
+ vp_ap->nested_control.features.directhypercall = 0;
+ vp_ap->current_nested_vmcs = 0;
+ vp_ap->enlighten_vmentry = 0;
+}
+
+#else /* IS_ENABLED(CONFIG_HYPERV) */
+static void hv_init_evmcs(void) {}
+static void hv_reset_evmcs(void) {}
+#endif /* IS_ENABLED(CONFIG_HYPERV) */
+
+/*
+ * Comment format: document - errata name - stepping - processor name.
+ * Taken from
+ * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
+ */
+static u32 vmx_preemption_cpu_tfms[] = {
+/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
+0x000206E6,
+/* 323056.pdf - AAX65 - C2 - Xeon L3406 */
+/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
+/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020652,
+/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020655,
+/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
+/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
+/*
+ * 320767.pdf - AAP86 - B1 -
+ * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
+ */
+0x000106E5,
+/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
+0x000106A0,
+/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
+0x000106A1,
+/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
+0x000106A4,
+ /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
+ /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
+ /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
+0x000106A5,
+ /* Xeon E3-1220 V2 */
+0x000306A8,
+};
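+
+/*
+ * Each entry above is a CPUID.1:EAX signature: bits 3:0 are the stepping,
+ * bits 7:4 the model, bits 11:8 the family, bits 19:16 the extended model
+ * and bits 27:20 the extended family. E.g. 0x000206E6 decodes to family 6,
+ * model 0x2E (extended model 2, model 0xE), stepping 6.
+ */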
+
+static inline bool cpu_has_broken_vmx_preemption_timer(void)
+{
+ u32 eax = cpuid_eax(0x00000001), i;
+
+ /* Clear the reserved bits */
+ eax &= ~(0x3U << 14 | 0xfU << 28);
+ for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
+ if (eax == vmx_preemption_cpu_tfms[i])
+ return true;
+
+ return false;
+}
+
+static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
+{
+ return flexpriority_enabled && lapic_in_kernel(vcpu);
+}
+
+static int possible_passthrough_msr_slot(u32 msr)
+{
+ u32 i;
+
+ for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++)
+ if (vmx_possible_passthrough_msrs[i] == msr)
+ return i;
+
+ return -ENOENT;
+}
+
+static bool is_valid_passthrough_msr(u32 msr)
+{
+ bool r;
+
+ switch (msr) {
+ case 0x800 ... 0x8ff:
+ /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
+ return true;
+ case MSR_IA32_RTIT_STATUS:
+ case MSR_IA32_RTIT_OUTPUT_BASE:
+ case MSR_IA32_RTIT_OUTPUT_MASK:
+ case MSR_IA32_RTIT_CR3_MATCH:
+ case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
+ /* PT MSRs. These are handled in pt_update_intercept_for_msr() */
+ case MSR_LBR_SELECT:
+ case MSR_LBR_TOS:
+ case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
+ case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
+ case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
+ case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
+ case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
+ /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
+ return true;
+ }
+
+ r = possible_passthrough_msr_slot(msr) != -ENOENT;
+
+ WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
+
+ return r;
+}
+
+struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
+{
+ int i;
+
+ i = kvm_find_user_return_msr(msr);
+ if (i >= 0)
+ return &vmx->guest_uret_msrs[i];
+ return NULL;
+}
+
+static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
+ struct vmx_uret_msr *msr, u64 data)
+{
+ unsigned int slot = msr - vmx->guest_uret_msrs;
+ int ret = 0;
+
+ if (msr->load_into_hardware) {
+ preempt_disable();
+ ret = kvm_set_user_return_msr(slot, data, msr->mask);
+ preempt_enable();
+ }
+ if (!ret)
+ msr->data = data;
+ return ret;
+}
+
+/*
+ * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
+ *
+ * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
+ * atomically track post-VMXON state, e.g. this may be called in NMI context.
+ * Eat all faults, as all other faults on VMXOFF are mode related, i.e. faults
+ * are guaranteed to be due to the !post-VMXON check unless the CPU is
+ * magically in RM, VM86, compat mode, or at CPL>0.
+ */
+static int kvm_cpu_vmxoff(void)
+{
+ asm_volatile_goto("1: vmxoff\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ ::: "cc", "memory" : fault);
+
+ cr4_clear_bits(X86_CR4_VMXE);
+ return 0;
+
+fault:
+ cr4_clear_bits(X86_CR4_VMXE);
+ return -EIO;
+}
+
+static void vmx_emergency_disable(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct loaded_vmcs *v;
+
+ kvm_rebooting = true;
+
+	/*
+	 * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be
+	 * set in task context. If this races with VMX being disabled by an
+	 * NMI, VMCLEAR and VMXOFF may #UD, but KVM will eat those faults
+	 * because kvm_rebooting is set.
+	 */
+ if (!(__read_cr4() & X86_CR4_VMXE))
+ return;
+
+ list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
+ loaded_vmcss_on_cpu_link)
+ vmcs_clear(v->vmcs);
+
+ kvm_cpu_vmxoff();
+}
+
+static void __loaded_vmcs_clear(void *arg)
+{
+ struct loaded_vmcs *loaded_vmcs = arg;
+ int cpu = raw_smp_processor_id();
+
+ if (loaded_vmcs->cpu != cpu)
+ return; /* vcpu migration can race with cpu offline */
+ if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
+ per_cpu(current_vmcs, cpu) = NULL;
+
+ vmcs_clear(loaded_vmcs->vmcs);
+ if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
+ vmcs_clear(loaded_vmcs->shadow_vmcs);
+
+ list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
+
+ /*
+ * Ensure all writes to loaded_vmcs, including deleting it from its
+ * current percpu list, complete before setting loaded_vmcs->cpu to
+ * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
+ * and add loaded_vmcs to its percpu list before it's deleted from this
+ * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
+ */
+ smp_wmb();
+
+ loaded_vmcs->cpu = -1;
+ loaded_vmcs->launched = 0;
+}
+
+void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
+{
+ int cpu = loaded_vmcs->cpu;
+
+ if (cpu != -1)
+ smp_call_function_single(cpu,
+ __loaded_vmcs_clear, loaded_vmcs, 1);
+}
+
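+/*
+ * The segment cache keeps one validity bit per (segment, field) pair, at bit
+ * position (seg * SEG_FIELD_NR + field). A set bit means the value cached in
+ * vmx->segment_cache.seg[seg] is current and the VMCS read can be skipped.
+ */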
+static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
+ unsigned field)
+{
+ bool ret;
+ u32 mask = 1 << (seg * SEG_FIELD_NR + field);
+
+ if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
+ kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
+ vmx->segment_cache.bitmask = 0;
+ }
+ ret = vmx->segment_cache.bitmask & mask;
+ vmx->segment_cache.bitmask |= mask;
+ return ret;
+}
+
+static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
+{
+ u16 *p = &vmx->segment_cache.seg[seg].selector;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
+ *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
+ return *p;
+}
+
+static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
+{
+ ulong *p = &vmx->segment_cache.seg[seg].base;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
+ *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
+ return *p;
+}
+
+static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
+{
+ u32 *p = &vmx->segment_cache.seg[seg].limit;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
+ *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
+ return *p;
+}
+
+static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
+{
+ u32 *p = &vmx->segment_cache.seg[seg].ar;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
+ *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
+ return *p;
+}
+
+void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
+{
+ u32 eb;
+
+ eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
+ (1u << DB_VECTOR) | (1u << AC_VECTOR);
+	/*
+	 * Guest access to VMware backdoor ports could legitimately
+	 * trigger #GP because of the TSS I/O permission bitmap.
+	 * We intercept those #GPs and allow access to the ports
+	 * anyway, as VMware does.
+	 */
+ if (enable_vmware_backdoor)
+ eb |= (1u << GP_VECTOR);
+ if ((vcpu->guest_debug &
+ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
+ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
+ eb |= 1u << BP_VECTOR;
+ if (to_vmx(vcpu)->rmode.vm86_active)
+ eb = ~0;
+ if (!vmx_need_pf_intercept(vcpu))
+ eb &= ~(1u << PF_VECTOR);
+
+	/*
+	 * When we are running a nested L2 guest and L1 specified for it a
+	 * certain exception bitmap, we must trap the same exceptions and pass
+	 * them to L1. When running L2, we will only handle the exceptions
+	 * specified above if L1 did not want them.
+	 */
+ if (is_guest_mode(vcpu))
+ eb |= get_vmcs12(vcpu)->exception_bitmap;
+ else {
+ int mask = 0, match = 0;
+
+ if (enable_ept && (eb & (1u << PF_VECTOR))) {
+ /*
+ * If EPT is enabled, #PF is currently only intercepted
+ * if MAXPHYADDR is smaller on the guest than on the
+ * host. In that case we only care about present,
+ * non-reserved faults. For vmcs02, however, PFEC_MASK
+ * and PFEC_MATCH are set in prepare_vmcs02_rare.
+ */
+ mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
+ match = PFERR_PRESENT_MASK;
+ }
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
+ }
+
+	/*
+	 * Disabling XFD interception indicates that dynamic xfeatures
+	 * might be used in the guest. Always trap #NM in this case
+	 * to save guest xfd_err in a timely manner.
+	 */
+ if (vcpu->arch.xfd_no_write_intercept)
+ eb |= (1u << NM_VECTOR);
+
+ vmcs_write32(EXCEPTION_BITMAP, eb);
+}
+
+/*
+ * Check if MSR is intercepted for currently loaded MSR bitmap.
+ */
+static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
+{
+ if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
+ return true;
+
+ return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr);
+}
+
+unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
+{
+ unsigned int flags = 0;
+
+ if (vmx->loaded_vmcs->launched)
+ flags |= VMX_RUN_VMRESUME;
+
+ /*
+ * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
+ * to change it directly without causing a vmexit. In that case read
+ * it after vmexit and store it in vmx->spec_ctrl.
+ */
+ if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
+ flags |= VMX_RUN_SAVE_SPEC_CTRL;
+
+ return flags;
+}
+
+static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
+ unsigned long entry, unsigned long exit)
+{
+ vm_entry_controls_clearbit(vmx, entry);
+ vm_exit_controls_clearbit(vmx, exit);
+}
+
+int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
+{
+ unsigned int i;
+
+ for (i = 0; i < m->nr; ++i) {
+ if (m->val[i].index == msr)
+ return i;
+ }
+ return -ENOENT;
+}
+
+static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
+{
+ int i;
+ struct msr_autoload *m = &vmx->msr_autoload;
+
+ switch (msr) {
+ case MSR_EFER:
+ if (cpu_has_load_ia32_efer()) {
+ clear_atomic_switch_msr_special(vmx,
+ VM_ENTRY_LOAD_IA32_EFER,
+ VM_EXIT_LOAD_IA32_EFER);
+ return;
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ if (cpu_has_load_perf_global_ctrl()) {
+ clear_atomic_switch_msr_special(vmx,
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
+ return;
+ }
+ break;
+ }
+ i = vmx_find_loadstore_msr_slot(&m->guest, msr);
+ if (i < 0)
+ goto skip_guest;
+ --m->guest.nr;
+ m->guest.val[i] = m->guest.val[m->guest.nr];
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
+
+skip_guest:
+ i = vmx_find_loadstore_msr_slot(&m->host, msr);
+ if (i < 0)
+ return;
+
+ --m->host.nr;
+ m->host.val[i] = m->host.val[m->host.nr];
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
+}
+
+static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
+ unsigned long entry, unsigned long exit,
+ unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
+ u64 guest_val, u64 host_val)
+{
+ vmcs_write64(guest_val_vmcs, guest_val);
+ if (host_val_vmcs != HOST_IA32_EFER)
+ vmcs_write64(host_val_vmcs, host_val);
+ vm_entry_controls_setbit(vmx, entry);
+ vm_exit_controls_setbit(vmx, exit);
+}
+
+static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
+ u64 guest_val, u64 host_val, bool entry_only)
+{
+ int i, j = 0;
+ struct msr_autoload *m = &vmx->msr_autoload;
+
+ switch (msr) {
+ case MSR_EFER:
+ if (cpu_has_load_ia32_efer()) {
+ add_atomic_switch_msr_special(vmx,
+ VM_ENTRY_LOAD_IA32_EFER,
+ VM_EXIT_LOAD_IA32_EFER,
+ GUEST_IA32_EFER,
+ HOST_IA32_EFER,
+ guest_val, host_val);
+ return;
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ if (cpu_has_load_perf_global_ctrl()) {
+ add_atomic_switch_msr_special(vmx,
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
+ GUEST_IA32_PERF_GLOBAL_CTRL,
+ HOST_IA32_PERF_GLOBAL_CTRL,
+ guest_val, host_val);
+ return;
+ }
+ break;
+ case MSR_IA32_PEBS_ENABLE:
+		/*
+		 * PEBS needs a quiescent period after being disabled (to write
+		 * a record). Disabling PEBS through VMX MSR swapping doesn't
+		 * provide that period, so a CPU could write the host's record
+		 * into the guest's memory.
+		 */
+ wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+ }
+
+ i = vmx_find_loadstore_msr_slot(&m->guest, msr);
+ if (!entry_only)
+ j = vmx_find_loadstore_msr_slot(&m->host, msr);
+
+ if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
+ (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
+ printk_once(KERN_WARNING "Not enough msr switch entries. "
+ "Can't add msr %x\n", msr);
+ return;
+ }
+ if (i < 0) {
+ i = m->guest.nr++;
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
+ }
+ m->guest.val[i].index = msr;
+ m->guest.val[i].value = guest_val;
+
+ if (entry_only)
+ return;
+
+ if (j < 0) {
+ j = m->host.nr++;
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
+ }
+ m->host.val[j].index = msr;
+ m->host.val[j].value = host_val;
+}
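+
+/*
+ * Example user: update_transition_efer() below relies on the two helpers
+ * above to swap MSR_EFER atomically across VM-Enter/VM-Exit whenever the
+ * guest and host values must differ, e.g. due to the EFER_NX handling for
+ * shadow paging.
+ */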
+
+static bool update_transition_efer(struct vcpu_vmx *vmx)
+{
+ u64 guest_efer = vmx->vcpu.arch.efer;
+ u64 ignore_bits = 0;
+ int i;
+
+ /* Shadow paging assumes NX to be available. */
+ if (!enable_ept)
+ guest_efer |= EFER_NX;
+
+ /*
+ * LMA and LME handled by hardware; SCE meaningless outside long mode.
+ */
+ ignore_bits |= EFER_SCE;
+#ifdef CONFIG_X86_64
+ ignore_bits |= EFER_LMA | EFER_LME;
+ /* SCE is meaningful only in long mode on Intel */
+ if (guest_efer & EFER_LMA)
+ ignore_bits &= ~(u64)EFER_SCE;
+#endif
+
+ /*
+ * On EPT, we can't emulate NX, so we must switch EFER atomically.
+ * On CPUs that support "load IA32_EFER", always switch EFER
+ * atomically, since it's faster than switching it manually.
+ */
+ if (cpu_has_load_ia32_efer() ||
+ (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
+ if (!(guest_efer & EFER_LMA))
+ guest_efer &= ~EFER_LME;
+ if (guest_efer != host_efer)
+ add_atomic_switch_msr(vmx, MSR_EFER,
+ guest_efer, host_efer, false);
+ else
+ clear_atomic_switch_msr(vmx, MSR_EFER);
+ return false;
+ }
+
+ i = kvm_find_user_return_msr(MSR_EFER);
+ if (i < 0)
+ return false;
+
+ clear_atomic_switch_msr(vmx, MSR_EFER);
+
+ guest_efer &= ~ignore_bits;
+ guest_efer |= host_efer & ignore_bits;
+
+ vmx->guest_uret_msrs[i].data = guest_efer;
+ vmx->guest_uret_msrs[i].mask = ~ignore_bits;
+
+ return true;
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * On 32-bit kernels, VM exits still load the FS and GS bases from the
+ * VMCS rather than the segment table. KVM uses this helper to figure
+ * out the current bases to poke them into the VMCS before entry.
+ */
+static unsigned long segment_base(u16 selector)
+{
+ struct desc_struct *table;
+ unsigned long v;
+
+ if (!(selector & ~SEGMENT_RPL_MASK))
+ return 0;
+
+ table = get_current_gdt_ro();
+
+ if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+ u16 ldt_selector = kvm_read_ldt();
+
+ if (!(ldt_selector & ~SEGMENT_RPL_MASK))
+ return 0;
+
+ table = (struct desc_struct *)segment_base(ldt_selector);
+ }
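+	/*
+	 * Bits 1:0 of a selector are the RPL and bit 2 is the TI flag;
+	 * bits 15:3 index the descriptor table, hence the ">> 3" below.
+	 */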
+ v = get_desc_base(&table[selector >> 3]);
+ return v;
+}
+#endif
+
+static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
+{
+ return vmx_pt_mode_is_host_guest() &&
+ !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
+}
+
+static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
+{
+ /* The base must be 128-byte aligned and a legal physical address. */
+ return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
+}
+
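+/*
+ * The PT address-range MSRs come in A/B pairs at consecutive MSR addresses
+ * (MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, MSR_IA32_RTIT_ADDR1_A, ...),
+ * hence the "i * 2" stride in the two helpers below.
+ */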
+static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
+{
+ u32 i;
+
+ wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
+ wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
+ wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
+ wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
+ for (i = 0; i < addr_range; i++) {
+ wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
+ wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
+ }
+}
+
+static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
+{
+ u32 i;
+
+ rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
+ rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
+ rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
+ rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
+ for (i = 0; i < addr_range; i++) {
+ rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
+ rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
+ }
+}
+
+static void pt_guest_enter(struct vcpu_vmx *vmx)
+{
+ if (vmx_pt_mode_is_system())
+ return;
+
+ /*
+ * GUEST_IA32_RTIT_CTL is already set in the VMCS.
+ * Save host state before VM entry.
+ */
+ rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+ if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
+ wrmsrl(MSR_IA32_RTIT_CTL, 0);
+ pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
+ pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
+ }
+}
+
+static void pt_guest_exit(struct vcpu_vmx *vmx)
+{
+ if (vmx_pt_mode_is_system())
+ return;
+
+ if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
+ pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
+ pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
+ }
+
+ /*
+ * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
+ * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary.
+ */
+ if (vmx->pt_desc.host.ctl)
+ wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+}
+
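+/*
+ * Host selector fields in the VMCS must have TI == 0 and RPL == 0, i.e. the
+ * low 3 bits clear. Selectors that violate this are written as 0 here and
+ * reloaded manually after VM-Exit, see vmx_prepare_switch_to_host().
+ */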
+void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
+ unsigned long fs_base, unsigned long gs_base)
+{
+ if (unlikely(fs_sel != host->fs_sel)) {
+ if (!(fs_sel & 7))
+ vmcs_write16(HOST_FS_SELECTOR, fs_sel);
+ else
+ vmcs_write16(HOST_FS_SELECTOR, 0);
+ host->fs_sel = fs_sel;
+ }
+ if (unlikely(gs_sel != host->gs_sel)) {
+ if (!(gs_sel & 7))
+ vmcs_write16(HOST_GS_SELECTOR, gs_sel);
+ else
+ vmcs_write16(HOST_GS_SELECTOR, 0);
+ host->gs_sel = gs_sel;
+ }
+ if (unlikely(fs_base != host->fs_base)) {
+ vmcs_writel(HOST_FS_BASE, fs_base);
+ host->fs_base = fs_base;
+ }
+ if (unlikely(gs_base != host->gs_base)) {
+ vmcs_writel(HOST_GS_BASE, gs_base);
+ host->gs_base = gs_base;
+ }
+}
+
+void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs_host_state *host_state;
+#ifdef CONFIG_X86_64
+ int cpu = raw_smp_processor_id();
+#endif
+ unsigned long fs_base, gs_base;
+ u16 fs_sel, gs_sel;
+ int i;
+
+ vmx->req_immediate_exit = false;
+
+	/*
+	 * Note that guest MSRs to be saved/restored can also be changed
+	 * when guest state is loaded. This happens when the guest
+	 * transitions to/from long mode by setting MSR_EFER.LMA.
+	 */
+ if (!vmx->guest_uret_msrs_loaded) {
+ vmx->guest_uret_msrs_loaded = true;
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ if (!vmx->guest_uret_msrs[i].load_into_hardware)
+ continue;
+
+ kvm_set_user_return_msr(i,
+ vmx->guest_uret_msrs[i].data,
+ vmx->guest_uret_msrs[i].mask);
+ }
+ }
+
+ if (vmx->nested.need_vmcs12_to_shadow_sync)
+ nested_sync_vmcs12_to_shadow(vcpu);
+
+ if (vmx->guest_state_loaded)
+ return;
+
+ host_state = &vmx->loaded_vmcs->host_state;
+
+	/*
+	 * Set host FS and GS selectors. Unfortunately, 22.2.3 does not
+	 * allow segment selectors with CPL > 0 or TI == 1.
+	 */
+ host_state->ldt_sel = kvm_read_ldt();
+
+#ifdef CONFIG_X86_64
+ savesegment(ds, host_state->ds_sel);
+ savesegment(es, host_state->es_sel);
+
+ gs_base = cpu_kernelmode_gs_base(cpu);
+ if (likely(is_64bit_mm(current->mm))) {
+ current_save_fsgs();
+ fs_sel = current->thread.fsindex;
+ gs_sel = current->thread.gsindex;
+ fs_base = current->thread.fsbase;
+ vmx->msr_host_kernel_gs_base = current->thread.gsbase;
+ } else {
+ savesegment(fs, fs_sel);
+ savesegment(gs, gs_sel);
+ fs_base = read_msr(MSR_FS_BASE);
+ vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
+ }
+
+ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+#else
+ savesegment(fs, fs_sel);
+ savesegment(gs, gs_sel);
+ fs_base = segment_base(fs_sel);
+ gs_base = segment_base(gs_sel);
+#endif
+
+ vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
+ vmx->guest_state_loaded = true;
+}
+
+static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
+{
+ struct vmcs_host_state *host_state;
+
+ if (!vmx->guest_state_loaded)
+ return;
+
+ host_state = &vmx->loaded_vmcs->host_state;
+
+ ++vmx->vcpu.stat.host_state_reload;
+
+#ifdef CONFIG_X86_64
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+#endif
+ if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
+ kvm_load_ldt(host_state->ldt_sel);
+#ifdef CONFIG_X86_64
+ load_gs_index(host_state->gs_sel);
+#else
+ loadsegment(gs, host_state->gs_sel);
+#endif
+ }
+ if (host_state->fs_sel & 7)
+ loadsegment(fs, host_state->fs_sel);
+#ifdef CONFIG_X86_64
+ if (unlikely(host_state->ds_sel | host_state->es_sel)) {
+ loadsegment(ds, host_state->ds_sel);
+ loadsegment(es, host_state->es_sel);
+ }
+#endif
+ invalidate_tss_limit();
+#ifdef CONFIG_X86_64
+ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+#endif
+ load_fixmap_gdt(raw_smp_processor_id());
+ vmx->guest_state_loaded = false;
+ vmx->guest_uret_msrs_loaded = false;
+}
+
+#ifdef CONFIG_X86_64
+static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
+{
+ preempt_disable();
+ if (vmx->guest_state_loaded)
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ preempt_enable();
+ return vmx->msr_guest_kernel_gs_base;
+}
+
+static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
+{
+ preempt_disable();
+ if (vmx->guest_state_loaded)
+ wrmsrl(MSR_KERNEL_GS_BASE, data);
+ preempt_enable();
+ vmx->msr_guest_kernel_gs_base = data;
+}
+#endif
+
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
+ struct loaded_vmcs *buddy)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
+ struct vmcs *prev;
+
+ if (!already_loaded) {
+ loaded_vmcs_clear(vmx->loaded_vmcs);
+ local_irq_disable();
+
+		/*
+		 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
+		 * this cpu's percpu list, otherwise it may not yet be deleted
+		 * from its previous cpu's percpu list. Pairs with the
+		 * smp_wmb() in __loaded_vmcs_clear().
+		 */
+ smp_rmb();
+
+ list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
+ &per_cpu(loaded_vmcss_on_cpu, cpu));
+ local_irq_enable();
+ }
+
+ prev = per_cpu(current_vmcs, cpu);
+ if (prev != vmx->loaded_vmcs->vmcs) {
+ per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+ vmcs_load(vmx->loaded_vmcs->vmcs);
+
+ /*
+ * No indirect branch prediction barrier needed when switching
+ * the active VMCS within a vCPU, unless IBRS is advertised to
+ * the vCPU. To minimize the number of IBPBs executed, KVM
+ * performs IBPB on nested VM-Exit (a single nested transition
+ * may switch the active VMCS multiple times).
+ */
+ if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
+ indirect_branch_prediction_barrier();
+ }
+
+ if (!already_loaded) {
+ void *gdt = get_current_gdt_ro();
+
+		/*
+		 * Flush all EPTP/VPID contexts; the new pCPU may have stale
+		 * TLB entries from its previous association with the vCPU.
+		 */
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+
+ /*
+ * Linux uses per-cpu TSS and GDT, so set these when switching
+ * processors. See 22.2.4.
+ */
+ vmcs_writel(HOST_TR_BASE,
+ (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
+ vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
+
+ if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
+ /* 22.2.3 */
+ vmcs_writel(HOST_IA32_SYSENTER_ESP,
+ (unsigned long)(cpu_entry_stack(cpu) + 1));
+ }
+
+ vmx->loaded_vmcs->cpu = cpu;
+ }
+}
+
+/*
+ * Switches to the specified vcpu until a matching vcpu_put(); assumes the
+ * vcpu mutex is already taken.
+ */
+static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
+
+ vmx_vcpu_pi_load(vcpu, cpu);
+
+ vmx->host_debugctlmsr = get_debugctlmsr();
+}
+
+static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ vmx_vcpu_pi_put(vcpu);
+
+ vmx_prepare_switch_to_host(to_vmx(vcpu));
+}
+
+bool vmx_emulation_required(struct kvm_vcpu *vcpu)
+{
+ return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
+}
+
+unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long rflags, save_rflags;
+
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
+ kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
+ rflags = vmcs_readl(GUEST_RFLAGS);
+ if (vmx->rmode.vm86_active) {
+ rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
+ save_rflags = vmx->rmode.save_rflags;
+ rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
+ }
+ vmx->rflags = rflags;
+ }
+ return vmx->rflags;
+}
+
+void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long old_rflags;
+
+ /*
+ * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU
+ * is an unrestricted guest in order to mark L2 as needing emulation
+ * if L1 runs L2 as a restricted guest.
+ */
+ if (is_unrestricted_guest(vcpu)) {
+ kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
+ vmx->rflags = rflags;
+ vmcs_writel(GUEST_RFLAGS, rflags);
+ return;
+ }
+
+ old_rflags = vmx_get_rflags(vcpu);
+ vmx->rflags = rflags;
+ if (vmx->rmode.vm86_active) {
+ vmx->rmode.save_rflags = rflags;
+ rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
+ }
+ vmcs_writel(GUEST_RFLAGS, rflags);
+
+ if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
+ vmx->emulation_required = vmx_emulation_required(vcpu);
+}
+
+static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
+{
+ return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
+}
+
+u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
+{
+ u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+ int ret = 0;
+
+ if (interruptibility & GUEST_INTR_STATE_STI)
+ ret |= KVM_X86_SHADOW_INT_STI;
+ if (interruptibility & GUEST_INTR_STATE_MOV_SS)
+ ret |= KVM_X86_SHADOW_INT_MOV_SS;
+
+ return ret;
+}
+
+void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+ u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+ u32 interruptibility = interruptibility_old;
+
+ interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
+
+ if (mask & KVM_X86_SHADOW_INT_MOV_SS)
+ interruptibility |= GUEST_INTR_STATE_MOV_SS;
+ else if (mask & KVM_X86_SHADOW_INT_STI)
+ interruptibility |= GUEST_INTR_STATE_STI;
+
+	if (interruptibility != interruptibility_old)
+ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
+}
+
+static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long value;
+
+	/*
+	 * Any MSR write that attempts to change bits marked reserved will
+	 * cause a #GP fault.
+	 */
+ if (data & vmx->pt_desc.ctl_bitmask)
+ return 1;
+
+ /*
+ * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
+ * result in a #GP unless the same write also clears TraceEn.
+ */
+ if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
+ ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
+ return 1;
+
+	/*
+	 * A WRMSR to IA32_RTIT_CTL that sets TraceEn but clears ToPA and
+	 * FabricEn causes a #GP if
+	 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0.
+	 */
+ if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
+ !(data & RTIT_CTL_FABRIC_EN) &&
+ !intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_single_range_output))
+ return 1;
+
+	/*
+	 * MTCFreq, CycThresh and PSBFreq encoding checks: any MSR write that
+	 * uses an encoding marked reserved will cause a #GP fault.
+	 */
+ value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
+ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
+ !test_bit((data & RTIT_CTL_MTC_RANGE) >>
+ RTIT_CTL_MTC_RANGE_OFFSET, &value))
+ return 1;
+ value = intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_cycle_thresholds);
+ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
+ !test_bit((data & RTIT_CTL_CYC_THRESH) >>
+ RTIT_CTL_CYC_THRESH_OFFSET, &value))
+ return 1;
+ value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
+ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
+ !test_bit((data & RTIT_CTL_PSB_FREQ) >>
+ RTIT_CTL_PSB_FREQ_OFFSET, &value))
+ return 1;
+
+	/*
+	 * A reserved ADDRx_CFG encoding (>2), or one that enables an address
+	 * range beyond what the CPU supports, causes a #GP fault.
+	 */
+ value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
+ if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
+ return 1;
+ value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
+ if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
+ return 1;
+ value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
+ if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
+ return 1;
+ value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
+ if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
+ return 1;
+
+ return 0;
+}
+
+static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
+{
+ /*
+ * Emulation of instructions in SGX enclaves is impossible as RIP does
+ * not point at the failing instruction, and even if it did, the code
+ * stream is inaccessible. Inject #UD instead of exiting to userspace
+ * so that guest userspace can't DoS the guest simply by triggering
+ * emulation (enclaves are CPL3 only).
+ */
+ if (to_vmx(vcpu)->exit_reason.enclave_mode) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return false;
+ }
+ return true;
+}
+
+static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
+ unsigned long rip, orig_rip;
+ u32 instr_len;
+
+ /*
+ * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
+ * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
+ * set when EPT misconfig occurs. In practice, real hardware updates
+ * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
+ * (namely Hyper-V) don't set it due to it being undefined behavior,
+ * i.e. we end up advancing IP with some random value.
+ */
+ if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
+ exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
+ instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+
+ /*
+ * Emulating an enclave's instructions isn't supported as KVM
+ * cannot access the enclave's memory or its true RIP, e.g. the
+ * vmcs.GUEST_RIP points at the exit point of the enclave, not
+ * the RIP that actually triggered the VM-Exit. But, because
+ * most instructions that cause VM-Exit will #UD in an enclave,
+ * most instruction-based VM-Exits simply do not occur.
+ *
+ * There are a few exceptions, notably the debug instructions
+ * INT1ICEBRK and INT3, as they are allowed in debug enclaves
+ * and generate #DB/#BP as expected, which KVM might intercept.
+ * But again, the CPU does the dirty work and saves an instr
+ * length of zero so VMMs don't shoot themselves in the foot.
+ * WARN if KVM tries to skip a non-zero length instruction on
+ * a VM-Exit from an enclave.
+ */
+ if (!instr_len)
+ goto rip_updated;
+
+ WARN_ONCE(exit_reason.enclave_mode,
+ "skipping instruction after SGX enclave VM-Exit");
+
+ orig_rip = kvm_rip_read(vcpu);
+ rip = orig_rip + instr_len;
+#ifdef CONFIG_X86_64
+ /*
+ * We need to mask out the high 32 bits of RIP if not in 64-bit
+ * mode, but just finding out that we are in 64-bit mode is
+ * quite expensive. Only do it if there was a carry.
+ */
+ if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
+ rip = (u32)rip;
+#endif
+ kvm_rip_write(vcpu, rip);
+ } else {
+ if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
+ return 0;
+ }
+
+rip_updated:
+ /* skipping an emulated instruction also counts */
+ vmx_set_interrupt_shadow(vcpu, 0);
+
+ return 1;
+}
+
+/*
+ * Recognizes a pending MTF VM-exit and records the nested state for later
+ * delivery.
+ */
+static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!is_guest_mode(vcpu))
+ return;
+
+ /*
+ * Per the SDM, MTF takes priority over debug-trap exceptions besides
+ * TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps
+ * or ICEBP (in the emulator proper), and skipping of ICEBP after an
+ * intercepted #DB deliberately avoids single-step #DB and MTF updates
+ * as ICEBP is higher priority than both. As instruction emulation is
+ * completed at this point (i.e. KVM is at the instruction boundary),
+ * any #DB exception pending delivery must be a debug-trap of lower
+ * priority than MTF. Record the pending MTF state to be delivered in
+ * vmx_check_nested_events().
+ */
+ if (nested_cpu_has_mtf(vmcs12) &&
+ (!vcpu->arch.exception.pending ||
+ vcpu->arch.exception.vector == DB_VECTOR) &&
+ (!vcpu->arch.exception_vmexit.pending ||
+ vcpu->arch.exception_vmexit.vector == DB_VECTOR)) {
+ vmx->nested.mtf_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ } else {
+ vmx->nested.mtf_pending = false;
+ }
+}
+
+static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ vmx_update_emulated_instruction(vcpu);
+ return skip_emulated_instruction(vcpu);
+}
+
+static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Ensure that we clear the HLT state in the VMCS. We don't need to
+ * explicitly skip the instruction because if the HLT state is set,
+ * then the instruction is already executing and RIP has already been
+ * advanced.
+ */
+ if (kvm_hlt_in_guest(vcpu->kvm) &&
+ vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
+ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+}
+
+static void vmx_inject_exception(struct kvm_vcpu *vcpu)
+{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception;
+ u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ kvm_deliver_exception_payload(vcpu, ex);
+
+ if (ex->has_error_code) {
+		/*
+		 * Despite the error code being architecturally defined as 32
+		 * bits, and the VMCS field being 32 bits, Intel CPUs and thus
+		 * VMX don't actually support setting bits 31:16. Hardware
+		 * will (should) never provide a bogus error code, but AMD CPUs
+		 * do generate error codes with bits 31:16 set, and so KVM's
+		 * ABI lets userspace shove in arbitrary 32-bit values. Drop
+		 * the upper bits to avoid VM-Fail; losing information that
+		 * doesn't really exist is preferable to killing the VM.
+		 */
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
+ intr_info |= INTR_INFO_DELIVER_CODE_MASK;
+ }
+
+ if (vmx->rmode.vm86_active) {
+ int inc_eip = 0;
+ if (kvm_exception_is_soft(ex->vector))
+ inc_eip = vcpu->arch.event_exit_inst_len;
+ kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip);
+ return;
+ }
+
+ WARN_ON_ONCE(vmx->emulation_required);
+
+ if (kvm_exception_is_soft(ex->vector)) {
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmx->vcpu.arch.event_exit_inst_len);
+ intr_info |= INTR_TYPE_SOFT_EXCEPTION;
+ } else
+ intr_info |= INTR_TYPE_HARD_EXCEPTION;
+
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
+
+ vmx_clear_hlt(vcpu);
+}
+
+static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
+ bool load_into_hardware)
+{
+ struct vmx_uret_msr *uret_msr;
+
+ uret_msr = vmx_find_uret_msr(vmx, msr);
+ if (!uret_msr)
+ return;
+
+ uret_msr->load_into_hardware = load_into_hardware;
+}
+
+/*
+ * Configure the user return MSRs to automatically save, load, and restore
+ * MSRs that need to be shoved into hardware when running the guest. Note,
+ * omitting an MSR here does _NOT_ mean it's not emulated, only that it will
+ * not be loaded into hardware when running the guest.
+ */
+static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
+{
+#ifdef CONFIG_X86_64
+ bool load_syscall_msrs;
+
+	/*
+	 * The SYSCALL MSRs are only needed for long mode guests, and only
+	 * when EFER.SCE is set.
+	 */
+ load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
+ (vmx->vcpu.arch.efer & EFER_SCE);
+
+ vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
+ vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
+ vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
+#endif
+ vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
+
+ vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
+ guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
+
+ /*
+ * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
+ * kernel and old userspace. If those guests run on a tsx=off host, do
+ * allow guests to use TSX_CTRL, but don't change the value in hardware
+ * so that TSX remains always disabled.
+ */
+ vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
+
+ /*
+ * The set of MSRs to load may have changed, reload MSRs before the
+ * next VM-Enter.
+ */
+ vmx->guest_uret_msrs_loaded = false;
+}
+
+u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))
+ return vmcs12->tsc_offset;
+
+ return 0;
+}
+
+u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) &&
+ nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
+ return vmcs12->tsc_multiplier;
+
+ return kvm_caps.default_tsc_scaling_ratio;
+}
+
+static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+}
+
+static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+ vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
+}
+
+/*
+ * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
+ * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain
+ * backwards compatibility even though KVM doesn't support emulating SMX. And
+ * because userspace may have set "VMX in SMX", the guest must also be allowed
+ * to set it, e.g. if the MSR is left unlocked and the guest does an RMW
+ * operation.
+ */
+#define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \
+ FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \
+ FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
+ FEAT_CTL_SGX_LC_ENABLED | \
+ FEAT_CTL_SGX_ENABLED | \
+ FEAT_CTL_LMCE_ENABLED)
+
+static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
+ struct msr_data *msr)
+{
+ uint64_t valid_bits;
+
+ /*
+ * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are
+ * exposed to the guest.
+ */
+ WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
+ ~KVM_SUPPORTED_FEATURE_CONTROL);
+
+ if (!msr->host_initiated &&
+ (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED))
+ return false;
+
+ if (msr->host_initiated)
+ valid_bits = KVM_SUPPORTED_FEATURE_CONTROL;
+ else
+ valid_bits = vmx->msr_ia32_feature_control_valid_bits;
+
+ return !(msr->data & ~valid_bits);
+}
+
+static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
+{
+ switch (msr->index) {
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
+ if (!nested)
+ return 1;
+ return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
+ default:
+ return KVM_MSR_RET_INVALID;
+ }
+}
+
+/*
+ * Reads an MSR value (of 'msr_info->index') into 'msr_info->data'.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmx_uret_msr *msr;
+ u32 index;
+
+ switch (msr_info->index) {
+#ifdef CONFIG_X86_64
+ case MSR_FS_BASE:
+ msr_info->data = vmcs_readl(GUEST_FS_BASE);
+ break;
+ case MSR_GS_BASE:
+ msr_info->data = vmcs_readl(GUEST_GS_BASE);
+ break;
+ case MSR_KERNEL_GS_BASE:
+ msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
+ break;
+#endif
+ case MSR_EFER:
+ return kvm_get_msr_common(vcpu, msr_info);
+ case MSR_IA32_TSX_CTRL:
+ if (!msr_info->host_initiated &&
+ !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
+ return 1;
+ goto find_uret_msr;
+ case MSR_IA32_UMWAIT_CONTROL:
+ if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
+ return 1;
+
+ msr_info->data = vmx->msr_ia32_umwait_control;
+ break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_has_spec_ctrl_msr(vcpu))
+ return 1;
+
+ msr_info->data = to_vmx(vcpu)->spec_ctrl;
+ break;
+ case MSR_IA32_SYSENTER_CS:
+ msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
+ break;
+ case MSR_IA32_BNDCFGS:
+ if (!kvm_mpx_supported() ||
+ (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
+ return 1;
+ msr_info->data = vmcs_read64(GUEST_BNDCFGS);
+ break;
+ case MSR_IA32_MCG_EXT_CTL:
+ if (!msr_info->host_initiated &&
+ !(vmx->msr_ia32_feature_control &
+ FEAT_CTL_LMCE_ENABLED))
+ return 1;
+ msr_info->data = vcpu->arch.mcg_ext_ctl;
+ break;
+ case MSR_IA32_FEAT_CTL:
+ msr_info->data = vmx->msr_ia32_feature_control;
+ break;
+ case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+ return 1;
+ msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
+ [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
+ break;
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
+ if (!guest_can_use(vcpu, X86_FEATURE_VMX))
+ return 1;
+ if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
+ &msr_info->data))
+ return 1;
+	/*
+	 * Enlightened VMCS v1 doesn't have certain VMCS fields, but
+	 * instead of just ignoring the missing features, different
+	 * Hyper-V versions either try to use them and fail or do some
+	 * sanity checking and refuse to boot. Filter out all
+	 * unsupported features.
+	 */
+ if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu))
+ nested_evmcs_filter_control_msr(vcpu, msr_info->index,
+ &msr_info->data);
+ break;
+ case MSR_IA32_RTIT_CTL:
+ if (!vmx_pt_mode_is_host_guest())
+ return 1;
+ msr_info->data = vmx->pt_desc.guest.ctl;
+ break;
+ case MSR_IA32_RTIT_STATUS:
+ if (!vmx_pt_mode_is_host_guest())
+ return 1;
+ msr_info->data = vmx->pt_desc.guest.status;
+ break;
+ case MSR_IA32_RTIT_CR3_MATCH:
+ if (!vmx_pt_mode_is_host_guest() ||
+ !intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_cr3_filtering))
+ return 1;
+ msr_info->data = vmx->pt_desc.guest.cr3_match;
+ break;
+ case MSR_IA32_RTIT_OUTPUT_BASE:
+ if (!vmx_pt_mode_is_host_guest() ||
+ (!intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_topa_output) &&
+ !intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_single_range_output)))
+ return 1;
+ msr_info->data = vmx->pt_desc.guest.output_base;
+ break;
+ case MSR_IA32_RTIT_OUTPUT_MASK:
+ if (!vmx_pt_mode_is_host_guest() ||
+ (!intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_topa_output) &&
+ !intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_single_range_output)))
+ return 1;
+ msr_info->data = vmx->pt_desc.guest.output_mask;
+ break;
+ case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
+ index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
+ if (!vmx_pt_mode_is_host_guest() ||
+ (index >= 2 * vmx->pt_desc.num_address_ranges))
+ return 1;
+ if (index % 2)
+ msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
+ else
+ msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
+ break;
+ case MSR_IA32_DEBUGCTLMSR:
+ msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ break;
+ default:
+ find_uret_msr:
+ msr = vmx_find_uret_msr(vmx, msr_info->index);
+ if (msr) {
+ msr_info->data = msr->data;
+ break;
+ }
+ return kvm_get_msr_common(vcpu, msr_info);
+ }
+
+ return 0;
+}
+
+static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
+ u64 data)
+{
+#ifdef CONFIG_X86_64
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ return (u32)data;
+#endif
+ return (unsigned long)data;
+}
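+
+/*
+ * E.g. a write of 0xffffffff12345678 for a vCPU without X86_FEATURE_LM is
+ * truncated to 0x12345678 by the helper above.
+ */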
+
+static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
+{
+ u64 debugctl = 0;
+
+ if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
+ (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
+ debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+
+ if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) &&
+ (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
+ debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+
+ return debugctl;
+}
+
+/*
+ * Writes an MSR value into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmx_uret_msr *msr;
+ int ret = 0;
+ u32 msr_index = msr_info->index;
+ u64 data = msr_info->data;
+ u32 index;
+
+ switch (msr_index) {
+ case MSR_EFER:
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_FS_BASE:
+ vmx_segment_cache_clear(vmx);
+ vmcs_writel(GUEST_FS_BASE, data);
+ break;
+ case MSR_GS_BASE:
+ vmx_segment_cache_clear(vmx);
+ vmcs_writel(GUEST_GS_BASE, data);
+ break;
+ case MSR_KERNEL_GS_BASE:
+ vmx_write_guest_kernel_gs_base(vmx, data);
+ break;
+ case MSR_IA32_XFD:
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ /*
+ * Always intercepting WRMSR could incur non-negligible
+ * overhead given that xfd may be changed frequently on
+ * guest context switches. Disable write interception
+ * upon the first write with a non-zero value (indicating
+ * potential use of dynamic xfeatures). Also update the
+ * exception bitmap to trap #NM for proper virtualization
+ * of guest xfd_err.
+ */
+ if (!ret && data) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
+ MSR_TYPE_RW);
+ vcpu->arch.xfd_no_write_intercept = true;
+ vmx_update_exception_bitmap(vcpu);
+ }
+ break;
+#endif
+ case MSR_IA32_SYSENTER_CS:
+ if (is_guest_mode(vcpu))
+ get_vmcs12(vcpu)->guest_sysenter_cs = data;
+ vmcs_write32(GUEST_SYSENTER_CS, data);
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ if (is_guest_mode(vcpu)) {
+ data = nested_vmx_truncate_sysenter_addr(vcpu, data);
+ get_vmcs12(vcpu)->guest_sysenter_eip = data;
+ }
+ vmcs_writel(GUEST_SYSENTER_EIP, data);
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ if (is_guest_mode(vcpu)) {
+ data = nested_vmx_truncate_sysenter_addr(vcpu, data);
+ get_vmcs12(vcpu)->guest_sysenter_esp = data;
+ }
+ vmcs_writel(GUEST_SYSENTER_ESP, data);
+ break;
+ case MSR_IA32_DEBUGCTLMSR: {
+ u64 invalid;
+
+ invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
+ if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
+ kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
+ data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+ invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+ }
+
+ if (invalid)
+ return 1;
+
+ if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
+ VM_EXIT_SAVE_DEBUG_CONTROLS)
+ get_vmcs12(vcpu)->guest_ia32_debugctl = data;
+
+ vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+ if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
+ (data & DEBUGCTLMSR_LBR))
+ intel_pmu_create_guest_lbr_event(vcpu);
+ return 0;
+ }
+ case MSR_IA32_BNDCFGS:
+ if (!kvm_mpx_supported() ||
+ (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
+ return 1;
+ if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
+ (data & MSR_IA32_BNDCFGS_RSVD))
+ return 1;
+
+ if (is_guest_mode(vcpu) &&
+ ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) ||
+ (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS)))
+ get_vmcs12(vcpu)->guest_bndcfgs = data;
+
+ vmcs_write64(GUEST_BNDCFGS, data);
+ break;
+ case MSR_IA32_UMWAIT_CONTROL:
+ if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
+ return 1;
+
+ /* Reserved bit 1 and the upper bits [63:32] must be zero. */
+ if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
+ return 1;
+
+ vmx->msr_ia32_umwait_control = data;
+ break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_has_spec_ctrl_msr(vcpu))
+ return 1;
+
+ if (kvm_spec_ctrl_test_value(data))
+ return 1;
+
+ vmx->spec_ctrl = data;
+ if (!data)
+ break;
+
+ /*
+ * For non-nested:
+ * When it's written (to non-zero) for the first time, pass
+ * it through.
+ *
+ * For nested:
+ * The handling of the MSR bitmap for L2 guests is done in
+ * nested_vmx_prepare_msr_bitmap. We should not touch the
+ * vmcs02.msr_bitmap here since it gets completely overwritten
+ * in the merging. We update the vmcs01 here for L1 as well
+ * since it will end up touching the MSR anyway now.
+ */
+ vmx_disable_intercept_for_msr(vcpu,
+ MSR_IA32_SPEC_CTRL,
+ MSR_TYPE_RW);
+ break;
+ case MSR_IA32_TSX_CTRL:
+ if (!msr_info->host_initiated &&
+ !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
+ return 1;
+ if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
+ return 1;
+ goto find_uret_msr;
+ case MSR_IA32_CR_PAT:
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ if (ret)
+ break;
+
+ if (is_guest_mode(vcpu) &&
+ get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
+ get_vmcs12(vcpu)->guest_ia32_pat = data;
+
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+ vmcs_write64(GUEST_IA32_PAT, data);
+ break;
+ case MSR_IA32_MCG_EXT_CTL:
+ if ((!msr_info->host_initiated &&
+ !(to_vmx(vcpu)->msr_ia32_feature_control &
+ FEAT_CTL_LMCE_ENABLED)) ||
+ (data & ~MCG_EXT_CTL_LMCE_EN))
+ return 1;
+ vcpu->arch.mcg_ext_ctl = data;
+ break;
+ case MSR_IA32_FEAT_CTL:
+ if (!is_vmx_feature_control_msr_valid(vmx, msr_info))
+ return 1;
+
+ vmx->msr_ia32_feature_control = data;
+ if (msr_info->host_initiated && data == 0)
+ vmx_leave_nested(vcpu);
+
+ /* SGX may be enabled/disabled by guest's firmware */
+ vmx_write_encls_bitmap(vcpu, NULL);
+ break;
+ case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+ /*
+ * On real hardware, the LE hash MSRs are writable before
+ * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
+ * at which point SGX related bits in IA32_FEATURE_CONTROL
+ * become writable.
+ *
+ * KVM does not emulate SGX activation for simplicity, so
+ * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
+ * is unlocked. This is technically not architectural
+ * behavior, but it's close enough.
+ */
+ if (!msr_info->host_initiated &&
+ (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
+ ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
+ !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
+ return 1;
+ vmx->msr_ia32_sgxlepubkeyhash
+ [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
+ break;
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
+ if (!msr_info->host_initiated)
+ return 1; /* they are read-only */
+ if (!guest_can_use(vcpu, X86_FEATURE_VMX))
+ return 1;
+ return vmx_set_vmx_msr(vcpu, msr_index, data);
+ case MSR_IA32_RTIT_CTL:
+ if (!vmx_pt_mode_is_host_guest() ||
+ vmx_rtit_ctl_check(vcpu, data) ||
+ vmx->nested.vmxon)
+ return 1;
+ vmcs_write64(GUEST_IA32_RTIT_CTL, data);
+ vmx->pt_desc.guest.ctl = data;
+ pt_update_intercept_for_msr(vcpu);
+ break;
+ case MSR_IA32_RTIT_STATUS:
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (data & MSR_IA32_RTIT_STATUS_MASK)
+ return 1;
+ vmx->pt_desc.guest.status = data;
+ break;
+ case MSR_IA32_RTIT_CR3_MATCH:
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (!intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_cr3_filtering))
+ return 1;
+ vmx->pt_desc.guest.cr3_match = data;
+ break;
+ case MSR_IA32_RTIT_OUTPUT_BASE:
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (!intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_topa_output) &&
+ !intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_single_range_output))
+ return 1;
+ if (!pt_output_base_valid(vcpu, data))
+ return 1;
+ vmx->pt_desc.guest.output_base = data;
+ break;
+ case MSR_IA32_RTIT_OUTPUT_MASK:
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (!intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_topa_output) &&
+ !intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_single_range_output))
+ return 1;
+ vmx->pt_desc.guest.output_mask = data;
+ break;
+ case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
+ if (index >= 2 * vmx->pt_desc.num_address_ranges)
+ return 1;
+ if (is_noncanonical_address(data, vcpu))
+ return 1;
+ if (index % 2)
+ vmx->pt_desc.guest.addr_b[index / 2] = data;
+ else
+ vmx->pt_desc.guest.addr_a[index / 2] = data;
+ break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ if (data && !vcpu_to_pmu(vcpu)->version)
+ return 1;
+ if (data & PMU_CAP_LBR_FMT) {
+ if ((data & PMU_CAP_LBR_FMT) !=
+ (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT))
+ return 1;
+ if (!cpuid_model_is_consistent(vcpu))
+ return 1;
+ }
+ if (data & PERF_CAP_PEBS_FORMAT) {
+ if ((data & PERF_CAP_PEBS_MASK) !=
+ (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
+ return 1;
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_DS))
+ return 1;
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64))
+ return 1;
+ if (!cpuid_model_is_consistent(vcpu))
+ return 1;
+ }
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ break;
+
+ default:
+ find_uret_msr:
+ msr = vmx_find_uret_msr(vmx, msr_index);
+ if (msr)
+ ret = vmx_set_guest_uret_msr(vmx, msr, data);
+ else
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ }
+
+ /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
+ if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
+ vmx_update_fb_clear_dis(vcpu, vmx);
+
+ return ret;
+}
+
+static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+{
+ unsigned long guest_owned_bits;
+
+ kvm_register_mark_available(vcpu, reg);
+
+ switch (reg) {
+ case VCPU_REGS_RSP:
+ vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
+ break;
+ case VCPU_REGS_RIP:
+ vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
+ break;
+ case VCPU_EXREG_PDPTR:
+ if (enable_ept)
+ ept_save_pdptrs(vcpu);
+ break;
+ case VCPU_EXREG_CR0:
+ guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
+
+ vcpu->arch.cr0 &= ~guest_owned_bits;
+ vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
+ break;
+ case VCPU_EXREG_CR3:
+ /*
+ * When intercepting CR3 loads, e.g. for shadowing paging, KVM's
+ * CR3 is loaded into hardware, not the guest's CR3.
+ */
+ if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ break;
+ case VCPU_EXREG_CR4:
+ guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
+
+ vcpu->arch.cr4 &= ~guest_owned_bits;
+ vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
+ break;
+ default:
+ KVM_BUG_ON(1, vcpu->kvm);
+ break;
+ }
+}
+
+/*
+ * There is no X86_FEATURE for SGX yet, so query CPUID directly
+ * instead of going through cpu_has(), to ensure KVM traps ENCLS
+ * whenever it's supported in hardware. It does not matter whether
+ * the host OS supports or has enabled SGX.
+ */
+static bool cpu_has_sgx(void)
+{
+ return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
+}
+
+/*
+ * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
+ * can't be used due to errata where VM Exit may incorrectly clear
+ * IA32_PERF_GLOBAL_CTRL[34:32]. Work around the errata by using the
+ * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
+ */
+static bool cpu_has_perf_global_ctrl_bug(void)
+{
+ if (boot_cpu_data.x86 == 0x6) {
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_NEHALEM_EP: /* AAK155 */
+ case INTEL_FAM6_NEHALEM: /* AAP115 */
+ case INTEL_FAM6_WESTMERE: /* AAT100 */
+ case INTEL_FAM6_WESTMERE_EP: /* BC86,AAY89,BD102 */
+ case INTEL_FAM6_NEHALEM_EX: /* BA97 */
+ return true;
+ default:
+ break;
+ }
+ }
+
+ return false;
+}
+
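+/*
+ * Reconcile the desired controls with the allowed-0/allowed-1 settings
+ * advertised by the VMX capability MSR: a 1 in the low 32 bits means the
+ * control must be 1, a 0 in the high 32 bits means the control must be 0.
+ * Fail if a required (min) control cannot be enabled.
+ */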
+static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
+{
+ u32 vmx_msr_low, vmx_msr_high;
+ u32 ctl = ctl_min | ctl_opt;
+
+ rdmsr(msr, vmx_msr_low, vmx_msr_high);
+
+ ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
+ ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
+
+ /* Ensure minimum (required) set of control bits are supported. */
+ if (ctl_min & ~ctl)
+ return -EIO;
+
+ *result = ctl;
+ return 0;
+}
+
+static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
+{
+ u64 allowed;
+
+ rdmsrl(msr, allowed);
+
+ return ctl_opt & allowed;
+}
+
+static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
+ struct vmx_capability *vmx_cap)
+{
+ u32 vmx_msr_low, vmx_msr_high;
+ u32 _pin_based_exec_control = 0;
+ u32 _cpu_based_exec_control = 0;
+ u32 _cpu_based_2nd_exec_control = 0;
+ u64 _cpu_based_3rd_exec_control = 0;
+ u32 _vmexit_control = 0;
+ u32 _vmentry_control = 0;
+ u64 misc_msr;
+ int i;
+
+ /*
+ * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory.
+ * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always
+ * intercepts writes to PAT and EFER, i.e. never enables those controls.
+ */
+ struct {
+ u32 entry_control;
+ u32 exit_control;
+ } const vmcs_entry_exit_pairs[] = {
+ { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL },
+ { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT },
+ { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER },
+ { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS },
+ { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL },
+ };
+
+ memset(vmcs_conf, 0, sizeof(*vmcs_conf));
+
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
+ KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
+ MSR_IA32_VMX_PROCBASED_CTLS,
+ &_cpu_based_exec_control))
+ return -EIO;
+ if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
+ KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
+ MSR_IA32_VMX_PROCBASED_CTLS2,
+ &_cpu_based_2nd_exec_control))
+ return -EIO;
+ }
+#ifndef CONFIG_X86_64
+ if (!(_cpu_based_2nd_exec_control &
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+ _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
+#endif
+
+ if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
+ _cpu_based_2nd_exec_control &= ~(
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+
+ rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
+ &vmx_cap->ept, &vmx_cap->vpid);
+
+ if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
+ vmx_cap->ept) {
+ pr_warn_once("EPT CAP should not exist if not support "
+ "1-setting enable EPT VM-execution control\n");
+
+ if (error_on_inconsistent_vmcs_config)
+ return -EIO;
+
+ vmx_cap->ept = 0;
+ }
+ if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
+ vmx_cap->vpid) {
+ pr_warn_once("VPID CAP should not exist if not support "
+ "1-setting enable VPID VM-execution control\n");
+
+ if (error_on_inconsistent_vmcs_config)
+ return -EIO;
+
+ vmx_cap->vpid = 0;
+ }
+
+ if (!cpu_has_sgx())
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING;
+
+ if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
+ _cpu_based_3rd_exec_control =
+ adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL,
+ MSR_IA32_VMX_PROCBASED_CTLS3);
+
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
+ KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
+ MSR_IA32_VMX_EXIT_CTLS,
+ &_vmexit_control))
+ return -EIO;
+
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
+ KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
+ MSR_IA32_VMX_PINBASED_CTLS,
+ &_pin_based_exec_control))
+ return -EIO;
+
+ if (cpu_has_broken_vmx_preemption_timer())
+ _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ if (!(_cpu_based_2nd_exec_control &
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
+ _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
+
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
+ KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
+ MSR_IA32_VMX_ENTRY_CTLS,
+ &_vmentry_control))
+ return -EIO;
+
+ for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) {
+ u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control;
+ u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control;
+
+ if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl))
+ continue;
+
+ pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n",
+ _vmentry_control & n_ctrl, _vmexit_control & x_ctrl);
+
+ if (error_on_inconsistent_vmcs_config)
+ return -EIO;
+
+ _vmentry_control &= ~n_ctrl;
+ _vmexit_control &= ~x_ctrl;
+ }
+
+ rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
+
+ /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
+ if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
+ return -EIO;
+
+#ifdef CONFIG_X86_64
+ /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
+ if (vmx_msr_high & (1u<<16))
+ return -EIO;
+#endif
+
+ /* Require Write-Back (WB) memory type for VMCS accesses. */
+ if (((vmx_msr_high >> 18) & 15) != 6)
+ return -EIO;
+
+ rdmsrl(MSR_IA32_VMX_MISC, misc_msr);
+
+ vmcs_conf->size = vmx_msr_high & 0x1fff;
+ vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
+
+ vmcs_conf->revision_id = vmx_msr_low;
+
+ vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
+ vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
+ vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
+ vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control;
+ vmcs_conf->vmexit_ctrl = _vmexit_control;
+ vmcs_conf->vmentry_ctrl = _vmentry_control;
+ vmcs_conf->misc = misc_msr;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (enlightened_vmcs)
+ evmcs_sanitize_exec_ctrls(vmcs_conf);
+#endif
+
+ return 0;
+}
+
+static bool __kvm_is_vmx_supported(void)
+{
+ int cpu = smp_processor_id();
+
+ if (!(cpuid_ecx(1) & feature_bit(VMX))) {
+ pr_err("VMX not supported by CPU %d\n", cpu);
+ return false;
+ }
+
+ if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
+ !this_cpu_has(X86_FEATURE_VMX)) {
+ pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
+ return false;
+ }
+
+ return true;
+}
+
+static bool kvm_is_vmx_supported(void)
+{
+ bool supported;
+
+ migrate_disable();
+ supported = __kvm_is_vmx_supported();
+ migrate_enable();
+
+ return supported;
+}
+
+static int vmx_check_processor_compat(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct vmcs_config vmcs_conf;
+ struct vmx_capability vmx_cap;
+
+ if (!__kvm_is_vmx_supported())
+ return -EIO;
+
+ if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
+ pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
+ return -EIO;
+ }
+ if (nested)
+ nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
+ if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) {
+ pr_err("Inconsistent VMCS config on CPU %d\n", cpu);
+ return -EIO;
+ }
+ return 0;
+}
+
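+/*
+ * Enable VMX operation on the current CPU.  VMXON can fault, e.g. if VMX
+ * is not enabled in MSR_IA32_FEAT_CTL; the fault is caught via the
+ * exception table so that CR4.VMXE can be unwound cleanly.
+ */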
+static int kvm_cpu_vmxon(u64 vmxon_pointer)
+{
+ u64 msr;
+
+ cr4_set_bits(X86_CR4_VMXE);
+
+ asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ : : [vmxon_pointer] "m"(vmxon_pointer)
+ : : fault);
+ return 0;
+
+fault:
+ WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
+ rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
+ cr4_clear_bits(X86_CR4_VMXE);
+
+ return -EFAULT;
+}
+
+static int vmx_hardware_enable(void)
+{
+ int cpu = raw_smp_processor_id();
+ u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+ int r;
+
+ if (cr4_read_shadow() & X86_CR4_VMXE)
+ return -EBUSY;
+
+ /*
+ * This can happen if we hot-added a CPU but failed to allocate
+ * the VP assist page for it.
+ */
+ if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu))
+ return -EFAULT;
+
+ intel_pt_handle_vmx(1);
+
+ r = kvm_cpu_vmxon(phys_addr);
+ if (r) {
+ intel_pt_handle_vmx(0);
+ return r;
+ }
+
+ if (enable_ept)
+ ept_sync_global();
+
+ return 0;
+}
+
+static void vmclear_local_loaded_vmcss(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct loaded_vmcs *v, *n;
+
+ list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
+ loaded_vmcss_on_cpu_link)
+ __loaded_vmcs_clear(v);
+}
+
+static void vmx_hardware_disable(void)
+{
+ vmclear_local_loaded_vmcss();
+
+ if (kvm_cpu_vmxoff())
+ kvm_spurious_fault();
+
+ hv_reset_evmcs();
+
+ intel_pt_handle_vmx(0);
+}
+
+struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
+{
+ int node = cpu_to_node(cpu);
+ struct page *pages;
+ struct vmcs *vmcs;
+
+ pages = __alloc_pages_node(node, flags, 0);
+ if (!pages)
+ return NULL;
+ vmcs = page_address(pages);
+ memset(vmcs, 0, vmcs_config.size);
+
+ /* KVM supports Enlightened VMCS v1 only */
+ if (kvm_is_using_evmcs())
+ vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
+ else
+ vmcs->hdr.revision_id = vmcs_config.revision_id;
+
+ if (shadow)
+ vmcs->hdr.shadow_vmcs = 1;
+ return vmcs;
+}
+
+void free_vmcs(struct vmcs *vmcs)
+{
+ free_page((unsigned long)vmcs);
+}
+
+/*
+ * Free a VMCS, but before that, VMCLEAR it on the CPU where it was last loaded.
+ */
+void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
+{
+ if (!loaded_vmcs->vmcs)
+ return;
+ loaded_vmcs_clear(loaded_vmcs);
+ free_vmcs(loaded_vmcs->vmcs);
+ loaded_vmcs->vmcs = NULL;
+ if (loaded_vmcs->msr_bitmap)
+ free_page((unsigned long)loaded_vmcs->msr_bitmap);
+ WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
+}
+
+int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
+{
+ loaded_vmcs->vmcs = alloc_vmcs(false);
+ if (!loaded_vmcs->vmcs)
+ return -ENOMEM;
+
+ vmcs_clear(loaded_vmcs->vmcs);
+
+ loaded_vmcs->shadow_vmcs = NULL;
+ loaded_vmcs->hv_timer_soft_disabled = false;
+ loaded_vmcs->cpu = -1;
+ loaded_vmcs->launched = 0;
+
+ if (cpu_has_vmx_msr_bitmap()) {
+ loaded_vmcs->msr_bitmap = (unsigned long *)
+ __get_free_page(GFP_KERNEL_ACCOUNT);
+ if (!loaded_vmcs->msr_bitmap)
+ goto out_vmcs;
+ memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
+ }
+
+ memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
+ memset(&loaded_vmcs->controls_shadow, 0,
+ sizeof(struct vmcs_controls_shadow));
+
+ return 0;
+
+out_vmcs:
+ free_loaded_vmcs(loaded_vmcs);
+ return -ENOMEM;
+}
+
+static void free_kvm_area(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ free_vmcs(per_cpu(vmxarea, cpu));
+ per_cpu(vmxarea, cpu) = NULL;
+ }
+}
+
+static __init int alloc_kvm_area(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct vmcs *vmcs;
+
+ vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
+ if (!vmcs) {
+ free_kvm_area();
+ return -ENOMEM;
+ }
+
+ /*
+ * When eVMCS is enabled, alloc_vmcs_cpu() sets
+ * vmcs->revision_id to KVM_EVMCS_VERSION instead of the
+ * revision_id reported by MSR_IA32_VMX_BASIC.
+ *
+ * However, even though not explicitly documented by the
+ * TLFS, the VMXArea passed as the VMXON argument should
+ * still be marked with the revision_id reported by the
+ * physical CPU.
+ */
+ if (kvm_is_using_evmcs())
+ vmcs->hdr.revision_id = vmcs_config.revision_id;
+
+ per_cpu(vmxarea, cpu) = vmcs;
+ }
+ return 0;
+}
+
+static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
+ struct kvm_segment *save)
+{
+ if (!emulate_invalid_guest_state) {
+ /*
+ * CS and SS RPL should be equal during guest entry according
+ * to the VMX spec, but in reality it is not always so. Since the
+ * vcpu is in the middle of the transition from real mode to
+ * protected mode, it is safe to assume that RPL 0 is a good
+ * default value.
+ */
+ if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
+ save->selector &= ~SEGMENT_RPL_MASK;
+ save->dpl = save->selector & SEGMENT_RPL_MASK;
+ save->s = 1;
+ }
+ __vmx_set_segment(vcpu, save, seg);
+}
+
+static void enter_pmode(struct kvm_vcpu *vcpu)
+{
+ unsigned long flags;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * Update the real mode segment cache. It may not be up to date if a
+ * segment register was written while the vcpu was in guest mode.
+ */
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
+
+ vmx->rmode.vm86_active = 0;
+
+ __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
+
+ flags = vmcs_readl(GUEST_RFLAGS);
+ flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
+ flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
+ vmcs_writel(GUEST_RFLAGS, flags);
+
+ vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
+ (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
+
+ vmx_update_exception_bitmap(vcpu);
+
+ fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
+ fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
+}
+
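+/*
+ * Massage a segment into VM86-compatible state: DPL 3, a read/write
+ * accessed data type (0x3) and, unless invalid guest state is emulated,
+ * a selector derived from the base (base >> 4) with a 64KiB limit so
+ * that VM-Entry's Real Mode checks pass.
+ */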
+static void fix_rmode_seg(int seg, struct kvm_segment *save)
+{
+ const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ struct kvm_segment var = *save;
+
+ var.dpl = 0x3;
+ if (seg == VCPU_SREG_CS)
+ var.type = 0x3;
+
+ if (!emulate_invalid_guest_state) {
+ var.selector = var.base >> 4;
+ var.base = var.base & 0xffff0;
+ var.limit = 0xffff;
+ var.g = 0;
+ var.db = 0;
+ var.present = 1;
+ var.s = 1;
+ var.l = 0;
+ var.unusable = 0;
+ var.type = 0x3;
+ var.avl = 0;
+ if (save->base & 0xf)
+ pr_warn_once("segment base is not paragraph aligned "
+ "when entering protected mode (seg=%d)", seg);
+ }
+
+ vmcs_write16(sf->selector, var.selector);
+ vmcs_writel(sf->base, var.base);
+ vmcs_write32(sf->limit, var.limit);
+ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
+}
+
+static void enter_rmode(struct kvm_vcpu *vcpu)
+{
+ unsigned long flags;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
+
+ /*
+ * KVM should never use VM86 to virtualize Real Mode when L2 is active,
+ * as using VM86 is unnecessary if unrestricted guest is enabled, and
+ * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0
+ * should VM-Fail and KVM should reject userspace attempts to stuff
+ * CR0.PG=0 when L2 is active.
+ */
+ WARN_ON_ONCE(is_guest_mode(vcpu));
+
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
+
+ vmx->rmode.vm86_active = 1;
+
+ vmx_segment_cache_clear(vmx);
+
+ vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
+ vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
+ vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
+
+ flags = vmcs_readl(GUEST_RFLAGS);
+ vmx->rmode.save_rflags = flags;
+
+ flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
+
+ vmcs_writel(GUEST_RFLAGS, flags);
+ vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
+ vmx_update_exception_bitmap(vcpu);
+
+ fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
+ fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
+ fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
+ fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
+ fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
+ fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
+}
+
+int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /* Nothing to do if hardware doesn't support EFER. */
+ if (!vmx_find_uret_msr(vmx, MSR_EFER))
+ return 0;
+
+ vcpu->arch.efer = efer;
+#ifdef CONFIG_X86_64
+ if (efer & EFER_LMA)
+ vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
+ else
+ vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
+#else
+ if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm))
+ return 1;
+#endif
+
+ vmx_setup_uret_msrs(vmx);
+ return 0;
+}
+
+#ifdef CONFIG_X86_64
+
+static void enter_lmode(struct kvm_vcpu *vcpu)
+{
+ u32 guest_tr_ar;
+
+ vmx_segment_cache_clear(to_vmx(vcpu));
+
+ guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
+ if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
+ pr_debug_ratelimited("%s: tss fixup for long mode. \n",
+ __func__);
+ vmcs_write32(GUEST_TR_AR_BYTES,
+ (guest_tr_ar & ~VMX_AR_TYPE_MASK)
+ | VMX_AR_TYPE_BUSY_64_TSS);
+ }
+ vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
+}
+
+static void exit_lmode(struct kvm_vcpu *vcpu)
+{
+ vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
+}
+
+#endif
+
+static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
+ * the CPU is not required to invalidate guest-physical mappings on
+ * VM-Entry, even if VPID is disabled. Guest-physical mappings are
+ * associated with the root EPT structure and not any particular VPID
+ * (INVVPID also isn't required to invalidate guest-physical mappings).
+ */
+ if (enable_ept) {
+ ept_sync_global();
+ } else if (enable_vpid) {
+ if (cpu_has_vmx_invvpid_global()) {
+ vpid_sync_vcpu_global();
+ } else {
+ vpid_sync_vcpu_single(vmx->vpid);
+ vpid_sync_vcpu_single(vmx->nested.vpid02);
+ }
+ }
+}
+
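+/*
+ * Return the VPID hardware is currently using: vpid02 when L2 is active,
+ * otherwise the vCPU's own vpid.
+ */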
+static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu))
+ return nested_get_vpid02(vcpu);
+ return to_vmx(vcpu)->vpid;
+}
+
+static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u64 root_hpa = mmu->root.hpa;
+
+ /* No flush required if the current context is invalid. */
+ if (!VALID_PAGE(root_hpa))
+ return;
+
+ if (enable_ept)
+ ept_sync_context(construct_eptp(vcpu, root_hpa,
+ mmu->root_role.level));
+ else
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
+}
+
+static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
+{
+ /*
+ * vpid_sync_vcpu_addr() is a nop if vpid==0; see the comment in
+ * vmx_flush_tlb_guest() for an explanation of why this is ok.
+ */
+ vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
+}
+
+static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
+{
+ /*
+ * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
+ * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are
+ * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
+ * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
+ * i.e. no explicit INVVPID is necessary.
+ */
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
+}
+
+void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+ if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
+ return;
+
+ if (is_pae_paging(vcpu)) {
+ vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
+ vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
+ vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
+ vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
+ }
+}
+
+void ept_save_pdptrs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+ if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
+ return;
+
+ mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
+ mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
+ mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
+ mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
+
+ kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR);
+}
+
+#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
+ CPU_BASED_CR3_STORE_EXITING)
+
+static bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ if (is_guest_mode(vcpu))
+ return nested_guest_cr0_valid(vcpu, cr0);
+
+ if (to_vmx(vcpu)->nested.vmxon)
+ return nested_host_cr0_valid(vcpu, cr0);
+
+ return true;
+}
+
+void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long hw_cr0, old_cr0_pg;
+ u32 tmp;
+
+ old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);
+
+ hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
+ if (enable_unrestricted_guest)
+ hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
+ else {
+ hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
+ if (!enable_ept)
+ hw_cr0 |= X86_CR0_WP;
+
+ if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
+ enter_pmode(vcpu);
+
+ if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
+ enter_rmode(vcpu);
+ }
+
+ vmcs_writel(CR0_READ_SHADOW, cr0);
+ vmcs_writel(GUEST_CR0, hw_cr0);
+ vcpu->arch.cr0 = cr0;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
+
+#ifdef CONFIG_X86_64
+ if (vcpu->arch.efer & EFER_LME) {
+ if (!old_cr0_pg && (cr0 & X86_CR0_PG))
+ enter_lmode(vcpu);
+ else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
+ exit_lmode(vcpu);
+ }
+#endif
+
+ if (enable_ept && !enable_unrestricted_guest) {
+ /*
+ * Ensure KVM has an up-to-date snapshot of the guest's CR3. If
+ * the below code _enables_ CR3 exiting, vmx_cache_reg() will
+ * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
+ * KVM's CR3 is installed.
+ */
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
+ vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
+
+ /*
+ * When running with EPT but not unrestricted guest, KVM must
+ * intercept CR3 accesses when paging is _disabled_. This is
+ * necessary because restricted guests can't actually run with
+ * paging disabled, and so KVM stuffs its own CR3 in order to
+ * run the guest with identity mapped page tables.
+ *
+ * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
+ * update, it may be stale with respect to CR3 interception,
+ * e.g. after nested VM-Enter.
+ *
+ * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
+ * stores to forward them to L1, even if KVM does not need to
+ * intercept them to preserve its identity mapped page tables.
+ */
+ if (!(cr0 & X86_CR0_PG)) {
+ exec_controls_setbit(vmx, CR3_EXITING_BITS);
+ } else if (!is_guest_mode(vcpu)) {
+ exec_controls_clearbit(vmx, CR3_EXITING_BITS);
+ } else {
+ tmp = exec_controls_get(vmx);
+ tmp &= ~CR3_EXITING_BITS;
+ tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
+ exec_controls_set(vmx, tmp);
+ }
+
+ /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
+ if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
+ vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
+
+ /*
+ * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
+ * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
+ */
+ if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+ }
+
+ /* depends on vcpu->arch.cr0 to be set to a new value */
+ vmx->emulation_required = vmx_emulation_required(vcpu);
+}
+
+static int vmx_get_max_ept_level(void)
+{
+ if (cpu_has_vmx_ept_5levels())
+ return 5;
+ return 4;
+}
+
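+/*
+ * Build an EPT pointer: write-back memtype, a 4- or 5-level page-walk
+ * length, the accessed/dirty flag enable when allowed, and the root HPA.
+ */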
+u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
+{
+ u64 eptp = VMX_EPTP_MT_WB;
+
+ eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
+
+ if (enable_ept_ad_bits &&
+ (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
+ eptp |= VMX_EPTP_AD_ENABLE_BIT;
+ eptp |= root_hpa;
+
+ return eptp;
+}
+
+static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+ int root_level)
+{
+ struct kvm *kvm = vcpu->kvm;
+ bool update_guest_cr3 = true;
+ unsigned long guest_cr3;
+ u64 eptp;
+
+ if (enable_ept) {
+ eptp = construct_eptp(vcpu, root_hpa, root_level);
+ vmcs_write64(EPT_POINTER, eptp);
+
+ hv_track_root_tdp(vcpu, root_hpa);
+
+ if (!enable_unrestricted_guest && !is_paging(vcpu))
+ guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
+ else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
+ guest_cr3 = vcpu->arch.cr3;
+ else /* vmcs.GUEST_CR3 is already up-to-date. */
+ update_guest_cr3 = false;
+ vmx_ept_load_pdptrs(vcpu);
+ } else {
+ guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);
+ }
+
+ if (update_guest_cr3)
+ vmcs_writel(GUEST_CR3, guest_cr3);
+}
+
+
+static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ /*
+ * We operate under the default treatment of SMM, so VMX cannot be
+ * enabled under SMM. Note, whether or not VMXE is allowed at all,
+ * i.e. is a reserved bit, is handled by common x86 code.
+ */
+ if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
+ return false;
+
+ if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
+ return false;
+
+ return true;
+}
+
+void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long old_cr4 = kvm_read_cr4(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long hw_cr4;
+
+ /*
+ * Pass through host's Machine Check Enable value to hw_cr4, which
+ * is in force while we are in guest mode. Do not let guests control
+ * this bit, even if host CR4.MCE == 0.
+ */
+ hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
+ if (enable_unrestricted_guest)
+ hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
+ else if (vmx->rmode.vm86_active)
+ hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
+ else
+ hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
+
+ if (vmx_umip_emulated()) {
+ if (cr4 & X86_CR4_UMIP) {
+ secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
+ hw_cr4 &= ~X86_CR4_UMIP;
+ } else if (!is_guest_mode(vcpu) ||
+ !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
+ secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
+ }
+ }
+
+ vcpu->arch.cr4 = cr4;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
+
+ if (!enable_unrestricted_guest) {
+ if (enable_ept) {
+ if (!is_paging(vcpu)) {
+ hw_cr4 &= ~X86_CR4_PAE;
+ hw_cr4 |= X86_CR4_PSE;
+ } else if (!(cr4 & X86_CR4_PAE)) {
+ hw_cr4 &= ~X86_CR4_PAE;
+ }
+ }
+
+ /*
+ * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
+ * hardware. To emulate this behavior, SMEP/SMAP/PKU needs
+ * to be manually disabled when guest switches to non-paging
+ * mode.
+ *
+ * If !enable_unrestricted_guest, the CPU is always running
+ * with CR0.PG=1 and CR4 needs to be modified.
+ * If enable_unrestricted_guest, the CPU automatically
+ * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
+ */
+ if (!is_paging(vcpu))
+ hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
+ }
+
+ vmcs_writel(CR4_READ_SHADOW, cr4);
+ vmcs_writel(GUEST_CR4, hw_cr4);
+
+ if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
+ kvm_update_cpuid_runtime(vcpu);
+}
+
+void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 ar;
+
+ if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
+ *var = vmx->rmode.segs[seg];
+ if (seg == VCPU_SREG_TR
+ || var->selector == vmx_read_guest_seg_selector(vmx, seg))
+ return;
+ var->base = vmx_read_guest_seg_base(vmx, seg);
+ var->selector = vmx_read_guest_seg_selector(vmx, seg);
+ return;
+ }
+ var->base = vmx_read_guest_seg_base(vmx, seg);
+ var->limit = vmx_read_guest_seg_limit(vmx, seg);
+ var->selector = vmx_read_guest_seg_selector(vmx, seg);
+ ar = vmx_read_guest_seg_ar(vmx, seg);
+ var->unusable = (ar >> 16) & 1;
+ var->type = ar & 15;
+ var->s = (ar >> 4) & 1;
+ var->dpl = (ar >> 5) & 3;
+ /*
+ * Some userspaces do not preserve the unusable property. Since a
+ * usable segment has to be present according to the VMX spec, we can
+ * use the present property to work around the userspace bug by making
+ * an unusable segment always nonpresent. vmx_segment_access_rights()
+ * already marks a nonpresent segment as unusable.
+ */
+ var->present = !var->unusable;
+ var->avl = (ar >> 12) & 1;
+ var->l = (ar >> 13) & 1;
+ var->db = (ar >> 14) & 1;
+ var->g = (ar >> 15) & 1;
+}
+
+static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_segment s;
+
+ if (to_vmx(vcpu)->rmode.vm86_active) {
+ vmx_get_segment(vcpu, &s, seg);
+ return s.base;
+ }
+ return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
+}
+
+int vmx_get_cpl(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (unlikely(vmx->rmode.vm86_active))
+ return 0;
+ else {
+ int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
+ return VMX_AR_DPL(ar);
+ }
+}
+
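+/*
+ * Pack a kvm_segment into the VMCS access-rights format: type[3:0], S[4],
+ * DPL[6:5], P[7], AVL[12], L[13], D/B[14], G[15], and unusable[16] (also
+ * set for nonpresent segments).
+ */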
+static u32 vmx_segment_access_rights(struct kvm_segment *var)
+{
+ u32 ar;
+
+ ar = var->type & 15;
+ ar |= (var->s & 1) << 4;
+ ar |= (var->dpl & 3) << 5;
+ ar |= (var->present & 1) << 7;
+ ar |= (var->avl & 1) << 12;
+ ar |= (var->l & 1) << 13;
+ ar |= (var->db & 1) << 14;
+ ar |= (var->g & 1) << 15;
+ ar |= (var->unusable || !var->present) << 16;
+
+ return ar;
+}
+
+void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+
+ vmx_segment_cache_clear(vmx);
+
+ if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
+ vmx->rmode.segs[seg] = *var;
+ if (seg == VCPU_SREG_TR)
+ vmcs_write16(sf->selector, var->selector);
+ else if (var->s)
+ fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
+ return;
+ }
+
+ vmcs_writel(sf->base, var->base);
+ vmcs_write32(sf->limit, var->limit);
+ vmcs_write16(sf->selector, var->selector);
+
+ /*
+ * Fix the "Accessed" bit in AR field of segment registers for older
+ * qemu binaries.
+ * IA32 arch specifies that at the time of processor reset the
+ * "Accessed" bit in the AR field of segment registers is 1. And qemu
+ * is setting it to 0 in the userland code. This causes invalid guest
+ * state vmexit when "unrestricted guest" mode is turned on.
+ * Fix for this setup issue in cpu_reset is being pushed in the qemu
+ * tree. Newer qemu binaries with that qemu fix would not need this
+ * kvm hack.
+ */
+ if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
+ var->type |= 0x1; /* Accessed */
+
+ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
+}
+
+static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+{
+ __vmx_set_segment(vcpu, var, seg);
+
+ to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
+}
+
+static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+ u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
+
+ *db = (ar >> 14) & 1;
+ *l = (ar >> 13) & 1;
+}
+
+static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
+ dt->address = vmcs_readl(GUEST_IDTR_BASE);
+}
+
+static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
+ vmcs_writel(GUEST_IDTR_BASE, dt->address);
+}
+
+static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
+ dt->address = vmcs_readl(GUEST_GDTR_BASE);
+}
+
+static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
+ vmcs_writel(GUEST_GDTR_BASE, dt->address);
+}
+
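+/*
+ * Check whether a segment matches the fixed state Real Mode demands under
+ * VM86: base == selector << 4, a 64KiB limit, and access rights of 0xf3
+ * (present, DPL 3, read/write accessed data).
+ */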
+static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_segment var;
+ u32 ar;
+
+ vmx_get_segment(vcpu, &var, seg);
+ var.dpl = 0x3;
+ if (seg == VCPU_SREG_CS)
+ var.type = 0x3;
+ ar = vmx_segment_access_rights(&var);
+
+ if (var.base != (var.selector << 4))
+ return false;
+ if (var.limit != 0xffff)
+ return false;
+ if (ar != 0xf3)
+ return false;
+
+ return true;
+}
+
+static bool code_segment_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs;
+ unsigned int cs_rpl;
+
+ vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ cs_rpl = cs.selector & SEGMENT_RPL_MASK;
+
+ if (cs.unusable)
+ return false;
+ if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
+ return false;
+ if (!cs.s)
+ return false;
+ if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
+ if (cs.dpl > cs_rpl)
+ return false;
+ } else {
+ if (cs.dpl != cs_rpl)
+ return false;
+ }
+ if (!cs.present)
+ return false;
+
+ /* TODO: Add a reserved-field check; this will require a new member in the kvm_segment_field structure. */
+ return true;
+}
+
+static bool stack_segment_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment ss;
+ unsigned int ss_rpl;
+
+ vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
+ ss_rpl = ss.selector & SEGMENT_RPL_MASK;
+
+ if (ss.unusable)
+ return true;
+ if (ss.type != 3 && ss.type != 7)
+ return false;
+ if (!ss.s)
+ return false;
+ if (ss.dpl != ss_rpl) /* DPL != RPL */
+ return false;
+ if (!ss.present)
+ return false;
+
+ return true;
+}
+
+static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_segment var;
+ unsigned int rpl;
+
+ vmx_get_segment(vcpu, &var, seg);
+ rpl = var.selector & SEGMENT_RPL_MASK;
+
+ if (var.unusable)
+ return true;
+ if (!var.s)
+ return false;
+ if (!var.present)
+ return false;
+ if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
+ if (var.dpl < rpl) /* DPL < RPL */
+ return false;
+ }
+
+ /*
+ * TODO: Add other members to kvm_segment_field to allow checking
+ * for other access rights flags.
+ */
+ return true;
+}
+
+static bool tr_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment tr;
+
+ vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
+
+ if (tr.unusable)
+ return false;
+ if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */
+ return false;
+ if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
+ return false;
+ if (!tr.present)
+ return false;
+
+ return true;
+}
+
+static bool ldtr_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment ldtr;
+
+ vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
+
+ if (ldtr.unusable)
+ return true;
+ if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */
+ return false;
+ if (ldtr.type != 2)
+ return false;
+ if (!ldtr.present)
+ return false;
+
+ return true;
+}
+
+static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs, ss;
+
+ vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
+
+ return ((cs.selector & SEGMENT_RPL_MASK) ==
+ (ss.selector & SEGMENT_RPL_MASK));
+}
+
+/*
+ * Check if guest state is valid. Returns true if valid, false if not.
+ * We assume that registers are always usable.
+ */
+bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
+{
+ /* real mode guest state checks */
+ if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
+ return false;
+ } else {
+ /* protected mode guest state checks */
+ if (!cs_ss_rpl_check(vcpu))
+ return false;
+ if (!code_segment_valid(vcpu))
+ return false;
+ if (!stack_segment_valid(vcpu))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_DS))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_ES))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_FS))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_GS))
+ return false;
+ if (!tr_valid(vcpu))
+ return false;
+ if (!ldtr_valid(vcpu))
+ return false;
+ }
+ /* TODO:
+ * - Add checks on RIP
+ * - Add checks on RFLAGS
+ */
+
+ return true;
+}
+
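+/*
+ * Populate the real-mode TSS: zero all three pages, point the I/O bitmap
+ * base just past the interrupt redirection map, and write the mandatory
+ * trailing 0xff byte that terminates the I/O bitmap.
+ */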
+static int init_rmode_tss(struct kvm *kvm, void __user *ua)
+{
+ const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
+ u16 data;
+ int i;
+
+ for (i = 0; i < 3; i++) {
+ if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE))
+ return -EFAULT;
+ }
+
+ data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
+ if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16)))
+ return -EFAULT;
+
+ data = ~0;
+ if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int init_rmode_identity_map(struct kvm *kvm)
+{
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
+ int i, r = 0;
+ void __user *uaddr;
+ u32 tmp;
+
+ /* Protect kvm_vmx->ept_identity_pagetable_done. */
+ mutex_lock(&kvm->slots_lock);
+
+ if (likely(kvm_vmx->ept_identity_pagetable_done))
+ goto out;
+
+ if (!kvm_vmx->ept_identity_map_addr)
+ kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
+
+ uaddr = __x86_set_memory_region(kvm,
+ IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
+ kvm_vmx->ept_identity_map_addr,
+ PAGE_SIZE);
+ if (IS_ERR(uaddr)) {
+ r = PTR_ERR(uaddr);
+ goto out;
+ }
+
+ /*
+ * Set up the identity-mapping pagetable used when EPT is enabled but
+ * the guest runs in real mode: 1024 4MiB PSE entries mapping 0-4GiB,
+ * so guest-physical == guest-virtual.
+ */
+ for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) {
+ tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
+ _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
+ if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
+ r = -EFAULT;
+ goto out;
+ }
+ }
+ kvm_vmx->ept_identity_pagetable_done = true;
+
+out:
+ mutex_unlock(&kvm->slots_lock);
+ return r;
+}
+
+static void seg_setup(int seg)
+{
+ const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ unsigned int ar;
+
+ vmcs_write16(sf->selector, 0);
+ vmcs_writel(sf->base, 0);
+ vmcs_write32(sf->limit, 0xffff);
+ ar = 0x93;
+ if (seg == VCPU_SREG_CS)
+ ar |= 0x08; /* code segment */
+
+ vmcs_write32(sf->ar_bytes, ar);
+}
+
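+/*
+ * Allocate a VPID from the global bitmap.  VPID 0 is reserved for the
+ * host, so 0 doubles as "no VPID allocated".
+ */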
+int allocate_vpid(void)
+{
+ int vpid;
+
+ if (!enable_vpid)
+ return 0;
+ spin_lock(&vmx_vpid_lock);
+ vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
+ if (vpid < VMX_NR_VPIDS)
+ __set_bit(vpid, vmx_vpid_bitmap);
+ else
+ vpid = 0;
+ spin_unlock(&vmx_vpid_lock);
+ return vpid;
+}
+
+void free_vpid(int vpid)
+{
+ if (!enable_vpid || vpid == 0)
+ return;
+ spin_lock(&vmx_vpid_lock);
+ __clear_bit(vpid, vmx_vpid_bitmap);
+ spin_unlock(&vmx_vpid_lock);
+}
+
+static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
+{
+ /*
+ * When KVM is a nested hypervisor on top of Hyper-V and uses the
+ * 'Enlightened MSR Bitmap' feature, L0 needs to know that the MSR
+ * bitmap has changed.
+ */
+ if (kvm_is_using_evmcs()) {
+ struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+ if (evmcs->hv_enlightenments_control.msr_bitmap)
+ evmcs->hv_clean_fields &=
+ ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
+ }
+
+ vmx->nested.force_msr_bitmap_recalc = true;
+}
+
+void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+
+ if (!cpu_has_vmx_msr_bitmap())
+ return;
+
+ vmx_msr_bitmap_l01_changed(vmx);
+
+ /*
+ * Mark the desired intercept state in the shadow bitmap; this is
+ * needed for resync when the MSR filters change.
+ */
+ if (is_valid_passthrough_msr(msr)) {
+ int idx = possible_passthrough_msr_slot(msr);
+
+ if (idx != -ENOENT) {
+ if (type & MSR_TYPE_R)
+ clear_bit(idx, vmx->shadow_msr_intercept.read);
+ if (type & MSR_TYPE_W)
+ clear_bit(idx, vmx->shadow_msr_intercept.write);
+ }
+ }
+
+ if ((type & MSR_TYPE_R) &&
+ !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
+ vmx_set_msr_bitmap_read(msr_bitmap, msr);
+ type &= ~MSR_TYPE_R;
+ }
+
+ if ((type & MSR_TYPE_W) &&
+ !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
+ vmx_set_msr_bitmap_write(msr_bitmap, msr);
+ type &= ~MSR_TYPE_W;
+ }
+
+ if (type & MSR_TYPE_R)
+ vmx_clear_msr_bitmap_read(msr_bitmap, msr);
+
+ if (type & MSR_TYPE_W)
+ vmx_clear_msr_bitmap_write(msr_bitmap, msr);
+}
+
+void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+
+ if (!cpu_has_vmx_msr_bitmap())
+ return;
+
+ vmx_msr_bitmap_l01_changed(vmx);
+
+ /*
+ * Mark the desired intercept state in the shadow bitmap; this is
+ * needed for resync when the MSR filter changes.
+ */
+ if (is_valid_passthrough_msr(msr)) {
+ int idx = possible_passthrough_msr_slot(msr);
+
+ if (idx != -ENOENT) {
+ if (type & MSR_TYPE_R)
+ set_bit(idx, vmx->shadow_msr_intercept.read);
+ if (type & MSR_TYPE_W)
+ set_bit(idx, vmx->shadow_msr_intercept.write);
+ }
+ }
+
+ if (type & MSR_TYPE_R)
+ vmx_set_msr_bitmap_read(msr_bitmap, msr);
+
+ if (type & MSR_TYPE_W)
+ vmx_set_msr_bitmap_write(msr_bitmap, msr);
+}
+
+static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
+{
+ /*
+ * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves
+ * of the MSR bitmap. KVM emulates APIC registers up through 0x3f0,
+ * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits.
+ */
+ const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG;
+ const int write_idx = read_idx + (0x800 / sizeof(u64));
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap;
+ u8 mode;
+
+ if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu)))
+ return;
+
+ if (cpu_has_secondary_exec_ctrls() &&
+ (secondary_exec_controls_get(vmx) &
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
+ mode = MSR_BITMAP_MODE_X2APIC;
+ if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
+ mode |= MSR_BITMAP_MODE_X2APIC_APICV;
+ } else {
+ mode = 0;
+ }
+
+ if (mode == vmx->x2apic_msr_bitmap_mode)
+ return;
+
+ vmx->x2apic_msr_bitmap_mode = mode;
+
+ /*
+ * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended
+ * registers (0x840 and above) intercepted, as KVM doesn't support them.
+ * Intercept all writes by default and poke holes as needed. Pass
+ * through reads for all valid registers by default in x2APIC+APICv
+ * mode; only the current timer count needs on-demand emulation by KVM.
+ */
+ if (mode & MSR_BITMAP_MODE_X2APIC_APICV)
+ msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic);
+ else
+ msr_bitmap[read_idx] = ~0ull;
+ msr_bitmap[write_idx] = ~0ull;
+
+ /*
+ * TPR reads and writes can be virtualized even if virtual interrupt
+ * delivery is not in use.
+ */
+ vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
+ !(mode & MSR_BITMAP_MODE_X2APIC));
+
+ if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
+ vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
+ if (enable_ipiv)
+ vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW);
+ }
+}
+
+void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
+ u32 i;
+
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
+ for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) {
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
+ }
+}
+
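+/*
+ * Mirror hardware's interrupt recognition rule for L2's virtual APIC: an
+ * interrupt is pending if the priority class of RVI (bits 7:4) exceeds
+ * that of the virtual PPR in L1's virtual-APIC page.
+ */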
+static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ void *vapic_page;
+ u32 vppr;
+ int rvi;
+
+ if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
+ !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
+ WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
+ return false;
+
+ rvi = vmx_get_rvi();
+
+ vapic_page = vmx->nested.virtual_apic_map.hva;
+ vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
+
+ return ((rvi & 0xf0) > (vppr & 0xf0));
+}
+
+static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 i;
+
+ /*
+ * Redo intercept permissions for MSRs that KVM is passing through to
+ * the guest. Disabling interception will check the new MSR filter and
+ * ensure that KVM enables interception if userspace wants to filter
+ * the MSR. MSRs that KVM is already intercepting don't need to be
+ * refreshed since KVM is going to intercept them regardless of what
+ * userspace wants.
+ */
+ for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
+ u32 msr = vmx_possible_passthrough_msrs[i];
+
+ if (!test_bit(i, vmx->shadow_msr_intercept.read))
+ vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R);
+
+ if (!test_bit(i, vmx->shadow_msr_intercept.write))
+ vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W);
+ }
+
+ /* PT MSRs can be passed through iff PT is exposed to the guest. */
+ if (vmx_pt_mode_is_host_guest())
+ pt_update_intercept_for_msr(vcpu);
+}
+
+static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
+ int pi_vec)
+{
+#ifdef CONFIG_SMP
+ if (vcpu->mode == IN_GUEST_MODE) {
+ /*
+ * The vector of the virtual interrupt has already been set in the PIR.
+ * Send a notification event to deliver the virtual interrupt
+ * unless the vCPU is the currently running vCPU, i.e. the
+ * event is being sent from a fastpath VM-Exit handler, in
+ * which case the PIR will be synced to the vIRR before
+ * re-entering the guest.
+ *
+ * When the target is not the running vCPU, the following
+ * possibilities emerge:
+ *
+ * Case 1: vCPU stays in non-root mode. Sending a notification
+ * event posts the interrupt to the vCPU.
+ *
+ * Case 2: vCPU exits to root mode and is still runnable. The
+ * PIR will be synced to the vIRR before re-entering the guest.
+ * Sending a notification event is ok as the host IRQ handler
+ * will ignore the spurious event.
+ *
+ * Case 3: vCPU exits to root mode and is blocked. vcpu_block()
+ * has already synced PIR to vIRR and never blocks the vCPU if
+ * the vIRR is not empty. Therefore, a blocked vCPU here does
+ * not wait for any requested interrupts in PIR, and sending a
+ * notification event also results in a benign, spurious event.
+ */
+
+ if (vcpu != kvm_get_running_vcpu())
+ __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
+ return;
+ }
+#endif
+ /*
+ * The vCPU isn't in the guest; wake the vCPU in case it is blocking,
+ * otherwise do nothing as KVM will grab the highest priority pending
+ * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
+ */
+ kvm_vcpu_wake_up(vcpu);
+}
+
+static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
+ int vector)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (is_guest_mode(vcpu) &&
+ vector == vmx->nested.posted_intr_nv) {
+ /*
+ * If the posted interrupt is not recognized by hardware,
+ * it will be delivered on the next VM-Entry.
+ */
+ vmx->nested.pi_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
+ * This pairs with the smp_mb_*() after setting vcpu->mode in
+ * vcpu_enter_guest() to guarantee the vCPU sees the event
+ * request if triggering a posted interrupt "fails" because
+ * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as
+ * the smp_wmb() in kvm_make_request() only ensures everything
+ * done before making the request is visible when the request
+ * is visible, it doesn't ensure ordering between the store to
+ * vcpu->requests and the load from vcpu->mode.
+ */
+ smp_mb__after_atomic();
+
+ /* the PIR and ON have been set by L1. */
+ kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
+ return 0;
+ }
+ return -1;
+}
+
+/*
+ * Send an interrupt to the vcpu via posted interrupt:
+ * 1. If the target vcpu is running (non-root mode), send a posted interrupt
+ *    notification and hardware will sync the PIR to the vIRR atomically.
+ * 2. If the target vcpu isn't running (root mode), kick it to pick up the
+ *    interrupt from the PIR on the next vmentry.
+ */
+static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int r;
+
+ r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
+ if (!r)
+ return 0;
+
+ /* Note, this is called iff the local APIC is in-kernel. */
+ if (!vcpu->arch.apic->apicv_active)
+ return -1;
+
+ if (pi_test_and_set_pir(vector, &vmx->pi_desc))
+ return 0;
+
+ /* If a previous notification has sent the IPI, nothing to do. */
+ if (pi_test_and_set_on(&vmx->pi_desc))
+ return 0;
+
+ /*
+ * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*()
+ * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is
+ * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
+ * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
+ */
+ kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
+ return 0;
+}
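+
+/*
+ * A rough sketch of the posted-interrupt descriptor manipulated above,
+ * as laid out in posted_intr.h: a 256-bit Posted Interrupt Requests
+ * bitmap followed by control bits, including ON ("outstanding
+ * notification"):
+ *
+ *	struct pi_desc {
+ *		u32 pir[8];	(one bit per interrupt vector)
+ *		...		(control bits: ON, SN, NV, NDST)
+ *	};
+ *
+ * pi_test_and_set_pir() atomically sets bit 'vector' in the PIR and
+ * pi_test_and_set_on() atomically sets ON, making this path safe against
+ * the CPU consuming the descriptor concurrently.
+ */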
+
+static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
+{
+ struct kvm_vcpu *vcpu = apic->vcpu;
+
+ if (vmx_deliver_posted_interrupt(vcpu, vector)) {
+ kvm_lapic_set_irr(vector, apic);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ } else {
+ trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
+ trig_mode, vector);
+ }
+}
+
+/*
+ * Set up the vmcs's constant host-state fields, i.e., host-state fields that
+ * will not change in the lifetime of the guest.
+ * Note that host-state that does change is set elsewhere. E.g., host-state
+ * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
+ */
+void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
+{
+ u32 low32, high32;
+ unsigned long tmpl;
+ unsigned long cr0, cr3, cr4;
+
+ cr0 = read_cr0();
+ WARN_ON(cr0 & X86_CR0_TS);
+ vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
+
+ /*
+ * Save the most likely value for this task's CR3 in the VMCS.
+ * We can't use __get_current_cr3_fast() because we're not atomic.
+ */
+ cr3 = __read_cr3();
+ vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
+ vmx->loaded_vmcs->host_state.cr3 = cr3;
+
+ /* Save the most likely value for this task's CR4 in the VMCS. */
+ cr4 = cr4_read_shadow();
+ vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
+ vmx->loaded_vmcs->host_state.cr4 = cr4;
+
+ vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
+#ifdef CONFIG_X86_64
+ /*
+ * Load null selectors, so we can avoid reloading them in
+ * vmx_prepare_switch_to_host(), in case userspace uses
+ * the null selectors too (the expected case).
+ */
+ vmcs_write16(HOST_DS_SELECTOR, 0);
+ vmcs_write16(HOST_ES_SELECTOR, 0);
+#else
+ vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+#endif
+ vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+ vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
+
+ vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */
+
+ vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */
+
+ rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
+ vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
+
+ /*
+ * SYSENTER is used for 32-bit system calls on either 32-bit or
+	 * 64-bit kernels. It is always zero if neither is allowed, otherwise
+ * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
+ * have already done so!).
+ */
+ if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
+ vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
+
+ rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
+ vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
+
+ if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
+ rdmsr(MSR_IA32_CR_PAT, low32, high32);
+ vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
+ }
+
+ if (cpu_has_load_ia32_efer())
+ vmcs_write64(HOST_IA32_EFER, host_efer);
+}
+
+void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
+{
+ struct kvm_vcpu *vcpu = &vmx->vcpu;
+
+ vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
+ ~vcpu->arch.cr4_guest_rsvd_bits;
+ if (!enable_ept) {
+ vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS;
+ vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS;
+ }
+ if (is_guest_mode(&vmx->vcpu))
+ vcpu->arch.cr4_guest_owned_bits &=
+ ~get_vmcs12(vcpu)->cr4_guest_host_mask;
+ vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
+}
+
+static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
+{
+ u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
+
+ if (!kvm_vcpu_apicv_active(&vmx->vcpu))
+ pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+
+ if (!enable_vnmi)
+ pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
+
+ if (!enable_preemption_timer)
+ pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+
+ return pin_based_exec_ctrl;
+}
+
+static u32 vmx_vmentry_ctrl(void)
+{
+ u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
+
+ if (vmx_pt_mode_is_system())
+ vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
+ VM_ENTRY_LOAD_IA32_RTIT_CTL);
+ /*
+ * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically.
+ */
+ vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
+ VM_ENTRY_LOAD_IA32_EFER |
+ VM_ENTRY_IA32E_MODE);
+
+ if (cpu_has_perf_global_ctrl_bug())
+ vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+
+ return vmentry_ctrl;
+}
+
+static u32 vmx_vmexit_ctrl(void)
+{
+ u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
+
+ /*
+ * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for
+ * nested virtualization and thus allowed to be set in vmcs12.
+ */
+ vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER |
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
+
+ if (vmx_pt_mode_is_system())
+ vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
+ VM_EXIT_CLEAR_IA32_RTIT_CTL);
+
+ if (cpu_has_perf_global_ctrl_bug())
+ vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+
+ /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
+ return vmexit_ctrl &
+ ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
+}
+
+static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (is_guest_mode(vcpu)) {
+ vmx->nested.update_vmcs01_apicv_status = true;
+ return;
+ }
+
+ pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
+
+ if (kvm_vcpu_apicv_active(vcpu)) {
+ secondary_exec_controls_setbit(vmx,
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+ if (enable_ipiv)
+ tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT);
+ } else {
+ secondary_exec_controls_clearbit(vmx,
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+ if (enable_ipiv)
+ tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT);
+ }
+
+ vmx_update_msr_bitmap_x2apic(vcpu);
+}
+
+static u32 vmx_exec_control(struct vcpu_vmx *vmx)
+{
+ u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
+
+ /*
+ * Not used by KVM, but fully supported for nesting, i.e. are allowed in
+ * vmcs12 and propagated to vmcs02 when set in vmcs12.
+ */
+ exec_control &= ~(CPU_BASED_RDTSC_EXITING |
+ CPU_BASED_USE_IO_BITMAPS |
+ CPU_BASED_MONITOR_TRAP_FLAG |
+ CPU_BASED_PAUSE_EXITING);
+
+ /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */
+ exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING |
+ CPU_BASED_NMI_WINDOW_EXITING);
+
+ if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
+ exec_control &= ~CPU_BASED_MOV_DR_EXITING;
+
+ if (!cpu_need_tpr_shadow(&vmx->vcpu))
+ exec_control &= ~CPU_BASED_TPR_SHADOW;
+
+#ifdef CONFIG_X86_64
+ if (exec_control & CPU_BASED_TPR_SHADOW)
+ exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING |
+ CPU_BASED_CR8_STORE_EXITING);
+ else
+ exec_control |= CPU_BASED_CR8_STORE_EXITING |
+ CPU_BASED_CR8_LOAD_EXITING;
+#endif
+	/* No need to intercept CR3 access or INVLPG when using EPT. */
+ if (enable_ept)
+ exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING |
+ CPU_BASED_INVLPG_EXITING);
+ if (kvm_mwait_in_guest(vmx->vcpu.kvm))
+ exec_control &= ~(CPU_BASED_MWAIT_EXITING |
+ CPU_BASED_MONITOR_EXITING);
+ if (kvm_hlt_in_guest(vmx->vcpu.kvm))
+ exec_control &= ~CPU_BASED_HLT_EXITING;
+ return exec_control;
+}
+
+static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
+{
+ u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl;
+
+ /*
+ * IPI virtualization relies on APICv. Disable IPI virtualization if
+ * APICv is inhibited.
+ */
+ if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu))
+ exec_control &= ~TERTIARY_EXEC_IPI_VIRT;
+
+ return exec_control;
+}
+
+/*
+ * Adjust a single secondary execution control bit to intercept/allow an
+ * instruction in the guest. This is usually done based on whether or not a
+ * feature has been exposed to the guest in order to correctly emulate faults.
+ */
+static inline void
+vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
+ u32 control, bool enabled, bool exiting)
+{
+ /*
+ * If the control is for an opt-in feature, clear the control if the
+ * feature is not exposed to the guest, i.e. not enabled. If the
+ * control is opt-out, i.e. an exiting control, clear the control if
+ * the feature _is_ exposed to the guest, i.e. exiting/interception is
+ * disabled for the associated instruction. Note, the caller is
+	 * responsible for presetting exec_control to set all supported bits.
+ */
+ if (enabled == exiting)
+ *exec_control &= ~control;
+
+ /*
+ * Update the nested MSR settings so that a nested VMM can/can't set
+ * controls for features that are/aren't exposed to the guest.
+ */
+ if (nested) {
+ /*
+		 * All features that can be added to or removed from VMX MSRs must
+ * be supported in the first place for nested virtualization.
+ */
+ if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control)))
+ enabled = false;
+
+ if (enabled)
+ vmx->nested.msrs.secondary_ctls_high |= control;
+ else
+ vmx->nested.msrs.secondary_ctls_high &= ~control;
+ }
+}
+
+/*
+ * Wrapper macro for the common case of adjusting a secondary execution control
+ * based on a single guest CPUID bit, with a dedicated feature bit. This also
+ * verifies that the control is actually supported by KVM and hardware.
+ */
+#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
+({ \
+ struct kvm_vcpu *__vcpu = &(vmx)->vcpu; \
+ bool __enabled; \
+ \
+ if (cpu_has_vmx_##name()) { \
+ if (kvm_is_governed_feature(X86_FEATURE_##feat_name)) \
+ __enabled = guest_can_use(__vcpu, X86_FEATURE_##feat_name); \
+ else \
+ __enabled = guest_cpuid_has(__vcpu, X86_FEATURE_##feat_name); \
+ vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\
+ __enabled, exiting); \
+ } \
+})
+
+/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
+#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
+ vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)
+
+#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
+ vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
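+
+/*
+ * For example, vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES)
+ * expands to
+ *
+ *	vmx_adjust_sec_exec_control(vmx, &exec_control, xsaves, XSAVES,
+ *				    ENABLE_XSAVES, false);
+ *
+ * i.e. if cpu_has_vmx_xsaves(), SECONDARY_EXEC_ENABLE_XSAVES is kept or
+ * cleared based on whether the guest can use X86_FEATURE_XSAVES, and the
+ * nested VMX MSRs are updated to match.
+ */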
+
+static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
+{
+ struct kvm_vcpu *vcpu = &vmx->vcpu;
+
+ u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
+
+ if (vmx_pt_mode_is_system())
+ exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
+ if (!cpu_need_virtualize_apic_accesses(vcpu))
+ exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ if (vmx->vpid == 0)
+ exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
+ if (!enable_ept) {
+ exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+ enable_unrestricted_guest = 0;
+ }
+ if (!enable_unrestricted_guest)
+ exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+ if (kvm_pause_in_guest(vmx->vcpu.kvm))
+ exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+ if (!kvm_vcpu_apicv_active(vcpu))
+ exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+ exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+
+ /*
+ * KVM doesn't support VMFUNC for L1, but the control is set in KVM's
+ * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2.
+ */
+ exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
+
+	/*
+	 * SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
+	 * in vmx_set_cr4().
+	 */
+ exec_control &= ~SECONDARY_EXEC_DESC;
+
+	/*
+	 * SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
+	 * (handle_vmptrld). Shadow VMCS can't be enabled here because there
+	 * is no current VMCS12 yet.
+	 */
+ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
+
+ /*
+	 * PML is enabled/disabled when dirty logging of memslots changes, but
+ * it needs to be set here when dirty logging is already active, e.g.
+ * if this vCPU was created after dirty logging was enabled.
+ */
+ if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
+ exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
+ vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES);
+
+ /*
+ * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
+ * feature is exposed to the guest. This creates a virtualization hole
+ * if both are supported in hardware but only one is exposed to the
+ * guest, but letting the guest execute RDTSCP or RDPID when either one
+ * is advertised is preferable to emulating the advertised instruction
+ * in KVM on #UD, and obviously better than incorrectly injecting #UD.
+ */
+ if (cpu_has_vmx_rdtscp()) {
+ bool rdpid_or_rdtscp_enabled =
+ guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
+
+ vmx_adjust_secondary_exec_control(vmx, &exec_control,
+ SECONDARY_EXEC_ENABLE_RDTSCP,
+ rdpid_or_rdtscp_enabled, false);
+ }
+
+ vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
+
+ vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
+ vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
+
+ vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
+ ENABLE_USR_WAIT_PAUSE, false);
+
+ if (!vcpu->kvm->arch.bus_lock_detection_enabled)
+ exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
+
+ if (!kvm_notify_vmexit_enabled(vcpu->kvm))
+ exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;
+
+ return exec_control;
+}
+
+static inline int vmx_get_pid_table_order(struct kvm *kvm)
+{
+ return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table));
+}
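+
+/*
+ * E.g. a VM with max_vcpu_ids = 4096 and 8-byte PID-pointer table entries
+ * (sizeof(*pid_table)) needs 32KiB for the table, i.e. an order-3
+ * allocation with 4KiB pages.
+ */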
+
+static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
+{
+ struct page *pages;
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
+
+ if (!irqchip_in_kernel(kvm) || !enable_ipiv)
+ return 0;
+
+ if (kvm_vmx->pid_table)
+ return 0;
+
+ pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+ vmx_get_pid_table_order(kvm));
+ if (!pages)
+ return -ENOMEM;
+
+ kvm_vmx->pid_table = (void *)page_address(pages);
+ return 0;
+}
+
+static int vmx_vcpu_precreate(struct kvm *kvm)
+{
+ return vmx_alloc_ipiv_pid_table(kvm);
+}
+
+#define VMX_XSS_EXIT_BITMAP 0
+
+static void init_vmcs(struct vcpu_vmx *vmx)
+{
+ struct kvm *kvm = vmx->vcpu.kvm;
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
+
+ if (nested)
+ nested_vmx_set_vmcs_shadowing_bitmap();
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
+
+ vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */
+
+ /* Control */
+ pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
+
+ exec_controls_set(vmx, vmx_exec_control(vmx));
+
+ if (cpu_has_secondary_exec_ctrls())
+ secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
+
+ if (cpu_has_tertiary_exec_ctrls())
+ tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
+
+ if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
+ vmcs_write64(EOI_EXIT_BITMAP0, 0);
+ vmcs_write64(EOI_EXIT_BITMAP1, 0);
+ vmcs_write64(EOI_EXIT_BITMAP2, 0);
+ vmcs_write64(EOI_EXIT_BITMAP3, 0);
+
+ vmcs_write16(GUEST_INTR_STATUS, 0);
+
+ vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
+ vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
+ }
+
+ if (vmx_can_use_ipiv(&vmx->vcpu)) {
+ vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table));
+ vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1);
+ }
+
+ if (!kvm_pause_in_guest(kvm)) {
+ vmcs_write32(PLE_GAP, ple_gap);
+ vmx->ple_window = ple_window;
+ vmx->ple_window_dirty = true;
+ }
+
+ if (kvm_notify_vmexit_enabled(kvm))
+ vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
+
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
+ vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
+
+ vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
+ vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
+ vmx_set_constant_host_state(vmx);
+ vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
+ vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
+
+ if (cpu_has_vmx_vmfunc())
+ vmcs_write64(VM_FUNCTION_CONTROL, 0);
+
+ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
+ vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
+ vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
+
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+ vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
+
+ vm_exit_controls_set(vmx, vmx_vmexit_ctrl());
+
+ /* 22.2.1, 20.8.1 */
+ vm_entry_controls_set(vmx, vmx_vmentry_ctrl());
+
+ vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);
+
+ set_cr4_guest_host_mask(vmx);
+
+ if (vmx->vpid != 0)
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+
+ if (cpu_has_vmx_xsaves())
+ vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
+
+ if (enable_pml) {
+ vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
+ vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ }
+
+ vmx_write_encls_bitmap(&vmx->vcpu, NULL);
+
+ if (vmx_pt_mode_is_host_guest()) {
+ memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
+		/* Bits[6:0] are forced to 1, writes are ignored. */
+ vmx->pt_desc.guest.output_mask = 0x7F;
+ vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
+ }
+
+ vmcs_write32(GUEST_SYSENTER_CS, 0);
+ vmcs_writel(GUEST_SYSENTER_ESP, 0);
+ vmcs_writel(GUEST_SYSENTER_EIP, 0);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+ if (cpu_has_vmx_tpr_shadow()) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
+ if (cpu_need_tpr_shadow(&vmx->vcpu))
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+ __pa(vmx->vcpu.arch.apic->regs));
+ vmcs_write32(TPR_THRESHOLD, 0);
+ }
+
+ vmx_setup_uret_msrs(vmx);
+}
+
+static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ init_vmcs(vmx);
+
+ if (nested)
+ memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
+
+ vcpu_setup_sgx_lepubkeyhash(vcpu);
+
+ vmx->nested.posted_intr_nv = -1;
+ vmx->nested.vmxon_ptr = INVALID_GPA;
+ vmx->nested.current_vmptr = INVALID_GPA;
+ vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
+
+ vcpu->arch.microcode_version = 0x100000000ULL;
+ vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
+
+ /*
+ * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
+ * or POSTED_INTR_WAKEUP_VECTOR.
+ */
+ vmx->pi_desc.nv = POSTED_INTR_VECTOR;
+ vmx->pi_desc.sn = 1;
+}
+
+static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!init_event)
+ __vmx_vcpu_reset(vcpu);
+
+ vmx->rmode.vm86_active = 0;
+ vmx->spec_ctrl = 0;
+
+ vmx->msr_ia32_umwait_control = 0;
+
+ vmx->hv_deadline_tsc = -1;
+ kvm_set_cr8(vcpu, 0);
+
+ vmx_segment_cache_clear(vmx);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);
+
+ seg_setup(VCPU_SREG_CS);
+ vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
+ vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
+
+ seg_setup(VCPU_SREG_DS);
+ seg_setup(VCPU_SREG_ES);
+ seg_setup(VCPU_SREG_FS);
+ seg_setup(VCPU_SREG_GS);
+ seg_setup(VCPU_SREG_SS);
+
+ vmcs_write16(GUEST_TR_SELECTOR, 0);
+ vmcs_writel(GUEST_TR_BASE, 0);
+ vmcs_write32(GUEST_TR_LIMIT, 0xffff);
+ vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
+
+ vmcs_write16(GUEST_LDTR_SELECTOR, 0);
+ vmcs_writel(GUEST_LDTR_BASE, 0);
+ vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
+ vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
+
+ vmcs_writel(GUEST_GDTR_BASE, 0);
+ vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
+
+ vmcs_writel(GUEST_IDTR_BASE, 0);
+ vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
+
+ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+ if (kvm_mpx_supported())
+ vmcs_write64(GUEST_BNDCFGS, 0);
+
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
+
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+
+ vpid_sync_context(vmx->vpid);
+
+ vmx_update_fb_clear_dis(vcpu, vmx);
+}
+
+static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
+{
+ exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
+}
+
+static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+ if (!enable_vnmi ||
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+ vmx_enable_irq_window(vcpu);
+ return;
+ }
+
+ exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
+}
+
+static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ uint32_t intr;
+ int irq = vcpu->arch.interrupt.nr;
+
+ trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected);
+
+ ++vcpu->stat.irq_injections;
+ if (vmx->rmode.vm86_active) {
+ int inc_eip = 0;
+ if (vcpu->arch.interrupt.soft)
+ inc_eip = vcpu->arch.event_exit_inst_len;
+ kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
+ return;
+ }
+ intr = irq | INTR_INFO_VALID_MASK;
+ if (vcpu->arch.interrupt.soft) {
+ intr |= INTR_TYPE_SOFT_INTR;
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmx->vcpu.arch.event_exit_inst_len);
+ } else
+ intr |= INTR_TYPE_EXT_INTR;
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
+
+ vmx_clear_hlt(vcpu);
+}
+
+static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!enable_vnmi) {
+ /*
+ * Tracking the NMI-blocked state in software is built upon
+ * finding the next open IRQ window. This, in turn, depends on
+ * well-behaving guests: They have to keep IRQs disabled at
+ * least as long as the NMI handler runs. Otherwise we may
+ * cause NMI nesting, maybe breaking the guest. But as this is
+ * highly unlikely, we can live with the residual risk.
+ */
+ vmx->loaded_vmcs->soft_vnmi_blocked = 1;
+ vmx->loaded_vmcs->vnmi_blocked_time = 0;
+ }
+
+ ++vcpu->stat.nmi_injections;
+ vmx->loaded_vmcs->nmi_known_unmasked = false;
+
+ if (vmx->rmode.vm86_active) {
+ kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
+ return;
+ }
+
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
+
+ vmx_clear_hlt(vcpu);
+}
+
+bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool masked;
+
+ if (!enable_vnmi)
+ return vmx->loaded_vmcs->soft_vnmi_blocked;
+ if (vmx->loaded_vmcs->nmi_known_unmasked)
+ return false;
+ masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
+ vmx->loaded_vmcs->nmi_known_unmasked = !masked;
+ return masked;
+}
+
+void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!enable_vnmi) {
+ if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
+ vmx->loaded_vmcs->soft_vnmi_blocked = masked;
+ vmx->loaded_vmcs->vnmi_blocked_time = 0;
+ }
+ } else {
+ vmx->loaded_vmcs->nmi_known_unmasked = !masked;
+ if (masked)
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ else
+ vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ }
+}
+
+bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
+ return false;
+
+ if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
+ return true;
+
+ return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
+ GUEST_INTR_STATE_NMI));
+}
+
+static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return -EBUSY;
+
+ /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
+ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
+ return -EBUSY;
+
+ return !vmx_nmi_blocked(vcpu);
+}
+
+bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+ return false;
+
+ return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
+ (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+}
+
+static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return -EBUSY;
+
+ /*
+ * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
+ * e.g. if the IRQ arrived asynchronously after checking nested events.
+ */
+ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+ return -EBUSY;
+
+ return !vmx_interrupt_blocked(vcpu);
+}
+
+static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
+{
+ void __user *ret;
+
+ if (enable_unrestricted_guest)
+ return 0;
+
+ mutex_lock(&kvm->slots_lock);
+ ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
+ PAGE_SIZE * 3);
+ mutex_unlock(&kvm->slots_lock);
+
+ if (IS_ERR(ret))
+ return PTR_ERR(ret);
+
+ to_kvm_vmx(kvm)->tss_addr = addr;
+
+ return init_rmode_tss(kvm, ret);
+}
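+
+/*
+ * For illustration: userspace supplies this address via the
+ * KVM_SET_TSS_ADDR vm ioctl, e.g. ioctl(vm_fd, KVM_SET_TSS_ADDR,
+ * 0xfffbd000), reserving three private pages for the TSS used to
+ * virtualize real mode without unrestricted guest support.
+ */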
+
+static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
+{
+ to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
+ return 0;
+}
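+
+/*
+ * Likewise userspace-controlled, via the KVM_SET_IDENTITY_MAP_ADDR vm
+ * ioctl; the address must be set before any vCPU is created, and backs
+ * the identity-mapped page table needed to run real-mode code when EPT
+ * is enabled without unrestricted guest support.
+ */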
+
+static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
+{
+ switch (vec) {
+ case BP_VECTOR:
+ /*
+ * Update instruction length as we may reinject the exception
+ * from user space while in guest debugging mode.
+ */
+ to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+ return false;
+ fallthrough;
+ case DB_VECTOR:
+ return !(vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
+ case DE_VECTOR:
+ case OF_VECTOR:
+ case BR_VECTOR:
+ case UD_VECTOR:
+ case DF_VECTOR:
+ case SS_VECTOR:
+ case GP_VECTOR:
+ case MF_VECTOR:
+ return true;
+ }
+ return false;
+}
+
+static int handle_rmode_exception(struct kvm_vcpu *vcpu,
+ int vec, u32 err_code)
+{
+ /*
+	 * An instruction with the address-size override prefix (opcode 0x67)
+	 * causes a #SS fault with error code 0 in VM86 mode.
+ */
+ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
+ if (kvm_emulate_instruction(vcpu, 0)) {
+ if (vcpu->arch.halt_request) {
+ vcpu->arch.halt_request = 0;
+ return kvm_emulate_halt_noskip(vcpu);
+ }
+ return 1;
+ }
+ return 0;
+ }
+
+ /*
+ * Forward all other exceptions that are valid in real mode.
+ * FIXME: Breaks guest debugging in real mode, needs to be fixed with
+ * the required debugging infrastructure rework.
+ */
+ kvm_queue_exception(vcpu, vec);
+ return 1;
+}
+
+static int handle_machine_check(struct kvm_vcpu *vcpu)
+{
+ /* handled by vmx_vcpu_run() */
+ return 1;
+}
+
+/*
+ * If the host has split lock detection disabled, then #AC is
+ * unconditionally injected into the guest, which is the pre split lock
+ * detection behaviour.
+ *
+ * If the host has split lock detection enabled then #AC is
+ * only injected into the guest when:
+ * - Guest CPL == 3 (user mode)
+ * - Guest has #AC detection enabled in CR0
+ * - Guest EFLAGS has AC bit set
+ */
+bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
+{
+ if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+ return true;
+
+ return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) &&
+ (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
+}
+
+static int handle_exception_nmi(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_run *kvm_run = vcpu->run;
+ u32 intr_info, ex_no, error_code;
+ unsigned long cr2, dr6;
+ u32 vect_info;
+
+ vect_info = vmx->idt_vectoring_info;
+ intr_info = vmx_get_intr_info(vcpu);
+
+ /*
+ * Machine checks are handled by handle_exception_irqoff(), or by
+ * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by
+ * vmx_vcpu_enter_exit().
+ */
+ if (is_machine_check(intr_info) || is_nmi(intr_info))
+ return 1;
+
+ /*
+ * Queue the exception here instead of in handle_nm_fault_irqoff().
+	 * This ensures the nested_vmx check is not skipped, so the vmexit
+	 * can be reflected to L1 (when it intercepts #NM) before reaching
+	 * this point.
+ */
+ if (is_nm_fault(intr_info)) {
+ kvm_queue_exception(vcpu, NM_VECTOR);
+ return 1;
+ }
+
+ if (is_invalid_opcode(intr_info))
+ return handle_ud(vcpu);
+
+ error_code = 0;
+ if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
+ error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+
+ if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
+ WARN_ON_ONCE(!enable_vmware_backdoor);
+
+ /*
+ * VMware backdoor emulation on #GP interception only handles
+ * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
+ * error code on #GP.
+ */
+ if (error_code) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+ return 1;
+ }
+ return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
+ }
+
+ /*
+	 * A #PF with PFEC.RSVD = 1 indicates the guest is accessing MMIO;
+	 * it is better to report an internal error in that case.
+ * See the comments in vmx_handle_exit.
+ */
+ if ((vect_info & VECTORING_INFO_VALID_MASK) &&
+ !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
+ vcpu->run->internal.ndata = 4;
+ vcpu->run->internal.data[0] = vect_info;
+ vcpu->run->internal.data[1] = intr_info;
+ vcpu->run->internal.data[2] = error_code;
+ vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
+ return 0;
+ }
+
+ if (is_page_fault(intr_info)) {
+ cr2 = vmx_get_exit_qual(vcpu);
+ if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
+ /*
+ * EPT will cause page fault only if we need to
+ * detect illegal GPAs.
+ */
+ WARN_ON_ONCE(!allow_smaller_maxphyaddr);
+ kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
+ return 1;
+ } else
+ return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
+ }
+
+ ex_no = intr_info & INTR_INFO_VECTOR_MASK;
+
+ if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
+ return handle_rmode_exception(vcpu, ex_no, error_code);
+
+ switch (ex_no) {
+ case DB_VECTOR:
+ dr6 = vmx_get_exit_qual(vcpu);
+ if (!(vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+ /*
+ * If the #DB was due to ICEBP, a.k.a. INT1, skip the
+ * instruction. ICEBP generates a trap-like #DB, but
+ * despite its interception control being tied to #DB,
+ * is an instruction intercept, i.e. the VM-Exit occurs
+ * on the ICEBP itself. Use the inner "skip" helper to
+ * avoid single-step #DB and MTF updates, as ICEBP is
+ * higher priority. Note, skipping ICEBP still clears
+ * STI and MOVSS blocking.
+ *
+ * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
+ * if single-step is enabled in RFLAGS and STI or MOVSS
+ * blocking is active, as the CPU doesn't set the bit
+ * on VM-Exit due to #DB interception. VM-Entry has a
+ * consistency check that a single-step #DB is pending
+ * in this scenario as the previous instruction cannot
+ * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
+ * don't modify RFLAGS), therefore the one instruction
+ * delay when activating single-step breakpoints must
+ * have already expired. Note, the CPU sets/clears BS
+ * as appropriate for all other VM-Exits types.
+ */
+ if (is_icebp(intr_info))
+ WARN_ON(!skip_emulated_instruction(vcpu));
+ else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
+ (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
+
+ kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
+ return 1;
+ }
+ kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
+ kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
+ fallthrough;
+ case BP_VECTOR:
+ /*
+ * Update instruction length as we may reinject #BP from
+ * user space while in guest debugging mode. Reading it for
+ * #DB as well causes no harm, it is not used in that case.
+ */
+ vmx->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
+ kvm_run->debug.arch.exception = ex_no;
+ break;
+ case AC_VECTOR:
+ if (vmx_guest_inject_ac(vcpu)) {
+ kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
+ return 1;
+ }
+
+ /*
+ * Handle split lock. Depending on detection mode this will
+ * either warn and disable split lock detection for this
+ * task or force SIGBUS on it.
+ */
+ if (handle_guest_split_lock(kvm_rip_read(vcpu)))
+ return 1;
+ fallthrough;
+ default:
+ kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
+ kvm_run->ex.exception = ex_no;
+ kvm_run->ex.error_code = error_code;
+ break;
+ }
+ return 0;
+}
+
+static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.irq_exits;
+ return 1;
+}
+
+static int handle_triple_fault(struct kvm_vcpu *vcpu)
+{
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ vcpu->mmio_needed = 0;
+ return 0;
+}
+
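+/*
+ * Per the SDM's exit-qualification format for I/O instructions: bits 2:0
+ * hold the access size minus one, bit 3 the direction (1 = IN), bit 4 the
+ * string-instruction flag, and bits 31:16 the port number.
+ */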
+static int handle_io(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+ int size, in, string;
+ unsigned port;
+
+ exit_qualification = vmx_get_exit_qual(vcpu);
+ string = (exit_qualification & 16) != 0;
+
+ ++vcpu->stat.io_exits;
+
+ if (string)
+ return kvm_emulate_instruction(vcpu, 0);
+
+ port = exit_qualification >> 16;
+ size = (exit_qualification & 7) + 1;
+ in = (exit_qualification & 8) != 0;
+
+ return kvm_fast_pio(vcpu, size, port, in);
+}
+
+static void
+vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+{
+ /*
+ * Patch in the VMCALL instruction:
+ */
+ hypercall[0] = 0x0f;
+ hypercall[1] = 0x01;
+ hypercall[2] = 0xc1;
+}
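+
+/*
+ * 0x0f 0x01 0xc1 is the opcode for VMCALL. The SVM counterpart patches in
+ * VMMCALL (0x0f 0x01 0xd9) instead, so a guest that executed the other
+ * vendor's hypercall instruction is fixed up to the host-native one.
+ */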
+
+/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
+static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ if (is_guest_mode(vcpu)) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned long orig_val = val;
+
+ /*
+ * We get here when L2 changed cr0 in a way that did not change
+ * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
+ * but did change L0 shadowed bits. So we first calculate the
+ * effective cr0 value that L1 would like to write into the
+ * hardware. It consists of the L2-owned bits from the new
+ * value combined with the L1-owned bits from L1's guest_cr0.
+ */
+ val = (val & ~vmcs12->cr0_guest_host_mask) |
+ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
+
+ if (kvm_set_cr0(vcpu, val))
+ return 1;
+ vmcs_writel(CR0_READ_SHADOW, orig_val);
+ return 0;
+ } else {
+ return kvm_set_cr0(vcpu, val);
+ }
+}
+
+static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ if (is_guest_mode(vcpu)) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned long orig_val = val;
+
+ /* analogously to handle_set_cr0 */
+ val = (val & ~vmcs12->cr4_guest_host_mask) |
+ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
+ if (kvm_set_cr4(vcpu, val))
+ return 1;
+ vmcs_writel(CR4_READ_SHADOW, orig_val);
+ return 0;
+ } else
+ return kvm_set_cr4(vcpu, val);
+}
+
+static int handle_desc(struct kvm_vcpu *vcpu)
+{
+ /*
+ * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this
+ * and other code needs to be updated if UMIP can be guest owned.
+ */
+ BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP);
+
+ WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP));
+ return kvm_emulate_instruction(vcpu, 0);
+}
+
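+/*
+ * Per the SDM's exit-qualification format for control-register accesses:
+ * bits 3:0 hold the CR number, bits 5:4 the access type (0 = MOV to CR,
+ * 1 = MOV from CR, 2 = CLTS, 3 = LMSW), bits 11:8 the GPR operand, and
+ * bits 31:16 the LMSW source data.
+ */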
+static int handle_cr(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification, val;
+ int cr;
+ int reg;
+ int err;
+ int ret;
+
+ exit_qualification = vmx_get_exit_qual(vcpu);
+ cr = exit_qualification & 15;
+ reg = (exit_qualification >> 8) & 15;
+ switch ((exit_qualification >> 4) & 3) {
+ case 0: /* mov to cr */
+ val = kvm_register_read(vcpu, reg);
+ trace_kvm_cr_write(cr, val);
+ switch (cr) {
+ case 0:
+ err = handle_set_cr0(vcpu, val);
+ return kvm_complete_insn_gp(vcpu, err);
+ case 3:
+ WARN_ON_ONCE(enable_unrestricted_guest);
+
+ err = kvm_set_cr3(vcpu, val);
+ return kvm_complete_insn_gp(vcpu, err);
+ case 4:
+ err = handle_set_cr4(vcpu, val);
+ return kvm_complete_insn_gp(vcpu, err);
+ case 8: {
+ u8 cr8_prev = kvm_get_cr8(vcpu);
+ u8 cr8 = (u8)val;
+ err = kvm_set_cr8(vcpu, cr8);
+ ret = kvm_complete_insn_gp(vcpu, err);
+ if (lapic_in_kernel(vcpu))
+ return ret;
+ if (cr8_prev <= cr8)
+ return ret;
+ /*
+ * TODO: we might be squashing a
+ * KVM_GUESTDBG_SINGLESTEP-triggered
+ * KVM_EXIT_DEBUG here.
+ */
+ vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
+ return 0;
+ }
+ }
+ break;
+ case 2: /* clts */
+ KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
+ return -EIO;
+ case 1: /*mov from cr*/
+ switch (cr) {
+ case 3:
+ WARN_ON_ONCE(enable_unrestricted_guest);
+
+ val = kvm_read_cr3(vcpu);
+ kvm_register_write(vcpu, reg, val);
+ trace_kvm_cr_read(cr, val);
+ return kvm_skip_emulated_instruction(vcpu);
+ case 8:
+ val = kvm_get_cr8(vcpu);
+ kvm_register_write(vcpu, reg, val);
+ trace_kvm_cr_read(cr, val);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ break;
+ case 3: /* lmsw */
+ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
+ trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val));
+ kvm_lmsw(vcpu, val);
+
+ return kvm_skip_emulated_instruction(vcpu);
+ default:
+ break;
+ }
+ vcpu->run->exit_reason = 0;
+ vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
+ (int)(exit_qualification >> 4) & 3, cr);
+ return 0;
+}
+
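+/*
+ * Per the SDM's exit-qualification format for debug-register accesses:
+ * bits 2:0 hold the DR number, bit 4 the direction (0 = MOV to DR,
+ * 1 = MOV from DR), and bits 11:8 the GPR operand.
+ */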
+static int handle_dr(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+ int dr, dr7, reg;
+ int err = 1;
+
+ exit_qualification = vmx_get_exit_qual(vcpu);
+ dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
+
+ /* First, if DR does not exist, trigger UD */
+ if (!kvm_require_dr(vcpu, dr))
+ return 1;
+
+ if (vmx_get_cpl(vcpu) > 0)
+ goto out;
+
+ dr7 = vmcs_readl(GUEST_DR7);
+ if (dr7 & DR7_GD) {
+ /*
+ * As the vm-exit takes precedence over the debug trap, we
+ * need to emulate the latter, either for the host or the
+ * guest debugging itself.
+ */
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
+ vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW;
+ vcpu->run->debug.arch.dr7 = dr7;
+ vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
+ vcpu->run->debug.arch.exception = DB_VECTOR;
+ vcpu->run->exit_reason = KVM_EXIT_DEBUG;
+ return 0;
+ } else {
+ kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);
+ return 1;
+ }
+ }
+
+ if (vcpu->guest_debug == 0) {
+ exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
+
+ /*
+ * No more DR vmexits; force a reload of the debug registers
+ * and reenter on this instruction. The next vmexit will
+ * retrieve the full state of the debug registers.
+ */
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+ return 1;
+ }
+
+ reg = DEBUG_REG_ACCESS_REG(exit_qualification);
+ if (exit_qualification & TYPE_MOV_FROM_DR) {
+ unsigned long val;
+
+ kvm_get_dr(vcpu, dr, &val);
+ kvm_register_write(vcpu, reg, val);
+ err = 0;
+ } else {
+ err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
+ }
+
+out:
+ return kvm_complete_insn_gp(vcpu, err);
+}
+
+static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+{
+ get_debugreg(vcpu->arch.db[0], 0);
+ get_debugreg(vcpu->arch.db[1], 1);
+ get_debugreg(vcpu->arch.db[2], 2);
+ get_debugreg(vcpu->arch.db[3], 3);
+ get_debugreg(vcpu->arch.dr6, 6);
+ vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
+
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
+ exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
+
+ /*
+ * exc_debug expects dr6 to be cleared after it runs, avoid that it sees
+ * a stale dr6 from the guest.
+ */
+ set_debugreg(DR6_RESERVED, 6);
+}
+
+static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ vmcs_writel(GUEST_DR7, val);
+}
+
+static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
+{
+ kvm_apic_update_ppr(vcpu);
+ return 1;
+}
+
+static int handle_interrupt_window(struct kvm_vcpu *vcpu)
+{
+ exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ ++vcpu->stat.irq_window_exits;
+ return 1;
+}
+
+static int handle_invlpg(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
+
+ kvm_mmu_invlpg(vcpu, exit_qualification);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int handle_apic_access(struct kvm_vcpu *vcpu)
+{
+ if (likely(fasteoi)) {
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
+ int access_type, offset;
+
+ access_type = exit_qualification & APIC_ACCESS_TYPE;
+ offset = exit_qualification & APIC_ACCESS_OFFSET;
+ /*
+		 * A sane guest uses MOV to write the EOI register, and the
+		 * written value doesn't matter. Short-circuit that case here
+		 * to avoid heavyweight instruction emulation.
+ */
+ if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
+ (offset == APIC_EOI)) {
+ kvm_lapic_set_eoi(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ }
+ return kvm_emulate_instruction(vcpu, 0);
+}
+
+static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
+ int vector = exit_qualification & 0xff;
+
+ /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
+ kvm_apic_set_eoi_accelerated(vcpu, vector);
+ return 1;
+}
+
+static int handle_apic_write(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
+
+ /*
+ * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and
+ * hardware has done any necessary aliasing, offset adjustments, etc...
+ * for the access. I.e. the correct value has already been written to
+ * the vAPIC page for the correct 16-byte chunk. KVM needs only to
+ * retrieve the register value and emulate the access.
+ */
+ u32 offset = exit_qualification & 0xff0;
+
+ kvm_apic_write_nodecode(vcpu, offset);
+ return 1;
+}
+
+static int handle_task_switch(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long exit_qualification;
+ bool has_error_code = false;
+ u32 error_code = 0;
+ u16 tss_selector;
+ int reason, type, idt_v, idt_index;
+
+ idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+ idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
+ type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
+
+ exit_qualification = vmx_get_exit_qual(vcpu);
+
+ reason = (u32)exit_qualification >> 30;
+ if (reason == TASK_SWITCH_GATE && idt_v) {
+ switch (type) {
+ case INTR_TYPE_NMI_INTR:
+ vcpu->arch.nmi_injected = false;
+ vmx_set_nmi_mask(vcpu, true);
+ break;
+ case INTR_TYPE_EXT_INTR:
+ case INTR_TYPE_SOFT_INTR:
+ kvm_clear_interrupt_queue(vcpu);
+ break;
+ case INTR_TYPE_HARD_EXCEPTION:
+ if (vmx->idt_vectoring_info &
+ VECTORING_INFO_DELIVER_CODE_MASK) {
+ has_error_code = true;
+ error_code =
+ vmcs_read32(IDT_VECTORING_ERROR_CODE);
+ }
+ fallthrough;
+ case INTR_TYPE_SOFT_EXCEPTION:
+ kvm_clear_exception_queue(vcpu);
+ break;
+ default:
+ break;
+ }
+ }
+ tss_selector = exit_qualification;
+
+ if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
+ type != INTR_TYPE_EXT_INTR &&
+ type != INTR_TYPE_NMI_INTR))
+ WARN_ON(!skip_emulated_instruction(vcpu));
+
+ /*
+ * TODO: What about debug traps on tss switch?
+ * Are we supposed to inject them and update dr6?
+ */
+ return kvm_task_switch(vcpu, tss_selector,
+ type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
+ reason, has_error_code, error_code);
+}
+
+static int handle_ept_violation(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+ gpa_t gpa;
+ u64 error_code;
+
+ exit_qualification = vmx_get_exit_qual(vcpu);
+
+ /*
+ * EPT violation happened while executing iret from NMI,
+ * "blocked by NMI" bit has to be set before next VM entry.
+ * There are errata that may cause this bit to not be set:
+ * AAK134, BY25.
+ */
+ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ enable_vnmi &&
+ (exit_qualification & INTR_INFO_UNBLOCK_NMI))
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
+
+ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+ trace_kvm_page_fault(vcpu, gpa, exit_qualification);
+
+ /* Is it a read fault? */
+ error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
+ ? PFERR_USER_MASK : 0;
+ /* Is it a write fault? */
+ error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
+ ? PFERR_WRITE_MASK : 0;
+ /* Is it a fetch fault? */
+ error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
+ ? PFERR_FETCH_MASK : 0;
+ /* ept page table entry is present? */
+ error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
+ ? PFERR_PRESENT_MASK : 0;
+
+ error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
+ PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
+
+ vcpu->arch.exit_qualification = exit_qualification;
+
+ /*
+ * Check that the GPA doesn't exceed physical memory limits, as that is
+ * a guest page fault. We have to emulate the instruction here, because
+ * if the illegal address is that of a paging structure, then
+ * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we
+ * would also use advanced VM-exit information for EPT violations to
+ * reconstruct the page fault error code.
+ */
+ if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa)))
+ return kvm_emulate_instruction(vcpu, 0);
+
+ return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
+}
+
+static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
+{
+ gpa_t gpa;
+
+ if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
+ return 1;
+
+ /*
+ * A nested guest cannot optimize MMIO vmexits, because we have an
+ * nGPA here instead of the required GPA.
+ */
+ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+ if (!is_guest_mode(vcpu) &&
+ !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
+ trace_kvm_fast_mmio(gpa);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
+}
+
+static int handle_nmi_window(struct kvm_vcpu *vcpu)
+{
+ if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
+ return -EIO;
+
+ exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
+ ++vcpu->stat.nmi_window_exits;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ return 1;
+}
+
+static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ return vmx->emulation_required && !vmx->rmode.vm86_active &&
+ (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected);
+}
+
+static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool intr_window_requested;
+ unsigned count = 130;
+
+ intr_window_requested = exec_controls_get(vmx) &
+ CPU_BASED_INTR_WINDOW_EXITING;
+
+ while (vmx->emulation_required && count-- != 0) {
+ if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
+ return handle_interrupt_window(&vmx->vcpu);
+
+ if (kvm_test_request(KVM_REQ_EVENT, vcpu))
+ return 1;
+
+ if (!kvm_emulate_instruction(vcpu, 0))
+ return 0;
+
+ if (vmx_emulation_required_with_pending_exception(vcpu)) {
+ kvm_prepare_emulation_failure_exit(vcpu);
+ return 0;
+ }
+
+ if (vcpu->arch.halt_request) {
+ vcpu->arch.halt_request = 0;
+ return kvm_emulate_halt_noskip(vcpu);
+ }
+
+ /*
+ * Note, return 1 and not 0, vcpu_run() will invoke
+ * xfer_to_guest_mode() which will create a proper return
+ * code.
+ */
+ if (__xfer_to_guest_mode_work_pending())
+ return 1;
+ }
+
+ return 1;
+}
+
+static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ if (vmx_emulation_required_with_pending_exception(vcpu)) {
+ kvm_prepare_emulation_failure_exit(vcpu);
+ return 0;
+ }
+
+ return 1;
+}
+
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned int old = vmx->ple_window;
+
+ vmx->ple_window = __grow_ple_window(old, ple_window,
+ ple_window_grow,
+ ple_window_max);
+
+ if (vmx->ple_window != old) {
+ vmx->ple_window_dirty = true;
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ vmx->ple_window, old);
+ }
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned int old = vmx->ple_window;
+
+ vmx->ple_window = __shrink_ple_window(old, ple_window,
+ ple_window_shrink,
+ ple_window);
+
+ if (vmx->ple_window != old) {
+ vmx->ple_window_dirty = true;
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ vmx->ple_window, old);
+ }
+}
+
+/*
+ * Indicates the vcpu is busy-waiting on a spinlock. KVM does not enable
+ * plain PAUSE exiting, so we only get here on CPUs with PAUSE-loop exiting.
+ */
+static int handle_pause(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_pause_in_guest(vcpu->kvm))
+ grow_ple_window(vcpu);
+
+ /*
+	 * Intel SDM vol. 3, ch. 25.1.3 says the "PAUSE-loop exiting"
+	 * VM-execution control is ignored if CPL > 0. OTOH, KVM never
+	 * sets PAUSE_EXITING and only sets PLE if supported, so the vcpu
+	 * must be at CPL=0 if it gets a PAUSE exit.
+ */
+ kvm_vcpu_on_spin(vcpu, true);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int handle_monitor_trap(struct kvm_vcpu *vcpu)
+{
+ return 1;
+}
+
+static int handle_invpcid(struct kvm_vcpu *vcpu)
+{
+ u32 vmx_instruction_info;
+ unsigned long type;
+ gva_t gva;
+ struct {
+ u64 pcid;
+ u64 gla;
+ } operand;
+ int gpr_index;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
+ type = kvm_register_read(vcpu, gpr_index);
+
+	/*
+	 * According to the Intel instruction reference, the memory operand
+	 * is read even if it isn't needed (e.g., for type == all).
+	 */
+ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
+ vmx_instruction_info, false,
+ sizeof(operand), &gva))
+ return 1;
+
+ return kvm_handle_invpcid(vcpu, type, gva);
+}
+
+static int handle_pml_full(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+
+ trace_kvm_pml_full(vcpu->vcpu_id);
+
+ exit_qualification = vmx_get_exit_qual(vcpu);
+
+ /*
+ * PML buffer FULL happened while executing iret from NMI,
+ * "blocked by NMI" bit has to be set before next VM entry.
+ */
+ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ enable_vnmi &&
+ (exit_qualification & INTR_INFO_UNBLOCK_NMI))
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+
+ /*
+	 * The PML buffer was already flushed at the beginning of the VMEXIT;
+	 * nothing to do here, and no userspace involvement is needed for PML.
+ */
+ return 1;
+}
+
+static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!vmx->req_immediate_exit &&
+ !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
+ kvm_lapic_expired_hv_timer(vcpu);
+ return EXIT_FASTPATH_REENTER_GUEST;
+ }
+
+ return EXIT_FASTPATH_NONE;
+}
+
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+ handle_fastpath_preemption_timer(vcpu);
+ return 1;
+}
+
+/*
+ * When nested=0, all VMX instruction VM Exits land here. The handlers
+ * are overwritten by nested_vmx_setup() when nested=1.
+ */
+static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
+#ifndef CONFIG_X86_SGX_KVM
+static int handle_encls(struct kvm_vcpu *vcpu)
+{
+ /*
+ * SGX virtualization is disabled. There is no software enable bit for
+ * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
+ * the guest from executing ENCLS (when SGX is supported by hardware).
+ */
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+#endif /* CONFIG_X86_SGX_KVM */
+
+static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK
+ * VM-Exits. Unconditionally set the flag here and leave the handling to
+ * vmx_handle_exit().
+ */
+ to_vmx(vcpu)->exit_reason.bus_lock_detected = true;
+ return 1;
+}
+
+static int handle_notify(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qual = vmx_get_exit_qual(vcpu);
+ bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID;
+
+ ++vcpu->stat.notify_window_exits;
+
+ /*
+ * Notify VM exit happened while executing iret from NMI,
+ * "blocked by NMI" bit has to be set before next VM entry.
+ */
+ if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI))
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+
+ if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER ||
+ context_invalid) {
+ vcpu->run->exit_reason = KVM_EXIT_NOTIFY;
+ vcpu->run->notify.flags = context_invalid ?
+ KVM_NOTIFY_CONTEXT_INVALID : 0;
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * The exit handlers return 1 if the exit was handled fully and guest execution
+ * may resume. Otherwise they set the kvm_run parameter to indicate what needs
+ * to be done to userspace and return 0.
+ */
+static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
+ [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi,
+ [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
+ [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
+ [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
+ [EXIT_REASON_IO_INSTRUCTION] = handle_io,
+ [EXIT_REASON_CR_ACCESS] = handle_cr,
+ [EXIT_REASON_DR_ACCESS] = handle_dr,
+ [EXIT_REASON_CPUID] = kvm_emulate_cpuid,
+ [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr,
+ [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
+ [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window,
+ [EXIT_REASON_HLT] = kvm_emulate_halt,
+ [EXIT_REASON_INVD] = kvm_emulate_invd,
+ [EXIT_REASON_INVLPG] = handle_invlpg,
+ [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc,
+ [EXIT_REASON_VMCALL] = kvm_emulate_hypercall,
+ [EXIT_REASON_VMCLEAR] = handle_vmx_instruction,
+ [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction,
+ [EXIT_REASON_VMPTRLD] = handle_vmx_instruction,
+ [EXIT_REASON_VMPTRST] = handle_vmx_instruction,
+ [EXIT_REASON_VMREAD] = handle_vmx_instruction,
+ [EXIT_REASON_VMRESUME] = handle_vmx_instruction,
+ [EXIT_REASON_VMWRITE] = handle_vmx_instruction,
+ [EXIT_REASON_VMOFF] = handle_vmx_instruction,
+ [EXIT_REASON_VMON] = handle_vmx_instruction,
+ [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
+ [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
+ [EXIT_REASON_APIC_WRITE] = handle_apic_write,
+ [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
+ [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd,
+ [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv,
+ [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
+ [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
+ [EXIT_REASON_GDTR_IDTR] = handle_desc,
+ [EXIT_REASON_LDTR_TR] = handle_desc,
+ [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
+ [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
+ [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
+ [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait,
+ [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
+ [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor,
+ [EXIT_REASON_INVEPT] = handle_vmx_instruction,
+ [EXIT_REASON_INVVPID] = handle_vmx_instruction,
+ [EXIT_REASON_RDRAND] = kvm_handle_invalid_op,
+ [EXIT_REASON_RDSEED] = kvm_handle_invalid_op,
+ [EXIT_REASON_PML_FULL] = handle_pml_full,
+ [EXIT_REASON_INVPCID] = handle_invpcid,
+ [EXIT_REASON_VMFUNC] = handle_vmx_instruction,
+ [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
+ [EXIT_REASON_ENCLS] = handle_encls,
+ [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,
+ [EXIT_REASON_NOTIFY] = handle_notify,
+};
+
+static const int kvm_vmx_max_exit_handlers =
+ ARRAY_SIZE(kvm_vmx_exit_handlers);
+
+static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2,
+ u32 *intr_info, u32 *error_code)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ *reason = vmx->exit_reason.full;
+ *info1 = vmx_get_exit_qual(vcpu);
+ if (!(vmx->exit_reason.failed_vmentry)) {
+ *info2 = vmx->idt_vectoring_info;
+ *intr_info = vmx_get_intr_info(vcpu);
+ if (is_exception_with_error_code(*intr_info))
+ *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ else
+ *error_code = 0;
+ } else {
+ *info2 = 0;
+ *intr_info = 0;
+ *error_code = 0;
+ }
+}
+
+static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
+{
+ if (vmx->pml_pg) {
+ __free_page(vmx->pml_pg);
+ vmx->pml_pg = NULL;
+ }
+}
+
+static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 *pml_buf;
+ u16 pml_idx;
+
+ pml_idx = vmcs_read16(GUEST_PML_INDEX);
+
+ /* Do nothing if PML buffer is empty */
+ if (pml_idx == (PML_ENTITY_NUM - 1))
+ return;
+
+	/* The PML index always points to the next available PML buffer entity */
+ if (pml_idx >= PML_ENTITY_NUM)
+ pml_idx = 0;
+ else
+ pml_idx++;
+
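+	/*
+	 * The CPU fills the buffer from the highest index downward, so the
+	 * valid entries, if any, occupy slots pml_idx..PML_ENTITY_NUM-1.
+	 */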
+ pml_buf = page_address(vmx->pml_pg);
+ for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
+ u64 gpa;
+
+ gpa = pml_buf[pml_idx];
+ WARN_ON(gpa & (PAGE_SIZE - 1));
+ kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
+ }
+
+ /* reset PML index */
+ vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+}
+
+static void vmx_dump_sel(char *name, uint32_t sel)
+{
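+	/*
+	 * The AR-bytes, limit and base fields sit at fixed VMCS encoding
+	 * offsets from the selector field, so derive their encodings from 'sel'.
+	 */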
+ pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
+ name, vmcs_read16(sel),
+ vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
+ vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
+ vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
+}
+
+static void vmx_dump_dtsel(char *name, uint32_t limit)
+{
+ pr_err("%s limit=0x%08x, base=0x%016lx\n",
+ name, vmcs_read32(limit),
+ vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
+}
+
+static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
+{
+ unsigned int i;
+ struct vmx_msr_entry *e;
+
+ pr_err("MSR %s:\n", name);
+ for (i = 0, e = m->val; i < m->nr; ++i, ++e)
+ pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
+}
+
+void dump_vmcs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 vmentry_ctl, vmexit_ctl;
+ u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
+ u64 tertiary_exec_control;
+ unsigned long cr4;
+ int efer_slot;
+
+ if (!dump_invalid_vmcs) {
+ pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
+ return;
+ }
+
+ vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
+ vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
+ cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
+ cr4 = vmcs_readl(GUEST_CR4);
+
+ if (cpu_has_secondary_exec_ctrls())
+ secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ else
+ secondary_exec_control = 0;
+
+ if (cpu_has_tertiary_exec_ctrls())
+ tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL);
+ else
+ tertiary_exec_control = 0;
+
+ pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
+ vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
+ pr_err("*** Guest State ***\n");
+ pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
+ vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
+ vmcs_readl(CR0_GUEST_HOST_MASK));
+ pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
+ cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
+ pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
+ if (cpu_has_vmx_ept()) {
+ pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
+ vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
+ pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
+ vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
+ }
+ pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
+ vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
+ pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
+ vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
+ pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
+ vmcs_readl(GUEST_SYSENTER_ESP),
+ vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
+ vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
+ vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
+ vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
+ vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
+ vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
+ vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
+ vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
+ vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
+ vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
+ vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
+ efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
+ if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
+ pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
+ else if (efer_slot >= 0)
+ pr_err("EFER= 0x%016llx (autoload)\n",
+ vmx->msr_autoload.guest.val[efer_slot].value);
+ else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
+ pr_err("EFER= 0x%016llx (effective)\n",
+ vcpu->arch.efer | (EFER_LMA | EFER_LME));
+ else
+ pr_err("EFER= 0x%016llx (effective)\n",
+ vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
+ if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
+ pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
+ pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
+ vmcs_read64(GUEST_IA32_DEBUGCTL),
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
+ if (cpu_has_load_perf_global_ctrl() &&
+ vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
+ pr_err("PerfGlobCtl = 0x%016llx\n",
+ vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
+ if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
+ pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
+ pr_err("Interruptibility = %08x ActivityState = %08x\n",
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
+ vmcs_read32(GUEST_ACTIVITY_STATE));
+ if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
+ pr_err("InterruptStatus = %04x\n",
+ vmcs_read16(GUEST_INTR_STATUS));
+ if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
+ vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
+ if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
+ vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
+
+ pr_err("*** Host State ***\n");
+ pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
+ vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
+ pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
+ vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
+ vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
+ vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
+ vmcs_read16(HOST_TR_SELECTOR));
+ pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
+ vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
+ vmcs_readl(HOST_TR_BASE));
+ pr_err("GDTBase=%016lx IDTBase=%016lx\n",
+ vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
+ pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
+ vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
+ vmcs_readl(HOST_CR4));
+ pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
+ vmcs_readl(HOST_IA32_SYSENTER_ESP),
+ vmcs_read32(HOST_IA32_SYSENTER_CS),
+ vmcs_readl(HOST_IA32_SYSENTER_EIP));
+ if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
+ pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
+ if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
+ pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
+ if (cpu_has_load_perf_global_ctrl() &&
+ vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+ pr_err("PerfGlobCtl = 0x%016llx\n",
+ vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
+ if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
+ vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
+
+ pr_err("*** Control State ***\n");
+ pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
+ cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control);
+ pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n",
+ pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl);
+ pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
+ vmcs_read32(EXCEPTION_BITMAP),
+ vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
+ vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
+ pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
+ vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
+ vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
+ vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
+ pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
+ vmcs_read32(VM_EXIT_INTR_INFO),
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
+ pr_err(" reason=%08x qualification=%016lx\n",
+ vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
+ pr_err("IDTVectoring: info=%08x errcode=%08x\n",
+ vmcs_read32(IDT_VECTORING_INFO_FIELD),
+ vmcs_read32(IDT_VECTORING_ERROR_CODE));
+ pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
+ if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
+ pr_err("TSC Multiplier = 0x%016llx\n",
+ vmcs_read64(TSC_MULTIPLIER));
+ if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
+ if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
+ u16 status = vmcs_read16(GUEST_INTR_STATUS);
+ pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
+ }
+ pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
+ if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
+ pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
+ pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
+ }
+ if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
+ pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
+ if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
+ pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
+ if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
+ pr_err("PLE Gap=%08x Window=%08x\n",
+ vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
+ if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
+ pr_err("Virtual processor ID = 0x%04x\n",
+ vmcs_read16(VIRTUAL_PROCESSOR_ID));
+}
+
+/*
+ * The guest has exited. See if we can fix it or if we need userspace
+ * assistance.
+ */
+static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ union vmx_exit_reason exit_reason = vmx->exit_reason;
+ u32 vectoring_info = vmx->idt_vectoring_info;
+ u16 exit_handler_index;
+
+	/*
+	 * Flush the PML buffer of logged GPAs so that dirty_bitmap is up to
+	 * date. A further benefit: before querying dirty_bitmap,
+	 * kvm_vm_ioctl_get_dirty_log only needs to kick all vCPUs out of
+	 * guest mode, as the PML buffer must already have been flushed once
+	 * a vCPU is in root mode. Note, PML is never enabled in hardware
+	 * while running L2.
+	 */
+ if (enable_pml && !is_guest_mode(vcpu))
+ vmx_flush_pml_buffer(vcpu);
+
+ /*
+ * KVM should never reach this point with a pending nested VM-Enter.
+ * More specifically, short-circuiting VM-Entry to emulate L2 due to
+ * invalid guest state should never happen as that means KVM knowingly
+ * allowed a nested VM-Enter with an invalid vmcs12. More below.
+ */
+ if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
+ return -EIO;
+
+ if (is_guest_mode(vcpu)) {
+ /*
+ * PML is never enabled when running L2, bail immediately if a
+ * PML full exit occurs as something is horribly wrong.
+ */
+ if (exit_reason.basic == EXIT_REASON_PML_FULL)
+ goto unexpected_vmexit;
+
+ /*
+ * The host physical addresses of some pages of guest memory
+ * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
+ * Page). The CPU may write to these pages via their host
+ * physical address while L2 is running, bypassing any
+ * address-translation-based dirty tracking (e.g. EPT write
+ * protection).
+ *
+ * Mark them dirty on every exit from L2 to prevent them from
+ * getting out of sync with dirty tracking.
+ */
+ nested_mark_vmcs12_pages_dirty(vcpu);
+
+ /*
+ * Synthesize a triple fault if L2 state is invalid. In normal
+ * operation, nested VM-Enter rejects any attempt to enter L2
+ * with invalid state. However, those checks are skipped if
+ * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If
+ * L2 state is invalid, it means either L1 modified SMRAM state
+ * or userspace provided bad state. Synthesize TRIPLE_FAULT as
+ * doing so is architecturally allowed in the RSM case, and is
+ * the least awful solution for the userspace case without
+ * risking false positives.
+ */
+ if (vmx->emulation_required) {
+ nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
+ return 1;
+ }
+
+ if (nested_vmx_reflect_vmexit(vcpu))
+ return 1;
+ }
+
+ /* If guest state is invalid, start emulating. L2 is handled above. */
+ if (vmx->emulation_required)
+ return handle_invalid_guest_state(vcpu);
+
+ if (exit_reason.failed_vmentry) {
+ dump_vmcs(vcpu);
+ vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ vcpu->run->fail_entry.hardware_entry_failure_reason
+ = exit_reason.full;
+ vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
+ return 0;
+ }
+
+ if (unlikely(vmx->fail)) {
+ dump_vmcs(vcpu);
+ vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ vcpu->run->fail_entry.hardware_entry_failure_reason
+ = vmcs_read32(VM_INSTRUCTION_ERROR);
+ vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
+ return 0;
+ }
+
+	/*
+	 * Note:
+	 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by
+	 * event delivery, since that indicates the guest is accessing MMIO.
+	 * The VM-exit would be triggered again after returning to the guest,
+	 * causing an infinite loop.
+	 */
+ if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
+ exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
+ exit_reason.basic != EXIT_REASON_PML_FULL &&
+ exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
+ exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
+ exit_reason.basic != EXIT_REASON_NOTIFY)) {
+ int ndata = 3;
+
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
+ vcpu->run->internal.data[0] = vectoring_info;
+ vcpu->run->internal.data[1] = exit_reason.full;
+ vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
+ if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
+ vcpu->run->internal.data[ndata++] =
+ vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+ }
+ vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
+ vcpu->run->internal.ndata = ndata;
+ return 0;
+ }
+
+ if (unlikely(!enable_vnmi &&
+ vmx->loaded_vmcs->soft_vnmi_blocked)) {
+ if (!vmx_interrupt_blocked(vcpu)) {
+ vmx->loaded_vmcs->soft_vnmi_blocked = 0;
+ } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
+ vcpu->arch.nmi_pending) {
+			/*
+			 * This CPU doesn't help us find the end of an
+			 * NMI-blocked window if the guest runs with IRQs
+			 * disabled. So we pull the trigger after 1 s of
+			 * futile waiting, but inform the user about this.
+			 */
+ printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
+ "state on VCPU %d after 1 s timeout\n",
+ __func__, vcpu->vcpu_id);
+ vmx->loaded_vmcs->soft_vnmi_blocked = 0;
+ }
+ }
+
+ if (exit_fastpath != EXIT_FASTPATH_NONE)
+ return 1;
+
+ if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
+ goto unexpected_vmexit;
+#ifdef CONFIG_RETPOLINE
+ if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
+ return kvm_emulate_wrmsr(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
+ return handle_preemption_timer(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
+ return handle_interrupt_window(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
+ return handle_external_interrupt(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_HLT)
+ return kvm_emulate_halt(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
+ return handle_ept_misconfig(vcpu);
+#endif
+
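+	/*
+	 * Clamp the exit reason under speculation to prevent a speculative
+	 * out-of-bounds read of kvm_vmx_exit_handlers.
+	 */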
+ exit_handler_index = array_index_nospec((u16)exit_reason.basic,
+ kvm_vmx_max_exit_handlers);
+ if (!kvm_vmx_exit_handlers[exit_handler_index])
+ goto unexpected_vmexit;
+
+ return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
+
+unexpected_vmexit:
+ vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
+ exit_reason.full);
+ dump_vmcs(vcpu);
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
+ vcpu->run->internal.ndata = 2;
+ vcpu->run->internal.data[0] = exit_reason.full;
+ vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+ return 0;
+}
+
+static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+{
+ int ret = __vmx_handle_exit(vcpu, exit_fastpath);
+
+	/*
+	 * Exit to user space when a bus lock is detected, to inform
+	 * userspace that a bus lock occurred in the guest.
+	 */
+ if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
+ if (ret > 0)
+ vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
+
+ vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
+ return 0;
+ }
+ return ret;
+}
+
+/*
+ * Software based L1D cache flush which is used when microcode providing
+ * the cache control MSR is not loaded.
+ *
+ * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
+ * flushing it requires reading 64 KiB because the replacement algorithm
+ * is not exactly LRU. This could be sized at runtime via topology
+ * information, but as all relevant affected CPUs have a 32 KiB L1D cache
+ * there is no point in doing so.
+ */
+static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+{
+ int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+ /*
+ * This code is only executed when the flush mode is 'cond' or
+ * 'always'
+ */
+ if (static_branch_likely(&vmx_l1d_flush_cond)) {
+ bool flush_l1d;
+
+ /*
+ * Clear the per-vcpu flush bit, it gets set again
+ * either from vcpu_run() or from one of the unsafe
+ * VMEXIT handlers.
+ */
+ flush_l1d = vcpu->arch.l1tf_flush_l1d;
+ vcpu->arch.l1tf_flush_l1d = false;
+
+ /*
+ * Clear the per-cpu flush bit, it gets set again from
+ * the interrupt handlers.
+ */
+ flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
+ kvm_clear_cpu_l1tf_flush_l1d();
+
+ if (!flush_l1d)
+ return;
+ }
+
+ vcpu->stat.l1d_flush++;
+
+ if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ return;
+ }
+
+ asm volatile(
+ /* First ensure the pages are in the TLB */
+ "xorl %%eax, %%eax\n"
+ ".Lpopulate_tlb:\n\t"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $4096, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lpopulate_tlb\n\t"
+ "xorl %%eax, %%eax\n\t"
+ "cpuid\n\t"
+ /* Now fill the cache */
+ "xorl %%eax, %%eax\n"
+ ".Lfill_cache:\n"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $64, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lfill_cache\n\t"
+ "lfence\n"
+ :: [flush_pages] "r" (vmx_l1d_flush_pages),
+ [size] "r" (size)
+ : "eax", "ebx", "ecx", "edx");
+}
+
+static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ int tpr_threshold;
+
+ if (is_guest_mode(vcpu) &&
+ nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
+ return;
+
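+	/*
+	 * If the highest pending interrupt (irr) is blocked by the current
+	 * TPR, exit when the guest lowers the TPR below it; otherwise no
+	 * threshold exit is needed.
+	 */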
+ tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
+ if (is_guest_mode(vcpu))
+ to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
+ else
+ vmcs_write32(TPR_THRESHOLD, tpr_threshold);
+}
+
+void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 sec_exec_control;
+
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ if (!flexpriority_enabled &&
+ !cpu_has_vmx_virtualize_x2apic_mode())
+ return;
+
+ /* Postpone execution until vmcs01 is the current VMCS. */
+ if (is_guest_mode(vcpu)) {
+ vmx->nested.change_vmcs01_virtual_apic_mode = true;
+ return;
+ }
+
+ sec_exec_control = secondary_exec_controls_get(vmx);
+ sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
+
+ switch (kvm_get_apic_mode(vcpu)) {
+ case LAPIC_MODE_INVALID:
+ WARN_ONCE(true, "Invalid local APIC state");
+ break;
+ case LAPIC_MODE_DISABLED:
+ break;
+ case LAPIC_MODE_XAPIC:
+ if (flexpriority_enabled) {
+ sec_exec_control |=
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+
+			/*
+			 * Flush the TLB: reloading the APIC access page will
+			 * only do so if its physical address has changed, but
+			 * the guest may have inserted a non-APIC mapping into
+			 * the TLB while the APIC access page was disabled.
+			 */
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ }
+ break;
+ case LAPIC_MODE_X2APIC:
+ if (cpu_has_vmx_virtualize_x2apic_mode())
+ sec_exec_control |=
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+ break;
+ }
+ secondary_exec_controls_set(vmx, sec_exec_control);
+
+ vmx_update_msr_bitmap_x2apic(vcpu);
+}
+
+static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
+{
+ const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_memslots *slots = kvm_memslots(kvm);
+ struct kvm_memory_slot *slot;
+ unsigned long mmu_seq;
+ kvm_pfn_t pfn;
+
+ /* Defer reload until vmcs01 is the current VMCS. */
+ if (is_guest_mode(vcpu)) {
+ to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
+ return;
+ }
+
+ if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+ return;
+
+ /*
+ * Grab the memslot so that the hva lookup for the mmu_notifier retry
+ * is guaranteed to use the same memslot as the pfn lookup, i.e. rely
+ * on the pfn lookup's validation of the memslot to ensure a valid hva
+ * is used for the retry check.
+ */
+ slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT);
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
+ return;
+
+ /*
+ * Ensure that the mmu_notifier sequence count is read before KVM
+ * retrieves the pfn from the primary MMU. Note, the memslot is
+ * protected by SRCU, not the mmu_notifier. Pairs with the smp_wmb()
+ * in kvm_mmu_invalidate_end().
+ */
+ mmu_seq = kvm->mmu_invalidate_seq;
+ smp_rmb();
+
+ /*
+ * No need to retry if the memslot does not exist or is invalid. KVM
+ * controls the APIC-access page memslot, and only deletes the memslot
+ * if APICv is permanently inhibited, i.e. the memslot won't reappear.
+ */
+ pfn = gfn_to_pfn_memslot(slot, gfn);
+ if (is_error_noslot_pfn(pfn))
+ return;
+
+ read_lock(&vcpu->kvm->mmu_lock);
+ if (mmu_invalidate_retry_hva(kvm, mmu_seq,
+ gfn_to_hva_memslot(slot, gfn))) {
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+ read_unlock(&vcpu->kvm->mmu_lock);
+ goto out;
+ }
+
+ vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));
+ read_unlock(&vcpu->kvm->mmu_lock);
+
+ /*
+ * No need for a manual TLB flush at this point, KVM has already done a
+ * flush if there were SPTEs pointing at the previous page.
+ */
+out:
+ /*
+	 * Do not pin the APIC access page in memory; the MMU notifier
+	 * will call us again if it is migrated or swapped out.
+ */
+ kvm_release_pfn_clean(pfn);
+}
+
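+/*
+ * GUEST_INTR_STATUS packs SVI into bits 15:8 and RVI into bits 7:0; the
+ * two helpers below update one byte while preserving the other.
+ */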
+static void vmx_hwapic_isr_update(int max_isr)
+{
+ u16 status;
+ u8 old;
+
+ if (max_isr == -1)
+ max_isr = 0;
+
+ status = vmcs_read16(GUEST_INTR_STATUS);
+ old = status >> 8;
+ if (max_isr != old) {
+ status &= 0xff;
+ status |= max_isr << 8;
+ vmcs_write16(GUEST_INTR_STATUS, status);
+ }
+}
+
+static void vmx_set_rvi(int vector)
+{
+ u16 status;
+ u8 old;
+
+ if (vector == -1)
+ vector = 0;
+
+ status = vmcs_read16(GUEST_INTR_STATUS);
+ old = (u8)status & 0xff;
+ if ((u8)vector != old) {
+ status &= ~0xff;
+ status |= (u8)vector;
+ vmcs_write16(GUEST_INTR_STATUS, status);
+ }
+}
+
+static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+{
+	/*
+	 * When running L2, updating RVI is only relevant when vmcs12's
+	 * virtual-interrupt-delivery is enabled. However, that can be
+	 * enabled only when L1 also intercepts external interrupts, in
+	 * which case we should not update vmcs02's RVI but instead
+	 * intercept the interrupt. Therefore, do nothing when running L2.
+	 */
+ if (!is_guest_mode(vcpu))
+ vmx_set_rvi(max_irr);
+}
+
+static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int max_irr;
+ bool got_posted_interrupt;
+
+ if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
+ return -EIO;
+
+ if (pi_test_on(&vmx->pi_desc)) {
+ pi_clear_on(&vmx->pi_desc);
+ /*
+ * IOMMU can write to PID.ON, so the barrier matters even on UP.
+ * But on x86 this is just a compiler barrier anyway.
+ */
+ smp_mb__after_atomic();
+ got_posted_interrupt =
+ kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
+ } else {
+ max_irr = kvm_lapic_find_highest_irr(vcpu);
+ got_posted_interrupt = false;
+ }
+
+ /*
+ * Newly recognized interrupts are injected via either virtual interrupt
+ * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is
+ * disabled in two cases:
+ *
+ * 1) If L2 is running and the vCPU has a new pending interrupt. If L1
+ * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
+ * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected
+ * into L2, but KVM doesn't use virtual interrupt delivery to inject
+ * interrupts into L2, and so KVM_REQ_EVENT is again needed.
+ *
+ * 2) If APICv is disabled for this vCPU, assigned devices may still
+ * attempt to post interrupts. The posted interrupt vector will cause
+ * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
+ */
+ if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
+ vmx_set_rvi(max_irr);
+ else if (got_posted_interrupt)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ return max_irr;
+}
+
+static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
+ vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
+ vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
+ vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
+}
+
+static void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ pi_clear_on(&vmx->pi_desc);
+ memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
+}
+
+void vmx_do_interrupt_irqoff(unsigned long entry);
+void vmx_do_nmi_irqoff(void);
+
+static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * Save xfd_err to guest_fpu before interrupts are enabled, so the
+	 * MSR value is not clobbered by host activity before the guest has
+	 * a chance to consume it.
+	 *
+	 * Do not blindly read xfd_err here, since this exception might
+	 * be caused by L1 interception on a platform which doesn't
+	 * support xfd at all.
+	 *
+	 * Do it conditionally upon guest_fpu::xfd: xfd_err matters
+	 * only when xfd contains a non-zero value.
+	 *
+	 * Queuing the exception is done in vmx_handle_exit. See the comment there.
+	 */
+ if (vcpu->arch.guest_fpu.fpstate->xfd)
+ rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
+}
+
+static void handle_exception_irqoff(struct vcpu_vmx *vmx)
+{
+ u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
+
+ /* if exit due to PF check for async PF */
+ if (is_page_fault(intr_info))
+ vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
+ /* if exit due to NM, handle before interrupts are enabled */
+ else if (is_nm_fault(intr_info))
+ handle_nm_fault_irqoff(&vmx->vcpu);
+ /* Handle machine checks before interrupts are enabled */
+ else if (is_machine_check(intr_info))
+ kvm_machine_check();
+}
+
+static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
+{
+ u32 intr_info = vmx_get_intr_info(vcpu);
+ unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
+ gate_desc *desc = (gate_desc *)host_idt_base + vector;
+
+ if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
+ "unexpected VM-Exit interrupt info: 0x%x", intr_info))
+ return;
+
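+	/* Dispatch directly to the host interrupt handler via its IDT gate. */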
+ kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
+ vmx_do_interrupt_irqoff(gate_offset(desc));
+ kvm_after_interrupt(vcpu);
+
+ vcpu->arch.at_instruction_boundary = true;
+}
+
+static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (vmx->emulation_required)
+ return;
+
+ if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
+ handle_external_interrupt_irqoff(vcpu);
+ else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
+ handle_exception_irqoff(vmx);
+}
+
+/*
+ * The kvm parameter can be NULL (module initialization, or invocation before
+ * VM creation). Be sure to check the kvm parameter before using it.
+ */
+static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
+{
+ switch (index) {
+ case MSR_IA32_SMBASE:
+ if (!IS_ENABLED(CONFIG_KVM_SMM))
+ return false;
+ /*
+ * We cannot do SMM unless we can run the guest in big
+ * real mode.
+ */
+ return enable_unrestricted_guest || emulate_invalid_guest_state;
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
+ return nested;
+ case MSR_AMD64_VIRT_SPEC_CTRL:
+ case MSR_AMD64_TSC_RATIO:
+ /* This is AMD only. */
+ return false;
+ default:
+ return true;
+ }
+}
+
+static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
+{
+ u32 exit_intr_info;
+ bool unblock_nmi;
+ u8 vector;
+ bool idtv_info_valid;
+
+ idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+
+ if (enable_vnmi) {
+ if (vmx->loaded_vmcs->nmi_known_unmasked)
+ return;
+
+ exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
+ unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
+ vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
+ /*
+ * SDM 3: 27.7.1.2 (September 2008)
+ * Re-set bit "block by NMI" before VM entry if vmexit caused by
+ * a guest IRET fault.
+ * SDM 3: 23.2.2 (September 2008)
+ * Bit 12 is undefined in any of the following cases:
+ * If the VM exit sets the valid bit in the IDT-vectoring
+ * information field.
+ * If the VM exit is due to a double fault.
+ */
+ if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
+ vector != DF_VECTOR && !idtv_info_valid)
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ else
+ vmx->loaded_vmcs->nmi_known_unmasked =
+ !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
+ & GUEST_INTR_STATE_NMI);
+ } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
+ vmx->loaded_vmcs->vnmi_blocked_time +=
+ ktime_to_ns(ktime_sub(ktime_get(),
+ vmx->loaded_vmcs->entry_time));
+}
+
+static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
+ u32 idt_vectoring_info,
+ int instr_len_field,
+ int error_code_field)
+{
+ u8 vector;
+ int type;
+ bool idtv_info_valid;
+
+ idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+
+ vcpu->arch.nmi_injected = false;
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
+
+ if (!idtv_info_valid)
+ return;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
+ type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
+
+ switch (type) {
+ case INTR_TYPE_NMI_INTR:
+ vcpu->arch.nmi_injected = true;
+ /*
+ * SDM 3: 27.7.1.2 (September 2008)
+		 * Clear bit "block by NMI" before VM entry if an NMI
+		 * delivery faulted.
+ */
+ vmx_set_nmi_mask(vcpu, false);
+ break;
+ case INTR_TYPE_SOFT_EXCEPTION:
+ vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
+ fallthrough;
+ case INTR_TYPE_HARD_EXCEPTION:
+ if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
+ u32 err = vmcs_read32(error_code_field);
+ kvm_requeue_exception_e(vcpu, vector, err);
+ } else
+ kvm_requeue_exception(vcpu, vector);
+ break;
+ case INTR_TYPE_SOFT_INTR:
+ vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
+ fallthrough;
+ case INTR_TYPE_EXT_INTR:
+ kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
+ break;
+ default:
+ break;
+ }
+}
+
+static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
+{
+ __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
+ VM_EXIT_INSTRUCTION_LEN,
+ IDT_VECTORING_ERROR_CODE);
+}
+
+static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
+{
+ __vmx_complete_interrupts(vcpu,
+ vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
+ VM_ENTRY_INSTRUCTION_LEN,
+ VM_ENTRY_EXCEPTION_ERROR_CODE);
+
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
+}
+
+static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
+{
+ int i, nr_msrs;
+ struct perf_guest_switch_msr *msrs;
+ struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);
+
+ pmu->host_cross_mapped_mask = 0;
+ if (pmu->pebs_enable & pmu->global_ctrl)
+ intel_pmu_cross_mapped_check(pmu);
+
+ /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
+ msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu);
+ if (!msrs)
+ return;
+
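+	/*
+	 * Only MSRs whose guest value differs from the host value need the
+	 * VM-Entry/VM-Exit autoload lists; drop entries that already match.
+	 */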
+ for (i = 0; i < nr_msrs; i++)
+ if (msrs[i].host == msrs[i].guest)
+ clear_atomic_switch_msr(vmx, msrs[i].msr);
+ else
+ add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
+ msrs[i].host, false);
+}
+
+static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 tscl;
+ u32 delta_tsc;
+
+ if (vmx->req_immediate_exit) {
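+		/* A zero timer value forces a VM-Exit immediately after VM-Enter. */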
+ vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
+ vmx->loaded_vmcs->hv_timer_soft_disabled = false;
+ } else if (vmx->hv_deadline_tsc != -1) {
+ tscl = rdtsc();
+ if (vmx->hv_deadline_tsc > tscl)
+			/* set_hv_timer ensures the delta fits in 32 bits */
+ delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
+ cpu_preemption_timer_multi);
+ else
+ delta_tsc = 0;
+
+ vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+ vmx->loaded_vmcs->hv_timer_soft_disabled = false;
+ } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
+ vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
+ vmx->loaded_vmcs->hv_timer_soft_disabled = true;
+ }
+}
+
+void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
+{
+ if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
+ vmx->loaded_vmcs->host_state.rsp = host_rsp;
+ vmcs_writel(HOST_RSP, host_rsp);
+ }
+}
+
+void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
+ unsigned int flags)
+{
+ u64 hostval = this_cpu_read(x86_spec_ctrl_current);
+
+ if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
+ return;
+
+ if (flags & VMX_RUN_SAVE_SPEC_CTRL)
+ vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL);
+
+ /*
+ * If the guest/host SPEC_CTRL values differ, restore the host value.
+ *
+ * For legacy IBRS, the IBRS bit always needs to be written after
+ * transitioning from a less privileged predictor mode, regardless of
+ * whether the guest/host values differ.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
+ vmx->spec_ctrl != hostval)
+ native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval);
+
+ barrier_nospec();
+}
+
+static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+{
+ switch (to_vmx(vcpu)->exit_reason.basic) {
+ case EXIT_REASON_MSR_WRITE:
+ return handle_fastpath_set_msr_irqoff(vcpu);
+ case EXIT_REASON_PREEMPTION_TIMER:
+ return handle_fastpath_preemption_timer(vcpu);
+ default:
+ return EXIT_FASTPATH_NONE;
+ }
+}
+
+static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+ unsigned int flags)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ guest_state_enter_irqoff();
+
+ /* L1D Flush includes CPU buffer clear to mitigate MDS */
+ if (static_branch_unlikely(&vmx_l1d_should_flush))
+ vmx_l1d_flush(vcpu);
+ else if (static_branch_unlikely(&mds_user_clear))
+ mds_clear_cpu_buffers();
+ else if (static_branch_unlikely(&mmio_stale_data_clear) &&
+ kvm_arch_has_assigned_device(vcpu->kvm))
+ mds_clear_cpu_buffers();
+
+ vmx_disable_fb_clear(vmx);
+
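+	/*
+	 * CR2 is not saved/loaded by VM-Enter/VM-Exit, so sync it manually;
+	 * the compare avoids a redundant CR2 write.
+	 */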
+ if (vcpu->arch.cr2 != native_read_cr2())
+ native_write_cr2(vcpu->arch.cr2);
+
+ vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
+ flags);
+
+ vcpu->arch.cr2 = native_read_cr2();
+ vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
+
+ vmx->idt_vectoring_info = 0;
+
+ vmx_enable_fb_clear(vmx);
+
+ if (unlikely(vmx->fail)) {
+ vmx->exit_reason.full = 0xdead;
+ goto out;
+ }
+
+ vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+ if (likely(!vmx->exit_reason.failed_vmentry))
+ vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+
+ if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
+ is_nmi(vmx_get_intr_info(vcpu))) {
+ kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
+ vmx_do_nmi_irqoff();
+ kvm_after_interrupt(vcpu);
+ }
+
+out:
+ guest_state_exit_irqoff();
+}
+
+static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long cr3, cr4;
+
+ /* Record the guest's net vcpu time for enforced NMI injections. */
+ if (unlikely(!enable_vnmi &&
+ vmx->loaded_vmcs->soft_vnmi_blocked))
+ vmx->loaded_vmcs->entry_time = ktime_get();
+
+ /*
+	 * Don't enter VMX if guest state is invalid; let the exit handler
+	 * start emulation until we arrive back at a valid state. Synthesize a
+ * consistency check VM-Exit due to invalid guest state and bail.
+ */
+ if (unlikely(vmx->emulation_required)) {
+ vmx->fail = 0;
+
+ vmx->exit_reason.full = EXIT_REASON_INVALID_STATE;
+ vmx->exit_reason.failed_vmentry = 1;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
+ vmx->exit_qualification = ENTRY_FAIL_DEFAULT;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
+ vmx->exit_intr_info = 0;
+ return EXIT_FASTPATH_NONE;
+ }
+
+ trace_kvm_entry(vcpu);
+
+ if (vmx->ple_window_dirty) {
+ vmx->ple_window_dirty = false;
+ vmcs_write32(PLE_WINDOW, vmx->ple_window);
+ }
+
+ /*
+ * We did this in prepare_switch_to_guest, because it needs to
+ * be within srcu_read_lock.
+ */
+ WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
+
+ if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
+ vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+ if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
+ vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+ vcpu->arch.regs_dirty = 0;
+
+ /*
+ * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
+ * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
+ * it switches back to the current->mm, which can occur in KVM context
+ * when switching to a temporary mm to patch kernel code, e.g. if KVM
+ * toggles a static key while handling a VM-Exit.
+ */
+ cr3 = __get_current_cr3_fast();
+ if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
+ vmcs_writel(HOST_CR3, cr3);
+ vmx->loaded_vmcs->host_state.cr3 = cr3;
+ }
+
+ cr4 = cr4_read_shadow();
+ if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
+ vmcs_writel(HOST_CR4, cr4);
+ vmx->loaded_vmcs->host_state.cr4 = cr4;
+ }
+
+ /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+ set_debugreg(vcpu->arch.dr6, 6);
+
+	/*
+	 * When single-stepping over STI and MOV SS, we must clear the
+	 * corresponding interruptibility bits in the guest state. Otherwise
+	 * vmentry fails as it then expects bit 14 (BS) in pending debug
+	 * exceptions to be set, but that's not correct for the guest debugging
+	 * case.
+	 */
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ vmx_set_interrupt_shadow(vcpu, 0);
+
+ kvm_load_guest_xsave_state(vcpu);
+
+ pt_guest_enter(vmx);
+
+ atomic_switch_perf_msrs(vmx);
+ if (intel_pmu_lbr_is_enabled(vcpu))
+ vmx_passthrough_lbr_msrs(vcpu);
+
+ if (enable_preemption_timer)
+ vmx_update_hv_timer(vcpu);
+
+ kvm_wait_lapic_expire(vcpu);
+
+ /* The actual VMENTER/EXIT is in the .noinstr.text section. */
+ vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));
+
+ /* All fields are clean at this point */
+ if (kvm_is_using_evmcs()) {
+ current_evmcs->hv_clean_fields |=
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+
+ current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
+ }
+
+ /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
+ if (vmx->host_debugctlmsr)
+ update_debugctlmsr(vmx->host_debugctlmsr);
+
+#ifndef CONFIG_X86_64
+ /*
+ * The sysexit path does not restore ds/es, so we must set them to
+ * a reasonable value ourselves.
+ *
+ * We can't defer this to vmx_prepare_switch_to_host() since that
+	 * function may be executed in interrupt context, which saves and
+	 * restores segments around it, nullifying its effect.
+ */
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+#endif
+
+ pt_guest_exit(vmx);
+
+ kvm_load_host_xsave_state(vcpu);
+
+ if (is_guest_mode(vcpu)) {
+ /*
+ * Track VMLAUNCH/VMRESUME that have made past guest state
+ * checking.
+ */
+ if (vmx->nested.nested_run_pending &&
+ !vmx->exit_reason.failed_vmentry)
+ ++vcpu->stat.nested_run;
+
+ vmx->nested.nested_run_pending = 0;
+ }
+
+ if (unlikely(vmx->fail))
+ return EXIT_FASTPATH_NONE;
+
+ if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
+ kvm_machine_check();
+
+ trace_kvm_exit(vcpu, KVM_ISA_VMX);
+
+ if (unlikely(vmx->exit_reason.failed_vmentry))
+ return EXIT_FASTPATH_NONE;
+
+ vmx->loaded_vmcs->launched = 1;
+
+ vmx_recover_nmi_blocking(vmx);
+ vmx_complete_interrupts(vmx);
+
+ if (is_guest_mode(vcpu))
+ return EXIT_FASTPATH_NONE;
+
+ return vmx_exit_handlers_fastpath(vcpu);
+}
+
+static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (enable_pml)
+ vmx_destroy_pml_buffer(vmx);
+ free_vpid(vmx->vpid);
+ nested_vmx_free_vcpu(vcpu);
+ free_loaded_vmcs(vmx->loaded_vmcs);
+}
+
+static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
+{
+ struct vmx_uret_msr *tsx_ctrl;
+ struct vcpu_vmx *vmx;
+ int i, err;
+
+ BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
+ vmx = to_vmx(vcpu);
+
+ INIT_LIST_HEAD(&vmx->pi_wakeup_list);
+
+ err = -ENOMEM;
+
+ vmx->vpid = allocate_vpid();
+
+	/*
+	 * If PML is turned on, failure to enable PML simply fails creation of
+	 * the vCPU. This keeps the PML logic simple by avoiding awkward cases
+	 * such as PML being enabled on only some of the guest's vCPUs.
+	 */
+ if (enable_pml) {
+ vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!vmx->pml_pg)
+ goto free_vpid;
+ }
+
+ for (i = 0; i < kvm_nr_uret_msrs; ++i)
+ vmx->guest_uret_msrs[i].mask = -1ull;
+ if (boot_cpu_has(X86_FEATURE_RTM)) {
+ /*
+ * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
+ * Keep the host value unchanged to avoid changing CPUID bits
+ * under the host kernel's feet.
+ */
+ tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
+ if (tsx_ctrl)
+ tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
+ }
+
+ err = alloc_loaded_vmcs(&vmx->vmcs01);
+ if (err < 0)
+ goto free_pml;
+
+ /*
+ * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
+ * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
+ * feature only for vmcs01, KVM currently isn't equipped to realize any
+ * performance benefits from enabling it for vmcs02.
+ */
+ if (kvm_is_using_evmcs() &&
+ (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
+ struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+ evmcs->hv_enlightenments_control.msr_bitmap = 1;
+ }
+
+	/* The MSR bitmap starts with all ones, i.e. all MSRs are intercepted */
+ bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
+ bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
+
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
+#ifdef CONFIG_X86_64
+ vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+#endif
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+ if (kvm_cstate_in_guest(vcpu->kvm)) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
+ }
+
+ vmx->loaded_vmcs = &vmx->vmcs01;
+
+ if (cpu_need_virtualize_apic_accesses(vcpu)) {
+ err = kvm_alloc_apic_access_page(vcpu->kvm);
+ if (err)
+ goto free_vmcs;
+ }
+
+ if (enable_ept && !enable_unrestricted_guest) {
+ err = init_rmode_identity_map(vcpu->kvm);
+ if (err)
+ goto free_vmcs;
+ }
+
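+	/* Publish this vCPU's posted-interrupt descriptor in the IPIv PID table. */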
+ if (vmx_can_use_ipiv(vcpu))
+ WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
+ __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID);
+
+ return 0;
+
+free_vmcs:
+ free_loaded_vmcs(vmx->loaded_vmcs);
+free_pml:
+ vmx_destroy_pml_buffer(vmx);
+free_vpid:
+ free_vpid(vmx->vpid);
+ return err;
+}
+
+#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
+#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
+
+static int vmx_vm_init(struct kvm *kvm)
+{
+ if (!ple_gap)
+ kvm->arch.pause_in_guest = true;
+
+ if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
+ switch (l1tf_mitigation) {
+ case L1TF_MITIGATION_OFF:
+ case L1TF_MITIGATION_FLUSH_NOWARN:
+ /* 'I explicitly don't care' is set */
+ break;
+ case L1TF_MITIGATION_FLUSH:
+ case L1TF_MITIGATION_FLUSH_NOSMT:
+ case L1TF_MITIGATION_FULL:
+ /*
+ * Warn upon starting the first VM in a potentially
+ * insecure environment.
+ */
+ if (sched_smt_active())
+ pr_warn_once(L1TF_MSG_SMT);
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
+ pr_warn_once(L1TF_MSG_L1D);
+ break;
+ case L1TF_MITIGATION_FULL_FORCE:
+ /* Flush is enforced */
+ break;
+ }
+ }
+ return 0;
+}
+
+static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+{
+ u8 cache;
+
+	/* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
+	 * memory aliases with conflicting memory types and sometimes MCEs.
+	 * We have to be careful about what is honored and when.
+ *
+ * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to
+ * UC. The effective memory type is UC or WC depending on guest PAT.
+ * This was historically the source of MCEs and we want to be
+ * conservative.
+ *
+ * When there is no need to deal with noncoherent DMA (e.g., no VT-d
+ * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The
+ * EPT memory type is set to WB. The effective memory type is forced
+ * WB.
+ *
+	 * Otherwise, we trust the guest. Guest CD/MTRR/PAT are all honored. The
+ * EPT memory type is used to emulate guest CD/MTRR.
+ */
+
+ if (is_mmio)
+ return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
+
+ if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
+ return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
+
+ if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
+ cache = MTRR_TYPE_WRBACK;
+ else
+ cache = MTRR_TYPE_UNCACHABLE;
+
+ return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
+ }
+
+ return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
+}
+
+static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
+{
+ /*
+ * These bits in the secondary execution controls field
+	 * are dynamic; the others are mostly based on the hypervisor
+ * architecture and the guest's CPUID. Do not touch the
+ * dynamic bits.
+ */
+ u32 mask =
+ SECONDARY_EXEC_SHADOW_VMCS |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_DESC;
+
+ u32 cur_ctl = secondary_exec_controls_get(vmx);
+
+ secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
+}
+
+/*
+ * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
+ * (indicating "allowed-1") if they are supported in the guest's CPUID.
+ */
+static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_cpuid_entry2 *entry;
+
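+	/* Allow all CR0 bits; CR4 allowed-1 bits are accumulated from CPUID below. */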
+ vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
+ vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
+
+#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
+ if (entry && (entry->_reg & (_cpuid_mask))) \
+ vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
+} while (0)
+
+ entry = kvm_find_cpuid_entry(vcpu, 0x1);
+ cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME));
+ cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME));
+ cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC));
+ cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE));
+ cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE));
+ cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE));
+ cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE));
+ cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE));
+ cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR));
+ cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
+ cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX));
+ cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX));
+ cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID));
+ cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE));
+
+ entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0);
+ cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE));
+ cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP));
+ cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP));
+ cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU));
+ cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP));
+ cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57));
+
+#undef cr4_fixed1_update
+}
+
+static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_cpuid_entry2 *best = NULL;
+ int i;
+
+ for (i = 0; i < PT_CPUID_LEAVES; i++) {
+ best = kvm_find_cpuid_entry_index(vcpu, 0x14, i);
+ if (!best)
+ return;
+ vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
+ vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
+ vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
+ vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
+ }
+
+ /* Get the number of configurable Address Ranges for filtering */
+ vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_num_address_ranges);
+
+ /* Initialize and clear the no dependency bits */
+ vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
+ RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
+ RTIT_CTL_BRANCH_EN);
+
+	/*
+	 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1, CR3Filter can be set; otherwise
+	 * setting it will inject a #GP.
+	 */
+ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
+ vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
+
+ /*
+ * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
+ * PSBFreq can be set
+ */
+ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
+ vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
+ RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
+
+ /*
+ * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
+ */
+ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
+ vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
+ RTIT_CTL_MTC_RANGE);
+
+ /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
+ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
+ vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
+ RTIT_CTL_PTW_EN);
+
+ /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
+ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
+ vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
+
+ /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
+ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
+ vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
+
+ /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
+ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
+ vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
+
+	/* Unmask the per-range ADDRn_CFG fields, 4 bits each starting at bit 32 */
+ for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
+ vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
+}
+
+static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * XSAVES is effectively enabled if and only if XSAVE is also exposed
+ * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
+ * set if and only if XSAVE is supported.
+ */
+ if (boot_cpu_has(X86_FEATURE_XSAVE) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES);
+
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX);
+
+ vmx_setup_uret_msrs(vmx);
+
+ if (cpu_has_secondary_exec_ctrls())
+ vmcs_set_secondary_exec_control(vmx,
+ vmx_secondary_exec_control(vmx));
+
+ if (guest_can_use(vcpu, X86_FEATURE_VMX))
+ vmx->msr_ia32_feature_control_valid_bits |=
+ FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
+ FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
+ else
+ vmx->msr_ia32_feature_control_valid_bits &=
+ ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
+ FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
+
+ if (guest_can_use(vcpu, X86_FEATURE_VMX))
+ nested_vmx_cr_fixed1_bits_update(vcpu);
+
+ if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
+ update_intel_pt_cfg(vcpu);
+
+ if (boot_cpu_has(X86_FEATURE_RTM)) {
+ struct vmx_uret_msr *msr;
+ msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
+ if (msr) {
+ bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
+ vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
+ }
+ }
+
+ if (kvm_cpu_cap_has(X86_FEATURE_XFD))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
+ !guest_cpuid_has(vcpu, X86_FEATURE_XFD));
+
+ if (boot_cpu_has(X86_FEATURE_IBPB))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
+ !guest_has_pred_cmd_msr(vcpu));
+
+ if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
+ !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
+
+ set_cr4_guest_host_mask(vmx);
+
+ vmx_write_encls_bitmap(vcpu, NULL);
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+ vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
+ else
+ vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+ vmx->msr_ia32_feature_control_valid_bits |=
+ FEAT_CTL_SGX_LC_ENABLED;
+ else
+ vmx->msr_ia32_feature_control_valid_bits &=
+ ~FEAT_CTL_SGX_LC_ENABLED;
+
+ /* Refresh #PF interception to account for MAXPHYADDR changes. */
+ vmx_update_exception_bitmap(vcpu);
+}
+
+static u64 vmx_get_perf_capabilities(void)
+{
+ u64 perf_cap = PMU_CAP_FW_WRITES;
+ struct x86_pmu_lbr lbr;
+ u64 host_perf_cap = 0;
+
+ if (!enable_pmu)
+ return 0;
+
+ if (boot_cpu_has(X86_FEATURE_PDCM))
+ rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
+
+ if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) {
+ x86_perf_get_lbr(&lbr);
+ if (lbr.nr)
+ perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
+ }
+
+ if (vmx_pebs_supported()) {
+ perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
+ if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
+ perf_cap &= ~PERF_CAP_PEBS_BASELINE;
+ }
+
+ return perf_cap;
+}
+
+static __init void vmx_set_cpu_caps(void)
+{
+ kvm_set_cpu_caps();
+
+ /* CPUID 0x1 */
+ if (nested)
+ kvm_cpu_cap_set(X86_FEATURE_VMX);
+
+ /* CPUID 0x7 */
+ if (kvm_mpx_supported())
+ kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
+ if (!cpu_has_vmx_invpcid())
+ kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
+ if (vmx_pt_mode_is_host_guest())
+ kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
+ if (vmx_pebs_supported()) {
+ kvm_cpu_cap_check_and_set(X86_FEATURE_DS);
+ kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64);
+ }
+
+ if (!enable_pmu)
+ kvm_cpu_cap_clear(X86_FEATURE_PDCM);
+ kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();
+
+ if (!enable_sgx) {
+ kvm_cpu_cap_clear(X86_FEATURE_SGX);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX1);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX2);
+ }
+
+ if (vmx_umip_emulated())
+ kvm_cpu_cap_set(X86_FEATURE_UMIP);
+
+ /* CPUID 0xD.1 */
+ kvm_caps.supported_xss = 0;
+ if (!cpu_has_vmx_xsaves())
+ kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
+
+ /* CPUID 0x80000001 and 0x7 (RDPID) */
+ if (!cpu_has_vmx_rdtscp()) {
+ kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
+ kvm_cpu_cap_clear(X86_FEATURE_RDPID);
+ }
+
+ if (cpu_has_vmx_waitpkg())
+ kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
+}
+
+static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
+{
+ to_vmx(vcpu)->req_immediate_exit = true;
+}
+
+static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned short port;
+ bool intercept;
+ int size;
+
+ if (info->intercept == x86_intercept_in ||
+ info->intercept == x86_intercept_ins) {
+ port = info->src_val;
+ size = info->dst_bytes;
+ } else {
+ port = info->dst_val;
+ size = info->src_bytes;
+ }
+
+ /*
+ * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
+ * VM-exits depend on the 'unconditional IO exiting' VM-execution
+ * control.
+ *
+ * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
+ */
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ intercept = nested_cpu_has(vmcs12,
+ CPU_BASED_UNCOND_IO_EXITING);
+ else
+ intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
+
+ /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
+ return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
+}
+
+static int vmx_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage,
+ struct x86_exception *exception)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ switch (info->intercept) {
+ /*
+ * RDPID causes #UD if disabled through secondary execution controls.
+ * Because it is marked as EmulateOnUD, we need to intercept it here.
+ * Note, RDPID is hidden behind ENABLE_RDTSCP.
+ */
+ case x86_intercept_rdpid:
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
+ exception->vector = UD_VECTOR;
+ exception->error_code_valid = false;
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ break;
+
+ case x86_intercept_in:
+ case x86_intercept_ins:
+ case x86_intercept_out:
+ case x86_intercept_outs:
+ return vmx_check_intercept_io(vcpu, info);
+
+ case x86_intercept_lgdt:
+ case x86_intercept_lidt:
+ case x86_intercept_lldt:
+ case x86_intercept_ltr:
+ case x86_intercept_sgdt:
+ case x86_intercept_sidt:
+ case x86_intercept_sldt:
+ case x86_intercept_str:
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
+ return X86EMUL_CONTINUE;
+
+ /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
+ break;
+
+ case x86_intercept_pause:
+ /*
+ * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides
+ * with vanilla NOPs in the emulator. Apply the interception
+ * check only to actual PAUSE instructions. Don't check
+ * PAUSE-loop-exiting, software can't expect a given PAUSE to
+ * exit, i.e. KVM is within its rights to allow L2 to execute
+ * the PAUSE.
+ */
+		if ((info->rep_prefix != REPE_PREFIX) ||
+		    !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING))
+ return X86EMUL_CONTINUE;
+
+ break;
+
+ /* TODO: check more intercepts... */
+ default:
+ break;
+ }
+
+ return X86EMUL_UNHANDLEABLE;
+}
+
+#ifdef CONFIG_X86_64
+/* (a << shift) / divisor; returns 1 on overflow, otherwise 0. */
+static inline int u64_shl_div_u64(u64 a, unsigned int shift,
+ u64 divisor, u64 *result)
+{
+ u64 low = a << shift, high = a >> (64 - shift);
+
+	/* Bail to avoid #DE: divq faults if the quotient overflows 64 bits. */
+ if (high >= divisor)
+ return 1;
+
+	/* low holds the quotient, high holds the remainder, which is discarded. */
+ asm("divq %2\n\t" : "=a" (low), "=d" (high) :
+ "rm" (divisor), "0" (low), "1" (high));
+ *result = low;
+
+ return 0;
+}
+
+static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
+ bool *expired)
+{
+ struct vcpu_vmx *vmx;
+ u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
+ struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
+
+ vmx = to_vmx(vcpu);
+ tscl = rdtsc();
+ guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+ delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
+ lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
+ ktimer->timer_advance_ns);
+
+ if (delta_tsc > lapic_timer_advance_cycles)
+ delta_tsc -= lapic_timer_advance_cycles;
+ else
+ delta_tsc = 0;
+
+ /* Convert to host delta tsc if tsc scaling is enabled */
+ if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
+ delta_tsc && u64_shl_div_u64(delta_tsc,
+ kvm_caps.tsc_scaling_ratio_frac_bits,
+ vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
+ return -ERANGE;
+
+ /*
+	 * If the delta TSC doesn't fit in 32 bits after shifting by the
+	 * preemption timer rate, the preemption timer can't be used.  It
+	 * might fit on later VM-entries, but checking on every VM-entry is
+	 * costly, so just use an hrtimer.
+ */
+ if (delta_tsc >> (cpu_preemption_timer_multi + 32))
+ return -ERANGE;
+
+ vmx->hv_deadline_tsc = tscl + delta_tsc;
+ *expired = !delta_tsc;
+ return 0;
+}
+
+static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+{
+ to_vmx(vcpu)->hv_deadline_tsc = -1;
+}
+#endif
+
+static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+ if (!kvm_pause_in_guest(vcpu->kvm))
+ shrink_ple_window(vcpu);
+}
+
+void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (WARN_ON_ONCE(!enable_pml))
+ return;
+
+ if (is_guest_mode(vcpu)) {
+ vmx->nested.update_vmcs01_cpu_dirty_logging = true;
+ return;
+ }
+
+ /*
+	 * Note, nr_memslots_dirty_logging can be changed concurrently with this
+ * code, but in that case another update request will be made and so
+ * the guest will never run with a stale PML value.
+ */
+ if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
+ secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
+ else
+ secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
+}
+
+static void vmx_setup_mce(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.mcg_cap & MCG_LMCE_P)
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+ FEAT_CTL_LMCE_ENABLED;
+ else
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+ ~FEAT_CTL_LMCE_ENABLED;
+}
+
+#ifdef CONFIG_KVM_SMM
+static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ /* we need a nested vmexit to enter SMM, postpone if run is pending */
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return -EBUSY;
+ return !is_smm(vcpu);
+}
+
+static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * TODO: Implement custom flows for forcing the vCPU out/in of L2 on
+	 * SMI and RSM.  Using the common VM-Exit + VM-Enter routines is wrong
+	 * as SMI and RSM only modify state that is saved and restored via SMRAM.
+ * E.g. most MSRs are left untouched, but many are modified by VM-Exit
+ * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM.
+ */
+ vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
+ if (vmx->nested.smm.guest_mode)
+ nested_vmx_vmexit(vcpu, -1, 0, 0);
+
+ vmx->nested.smm.vmxon = vmx->nested.vmxon;
+ vmx->nested.vmxon = false;
+ vmx_clear_hlt(vcpu);
+ return 0;
+}
+
+static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int ret;
+
+ if (vmx->nested.smm.vmxon) {
+ vmx->nested.vmxon = true;
+ vmx->nested.smm.vmxon = false;
+ }
+
+ if (vmx->nested.smm.guest_mode) {
+ ret = nested_vmx_enter_non_root_mode(vcpu, false);
+ if (ret)
+ return ret;
+
+ vmx->nested.nested_run_pending = 1;
+ vmx->nested.smm.guest_mode = false;
+ }
+ return 0;
+}
+
+static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
+{
+ /* RSM will cause a vmexit anyway. */
+}
+#endif
+
+static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
+}
+
+static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu)) {
+ struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
+
+ if (hrtimer_try_to_cancel(timer) == 1)
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+ }
+}
+
+static void vmx_hardware_unsetup(void)
+{
+ kvm_set_posted_intr_wakeup_handler(NULL);
+
+ if (nested)
+ nested_vmx_hardware_unsetup();
+
+ free_kvm_area();
+}
+
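+/* The set of APICv inhibit reasons that are relevant to, i.e. honored by, VMX. */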
+#define VMX_REQUIRED_APICV_INHIBITS \
+( \
+	BIT(APICV_INHIBIT_REASON_DISABLE) |	\
+ BIT(APICV_INHIBIT_REASON_ABSENT) | \
+ BIT(APICV_INHIBIT_REASON_HYPERV) | \
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
+ BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) \
+)
+
+static void vmx_vm_destroy(struct kvm *kvm)
+{
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
+
+ free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
+}
+
+static struct kvm_x86_ops vmx_x86_ops __initdata = {
+ .name = KBUILD_MODNAME,
+
+ .check_processor_compatibility = vmx_check_processor_compat,
+
+ .hardware_unsetup = vmx_hardware_unsetup,
+
+ .hardware_enable = vmx_hardware_enable,
+ .hardware_disable = vmx_hardware_disable,
+ .has_emulated_msr = vmx_has_emulated_msr,
+
+ .vm_size = sizeof(struct kvm_vmx),
+ .vm_init = vmx_vm_init,
+ .vm_destroy = vmx_vm_destroy,
+
+ .vcpu_precreate = vmx_vcpu_precreate,
+ .vcpu_create = vmx_vcpu_create,
+ .vcpu_free = vmx_vcpu_free,
+ .vcpu_reset = vmx_vcpu_reset,
+
+ .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
+ .vcpu_load = vmx_vcpu_load,
+ .vcpu_put = vmx_vcpu_put,
+
+ .update_exception_bitmap = vmx_update_exception_bitmap,
+ .get_msr_feature = vmx_get_msr_feature,
+ .get_msr = vmx_get_msr,
+ .set_msr = vmx_set_msr,
+ .get_segment_base = vmx_get_segment_base,
+ .get_segment = vmx_get_segment,
+ .set_segment = vmx_set_segment,
+ .get_cpl = vmx_get_cpl,
+ .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
+ .is_valid_cr0 = vmx_is_valid_cr0,
+ .set_cr0 = vmx_set_cr0,
+ .is_valid_cr4 = vmx_is_valid_cr4,
+ .set_cr4 = vmx_set_cr4,
+ .set_efer = vmx_set_efer,
+ .get_idt = vmx_get_idt,
+ .set_idt = vmx_set_idt,
+ .get_gdt = vmx_get_gdt,
+ .set_gdt = vmx_set_gdt,
+ .set_dr7 = vmx_set_dr7,
+ .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
+ .cache_reg = vmx_cache_reg,
+ .get_rflags = vmx_get_rflags,
+ .set_rflags = vmx_set_rflags,
+ .get_if_flag = vmx_get_if_flag,
+
+ .flush_tlb_all = vmx_flush_tlb_all,
+ .flush_tlb_current = vmx_flush_tlb_current,
+ .flush_tlb_gva = vmx_flush_tlb_gva,
+ .flush_tlb_guest = vmx_flush_tlb_guest,
+
+ .vcpu_pre_run = vmx_vcpu_pre_run,
+ .vcpu_run = vmx_vcpu_run,
+ .handle_exit = vmx_handle_exit,
+ .skip_emulated_instruction = vmx_skip_emulated_instruction,
+ .update_emulated_instruction = vmx_update_emulated_instruction,
+ .set_interrupt_shadow = vmx_set_interrupt_shadow,
+ .get_interrupt_shadow = vmx_get_interrupt_shadow,
+ .patch_hypercall = vmx_patch_hypercall,
+ .inject_irq = vmx_inject_irq,
+ .inject_nmi = vmx_inject_nmi,
+ .inject_exception = vmx_inject_exception,
+ .cancel_injection = vmx_cancel_injection,
+ .interrupt_allowed = vmx_interrupt_allowed,
+ .nmi_allowed = vmx_nmi_allowed,
+ .get_nmi_mask = vmx_get_nmi_mask,
+ .set_nmi_mask = vmx_set_nmi_mask,
+ .enable_nmi_window = vmx_enable_nmi_window,
+ .enable_irq_window = vmx_enable_irq_window,
+ .update_cr8_intercept = vmx_update_cr8_intercept,
+ .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
+ .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
+ .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
+ .load_eoi_exitmap = vmx_load_eoi_exitmap,
+ .apicv_pre_state_restore = vmx_apicv_pre_state_restore,
+ .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
+ .hwapic_irr_update = vmx_hwapic_irr_update,
+ .hwapic_isr_update = vmx_hwapic_isr_update,
+ .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
+ .sync_pir_to_irr = vmx_sync_pir_to_irr,
+ .deliver_interrupt = vmx_deliver_interrupt,
+ .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
+
+ .set_tss_addr = vmx_set_tss_addr,
+ .set_identity_map_addr = vmx_set_identity_map_addr,
+ .get_mt_mask = vmx_get_mt_mask,
+
+ .get_exit_info = vmx_get_exit_info,
+
+ .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
+
+ .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
+
+ .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
+ .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
+ .write_tsc_offset = vmx_write_tsc_offset,
+ .write_tsc_multiplier = vmx_write_tsc_multiplier,
+
+ .load_mmu_pgd = vmx_load_mmu_pgd,
+
+ .check_intercept = vmx_check_intercept,
+ .handle_exit_irqoff = vmx_handle_exit_irqoff,
+
+ .request_immediate_exit = vmx_request_immediate_exit,
+
+ .sched_in = vmx_sched_in,
+
+ .cpu_dirty_log_size = PML_ENTITY_NUM,
+ .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
+
+ .nested_ops = &vmx_nested_ops,
+
+ .pi_update_irte = vmx_pi_update_irte,
+ .pi_start_assignment = vmx_pi_start_assignment,
+
+#ifdef CONFIG_X86_64
+ .set_hv_timer = vmx_set_hv_timer,
+ .cancel_hv_timer = vmx_cancel_hv_timer,
+#endif
+
+ .setup_mce = vmx_setup_mce,
+
+#ifdef CONFIG_KVM_SMM
+ .smi_allowed = vmx_smi_allowed,
+ .enter_smm = vmx_enter_smm,
+ .leave_smm = vmx_leave_smm,
+ .enable_smi_window = vmx_enable_smi_window,
+#endif
+
+ .can_emulate_instruction = vmx_can_emulate_instruction,
+ .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
+ .migrate_timers = vmx_migrate_timers,
+
+ .msr_filter_changed = vmx_msr_filter_changed,
+ .complete_emulated_msr = kvm_complete_insn_gp,
+
+ .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
+};
+
+static unsigned int vmx_handle_intel_pt_intr(void)
+{
+ struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
+
+ /* '0' on failure so that the !PT case can use a RET0 static call. */
+ if (!vcpu || !kvm_handling_nmi_from_guest(vcpu))
+ return 0;
+
+ kvm_make_request(KVM_REQ_PMI, vcpu);
+ __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
+ (unsigned long *)&vcpu->arch.pmu.global_status);
+ return 1;
+}
+
+static __init void vmx_setup_user_return_msrs(void)
+{
+ /*
+ * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
+ * will emulate SYSCALL in legacy mode if the vendor string in guest
+	 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!".  To
+ * support this emulation, MSR_STAR is included in the list for i386,
+ * but is never loaded into hardware. MSR_CSTAR is also never loaded
+ * into hardware and is here purely for emulation purposes.
+ */
+ const u32 vmx_uret_msrs_list[] = {
+ #ifdef CONFIG_X86_64
+ MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
+ #endif
+ MSR_EFER, MSR_TSC_AUX, MSR_STAR,
+ MSR_IA32_TSX_CTRL,
+ };
+ int i;
+
+ BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
+
+ for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
+ kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
+}
+
+static void __init vmx_setup_me_spte_mask(void)
+{
+ u64 me_mask = 0;
+
+ /*
+	 * kvm_get_shadow_phys_bits() returns shadow_phys_bits; use the
+	 * helper rather than exposing the variable directly.
+	 *
+	 * On pre-MKTME systems, boot_cpu_data.x86_phys_bits equals
+	 * shadow_phys_bits.  On MKTME and/or TDX capable systems,
+	 * boot_cpu_data.x86_phys_bits holds the actual physical address
+	 * width without the KeyID bits, and shadow_phys_bits equals
+	 * MAXPHYADDR as reported by CPUID.  The bits in between are the
+	 * KeyID bits.
+ */
+ if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits())
+ me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
+ kvm_get_shadow_phys_bits() - 1);
+ /*
+	 * Unlike SME, the host kernel doesn't support setting up any
+	 * MKTME KeyID on Intel platforms, so no memory encryption
+	 * bits should be included in the SPTE.
+ */
+ kvm_mmu_set_me_spte_mask(0, me_mask);
+}
+
+static struct kvm_x86_init_ops vmx_init_ops __initdata;
+
+static __init int hardware_setup(void)
+{
+ unsigned long host_bndcfgs;
+ struct desc_ptr dt;
+ int r;
+
+ store_idt(&dt);
+ host_idt_base = dt.address;
+
+ vmx_setup_user_return_msrs();
+
+ if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
+ return -EIO;
+
+ if (cpu_has_perf_global_ctrl_bug())
+ pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
+ "does not work properly. Using workaround\n");
+
+ if (boot_cpu_has(X86_FEATURE_NX))
+ kvm_enable_efer_bits(EFER_NX);
+
+ if (boot_cpu_has(X86_FEATURE_MPX)) {
+ rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
+ WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
+ }
+
+ if (!cpu_has_vmx_mpx())
+ kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
+ XFEATURE_MASK_BNDCSR);
+
+ if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
+ !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
+ enable_vpid = 0;
+
+ if (!cpu_has_vmx_ept() ||
+ !cpu_has_vmx_ept_4levels() ||
+ !cpu_has_vmx_ept_mt_wb() ||
+ !cpu_has_vmx_invept_global())
+ enable_ept = 0;
+
+ /* NX support is required for shadow paging. */
+ if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
+ pr_err_ratelimited("NX (Execute Disable) not supported\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
+ enable_ept_ad_bits = 0;
+
+ if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
+ enable_unrestricted_guest = 0;
+
+ if (!cpu_has_vmx_flexpriority())
+ flexpriority_enabled = 0;
+
+ if (!cpu_has_virtual_nmis())
+ enable_vnmi = 0;
+
+#ifdef CONFIG_X86_SGX_KVM
+ if (!cpu_has_vmx_encls_vmexit())
+ enable_sgx = false;
+#endif
+
+ /*
+	 * set_apic_access_page_addr() is used to reload the APIC access
+	 * page upon invalidation.  There's no need to do anything if the
+	 * APIC_ACCESS_ADDR VMCS field isn't being used.
+ */
+ if (!flexpriority_enabled)
+ vmx_x86_ops.set_apic_access_page_addr = NULL;
+
+ if (!cpu_has_vmx_tpr_shadow())
+ vmx_x86_ops.update_cr8_intercept = NULL;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
+ && enable_ept) {
+ vmx_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
+ vmx_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
+ }
+#endif
+
+ if (!cpu_has_vmx_ple()) {
+ ple_gap = 0;
+ ple_window = 0;
+ ple_window_grow = 0;
+ ple_window_max = 0;
+ ple_window_shrink = 0;
+ }
+
+ if (!cpu_has_vmx_apicv())
+ enable_apicv = 0;
+ if (!enable_apicv)
+ vmx_x86_ops.sync_pir_to_irr = NULL;
+
+ if (!enable_apicv || !cpu_has_vmx_ipiv())
+ enable_ipiv = false;
+
+ if (cpu_has_vmx_tsc_scaling())
+ kvm_caps.has_tsc_control = true;
+
+ kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
+ kvm_caps.tsc_scaling_ratio_frac_bits = 48;
+ kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
+ kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit();
+
+ set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
+
+ if (enable_ept)
+ kvm_mmu_set_ept_masks(enable_ept_ad_bits,
+ cpu_has_vmx_ept_execute_only());
+
+ /*
+ * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID
+ * bits to shadow_zero_check.
+ */
+ vmx_setup_me_spte_mask();
+
+ kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(),
+ ept_caps_to_lpage_level(vmx_capability.ept));
+
+ /*
+ * Only enable PML when hardware supports PML feature, and both EPT
+ * and EPT A/D bit features are enabled -- PML depends on them to work.
+ */
+ if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
+ enable_pml = 0;
+
+ if (!enable_pml)
+ vmx_x86_ops.cpu_dirty_log_size = 0;
+
+ if (!cpu_has_vmx_preemption_timer())
+ enable_preemption_timer = false;
+
+ if (enable_preemption_timer) {
+ u64 use_timer_freq = 5000ULL * 1000 * 1000;
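+		/*
+		 * Assume a 5 GHz TSC as a fallback; overridden below when
+		 * tsc_khz is known.
+		 */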
+
+ cpu_preemption_timer_multi =
+ vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+
+ if (tsc_khz)
+ use_timer_freq = (u64)tsc_khz * 1000;
+ use_timer_freq >>= cpu_preemption_timer_multi;
+
+ /*
+ * KVM "disables" the preemption timer by setting it to its max
+ * value. Don't use the timer if it might cause spurious exits
+ * at a rate faster than 0.1 Hz (of uninterrupted guest time).
+ */
+ if (use_timer_freq > 0xffffffffu / 10)
+ enable_preemption_timer = false;
+ }
+
+ if (!enable_preemption_timer) {
+ vmx_x86_ops.set_hv_timer = NULL;
+ vmx_x86_ops.cancel_hv_timer = NULL;
+ vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
+ }
+
+ kvm_caps.supported_mce_cap |= MCG_LMCE_P;
+ kvm_caps.supported_mce_cap |= MCG_CMCI_P;
+
+ if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
+ return -EINVAL;
+ if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
+ pt_mode = PT_MODE_SYSTEM;
+ if (pt_mode == PT_MODE_HOST_GUEST)
+ vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
+ else
+ vmx_init_ops.handle_intel_pt_intr = NULL;
+
+ setup_default_sgx_lepubkeyhash();
+
+ if (nested) {
+ nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept);
+
+ r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
+ if (r)
+ return r;
+ }
+
+ vmx_set_cpu_caps();
+
+ r = alloc_kvm_area();
+ if (r && nested)
+ nested_vmx_hardware_unsetup();
+
+ kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
+
+ return r;
+}
+
+static struct kvm_x86_init_ops vmx_init_ops __initdata = {
+ .hardware_setup = hardware_setup,
+ .handle_intel_pt_intr = NULL,
+
+ .runtime_ops = &vmx_x86_ops,
+ .pmu_ops = &intel_pmu_ops,
+};
+
+static void vmx_cleanup_l1d_flush(void)
+{
+ if (vmx_l1d_flush_pages) {
+ free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
+ vmx_l1d_flush_pages = NULL;
+ }
+ /* Restore state so sysfs ignores VMX */
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+}
+
+static void __vmx_exit(void)
+{
+ allow_smaller_maxphyaddr = false;
+
+ cpu_emergency_unregister_virt_callback(vmx_emergency_disable);
+
+ vmx_cleanup_l1d_flush();
+}
+
+static void vmx_exit(void)
+{
+ kvm_exit();
+ kvm_x86_vendor_exit();
+
+ __vmx_exit();
+}
+module_exit(vmx_exit);
+
+static int __init vmx_init(void)
+{
+ int r, cpu;
+
+ if (!kvm_is_vmx_supported())
+ return -EOPNOTSUPP;
+
+ /*
+ * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing
+ * to unwind if a later step fails.
+ */
+ hv_init_evmcs();
+
+ r = kvm_x86_vendor_init(&vmx_init_ops);
+ if (r)
+ return r;
+
+ /*
+	 * vmx_setup_l1d_flush() must be called after common x86 init so that
+	 * enable_ept is properly set up.  Hand in the mitigation parameter
+	 * value that was stashed by the pre-module-init parser; if no
+	 * parameter was given, it contains 'auto', which is turned into the
+	 * default 'cond' mitigation mode.
+ */
+ r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
+ if (r)
+ goto err_l1d_flush;
+
+ for_each_possible_cpu(cpu) {
+ INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+
+ pi_init_cpu(cpu);
+ }
+
+ cpu_emergency_register_virt_callback(vmx_emergency_disable);
+
+ vmx_check_vmcs12_offsets();
+
+ /*
+ * Shadow paging doesn't have a (further) performance penalty
+	 * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR, so enable it
+	 * by default.
+ */
+ if (!enable_ept)
+ allow_smaller_maxphyaddr = true;
+
+ /*
+ * Common KVM initialization _must_ come last, after this, /dev/kvm is
+ * exposed to userspace!
+ */
+ r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx),
+ THIS_MODULE);
+ if (r)
+ goto err_kvm_init;
+
+ return 0;
+
+err_kvm_init:
+ __vmx_exit();
+err_l1d_flush:
+ kvm_x86_vendor_exit();
+ return r;
+}
+module_init(vmx_init);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
new file mode 100644
index 000000000..c2130d2c8
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -0,0 +1,758 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_H
+#define __KVM_X86_VMX_H
+
+#include <linux/kvm_host.h>
+
+#include <asm/kvm.h>
+#include <asm/intel_pt.h>
+#include <asm/perf_event.h>
+
+#include "capabilities.h"
+#include "../kvm_cache_regs.h"
+#include "posted_intr.h"
+#include "vmcs.h"
+#include "vmx_ops.h"
+#include "../cpuid.h"
+#include "run_flags.h"
+
+#define MSR_TYPE_R 1
+#define MSR_TYPE_W 2
+#define MSR_TYPE_RW 3
+
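+/* x2APIC MSR index for MMIO register offset 'r'; registers are 16 bytes apart. */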
+#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
+
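+/* Must match the size of vmx_uret_msrs_list (enforced via BUILD_BUG_ON in vmx.c). */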
+#ifdef CONFIG_X86_64
+#define MAX_NR_USER_RETURN_MSRS 7
+#else
+#define MAX_NR_USER_RETURN_MSRS 4
+#endif
+
+#define MAX_NR_LOADSTORE_MSRS 8
+
+struct vmx_msrs {
+ unsigned int nr;
+ struct vmx_msr_entry val[MAX_NR_LOADSTORE_MSRS];
+};
+
+struct vmx_uret_msr {
+ bool load_into_hardware;
+ u64 data;
+ u64 mask;
+};
+
+enum segment_cache_field {
+ SEG_FIELD_SEL = 0,
+ SEG_FIELD_BASE = 1,
+ SEG_FIELD_LIMIT = 2,
+ SEG_FIELD_AR = 3,
+
+ SEG_FIELD_NR = 4
+};
+
+#define RTIT_ADDR_RANGE 4
+
+struct pt_ctx {
+ u64 ctl;
+ u64 status;
+ u64 output_base;
+ u64 output_mask;
+ u64 cr3_match;
+ u64 addr_a[RTIT_ADDR_RANGE];
+ u64 addr_b[RTIT_ADDR_RANGE];
+};
+
+struct pt_desc {
+ u64 ctl_bitmask;
+ u32 num_address_ranges;
+ u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
+ struct pt_ctx host;
+ struct pt_ctx guest;
+};
+
+union vmx_exit_reason {
+ struct {
+ u32 basic : 16;
+ u32 reserved16 : 1;
+ u32 reserved17 : 1;
+ u32 reserved18 : 1;
+ u32 reserved19 : 1;
+ u32 reserved20 : 1;
+ u32 reserved21 : 1;
+ u32 reserved22 : 1;
+ u32 reserved23 : 1;
+ u32 reserved24 : 1;
+ u32 reserved25 : 1;
+ u32 bus_lock_detected : 1;
+ u32 enclave_mode : 1;
+ u32 smi_pending_mtf : 1;
+ u32 smi_from_vmx_root : 1;
+ u32 reserved30 : 1;
+ u32 failed_vmentry : 1;
+ };
+ u32 full;
+};
+
+struct lbr_desc {
+ /* Basic info about guest LBR records. */
+ struct x86_pmu_lbr records;
+
+ /*
+ * Emulate LBR feature via passthrough LBR registers when the
+ * per-vcpu guest LBR event is scheduled on the current pcpu.
+ *
+ * The records may be inaccurate if the host reclaims the LBR.
+ */
+ struct perf_event *event;
+
+ /* True if LBRs are marked as not intercepted in the MSR bitmap */
+ bool msr_passthrough;
+};
+
+/*
+ * The nested_vmx structure is part of vcpu_vmx, and holds information we need
+ * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
+ */
+struct nested_vmx {
+	/* Has the L1 guest done VMXON? */
+ bool vmxon;
+ gpa_t vmxon_ptr;
+ bool pml_full;
+
+ /* The guest-physical address of the current VMCS L1 keeps for L2 */
+ gpa_t current_vmptr;
+ /*
+ * Cache of the guest's VMCS, existing outside of guest memory.
+ * Loaded from guest memory during VMPTRLD. Flushed to guest
+ * memory during VMCLEAR and VMPTRLD.
+ */
+ struct vmcs12 *cached_vmcs12;
+ /*
+ * Cache of the guest's shadow VMCS, existing outside of guest
+ * memory. Loaded from guest memory during VM entry. Flushed
+ * to guest memory during VM exit.
+ */
+ struct vmcs12 *cached_shadow_vmcs12;
+
+ /*
+ * GPA to HVA cache for accessing vmcs12->vmcs_link_pointer
+ */
+ struct gfn_to_hva_cache shadow_vmcs12_cache;
+
+ /*
+ * GPA to HVA cache for VMCS12
+ */
+ struct gfn_to_hva_cache vmcs12_cache;
+
+ /*
+ * Indicates if the shadow vmcs or enlightened vmcs must be updated
+ * with the data held by struct vmcs12.
+ */
+ bool need_vmcs12_to_shadow_sync;
+ bool dirty_vmcs12;
+
+ /*
+	 * Indicates whether the MSR bitmap for L2 needs to be rebuilt due to
+	 * changes in the MSR bitmap for L1 or a switch to a different L2.
+	 * Note, this flag can only be used reliably in conjunction with a
+	 * paravirt L1 that informs L0 whether any changes to the MSR bitmap
+	 * for L2 were made on its side.
+ */
+ bool force_msr_bitmap_recalc;
+
+ /*
+ * Indicates lazily loaded guest state has not yet been decached from
+ * vmcs02.
+ */
+ bool need_sync_vmcs02_to_vmcs12_rare;
+
+ /*
+ * vmcs02 has been initialized, i.e. state that is constant for
+ * vmcs02 has been written to the backing VMCS. Initialization
+ * is delayed until L1 actually attempts to run a nested VM.
+ */
+ bool vmcs02_initialized;
+
+ bool change_vmcs01_virtual_apic_mode;
+ bool reload_vmcs01_apic_access_page;
+ bool update_vmcs01_cpu_dirty_logging;
+ bool update_vmcs01_apicv_status;
+
+ /*
+ * Enlightened VMCS has been enabled. It does not mean that L1 has to
+ * use it. However, VMX features available to L1 will be limited based
+ * on what the enlightened VMCS supports.
+ */
+ bool enlightened_vmcs_enabled;
+
+ /* L2 must run next, and mustn't decide to exit to L1. */
+ bool nested_run_pending;
+
+ /* Pending MTF VM-exit into L1. */
+ bool mtf_pending;
+
+ struct loaded_vmcs vmcs02;
+
+ /*
+ * Guest pages referred to in the vmcs02 with host-physical
+ * pointers, so we must keep them pinned while L2 runs.
+ */
+ struct kvm_host_map apic_access_page_map;
+ struct kvm_host_map virtual_apic_map;
+ struct kvm_host_map pi_desc_map;
+
+ struct kvm_host_map msr_bitmap_map;
+
+ struct pi_desc *pi_desc;
+ bool pi_pending;
+ u16 posted_intr_nv;
+
+ struct hrtimer preemption_timer;
+ u64 preemption_timer_deadline;
+ bool has_preemption_timer_deadline;
+ bool preemption_timer_expired;
+
+ /*
+ * Used to snapshot MSRs that are conditionally loaded on VM-Enter in
+ * order to propagate the guest's pre-VM-Enter value into vmcs02. For
+ * emulation of VMLAUNCH/VMRESUME, the snapshot will be of L1's value.
+ * For KVM_SET_NESTED_STATE, the snapshot is of L2's value, _if_
+ * userspace restores MSRs before nested state. If userspace restores
+ * MSRs after nested state, the snapshot holds garbage, but KVM can't
+ * detect that, and the garbage value in vmcs02 will be overwritten by
+ * MSR restoration in any case.
+ */
+ u64 pre_vmenter_debugctl;
+ u64 pre_vmenter_bndcfgs;
+
+ /* to migrate it to L1 if L2 writes to L1's CR8 directly */
+ int l1_tpr_threshold;
+
+ u16 vpid02;
+ u16 last_vpid;
+
+ struct nested_vmx_msrs msrs;
+
+ /* SMM related state */
+ struct {
+ /* in VMX operation on SMM entry? */
+ bool vmxon;
+ /* in guest mode on SMM entry? */
+ bool guest_mode;
+ } smm;
+
+ gpa_t hv_evmcs_vmptr;
+ struct kvm_host_map hv_evmcs_map;
+ struct hv_enlightened_vmcs *hv_evmcs;
+};
+
+struct vcpu_vmx {
+ struct kvm_vcpu vcpu;
+ u8 fail;
+ u8 x2apic_msr_bitmap_mode;
+
+ /*
+ * If true, host state has been stored in vmx->loaded_vmcs for
+ * the CPU registers that only need to be switched when transitioning
+ * to/from the kernel, and the registers have been loaded with guest
+ * values. If false, host state is loaded in the CPU registers
+ * and vmx->loaded_vmcs->host_state is invalid.
+ */
+ bool guest_state_loaded;
+
+ unsigned long exit_qualification;
+ u32 exit_intr_info;
+ u32 idt_vectoring_info;
+ ulong rflags;
+
+ /*
+ * User return MSRs are always emulated when enabled in the guest, but
+ * only loaded into hardware when necessary, e.g. SYSCALL #UDs outside
+	 * of 64-bit mode or when EFER.SCE=0, so the SYSCALL MSRs only need to
+	 * be loaded into hardware when SYSCALL can actually be executed.
+ */
+ struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS];
+ bool guest_uret_msrs_loaded;
+#ifdef CONFIG_X86_64
+ u64 msr_host_kernel_gs_base;
+ u64 msr_guest_kernel_gs_base;
+#endif
+
+ u64 spec_ctrl;
+ u32 msr_ia32_umwait_control;
+
+ /*
+ * loaded_vmcs points to the VMCS currently used in this vcpu. For a
+ * non-nested (L1) guest, it always points to vmcs01. For a nested
+ * guest (L2), it points to a different VMCS.
+ */
+ struct loaded_vmcs vmcs01;
+ struct loaded_vmcs *loaded_vmcs;
+
+ struct msr_autoload {
+ struct vmx_msrs guest;
+ struct vmx_msrs host;
+ } msr_autoload;
+
+ struct msr_autostore {
+ struct vmx_msrs guest;
+ } msr_autostore;
+
+ struct {
+ int vm86_active;
+ ulong save_rflags;
+ struct kvm_segment segs[8];
+ } rmode;
+ struct {
+ u32 bitmask; /* 4 bits per segment (1 bit per field) */
+ struct kvm_save_segment {
+ u16 selector;
+ unsigned long base;
+ u32 limit;
+ u32 ar;
+ } seg[8];
+ } segment_cache;
+ int vpid;
+ bool emulation_required;
+
+ union vmx_exit_reason exit_reason;
+
+ /* Posted interrupt descriptor */
+ struct pi_desc pi_desc;
+
+ /* Used if this vCPU is waiting for PI notification wakeup. */
+ struct list_head pi_wakeup_list;
+
+ /* Support for a guest hypervisor (nested VMX) */
+ struct nested_vmx nested;
+
+ /* Dynamic PLE window. */
+ unsigned int ple_window;
+ bool ple_window_dirty;
+
+ bool req_immediate_exit;
+
+ /* Support for PML */
+#define PML_ENTITY_NUM 512
+ struct page *pml_pg;
+
+ /* apic deadline value in host tsc */
+ u64 hv_deadline_tsc;
+
+ unsigned long host_debugctlmsr;
+
+ /*
+ * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
+ * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included
+ * in msr_ia32_feature_control_valid_bits.
+ */
+ u64 msr_ia32_feature_control;
+ u64 msr_ia32_feature_control_valid_bits;
+ /* SGX Launch Control public key hash */
+ u64 msr_ia32_sgxlepubkeyhash[4];
+ u64 msr_ia32_mcu_opt_ctrl;
+ bool disable_fb_clear;
+
+ struct pt_desc pt_desc;
+ struct lbr_desc lbr_desc;
+
+ /* Save desired MSR intercept (read: pass-through) state */
+#define MAX_POSSIBLE_PASSTHROUGH_MSRS 16
+ struct {
+ DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
+ DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
+ } shadow_msr_intercept;
+};
+
+struct kvm_vmx {
+ struct kvm kvm;
+
+ unsigned int tss_addr;
+ bool ept_identity_pagetable_done;
+ gpa_t ept_identity_map_addr;
+ /* Posted Interrupt Descriptor (PID) table for IPI virtualization */
+ u64 *pid_table;
+};
+
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
+ struct loaded_vmcs *buddy);
+int allocate_vpid(void);
+void free_vpid(int vpid);
+void vmx_set_constant_host_state(struct vcpu_vmx *vmx);
+void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
+void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
+ unsigned long fs_base, unsigned long gs_base);
+int vmx_get_cpl(struct kvm_vcpu *vcpu);
+bool vmx_emulation_required(struct kvm_vcpu *vcpu);
+unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu);
+void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu);
+void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask);
+int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer);
+void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+void set_cr4_guest_host_mask(struct vcpu_vmx *vmx);
+void ept_save_pdptrs(struct kvm_vcpu *vcpu);
+void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
+
+bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu);
+void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
+bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
+bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
+bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
+void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
+void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
+struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr);
+void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu);
+void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp);
+void vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, unsigned int flags);
+unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx);
+bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs,
+ unsigned int flags);
+int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr);
+void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu);
+
+void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type);
+void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type);
+
+u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu);
+u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
+
+static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,
+ int type, bool value)
+{
+ if (value)
+ vmx_enable_intercept_for_msr(vcpu, msr, type);
+ else
+ vmx_disable_intercept_for_msr(vcpu, msr, type);
+}
+
+void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+
+/*
+ * Note, early Intel manuals have the write-low and read-high bitmap offsets
+ * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and
+ * 0xc0000000-0xc0001fff. The former (low) uses bytes 0-0x3ff for reads and
+ * 0x800-0xbff for writes. The latter (high) uses 0x400-0x7ff for reads and
+ * 0xc00-0xfff for writes. MSRs not covered by either of the ranges always
+ * VM-Exit.
+ */
+#define __BUILD_VMX_MSR_BITMAP_HELPER(rtype, action, bitop, access, base) \
+static inline rtype vmx_##action##_msr_bitmap_##access(unsigned long *bitmap, \
+ u32 msr) \
+{ \
+ int f = sizeof(unsigned long); \
+ \
+ if (msr <= 0x1fff) \
+ return bitop##_bit(msr, bitmap + base / f); \
+ else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) \
+ return bitop##_bit(msr & 0x1fff, bitmap + (base + 0x400) / f); \
+ return (rtype)true; \
+}
+#define BUILD_VMX_MSR_BITMAP_HELPERS(ret_type, action, bitop) \
+ __BUILD_VMX_MSR_BITMAP_HELPER(ret_type, action, bitop, read, 0x0) \
+ __BUILD_VMX_MSR_BITMAP_HELPER(ret_type, action, bitop, write, 0x800)
+
+BUILD_VMX_MSR_BITMAP_HELPERS(bool, test, test)
+BUILD_VMX_MSR_BITMAP_HELPERS(void, clear, __clear)
+BUILD_VMX_MSR_BITMAP_HELPERS(void, set, __set)
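+
+/*
+ * The invocations above generate vmx_test_msr_bitmap_{read,write}(),
+ * vmx_clear_msr_bitmap_{read,write}() and vmx_set_msr_bitmap_{read,write}().
+ */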
+
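+/* RVI (Requesting Virtual Interrupt) is the low byte of the guest interrupt status. */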
+static inline u8 vmx_get_rvi(void)
+{
+ return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
+}
+
+#define __KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS \
+ (VM_ENTRY_LOAD_DEBUG_CONTROLS)
+#ifdef CONFIG_X86_64
+ #define KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS \
+ (__KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS | \
+ VM_ENTRY_IA32E_MODE)
+#else
+ #define KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS \
+ __KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS
+#endif
+#define KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS \
+ (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_ENTRY_LOAD_IA32_PAT | \
+ VM_ENTRY_LOAD_IA32_EFER | \
+ VM_ENTRY_LOAD_BNDCFGS | \
+ VM_ENTRY_PT_CONCEAL_PIP | \
+ VM_ENTRY_LOAD_IA32_RTIT_CTL)
+
+#define __KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \
+ (VM_EXIT_SAVE_DEBUG_CONTROLS | \
+ VM_EXIT_ACK_INTR_ON_EXIT)
+#ifdef CONFIG_X86_64
+ #define KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \
+ (__KVM_REQUIRED_VMX_VM_EXIT_CONTROLS | \
+ VM_EXIT_HOST_ADDR_SPACE_SIZE)
+#else
+ #define KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \
+ __KVM_REQUIRED_VMX_VM_EXIT_CONTROLS
+#endif
+#define KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS \
+ (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_EXIT_SAVE_IA32_PAT | \
+ VM_EXIT_LOAD_IA32_PAT | \
+ VM_EXIT_SAVE_IA32_EFER | \
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | \
+ VM_EXIT_LOAD_IA32_EFER | \
+ VM_EXIT_CLEAR_BNDCFGS | \
+ VM_EXIT_PT_CONCEAL_PIP | \
+ VM_EXIT_CLEAR_IA32_RTIT_CTL)
+
+#define KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL \
+ (PIN_BASED_EXT_INTR_MASK | \
+ PIN_BASED_NMI_EXITING)
+#define KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL \
+ (PIN_BASED_VIRTUAL_NMIS | \
+ PIN_BASED_POSTED_INTR | \
+ PIN_BASED_VMX_PREEMPTION_TIMER)
+
+#define __KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL \
+ (CPU_BASED_HLT_EXITING | \
+ CPU_BASED_CR3_LOAD_EXITING | \
+ CPU_BASED_CR3_STORE_EXITING | \
+ CPU_BASED_UNCOND_IO_EXITING | \
+ CPU_BASED_MOV_DR_EXITING | \
+ CPU_BASED_USE_TSC_OFFSETTING | \
+ CPU_BASED_MWAIT_EXITING | \
+ CPU_BASED_MONITOR_EXITING | \
+ CPU_BASED_INVLPG_EXITING | \
+ CPU_BASED_RDPMC_EXITING | \
+ CPU_BASED_INTR_WINDOW_EXITING)
+
+#ifdef CONFIG_X86_64
+ #define KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL \
+ (__KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL | \
+ CPU_BASED_CR8_LOAD_EXITING | \
+ CPU_BASED_CR8_STORE_EXITING)
+#else
+ #define KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL \
+ __KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL
+#endif
+
+#define KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL \
+ (CPU_BASED_RDTSC_EXITING | \
+ CPU_BASED_TPR_SHADOW | \
+ CPU_BASED_USE_IO_BITMAPS | \
+ CPU_BASED_MONITOR_TRAP_FLAG | \
+ CPU_BASED_USE_MSR_BITMAPS | \
+ CPU_BASED_NMI_WINDOW_EXITING | \
+ CPU_BASED_PAUSE_EXITING | \
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS | \
+ CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
+
+#define KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL 0
+#define KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL \
+ (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \
+ SECONDARY_EXEC_WBINVD_EXITING | \
+ SECONDARY_EXEC_ENABLE_VPID | \
+ SECONDARY_EXEC_ENABLE_EPT | \
+ SECONDARY_EXEC_UNRESTRICTED_GUEST | \
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING | \
+ SECONDARY_EXEC_DESC | \
+ SECONDARY_EXEC_ENABLE_RDTSCP | \
+ SECONDARY_EXEC_ENABLE_INVPCID | \
+ SECONDARY_EXEC_APIC_REGISTER_VIRT | \
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \
+ SECONDARY_EXEC_SHADOW_VMCS | \
+ SECONDARY_EXEC_ENABLE_XSAVES | \
+ SECONDARY_EXEC_RDSEED_EXITING | \
+ SECONDARY_EXEC_RDRAND_EXITING | \
+ SECONDARY_EXEC_ENABLE_PML | \
+ SECONDARY_EXEC_TSC_SCALING | \
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \
+ SECONDARY_EXEC_PT_USE_GPA | \
+ SECONDARY_EXEC_PT_CONCEAL_VMX | \
+ SECONDARY_EXEC_ENABLE_VMFUNC | \
+ SECONDARY_EXEC_BUS_LOCK_DETECTION | \
+ SECONDARY_EXEC_NOTIFY_VM_EXITING | \
+ SECONDARY_EXEC_ENCLS_EXITING)
+
+#define KVM_REQUIRED_VMX_TERTIARY_VM_EXEC_CONTROL 0
+#define KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL \
+ (TERTIARY_EXEC_IPI_VIRT)
+
+#define BUILD_CONTROLS_SHADOW(lname, uname, bits) \
+static inline void lname##_controls_set(struct vcpu_vmx *vmx, u##bits val) \
+{ \
+ if (vmx->loaded_vmcs->controls_shadow.lname != val) { \
+ vmcs_write##bits(uname, val); \
+ vmx->loaded_vmcs->controls_shadow.lname = val; \
+ } \
+} \
+static inline u##bits __##lname##_controls_get(struct loaded_vmcs *vmcs) \
+{ \
+ return vmcs->controls_shadow.lname; \
+} \
+static inline u##bits lname##_controls_get(struct vcpu_vmx *vmx) \
+{ \
+ return __##lname##_controls_get(vmx->loaded_vmcs); \
+} \
+static __always_inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u##bits val) \
+{ \
+ BUILD_BUG_ON(!(val & (KVM_REQUIRED_VMX_##uname | KVM_OPTIONAL_VMX_##uname))); \
+ lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \
+} \
+static __always_inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u##bits val) \
+{ \
+ BUILD_BUG_ON(!(val & (KVM_REQUIRED_VMX_##uname | KVM_OPTIONAL_VMX_##uname))); \
+ lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \
+}
+BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS, 32)
+BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS, 32)
+BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL, 32)
+BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL, 32)
+BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL, 32)
+BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64)
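+
+/*
+ * The invocations above generate, e.g., exec_controls_set()/get() and
+ * secondary_exec_controls_setbit()/clearbit(); the current value is
+ * shadowed in loaded_vmcs to avoid redundant VMREADs and VMWRITEs.
+ */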
+
+/*
+ * VMX_REGS_LAZY_LOAD_SET - The set of registers that will be updated in the
+ * cache on demand. Other registers not listed here are synced to
+ * the cache immediately after VM-Exit.
+ */
+#define VMX_REGS_LAZY_LOAD_SET ((1 << VCPU_REGS_RIP) | \
+ (1 << VCPU_REGS_RSP) | \
+ (1 << VCPU_EXREG_RFLAGS) | \
+ (1 << VCPU_EXREG_PDPTR) | \
+ (1 << VCPU_EXREG_SEGMENTS) | \
+ (1 << VCPU_EXREG_CR0) | \
+ (1 << VCPU_EXREG_CR3) | \
+ (1 << VCPU_EXREG_CR4) | \
+ (1 << VCPU_EXREG_EXIT_INFO_1) | \
+ (1 << VCPU_EXREG_EXIT_INFO_2))
+
+static inline unsigned long vmx_l1_guest_owned_cr0_bits(void)
+{
+ unsigned long bits = KVM_POSSIBLE_CR0_GUEST_BITS;
+
+ /*
+ * CR0.WP needs to be intercepted when KVM is shadowing legacy paging
+ * in order to construct shadow PTEs with the correct protections.
+ * Note! CR0.WP technically can be passed through to the guest if
+ * paging is disabled, but checking CR0.PG would generate a cyclical
+ * dependency of sorts due to forcing the caller to ensure CR0 holds
+ * the correct value prior to determining which CR0 bits can be owned
+ * by L1. Keep it simple and limit the optimization to EPT.
+ */
+ if (!enable_ept)
+ bits &= ~X86_CR0_WP;
+ return bits;
+}
+
+static __always_inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
+{
+ return container_of(kvm, struct kvm_vmx, kvm);
+}
+
+static __always_inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
+{
+ return container_of(vcpu, struct vcpu_vmx, vcpu);
+}
+
+static inline struct lbr_desc *vcpu_to_lbr_desc(struct kvm_vcpu *vcpu)
+{
+ return &to_vmx(vcpu)->lbr_desc;
+}
+
+static inline struct x86_pmu_lbr *vcpu_to_lbr_records(struct kvm_vcpu *vcpu)
+{
+ return &vcpu_to_lbr_desc(vcpu)->records;
+}
+
+static inline bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu)
+{
+ return !!vcpu_to_lbr_records(vcpu)->nr;
+}
+
+void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu);
+int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu);
+void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu);
+
+static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1))
+ vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ return vmx->exit_qualification;
+}
+
+static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2))
+ vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ return vmx->exit_intr_info;
+}
+
+struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags);
+void free_vmcs(struct vmcs *vmcs);
+int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
+void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
+void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs);
+
+static inline struct vmcs *alloc_vmcs(bool shadow)
+{
+ return alloc_vmcs_cpu(shadow, raw_smp_processor_id(),
+ GFP_KERNEL_ACCOUNT);
+}
+
+static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
+{
+ return secondary_exec_controls_get(vmx) &
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
+}
+
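+/*
+ * Without EPT, #PF must always be intercepted for shadow paging.  With EPT,
+ * #PF is intercepted only so KVM can emulate reserved-bit faults when the
+ * guest's MAXPHYADDR is smaller than the host's.
+ */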
+static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
+{
+ if (!enable_ept)
+ return true;
+
+ return allow_smaller_maxphyaddr && cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits;
+}
+
+static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu)
+{
+ return enable_unrestricted_guest && (!is_guest_mode(vcpu) ||
+ (secondary_exec_controls_get(to_vmx(vcpu)) &
+ SECONDARY_EXEC_UNRESTRICTED_GUEST));
+}
+
+bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu);
+static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu)
+{
+ return is_unrestricted_guest(vcpu) || __vmx_guest_state_valid(vcpu);
+}
+
+void dump_vmcs(struct kvm_vcpu *vcpu);
+
+static inline int vmx_get_instr_info_reg2(u32 vmx_instr_info)
+{
+ return (vmx_instr_info >> 28) & 0xf;
+}
+
+static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu)
+{
+ return lapic_in_kernel(vcpu) && enable_ipiv;
+}
+
+static inline bool guest_cpuid_has_evmcs(struct kvm_vcpu *vcpu)
+{
+ /*
+ * eVMCS is exposed to the guest if Hyper-V is enabled in CPUID and
+ * eVMCS has been explicitly enabled by userspace.
+ */
+ return vcpu->arch.hyperv_enabled &&
+ to_vmx(vcpu)->nested.enlightened_vmcs_enabled;
+}
+
+#endif /* __KVM_X86_VMX_H */
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
new file mode 100644
index 000000000..33af7b4c6
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -0,0 +1,369 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_INSN_H
+#define __KVM_X86_VMX_INSN_H
+
+#include <linux/nospec.h>
+
+#include <asm/vmx.h>
+
+#include "hyperv.h"
+#include "vmcs.h"
+#include "../x86.h"
+
+void vmread_error(unsigned long field);
+void vmwrite_error(unsigned long field, unsigned long value);
+void vmclear_error(struct vmcs *vmcs, u64 phys_addr);
+void vmptrld_error(struct vmcs *vmcs, u64 phys_addr);
+void invvpid_error(unsigned long ext, u16 vpid, gva_t gva);
+void invept_error(unsigned long ext, u64 eptp, gpa_t gpa);
+
+#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+/*
+ * The VMREAD error trampoline _always_ uses the stack to pass parameters, even
+ * for 64-bit targets. Preserving all registers allows the VMREAD inline asm
+ * blob to avoid clobbering GPRs, which in turn allows the compiler to better
+ * optimize sequences of VMREADs.
+ *
+ * Declare the trampoline as an opaque label as it's not safe to call from C
+ * code; there is no way to tell the compiler to pass params on the stack for
+ * 64-bit targets.
+ *
+ * void vmread_error_trampoline(unsigned long field, bool fault);
+ */
+extern unsigned long vmread_error_trampoline;
+
+/*
+ * The second VMREAD error trampoline, called from the assembly trampoline,
+ * exists primarily to enable instrumentation for the VM-Fail path.
+ */
+void vmread_error_trampoline2(unsigned long field, bool fault);
+
+#endif
+
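+/*
+ * VMCS field encodings: bits 14:13 give the field width (0 = 16-bit,
+ * 1 = 64-bit, 2 = 32-bit, 3 = natural width) and bit 0 selects the high
+ * half of a 64-bit field, hence the 0x6000/0x6001 masks below.
+ */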
+static __always_inline void vmcs_check16(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+ "16-bit accessor invalid for 64-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "16-bit accessor invalid for 64-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+ "16-bit accessor invalid for 32-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+ "16-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_check32(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+ "32-bit accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+ "32-bit accessor invalid for 64-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "32-bit accessor invalid for 64-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+ "32-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_check64(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+ "64-bit accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "64-bit accessor invalid for 64-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+ "64-bit accessor invalid for 32-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+ "64-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_checkl(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+ "Natural width accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+ "Natural width accessor invalid for 64-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "Natural width accessor invalid for 64-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+ "Natural width accessor invalid for 32-bit field");
+}
+
+static __always_inline unsigned long __vmcs_readl(unsigned long field)
+{
+ unsigned long value;
+
+#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+
+ asm_volatile_goto("1: vmread %[field], %[output]\n\t"
+ "jna %l[do_fail]\n\t"
+
+ _ASM_EXTABLE(1b, %l[do_exception])
+
+ : [output] "=r" (value)
+ : [field] "r" (field)
+ : "cc"
+ : do_fail, do_exception);
+
+ return value;
+
+do_fail:
+ instrumentation_begin();
+ vmread_error(field);
+ instrumentation_end();
+ return 0;
+
+do_exception:
+ kvm_spurious_fault();
+ return 0;
+
+#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
+
+ asm volatile("1: vmread %2, %1\n\t"
+ ".byte 0x3e\n\t" /* branch taken hint */
+ "ja 3f\n\t"
+
+ /*
+ * VMREAD failed. Push '0' for @fault, push the failing
+ * @field, and bounce through the trampoline to preserve
+ * volatile registers.
+ */
+ "xorl %k1, %k1\n\t"
+ "2:\n\t"
+ "push %1\n\t"
+ "push %2\n\t"
+ "call vmread_error_trampoline\n\t"
+
+ /*
+ * Unwind the stack. Note, the trampoline zeros out the
+ * memory for @fault so that the result is '0' on error.
+ */
+ "pop %2\n\t"
+ "pop %1\n\t"
+ "3:\n\t"
+
+ /* VMREAD faulted. As above, except push '1' for @fault. */
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %1)
+
+ : ASM_CALL_CONSTRAINT, "=&r"(value) : "r"(field) : "cc");
+ return value;
+
+#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
+}
+
+static __always_inline u16 vmcs_read16(unsigned long field)
+{
+ vmcs_check16(field);
+ if (kvm_is_using_evmcs())
+ return evmcs_read16(field);
+ return __vmcs_readl(field);
+}
+
+static __always_inline u32 vmcs_read32(unsigned long field)
+{
+ vmcs_check32(field);
+ if (kvm_is_using_evmcs())
+ return evmcs_read32(field);
+ return __vmcs_readl(field);
+}
+
+static __always_inline u64 vmcs_read64(unsigned long field)
+{
+ vmcs_check64(field);
+ if (kvm_is_using_evmcs())
+ return evmcs_read64(field);
+#ifdef CONFIG_X86_64
+ return __vmcs_readl(field);
+#else
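+	/* On 32-bit kernels, a 64-bit field is read as two 32-bit halves. */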
+ return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32);
+#endif
+}
+
+static __always_inline unsigned long vmcs_readl(unsigned long field)
+{
+ vmcs_checkl(field);
+ if (kvm_is_using_evmcs())
+ return evmcs_read64(field);
+ return __vmcs_readl(field);
+}
+
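+/*
+ * VMX instructions set CF on VMfailInvalid and ZF on VMfailValid, so a
+ * single JNA (jump if CF=1 or ZF=1) catches both failure cases.
+ */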
+#define vmx_asm1(insn, op1, error_args...) \
+do { \
+ asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \
+ ".byte 0x2e\n\t" /* branch not taken hint */ \
+ "jna %l[error]\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ : : op1 : "cc" : error, fault); \
+ return; \
+error: \
+ instrumentation_begin(); \
+ insn##_error(error_args); \
+ instrumentation_end(); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
+
+#define vmx_asm2(insn, op1, op2, error_args...) \
+do { \
+ asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \
+ ".byte 0x2e\n\t" /* branch not taken hint */ \
+ "jna %l[error]\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ : : op1, op2 : "cc" : error, fault); \
+ return; \
+error: \
+ instrumentation_begin(); \
+ insn##_error(error_args); \
+ instrumentation_end(); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
+
+static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
+{
+ vmx_asm2(vmwrite, "r"(field), "rm"(value), field, value);
+}
+
+static __always_inline void vmcs_write16(unsigned long field, u16 value)
+{
+ vmcs_check16(field);
+ if (kvm_is_using_evmcs())
+ return evmcs_write16(field, value);
+
+ __vmcs_writel(field, value);
+}
+
+static __always_inline void vmcs_write32(unsigned long field, u32 value)
+{
+ vmcs_check32(field);
+ if (kvm_is_using_evmcs())
+ return evmcs_write32(field, value);
+
+ __vmcs_writel(field, value);
+}
+
+static __always_inline void vmcs_write64(unsigned long field, u64 value)
+{
+ vmcs_check64(field);
+ if (kvm_is_using_evmcs())
+ return evmcs_write64(field, value);
+
+ __vmcs_writel(field, value);
+#ifndef CONFIG_X86_64
+ __vmcs_writel(field+1, value >> 32);
+#endif
+}
+
+static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
+{
+ vmcs_checkl(field);
+ if (kvm_is_using_evmcs())
+ return evmcs_write64(field, value);
+
+ __vmcs_writel(field, value);
+}
+
+static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
+ "vmcs_clear_bits does not support 64-bit fields");
+ if (kvm_is_using_evmcs())
+ return evmcs_write32(field, evmcs_read32(field) & ~mask);
+
+ __vmcs_writel(field, __vmcs_readl(field) & ~mask);
+}
+
+static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
+ "vmcs_set_bits does not support 64-bit fields");
+ if (kvm_is_using_evmcs())
+ return evmcs_write32(field, evmcs_read32(field) | mask);
+
+ __vmcs_writel(field, __vmcs_readl(field) | mask);
+}
+
+static inline void vmcs_clear(struct vmcs *vmcs)
+{
+ u64 phys_addr = __pa(vmcs);
+
+ vmx_asm1(vmclear, "m"(phys_addr), vmcs, phys_addr);
+}
+
+static inline void vmcs_load(struct vmcs *vmcs)
+{
+ u64 phys_addr = __pa(vmcs);
+
+ if (kvm_is_using_evmcs())
+ return evmcs_load(phys_addr);
+
+ vmx_asm1(vmptrld, "m"(phys_addr), vmcs, phys_addr);
+}
+
+static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
+{
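+	/* The memory operand uses the INVVPID descriptor layout from the SDM. */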
+ struct {
+ u64 vpid : 16;
+ u64 rsvd : 48;
+ u64 gva;
+ } operand = { vpid, 0, gva };
+
+ vmx_asm2(invvpid, "r"(ext), "m"(operand), ext, vpid, gva);
+}
+
+static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
+{
+ struct {
+ u64 eptp, gpa;
+ } operand = {eptp, gpa};
+
+ vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp, gpa);
+}
+
+static inline void vpid_sync_vcpu_single(int vpid)
+{
+ if (vpid == 0)
+ return;
+
+ __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
+}
+
+static inline void vpid_sync_vcpu_global(void)
+{
+ __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
+}
+
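+/*
+ * Prefer a single-context flush; fall back to a global flush when
+ * single-context INVVPID isn't supported.
+ */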
+static inline void vpid_sync_context(int vpid)
+{
+ if (cpu_has_vmx_invvpid_single())
+ vpid_sync_vcpu_single(vpid);
+ else if (vpid != 0)
+ vpid_sync_vcpu_global();
+}
+
+static inline void vpid_sync_vcpu_addr(int vpid, gva_t addr)
+{
+ if (vpid == 0)
+ return;
+
+ if (cpu_has_vmx_invvpid_individual_addr())
+ __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
+ else
+ vpid_sync_context(vpid);
+}
+
+static inline void ept_sync_global(void)
+{
+ __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
+}
+
+static inline void ept_sync_context(u64 eptp)
+{
+ if (cpu_has_vmx_invept_context())
+ __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
+ else
+ ept_sync_global();
+}
+
+#endif /* __KVM_X86_VMX_INSN_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
new file mode 100644
index 000000000..e179db7c1
--- /dev/null
+++ b/arch/x86/kvm/x86.c
@@ -0,0 +1,13695 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * derived from drivers/kvm/kvm_main.c
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Amit Shah <amit.shah@qumranet.com>
+ * Ben-Ami Yassour <benami@il.ibm.com>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include "irq.h"
+#include "ioapic.h"
+#include "mmu.h"
+#include "i8254.h"
+#include "tss.h"
+#include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
+#include "mmu/page_track.h"
+#include "x86.h"
+#include "cpuid.h"
+#include "pmu.h"
+#include "hyperv.h"
+#include "lapic.h"
+#include "xen.h"
+#include "smm.h"
+
+#include <linux/clocksource.h>
+#include <linux/interrupt.h>
+#include <linux/kvm.h>
+#include <linux/fs.h>
+#include <linux/vmalloc.h>
+#include <linux/export.h>
+#include <linux/moduleparam.h>
+#include <linux/mman.h>
+#include <linux/highmem.h>
+#include <linux/iommu.h>
+#include <linux/cpufreq.h>
+#include <linux/user-return-notifier.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <linux/uaccess.h>
+#include <linux/hash.h>
+#include <linux/pci.h>
+#include <linux/timekeeper_internal.h>
+#include <linux/pvclock_gtod.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/irqbypass.h>
+#include <linux/sched/stat.h>
+#include <linux/sched/isolation.h>
+#include <linux/mem_encrypt.h>
+#include <linux/entry-kvm.h>
+#include <linux/suspend.h>
+#include <linux/smp.h>
+
+#include <trace/events/ipi.h>
+#include <trace/events/kvm.h>
+
+#include <asm/debugreg.h>
+#include <asm/msr.h>
+#include <asm/desc.h>
+#include <asm/mce.h>
+#include <asm/pkru.h>
+#include <linux/kernel_stat.h>
+#include <asm/fpu/api.h>
+#include <asm/fpu/xcr.h>
+#include <asm/fpu/xstate.h>
+#include <asm/pvclock.h>
+#include <asm/div64.h>
+#include <asm/irq_remapping.h>
+#include <asm/mshyperv.h>
+#include <asm/hypervisor.h>
+#include <asm/tlbflush.h>
+#include <asm/intel_pt.h>
+#include <asm/emulate_prefix.h>
+#include <asm/sgx.h>
+#include <clocksource/hyperv_timer.h>
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+#define MAX_IO_MSRS 256
+#define KVM_MAX_MCE_BANKS 32
+
+struct kvm_caps kvm_caps __read_mostly = {
+ .supported_mce_cap = MCG_CTL_P | MCG_SER_P,
+};
+EXPORT_SYMBOL_GPL(kvm_caps);
+
+#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e))
+
+#define emul_to_vcpu(ctxt) \
+ ((struct kvm_vcpu *)(ctxt)->vcpu)
+
+/*
+ * EFER defaults:
+ * - enable syscall by default because it's emulated by KVM
+ * - enable LME and LMA by default on 64-bit KVM
+ */
+#ifdef CONFIG_X86_64
+static
+u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
+#else
+static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
+#endif
+
+static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
+
+#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)
+
+#define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE
+
+#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
+ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+
+static void update_cr8_intercept(struct kvm_vcpu *vcpu);
+static void process_nmi(struct kvm_vcpu *vcpu);
+static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+static void store_regs(struct kvm_vcpu *vcpu);
+static int sync_regs(struct kvm_vcpu *vcpu);
+static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu);
+
+static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
+static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
+
+static DEFINE_MUTEX(vendor_module_lock);
+struct kvm_x86_ops kvm_x86_ops __read_mostly;
+
+#define KVM_X86_OP(func) \
+ DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \
+ *(((struct kvm_x86_ops *)0)->func));
+#define KVM_X86_OP_OPTIONAL KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
+#include <asm/kvm-x86-ops.h>
+EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
+EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
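+
+/*
+ * For example, KVM_X86_OP(get_cpl) above expands to
+ * DEFINE_STATIC_CALL_NULL(kvm_x86_get_cpl, *(((struct kvm_x86_ops *)0)->get_cpl)),
+ * i.e. one NULL-initialized static call per kvm_x86_ops hook; the vendor
+ * module (VMX or SVM) patches in the real implementations when it loads.
+ */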
+
+static bool __read_mostly ignore_msrs = false;
+module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
+
+bool __read_mostly report_ignored_msrs = true;
+module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
+EXPORT_SYMBOL_GPL(report_ignored_msrs);
+
+unsigned int min_timer_period_us = 200;
+module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
+
+static bool __read_mostly kvmclock_periodic_sync = true;
+module_param(kvmclock_periodic_sync, bool, S_IRUGO);
+
+/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
+static u32 __read_mostly tsc_tolerance_ppm = 250;
+module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+
+/*
+ * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables
+ * adaptive tuning starting from default advancement of 1000ns. '0' disables
+ * advancement entirely. Any other value is used as-is and disables adaptive
+ * tuning, i.e. allows privileged userspace to set an exact advancement time.
+ */
+static int __read_mostly lapic_timer_advance_ns = -1;
+module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
+
+static bool __read_mostly vector_hashing = true;
+module_param(vector_hashing, bool, S_IRUGO);
+
+bool __read_mostly enable_vmware_backdoor = false;
+module_param(enable_vmware_backdoor, bool, S_IRUGO);
+EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
+
+/*
+ * Flags to manipulate forced emulation behavior (any non-zero value will
+ * enable forced emulation).
+ */
+#define KVM_FEP_CLEAR_RFLAGS_RF BIT(1)
+static int __read_mostly force_emulation_prefix;
+module_param(force_emulation_prefix, int, 0644);
+
+int __read_mostly pi_inject_timer = -1;
+module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
+
+/* Enable/disable PMU virtualization */
+bool __read_mostly enable_pmu = true;
+EXPORT_SYMBOL_GPL(enable_pmu);
+module_param(enable_pmu, bool, 0444);
+
+bool __read_mostly eager_page_split = true;
+module_param(eager_page_split, bool, 0644);
+
+/* Enable/disable SMT_RSB bug mitigation */
+static bool __read_mostly mitigate_smt_rsb;
+module_param(mitigate_smt_rsb, bool, 0444);
+
+/*
+ * Restoring the host value for MSRs that are only consumed when running in
+ * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
+ * returns to userspace, i.e. the kernel can run with the guest's value.
+ */
+#define KVM_MAX_NR_USER_RETURN_MSRS 16
+
+struct kvm_user_return_msrs {
+ struct user_return_notifier urn;
+ bool registered;
+ struct kvm_user_return_msr_values {
+ u64 host;
+ u64 curr;
+ } values[KVM_MAX_NR_USER_RETURN_MSRS];
+};
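+
+/*
+ * Illustrative usage sketch (the slot variable is hypothetical): vendor
+ * code registers an MSR once at setup time and then updates it cheaply on
+ * every vCPU load:
+ *
+ *   slot = kvm_add_user_return_msr(MSR_TSC_AUX);
+ *   ...
+ *   kvm_set_user_return_msr(slot, guest_val, -1ull);
+ *
+ * kvm_on_user_return() then restores the host value at most once per
+ * return to userspace instead of on every VM-Exit.
+ */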
+
+u32 __read_mostly kvm_nr_uret_msrs;
+EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs);
+static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
+static struct kvm_user_return_msrs __percpu *user_return_msrs;
+
+#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
+ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
+ | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
+ | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE)
+
+u64 __read_mostly host_efer;
+EXPORT_SYMBOL_GPL(host_efer);
+
+bool __read_mostly allow_smaller_maxphyaddr = false;
+EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
+
+bool __read_mostly enable_apicv = true;
+EXPORT_SYMBOL_GPL(enable_apicv);
+
+u64 __read_mostly host_xss;
+EXPORT_SYMBOL_GPL(host_xss);
+
+u64 __read_mostly host_arch_capabilities;
+EXPORT_SYMBOL_GPL(host_arch_capabilities);
+
+const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
+ KVM_GENERIC_VM_STATS(),
+ STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
+ STATS_DESC_COUNTER(VM, mmu_pte_write),
+ STATS_DESC_COUNTER(VM, mmu_pde_zapped),
+ STATS_DESC_COUNTER(VM, mmu_flooded),
+ STATS_DESC_COUNTER(VM, mmu_recycled),
+ STATS_DESC_COUNTER(VM, mmu_cache_miss),
+ STATS_DESC_ICOUNTER(VM, mmu_unsync),
+ STATS_DESC_ICOUNTER(VM, pages_4k),
+ STATS_DESC_ICOUNTER(VM, pages_2m),
+ STATS_DESC_ICOUNTER(VM, pages_1g),
+ STATS_DESC_ICOUNTER(VM, nx_lpage_splits),
+ STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size),
+ STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions)
+};
+
+const struct kvm_stats_header kvm_vm_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vm_stats_desc),
+};
+
+const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+ KVM_GENERIC_VCPU_STATS(),
+ STATS_DESC_COUNTER(VCPU, pf_taken),
+ STATS_DESC_COUNTER(VCPU, pf_fixed),
+ STATS_DESC_COUNTER(VCPU, pf_emulate),
+ STATS_DESC_COUNTER(VCPU, pf_spurious),
+ STATS_DESC_COUNTER(VCPU, pf_fast),
+ STATS_DESC_COUNTER(VCPU, pf_mmio_spte_created),
+ STATS_DESC_COUNTER(VCPU, pf_guest),
+ STATS_DESC_COUNTER(VCPU, tlb_flush),
+ STATS_DESC_COUNTER(VCPU, invlpg),
+ STATS_DESC_COUNTER(VCPU, exits),
+ STATS_DESC_COUNTER(VCPU, io_exits),
+ STATS_DESC_COUNTER(VCPU, mmio_exits),
+ STATS_DESC_COUNTER(VCPU, signal_exits),
+ STATS_DESC_COUNTER(VCPU, irq_window_exits),
+ STATS_DESC_COUNTER(VCPU, nmi_window_exits),
+ STATS_DESC_COUNTER(VCPU, l1d_flush),
+ STATS_DESC_COUNTER(VCPU, halt_exits),
+ STATS_DESC_COUNTER(VCPU, request_irq_exits),
+ STATS_DESC_COUNTER(VCPU, irq_exits),
+ STATS_DESC_COUNTER(VCPU, host_state_reload),
+ STATS_DESC_COUNTER(VCPU, fpu_reload),
+ STATS_DESC_COUNTER(VCPU, insn_emulation),
+ STATS_DESC_COUNTER(VCPU, insn_emulation_fail),
+ STATS_DESC_COUNTER(VCPU, hypercalls),
+ STATS_DESC_COUNTER(VCPU, irq_injections),
+ STATS_DESC_COUNTER(VCPU, nmi_injections),
+ STATS_DESC_COUNTER(VCPU, req_event),
+ STATS_DESC_COUNTER(VCPU, nested_run),
+ STATS_DESC_COUNTER(VCPU, directed_yield_attempted),
+ STATS_DESC_COUNTER(VCPU, directed_yield_successful),
+ STATS_DESC_COUNTER(VCPU, preemption_reported),
+ STATS_DESC_COUNTER(VCPU, preemption_other),
+ STATS_DESC_IBOOLEAN(VCPU, guest_mode),
+ STATS_DESC_COUNTER(VCPU, notify_window_exits),
+};
+
+const struct kvm_stats_header kvm_vcpu_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vcpu_stats_desc),
+};
+
+u64 __read_mostly host_xcr0;
+
+static struct kmem_cache *x86_emulator_cache;
+
+/*
+ * Called when the previous get/set of an MSR hit an invalid MSR.
+ * Return true if the failed MSR access should be ignored/silenced.
+ */
+static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write)
+{
+ const char *op = write ? "wrmsr" : "rdmsr";
+
+ if (ignore_msrs) {
+ if (report_ignored_msrs)
+ kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
+ op, msr, data);
+ /* Mask the error */
+ return true;
+ } else {
+ kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
+ op, msr, data);
+ return false;
+ }
+}
+
+static struct kmem_cache *kvm_alloc_emulator_cache(void)
+{
+ unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
+ unsigned int size = sizeof(struct x86_emulate_ctxt);
+
+ return kmem_cache_create_usercopy("x86_emulator", size,
+ __alignof__(struct x86_emulate_ctxt),
+ SLAB_ACCOUNT, useroffset,
+ size - useroffset, NULL);
+}
+
+static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
+
+static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
+{
+ int i;
+
+ for (i = 0; i < ASYNC_PF_PER_VCPU; i++)
+ vcpu->arch.apf.gfns[i] = ~0;
+}
+
+static void kvm_on_user_return(struct user_return_notifier *urn)
+{
+ unsigned slot;
+ struct kvm_user_return_msrs *msrs
+ = container_of(urn, struct kvm_user_return_msrs, urn);
+ struct kvm_user_return_msr_values *values;
+ unsigned long flags;
+
+ /*
+ * Disable irqs at this point since the following code could be
+ * interrupted and executed through kvm_arch_hardware_disable().
+ */
+ local_irq_save(flags);
+ if (msrs->registered) {
+ msrs->registered = false;
+ user_return_notifier_unregister(urn);
+ }
+ local_irq_restore(flags);
+ for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
+ values = &msrs->values[slot];
+ if (values->host != values->curr) {
+ wrmsrl(kvm_uret_msrs_list[slot], values->host);
+ values->curr = values->host;
+ }
+ }
+}
+
+static int kvm_probe_user_return_msr(u32 msr)
+{
+ u64 val;
+ int ret;
+
+ preempt_disable();
+ ret = rdmsrl_safe(msr, &val);
+ if (ret)
+ goto out;
+ ret = wrmsrl_safe(msr, val);
+out:
+ preempt_enable();
+ return ret;
+}
+
+int kvm_add_user_return_msr(u32 msr)
+{
+ BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS);
+
+ if (kvm_probe_user_return_msr(msr))
+ return -1;
+
+ kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr;
+ return kvm_nr_uret_msrs++;
+}
+EXPORT_SYMBOL_GPL(kvm_add_user_return_msr);
+
+int kvm_find_user_return_msr(u32 msr)
+{
+ int i;
+
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ if (kvm_uret_msrs_list[i] == msr)
+ return i;
+ }
+ return -1;
+}
+EXPORT_SYMBOL_GPL(kvm_find_user_return_msr);
+
+static void kvm_user_return_msr_cpu_online(void)
+{
+ unsigned int cpu = smp_processor_id();
+ struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
+ u64 value;
+ int i;
+
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ rdmsrl_safe(kvm_uret_msrs_list[i], &value);
+ msrs->values[i].host = value;
+ msrs->values[i].curr = value;
+ }
+}
+
+int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
+{
+ unsigned int cpu = smp_processor_id();
+ struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
+ int err;
+
+ value = (value & mask) | (msrs->values[slot].host & ~mask);
+ if (value == msrs->values[slot].curr)
+ return 0;
+ err = wrmsrl_safe(kvm_uret_msrs_list[slot], value);
+ if (err)
+ return 1;
+
+ msrs->values[slot].curr = value;
+ if (!msrs->registered) {
+ msrs->urn.on_user_return = kvm_on_user_return;
+ user_return_notifier_register(&msrs->urn);
+ msrs->registered = true;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
+
+static void drop_user_return_notifiers(void)
+{
+ unsigned int cpu = smp_processor_id();
+ struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
+
+ if (msrs->registered)
+ kvm_on_user_return(&msrs->urn);
+}
+
+u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.apic_base;
+}
+
+enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
+{
+ return kvm_apic_mode(kvm_get_apic_base(vcpu));
+}
+EXPORT_SYMBOL_GPL(kvm_get_apic_mode);
+
+int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
+ enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
+ u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff |
+ (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
+
+ if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
+ return 1;
+ if (!msr_info->host_initiated) {
+ if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
+ return 1;
+ if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
+ return 1;
+ }
+
+ kvm_lapic_set_base(vcpu, msr_info->data);
+ kvm_recalculate_apic_map(vcpu->kvm);
+ return 0;
+}
+
+/*
+ * Handle a fault on a hardware virtualization (VMX or SVM) instruction.
+ *
+ * Hardware virtualization extension instructions may fault if a reboot turns
+ * off virtualization while processes are running. Usually after catching the
+ * fault we just panic; during reboot instead the instruction is ignored.
+ */
+noinstr void kvm_spurious_fault(void)
+{
+ /* Fault while not rebooting. We want the trace. */
+ BUG_ON(!kvm_rebooting);
+}
+EXPORT_SYMBOL_GPL(kvm_spurious_fault);
+
+#define EXCPT_BENIGN 0
+#define EXCPT_CONTRIBUTORY 1
+#define EXCPT_PF 2
+
+static int exception_class(int vector)
+{
+ switch (vector) {
+ case PF_VECTOR:
+ return EXCPT_PF;
+ case DE_VECTOR:
+ case TS_VECTOR:
+ case NP_VECTOR:
+ case SS_VECTOR:
+ case GP_VECTOR:
+ return EXCPT_CONTRIBUTORY;
+ default:
+ break;
+ }
+ return EXCPT_BENIGN;
+}
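+
+/*
+ * For example (see kvm_multiple_exception() below): a contributory fault
+ * such as #GP raised while delivering another contributory fault, or any
+ * non-benign fault raised while delivering a #PF, is promoted to #DF, and
+ * a fault raised while delivering #DF escalates to a triple-fault shutdown;
+ * other combinations simply replace the pending exception in the hope that
+ * re-execution regenerates it.
+ */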
+
+#define EXCPT_FAULT 0
+#define EXCPT_TRAP 1
+#define EXCPT_ABORT 2
+#define EXCPT_INTERRUPT 3
+#define EXCPT_DB 4
+
+static int exception_type(int vector)
+{
+ unsigned int mask;
+
+ if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
+ return EXCPT_INTERRUPT;
+
+ mask = 1 << vector;
+
+ /*
+ * #DBs can be trap-like or fault-like, the caller must check other CPU
+ * state, e.g. DR6, to determine whether a #DB is a trap or fault.
+ */
+ if (mask & (1 << DB_VECTOR))
+ return EXCPT_DB;
+
+ if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR)))
+ return EXCPT_TRAP;
+
+ if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
+ return EXCPT_ABORT;
+
+ /* Reserved exceptions will result in a fault. */
+ return EXCPT_FAULT;
+}
+
+void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
+ struct kvm_queued_exception *ex)
+{
+ if (!ex->has_payload)
+ return;
+
+ switch (ex->vector) {
+ case DB_VECTOR:
+ /*
+ * "Certain debug exceptions may clear bit 0-3. The
+ * remaining contents of the DR6 register are never
+ * cleared by the processor".
+ */
+ vcpu->arch.dr6 &= ~DR_TRAP_BITS;
+ /*
+ * In order to reflect the #DB exception payload in guest
+ * dr6, three components need to be considered: active low
+ * bits, FIXED_1 bits and active high bits (e.g. DR6_BD,
+ * DR6_BS and DR6_BT).
+ * DR6_ACTIVE_LOW contains the FIXED_1 and active low bits.
+ * In the target guest dr6:
+ * FIXED_1 bits should always be set.
+ * Active low bits should be cleared if 1-setting in payload.
+ * Active high bits should be set if 1-setting in payload.
+ *
+ * Note, the payload is compatible with the pending debug
+ * exceptions/exit qualification under VMX: bits that are
+ * active low in DR6 are active high in the payload, so they
+ * need to be flipped for DR6.
+ */
+ vcpu->arch.dr6 |= DR6_ACTIVE_LOW;
+ vcpu->arch.dr6 |= ex->payload;
+ vcpu->arch.dr6 ^= ex->payload & DR6_ACTIVE_LOW;
+
+ /*
+ * The #DB payload is defined as compatible with the 'pending
+ * debug exceptions' field under VMX, not DR6. While bit 12 is
+ * defined in the 'pending debug exceptions' field (enabled
+ * breakpoint), it is reserved and must be zero in DR6.
+ */
+ vcpu->arch.dr6 &= ~BIT(12);
+ break;
+ case PF_VECTOR:
+ vcpu->arch.cr2 = ex->payload;
+ break;
+ }
+
+ ex->has_payload = false;
+ ex->payload = 0;
+}
+EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
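+
+/*
+ * Worked example for the #DB logic above: a payload with only DR6_RTM
+ * (bit 16, active low) set is first ORed into dr6 via DR6_ACTIVE_LOW and
+ * then cleared by the XOR, so the guest reads DR6.RTM=0, i.e. an
+ * RTM-related #DB. A payload with only B0 (bit 0, active high) set lies
+ * outside DR6_ACTIVE_LOW, so the XOR leaves B0 set in dr6.
+ */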
+
+static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector,
+ bool has_error_code, u32 error_code,
+ bool has_payload, unsigned long payload)
+{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
+
+ ex->vector = vector;
+ ex->injected = false;
+ ex->pending = true;
+ ex->has_error_code = has_error_code;
+ ex->error_code = error_code;
+ ex->has_payload = has_payload;
+ ex->payload = payload;
+}
+
+/* Forcibly leave nested mode in cases like a vCPU reset */
+static void kvm_leave_nested(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops.nested_ops->leave_nested(vcpu);
+}
+
+static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
+ unsigned nr, bool has_error, u32 error_code,
+ bool has_payload, unsigned long payload, bool reinject)
+{
+ u32 prev_nr;
+ int class1, class2;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
+ * If the exception is destined for L2 and isn't being reinjected,
+ * morph it to a VM-Exit if L1 wants to intercept the exception. A
+ * previously injected exception is not checked because it was checked
+ * when it was originally queued, and re-checking is incorrect if _L1_
+ * injected the exception, in which case it's exempt from interception.
+ */
+ if (!reinject && is_guest_mode(vcpu) &&
+ kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) {
+ kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code,
+ has_payload, payload);
+ return;
+ }
+
+ if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
+ queue:
+ if (reinject) {
+ /*
+ * On VM-Entry, an exception can be pending if and only
+ * if event injection was blocked by nested_run_pending.
+ * In that case, however, vcpu_enter_guest() requests an
+ * immediate exit, and the guest shouldn't proceed far
+ * enough to need reinjection.
+ */
+ WARN_ON_ONCE(kvm_is_exception_pending(vcpu));
+ vcpu->arch.exception.injected = true;
+ if (WARN_ON_ONCE(has_payload)) {
+ /*
+ * A reinjected event has already
+ * delivered its payload.
+ */
+ has_payload = false;
+ payload = 0;
+ }
+ } else {
+ vcpu->arch.exception.pending = true;
+ vcpu->arch.exception.injected = false;
+ }
+ vcpu->arch.exception.has_error_code = has_error;
+ vcpu->arch.exception.vector = nr;
+ vcpu->arch.exception.error_code = error_code;
+ vcpu->arch.exception.has_payload = has_payload;
+ vcpu->arch.exception.payload = payload;
+ if (!is_guest_mode(vcpu))
+ kvm_deliver_exception_payload(vcpu,
+ &vcpu->arch.exception);
+ return;
+ }
+
+ /* An exception is already pending or injected; check for conflicts. */
+ prev_nr = vcpu->arch.exception.vector;
+ if (prev_nr == DF_VECTOR) {
+ /* triple fault -> shutdown */
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return;
+ }
+ class1 = exception_class(prev_nr);
+ class2 = exception_class(nr);
+ if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) ||
+ (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
+ /*
+ * Synthesize #DF. Clear the previously injected or pending
+ * exception so as not to incorrectly trigger shutdown.
+ */
+ vcpu->arch.exception.injected = false;
+ vcpu->arch.exception.pending = false;
+
+ kvm_queue_exception_e(vcpu, DF_VECTOR, 0);
+ } else {
+ /*
+ * Replace the previous exception with the new one in the
+ * hope that instruction re-execution will regenerate the
+ * lost exception.
+ */
+ goto queue;
+ }
+}
+
+void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
+{
+ kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
+}
+EXPORT_SYMBOL_GPL(kvm_queue_exception);
+
+void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
+{
+ kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
+}
+EXPORT_SYMBOL_GPL(kvm_requeue_exception);
+
+void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
+ unsigned long payload)
+{
+ kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
+}
+EXPORT_SYMBOL_GPL(kvm_queue_exception_p);
+
+static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
+ u32 error_code, unsigned long payload)
+{
+ kvm_multiple_exception(vcpu, nr, true, error_code,
+ true, payload, false);
+}
+
+int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
+{
+ if (err)
+ kvm_inject_gp(vcpu, 0);
+ else
+ return kvm_skip_emulated_instruction(vcpu);
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
+
+static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err)
+{
+ if (err) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE | EMULTYPE_SKIP |
+ EMULTYPE_COMPLETE_USER_EXIT);
+}
+
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+{
+ ++vcpu->stat.pf_guest;
+
+ /*
+ * Async #PF in L2 is always forwarded to L1 as a VM-Exit regardless of
+ * whether or not L1 wants to intercept "regular" #PF.
+ */
+ if (is_guest_mode(vcpu) && fault->async_page_fault)
+ kvm_queue_exception_vmexit(vcpu, PF_VECTOR,
+ true, fault->error_code,
+ true, fault->address);
+ else
+ kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
+ fault->address);
+}
+
+void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ struct kvm_mmu *fault_mmu;
+
+ WARN_ON_ONCE(fault->vector != PF_VECTOR);
+
+ fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
+ vcpu->arch.walk_mmu;
+
+ /*
+ * Invalidate the TLB entry for the faulting address, if one exists,
+ * else the access will fault indefinitely (and to match hardware
+ * behavior).
+ */
+ if ((fault->error_code & PFERR_PRESENT_MASK) &&
+ !(fault->error_code & PFERR_RSVD_MASK))
+ kvm_mmu_invalidate_addr(vcpu, fault_mmu, fault->address,
+ KVM_MMU_ROOT_CURRENT);
+
+ fault_mmu->inject_page_fault(vcpu, fault);
+}
+EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
+
+void kvm_inject_nmi(struct kvm_vcpu *vcpu)
+{
+ atomic_inc(&vcpu->arch.nmi_queued);
+ kvm_make_request(KVM_REQ_NMI, vcpu);
+}
+
+void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
+{
+ kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
+}
+EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
+
+void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
+{
+ kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
+}
+EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
+
+/*
+ * Check whether CPL <= required_cpl; if so, return true. Otherwise queue
+ * a #GP and return false.
+ */
+bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
+{
+ if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl)
+ return true;
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return false;
+}
+
+bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
+{
+ if ((dr != 4 && dr != 5) || !kvm_is_cr4_bit_set(vcpu, X86_CR4_DE))
+ return true;
+
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return false;
+}
+EXPORT_SYMBOL_GPL(kvm_require_dr);
+
+static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
+}
+
+/*
+ * Load the PAE PDPTRs. Return 1 if they are all valid, 0 otherwise.
+ */
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+ gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
+ gpa_t real_gpa;
+ int i;
+ int ret;
+ u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
+
+ /*
+ * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated
+ * to an L1 GPA.
+ */
+ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn),
+ PFERR_USER_MASK | PFERR_WRITE_MASK, NULL);
+ if (real_gpa == INVALID_GPA)
+ return 0;
+
+ /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(real_gpa), pdpte,
+ cr3 & GENMASK(11, 5), sizeof(pdpte));
+ if (ret < 0)
+ return 0;
+
+ for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
+ if ((pdpte[i] & PT_PRESENT_MASK) &&
+ (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
+ return 0;
+ }
+ }
+
+ /*
+ * Marking VCPU_EXREG_PDPTR dirty doesn't work for !tdp_enabled.
+ * Shadow page roots need to be reconstructed instead.
+ */
+ if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)))
+ kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT);
+
+ memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+ kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
+ vcpu->arch.pdptrs_from_userspace = false;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(load_pdptrs);
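+
+/*
+ * For example, CR3 = 0x12345e0 yields pdpt_gfn = 0x1234, and the four
+ * 8-byte PDPTEs are read from offset 0x5e0 (CR3 bits 11:5) within that
+ * page.
+ */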
+
+static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+#ifdef CONFIG_X86_64
+ if (cr0 & 0xffffffff00000000UL)
+ return false;
+#endif
+
+ if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
+ return false;
+
+ if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
+ return false;
+
+ return static_call(kvm_x86_is_valid_cr0)(vcpu, cr0);
+}
+
+void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
+{
+ /*
+ * CR0.WP is incorporated into the MMU role, but only for non-nested,
+ * indirect shadow MMUs. If paging is disabled, no updates are needed
+ * as there are no permission bits to emulate. If TDP is enabled, the
+ * MMU's metadata needs to be updated, e.g. so that emulating guest
+ * translations does the right thing, but there's no need to unload the
+ * root as CR0.WP doesn't affect SPTEs.
+ */
+ if ((cr0 ^ old_cr0) == X86_CR0_WP) {
+ if (!(cr0 & X86_CR0_PG))
+ return;
+
+ if (tdp_enabled) {
+ kvm_init_mmu(vcpu);
+ return;
+ }
+ }
+
+ if ((cr0 ^ old_cr0) & X86_CR0_PG) {
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_async_pf_hash_reset(vcpu);
+
+ /*
+ * Clearing CR0.PG is defined to flush the TLB from the guest's
+ * perspective.
+ */
+ if (!(cr0 & X86_CR0_PG))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ }
+
+ if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
+ kvm_mmu_reset_context(vcpu);
+
+ if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
+ kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
+ !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
+ kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
+}
+EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
+
+int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ unsigned long old_cr0 = kvm_read_cr0(vcpu);
+
+ if (!kvm_is_valid_cr0(vcpu, cr0))
+ return 1;
+
+ cr0 |= X86_CR0_ET;
+
+ /* Writes to CR0 reserved bits are ignored, even on Intel. */
+ cr0 &= ~CR0_RESERVED_BITS;
+
+#ifdef CONFIG_X86_64
+ if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
+ (cr0 & X86_CR0_PG)) {
+ int cs_db, cs_l;
+
+ if (!is_pae(vcpu))
+ return 1;
+ static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
+ if (cs_l)
+ return 1;
+ }
+#endif
+ if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
+ is_pae(vcpu) && ((cr0 ^ old_cr0) & X86_CR0_PDPTR_BITS) &&
+ !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
+ return 1;
+
+ if (!(cr0 & X86_CR0_PG) &&
+ (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)))
+ return 1;
+
+ static_call(kvm_x86_set_cr0)(vcpu, cr0);
+
+ kvm_post_set_cr0(vcpu, old_cr0, cr0);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_cr0);
+
+void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
+{
+ (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
+}
+EXPORT_SYMBOL_GPL(kvm_lmsw);
+
+void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.guest_state_protected)
+ return;
+
+ if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
+ if (vcpu->arch.xcr0 != host_xcr0)
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
+
+ if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
+ vcpu->arch.ia32_xss != host_xss)
+ wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
+ }
+
+ if (cpu_feature_enabled(X86_FEATURE_PKU) &&
+ vcpu->arch.pkru != vcpu->arch.host_pkru &&
+ ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
+ kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)))
+ write_pkru(vcpu->arch.pkru);
+}
+EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
+
+void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.guest_state_protected)
+ return;
+
+ if (cpu_feature_enabled(X86_FEATURE_PKU) &&
+ ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
+ kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) {
+ vcpu->arch.pkru = rdpkru();
+ if (vcpu->arch.pkru != vcpu->arch.host_pkru)
+ write_pkru(vcpu->arch.host_pkru);
+ }
+
+ if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
+ if (vcpu->arch.xcr0 != host_xcr0)
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
+
+ if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
+ vcpu->arch.ia32_xss != host_xss)
+ wrmsrl(MSR_IA32_XSS, host_xss);
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
+
+#ifdef CONFIG_X86_64
+static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC;
+}
+#endif
+
+static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+{
+ u64 xcr0 = xcr;
+ u64 old_xcr0 = vcpu->arch.xcr0;
+ u64 valid_bits;
+
+ /* Only XCR_XFEATURE_ENABLED_MASK (xcr0) is supported for now. */
+ if (index != XCR_XFEATURE_ENABLED_MASK)
+ return 1;
+ if (!(xcr0 & XFEATURE_MASK_FP))
+ return 1;
+ if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
+ return 1;
+
+ /*
+ * Do not allow the guest to set bits that we do not support
+ * saving. However, xcr0 bit 0 is always set, even if the
+ * emulated CPU does not support XSAVE (see kvm_vcpu_reset()).
+ */
+ valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
+ if (xcr0 & ~valid_bits)
+ return 1;
+
+ if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
+ (!(xcr0 & XFEATURE_MASK_BNDCSR)))
+ return 1;
+
+ if (xcr0 & XFEATURE_MASK_AVX512) {
+ if (!(xcr0 & XFEATURE_MASK_YMM))
+ return 1;
+ if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
+ return 1;
+ }
+
+ if ((xcr0 & XFEATURE_MASK_XTILE) &&
+ ((xcr0 & XFEATURE_MASK_XTILE) != XFEATURE_MASK_XTILE))
+ return 1;
+
+ vcpu->arch.xcr0 = xcr0;
+
+ if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
+ kvm_update_cpuid_runtime(vcpu);
+ return 0;
+}
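+
+/*
+ * For example, xcr0 = FP|SSE|YMM (0x7) passes the checks above, whereas
+ * xcr0 = FP|YMM (0x5) is rejected because YMM requires SSE, and any value
+ * with XFEATURE_MASK_FP clear is rejected outright.
+ */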
+
+int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
+{
+ /* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. */
+ if (static_call(kvm_x86_get_cpl)(vcpu) != 0 ||
+ __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv);
+
+bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ if (cr4 & cr4_reserved_bits)
+ return false;
+
+ if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4);
+
+static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ return __kvm_is_valid_cr4(vcpu, cr4) &&
+ static_call(kvm_x86_is_valid_cr4)(vcpu, cr4);
+}
+
+void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
+{
+ if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS)
+ kvm_mmu_reset_context(vcpu);
+
+ /*
+ * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB
+ * according to the SDM; however, stale prev_roots could be reused
+ * incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we
+ * free them all. This is *not* a superset of KVM_REQ_TLB_FLUSH_GUEST
+ * or KVM_REQ_TLB_FLUSH_CURRENT, because the hardware TLB is not flushed,
+ * so fall through.
+ */
+ if (!tdp_enabled &&
+ (cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE))
+ kvm_mmu_unload(vcpu);
+
+ /*
+ * The TLB has to be flushed for all PCIDs if any of the following
+ * (architecturally required) changes happen:
+ * - CR4.PCIDE is changed from 1 to 0
+ * - CR4.PGE is toggled
+ *
+ * This is a superset of KVM_REQ_TLB_FLUSH_CURRENT.
+ */
+ if (((cr4 ^ old_cr4) & X86_CR4_PGE) ||
+ (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+
+ /*
+ * The TLB has to be flushed for the current PCID if any of the
+ * following (architecturally required) changes happen:
+ * - CR4.SMEP is changed from 0 to 1
+ * - CR4.PAE is toggled
+ */
+ else if (((cr4 ^ old_cr4) & X86_CR4_PAE) ||
+ ((cr4 & X86_CR4_SMEP) && !(old_cr4 & X86_CR4_SMEP)))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_post_set_cr4);
+
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long old_cr4 = kvm_read_cr4(vcpu);
+
+ if (!kvm_is_valid_cr4(vcpu, cr4))
+ return 1;
+
+ if (is_long_mode(vcpu)) {
+ if (!(cr4 & X86_CR4_PAE))
+ return 1;
+ if ((cr4 ^ old_cr4) & X86_CR4_LA57)
+ return 1;
+ } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
+ && ((cr4 ^ old_cr4) & X86_CR4_PDPTR_BITS)
+ && !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
+ return 1;
+
+ if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
+ /* PCID cannot be enabled when CR3[11:0] != 000H or EFER.LMA = 0. */
+ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
+ return 1;
+ }
+
+ static_call(kvm_x86_set_cr4)(vcpu, cr4);
+
+ kvm_post_set_cr4(vcpu, old_cr4, cr4);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_cr4);
+
+static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ unsigned long roots_to_free = 0;
+ int i;
+
+ /*
+ * MOV CR3 and INVPCID are usually not intercepted when using TDP, but
+ * this is reachable when running EPT=1 and unrestricted_guest=0, and
+ * also via the emulator. KVM's TDP page tables are not in the scope of
+ * the invalidation, but the guest's TLB entries need to be flushed as
+ * the CPU may have cached entries in its TLB for the target PCID.
+ */
+ if (unlikely(tdp_enabled)) {
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ return;
+ }
+
+ /*
+ * If neither the current CR3 nor any of the prev_roots use the given
+ * PCID, then nothing needs to be done here because a resync will
+ * happen anyway before switching to any other CR3.
+ */
+ if (kvm_get_active_pcid(vcpu) == pcid) {
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ }
+
+ /*
+ * If PCID is disabled, there is no need to free prev_roots even if the
+ * PCIDs for them are also 0, because MOV to CR3 always flushes the TLB
+ * with PCIDE=0.
+ */
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE))
+ return;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+ kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
+}
+
+int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ bool skip_tlb_flush = false;
+ unsigned long pcid = 0;
+#ifdef CONFIG_X86_64
+ if (kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)) {
+ skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
+ cr3 &= ~X86_CR3_PCID_NOFLUSH;
+ pcid = cr3 & X86_CR3_PCID_MASK;
+ }
+#endif
+
+ /* PDPTRs are always reloaded for PAE paging. */
+ if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu))
+ goto handle_tlb_flush;
+
+ /*
+ * Do not condition the GPA check on long mode, this helper is used to
+ * stuff CR3, e.g. for RSM emulation, and there is no guarantee that
+ * the current vCPU mode is accurate.
+ */
+ if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
+ return 1;
+
+ if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3))
+ return 1;
+
+ if (cr3 != kvm_read_cr3(vcpu))
+ kvm_mmu_new_pgd(vcpu, cr3);
+
+ vcpu->arch.cr3 = cr3;
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+ /* Do not call post_set_cr3, we do not get here for confidential guests. */
+
+handle_tlb_flush:
+ /*
+ * A load of CR3 that flushes the TLB flushes only the current PCID,
+ * even if PCID is disabled, in which case PCID=0 is flushed. It's a
+ * moot point in the end because _disabling_ PCID will flush all PCIDs,
+ * and it's impossible to use a non-zero PCID when PCID is disabled,
+ * i.e. only PCID=0 can be relevant.
+ */
+ if (!skip_tlb_flush)
+ kvm_invalidate_pcid(vcpu, pcid);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_cr3);
+
+int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+ if (cr8 & CR8_RESERVED_BITS)
+ return 1;
+ if (lapic_in_kernel(vcpu))
+ kvm_lapic_set_tpr(vcpu, cr8);
+ else
+ vcpu->arch.cr8 = cr8;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_cr8);
+
+unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
+{
+ if (lapic_in_kernel(vcpu))
+ return kvm_lapic_get_cr8(vcpu);
+ else
+ return vcpu->arch.cr8;
+}
+EXPORT_SYMBOL_GPL(kvm_get_cr8);
+
+static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
+{
+ int i;
+
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
+ for (i = 0; i < KVM_NR_DB_REGS; i++)
+ vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+ }
+}
+
+void kvm_update_dr7(struct kvm_vcpu *vcpu)
+{
+ unsigned long dr7;
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ dr7 = vcpu->arch.guest_debug_dr7;
+ else
+ dr7 = vcpu->arch.dr7;
+ static_call(kvm_x86_set_dr7)(vcpu, dr7);
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
+ if (dr7 & DR7_BP_EN_MASK)
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
+}
+EXPORT_SYMBOL_GPL(kvm_update_dr7);
+
+static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
+{
+ u64 fixed = DR6_FIXED_1;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
+ fixed |= DR6_RTM;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
+ fixed |= DR6_BUS_LOCK;
+ return fixed;
+}
+
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+{
+ size_t size = ARRAY_SIZE(vcpu->arch.db);
+
+ switch (dr) {
+ case 0 ... 3:
+ vcpu->arch.db[array_index_nospec(dr, size)] = val;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+ vcpu->arch.eff_db[dr] = val;
+ break;
+ case 4:
+ case 6:
+ if (!kvm_dr6_valid(val))
+ return 1; /* #GP */
+ vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
+ break;
+ case 5:
+ default: /* 7 */
+ if (!kvm_dr7_valid(val))
+ return 1; /* #GP */
+ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
+ kvm_update_dr7(vcpu);
+ break;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_dr);
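+
+/*
+ * Note on the case labels above: with CR4.DE=0, accesses to DR4/DR5 are
+ * aliased to DR6/DR7; callers are expected to have rejected the CR4.DE=1
+ * case via kvm_require_dr(), which is why cases 4/6 and 5/7 share a
+ * handler here and in kvm_get_dr() below.
+ */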
+
+void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+{
+ size_t size = ARRAY_SIZE(vcpu->arch.db);
+
+ switch (dr) {
+ case 0 ... 3:
+ *val = vcpu->arch.db[array_index_nospec(dr, size)];
+ break;
+ case 4:
+ case 6:
+ *val = vcpu->arch.dr6;
+ break;
+ case 5:
+ default: /* 7 */
+ *val = vcpu->arch.dr7;
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_get_dr);
+
+int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
+{
+ u32 ecx = kvm_rcx_read(vcpu);
+ u64 data;
+
+ if (kvm_pmu_rdpmc(vcpu, ecx, &data)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ kvm_rax_write(vcpu, (u32)data);
+ kvm_rdx_write(vcpu, data >> 32);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
+
+/*
+ * The three MSR lists (msrs_to_save, emulated_msrs, msr_based_features) track
+ * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS,
+ * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that
+ * require host support, i.e. should be probed via RDMSR. emulated_msrs holds
+ * MSRs that KVM emulates without strictly requiring host support.
+ * msr_based_features holds MSRs that enumerate features, i.e. are effectively
+ * CPUID leafs. Note, msr_based_features isn't mutually exclusive with
+ * msrs_to_save and emulated_msrs.
+ */
+
+static const u32 msrs_to_save_base[] = {
+ MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+ MSR_STAR,
+#ifdef CONFIG_X86_64
+ MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
+#endif
+ MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+ MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
+ MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL,
+ MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
+ MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
+ MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
+ MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
+ MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
+ MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
+ MSR_IA32_UMWAIT_CONTROL,
+
+ MSR_IA32_XFD, MSR_IA32_XFD_ERR,
+};
+
+static const u32 msrs_to_save_pmu[] = {
+ MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
+ MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
+ MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
+ MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
+ MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
+
+ /* The number of these MSRs should match KVM_INTEL_PMC_MAX_GENERIC. */
+ MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
+ MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
+ MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
+ MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
+ MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
+
+ MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
+ MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
+
+ /* The number of these MSRs should match KVM_AMD_PMC_MAX_GENERIC. */
+ MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
+ MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
+ MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
+ MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
+
+ MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
+ MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
+ MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
+};
+
+static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
+ ARRAY_SIZE(msrs_to_save_pmu)];
+static unsigned num_msrs_to_save;
+
+static const u32 emulated_msrs_all[] = {
+ MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
+ HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
+ HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
+ HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
+ HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
+ HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
+ HV_X64_MSR_RESET,
+ HV_X64_MSR_VP_INDEX,
+ HV_X64_MSR_VP_RUNTIME,
+ HV_X64_MSR_SCONTROL,
+ HV_X64_MSR_STIMER0_CONFIG,
+ HV_X64_MSR_VP_ASSIST_PAGE,
+ HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
+ HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL,
+ HV_X64_MSR_SYNDBG_OPTIONS,
+ HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
+ HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
+ HV_X64_MSR_SYNDBG_PENDING_BUFFER,
+
+ MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
+ MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
+
+ MSR_IA32_TSC_ADJUST,
+ MSR_IA32_TSC_DEADLINE,
+ MSR_IA32_ARCH_CAPABILITIES,
+ MSR_IA32_PERF_CAPABILITIES,
+ MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MCG_STATUS,
+ MSR_IA32_MCG_CTL,
+ MSR_IA32_MCG_EXT_CTL,
+ MSR_IA32_SMBASE,
+ MSR_SMI_COUNT,
+ MSR_PLATFORM_INFO,
+ MSR_MISC_FEATURES_ENABLES,
+ MSR_AMD64_VIRT_SPEC_CTRL,
+ MSR_AMD64_TSC_RATIO,
+ MSR_IA32_POWER_CTL,
+ MSR_IA32_UCODE_REV,
+
+ /*
+ * KVM always supports the "true" VMX control MSRs, even if the host
+ * does not. The VMX MSRs as a whole are considered "emulated" as KVM
+ * doesn't strictly require them to exist in the host (ignoring that
+ * KVM would refuse to load in the first place if the core set of MSRs
+ * aren't supported).
+ */
+ MSR_IA32_VMX_BASIC,
+ MSR_IA32_VMX_TRUE_PINBASED_CTLS,
+ MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
+ MSR_IA32_VMX_TRUE_EXIT_CTLS,
+ MSR_IA32_VMX_TRUE_ENTRY_CTLS,
+ MSR_IA32_VMX_MISC,
+ MSR_IA32_VMX_CR0_FIXED0,
+ MSR_IA32_VMX_CR4_FIXED0,
+ MSR_IA32_VMX_VMCS_ENUM,
+ MSR_IA32_VMX_PROCBASED_CTLS2,
+ MSR_IA32_VMX_EPT_VPID_CAP,
+ MSR_IA32_VMX_VMFUNC,
+
+ MSR_K7_HWCR,
+ MSR_KVM_POLL_CONTROL,
+};
+
+static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
+static unsigned num_emulated_msrs;
+
+/*
+ * List of MSRs that control the existence of MSR-based features, i.e. MSRs
+ * that are effectively CPUID leafs. VMX MSRs are also included in the set of
+ * feature MSRs, but are handled separately to allow expedited lookups.
+ */
+static const u32 msr_based_features_all_except_vmx[] = {
+ MSR_AMD64_DE_CFG,
+ MSR_IA32_UCODE_REV,
+ MSR_IA32_ARCH_CAPABILITIES,
+ MSR_IA32_PERF_CAPABILITIES,
+};
+
+static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) +
+ (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
+static unsigned int num_msr_based_features;
+
+/*
+ * All feature MSRs except uCode revID, which tracks the currently loaded uCode
+ * patch, are immutable once the vCPU model is defined.
+ */
+static bool kvm_is_immutable_feature_msr(u32 msr)
+{
+ int i;
+
+ if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR)
+ return true;
+
+ for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) {
+ if (msr == msr_based_features_all_except_vmx[i])
+ return msr != MSR_IA32_UCODE_REV;
+ }
+
+ return false;
+}
+
+/*
+ * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
+ * does not yet virtualize. These include:
+ * 10 - MISC_PACKAGE_CTRLS
+ * 11 - ENERGY_FILTERING_CTL
+ * 12 - DOITM
+ * 18 - FB_CLEAR_CTRL
+ * 21 - XAPIC_DISABLE_STATUS
+ * 23 - OVERCLOCKING_STATUS
+ */
+
+#define KVM_SUPPORTED_ARCH_CAP \
+ (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \
+ ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
+ ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
+ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
+ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO)
+
+static u64 kvm_get_arch_capabilities(void)
+{
+ u64 data = host_arch_capabilities & KVM_SUPPORTED_ARCH_CAP;
+
+ /*
+ * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
+ * the nested hypervisor runs with NX huge pages. If it is not,
+ * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
+ * L1 guests, so it need not worry about its own (L2) guests.
+ */
+ data |= ARCH_CAP_PSCHANGE_MC_NO;
+
+ /*
+ * If we're doing cache flushes (either "always" or "cond")
+ * we will do one whenever the guest does a vmlaunch/vmresume.
+ * If an outer hypervisor is doing the cache flush for us
+ * (ARCH_CAP_SKIP_VMENTRY_L1DFLUSH), we can safely pass that
+ * capability to the guest too, and if EPT is disabled we're not
+ * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will
+ * require a nested hypervisor to do a flush of its own.
+ */
+ if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
+ data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
+
+ if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+ data |= ARCH_CAP_RDCL_NO;
+ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ data |= ARCH_CAP_SSB_NO;
+ if (!boot_cpu_has_bug(X86_BUG_MDS))
+ data |= ARCH_CAP_MDS_NO;
+
+ if (!boot_cpu_has(X86_FEATURE_RTM)) {
+ /*
+ * If RTM=0 because the kernel has disabled TSX, the host might
+ * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0
+ * and therefore knows that there cannot be TAA) but keep
+ * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts,
+ * and we want to allow migrating those guests to tsx=off hosts.
+ */
+ data &= ~ARCH_CAP_TAA_NO;
+ } else if (!boot_cpu_has_bug(X86_BUG_TAA)) {
+ data |= ARCH_CAP_TAA_NO;
+ } else {
+ /*
+ * Nothing to do here; we emulate TSX_CTRL if present on the
+ * host so the guest can choose between disabling TSX or
+ * using VERW to clear CPU buffers.
+ */
+ }
+
+ if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated())
+ data |= ARCH_CAP_GDS_NO;
+
+ return data;
+}
+
+static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
+{
+ switch (msr->index) {
+ case MSR_IA32_ARCH_CAPABILITIES:
+ msr->data = kvm_get_arch_capabilities();
+ break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ msr->data = kvm_caps.supported_perf_cap;
+ break;
+ case MSR_IA32_UCODE_REV:
+ rdmsrl_safe(msr->index, &msr->data);
+ break;
+ default:
+ return static_call(kvm_x86_get_msr_feature)(msr);
+ }
+ return 0;
+}
+
+static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+ struct kvm_msr_entry msr;
+ int r;
+
+ msr.index = index;
+ r = kvm_get_msr_feature(&msr);
+
+ if (r == KVM_MSR_RET_INVALID) {
+ /* Unconditionally clear the output for simplicity */
+ *data = 0;
+ if (kvm_msr_ignored_check(index, 0, false))
+ r = 0;
+ }
+
+ if (r)
+ return r;
+
+ *data = msr.data;
+
+ return 0;
+}
+
+static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ if (efer & EFER_AUTOIBRS && !guest_cpuid_has(vcpu, X86_FEATURE_AUTOIBRS))
+ return false;
+
+ if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
+ return false;
+
+ if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+ return false;
+
+ if (efer & (EFER_LME | EFER_LMA) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ return false;
+
+ if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX))
+ return false;
+
+ return true;
+}
+
+bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ if (efer & efer_reserved_bits)
+ return false;
+
+ return __kvm_valid_efer(vcpu, efer);
+}
+EXPORT_SYMBOL_GPL(kvm_valid_efer);
+
+static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ u64 old_efer = vcpu->arch.efer;
+ u64 efer = msr_info->data;
+ int r;
+
+ if (efer & efer_reserved_bits)
+ return 1;
+
+ if (!msr_info->host_initiated) {
+ if (!__kvm_valid_efer(vcpu, efer))
+ return 1;
+
+ if (is_paging(vcpu) &&
+ (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
+ return 1;
+ }
+
+ efer &= ~EFER_LMA;
+ efer |= vcpu->arch.efer & EFER_LMA;
+
+ r = static_call(kvm_x86_set_efer)(vcpu, efer);
+ if (r) {
+ WARN_ON(r > 0);
+ return r;
+ }
+
+ if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS)
+ kvm_mmu_reset_context(vcpu);
+
+ return 0;
+}
+
+void kvm_enable_efer_bits(u64 mask)
+{
+ efer_reserved_bits &= ~mask;
+}
+EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
+
+bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
+{
+ struct kvm_x86_msr_filter *msr_filter;
+ struct msr_bitmap_range *ranges;
+ struct kvm *kvm = vcpu->kvm;
+ bool allowed;
+ int idx;
+ u32 i;
+
+ /* x2APIC MSRs do not support filtering. */
+ if (index >= 0x800 && index <= 0x8ff)
+ return true;
+
+ idx = srcu_read_lock(&kvm->srcu);
+
+ msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
+ if (!msr_filter) {
+ allowed = true;
+ goto out;
+ }
+
+ allowed = msr_filter->default_allow;
+ ranges = msr_filter->ranges;
+
+ for (i = 0; i < msr_filter->count; i++) {
+ u32 start = ranges[i].base;
+ u32 end = start + ranges[i].nmsrs;
+ u32 flags = ranges[i].flags;
+ unsigned long *bitmap = ranges[i].bitmap;
+
+ if ((index >= start) && (index < end) && (flags & type)) {
+ allowed = test_bit(index - start, bitmap);
+ break;
+ }
+ }
+
+out:
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return allowed;
+}
+EXPORT_SYMBOL_GPL(kvm_msr_allowed);
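+
+/*
+ * Illustrative example: a userspace filter range of {base = 0xc0000080,
+ * nmsrs = 1, flags = KVM_MSR_FILTER_WRITE} whose bitmap has bit 0 clear
+ * denies guest WRMSR to MSR 0xc0000080 (EFER); such a write then takes
+ * the KVM_MSR_RET_FILTERED path in kvm_set_msr_with_filter() below.
+ */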
+
+/*
+ * Write @data into the MSR specified by @index. Select MSR specific fault
+ * checks are bypassed if @host_initiated is %true.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
+ bool host_initiated)
+{
+ struct msr_data msr;
+
+ switch (index) {
+ case MSR_FS_BASE:
+ case MSR_GS_BASE:
+ case MSR_KERNEL_GS_BASE:
+ case MSR_CSTAR:
+ case MSR_LSTAR:
+ if (is_noncanonical_address(data, vcpu))
+ return 1;
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ case MSR_IA32_SYSENTER_ESP:
+ /*
+ * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
+ * non-canonical address is written on Intel but not on
+ * AMD (which ignores the top 32-bits, because it does
+ * not implement 64-bit SYSENTER).
+ *
+ * 64-bit code should hence be able to write a non-canonical
+ * value on AMD. Making the address canonical ensures that
+ * vmentry does not fail on Intel after writing a non-canonical
+ * value, and that something deterministic happens if the guest
+ * invokes 64-bit SYSENTER.
+ */
+ data = __canonical_address(data, vcpu_virt_addr_bits(vcpu));
+ break;
+ case MSR_TSC_AUX:
+ if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
+ return 1;
+
+ if (!host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
+ return 1;
+
+ /*
+ * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
+ * incomplete and conflicting architectural behavior. Current
+ * AMD CPUs completely ignore bits 63:32, i.e. they aren't
+ * reserved and always read as zeros. Enforce Intel's reserved
+ * bits check if and only if the guest CPU is Intel, and clear
+ * the bits in all other cases. This ensures cross-vendor
+ * migration will provide consistent behavior for the guest.
+ */
+ if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0)
+ return 1;
+
+ data = (u32)data;
+ break;
+ }
+
+ msr.data = data;
+ msr.index = index;
+ msr.host_initiated = host_initiated;
+
+ return static_call(kvm_x86_set_msr)(vcpu, &msr);
+}
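+
+/*
+ * For example, with 48-bit virtual addresses a guest WRMSR of
+ * 0x0000800000000000 to MSR_LSTAR fails the canonical check above and the
+ * guest gets a #GP, whereas the same value written to IA32_SYSENTER_EIP is
+ * sign-extended to 0xffff800000000000 before being handed to vendor code.
+ */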
+
+static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
+ u32 index, u64 data, bool host_initiated)
+{
+ int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
+
+ if (ret == KVM_MSR_RET_INVALID)
+ if (kvm_msr_ignored_check(index, data, true))
+ ret = 0;
+
+ return ret;
+}
+
+/*
+ * Read the MSR specified by @index into @data. Select MSR specific fault
+ * checks are bypassed if @host_initiated is %true.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+ bool host_initiated)
+{
+ struct msr_data msr;
+ int ret;
+
+ switch (index) {
+ case MSR_TSC_AUX:
+ if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
+ return 1;
+
+ if (!host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
+ return 1;
+ break;
+ }
+
+ msr.index = index;
+ msr.host_initiated = host_initiated;
+
+ ret = static_call(kvm_x86_get_msr)(vcpu, &msr);
+ if (!ret)
+ *data = msr.data;
+ return ret;
+}
+
+static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
+ u32 index, u64 *data, bool host_initiated)
+{
+ int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
+
+ if (ret == KVM_MSR_RET_INVALID) {
+ /* Unconditionally clear *data for simplicity */
+ *data = 0;
+ if (kvm_msr_ignored_check(index, 0, false))
+ ret = 0;
+ }
+
+ return ret;
+}
+
+static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+{
+ if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
+ return KVM_MSR_RET_FILTERED;
+ return kvm_get_msr_ignored_check(vcpu, index, data, false);
+}
+
+static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data)
+{
+ if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
+ return KVM_MSR_RET_FILTERED;
+ return kvm_set_msr_ignored_check(vcpu, index, data, false);
+}
+
+int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+{
+ return kvm_get_msr_ignored_check(vcpu, index, data, false);
+}
+EXPORT_SYMBOL_GPL(kvm_get_msr);
+
+int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
+{
+ return kvm_set_msr_ignored_check(vcpu, index, data, false);
+}
+EXPORT_SYMBOL_GPL(kvm_set_msr);
+
+static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu->run->msr.error) {
+ kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
+ kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
+ }
+}
+
+static int complete_emulated_msr_access(struct kvm_vcpu *vcpu)
+{
+ return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error);
+}
+
+static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
+{
+ complete_userspace_rdmsr(vcpu);
+ return complete_emulated_msr_access(vcpu);
+}
+
+static int complete_fast_msr_access(struct kvm_vcpu *vcpu)
+{
+ return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error);
+}
+
+static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
+{
+ complete_userspace_rdmsr(vcpu);
+ return complete_fast_msr_access(vcpu);
+}
+
+static u64 kvm_msr_reason(int r)
+{
+ switch (r) {
+ case KVM_MSR_RET_INVALID:
+ return KVM_MSR_EXIT_REASON_UNKNOWN;
+ case KVM_MSR_RET_FILTERED:
+ return KVM_MSR_EXIT_REASON_FILTER;
+ default:
+ return KVM_MSR_EXIT_REASON_INVAL;
+ }
+}
+
+static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
+ u32 exit_reason, u64 data,
+ int (*completion)(struct kvm_vcpu *vcpu),
+ int r)
+{
+ u64 msr_reason = kvm_msr_reason(r);
+
+ /* Check if the user wanted to know about this MSR fault */
+ if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
+ return 0;
+
+ vcpu->run->exit_reason = exit_reason;
+ vcpu->run->msr.error = 0;
+ memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
+ vcpu->run->msr.reason = msr_reason;
+ vcpu->run->msr.index = index;
+ vcpu->run->msr.data = data;
+ vcpu->arch.complete_userspace_io = completion;
+
+ return 1;
+}
+
+int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
+{
+ u32 ecx = kvm_rcx_read(vcpu);
+ u64 data;
+ int r;
+
+ r = kvm_get_msr_with_filter(vcpu, ecx, &data);
+
+ if (!r) {
+ trace_kvm_msr_read(ecx, data);
+
+ kvm_rax_write(vcpu, data & -1u);
+ kvm_rdx_write(vcpu, (data >> 32) & -1u);
+ } else {
+ /* MSR read failed? See if we should ask user space */
+ if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_RDMSR, 0,
+ complete_fast_rdmsr, r))
+ return 0;
+ trace_kvm_msr_read_ex(ecx);
+ }
+
+ return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
+
+int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
+{
+ u32 ecx = kvm_rcx_read(vcpu);
+ u64 data = kvm_read_edx_eax(vcpu);
+ int r;
+
+ r = kvm_set_msr_with_filter(vcpu, ecx, data);
+
+ if (!r) {
+ trace_kvm_msr_write(ecx, data);
+ } else {
+ /* MSR write failed? See if we should ask user space */
+ if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_WRMSR, data,
+ complete_fast_msr_access, r))
+ return 0;
+ /* Signal all other negative errors to userspace */
+ if (r < 0)
+ return r;
+ trace_kvm_msr_write_ex(ecx, data);
+ }
+
+ return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
+
+int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
+{
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+int kvm_emulate_invd(struct kvm_vcpu *vcpu)
+{
+ /* Treat an INVD instruction as a NOP and just skip it. */
+ return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_invd);
+
+int kvm_handle_invalid_op(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(kvm_handle_invalid_op);
+
+static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn)
+{
+ if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT))
+ return kvm_handle_invalid_op(vcpu);
+
+ pr_warn_once("%s instruction emulated as NOP!\n", insn);
+ return kvm_emulate_as_nop(vcpu);
+}
+
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_monitor_mwait(vcpu, "MWAIT");
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_mwait);
+
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_monitor_mwait(vcpu, "MONITOR");
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_monitor);
+
+static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
+{
+ xfer_to_guest_mode_prepare();
+ return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
+ xfer_to_guest_mode_work_pending();
+}
+
+/*
+ * The fast path for frequent and performance-sensitive wrmsr emulation,
+ * i.e. the sending of an IPI.  Sending an IPI early in the VM-Exit flow
+ * reduces the latency of virtual IPIs by avoiding the expensive bits of
+ * transitioning from guest to host, e.g. reacquiring KVM's SRCU lock.  This
+ * is in contrast to the other cases, which must be handled after interrupts
+ * are enabled on the host.
+ */
+static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
+{
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic))
+ return 1;
+
+ if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
+ ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
+ ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
+ ((u32)(data >> 32) != X2APIC_BROADCAST))
+ return kvm_x2apic_icr_write(vcpu->arch.apic, data);
+
+ return 1;
+}
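+
+/*
+ * As an example (not from the source), an ICR value of 0x000000fe00000030
+ * takes the fastpath above: fixed delivery mode, physical destination with
+ * no shorthand, destination APIC ID 0xfe (not broadcast), vector 0x30.
+ */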
+
+static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
+{
+ if (!kvm_can_use_hv_timer(vcpu))
+ return 1;
+
+ kvm_set_lapic_tscdeadline_msr(vcpu, data);
+ return 0;
+}
+
+fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
+{
+ u32 msr = kvm_rcx_read(vcpu);
+ u64 data;
+ fastpath_t ret = EXIT_FASTPATH_NONE;
+
+ kvm_vcpu_srcu_read_lock(vcpu);
+
+ switch (msr) {
+ case APIC_BASE_MSR + (APIC_ICR >> 4):
+ data = kvm_read_edx_eax(vcpu);
+ if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
+ kvm_skip_emulated_instruction(vcpu);
+ ret = EXIT_FASTPATH_EXIT_HANDLED;
+ }
+ break;
+ case MSR_IA32_TSC_DEADLINE:
+ data = kvm_read_edx_eax(vcpu);
+ if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
+ kvm_skip_emulated_instruction(vcpu);
+ ret = EXIT_FASTPATH_REENTER_GUEST;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (ret != EXIT_FASTPATH_NONE)
+ trace_kvm_msr_write(msr, data);
+
+ kvm_vcpu_srcu_read_unlock(vcpu);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
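+
+/*
+ * Roughly how the vendor modules are expected to use the fastpath above
+ * (a sketch, not the actual VMX/SVM code): with IRQs still disabled after
+ * VM-Exit, a WRMSR exit first tries the fastpath and only falls back to
+ * full exit handling on EXIT_FASTPATH_NONE:
+ *
+ *	if (exit_reason == EXIT_REASON_MSR_WRITE)
+ *		return handle_fastpath_set_msr_irqoff(vcpu);
+ *	return EXIT_FASTPATH_NONE;
+ */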
+
+/*
+ * Adapt set_msr() to msr_io()'s calling convention
+ */
+static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+ return kvm_get_msr_ignored_check(vcpu, index, data, true);
+}
+
+static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+ u64 val;
+
+ /*
+ * Disallow writes to immutable feature MSRs after KVM_RUN. KVM does
+ * not support modifying the guest vCPU model on the fly, e.g. changing
+ * the nVMX capabilities while L2 is running is nonsensical. Ignore
+ * writes of the same value, e.g. to allow userspace to blindly stuff
+ * all MSRs when emulating RESET.
+ */
+ if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index)) {
+ if (do_get_msr(vcpu, index, &val) || *data != val)
+ return -EINVAL;
+
+ return 0;
+ }
+
+ return kvm_set_msr_ignored_check(vcpu, index, *data, true);
+}
+
+#ifdef CONFIG_X86_64
+struct pvclock_clock {
+ int vclock_mode;
+ u64 cycle_last;
+ u64 mask;
+ u32 mult;
+ u32 shift;
+ u64 base_cycles;
+ u64 offset;
+};
+
+struct pvclock_gtod_data {
+ seqcount_t seq;
+
+ struct pvclock_clock clock; /* extract of a clocksource struct */
+ struct pvclock_clock raw_clock; /* extract of a clocksource struct */
+
+ ktime_t offs_boot;
+ u64 wall_time_sec;
+};
+
+static struct pvclock_gtod_data pvclock_gtod_data;
+
+static void update_pvclock_gtod(struct timekeeper *tk)
+{
+ struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
+
+ write_seqcount_begin(&vdata->seq);
+
+ /* copy pvclock gtod data */
+ vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode;
+ vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
+ vdata->clock.mask = tk->tkr_mono.mask;
+ vdata->clock.mult = tk->tkr_mono.mult;
+ vdata->clock.shift = tk->tkr_mono.shift;
+ vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec;
+ vdata->clock.offset = tk->tkr_mono.base;
+
+ vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode;
+ vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last;
+ vdata->raw_clock.mask = tk->tkr_raw.mask;
+ vdata->raw_clock.mult = tk->tkr_raw.mult;
+ vdata->raw_clock.shift = tk->tkr_raw.shift;
+ vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec;
+ vdata->raw_clock.offset = tk->tkr_raw.base;
+
+ vdata->wall_time_sec = tk->xtime_sec;
+
+ vdata->offs_boot = tk->offs_boot;
+
+ write_seqcount_end(&vdata->seq);
+}
+
+static s64 get_kvmclock_base_ns(void)
+{
+ /* Count up from boot time, but with the frequency of the raw clock. */
+ return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
+}
+#else
+static s64 get_kvmclock_base_ns(void)
+{
+ /* Master clock not used, so we can just use CLOCK_BOOTTIME. */
+ return ktime_get_boottime_ns();
+}
+#endif
+
+static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
+{
+ int version;
+ int r;
+ struct pvclock_wall_clock wc;
+ u32 wc_sec_hi;
+ u64 wall_nsec;
+
+ if (!wall_clock)
+ return;
+
+ r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
+ if (r)
+ return;
+
+ if (version & 1)
+ ++version; /* first time write, random junk */
+
+ ++version;
+
+ if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
+ return;
+
+ /*
+ * The guest calculates current wall clock time by adding
+ * system time (updated by kvm_guest_time_update below) to the
+ * wall clock specified here. We do the reverse here.
+ */
+ wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+
+ wc.nsec = do_div(wall_nsec, 1000000000);
+ wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
+ wc.version = version;
+
+ kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
+
+ if (sec_hi_ofs) {
+ wc_sec_hi = wall_nsec >> 32;
+ kvm_write_guest(kvm, wall_clock + sec_hi_ofs,
+ &wc_sec_hi, sizeof(wc_sec_hi));
+ }
+
+ version++;
+ kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+}
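+
+/*
+ * Guest-side view, for reference (a sketch of the pvclock ABI, not code
+ * from this file): current wall time is reconstructed as the boot wall
+ * clock published above plus the elapsed kvmclock system time:
+ *
+ *	now_ns = wc.sec * NSEC_PER_SEC + wc.nsec + system_time_ns;
+ *
+ * which is why the write above subtracts get_kvmclock_ns() from the
+ * current real time.
+ */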
+
+static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
+ bool old_msr, bool host_initiated)
+{
+ struct kvm_arch *ka = &vcpu->kvm->arch;
+
+ if (vcpu->vcpu_id == 0 && !host_initiated) {
+ if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+
+ ka->boot_vcpu_runs_old_kvmclock = old_msr;
+ }
+
+ vcpu->arch.time = system_time;
+ kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
+
+	/* Check whether the enable bit is set... */
+ if (system_time & 1)
+ kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL,
+ sizeof(struct pvclock_vcpu_time_info));
+ else
+ kvm_gpc_deactivate(&vcpu->arch.pv_time);
+}
+
+static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
+{
+ do_shl32_div32(dividend, divisor);
+ return dividend;
+}
+
+static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
+ s8 *pshift, u32 *pmultiplier)
+{
+ uint64_t scaled64;
+ int32_t shift = 0;
+ uint64_t tps64;
+ uint32_t tps32;
+
+ tps64 = base_hz;
+ scaled64 = scaled_hz;
+ while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
+ tps64 >>= 1;
+ shift--;
+ }
+
+ tps32 = (uint32_t)tps64;
+ while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
+ if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
+ scaled64 >>= 1;
+ else
+ tps32 <<= 1;
+ shift++;
+ }
+
+ *pshift = shift;
+ *pmultiplier = div_frac(scaled64, tps32);
+}
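+
+/*
+ * A worked example (numbers are illustrative): converting a 2.5 GHz base
+ * rate to a 1 GHz scaled rate, i.e. kvm_get_time_scale(1000000000,
+ * 2500000000), halves tps64 once (shift = -1, tps32 = 1250000000) and then
+ * computes mult = 10^9 * 2^32 / (1.25 * 10^9) = 0.8 * 2^32.  A consumer
+ * applying (cycles >> 1) * 0.8 thus gets 0.4 units per cycle, matching the
+ * exact 1/2.5 ratio.
+ */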
+
+#ifdef CONFIG_X86_64
+static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
+#endif
+
+static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
+static unsigned long max_tsc_khz;
+
+static u32 adjust_tsc_khz(u32 khz, s32 ppm)
+{
+ u64 v = (u64)khz * (1000000 + ppm);
+ do_div(v, 1000000);
+ return v;
+}
+
+static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier);
+
+static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
+{
+ u64 ratio;
+
+ /* Guest TSC same frequency as host TSC? */
+ if (!scale) {
+ kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio);
+ return 0;
+ }
+
+ /* TSC scaling supported? */
+ if (!kvm_caps.has_tsc_control) {
+ if (user_tsc_khz > tsc_khz) {
+ vcpu->arch.tsc_catchup = 1;
+ vcpu->arch.tsc_always_catchup = 1;
+ return 0;
+ } else {
+ pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
+ return -1;
+ }
+ }
+
+ /* TSC scaling required - calculate ratio */
+ ratio = mul_u64_u32_div(1ULL << kvm_caps.tsc_scaling_ratio_frac_bits,
+ user_tsc_khz, tsc_khz);
+
+ if (ratio == 0 || ratio >= kvm_caps.max_tsc_scaling_ratio) {
+ pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
+ user_tsc_khz);
+ return -1;
+ }
+
+ kvm_vcpu_write_tsc_multiplier(vcpu, ratio);
+ return 0;
+}
+
+static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+{
+ u32 thresh_lo, thresh_hi;
+ int use_scaling = 0;
+
+ /* tsc_khz can be zero if TSC calibration fails */
+ if (user_tsc_khz == 0) {
+ /* set tsc_scaling_ratio to a safe value */
+ kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio);
+ return -1;
+ }
+
+ /* Compute a scale to convert nanoseconds in TSC cycles */
+ kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
+ &vcpu->arch.virtual_tsc_shift,
+ &vcpu->arch.virtual_tsc_mult);
+ vcpu->arch.virtual_tsc_khz = user_tsc_khz;
+
+	/*
+	 * Compute the variation in TSC rate that is acceptable within the
+	 * tolerance range, and decide whether the requested rate falls
+	 * within those bounds of the hardware rate.  If so, no scaling or
+	 * compensation needs to be done.
+	 */
+ thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
+ thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
+ if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
+ pr_debug("requested TSC rate %u falls outside tolerance [%u,%u]\n",
+ user_tsc_khz, thresh_lo, thresh_hi);
+ use_scaling = 1;
+ }
+ return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
+}
+
+static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
+{
+	u64 tsc = pvclock_scale_delta(kernel_ns - vcpu->arch.this_tsc_nsec,
+ vcpu->arch.virtual_tsc_mult,
+ vcpu->arch.virtual_tsc_shift);
+ tsc += vcpu->arch.this_tsc_write;
+ return tsc;
+}
+
+#ifdef CONFIG_X86_64
+static inline int gtod_is_based_on_tsc(int mode)
+{
+ return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
+}
+#endif
+
+static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ bool vcpus_matched;
+ struct kvm_arch *ka = &vcpu->kvm->arch;
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+ vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
+ atomic_read(&vcpu->kvm->online_vcpus));
+
+ /*
+ * Once the masterclock is enabled, always perform request in
+ * order to update it.
+ *
+ * In order to enable masterclock, the host clocksource must be TSC
+ * and the vcpus need to have matched TSCs. When that happens,
+ * perform request to enable masterclock.
+ */
+ if (ka->use_master_clock ||
+ (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+
+ trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
+ atomic_read(&vcpu->kvm->online_vcpus),
+ ka->use_master_clock, gtod->clock.vclock_mode);
+#endif
+}
+
+/*
+ * Multiply tsc by a fixed point number represented by ratio.
+ *
+ * The most significant 64-N bits (mult) of ratio represent the
+ * integral part of the fixed point number; the remaining N bits
+ * (frac) represent the fractional part, ie. ratio represents a fixed
+ * point number (mult + frac * 2^(-N)).
+ *
+ * N equals to kvm_caps.tsc_scaling_ratio_frac_bits.
+ */
+static inline u64 __scale_tsc(u64 ratio, u64 tsc)
+{
+ return mul_u64_u64_shr(tsc, ratio, kvm_caps.tsc_scaling_ratio_frac_bits);
+}
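+
+/*
+ * Example, assuming 48 fractional bits as on VMX: a guest running at 1.5x
+ * the host rate uses ratio = 3ULL << 47 (i.e. 1.5 * 2^48), so
+ * __scale_tsc(ratio, tsc) == mul_u64_u64_shr(tsc, 3ULL << 47, 48), which
+ * yields tsc + tsc/2 without losing the fractional contribution of the
+ * low bits.
+ */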
+
+u64 kvm_scale_tsc(u64 tsc, u64 ratio)
+{
+ u64 _tsc = tsc;
+
+ if (ratio != kvm_caps.default_tsc_scaling_ratio)
+ _tsc = __scale_tsc(ratio, tsc);
+
+ return _tsc;
+}
+
+static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
+{
+ u64 tsc;
+
+ tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
+
+ return target_tsc - tsc;
+}
+
+u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
+{
+ return vcpu->arch.l1_tsc_offset +
+ kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
+}
+EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
+
+u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
+{
+ u64 nested_offset;
+
+ if (l2_multiplier == kvm_caps.default_tsc_scaling_ratio)
+ nested_offset = l1_offset;
+ else
+ nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier,
+ kvm_caps.tsc_scaling_ratio_frac_bits);
+
+ nested_offset += l2_offset;
+ return nested_offset;
+}
+EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset);
+
+u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
+{
+ if (l2_multiplier != kvm_caps.default_tsc_scaling_ratio)
+ return mul_u64_u64_shr(l1_multiplier, l2_multiplier,
+ kvm_caps.tsc_scaling_ratio_frac_bits);
+
+ return l1_multiplier;
+}
+EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier);
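+
+/*
+ * Derivation behind the two nested helpers above (N fractional bits, host
+ * TSC t, L1 offset/multiplier o1/r1, L2 offset/multiplier o2/r2):
+ *
+ *	L2 TSC = (((t * r1 >> N) + o1) * r2 >> N) + o2
+ *	       = (t * ((r1 * r2) >> N) >> N) + (((o1 * r2) >> N) + o2)
+ *
+ * i.e. the merged multiplier is kvm_calc_nested_tsc_multiplier() and the
+ * merged offset is kvm_calc_nested_tsc_offset().
+ */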
+
+static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
+{
+ trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+ vcpu->arch.l1_tsc_offset,
+ l1_offset);
+
+ vcpu->arch.l1_tsc_offset = l1_offset;
+
+ /*
+ * If we are here because L1 chose not to trap WRMSR to TSC then
+ * according to the spec this should set L1's TSC (as opposed to
+ * setting L1's offset for L2).
+ */
+ if (is_guest_mode(vcpu))
+ vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
+ l1_offset,
+ static_call(kvm_x86_get_l2_tsc_offset)(vcpu),
+ static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
+ else
+ vcpu->arch.tsc_offset = l1_offset;
+
+ static_call(kvm_x86_write_tsc_offset)(vcpu);
+}
+
+static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
+{
+ vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
+
+ /* Userspace is changing the multiplier while L2 is active */
+ if (is_guest_mode(vcpu))
+ vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
+ l1_multiplier,
+ static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
+ else
+ vcpu->arch.tsc_scaling_ratio = l1_multiplier;
+
+ if (kvm_caps.has_tsc_control)
+ static_call(kvm_x86_write_tsc_multiplier)(vcpu);
+}
+
+static inline bool kvm_check_tsc_unstable(void)
+{
+#ifdef CONFIG_X86_64
+	/*
+	 * TSC is marked unstable when we're running on Hyper-V, but the
+	 * 'TSC page' clocksource is still good.
+	 */
+ if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK)
+ return false;
+#endif
+ return check_tsc_unstable();
+}
+
+/*
+ * Infers attempts to synchronize the guest's tsc from host writes. Sets the
+ * offset for the vcpu and tracks the TSC matching generation that the vcpu
+ * participates in.
+ */
+static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
+ u64 ns, bool matched)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ lockdep_assert_held(&kvm->arch.tsc_write_lock);
+
+	/*
+	 * We also track the most recent recorded kHz, write and time to
+	 * allow the matching interval to be extended at each write.
+	 */
+ kvm->arch.last_tsc_nsec = ns;
+ kvm->arch.last_tsc_write = tsc;
+ kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
+ kvm->arch.last_tsc_offset = offset;
+
+ vcpu->arch.last_guest_tsc = tsc;
+
+ kvm_vcpu_write_tsc_offset(vcpu, offset);
+
+ if (!matched) {
+ /*
+ * We split periods of matched TSC writes into generations.
+ * For each generation, we track the original measured
+ * nanosecond time, offset, and write, so if TSCs are in
+ * sync, we can match exact offset, and if not, we can match
+ * exact software computation in compute_guest_tsc()
+ *
+ * These values are tracked in kvm->arch.cur_xxx variables.
+ */
+ kvm->arch.cur_tsc_generation++;
+ kvm->arch.cur_tsc_nsec = ns;
+ kvm->arch.cur_tsc_write = tsc;
+ kvm->arch.cur_tsc_offset = offset;
+ kvm->arch.nr_vcpus_matched_tsc = 0;
+ } else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) {
+ kvm->arch.nr_vcpus_matched_tsc++;
+ }
+
+ /* Keep track of which generation this VCPU has synchronized to */
+ vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
+ vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
+ vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
+
+ kvm_track_tsc_matching(vcpu);
+}
+
+static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u64 offset, ns, elapsed;
+ unsigned long flags;
+ bool matched = false;
+ bool synchronizing = false;
+
+ raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
+ offset = kvm_compute_l1_tsc_offset(vcpu, data);
+ ns = get_kvmclock_base_ns();
+ elapsed = ns - kvm->arch.last_tsc_nsec;
+
+ if (vcpu->arch.virtual_tsc_khz) {
+ if (data == 0) {
+ /*
+ * detection of vcpu initialization -- need to sync
+ * with other vCPUs. This particularly helps to keep
+ * kvm_clock stable after CPU hotplug
+ */
+ synchronizing = true;
+ } else {
+ u64 tsc_exp = kvm->arch.last_tsc_write +
+ nsec_to_cycles(vcpu, elapsed);
+ u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
+ /*
+ * Special case: TSC write with a small delta (1 second)
+ * of virtual cycle time against real time is
+ * interpreted as an attempt to synchronize the CPU.
+ */
+ synchronizing = data < tsc_exp + tsc_hz &&
+ data + tsc_hz > tsc_exp;
+ }
+ }
+
+ /*
+ * For a reliable TSC, we can match TSC offsets, and for an unstable
+ * TSC, we add elapsed time in this computation. We could let the
+ * compensation code attempt to catch up if we fall behind, but
+ * it's better to try to match offsets from the beginning.
+ */
+ if (synchronizing &&
+ vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
+ if (!kvm_check_tsc_unstable()) {
+ offset = kvm->arch.cur_tsc_offset;
+ } else {
+ u64 delta = nsec_to_cycles(vcpu, elapsed);
+ data += delta;
+ offset = kvm_compute_l1_tsc_offset(vcpu, data);
+ }
+ matched = true;
+ }
+
+ __kvm_synchronize_tsc(vcpu, offset, data, ns, matched);
+ raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+}
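+
+/*
+ * To make the 1-second window above concrete (illustrative numbers): for
+ * a 2 GHz vCPU that last saw a TSC write 10 ms ago, tsc_exp is the last
+ * written value plus 20,000,000 cycles, and any new write within
+ * +/- 2,000,000,000 cycles (one virtual second) of tsc_exp is treated as
+ * an attempt to synchronize rather than as an independent clock.
+ */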
+
+static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
+ s64 adjustment)
+{
+ u64 tsc_offset = vcpu->arch.l1_tsc_offset;
+ kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
+}
+
+static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
+{
+ if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio)
+ WARN_ON(adjustment < 0);
+ adjustment = kvm_scale_tsc((u64) adjustment,
+ vcpu->arch.l1_tsc_scaling_ratio);
+ adjust_tsc_offset_guest(vcpu, adjustment);
+}
+
+#ifdef CONFIG_X86_64
+
+static u64 read_tsc(void)
+{
+ u64 ret = (u64)rdtsc_ordered();
+ u64 last = pvclock_gtod_data.clock.cycle_last;
+
+ if (likely(ret >= last))
+ return ret;
+
+ /*
+ * GCC likes to generate cmov here, but this branch is extremely
+ * predictable (it's just a function of time and the likely is
+ * very likely) and there's a data dependence, so force GCC
+ * to generate a branch instead. I don't barrier() because
+ * we don't actually need a barrier, and if this function
+ * ever gets inlined it will generate worse code.
+ */
+ asm volatile ("");
+ return last;
+}
+
+static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
+ int *mode)
+{
+ u64 tsc_pg_val;
+ long v;
+
+ switch (clock->vclock_mode) {
+ case VDSO_CLOCKMODE_HVCLOCK:
+ if (hv_read_tsc_page_tsc(hv_get_tsc_page(),
+ tsc_timestamp, &tsc_pg_val)) {
+ /* TSC page valid */
+ *mode = VDSO_CLOCKMODE_HVCLOCK;
+ v = (tsc_pg_val - clock->cycle_last) &
+ clock->mask;
+ } else {
+ /* TSC page invalid */
+ *mode = VDSO_CLOCKMODE_NONE;
+ }
+ break;
+ case VDSO_CLOCKMODE_TSC:
+ *mode = VDSO_CLOCKMODE_TSC;
+ *tsc_timestamp = read_tsc();
+ v = (*tsc_timestamp - clock->cycle_last) &
+ clock->mask;
+ break;
+ default:
+ *mode = VDSO_CLOCKMODE_NONE;
+ }
+
+ if (*mode == VDSO_CLOCKMODE_NONE)
+ *tsc_timestamp = v = 0;
+
+ return v * clock->mult;
+}
+
+static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
+{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ unsigned long seq;
+ int mode;
+ u64 ns;
+
+ do {
+ seq = read_seqcount_begin(&gtod->seq);
+ ns = gtod->raw_clock.base_cycles;
+ ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
+ ns >>= gtod->raw_clock.shift;
+ ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
+ } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+ *t = ns;
+
+ return mode;
+}
+
+static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
+{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ unsigned long seq;
+ int mode;
+ u64 ns;
+
+ do {
+ seq = read_seqcount_begin(&gtod->seq);
+ ts->tv_sec = gtod->wall_time_sec;
+ ns = gtod->clock.base_cycles;
+ ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
+ ns >>= gtod->clock.shift;
+ } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+
+ ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+ ts->tv_nsec = ns;
+
+ return mode;
+}
+
+/* returns true if host is using TSC based clocksource */
+static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
+{
+ /* checked again under seqlock below */
+ if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
+ return false;
+
+ return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
+ tsc_timestamp));
+}
+
+/* returns true if host is using TSC based clocksource */
+static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
+ u64 *tsc_timestamp)
+{
+ /* checked again under seqlock below */
+ if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
+ return false;
+
+ return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
+}
+#endif
+
+/*
+ * Assuming a stable TSC across physical CPUs, and a stable TSC
+ * across virtual CPUs, the following condition is possible.
+ * Each numbered line represents an event visible to both
+ * CPUs at the next numbered event.
+ *
+ * "timespecX" represents host monotonic time. "tscX" represents
+ * RDTSC value.
+ *
+ * VCPU0 on CPU0 | VCPU1 on CPU1
+ *
+ * 1. read timespec0,tsc0
+ * 2. | timespec1 = timespec0 + N
+ * | tsc1 = tsc0 + M
+ * 3. transition to guest | transition to guest
+ * 4. ret0 = timespec0 + (rdtsc - tsc0) |
+ * 5. | ret1 = timespec1 + (rdtsc - tsc1)
+ * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
+ *
+ * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
+ *
+ * - ret0 < ret1
+ * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
+ * ...
+ * - 0 < N - M => M < N
+ *
+ * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
+ * always the case (the difference between two distinct xtime instances
+ * might be smaller than the difference between corresponding TSC reads,
+ * when updating guest vcpus pvclock areas).
+ *
+ * To avoid that problem, do not allow visibility of distinct
+ * system_timestamp/tsc_timestamp values simultaneously: use a master
+ * copy of host monotonic time values. Update that master copy
+ * in lockstep.
+ *
+ * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
+ *
+ */
+
+static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+ struct kvm_arch *ka = &kvm->arch;
+ int vclock_mode;
+ bool host_tsc_clocksource, vcpus_matched;
+
+ lockdep_assert_held(&kvm->arch.tsc_write_lock);
+ vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
+ atomic_read(&kvm->online_vcpus));
+
+ /*
+ * If the host uses TSC clock, then passthrough TSC as stable
+ * to the guest.
+ */
+ host_tsc_clocksource = kvm_get_time_and_clockread(
+ &ka->master_kernel_ns,
+ &ka->master_cycle_now);
+
+ ka->use_master_clock = host_tsc_clocksource && vcpus_matched
+ && !ka->backwards_tsc_observed
+ && !ka->boot_vcpu_runs_old_kvmclock;
+
+ if (ka->use_master_clock)
+ atomic_set(&kvm_guest_has_master_clock, 1);
+
+ vclock_mode = pvclock_gtod_data.clock.vclock_mode;
+ trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
+ vcpus_matched);
+#endif
+}
+
+static void kvm_make_mclock_inprogress_request(struct kvm *kvm)
+{
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
+}
+
+static void __kvm_start_pvclock_update(struct kvm *kvm)
+{
+ raw_spin_lock_irq(&kvm->arch.tsc_write_lock);
+ write_seqcount_begin(&kvm->arch.pvclock_sc);
+}
+
+static void kvm_start_pvclock_update(struct kvm *kvm)
+{
+ kvm_make_mclock_inprogress_request(kvm);
+
+ /* no guest entries from this point */
+ __kvm_start_pvclock_update(kvm);
+}
+
+static void kvm_end_pvclock_update(struct kvm *kvm)
+{
+ struct kvm_arch *ka = &kvm->arch;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ write_seqcount_end(&ka->pvclock_sc);
+ raw_spin_unlock_irq(&ka->tsc_write_lock);
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+
+ /* guest entries allowed */
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
+}
+
+static void kvm_update_masterclock(struct kvm *kvm)
+{
+ kvm_hv_request_tsc_page_update(kvm);
+ kvm_start_pvclock_update(kvm);
+ pvclock_update_vm_gtod_copy(kvm);
+ kvm_end_pvclock_update(kvm);
+}
+
+/*
+ * Use the kernel's tsc_khz directly if the TSC is constant, otherwise use KVM's
+ * per-CPU value (which may be zero if a CPU is going offline). Note, tsc_khz
+ * can change during boot even if the TSC is constant, as it's possible for KVM
+ * to be loaded before TSC calibration completes. Ideally, KVM would get a
+ * notification when calibration completes, but practically speaking calibration
+ * will complete before userspace is alive enough to create VMs.
+ */
+static unsigned long get_cpu_tsc_khz(void)
+{
+ if (static_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ return tsc_khz;
+ else
+ return __this_cpu_read(cpu_tsc_khz);
+}
+
+/* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */
+static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
+{
+ struct kvm_arch *ka = &kvm->arch;
+ struct pvclock_vcpu_time_info hv_clock;
+
+	/* both __this_cpu_read() and rdtsc() should be on the same CPU */
+ get_cpu();
+
+ data->flags = 0;
+ if (ka->use_master_clock &&
+ (static_cpu_has(X86_FEATURE_CONSTANT_TSC) || __this_cpu_read(cpu_tsc_khz))) {
+#ifdef CONFIG_X86_64
+ struct timespec64 ts;
+
+ if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
+ data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
+ data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
+ } else
+#endif
+ data->host_tsc = rdtsc();
+
+ data->flags |= KVM_CLOCK_TSC_STABLE;
+ hv_clock.tsc_timestamp = ka->master_cycle_now;
+ hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
+ kvm_get_time_scale(NSEC_PER_SEC, get_cpu_tsc_khz() * 1000LL,
+ &hv_clock.tsc_shift,
+ &hv_clock.tsc_to_system_mul);
+ data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
+ } else {
+ data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
+ }
+
+ put_cpu();
+}
+
+static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
+{
+ struct kvm_arch *ka = &kvm->arch;
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&ka->pvclock_sc);
+ __get_kvmclock(kvm, data);
+ } while (read_seqcount_retry(&ka->pvclock_sc, seq));
+}
+
+u64 get_kvmclock_ns(struct kvm *kvm)
+{
+ struct kvm_clock_data data;
+
+ get_kvmclock(kvm, &data);
+ return data.clock;
+}
+
+static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
+ struct gfn_to_pfn_cache *gpc,
+ unsigned int offset)
+{
+ struct kvm_vcpu_arch *vcpu = &v->arch;
+ struct pvclock_vcpu_time_info *guest_hv_clock;
+ unsigned long flags;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) {
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock)))
+ return;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ }
+
+ guest_hv_clock = (void *)(gpc->khva + offset);
+
+ /*
+ * This VCPU is paused, but it's legal for a guest to read another
+ * VCPU's kvmclock, so we really have to follow the specification where
+ * it says that version is odd if data is being modified, and even after
+ * it is consistent.
+ */
+
+ guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1;
+ smp_wmb();
+
+ /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+ vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
+
+ if (vcpu->pvclock_set_guest_stopped_request) {
+ vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
+ vcpu->pvclock_set_guest_stopped_request = false;
+ }
+
+ memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock));
+ smp_wmb();
+
+ guest_hv_clock->version = ++vcpu->hv_clock.version;
+
+ mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+}
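+
+/*
+ * The matching guest-side read, sketched for reference (this is the
+ * standard pvclock seqlock pattern, not code from this file):
+ *
+ *	do {
+ *		version = pvti->version;
+ *		smp_rmb();
+ *		<read tsc_timestamp, system_time, mul, shift, flags>
+ *		smp_rmb();
+ *	} while ((version & 1) || version != pvti->version);
+ *
+ * An odd version, or one that changed across the reads, means the host
+ * was mid-update and the guest must retry.
+ */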
+
+static int kvm_guest_time_update(struct kvm_vcpu *v)
+{
+ unsigned long flags, tgt_tsc_khz;
+ unsigned seq;
+ struct kvm_vcpu_arch *vcpu = &v->arch;
+ struct kvm_arch *ka = &v->kvm->arch;
+ s64 kernel_ns;
+ u64 tsc_timestamp, host_tsc;
+ u8 pvclock_flags;
+ bool use_master_clock;
+
+ kernel_ns = 0;
+ host_tsc = 0;
+
+ /*
+ * If the host uses TSC clock, then passthrough TSC as stable
+ * to the guest.
+ */
+ do {
+ seq = read_seqcount_begin(&ka->pvclock_sc);
+ use_master_clock = ka->use_master_clock;
+ if (use_master_clock) {
+ host_tsc = ka->master_cycle_now;
+ kernel_ns = ka->master_kernel_ns;
+ }
+ } while (read_seqcount_retry(&ka->pvclock_sc, seq));
+
+ /* Keep irq disabled to prevent changes to the clock */
+ local_irq_save(flags);
+ tgt_tsc_khz = get_cpu_tsc_khz();
+ if (unlikely(tgt_tsc_khz == 0)) {
+ local_irq_restore(flags);
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
+ return 1;
+ }
+ if (!use_master_clock) {
+ host_tsc = rdtsc();
+ kernel_ns = get_kvmclock_base_ns();
+ }
+
+ tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
+
+ /*
+ * We may have to catch up the TSC to match elapsed wall clock
+ * time for two reasons, even if kvmclock is used.
+ * 1) CPU could have been running below the maximum TSC rate
+ * 2) Broken TSC compensation resets the base at each VCPU
+ * entry to avoid unknown leaps of TSC even when running
+ * again on the same CPU. This may cause apparent elapsed
+ * time to disappear, and the guest to stand still or run
+ * very slowly.
+ */
+ if (vcpu->tsc_catchup) {
+ u64 tsc = compute_guest_tsc(v, kernel_ns);
+ if (tsc > tsc_timestamp) {
+ adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
+ tsc_timestamp = tsc;
+ }
+ }
+
+ local_irq_restore(flags);
+
+ /* With all the info we got, fill in the values */
+
+ if (kvm_caps.has_tsc_control)
+ tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz,
+ v->arch.l1_tsc_scaling_ratio);
+
+ if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
+ kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
+ &vcpu->hv_clock.tsc_shift,
+ &vcpu->hv_clock.tsc_to_system_mul);
+ vcpu->hw_tsc_khz = tgt_tsc_khz;
+ kvm_xen_update_tsc_info(v);
+ }
+
+ vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
+ vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
+ vcpu->last_guest_tsc = tsc_timestamp;
+
+ /* If the host uses TSC clocksource, then it is stable */
+ pvclock_flags = 0;
+ if (use_master_clock)
+ pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
+
+ vcpu->hv_clock.flags = pvclock_flags;
+
+ if (vcpu->pv_time.active)
+ kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
+ if (vcpu->xen.vcpu_info_cache.active)
+ kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
+ offsetof(struct compat_vcpu_info, time));
+ if (vcpu->xen.vcpu_time_info_cache.active)
+ kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
+ kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
+ return 0;
+}
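+
+/*
+ * With the fields filled in above, the guest reconstructs nanoseconds as
+ * (per the pvclock ABI; shown here only as a reference formula):
+ *
+ *	delta = rdtsc() - tsc_timestamp;
+ *	ns = system_time + (((delta << tsc_shift) * tsc_to_system_mul) >> 32);
+ *
+ * with the shift applied as a right shift when tsc_shift is negative.
+ */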
+
+/*
+ * kvmclock updates which are isolated to a given vcpu, such as
+ * vcpu->cpu migration, should not allow system_timestamp from
+ * the rest of the vcpus to remain static. Otherwise ntp frequency
+ * correction applies to one vcpu's system_timestamp but not
+ * the others.
+ *
+ * So in those cases, request a kvmclock update for all vcpus.
+ * We need to rate-limit these requests though, as they can
+ * considerably slow guests that have a large number of vcpus.
+ * The time for a remote vcpu to update its kvmclock is bound
+ * by the delay we use to rate-limit the updates.
+ */
+
+#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
+
+static void kvmclock_update_fn(struct work_struct *work)
+{
+ unsigned long i;
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
+ kvmclock_update_work);
+ struct kvm *kvm = container_of(ka, struct kvm, arch);
+ struct kvm_vcpu *vcpu;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
+}
+
+static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
+{
+ struct kvm *kvm = v->kvm;
+
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
+ schedule_delayed_work(&kvm->arch.kvmclock_update_work,
+ KVMCLOCK_UPDATE_DELAY);
+}
+
+#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
+
+static void kvmclock_sync_fn(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
+ kvmclock_sync_work);
+ struct kvm *kvm = container_of(ka, struct kvm, arch);
+
+ if (!kvmclock_periodic_sync)
+ return;
+
+ schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
+ schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
+ KVMCLOCK_SYNC_PERIOD);
+}
+
+/* These helpers are safe iff @msr is known to be an MCx bank MSR. */
+static bool is_mci_control_msr(u32 msr)
+{
+ return (msr & 3) == 0;
+}
+static bool is_mci_status_msr(u32 msr)
+{
+ return (msr & 3) == 1;
+}
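+
+/*
+ * The (msr & 3) trick above relies on the architectural bank layout, in
+ * which each MCE bank occupies four consecutive MSRs, e.g. for bank 0:
+ * MC0_CTL (0x400), MC0_STATUS (0x401), MC0_ADDR (0x402), MC0_MISC (0x403).
+ */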
+
+/*
+ * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
+ */
+static bool can_set_mci_status(struct kvm_vcpu *vcpu)
+{
+ /* McStatusWrEn enabled? */
+ if (guest_cpuid_is_amd_or_hygon(vcpu))
+ return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
+
+ return false;
+}
+
+static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+ u32 offset, last_msr;
+
+ switch (msr) {
+ case MSR_IA32_MCG_STATUS:
+ vcpu->arch.mcg_status = data;
+ break;
+ case MSR_IA32_MCG_CTL:
+ if (!(mcg_cap & MCG_CTL_P) &&
+ (data || !msr_info->host_initiated))
+ return 1;
+ if (data != 0 && data != ~(u64)0)
+ return 1;
+ vcpu->arch.mcg_ctl = data;
+ break;
+ case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+ last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
+ if (msr > last_msr)
+ return 1;
+
+ if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated))
+ return 1;
+ /* An attempt to write a 1 to a reserved bit raises #GP */
+ if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK))
+ return 1;
+ offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
+ last_msr + 1 - MSR_IA32_MC0_CTL2);
+ vcpu->arch.mci_ctl2_banks[offset] = data;
+ break;
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
+ if (msr > last_msr)
+ return 1;
+
+ /*
+ * Only 0 or all 1s can be written to IA32_MCi_CTL, all other
+		 * values are architecturally undefined.  But, some Linux
+		 * kernels clear bit 10 in bank 4 to work around a BIOS/GART
+		 * TLB issue on AMD K8s, so allow bit 10 to be clear when
+		 * setting all other bits in order to avoid an uncaught #GP
+		 * in the guest.
+ *
+ * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable,
+ * single-bit ECC data errors.
+ */
+ if (is_mci_control_msr(msr) &&
+ data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
+ return 1;
+
+ /*
+ * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR.
+ * AMD-based CPUs allow non-zero values, but if and only if
+ * HWCR[McStatusWrEn] is set.
+ */
+ if (!msr_info->host_initiated && is_mci_status_msr(msr) &&
+ data != 0 && !can_set_mci_status(vcpu))
+ return 1;
+
+ offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
+ last_msr + 1 - MSR_IA32_MC0_CTL);
+ vcpu->arch.mce_banks[offset] = data;
+ break;
+ default:
+ return 1;
+ }
+ return 0;
+}
+
+static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
+{
+ u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
+
+ return (vcpu->arch.apf.msr_en_val & mask) == mask;
+}
+
+static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
+{
+ gpa_t gpa = data & ~0x3f;
+
+	/* Bits 4:5 are reserved; should be zero. */
+ if (data & 0x30)
+ return 1;
+
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) &&
+ (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT))
+ return 1;
+
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) &&
+ (data & KVM_ASYNC_PF_DELIVERY_AS_INT))
+ return 1;
+
+ if (!lapic_in_kernel(vcpu))
+ return data ? 1 : 0;
+
+ vcpu->arch.apf.msr_en_val = data;
+
+ if (!kvm_pv_async_pf_enabled(vcpu)) {
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_async_pf_hash_reset(vcpu);
+ return 0;
+ }
+
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
+ sizeof(u64)))
+ return 1;
+
+ vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
+ vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
+
+ kvm_async_pf_wakeup_all(vcpu);
+
+ return 0;
+}
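+
+/*
+ * Example enable value accepted above (illustrative): a guest that wants
+ * interrupt-based delivery for a shared page at GPA 0x1000 writes
+ * 0x1000 | KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; bits 4:5
+ * must be clear, and bit 6 onward carry the (64-byte aligned) address.
+ */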
+
+static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
+{
+ /* Bits 8-63 are reserved */
+ if (data >> 8)
+ return 1;
+
+ if (!lapic_in_kernel(vcpu))
+ return 1;
+
+ vcpu->arch.apf.msr_int_val = data;
+
+ vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
+
+ return 0;
+}
+
+static void kvmclock_reset(struct kvm_vcpu *vcpu)
+{
+ kvm_gpc_deactivate(&vcpu->arch.pv_time);
+ vcpu->arch.time = 0;
+}
+
+static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.tlb_flush;
+ static_call(kvm_x86_flush_tlb_all)(vcpu);
+
+ /* Flushing all ASIDs flushes the current ASID... */
+ kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+}
+
+static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.tlb_flush;
+
+ if (!tdp_enabled) {
+ /*
+ * A TLB flush on behalf of the guest is equivalent to
+ * INVPCID(all), toggling CR4.PGE, etc., which requires
+ * a forced sync of the shadow page tables. Ensure all the
+ * roots are synced and the guest TLB in hardware is clean.
+ */
+ kvm_mmu_sync_roots(vcpu);
+ kvm_mmu_sync_prev_roots(vcpu);
+ }
+
+ static_call(kvm_x86_flush_tlb_guest)(vcpu);
+
+ /*
+ * Flushing all "guest" TLB is always a superset of Hyper-V's fine
+ * grained flushing.
+ */
+ kvm_hv_vcpu_purge_flush_tlb(vcpu);
+}
+
+static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.tlb_flush;
+ static_call(kvm_x86_flush_tlb_current)(vcpu);
+}
+
+/*
+ * Service "local" TLB flush requests, which are specific to the current MMU
+ * context. In addition to the generic event handling in vcpu_enter_guest(),
+ * TLB flushes that are targeted at an MMU context also need to be serviced
+ * prior to nested VM-Enter/VM-Exit.
+ */
+void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu)
+{
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+ kvm_vcpu_flush_tlb_current(vcpu);
+
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
+ kvm_vcpu_flush_tlb_guest(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests);
+
+static void record_steal_time(struct kvm_vcpu *vcpu)
+{
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+ struct kvm_steal_time __user *st;
+ struct kvm_memslots *slots;
+ gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+ u64 steal;
+ u32 version;
+
+ if (kvm_xen_msr_enabled(vcpu->kvm)) {
+ kvm_xen_runstate_set_running(vcpu);
+ return;
+ }
+
+ if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+ return;
+
+ if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
+ return;
+
+ slots = kvm_memslots(vcpu->kvm);
+
+ if (unlikely(slots->generation != ghc->generation ||
+ gpa != ghc->gpa ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
+ /* We rely on the fact that it fits in a single page. */
+ BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
+
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot)
+ return;
+ }
+
+ st = (struct kvm_steal_time __user *)ghc->hva;
+ /*
+ * Doing a TLB flush here, on the guest's behalf, can avoid
+ * expensive IPIs.
+ */
+ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
+ u8 st_preempted = 0;
+ int err = -EFAULT;
+
+ if (!user_access_begin(st, sizeof(*st)))
+ return;
+
+ asm volatile("1: xchgb %0, %2\n"
+ "xor %1, %1\n"
+ "2:\n"
+ _ASM_EXTABLE_UA(1b, 2b)
+ : "+q" (st_preempted),
+ "+&r" (err),
+ "+m" (st->preempted));
+ if (err)
+ goto out;
+
+ user_access_end();
+
+ vcpu->arch.st.preempted = 0;
+
+ trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
+ st_preempted & KVM_VCPU_FLUSH_TLB);
+ if (st_preempted & KVM_VCPU_FLUSH_TLB)
+ kvm_vcpu_flush_tlb_guest(vcpu);
+
+ if (!user_access_begin(st, sizeof(*st)))
+ goto dirty;
+ } else {
+ if (!user_access_begin(st, sizeof(*st)))
+ return;
+
+ unsafe_put_user(0, &st->preempted, out);
+ vcpu->arch.st.preempted = 0;
+ }
+
+ unsafe_get_user(version, &st->version, out);
+ if (version & 1)
+ version += 1; /* first time write, random junk */
+
+ version += 1;
+ unsafe_put_user(version, &st->version, out);
+
+ smp_wmb();
+
+ unsafe_get_user(steal, &st->steal, out);
+ steal += current->sched_info.run_delay -
+ vcpu->arch.st.last_steal;
+ vcpu->arch.st.last_steal = current->sched_info.run_delay;
+ unsafe_put_user(steal, &st->steal, out);
+
+ version += 1;
+ unsafe_put_user(version, &st->version, out);
+
+ out:
+ user_access_end();
+ dirty:
+ mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
+}
+
+static bool kvm_is_msr_to_save(u32 msr_index)
+{
+ unsigned int i;
+
+ for (i = 0; i < num_msrs_to_save; i++) {
+ if (msrs_to_save[i] == msr_index)
+ return true;
+ }
+
+ return false;
+}
+
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+
+ if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr)
+ return kvm_xen_write_hypercall_page(vcpu, data);
+
+ switch (msr) {
+ case MSR_AMD64_NB_CFG:
+ case MSR_IA32_UCODE_WRITE:
+ case MSR_VM_HSAVE_PA:
+ case MSR_AMD64_PATCH_LOADER:
+ case MSR_AMD64_BU_CFG2:
+ case MSR_AMD64_DC_CFG:
+ case MSR_AMD64_TW_CFG:
+ case MSR_F15H_EX_CFG:
+ break;
+
+ case MSR_IA32_UCODE_REV:
+ if (msr_info->host_initiated)
+ vcpu->arch.microcode_version = data;
+ break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated)
+ return 1;
+ vcpu->arch.arch_capabilities = data;
+ break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ if (!msr_info->host_initiated)
+ return 1;
+ if (data & ~kvm_caps.supported_perf_cap)
+ return 1;
+
+ /*
+ * Note, this is not just a performance optimization! KVM
+ * disallows changing feature MSRs after the vCPU has run; PMU
+ * refresh will bug the VM if called after the vCPU has run.
+ */
+ if (vcpu->arch.perf_capabilities == data)
+ break;
+
+ vcpu->arch.perf_capabilities = data;
+ kvm_pmu_refresh(vcpu);
+ break;
+ case MSR_IA32_PRED_CMD:
+ if (!msr_info->host_initiated && !guest_has_pred_cmd_msr(vcpu))
+ return 1;
+
+ if (!boot_cpu_has(X86_FEATURE_IBPB) || (data & ~PRED_CMD_IBPB))
+ return 1;
+ if (!data)
+ break;
+
+ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+ break;
+ case MSR_IA32_FLUSH_CMD:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
+ return 1;
+
+ if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH))
+ return 1;
+ if (!data)
+ break;
+
+ wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ break;
+ case MSR_EFER:
+ return set_efer(vcpu, msr_info);
+ case MSR_K7_HWCR:
+ data &= ~(u64)0x40; /* ignore flush filter disable */
+ data &= ~(u64)0x100; /* ignore ignne emulation enable */
+ data &= ~(u64)0x8; /* ignore TLB cache disable */
+
+ /* Handle McStatusWrEn */
+ if (data == BIT_ULL(18)) {
+ vcpu->arch.msr_hwcr = data;
+ } else if (data != 0) {
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+ return 1;
+ }
+ break;
+ case MSR_FAM10H_MMIO_CONF_BASE:
+ if (data != 0) {
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+ return 1;
+ }
+ break;
+ case MSR_IA32_CR_PAT:
+ if (!kvm_pat_valid(data))
+ return 1;
+
+ vcpu->arch.pat = data;
+ break;
+ case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
+ case MSR_MTRRdefType:
+ return kvm_mtrr_set_msr(vcpu, msr, data);
+ case MSR_IA32_APICBASE:
+ return kvm_set_apic_base(vcpu, msr_info);
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
+ return kvm_x2apic_msr_write(vcpu, msr, data);
+ case MSR_IA32_TSC_DEADLINE:
+ kvm_set_lapic_tscdeadline_msr(vcpu, data);
+ break;
+ case MSR_IA32_TSC_ADJUST:
+ if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
+ if (!msr_info->host_initiated) {
+ s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
+ adjust_tsc_offset_guest(vcpu, adj);
+				/*
+				 * Before returning to the guest, tsc_timestamp
+				 * must be adjusted as well, otherwise the
+				 * guest's per-CPU pvclock time could jump.
+				 */
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ }
+ vcpu->arch.ia32_tsc_adjust_msr = data;
+ }
+ break;
+ case MSR_IA32_MISC_ENABLE: {
+ u64 old_val = vcpu->arch.ia32_misc_enable_msr;
+
+ if (!msr_info->host_initiated) {
+ /* RO bits */
+ if ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK)
+ return 1;
+
+ /* R bits, i.e. writes are ignored, but don't fault. */
+ data = data & ~MSR_IA32_MISC_ENABLE_EMON;
+ data |= old_val & MSR_IA32_MISC_ENABLE_EMON;
+ }
+
+ if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
+ ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
+ return 1;
+ vcpu->arch.ia32_misc_enable_msr = data;
+ kvm_update_cpuid_runtime(vcpu);
+ } else {
+ vcpu->arch.ia32_misc_enable_msr = data;
+ }
+ break;
+ }
+ case MSR_IA32_SMBASE:
+ if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
+ return 1;
+ vcpu->arch.smbase = data;
+ break;
+ case MSR_IA32_POWER_CTL:
+ vcpu->arch.msr_ia32_power_ctl = data;
+ break;
+ case MSR_IA32_TSC:
+ if (msr_info->host_initiated) {
+ kvm_synchronize_tsc(vcpu, data);
+ } else {
+ u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
+ adjust_tsc_offset_guest(vcpu, adj);
+ vcpu->arch.ia32_tsc_adjust_msr += adj;
+ }
+ break;
+ case MSR_IA32_XSS:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+ return 1;
+ /*
+ * KVM supports exposing PT to the guest, but does not support
+ * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
+ * XSAVES/XRSTORS to save/restore PT MSRs.
+ */
+ if (data & ~kvm_caps.supported_xss)
+ return 1;
+ vcpu->arch.ia32_xss = data;
+ kvm_update_cpuid_runtime(vcpu);
+ break;
+ case MSR_SMI_COUNT:
+ if (!msr_info->host_initiated)
+ return 1;
+ vcpu->arch.smi_count = data;
+ break;
+ case MSR_KVM_WALL_CLOCK_NEW:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return 1;
+
+ vcpu->kvm->arch.wall_clock = data;
+ kvm_write_wall_clock(vcpu->kvm, data, 0);
+ break;
+ case MSR_KVM_WALL_CLOCK:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return 1;
+
+ vcpu->kvm->arch.wall_clock = data;
+ kvm_write_wall_clock(vcpu->kvm, data, 0);
+ break;
+ case MSR_KVM_SYSTEM_TIME_NEW:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return 1;
+
+ kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
+ break;
+ case MSR_KVM_SYSTEM_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return 1;
+
+ kvm_write_system_time(vcpu, data, true, msr_info->host_initiated);
+ break;
+ case MSR_KVM_ASYNC_PF_EN:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+ return 1;
+
+ if (kvm_pv_enable_async_pf(vcpu, data))
+ return 1;
+ break;
+ case MSR_KVM_ASYNC_PF_INT:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+ return 1;
+
+ if (kvm_pv_enable_async_pf_int(vcpu, data))
+ return 1;
+ break;
+ case MSR_KVM_ASYNC_PF_ACK:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+ return 1;
+ if (data & 0x1) {
+ vcpu->arch.apf.pageready_pending = false;
+ kvm_check_async_pf_completion(vcpu);
+ }
+ break;
+ case MSR_KVM_STEAL_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
+ return 1;
+
+ if (unlikely(!sched_info_on()))
+ return 1;
+
+ if (data & KVM_STEAL_RESERVED_MASK)
+ return 1;
+
+ vcpu->arch.st.msr_val = data;
+
+ if (!(data & KVM_MSR_ENABLED))
+ break;
+
+ kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
+
+ break;
+ case MSR_KVM_PV_EOI_EN:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
+ return 1;
+
+ if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8)))
+ return 1;
+ break;
+
+ case MSR_KVM_POLL_CONTROL:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
+ return 1;
+
+ /* only enable bit supported */
+ if (data & (-1ULL << 1))
+ return 1;
+
+ vcpu->arch.msr_kvm_poll_control = data;
+ break;
+
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+ return set_msr_mce(vcpu, msr_info);
+
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+ case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+ case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
+ if (kvm_pmu_is_valid_msr(vcpu, msr))
+ return kvm_pmu_set_msr(vcpu, msr_info);
+
+ if (data)
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+ break;
+ case MSR_K7_CLK_CTL:
+		/*
+		 * Ignore all writes to this no-longer-documented MSR.
+		 * Writes are only relevant for old K7 processors, all
+		 * pre-dating SVM, for which AMD recommended this as a
+		 * workaround.  It is possible to specify the affected
+		 * processor models on the command line, hence the need
+		 * to ignore (rather than reject) the workaround.
+		 */
+ break;
+ case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_CRASH_CTL:
+ case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ return kvm_hv_set_msr_common(vcpu, msr, data,
+ msr_info->host_initiated);
+ case MSR_IA32_BBL_CR_CTL3:
+ /* Drop writes to this legacy MSR -- see rdmsr
+ * counterpart for further detail.
+ */
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+ break;
+ case MSR_AMD64_OSVW_ID_LENGTH:
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
+ return 1;
+ vcpu->arch.osvw.length = data;
+ break;
+ case MSR_AMD64_OSVW_STATUS:
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
+ return 1;
+ vcpu->arch.osvw.status = data;
+ break;
+ case MSR_PLATFORM_INFO:
+ if (!msr_info->host_initiated ||
+ (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
+ cpuid_fault_enabled(vcpu)))
+ return 1;
+ vcpu->arch.msr_platform_info = data;
+ break;
+ case MSR_MISC_FEATURES_ENABLES:
+ if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
+ (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
+ !supports_cpuid_fault(vcpu)))
+ return 1;
+ vcpu->arch.msr_misc_features_enables = data;
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_IA32_XFD:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ return 1;
+
+ if (data & ~kvm_guest_supported_xfd(vcpu))
+ return 1;
+
+ fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data);
+ break;
+ case MSR_IA32_XFD_ERR:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ return 1;
+
+ if (data & ~kvm_guest_supported_xfd(vcpu))
+ return 1;
+
+ vcpu->arch.guest_fpu.xfd_err = data;
+ break;
+#endif
+ default:
+ if (kvm_pmu_is_valid_msr(vcpu, msr))
+ return kvm_pmu_set_msr(vcpu, msr_info);
+
+ /*
+ * Userspace is allowed to write '0' to MSRs that KVM reports
+	 * as to-be-saved, even if an MSR isn't fully supported.
+ */
+ if (msr_info->host_initiated && !data &&
+ kvm_is_msr_to_save(msr))
+ break;
+
+ return KVM_MSR_RET_INVALID;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_msr_common);
+
+static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
+{
+ u64 data;
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+ u32 offset, last_msr;
+
+ switch (msr) {
+ case MSR_IA32_P5_MC_ADDR:
+ case MSR_IA32_P5_MC_TYPE:
+ data = 0;
+ break;
+ case MSR_IA32_MCG_CAP:
+ data = vcpu->arch.mcg_cap;
+ break;
+ case MSR_IA32_MCG_CTL:
+ if (!(mcg_cap & MCG_CTL_P) && !host)
+ return 1;
+ data = vcpu->arch.mcg_ctl;
+ break;
+ case MSR_IA32_MCG_STATUS:
+ data = vcpu->arch.mcg_status;
+ break;
+ case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+ last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
+ if (msr > last_msr)
+ return 1;
+
+ if (!(mcg_cap & MCG_CMCI_P) && !host)
+ return 1;
+ offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
+ last_msr + 1 - MSR_IA32_MC0_CTL2);
+ data = vcpu->arch.mci_ctl2_banks[offset];
+ break;
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
+ if (msr > last_msr)
+ return 1;
+
+ offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
+ last_msr + 1 - MSR_IA32_MC0_CTL);
+ data = vcpu->arch.mce_banks[offset];
+ break;
+ default:
+ return 1;
+ }
+ *pdata = data;
+ return 0;
+}
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ switch (msr_info->index) {
+ case MSR_IA32_PLATFORM_ID:
+ case MSR_IA32_EBL_CR_POWERON:
+ case MSR_IA32_LASTBRANCHFROMIP:
+ case MSR_IA32_LASTBRANCHTOIP:
+ case MSR_IA32_LASTINTFROMIP:
+ case MSR_IA32_LASTINTTOIP:
+ case MSR_AMD64_SYSCFG:
+ case MSR_K8_TSEG_ADDR:
+ case MSR_K8_TSEG_MASK:
+ case MSR_VM_HSAVE_PA:
+ case MSR_K8_INT_PENDING_MSG:
+ case MSR_AMD64_NB_CFG:
+ case MSR_FAM10H_MMIO_CONF_BASE:
+ case MSR_AMD64_BU_CFG2:
+ case MSR_IA32_PERF_CTL:
+ case MSR_AMD64_DC_CFG:
+ case MSR_AMD64_TW_CFG:
+ case MSR_F15H_EX_CFG:
+ /*
+ * Intel Sandy Bridge CPUs must support the RAPL (running average power
+ * limit) MSRs. Just return 0, as we do not want to expose the host
+ * data here. Do not conditionalize this on CPUID, as KVM does not do
+ * so for existing CPU-specific MSRs.
+ */
+ case MSR_RAPL_POWER_UNIT:
+ case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */
+ case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */
+ case MSR_PKG_ENERGY_STATUS: /* Total package */
+ case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */
+ msr_info->data = 0;
+ break;
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+ case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
+ case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
+ if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+ return kvm_pmu_get_msr(vcpu, msr_info);
+ msr_info->data = 0;
+ break;
+ case MSR_IA32_UCODE_REV:
+ msr_info->data = vcpu->arch.microcode_version;
+ break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
+ return 1;
+ msr_info->data = vcpu->arch.arch_capabilities;
+ break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+ return 1;
+ msr_info->data = vcpu->arch.perf_capabilities;
+ break;
+ case MSR_IA32_POWER_CTL:
+ msr_info->data = vcpu->arch.msr_ia32_power_ctl;
+ break;
+ case MSR_IA32_TSC: {
+ /*
+ * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
+ * even when not intercepted. The AMD manual doesn't explicitly
+ * state this, but AMD CPUs appear to behave the same.
+ *
+ * On userspace reads and writes, however, we unconditionally
+ * return L1's TSC value to ensure backwards-compatible
+ * behavior for migration.
+ */
+ u64 offset, ratio;
+
+ if (msr_info->host_initiated) {
+ offset = vcpu->arch.l1_tsc_offset;
+ ratio = vcpu->arch.l1_tsc_scaling_ratio;
+ } else {
+ offset = vcpu->arch.tsc_offset;
+ ratio = vcpu->arch.tsc_scaling_ratio;
+ }
+
+ msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset;
+ break;
+ }
+ case MSR_IA32_CR_PAT:
+ msr_info->data = vcpu->arch.pat;
+ break;
+ case MSR_MTRRcap:
+ case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
+ case MSR_MTRRdefType:
+ return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
+ case 0xcd: /* fsb frequency */
+ msr_info->data = 3;
+ break;
+ /*
+ * MSR_EBC_FREQUENCY_ID
+ * Conservative value valid for even the basic CPU models.
+ * Models 0,1: 000 in bits 23:21 indicating a bus speed of
+ * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
+ * and 266MHz for models 3 and 4. Set the Core Clock
+ * Frequency to System Bus Frequency Ratio to 1 (bits
+ * 31:24), even though it is only valid for CPU
+ * models > 2; guests may otherwise end up dividing or
+ * multiplying by zero.
+ */
+ case MSR_EBC_FREQUENCY_ID:
+ msr_info->data = 1 << 24;
+ break;
+ case MSR_IA32_APICBASE:
+ msr_info->data = kvm_get_apic_base(vcpu);
+ break;
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
+ return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
+ case MSR_IA32_TSC_DEADLINE:
+ msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
+ break;
+ case MSR_IA32_TSC_ADJUST:
+ msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
+ break;
+ case MSR_IA32_MISC_ENABLE:
+ msr_info->data = vcpu->arch.ia32_misc_enable_msr;
+ break;
+ case MSR_IA32_SMBASE:
+ if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
+ return 1;
+ msr_info->data = vcpu->arch.smbase;
+ break;
+ case MSR_SMI_COUNT:
+ msr_info->data = vcpu->arch.smi_count;
+ break;
+ case MSR_IA32_PERF_STATUS:
+ /* TSC increment by tick */
+ msr_info->data = 1000ULL;
+ /* CPU multiplier */
+ msr_info->data |= (((uint64_t)4ULL) << 40);
+ break;
+ case MSR_EFER:
+ msr_info->data = vcpu->arch.efer;
+ break;
+ case MSR_KVM_WALL_CLOCK:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return 1;
+
+ msr_info->data = vcpu->kvm->arch.wall_clock;
+ break;
+ case MSR_KVM_WALL_CLOCK_NEW:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return 1;
+
+ msr_info->data = vcpu->kvm->arch.wall_clock;
+ break;
+ case MSR_KVM_SYSTEM_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return 1;
+
+ msr_info->data = vcpu->arch.time;
+ break;
+ case MSR_KVM_SYSTEM_TIME_NEW:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return 1;
+
+ msr_info->data = vcpu->arch.time;
+ break;
+ case MSR_KVM_ASYNC_PF_EN:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+ return 1;
+
+ msr_info->data = vcpu->arch.apf.msr_en_val;
+ break;
+ case MSR_KVM_ASYNC_PF_INT:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+ return 1;
+
+ msr_info->data = vcpu->arch.apf.msr_int_val;
+ break;
+ case MSR_KVM_ASYNC_PF_ACK:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+ return 1;
+
+ msr_info->data = 0;
+ break;
+ case MSR_KVM_STEAL_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
+ return 1;
+
+ msr_info->data = vcpu->arch.st.msr_val;
+ break;
+ case MSR_KVM_PV_EOI_EN:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
+ return 1;
+
+ msr_info->data = vcpu->arch.pv_eoi.msr_val;
+ break;
+ case MSR_KVM_POLL_CONTROL:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
+ return 1;
+
+ msr_info->data = vcpu->arch.msr_kvm_poll_control;
+ break;
+ case MSR_IA32_P5_MC_ADDR:
+ case MSR_IA32_P5_MC_TYPE:
+ case MSR_IA32_MCG_CAP:
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+ return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
+ msr_info->host_initiated);
+ case MSR_IA32_XSS:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+ return 1;
+ msr_info->data = vcpu->arch.ia32_xss;
+ break;
+ case MSR_K7_CLK_CTL:
+ /*
+ * Provide the expected ramp-up count for K7. All other
+ * fields are set to zero, indicating minimum divisors for
+ * every field.
+ *
+ * This prevents guest kernels on AMD host with CPU
+ * type 6, model 8 and higher from exploding due to
+ * the rdmsr failing.
+ */
+ msr_info->data = 0x20000000;
+ break;
+ case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_CRASH_CTL:
+ case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ return kvm_hv_get_msr_common(vcpu,
+ msr_info->index, &msr_info->data,
+ msr_info->host_initiated);
+ case MSR_IA32_BBL_CR_CTL3:
+ /*
+ * This legacy MSR exists but isn't fully documented in current
+ * silicon. It is, however, accessed by Windows XP in very narrow
+ * scenarios where it sets bit #19, itself documented as a
+ * "reserved" bit. Make a best-effort attempt to source coherent
+ * read data here should the balance of the register be
+ * interpreted by the guest:
+ *
+ * L2 cache control register 3: 64GB range, 256KB size,
+ * enabled, latency 0x1, configured
+ msr_info->data = 0xbe702111;
+ break;
+ case MSR_AMD64_OSVW_ID_LENGTH:
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
+ return 1;
+ msr_info->data = vcpu->arch.osvw.length;
+ break;
+ case MSR_AMD64_OSVW_STATUS:
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
+ return 1;
+ msr_info->data = vcpu->arch.osvw.status;
+ break;
+ case MSR_PLATFORM_INFO:
+ if (!msr_info->host_initiated &&
+ !vcpu->kvm->arch.guest_can_read_msr_platform_info)
+ return 1;
+ msr_info->data = vcpu->arch.msr_platform_info;
+ break;
+ case MSR_MISC_FEATURES_ENABLES:
+ msr_info->data = vcpu->arch.msr_misc_features_enables;
+ break;
+ case MSR_K7_HWCR:
+ msr_info->data = vcpu->arch.msr_hwcr;
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_IA32_XFD:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ return 1;
+
+ msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd;
+ break;
+ case MSR_IA32_XFD_ERR:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ return 1;
+
+ msr_info->data = vcpu->arch.guest_fpu.xfd_err;
+ break;
+#endif
+ default:
+ if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+ return kvm_pmu_get_msr(vcpu, msr_info);
+
+ /*
+ * Userspace is allowed to read MSRs that KVM reports as
+ * to-be-saved, even if an MSR isn't fully supported.
+ */
+ if (msr_info->host_initiated &&
+ kvm_is_msr_to_save(msr_info->index)) {
+ msr_info->data = 0;
+ break;
+ }
+
+ return KVM_MSR_RET_INVALID;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_get_msr_common);
+
+/*
+ * Read or write a bunch of msrs. All parameters are kernel addresses.
+ *
+ * @return number of msrs processed successfully.
+ */
+static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
+ struct kvm_msr_entry *entries,
+ int (*do_msr)(struct kvm_vcpu *vcpu,
+ unsigned index, u64 *data))
+{
+ int i;
+
+ for (i = 0; i < msrs->nmsrs; ++i)
+ if (do_msr(vcpu, entries[i].index, &entries[i].data))
+ break;
+
+ return i;
+}
+
+/*
+ * Read or write a bunch of msrs. Parameters are user addresses.
+ *
+ * @return number of msrs processed successfully.
+ */
+static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
+ int (*do_msr)(struct kvm_vcpu *vcpu,
+ unsigned index, u64 *data),
+ int writeback)
+{
+ struct kvm_msrs msrs;
+ struct kvm_msr_entry *entries;
+ unsigned size;
+ int r;
+
+ r = -EFAULT;
+ if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
+ goto out;
+
+ r = -E2BIG;
+ if (msrs.nmsrs >= MAX_IO_MSRS)
+ goto out;
+
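+ /* nmsrs was capped above, so the allocation below is bounded. */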
+ size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
+ entries = memdup_user(user_msrs->entries, size);
+ if (IS_ERR(entries)) {
+ r = PTR_ERR(entries);
+ goto out;
+ }
+
+ r = __msr_io(vcpu, &msrs, entries, do_msr);
+
+ if (writeback && copy_to_user(user_msrs->entries, entries, size))
+ r = -EFAULT;
+
+ kfree(entries);
+out:
+ return r;
+}
+
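+ /*
+ * MWAIT can be safely exposed only if the host's APIC timer keeps
+ * running in deep C-states (ARAT) and the MONITOR/MWAIT erratum
+ * is not present.
+ */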
+static inline bool kvm_can_mwait_in_guest(void)
+{
+ return boot_cpu_has(X86_FEATURE_MWAIT) &&
+ !boot_cpu_has_bug(X86_BUG_MONITOR) &&
+ boot_cpu_has(X86_FEATURE_ARAT);
+}
+
+static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 __user *cpuid_arg)
+{
+ struct kvm_cpuid2 cpuid;
+ int r;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
+ return r;
+
+ r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries);
+ if (r)
+ return r;
+
+ r = -EFAULT;
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
+ return r;
+
+ return 0;
+}
+
+int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
+{
+ int r = 0;
+
+ switch (ext) {
+ case KVM_CAP_IRQCHIP:
+ case KVM_CAP_HLT:
+ case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
+ case KVM_CAP_SET_TSS_ADDR:
+ case KVM_CAP_EXT_CPUID:
+ case KVM_CAP_EXT_EMUL_CPUID:
+ case KVM_CAP_CLOCKSOURCE:
+ case KVM_CAP_PIT:
+ case KVM_CAP_NOP_IO_DELAY:
+ case KVM_CAP_MP_STATE:
+ case KVM_CAP_SYNC_MMU:
+ case KVM_CAP_USER_NMI:
+ case KVM_CAP_REINJECT_CONTROL:
+ case KVM_CAP_IRQ_INJECT_STATUS:
+ case KVM_CAP_IOEVENTFD:
+ case KVM_CAP_IOEVENTFD_NO_LENGTH:
+ case KVM_CAP_PIT2:
+ case KVM_CAP_PIT_STATE2:
+ case KVM_CAP_SET_IDENTITY_MAP_ADDR:
+ case KVM_CAP_VCPU_EVENTS:
+ case KVM_CAP_HYPERV:
+ case KVM_CAP_HYPERV_VAPIC:
+ case KVM_CAP_HYPERV_SPIN:
+ case KVM_CAP_HYPERV_SYNIC:
+ case KVM_CAP_HYPERV_SYNIC2:
+ case KVM_CAP_HYPERV_VP_INDEX:
+ case KVM_CAP_HYPERV_EVENTFD:
+ case KVM_CAP_HYPERV_TLBFLUSH:
+ case KVM_CAP_HYPERV_SEND_IPI:
+ case KVM_CAP_HYPERV_CPUID:
+ case KVM_CAP_HYPERV_ENFORCE_CPUID:
+ case KVM_CAP_SYS_HYPERV_CPUID:
+ case KVM_CAP_PCI_SEGMENT:
+ case KVM_CAP_DEBUGREGS:
+ case KVM_CAP_X86_ROBUST_SINGLESTEP:
+ case KVM_CAP_XSAVE:
+ case KVM_CAP_ASYNC_PF:
+ case KVM_CAP_ASYNC_PF_INT:
+ case KVM_CAP_GET_TSC_KHZ:
+ case KVM_CAP_KVMCLOCK_CTRL:
+ case KVM_CAP_READONLY_MEM:
+ case KVM_CAP_HYPERV_TIME:
+ case KVM_CAP_IOAPIC_POLARITY_IGNORED:
+ case KVM_CAP_TSC_DEADLINE_TIMER:
+ case KVM_CAP_DISABLE_QUIRKS:
+ case KVM_CAP_SET_BOOT_CPU_ID:
+ case KVM_CAP_SPLIT_IRQCHIP:
+ case KVM_CAP_IMMEDIATE_EXIT:
+ case KVM_CAP_PMU_EVENT_FILTER:
+ case KVM_CAP_PMU_EVENT_MASKED_EVENTS:
+ case KVM_CAP_GET_MSR_FEATURES:
+ case KVM_CAP_MSR_PLATFORM_INFO:
+ case KVM_CAP_EXCEPTION_PAYLOAD:
+ case KVM_CAP_X86_TRIPLE_FAULT_EVENT:
+ case KVM_CAP_SET_GUEST_DEBUG:
+ case KVM_CAP_LAST_CPU:
+ case KVM_CAP_X86_USER_SPACE_MSR:
+ case KVM_CAP_X86_MSR_FILTER:
+ case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
+#ifdef CONFIG_X86_SGX_KVM
+ case KVM_CAP_SGX_ATTRIBUTE:
+#endif
+ case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+ case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
+ case KVM_CAP_SREGS2:
+ case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
+ case KVM_CAP_VCPU_ATTRIBUTES:
+ case KVM_CAP_SYS_ATTRIBUTES:
+ case KVM_CAP_VAPIC:
+ case KVM_CAP_ENABLE_CAP:
+ case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
+ case KVM_CAP_IRQFD_RESAMPLE:
+ r = 1;
+ break;
+ case KVM_CAP_EXIT_HYPERCALL:
+ r = KVM_EXIT_HYPERCALL_VALID_MASK;
+ break;
+ case KVM_CAP_SET_GUEST_DEBUG2:
+ return KVM_GUESTDBG_VALID_MASK;
+#ifdef CONFIG_KVM_XEN
+ case KVM_CAP_XEN_HVM:
+ r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
+ KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
+ KVM_XEN_HVM_CONFIG_SHARED_INFO |
+ KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
+ KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
+ if (sched_info_on())
+ r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
+ KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
+ break;
+#endif
+ case KVM_CAP_SYNC_REGS:
+ r = KVM_SYNC_X86_VALID_FIELDS;
+ break;
+ case KVM_CAP_ADJUST_CLOCK:
+ r = KVM_CLOCK_VALID_FLAGS;
+ break;
+ case KVM_CAP_X86_DISABLE_EXITS:
+ r = KVM_X86_DISABLE_EXITS_PAUSE;
+
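+ /*
+ * HLT/MWAIT/C-state exits can only be disabled when the
+ * cross-thread return predictions (SMT RSB) mitigation is off.
+ */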
+ if (!mitigate_smt_rsb) {
+ r |= KVM_X86_DISABLE_EXITS_HLT |
+ KVM_X86_DISABLE_EXITS_CSTATE;
+
+ if (kvm_can_mwait_in_guest())
+ r |= KVM_X86_DISABLE_EXITS_MWAIT;
+ }
+ break;
+ case KVM_CAP_X86_SMM:
+ if (!IS_ENABLED(CONFIG_KVM_SMM))
+ break;
+
+ /*
+ * SMBASE is usually relocated above 1M on modern chipsets,
+ * and SMM handlers might indeed rely on 4G segment limits,
+ * so do not report SMM to be available if real mode is
+ * emulated via vm86 mode. Still, do not go to great lengths
+ * to avoid userspace's usage of the feature, because it is a
+ * fringe case that is not enabled except via specific settings
+ * of the module parameters.
+ */
+ r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE);
+ break;
+ case KVM_CAP_NR_VCPUS:
+ r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS);
+ break;
+ case KVM_CAP_MAX_VCPUS:
+ r = KVM_MAX_VCPUS;
+ break;
+ case KVM_CAP_MAX_VCPU_ID:
+ r = KVM_MAX_VCPU_IDS;
+ break;
+ case KVM_CAP_PV_MMU: /* obsolete */
+ r = 0;
+ break;
+ case KVM_CAP_MCE:
+ r = KVM_MAX_MCE_BANKS;
+ break;
+ case KVM_CAP_XCRS:
+ r = boot_cpu_has(X86_FEATURE_XSAVE);
+ break;
+ case KVM_CAP_TSC_CONTROL:
+ case KVM_CAP_VM_TSC_CONTROL:
+ r = kvm_caps.has_tsc_control;
+ break;
+ case KVM_CAP_X2APIC_API:
+ r = KVM_X2APIC_API_VALID_FLAGS;
+ break;
+ case KVM_CAP_NESTED_STATE:
+ r = kvm_x86_ops.nested_ops->get_state ?
+ kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
+ break;
+ case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
+ r = kvm_x86_ops.enable_l2_tlb_flush != NULL;
+ break;
+ case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
+ r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
+ break;
+ case KVM_CAP_SMALLER_MAXPHYADDR:
+ r = (int) allow_smaller_maxphyaddr;
+ break;
+ case KVM_CAP_STEAL_TIME:
+ r = sched_info_on();
+ break;
+ case KVM_CAP_X86_BUS_LOCK_EXIT:
+ if (kvm_caps.has_bus_lock_exit)
+ r = KVM_BUS_LOCK_DETECTION_OFF |
+ KVM_BUS_LOCK_DETECTION_EXIT;
+ else
+ r = 0;
+ break;
+ case KVM_CAP_XSAVE2: {
+ r = xstate_required_size(kvm_get_filtered_xcr0(), false);
+ if (r < sizeof(struct kvm_xsave))
+ r = sizeof(struct kvm_xsave);
+ break;
+ }
+ case KVM_CAP_PMU_CAPABILITY:
+ r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0;
+ break;
+ case KVM_CAP_DISABLE_QUIRKS2:
+ r = KVM_X86_VALID_QUIRKS;
+ break;
+ case KVM_CAP_X86_NOTIFY_VMEXIT:
+ r = kvm_caps.has_notify_vmexit;
+ break;
+ default:
+ break;
+ }
+ return r;
+}
+
+static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr)
+{
+ void __user *uaddr = (void __user *)(unsigned long)attr->addr;
+
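+ /* Reject addresses that don't survive the u64 -> pointer round trip. */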
+ if ((u64)(unsigned long)uaddr != attr->addr)
+ return ERR_PTR_USR(-EFAULT);
+ return uaddr;
+}
+
+static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
+{
+ u64 __user *uaddr = kvm_get_attr_addr(attr);
+
+ if (attr->group)
+ return -ENXIO;
+
+ if (IS_ERR(uaddr))
+ return PTR_ERR(uaddr);
+
+ switch (attr->attr) {
+ case KVM_X86_XCOMP_GUEST_SUPP:
+ if (put_user(kvm_caps.supported_xcr0, uaddr))
+ return -EFAULT;
+ return 0;
+ default:
+ return -ENXIO;
+ }
+}
+
+static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr)
+{
+ if (attr->group)
+ return -ENXIO;
+
+ switch (attr->attr) {
+ case KVM_X86_XCOMP_GUEST_SUPP:
+ return 0;
+ default:
+ return -ENXIO;
+ }
+}
+
+long kvm_arch_dev_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ void __user *argp = (void __user *)arg;
+ long r;
+
+ switch (ioctl) {
+ case KVM_GET_MSR_INDEX_LIST: {
+ struct kvm_msr_list __user *user_msr_list = argp;
+ struct kvm_msr_list msr_list;
+ unsigned n;
+
+ r = -EFAULT;
+ if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
+ goto out;
+ n = msr_list.nmsrs;
+ msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
+ if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
+ goto out;
+ r = -E2BIG;
+ if (n < msr_list.nmsrs)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(user_msr_list->indices, &msrs_to_save,
+ num_msrs_to_save * sizeof(u32)))
+ goto out;
+ if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
+ &emulated_msrs,
+ num_emulated_msrs * sizeof(u32)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_GET_SUPPORTED_CPUID:
+ case KVM_GET_EMULATED_CPUID: {
+ struct kvm_cpuid2 __user *cpuid_arg = argp;
+ struct kvm_cpuid2 cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
+ goto out;
+
+ r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
+ ioctl);
+ if (r)
+ goto out;
+
+ r = -EFAULT;
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_X86_GET_MCE_CAP_SUPPORTED:
+ r = -EFAULT;
+ if (copy_to_user(argp, &kvm_caps.supported_mce_cap,
+ sizeof(kvm_caps.supported_mce_cap)))
+ goto out;
+ r = 0;
+ break;
+ case KVM_GET_MSR_FEATURE_INDEX_LIST: {
+ struct kvm_msr_list __user *user_msr_list = argp;
+ struct kvm_msr_list msr_list;
+ unsigned int n;
+
+ r = -EFAULT;
+ if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
+ goto out;
+ n = msr_list.nmsrs;
+ msr_list.nmsrs = num_msr_based_features;
+ if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
+ goto out;
+ r = -E2BIG;
+ if (n < msr_list.nmsrs)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(user_msr_list->indices, &msr_based_features,
+ num_msr_based_features * sizeof(u32)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_GET_MSRS:
+ r = msr_io(NULL, argp, do_get_msr_feature, 1);
+ break;
+ case KVM_GET_SUPPORTED_HV_CPUID:
+ r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp);
+ break;
+ case KVM_GET_DEVICE_ATTR: {
+ struct kvm_device_attr attr;
+ r = -EFAULT;
+ if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
+ break;
+ r = kvm_x86_dev_get_attr(&attr);
+ break;
+ }
+ case KVM_HAS_DEVICE_ATTR: {
+ struct kvm_device_attr attr;
+ r = -EFAULT;
+ if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
+ break;
+ r = kvm_x86_dev_has_attr(&attr);
+ break;
+ }
+ default:
+ r = -EINVAL;
+ break;
+ }
+out:
+ return r;
+}
+
+static void wbinvd_ipi(void *garbage)
+{
+ wbinvd();
+}
+
+static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+ return kvm_arch_has_noncoherent_dma(vcpu->kvm);
+}
+
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ /* Account for the possibility that the guest executes WBINVD. */
+ if (need_emulate_wbinvd(vcpu)) {
+ if (static_call(kvm_x86_has_wbinvd_exit)())
+ cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
+ else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
+ smp_call_function_single(vcpu->cpu,
+ wbinvd_ipi, NULL, 1);
+ }
+
+ static_call(kvm_x86_vcpu_load)(vcpu, cpu);
+
+ /* Save host pkru register if supported */
+ vcpu->arch.host_pkru = read_pkru();
+
+ /* Apply any externally detected TSC adjustments (due to suspend) */
+ if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+ adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
+ vcpu->arch.tsc_offset_adjustment = 0;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ }
+
+ if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
+ s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
+ rdtsc() - vcpu->arch.last_host_tsc;
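+ /* A negative delta means the TSC went backwards across CPUs. */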
+ if (tsc_delta < 0)
+ mark_tsc_unstable("KVM discovered backwards TSC");
+
+ if (kvm_check_tsc_unstable()) {
+ u64 offset = kvm_compute_l1_tsc_offset(vcpu,
+ vcpu->arch.last_guest_tsc);
+ kvm_vcpu_write_tsc_offset(vcpu, offset);
+ vcpu->arch.tsc_catchup = 1;
+ }
+
+ if (kvm_lapic_hv_timer_in_use(vcpu))
+ kvm_lapic_restart_hv_timer(vcpu);
+
+ /*
+ * On a host with synchronized TSC, there is no need to update
+ * kvmclock on vcpu->cpu migration
+ */
+ if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
+ kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
+ if (vcpu->cpu != cpu)
+ kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
+ vcpu->cpu = cpu;
+ }
+
+ kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
+}
+
+static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
+{
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+ struct kvm_steal_time __user *st;
+ struct kvm_memslots *slots;
+ static const u8 preempted = KVM_VCPU_PREEMPTED;
+ gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+
+ /*
+ * The vCPU can be marked preempted if and only if the VM-Exit was on
+ * an instruction boundary and will not trigger guest emulation of any
+ * kind (see vcpu_run). Vendor specific code controls (conservatively)
+ * when this is true, for example allowing the vCPU to be marked
+ * preempted if and only if the VM-Exit was due to a host interrupt.
+ */
+ if (!vcpu->arch.at_instruction_boundary) {
+ vcpu->stat.preemption_other++;
+ return;
+ }
+
+ vcpu->stat.preemption_reported++;
+ if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+ return;
+
+ if (vcpu->arch.st.preempted)
+ return;
+
+ /* This happens on process exit */
+ if (unlikely(current->mm != vcpu->kvm->mm))
+ return;
+
+ slots = kvm_memslots(vcpu->kvm);
+
+ if (unlikely(slots->generation != ghc->generation ||
+ gpa != ghc->gpa ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot))
+ return;
+
+ st = (struct kvm_steal_time __user *)ghc->hva;
+ BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
+
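+ /*
+ * This may run from the sched-out path with preemption disabled,
+ * so the update must not fault or sleep.
+ */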
+ if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
+ vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+
+ mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
+}
+
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ int idx;
+
+ if (vcpu->preempted) {
+ if (!vcpu->arch.guest_state_protected)
+ vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
+
+ /*
+ * Take the srcu lock as memslots will be accessed to check the gfn
+ * cache generation against the memslots generation.
+ */
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ if (kvm_xen_msr_enabled(vcpu->kvm))
+ kvm_xen_runstate_set_preempted(vcpu);
+ else
+ kvm_steal_time_set_preempted(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ }
+
+ static_call(kvm_x86_vcpu_put)(vcpu);
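+ /* Snapshot the host TSC so kvm_arch_vcpu_load() can detect a backwards TSC. */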
+ vcpu->arch.last_host_tsc = rdtsc();
+}
+
+static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s)
+{
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
+
+ return kvm_apic_get_state(vcpu, s);
+}
+
+static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s)
+{
+ int r;
+
+ r = kvm_apic_set_state(vcpu, s);
+ if (r)
+ return r;
+ update_cr8_intercept(vcpu);
+
+ return 0;
+}
+
+static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
+{
+ /*
+ * We can accept userspace's request for interrupt injection
+ * as long as we have a place to store the interrupt number.
+ * The actual injection will happen when the CPU is able to
+ * deliver the interrupt.
+ */
+ if (kvm_cpu_has_extint(vcpu))
+ return false;
+
+ /* Acknowledging ExtINT does not happen if LINT0 is masked. */
+ return (!lapic_in_kernel(vcpu) ||
+ kvm_apic_accept_pic_intr(vcpu));
+}
+
+static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Do not cause an interrupt window exit if an exception
+ * is pending or an event needs reinjection; userspace
+ * might want to inject the interrupt manually using KVM_SET_REGS
+ * or KVM_SET_SREGS. For that to work, we must be at an
+ * instruction boundary and with no events half-injected.
+ */
+ return (kvm_arch_interrupt_allowed(vcpu) &&
+ kvm_cpu_accept_dm_intr(vcpu) &&
+ !kvm_event_needs_reinjection(vcpu) &&
+ !kvm_is_exception_pending(vcpu));
+}
+
+static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
+ struct kvm_interrupt *irq)
+{
+ if (irq->irq >= KVM_NR_INTERRUPTS)
+ return -EINVAL;
+
+ if (!irqchip_in_kernel(vcpu->kvm)) {
+ kvm_queue_interrupt(vcpu, irq->irq, false);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return 0;
+ }
+
+ /*
+ * With in-kernel LAPIC, we only use this to inject EXTINT, so
+ * fail for in-kernel 8259.
+ */
+ if (pic_in_kernel(vcpu->kvm))
+ return -ENXIO;
+
+ if (vcpu->arch.pending_external_vector != -1)
+ return -EEXIST;
+
+ vcpu->arch.pending_external_vector = irq->irq;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
+{
+ kvm_inject_nmi(vcpu);
+
+ return 0;
+}
+
+static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
+ struct kvm_tpr_access_ctl *tac)
+{
+ if (tac->flags)
+ return -EINVAL;
+ vcpu->arch.tpr_access_reporting = !!tac->enabled;
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
+ u64 mcg_cap)
+{
+ int r;
+ unsigned bank_num = mcg_cap & 0xff, bank;
+
+ r = -EINVAL;
+ if (!bank_num || bank_num > KVM_MAX_MCE_BANKS)
+ goto out;
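+ /*
+ * Bits 7:0 (bank count) and 23:16 (MCG_EXT_CNT) are always
+ * accepted in addition to the KVM-supported capability bits.
+ */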
+ if (mcg_cap & ~(kvm_caps.supported_mce_cap | 0xff | 0xff0000))
+ goto out;
+ r = 0;
+ vcpu->arch.mcg_cap = mcg_cap;
+ /* Init IA32_MCG_CTL to all 1s */
+ if (mcg_cap & MCG_CTL_P)
+ vcpu->arch.mcg_ctl = ~(u64)0;
+ /* Init IA32_MCi_CTL to all 1s, IA32_MCi_CTL2 to all 0s */
+ for (bank = 0; bank < bank_num; bank++) {
+ vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+ if (mcg_cap & MCG_CMCI_P)
+ vcpu->arch.mci_ctl2_banks[bank] = 0;
+ }
+
+ kvm_apic_after_set_mcg_cap(vcpu);
+
+ static_call(kvm_x86_setup_mce)(vcpu);
+out:
+ return r;
+}
+
+/*
+ * Validate that this is a UCNA (uncorrectable no action) error by checking the
+ * MCG_STATUS and MCi_STATUS registers:
+ * - none of the bits for Machine Check Exceptions are set
+ * - both the VAL (valid) and UC (uncorrectable) bits are set
+ * MCI_STATUS_PCC - Processor Context Corrupted
+ * MCI_STATUS_S - Signaled as a Machine Check Exception
+ * MCI_STATUS_AR - Software recoverable Action Required
+ */
+static bool is_ucna(struct kvm_x86_mce *mce)
+{
+ return !mce->mcg_status &&
+ !(mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)) &&
+ (mce->status & MCI_STATUS_VAL) &&
+ (mce->status & MCI_STATUS_UC);
+}
+
+static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce, u64* banks)
+{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+
+ banks[1] = mce->status;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ vcpu->arch.mcg_status = mce->mcg_status;
+
+ if (!(mcg_cap & MCG_CMCI_P) ||
+ !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN))
+ return 0;
+
+ if (lapic_in_kernel(vcpu))
+ kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI);
+
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
+ struct kvm_x86_mce *mce)
+{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+ u64 *banks = vcpu->arch.mce_banks;
+
+ if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
+ return -EINVAL;
+
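+ /* Each bank spans four u64s: MCi_CTL, MCi_STATUS, MCi_ADDR, MCi_MISC. */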
+ banks += array_index_nospec(4 * mce->bank, 4 * bank_num);
+
+ if (is_ucna(mce))
+ return kvm_vcpu_x86_set_ucna(vcpu, mce, banks);
+
+ /*
+ * if IA32_MCG_CTL is not all 1s, the uncorrected error
+ * reporting is disabled
+ */
+ if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
+ vcpu->arch.mcg_ctl != ~(u64)0)
+ return 0;
+ /*
+ * if IA32_MCi_CTL is not all 1s, the uncorrected error
+ * reporting is disabled for the bank
+ */
+ if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
+ return 0;
+ if (mce->status & MCI_STATUS_UC) {
+ if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
+ !kvm_is_cr4_bit_set(vcpu, X86_CR4_MCE)) {
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return 0;
+ }
+ if (banks[1] & MCI_STATUS_VAL)
+ mce->status |= MCI_STATUS_OVER;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ vcpu->arch.mcg_status = mce->mcg_status;
+ banks[1] = mce->status;
+ kvm_queue_exception(vcpu, MC_VECTOR);
+ } else if (!(banks[1] & MCI_STATUS_VAL)
+ || !(banks[1] & MCI_STATUS_UC)) {
+ if (banks[1] & MCI_STATUS_VAL)
+ mce->status |= MCI_STATUS_OVER;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ banks[1] = mce->status;
+ } else
+ banks[1] |= MCI_STATUS_OVER;
+ return 0;
+}
+
+static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_events *events)
+{
+ struct kvm_queued_exception *ex;
+
+ process_nmi(vcpu);
+
+#ifdef CONFIG_KVM_SMM
+ if (kvm_check_request(KVM_REQ_SMI, vcpu))
+ process_smi(vcpu);
+#endif
+
+ /*
+ * KVM's ABI only allows for one exception to be migrated. Luckily,
+ * the only time there can be two queued exceptions is if there's a
+ * non-exiting _injected_ exception, and a pending exiting exception.
+ * In that case, ignore the VM-Exiting exception as it's an extension
+ * of the injected exception.
+ */
+ if (vcpu->arch.exception_vmexit.pending &&
+ !vcpu->arch.exception.pending &&
+ !vcpu->arch.exception.injected)
+ ex = &vcpu->arch.exception_vmexit;
+ else
+ ex = &vcpu->arch.exception;
+
+ /*
+ * In guest mode, payload delivery should be deferred if the exception
+ * will be intercepted by L1, e.g. KVM should not modify CR2 if L1
+ * intercepts #PF, ditto for DR6 and #DBs. If the per-VM capability,
+ * KVM_CAP_EXCEPTION_PAYLOAD, is not set, userspace may or may not
+ * propagate the payload and so it cannot be safely deferred. Deliver
+ * the payload if the capability hasn't been requested.
+ */
+ if (!vcpu->kvm->arch.exception_payload_enabled &&
+ ex->pending && ex->has_payload)
+ kvm_deliver_exception_payload(vcpu, ex);
+
+ memset(events, 0, sizeof(*events));
+
+ /*
+ * The API doesn't provide the instruction length for software
+ * exceptions, so don't report them. As long as the guest RIP
+ * isn't advanced, we should expect to encounter the exception
+ * again.
+ */
+ if (!kvm_exception_is_soft(ex->vector)) {
+ events->exception.injected = ex->injected;
+ events->exception.pending = ex->pending;
+ /*
+ * For ABI compatibility, deliberately conflate
+ * pending and injected exceptions when
+ * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
+ */
+ if (!vcpu->kvm->arch.exception_payload_enabled)
+ events->exception.injected |= ex->pending;
+ }
+ events->exception.nr = ex->vector;
+ events->exception.has_error_code = ex->has_error_code;
+ events->exception.error_code = ex->error_code;
+ events->exception_has_payload = ex->has_payload;
+ events->exception_payload = ex->payload;
+
+ events->interrupt.injected =
+ vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
+ events->interrupt.nr = vcpu->arch.interrupt.nr;
+ events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+
+ events->nmi.injected = vcpu->arch.nmi_injected;
+ events->nmi.pending = kvm_get_nr_pending_nmis(vcpu);
+ events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
+
+ /* events->sipi_vector is never valid when reporting to user space */
+
+#ifdef CONFIG_KVM_SMM
+ events->smi.smm = is_smm(vcpu);
+ events->smi.pending = vcpu->arch.smi_pending;
+ events->smi.smm_inside_nmi =
+ !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
+#endif
+ events->smi.latched_init = kvm_lapic_latched_init(vcpu);
+
+ events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
+ | KVM_VCPUEVENT_VALID_SHADOW
+ | KVM_VCPUEVENT_VALID_SMM);
+ if (vcpu->kvm->arch.exception_payload_enabled)
+ events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
+ if (vcpu->kvm->arch.triple_fault_event) {
+ events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
+ }
+}
+
+static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_events *events)
+{
+ if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
+ | KVM_VCPUEVENT_VALID_SIPI_VECTOR
+ | KVM_VCPUEVENT_VALID_SHADOW
+ | KVM_VCPUEVENT_VALID_SMM
+ | KVM_VCPUEVENT_VALID_PAYLOAD
+ | KVM_VCPUEVENT_VALID_TRIPLE_FAULT))
+ return -EINVAL;
+
+ if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
+ if (!vcpu->kvm->arch.exception_payload_enabled)
+ return -EINVAL;
+ if (events->exception.pending)
+ events->exception.injected = 0;
+ else
+ events->exception_has_payload = 0;
+ } else {
+ events->exception.pending = 0;
+ events->exception_has_payload = 0;
+ }
+
+ if ((events->exception.injected || events->exception.pending) &&
+ (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
+ return -EINVAL;
+
+ /* INITs are latched while in SMM */
+ if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
+ (events->smi.smm || events->smi.pending) &&
+ vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
+ return -EINVAL;
+
+ process_nmi(vcpu);
+
+ /*
+ * Flag that userspace is stuffing an exception, the next KVM_RUN will
+ * morph the exception to a VM-Exit if appropriate. Do this only for
+ * pending exceptions, already-injected exceptions are not subject to
+ * interception. Note, userspace that conflates pending and injected
+ * is hosed, and will incorrectly convert an injected exception into a
+ * pending exception, which in turn may cause a spurious VM-Exit.
+ */
+ vcpu->arch.exception_from_userspace = events->exception.pending;
+
+ vcpu->arch.exception_vmexit.pending = false;
+
+ vcpu->arch.exception.injected = events->exception.injected;
+ vcpu->arch.exception.pending = events->exception.pending;
+ vcpu->arch.exception.vector = events->exception.nr;
+ vcpu->arch.exception.has_error_code = events->exception.has_error_code;
+ vcpu->arch.exception.error_code = events->exception.error_code;
+ vcpu->arch.exception.has_payload = events->exception_has_payload;
+ vcpu->arch.exception.payload = events->exception_payload;
+
+ vcpu->arch.interrupt.injected = events->interrupt.injected;
+ vcpu->arch.interrupt.nr = events->interrupt.nr;
+ vcpu->arch.interrupt.soft = events->interrupt.soft;
+ if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
+ static_call(kvm_x86_set_interrupt_shadow)(vcpu,
+ events->interrupt.shadow);
+
+ vcpu->arch.nmi_injected = events->nmi.injected;
+ if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
+ vcpu->arch.nmi_pending = 0;
+ atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending);
+ kvm_make_request(KVM_REQ_NMI, vcpu);
+ }
+ static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
+
+ if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
+ lapic_in_kernel(vcpu))
+ vcpu->arch.apic->sipi_vector = events->sipi_vector;
+
+ if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
+#ifdef CONFIG_KVM_SMM
+ if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
+ kvm_leave_nested(vcpu);
+ kvm_smm_changed(vcpu, events->smi.smm);
+ }
+
+ vcpu->arch.smi_pending = events->smi.pending;
+
+ if (events->smi.smm) {
+ if (events->smi.smm_inside_nmi)
+ vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
+ else
+ vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
+ }
+
+#else
+ if (events->smi.smm || events->smi.pending ||
+ events->smi.smm_inside_nmi)
+ return -EINVAL;
+#endif
+
+ if (lapic_in_kernel(vcpu)) {
+ if (events->smi.latched_init)
+ set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ else
+ clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ }
+ }
+
+ if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) {
+ if (!vcpu->kvm->arch.triple_fault_event)
+ return -EINVAL;
+ if (events->triple_fault.pending)
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ else
+ kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ }
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ return 0;
+}
+
+static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
+ struct kvm_debugregs *dbgregs)
+{
+ unsigned long val;
+
+ memset(dbgregs, 0, sizeof(*dbgregs));
+ memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
+ kvm_get_dr(vcpu, 6, &val);
+ dbgregs->dr6 = val;
+ dbgregs->dr7 = vcpu->arch.dr7;
+}
+
+static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
+ struct kvm_debugregs *dbgregs)
+{
+ if (dbgregs->flags)
+ return -EINVAL;
+
+ if (!kvm_dr6_valid(dbgregs->dr6))
+ return -EINVAL;
+ if (!kvm_dr7_valid(dbgregs->dr7))
+ return -EINVAL;
+
+ memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
+ kvm_update_dr0123(vcpu);
+ vcpu->arch.dr6 = dbgregs->dr6;
+ vcpu->arch.dr7 = dbgregs->dr7;
+ kvm_update_dr7(vcpu);
+
+ return 0;
+}
+
+static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
+ u8 *state, unsigned int size)
+{
+ /*
+ * Only copy state for features that are enabled for the guest. The
+ * state itself isn't problematic, but setting bits in the header for
+ * features that are supported in *this* host but not exposed to the
+ * guest can result in KVM_SET_XSAVE failing when live migrating to an
+ * otherwise compatible host that lacks those unexposed features.
+ *
+ * FP+SSE can always be saved/restored via KVM_{G,S}ET_XSAVE, even if
+ * XSAVE/XCR0 are not exposed to the guest, and even if XSAVE isn't
+ * supported by the host.
+ */
+ u64 supported_xcr0 = vcpu->arch.guest_supported_xcr0 |
+ XFEATURE_MASK_FPSSE;
+
+ if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
+ return;
+
+ fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size,
+ supported_xcr0, vcpu->arch.pkru);
+}
+
+static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave)
+{
+ return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
+ sizeof(guest_xsave->region));
+}
+
+static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave)
+{
+ if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
+ return 0;
+
+ return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
+ guest_xsave->region,
+ kvm_caps.supported_xcr0,
+ &vcpu->arch.pkru);
+}
+
+static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
+ struct kvm_xcrs *guest_xcrs)
+{
+ if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
+ guest_xcrs->nr_xcrs = 0;
+ return;
+ }
+
+ guest_xcrs->nr_xcrs = 1;
+ guest_xcrs->flags = 0;
+ guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
+ guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
+}
+
+static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
+ struct kvm_xcrs *guest_xcrs)
+{
+ int i, r = 0;
+
+ if (!boot_cpu_has(X86_FEATURE_XSAVE))
+ return -EINVAL;
+
+ if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
+ return -EINVAL;
+
+ for (i = 0; i < guest_xcrs->nr_xcrs; i++)
+ /* Only support XCR0 currently */
+ if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
+ r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
+ guest_xcrs->xcrs[i].value);
+ break;
+ }
+ if (r)
+ r = -EINVAL;
+ return r;
+}
+
+/*
+ * kvm_set_guest_paused() indicates to the guest kernel that it has been
+ * stopped by the hypervisor. This function will be called from the host only.
+ * -EINVAL is returned when the host attempts to set the flag for a guest that
+ * does not support pv clocks.
+ */
+static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu->arch.pv_time.active)
+ return -EINVAL;
+ vcpu->arch.pvclock_set_guest_stopped_request = true;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ return 0;
+}
+
+static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ int r;
+
+ switch (attr->attr) {
+ case KVM_VCPU_TSC_OFFSET:
+ r = 0;
+ break;
+ default:
+ r = -ENXIO;
+ }
+
+ return r;
+}
+
+static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ u64 __user *uaddr = kvm_get_attr_addr(attr);
+ int r;
+
+ if (IS_ERR(uaddr))
+ return PTR_ERR(uaddr);
+
+ switch (attr->attr) {
+ case KVM_VCPU_TSC_OFFSET:
+ r = -EFAULT;
+ if (put_user(vcpu->arch.l1_tsc_offset, uaddr))
+ break;
+ r = 0;
+ break;
+ default:
+ r = -ENXIO;
+ }
+
+ return r;
+}
+
+static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ u64 __user *uaddr = kvm_get_attr_addr(attr);
+ struct kvm *kvm = vcpu->kvm;
+ int r;
+
+ if (IS_ERR(uaddr))
+ return PTR_ERR(uaddr);
+
+ switch (attr->attr) {
+ case KVM_VCPU_TSC_OFFSET: {
+ u64 offset, tsc, ns;
+ unsigned long flags;
+ bool matched;
+
+ r = -EFAULT;
+ if (get_user(offset, uaddr))
+ break;
+
+ raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
+
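+ /*
+ * The write "matches" the last sync if it reuses the same TSC
+ * frequency and offset.
+ */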
+ matched = (vcpu->arch.virtual_tsc_khz &&
+ kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz &&
+ kvm->arch.last_tsc_offset == offset);
+
+ tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
+ ns = get_kvmclock_base_ns();
+
+ __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
+ raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+
+ r = 0;
+ break;
+ }
+ default:
+ r = -ENXIO;
+ }
+
+ return r;
+}
+
+static int kvm_vcpu_ioctl_device_attr(struct kvm_vcpu *vcpu,
+ unsigned int ioctl,
+ void __user *argp)
+{
+ struct kvm_device_attr attr;
+ int r;
+
+ if (copy_from_user(&attr, argp, sizeof(attr)))
+ return -EFAULT;
+
+ if (attr.group != KVM_VCPU_TSC_CTRL)
+ return -ENXIO;
+
+ switch (ioctl) {
+ case KVM_HAS_DEVICE_ATTR:
+ r = kvm_arch_tsc_has_attr(vcpu, &attr);
+ break;
+ case KVM_GET_DEVICE_ATTR:
+ r = kvm_arch_tsc_get_attr(vcpu, &attr);
+ break;
+ case KVM_SET_DEVICE_ATTR:
+ r = kvm_arch_tsc_set_attr(vcpu, &attr);
+ break;
+ }
+
+ return r;
+}
+
+static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
+ struct kvm_enable_cap *cap)
+{
+ int r;
+ uint16_t vmcs_version;
+ void __user *user_ptr;
+
+ if (cap->flags)
+ return -EINVAL;
+
+ switch (cap->cap) {
+ case KVM_CAP_HYPERV_SYNIC2:
+ if (cap->args[0])
+ return -EINVAL;
+ fallthrough;
+
+ case KVM_CAP_HYPERV_SYNIC:
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return -EINVAL;
+ return kvm_hv_activate_synic(vcpu, cap->cap ==
+ KVM_CAP_HYPERV_SYNIC2);
+ case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
+ if (!kvm_x86_ops.nested_ops->enable_evmcs)
+ return -ENOTTY;
+ r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
+ if (!r) {
+ user_ptr = (void __user *)(uintptr_t)cap->args[0];
+ if (copy_to_user(user_ptr, &vmcs_version,
+ sizeof(vmcs_version)))
+ r = -EFAULT;
+ }
+ return r;
+ case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
+ if (!kvm_x86_ops.enable_l2_tlb_flush)
+ return -ENOTTY;
+
+ return static_call(kvm_x86_enable_l2_tlb_flush)(vcpu);
+
+ case KVM_CAP_HYPERV_ENFORCE_CPUID:
+ return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
+
+ case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
+ vcpu->arch.pv_cpuid.enforce = cap->args[0];
+ if (vcpu->arch.pv_cpuid.enforce)
+ kvm_update_pv_runtime(vcpu);
+
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ struct kvm_vcpu *vcpu = filp->private_data;
+ void __user *argp = (void __user *)arg;
+ int r;
+ union {
+ struct kvm_sregs2 *sregs2;
+ struct kvm_lapic_state *lapic;
+ struct kvm_xsave *xsave;
+ struct kvm_xcrs *xcrs;
+ void *buffer;
+ } u;
+
+ vcpu_load(vcpu);
+
+ u.buffer = NULL;
+ switch (ioctl) {
+ case KVM_GET_LAPIC: {
+ r = -EINVAL;
+ if (!lapic_in_kernel(vcpu))
+ goto out;
+ u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
+ GFP_KERNEL_ACCOUNT);
+
+ r = -ENOMEM;
+ if (!u.lapic)
+ goto out;
+ r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_LAPIC: {
+ r = -EINVAL;
+ if (!lapic_in_kernel(vcpu))
+ goto out;
+ u.lapic = memdup_user(argp, sizeof(*u.lapic));
+ if (IS_ERR(u.lapic)) {
+ r = PTR_ERR(u.lapic);
+ goto out_nofree;
+ }
+
+ r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
+ break;
+ }
+ case KVM_INTERRUPT: {
+ struct kvm_interrupt irq;
+
+ r = -EFAULT;
+ if (copy_from_user(&irq, argp, sizeof(irq)))
+ goto out;
+ r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
+ break;
+ }
+ case KVM_NMI: {
+ r = kvm_vcpu_ioctl_nmi(vcpu);
+ break;
+ }
+ case KVM_SMI: {
+ r = kvm_inject_smi(vcpu);
+ break;
+ }
+ case KVM_SET_CPUID: {
+ struct kvm_cpuid __user *cpuid_arg = argp;
+ struct kvm_cpuid cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
+ goto out;
+ r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
+ break;
+ }
+ case KVM_SET_CPUID2: {
+ struct kvm_cpuid2 __user *cpuid_arg = argp;
+ struct kvm_cpuid2 cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
+ goto out;
+ r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
+ cpuid_arg->entries);
+ break;
+ }
+ case KVM_GET_CPUID2: {
+ struct kvm_cpuid2 __user *cpuid_arg = argp;
+ struct kvm_cpuid2 cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
+ goto out;
+ r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
+ cpuid_arg->entries);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_GET_MSRS: {
+ int idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = msr_io(vcpu, argp, do_get_msr, 1);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ break;
+ }
+ case KVM_SET_MSRS: {
+ int idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = msr_io(vcpu, argp, do_set_msr, 0);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ break;
+ }
+ case KVM_TPR_ACCESS_REPORTING: {
+ struct kvm_tpr_access_ctl tac;
+
+ r = -EFAULT;
+ if (copy_from_user(&tac, argp, sizeof(tac)))
+ goto out;
+ r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &tac, sizeof(tac)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_VAPIC_ADDR: {
+ struct kvm_vapic_addr va;
+ int idx;
+
+ r = -EINVAL;
+ if (!lapic_in_kernel(vcpu))
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(&va, argp, sizeof(va)))
+ goto out;
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ break;
+ }
+ case KVM_X86_SETUP_MCE: {
+ u64 mcg_cap;
+
+ r = -EFAULT;
+ if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap)))
+ goto out;
+ r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
+ break;
+ }
+ case KVM_X86_SET_MCE: {
+ struct kvm_x86_mce mce;
+
+ r = -EFAULT;
+ if (copy_from_user(&mce, argp, sizeof(mce)))
+ goto out;
+ r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
+ break;
+ }
+ case KVM_GET_VCPU_EVENTS: {
+ struct kvm_vcpu_events events;
+
+ kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_VCPU_EVENTS: {
+ struct kvm_vcpu_events events;
+
+ r = -EFAULT;
+ if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
+ break;
+
+ r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
+ break;
+ }
+ case KVM_GET_DEBUGREGS: {
+ struct kvm_debugregs dbgregs;
+
+ kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, &dbgregs,
+ sizeof(struct kvm_debugregs)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_DEBUGREGS: {
+ struct kvm_debugregs dbgregs;
+
+ r = -EFAULT;
+ if (copy_from_user(&dbgregs, argp,
+ sizeof(struct kvm_debugregs)))
+ break;
+
+ r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
+ break;
+ }
+ case KVM_GET_XSAVE: {
+ r = -EINVAL;
+ if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave))
+ break;
+
+ u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
+ r = -ENOMEM;
+ if (!u.xsave)
+ break;
+
+ kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_XSAVE: {
+ int size = vcpu->arch.guest_fpu.uabi_size;
+
+ u.xsave = memdup_user(argp, size);
+ if (IS_ERR(u.xsave)) {
+ r = PTR_ERR(u.xsave);
+ goto out_nofree;
+ }
+
+ r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
+ break;
+ }
+
+ case KVM_GET_XSAVE2: {
+ int size = vcpu->arch.guest_fpu.uabi_size;
+
+ u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT);
+ r = -ENOMEM;
+ if (!u.xsave)
+ break;
+
+ kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, u.xsave, size))
+ break;
+
+ r = 0;
+ break;
+ }
+
+ case KVM_GET_XCRS: {
+ u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
+ r = -ENOMEM;
+ if (!u.xcrs)
+ break;
+
+ kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, u.xcrs,
+ sizeof(struct kvm_xcrs)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_XCRS: {
+ u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
+ if (IS_ERR(u.xcrs)) {
+ r = PTR_ERR(u.xcrs);
+ goto out_nofree;
+ }
+
+ r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
+ break;
+ }
+ case KVM_SET_TSC_KHZ: {
+ u32 user_tsc_khz;
+
+ r = -EINVAL;
+ user_tsc_khz = (u32)arg;
+
+ if (kvm_caps.has_tsc_control &&
+ user_tsc_khz >= kvm_caps.max_guest_tsc_khz)
+ goto out;
+
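+ /* A value of 0 means "use the host TSC frequency". */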
+ if (user_tsc_khz == 0)
+ user_tsc_khz = tsc_khz;
+
+ if (!kvm_set_tsc_khz(vcpu, user_tsc_khz))
+ r = 0;
+
+ goto out;
+ }
+ case KVM_GET_TSC_KHZ: {
+ r = vcpu->arch.virtual_tsc_khz;
+ goto out;
+ }
+ case KVM_KVMCLOCK_CTRL: {
+ r = kvm_set_guest_paused(vcpu);
+ goto out;
+ }
+ case KVM_ENABLE_CAP: {
+ struct kvm_enable_cap cap;
+
+ r = -EFAULT;
+ if (copy_from_user(&cap, argp, sizeof(cap)))
+ goto out;
+ r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
+ break;
+ }
+ case KVM_GET_NESTED_STATE: {
+ struct kvm_nested_state __user *user_kvm_nested_state = argp;
+ u32 user_data_size;
+
+ r = -EINVAL;
+ if (!kvm_x86_ops.nested_ops->get_state)
+ break;
+
+ BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
+ r = -EFAULT;
+ if (get_user(user_data_size, &user_kvm_nested_state->size))
+ break;
+
+ r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state,
+ user_data_size);
+ if (r < 0)
+ break;
+
+ if (r > user_data_size) {
+ if (put_user(r, &user_kvm_nested_state->size))
+ r = -EFAULT;
+ else
+ r = -E2BIG;
+ break;
+ }
+
+ r = 0;
+ break;
+ }
+ case KVM_SET_NESTED_STATE: {
+ struct kvm_nested_state __user *user_kvm_nested_state = argp;
+ struct kvm_nested_state kvm_state;
+ int idx;
+
+ r = -EINVAL;
+ if (!kvm_x86_ops.nested_ops->set_state)
+ break;
+
+ r = -EFAULT;
+ if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
+ break;
+
+ r = -EINVAL;
+ if (kvm_state.size < sizeof(kvm_state))
+ break;
+
+ if (kvm_state.flags &
+ ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
+ | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING
+ | KVM_STATE_NESTED_GIF_SET))
+ break;
+
+ /* nested_run_pending implies guest_mode. */
+ if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING)
+ && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
+ break;
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ break;
+ }
+ case KVM_GET_SUPPORTED_HV_CPUID:
+ r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp);
+ break;
+#ifdef CONFIG_KVM_XEN
+ case KVM_XEN_VCPU_GET_ATTR: {
+ struct kvm_xen_vcpu_attr xva;
+
+ r = -EFAULT;
+ if (copy_from_user(&xva, argp, sizeof(xva)))
+ goto out;
+ r = kvm_xen_vcpu_get_attr(vcpu, &xva);
+ if (!r && copy_to_user(argp, &xva, sizeof(xva)))
+ r = -EFAULT;
+ break;
+ }
+ case KVM_XEN_VCPU_SET_ATTR: {
+ struct kvm_xen_vcpu_attr xva;
+
+ r = -EFAULT;
+ if (copy_from_user(&xva, argp, sizeof(xva)))
+ goto out;
+ r = kvm_xen_vcpu_set_attr(vcpu, &xva);
+ break;
+ }
+#endif
+ case KVM_GET_SREGS2: {
+ u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.sregs2)
+ goto out;
+ __get_sregs2(vcpu, u.sregs2);
+ r = -EFAULT;
+ if (copy_to_user(argp, u.sregs2, sizeof(struct kvm_sregs2)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_SREGS2: {
+ u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2));
+ if (IS_ERR(u.sregs2)) {
+ r = PTR_ERR(u.sregs2);
+ u.sregs2 = NULL;
+ goto out;
+ }
+ r = __set_sregs2(vcpu, u.sregs2);
+ break;
+ }
+ case KVM_HAS_DEVICE_ATTR:
+ case KVM_GET_DEVICE_ATTR:
+ case KVM_SET_DEVICE_ATTR:
+ r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
+ break;
+ default:
+ r = -EINVAL;
+ }
+out:
+ kfree(u.buffer);
+out_nofree:
+ vcpu_put(vcpu);
+ return r;
+}
+
+vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+ return VM_FAULT_SIGBUS;
+}
+
+static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
+{
+ int ret;
+
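+ /*
+ * The TSS occupies three pages; the base address must leave room
+ * for all of them below the 4GiB boundary.
+ */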
+ if (addr > (unsigned int)(-3 * PAGE_SIZE))
+ return -EINVAL;
+ ret = static_call(kvm_x86_set_tss_addr)(kvm, addr);
+ return ret;
+}
+
+static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
+ u64 ident_addr)
+{
+ return static_call(kvm_x86_set_identity_map_addr)(kvm, ident_addr);
+}
+
+static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
+ unsigned long kvm_nr_mmu_pages)
+{
+ if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
+ return -EINVAL;
+
+ mutex_lock(&kvm->slots_lock);
+
+ kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
+ kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
+
+ mutex_unlock(&kvm->slots_lock);
+ return 0;
+}
+
+static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ memcpy(&chip->chip.pic, &pic->pics[0],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ memcpy(&chip->chip.pic, &pic->pics[1],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ kvm_get_ioapic(kvm, &chip->chip.ioapic);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ return r;
+}
+
+static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ spin_lock(&pic->lock);
+ memcpy(&pic->pics[0], &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ spin_unlock(&pic->lock);
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ spin_lock(&pic->lock);
+ memcpy(&pic->pics[1], &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ spin_unlock(&pic->lock);
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ kvm_set_ioapic(kvm, &chip->chip.ioapic);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ kvm_pic_update_irq(pic);
+ return r;
+}
+
+static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
+
+ BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
+
+ mutex_lock(&kps->lock);
+ memcpy(ps, &kps->channels, sizeof(*ps));
+ mutex_unlock(&kps->lock);
+ return 0;
+}
+
+static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ int i;
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
+ memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
+ mutex_unlock(&pit->pit_state.lock);
+ return 0;
+}
+
+static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
+ sizeof(ps->channels));
+ ps->flags = kvm->arch.vpit->pit_state.flags;
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ memset(&ps->reserved, 0, sizeof(ps->reserved));
+ return 0;
+}
+
+static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ int start = 0;
+ int i;
+ u32 prev_legacy, cur_legacy;
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
+ prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
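+ /* Restart channel 0 if HPET legacy routing was just enabled. */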
+ if (!prev_legacy && cur_legacy)
+ start = 1;
+ memcpy(&pit->pit_state.channels, &ps->channels,
+ sizeof(pit->pit_state.channels));
+ pit->pit_state.flags = ps->flags;
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
+ start && i == 0);
+ mutex_unlock(&pit->pit_state.lock);
+ return 0;
+}
+
+static int kvm_vm_ioctl_reinject(struct kvm *kvm,
+ struct kvm_reinject_control *control)
+{
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ /*
+ * pit->pit_state.lock was overloaded to prevent userspace from getting
+ * an inconsistent state after running multiple KVM_REINJECT_CONTROL
+ * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
+ */
+ mutex_lock(&pit->pit_state.lock);
+ kvm_pit_set_reinject(pit, control->pit_reinject);
+ mutex_unlock(&pit->pit_state.lock);
+
+ return 0;
+}
+
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
+{
+ /*
+ * Flush all CPUs' dirty log buffers to the dirty_bitmap. Called
+ * before reporting dirty_bitmap to userspace. KVM flushes the buffers
+ * on all VM-Exits, thus we only need to kick running vCPUs to force a
+ * VM-Exit.
+ */
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_vcpu_kick(vcpu);
+}
+
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
+ bool line_status)
+{
+ if (!irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ irq_event->irq, irq_event->level,
+ line_status);
+ return 0;
+}
+
+int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+ struct kvm_enable_cap *cap)
+{
+ int r;
+
+ if (cap->flags)
+ return -EINVAL;
+
+ switch (cap->cap) {
+ case KVM_CAP_DISABLE_QUIRKS2:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_X86_VALID_QUIRKS)
+ break;
+ fallthrough;
+ case KVM_CAP_DISABLE_QUIRKS:
+ kvm->arch.disabled_quirks = cap->args[0];
+ r = 0;
+ break;
+ case KVM_CAP_SPLIT_IRQCHIP: {
+ mutex_lock(&kvm->lock);
+ r = -EINVAL;
+ if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
+ goto split_irqchip_unlock;
+ r = -EEXIST;
+ if (irqchip_in_kernel(kvm))
+ goto split_irqchip_unlock;
+ if (kvm->created_vcpus)
+ goto split_irqchip_unlock;
+ r = kvm_setup_empty_irq_routing(kvm);
+ if (r)
+ goto split_irqchip_unlock;
+ /* Pairs with irqchip_in_kernel. */
+ smp_wmb();
+ kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
+ kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT);
+ r = 0;
+split_irqchip_unlock:
+ mutex_unlock(&kvm->lock);
+ break;
+ }
+ case KVM_CAP_X2APIC_API:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
+ break;
+
+ if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
+ kvm->arch.x2apic_format = true;
+ if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+ kvm->arch.x2apic_broadcast_quirk_disabled = true;
+
+ r = 0;
+ break;
+ case KVM_CAP_X86_DISABLE_EXITS:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
+ break;
+
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
+ kvm->arch.pause_in_guest = true;
+
+#define SMT_RSB_MSG "This processor is affected by the Cross-Thread Return Predictions vulnerability. " \
+ "KVM_CAP_X86_DISABLE_EXITS should only be used with SMT disabled or trusted guests."
+
+ if (!mitigate_smt_rsb) {
+ if (boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible() &&
+ (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE))
+ pr_warn_once(SMT_RSB_MSG);
+
+ if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
+ kvm_can_mwait_in_guest())
+ kvm->arch.mwait_in_guest = true;
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
+ kvm->arch.hlt_in_guest = true;
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
+ kvm->arch.cstate_in_guest = true;
+ }
+
+ r = 0;
+ break;
+ case KVM_CAP_MSR_PLATFORM_INFO:
+ kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
+ r = 0;
+ break;
+ case KVM_CAP_EXCEPTION_PAYLOAD:
+ kvm->arch.exception_payload_enabled = cap->args[0];
+ r = 0;
+ break;
+ case KVM_CAP_X86_TRIPLE_FAULT_EVENT:
+ kvm->arch.triple_fault_event = cap->args[0];
+ r = 0;
+ break;
+ case KVM_CAP_X86_USER_SPACE_MSR:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_MSR_EXIT_REASON_VALID_MASK)
+ break;
+ kvm->arch.user_space_msr_mask = cap->args[0];
+ r = 0;
+ break;
+ case KVM_CAP_X86_BUS_LOCK_EXIT:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_BUS_LOCK_DETECTION_VALID_MODE)
+ break;
+
+ if ((cap->args[0] & KVM_BUS_LOCK_DETECTION_OFF) &&
+ (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT))
+ break;
+
+ if (kvm_caps.has_bus_lock_exit &&
+ cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)
+ kvm->arch.bus_lock_detection_enabled = true;
+ r = 0;
+ break;
+#ifdef CONFIG_X86_SGX_KVM
+ case KVM_CAP_SGX_ATTRIBUTE: {
+ unsigned long allowed_attributes = 0;
+
+ r = sgx_set_attribute(&allowed_attributes, cap->args[0]);
+ if (r)
+ break;
+
+ /* KVM only supports the PROVISIONKEY privileged attribute. */
+ if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) &&
+ !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY))
+ kvm->arch.sgx_provisioning_allowed = true;
+ else
+ r = -EINVAL;
+ break;
+ }
+#endif
+ case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+ r = -EINVAL;
+ if (!kvm_x86_ops.vm_copy_enc_context_from)
+ break;
+
+ r = static_call(kvm_x86_vm_copy_enc_context_from)(kvm, cap->args[0]);
+ break;
+ case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
+ r = -EINVAL;
+ if (!kvm_x86_ops.vm_move_enc_context_from)
+ break;
+
+ r = static_call(kvm_x86_vm_move_enc_context_from)(kvm, cap->args[0]);
+ break;
+ case KVM_CAP_EXIT_HYPERCALL:
+ if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
+ r = -EINVAL;
+ break;
+ }
+ kvm->arch.hypercall_exit_enabled = cap->args[0];
+ r = 0;
+ break;
+ case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
+ r = -EINVAL;
+ if (cap->args[0] & ~1)
+ break;
+ kvm->arch.exit_on_emulation_error = cap->args[0];
+ r = 0;
+ break;
+ case KVM_CAP_PMU_CAPABILITY:
+ r = -EINVAL;
+ if (!enable_pmu || (cap->args[0] & ~KVM_CAP_PMU_VALID_MASK))
+ break;
+
+ mutex_lock(&kvm->lock);
+ if (!kvm->created_vcpus) {
+ kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE);
+ r = 0;
+ }
+ mutex_unlock(&kvm->lock);
+ break;
+ case KVM_CAP_MAX_VCPU_ID:
+ r = -EINVAL;
+ if (cap->args[0] > KVM_MAX_VCPU_IDS)
+ break;
+
+ mutex_lock(&kvm->lock);
+ if (kvm->arch.max_vcpu_ids == cap->args[0]) {
+ r = 0;
+ } else if (!kvm->arch.max_vcpu_ids) {
+ kvm->arch.max_vcpu_ids = cap->args[0];
+ r = 0;
+ }
+ mutex_unlock(&kvm->lock);
+ break;
+ case KVM_CAP_X86_NOTIFY_VMEXIT:
+ r = -EINVAL;
+ if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS)
+ break;
+ if (!kvm_caps.has_notify_vmexit)
+ break;
+ if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED))
+ break;
+ mutex_lock(&kvm->lock);
+ if (!kvm->created_vcpus) {
+ kvm->arch.notify_window = cap->args[0] >> 32;
+ kvm->arch.notify_vmexit_flags = (u32)cap->args[0];
+ r = 0;
+ }
+ mutex_unlock(&kvm->lock);
+ break;
+ case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
+ r = -EINVAL;
+
+ /*
+ * Since the risk of disabling NX hugepages is a guest crashing
+ * the system, ensure the userspace process has permission to
+ * reboot the system.
+ *
+ * Note that unlike the reboot() syscall, the process must have
+ * this capability in the root namespace because exposing
+ * /dev/kvm into a container does not limit the scope of the
+ * iTLB multihit bug to that container. In other words,
+ * this must use capable(), not ns_capable().
+ */
+ if (!capable(CAP_SYS_BOOT)) {
+ r = -EPERM;
+ break;
+ }
+
+ if (cap->args[0])
+ break;
+
+ mutex_lock(&kvm->lock);
+ if (!kvm->created_vcpus) {
+ kvm->arch.disable_nx_huge_pages = true;
+ r = 0;
+ }
+ mutex_unlock(&kvm->lock);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ return r;
+}
+
+static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow)
+{
+ struct kvm_x86_msr_filter *msr_filter;
+
+ msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT);
+ if (!msr_filter)
+ return NULL;
+
+ msr_filter->default_allow = default_allow;
+ return msr_filter;
+}
+
+static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
+{
+ u32 i;
+
+ if (!msr_filter)
+ return;
+
+ for (i = 0; i < msr_filter->count; i++)
+ kfree(msr_filter->ranges[i].bitmap);
+
+ kfree(msr_filter);
+}
+
+static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
+ struct kvm_msr_filter_range *user_range)
+{
+ unsigned long *bitmap;
+ size_t bitmap_size;
+
+ if (!user_range->nmsrs)
+ return 0;
+
+ if (user_range->flags & ~KVM_MSR_FILTER_RANGE_VALID_MASK)
+ return -EINVAL;
+
+ if (!user_range->flags)
+ return -EINVAL;
+
+ bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
+ if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
+ return -EINVAL;
+
+ bitmap = memdup_user((u8 __user *)user_range->bitmap, bitmap_size);
+ if (IS_ERR(bitmap))
+ return PTR_ERR(bitmap);
+
+ msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) {
+ .flags = user_range->flags,
+ .base = user_range->base,
+ .nmsrs = user_range->nmsrs,
+ .bitmap = bitmap,
+ };
+
+ msr_filter->count++;
+ return 0;
+}
+
+static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm,
+ struct kvm_msr_filter *filter)
+{
+ struct kvm_x86_msr_filter *new_filter, *old_filter;
+ bool default_allow;
+ bool empty = true;
+ int r;
+ u32 i;
+
+ if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK)
+ return -EINVAL;
+
+ for (i = 0; i < ARRAY_SIZE(filter->ranges); i++)
+ empty &= !filter->ranges[i].nmsrs;
+
+ default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY);
+ if (empty && !default_allow)
+ return -EINVAL;
+
+ new_filter = kvm_alloc_msr_filter(default_allow);
+ if (!new_filter)
+ return -ENOMEM;
+
+ for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) {
+ r = kvm_add_msr_filter(new_filter, &filter->ranges[i]);
+ if (r) {
+ kvm_free_msr_filter(new_filter);
+ return r;
+ }
+ }
+
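+ /*
+ * Publish the new filter under kvm->lock, wait for in-flight SRCU
+ * readers to drain, and only then free the old filter and tell all
+ * vCPUs that the MSR filter has changed.
+ */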
+ mutex_lock(&kvm->lock);
+ old_filter = rcu_replace_pointer(kvm->arch.msr_filter, new_filter,
+ mutex_is_locked(&kvm->lock));
+ mutex_unlock(&kvm->lock);
+ synchronize_srcu(&kvm->srcu);
+
+ kvm_free_msr_filter(old_filter);
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
+
+ return 0;
+}
+
+#ifdef CONFIG_KVM_COMPAT
+/* for KVM_X86_SET_MSR_FILTER */
+struct kvm_msr_filter_range_compat {
+ __u32 flags;
+ __u32 nmsrs;
+ __u32 base;
+ __u32 bitmap;
+};
+
+struct kvm_msr_filter_compat {
+ __u32 flags;
+ struct kvm_msr_filter_range_compat ranges[KVM_MSR_FILTER_MAX_RANGES];
+};
+
+#define KVM_X86_SET_MSR_FILTER_COMPAT _IOW(KVMIO, 0xc6, struct kvm_msr_filter_compat)
+
+long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
+{
+ void __user *argp = (void __user *)arg;
+ struct kvm *kvm = filp->private_data;
+ long r = -ENOTTY;
+
+ switch (ioctl) {
+ case KVM_X86_SET_MSR_FILTER_COMPAT: {
+ struct kvm_msr_filter __user *user_msr_filter = argp;
+ struct kvm_msr_filter_compat filter_compat;
+ struct kvm_msr_filter filter;
+ int i;
+
+ if (copy_from_user(&filter_compat, user_msr_filter,
+ sizeof(filter_compat)))
+ return -EFAULT;
+
+ filter.flags = filter_compat.flags;
+ for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
+ struct kvm_msr_filter_range_compat *cr;
+
+ cr = &filter_compat.ranges[i];
+ filter.ranges[i] = (struct kvm_msr_filter_range) {
+ .flags = cr->flags,
+ .nmsrs = cr->nmsrs,
+ .base = cr->base,
+ .bitmap = (__u8 *)(ulong)cr->bitmap,
+ };
+ }
+
+ r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
+ break;
+ }
+ }
+
+ return r;
+}
+#endif
+
+#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
+static int kvm_arch_suspend_notifier(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ int ret = 0;
+
+ mutex_lock(&kvm->lock);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!vcpu->arch.pv_time.active)
+ continue;
+
+ ret = kvm_set_guest_paused(vcpu);
+ if (ret) {
+ kvm_err("Failed to pause guest VCPU%d: %d\n",
+ vcpu->vcpu_id, ret);
+ break;
+ }
+ }
+ mutex_unlock(&kvm->lock);
+
+ return ret ? NOTIFY_BAD : NOTIFY_DONE;
+}
+
+int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
+{
+ switch (state) {
+ case PM_HIBERNATION_PREPARE:
+ case PM_SUSPEND_PREPARE:
+ return kvm_arch_suspend_notifier(kvm);
+ }
+
+ return NOTIFY_DONE;
+}
+#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
+
+static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_clock_data data = { 0 };
+
+ get_kvmclock(kvm, &data);
+ if (copy_to_user(argp, &data, sizeof(data)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_arch *ka = &kvm->arch;
+ struct kvm_clock_data data;
+ u64 now_raw_ns;
+
+ if (copy_from_user(&data, argp, sizeof(data)))
+ return -EFAULT;
+
+ /*
+ * Only KVM_CLOCK_REALTIME is used, but allow passing the
+ * result of KVM_GET_CLOCK back to KVM_SET_CLOCK.
+ */
+ if (data.flags & ~KVM_CLOCK_VALID_FLAGS)
+ return -EINVAL;
+
+ kvm_hv_request_tsc_page_update(kvm);
+ kvm_start_pvclock_update(kvm);
+ pvclock_update_vm_gtod_copy(kvm);
+
+ /*
+ * This pairs with kvm_guest_time_update(): when the masterclock is
+ * in use, we use master_kernel_ns + kvmclock_offset to set the
+ * unsigned 'system_time', so if we used get_kvmclock_ns() (which
+ * is slightly ahead) here we would risk going negative on the
+ * unsigned 'system_time' when 'data.clock' is very small.
+ */
+ if (data.flags & KVM_CLOCK_REALTIME) {
+ u64 now_real_ns = ktime_get_real_ns();
+
+ /*
+ * Avoid stepping the kvmclock backwards.
+ */
+ if (now_real_ns > data.realtime)
+ data.clock += now_real_ns - data.realtime;
+ }
+
+ if (ka->use_master_clock)
+ now_raw_ns = ka->master_kernel_ns;
+ else
+ now_raw_ns = get_kvmclock_base_ns();
+ ka->kvmclock_offset = data.clock - now_raw_ns;
+ kvm_end_pvclock_update(kvm);
+ return 0;
+}
+
+int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
+{
+ struct kvm *kvm = filp->private_data;
+ void __user *argp = (void __user *)arg;
+ int r = -ENOTTY;
+ /*
+ * This union makes it completely explicit to gcc-3.x
+ * that these variables' stack usage should be
+ * combined, not added together.
+ */
+ union {
+ struct kvm_pit_state ps;
+ struct kvm_pit_state2 ps2;
+ struct kvm_pit_config pit_config;
+ } u;
+
+ switch (ioctl) {
+ case KVM_SET_TSS_ADDR:
+ r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
+ break;
+ case KVM_SET_IDENTITY_MAP_ADDR: {
+ u64 ident_addr;
+
+ mutex_lock(&kvm->lock);
+ r = -EINVAL;
+ if (kvm->created_vcpus)
+ goto set_identity_unlock;
+ r = -EFAULT;
+ if (copy_from_user(&ident_addr, argp, sizeof(ident_addr)))
+ goto set_identity_unlock;
+ r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
+set_identity_unlock:
+ mutex_unlock(&kvm->lock);
+ break;
+ }
+ case KVM_SET_NR_MMU_PAGES:
+ r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
+ break;
+ case KVM_CREATE_IRQCHIP: {
+ mutex_lock(&kvm->lock);
+
+ r = -EEXIST;
+ if (irqchip_in_kernel(kvm))
+ goto create_irqchip_unlock;
+
+ r = -EINVAL;
+ if (kvm->created_vcpus)
+ goto create_irqchip_unlock;
+
+ r = kvm_pic_init(kvm);
+ if (r)
+ goto create_irqchip_unlock;
+
+ r = kvm_ioapic_init(kvm);
+ if (r) {
+ kvm_pic_destroy(kvm);
+ goto create_irqchip_unlock;
+ }
+
+ r = kvm_setup_default_irq_routing(kvm);
+ if (r) {
+ kvm_ioapic_destroy(kvm);
+ kvm_pic_destroy(kvm);
+ goto create_irqchip_unlock;
+ }
+ /* Write kvm->irq_routing before enabling irqchip_in_kernel. */
+ smp_wmb();
+ kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT);
+ create_irqchip_unlock:
+ mutex_unlock(&kvm->lock);
+ break;
+ }
+ case KVM_CREATE_PIT:
+ u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
+ goto create_pit;
+ case KVM_CREATE_PIT2:
+ r = -EFAULT;
+ if (copy_from_user(&u.pit_config, argp,
+ sizeof(struct kvm_pit_config)))
+ goto out;
+ create_pit:
+ mutex_lock(&kvm->lock);
+ r = -EEXIST;
+ if (kvm->arch.vpit)
+ goto create_pit_unlock;
+ r = -ENOMEM;
+ kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
+ if (kvm->arch.vpit)
+ r = 0;
+ create_pit_unlock:
+ mutex_unlock(&kvm->lock);
+ break;
+ case KVM_GET_IRQCHIP: {
+ /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
+ struct kvm_irqchip *chip;
+
+ chip = memdup_user(argp, sizeof(*chip));
+ if (IS_ERR(chip)) {
+ r = PTR_ERR(chip);
+ goto out;
+ }
+
+ r = -ENXIO;
+ if (!irqchip_kernel(kvm))
+ goto get_irqchip_out;
+ r = kvm_vm_ioctl_get_irqchip(kvm, chip);
+ if (r)
+ goto get_irqchip_out;
+ r = -EFAULT;
+ if (copy_to_user(argp, chip, sizeof(*chip)))
+ goto get_irqchip_out;
+ r = 0;
+ get_irqchip_out:
+ kfree(chip);
+ break;
+ }
+ case KVM_SET_IRQCHIP: {
+ /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
+ struct kvm_irqchip *chip;
+
+ chip = memdup_user(argp, sizeof(*chip));
+ if (IS_ERR(chip)) {
+ r = PTR_ERR(chip);
+ goto out;
+ }
+
+ r = -ENXIO;
+ if (!irqchip_kernel(kvm))
+ goto set_irqchip_out;
+ r = kvm_vm_ioctl_set_irqchip(kvm, chip);
+ set_irqchip_out:
+ kfree(chip);
+ break;
+ }
+ case KVM_GET_PIT: {
+ r = -EFAULT;
+ if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
+ goto out;
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto out;
+ r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_PIT: {
+ r = -EFAULT;
+ if (copy_from_user(&u.ps, argp, sizeof(u.ps)))
+ goto out;
+ mutex_lock(&kvm->lock);
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto set_pit_out;
+ r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
+set_pit_out:
+ mutex_unlock(&kvm->lock);
+ break;
+ }
+ case KVM_GET_PIT2: {
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto out;
+ r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_PIT2: {
+ r = -EFAULT;
+ if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
+ goto out;
+ mutex_lock(&kvm->lock);
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto set_pit2_out;
+ r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
+set_pit2_out:
+ mutex_unlock(&kvm->lock);
+ break;
+ }
+ case KVM_REINJECT_CONTROL: {
+ struct kvm_reinject_control control;
+ r = -EFAULT;
+ if (copy_from_user(&control, argp, sizeof(control)))
+ goto out;
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto out;
+ r = kvm_vm_ioctl_reinject(kvm, &control);
+ break;
+ }
+ case KVM_SET_BOOT_CPU_ID:
+ r = 0;
+ mutex_lock(&kvm->lock);
+ if (kvm->created_vcpus)
+ r = -EBUSY;
+ else
+ kvm->arch.bsp_vcpu_id = arg;
+ mutex_unlock(&kvm->lock);
+ break;
+#ifdef CONFIG_KVM_XEN
+ case KVM_XEN_HVM_CONFIG: {
+ struct kvm_xen_hvm_config xhc;
+ r = -EFAULT;
+ if (copy_from_user(&xhc, argp, sizeof(xhc)))
+ goto out;
+ r = kvm_xen_hvm_config(kvm, &xhc);
+ break;
+ }
+ case KVM_XEN_HVM_GET_ATTR: {
+ struct kvm_xen_hvm_attr xha;
+
+ r = -EFAULT;
+ if (copy_from_user(&xha, argp, sizeof(xha)))
+ goto out;
+ r = kvm_xen_hvm_get_attr(kvm, &xha);
+ if (!r && copy_to_user(argp, &xha, sizeof(xha)))
+ r = -EFAULT;
+ break;
+ }
+ case KVM_XEN_HVM_SET_ATTR: {
+ struct kvm_xen_hvm_attr xha;
+
+ r = -EFAULT;
+ if (copy_from_user(&xha, argp, sizeof(xha)))
+ goto out;
+ r = kvm_xen_hvm_set_attr(kvm, &xha);
+ break;
+ }
+ case KVM_XEN_HVM_EVTCHN_SEND: {
+ struct kvm_irq_routing_xen_evtchn uxe;
+
+ r = -EFAULT;
+ if (copy_from_user(&uxe, argp, sizeof(uxe)))
+ goto out;
+ r = kvm_xen_hvm_evtchn_send(kvm, &uxe);
+ break;
+ }
+#endif
+ case KVM_SET_CLOCK:
+ r = kvm_vm_ioctl_set_clock(kvm, argp);
+ break;
+ case KVM_GET_CLOCK:
+ r = kvm_vm_ioctl_get_clock(kvm, argp);
+ break;
+ case KVM_SET_TSC_KHZ: {
+ u32 user_tsc_khz;
+
+ r = -EINVAL;
+ user_tsc_khz = (u32)arg;
+
+ if (kvm_caps.has_tsc_control &&
+ user_tsc_khz >= kvm_caps.max_guest_tsc_khz)
+ goto out;
+
+ if (user_tsc_khz == 0)
+ user_tsc_khz = tsc_khz;
+
+ WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
+ r = 0;
+
+ goto out;
+ }
+ case KVM_GET_TSC_KHZ: {
+ r = READ_ONCE(kvm->arch.default_tsc_khz);
+ goto out;
+ }
+ case KVM_MEMORY_ENCRYPT_OP: {
+ r = -ENOTTY;
+ if (!kvm_x86_ops.mem_enc_ioctl)
+ goto out;
+
+ r = static_call(kvm_x86_mem_enc_ioctl)(kvm, argp);
+ break;
+ }
+ case KVM_MEMORY_ENCRYPT_REG_REGION: {
+ struct kvm_enc_region region;
+
+ r = -EFAULT;
+ if (copy_from_user(&region, argp, sizeof(region)))
+ goto out;
+
+ r = -ENOTTY;
+ if (!kvm_x86_ops.mem_enc_register_region)
+ goto out;
+
+ r = static_call(kvm_x86_mem_enc_register_region)(kvm, &region);
+ break;
+ }
+ case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
+ struct kvm_enc_region region;
+
+ r = -EFAULT;
+ if (copy_from_user(&region, argp, sizeof(region)))
+ goto out;
+
+ r = -ENOTTY;
+ if (!kvm_x86_ops.mem_enc_unregister_region)
+ goto out;
+
+ r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, &region);
+ break;
+ }
+ case KVM_HYPERV_EVENTFD: {
+ struct kvm_hyperv_eventfd hvevfd;
+
+ r = -EFAULT;
+ if (copy_from_user(&hvevfd, argp, sizeof(hvevfd)))
+ goto out;
+ r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
+ break;
+ }
+ case KVM_SET_PMU_EVENT_FILTER:
+ r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
+ break;
+ case KVM_X86_SET_MSR_FILTER: {
+ struct kvm_msr_filter __user *user_msr_filter = argp;
+ struct kvm_msr_filter filter;
+
+ if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
+ return -EFAULT;
+
+ r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
+ break;
+ }
+ default:
+ r = -ENOTTY;
+ }
+out:
+ return r;
+}
+
+static void kvm_probe_feature_msr(u32 msr_index)
+{
+ struct kvm_msr_entry msr = {
+ .index = msr_index,
+ };
+
+ if (kvm_get_msr_feature(&msr))
+ return;
+
+ msr_based_features[num_msr_based_features++] = msr_index;
+}
+
+static void kvm_probe_msr_to_save(u32 msr_index)
+{
+ u32 dummy[2];
+
+ if (rdmsr_safe(msr_index, &dummy[0], &dummy[1]))
+ return;
+
+ /*
+ * Even MSRs that are valid in the host may not be exposed to guests in
+ * some cases.
+ */
+ switch (msr_index) {
+ case MSR_IA32_BNDCFGS:
+ if (!kvm_mpx_supported())
+ return;
+ break;
+ case MSR_TSC_AUX:
+ if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) &&
+ !kvm_cpu_cap_has(X86_FEATURE_RDPID))
+ return;
+ break;
+ case MSR_IA32_UMWAIT_CONTROL:
+ if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
+ return;
+ break;
+ case MSR_IA32_RTIT_CTL:
+ case MSR_IA32_RTIT_STATUS:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
+ return;
+ break;
+ case MSR_IA32_RTIT_CR3_MATCH:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
+ !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
+ return;
+ break;
+ case MSR_IA32_RTIT_OUTPUT_BASE:
+ case MSR_IA32_RTIT_OUTPUT_MASK:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
+ (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
+ !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
+ return;
+ break;
+ case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
+ (msr_index - MSR_IA32_RTIT_ADDR0_A >=
+ intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2))
+ return;
+ break;
+ case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX:
+ if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >=
+ kvm_pmu_cap.num_counters_gp)
+ return;
+ break;
+ case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX:
+ if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >=
+ kvm_pmu_cap.num_counters_gp)
+ return;
+ break;
+ case MSR_ARCH_PERFMON_FIXED_CTR0 ... MSR_ARCH_PERFMON_FIXED_CTR_MAX:
+ if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >=
+ kvm_pmu_cap.num_counters_fixed)
+ return;
+ break;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
+ if (!kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2))
+ return;
+ break;
+ case MSR_IA32_XFD:
+ case MSR_IA32_XFD_ERR:
+ if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
+ return;
+ break;
+ case MSR_IA32_TSX_CTRL:
+ if (!(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR))
+ return;
+ break;
+ default:
+ break;
+ }
+
+ msrs_to_save[num_msrs_to_save++] = msr_index;
+}
+
+static void kvm_init_msr_lists(void)
+{
+ unsigned i;
+
+ BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3,
+ "Please update the fixed PMCs in msrs_to_save_pmu[]");
+
+ num_msrs_to_save = 0;
+ num_emulated_msrs = 0;
+ num_msr_based_features = 0;
+
+ for (i = 0; i < ARRAY_SIZE(msrs_to_save_base); i++)
+ kvm_probe_msr_to_save(msrs_to_save_base[i]);
+
+ if (enable_pmu) {
+ for (i = 0; i < ARRAY_SIZE(msrs_to_save_pmu); i++)
+ kvm_probe_msr_to_save(msrs_to_save_pmu[i]);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
+ if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i]))
+ continue;
+
+ emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
+ }
+
+ for (i = KVM_FIRST_EMULATED_VMX_MSR; i <= KVM_LAST_EMULATED_VMX_MSR; i++)
+ kvm_probe_feature_msr(i);
+
+ for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++)
+ kvm_probe_feature_msr(msr_based_features_all_except_vmx[i]);
+}
+
+static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
+ const void *v)
+{
+ int handled = 0;
+ int n;
+
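+ /*
+ * Handle at most 8 bytes per iteration: give the in-kernel local
+ * APIC the first shot at the access, then fall back to the MMIO
+ * bus, and stop at the first chunk that neither one claims.
+ */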
+ do {
+ n = min(len, 8);
+ if (!(lapic_in_kernel(vcpu) &&
+ !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
+ && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
+ break;
+ handled += n;
+ addr += n;
+ len -= n;
+ v += n;
+ } while (len);
+
+ return handled;
+}
+
+static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
+{
+ int handled = 0;
+ int n;
+
+ do {
+ n = min(len, 8);
+ if (!(lapic_in_kernel(vcpu) &&
+ !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
+ addr, n, v))
+ && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
+ break;
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
+ handled += n;
+ addr += n;
+ len -= n;
+ v += n;
+ } while (len);
+
+ return handled;
+}
+
+void kvm_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ static_call(kvm_x86_set_segment)(vcpu, var, seg);
+}
+
+void kvm_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ static_call(kvm_x86_get_segment)(vcpu, var, seg);
+}
+
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access,
+ struct x86_exception *exception)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ gpa_t t_gpa;
+
+ BUG_ON(!mmu_is_nested(vcpu));
+
+ /* NPT walks are always user-walks */
+ access |= PFERR_USER_MASK;
+ t_gpa = mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
+
+ return t_gpa;
+}
+
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
+{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+ u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
+
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
+{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+ u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ access |= PFERR_WRITE_MASK;
+ return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write);
+
+/* Used to access any guest's mapped memory without checking CPL. */
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
+{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+ return mmu->gva_to_gpa(vcpu, mmu, gva, 0, exception);
+}
+
+static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u64 access,
+ struct x86_exception *exception)
+{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+ void *data = val;
+ int r = X86EMUL_CONTINUE;
+
+ while (bytes) {
+ gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
+ unsigned offset = addr & (PAGE_SIZE-1);
+ unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
+ int ret;
+
+ if (gpa == INVALID_GPA)
+ return X86EMUL_PROPAGATE_FAULT;
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
+ offset, toread);
+ if (ret < 0) {
+ r = X86EMUL_IO_NEEDED;
+ goto out;
+ }
+
+ bytes -= toread;
+ data += toread;
+ addr += toread;
+ }
+out:
+ return r;
+}
+
+/* used for instruction fetching */
+static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, void *val, unsigned int bytes,
+ struct x86_exception *exception)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+ u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ unsigned offset;
+ int ret;
+
+ /* Inline kvm_read_guest_virt_helper for speed. */
+ gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK,
+ exception);
+ if (unlikely(gpa == INVALID_GPA))
+ return X86EMUL_PROPAGATE_FAULT;
+
+ offset = addr & (PAGE_SIZE-1);
+ if (WARN_ON(offset + bytes > PAGE_SIZE))
+ bytes = (unsigned)PAGE_SIZE - offset;
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val,
+ offset, bytes);
+ if (unlikely(ret < 0))
+ return X86EMUL_IO_NEEDED;
+
+ return X86EMUL_CONTINUE;
+}
+
+int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
+ gva_t addr, void *val, unsigned int bytes,
+ struct x86_exception *exception)
+{
+ u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+
+ /*
+ * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
+ * is returned, but our callers are not ready for that and they blindly
+ * call kvm_inject_page_fault. Ensure that they at least do not leak
+ * uninitialized kernel stack memory into cr2 and error code.
+ */
+ memset(exception, 0, sizeof(*exception));
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
+ exception);
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
+
+static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, void *val, unsigned int bytes,
+ struct x86_exception *exception, bool system)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ u64 access = 0;
+
+ if (system)
+ access |= PFERR_IMPLICIT_ACCESS;
+ else if (static_call(kvm_x86_get_cpl)(vcpu) == 3)
+ access |= PFERR_USER_MASK;
+
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
+}
+
+static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u64 access,
+ struct x86_exception *exception)
+{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+ void *data = val;
+ int r = X86EMUL_CONTINUE;
+
+ while (bytes) {
+ gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
+ unsigned offset = addr & (PAGE_SIZE-1);
+ unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
+ int ret;
+
+ if (gpa == INVALID_GPA)
+ return X86EMUL_PROPAGATE_FAULT;
+ ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
+ if (ret < 0) {
+ r = X86EMUL_IO_NEEDED;
+ goto out;
+ }
+
+ bytes -= towrite;
+ data += towrite;
+ addr += towrite;
+ }
+out:
+ return r;
+}
+
+static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val,
+ unsigned int bytes, struct x86_exception *exception,
+ bool system)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ u64 access = PFERR_WRITE_MASK;
+
+ if (system)
+ access |= PFERR_IMPLICIT_ACCESS;
+ else if (static_call(kvm_x86_get_cpl)(vcpu) == 3)
+ access |= PFERR_USER_MASK;
+
+ return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
+ access, exception);
+}
+
+int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
+ unsigned int bytes, struct x86_exception *exception)
+{
+ /* kvm_write_guest_virt_system can pull in tons of pages. */
+ vcpu->arch.l1tf_flush_l1d = true;
+
+ return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
+ PFERR_WRITE_MASK, exception);
+}
+EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
+
+static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
+{
+ return static_call(kvm_x86_can_emulate_instruction)(vcpu, emul_type,
+ insn, insn_len);
+}
+
+int handle_ud(struct kvm_vcpu *vcpu)
+{
+ static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
+ int fep_flags = READ_ONCE(force_emulation_prefix);
+ int emul_type = EMULTYPE_TRAP_UD;
+ char sig[5]; /* ud2; .ascii "kvm" */
+ struct x86_exception e;
+
+ if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0)))
+ return 1;
+
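+ /*
+ * If the #UD hit the forced emulation prefix (ud2; .ascii "kvm"),
+ * skip the 5-byte signature and emulate the instruction that
+ * follows it as a forced #UD trap.
+ */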
+ if (fep_flags &&
+ kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
+ sig, sizeof(sig), &e) == 0 &&
+ memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) {
+ if (fep_flags & KVM_FEP_CLEAR_RFLAGS_RF)
+ kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) & ~X86_EFLAGS_RF);
+ kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
+ emul_type = EMULTYPE_TRAP_UD_FORCED;
+ }
+
+ return kvm_emulate_instruction(vcpu, emul_type);
+}
+EXPORT_SYMBOL_GPL(handle_ud);
+
+static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
+ gpa_t gpa, bool write)
+{
+ /* For APIC access vmexit */
+ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ return 1;
+
+ if (vcpu_match_mmio_gpa(vcpu, gpa)) {
+ trace_vcpu_match_mmio(gva, gpa, write, true);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
+ gpa_t *gpa, struct x86_exception *exception,
+ bool write)
+{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+ u64 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
+ | (write ? PFERR_WRITE_MASK : 0);
+
+ /*
+ * Currently, PKRU is only applied to EPT-enabled guests, so there
+ * is no pkey in the EPT page tables for an L1 guest or in the EPT
+ * shadow page tables for an L2 guest.
+ */
+ if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) ||
+ !permission_fault(vcpu, vcpu->arch.walk_mmu,
+ vcpu->arch.mmio_access, 0, access))) {
+ *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
+ (gva & (PAGE_SIZE - 1));
+ trace_vcpu_match_mmio(gva, *gpa, write, false);
+ return 1;
+ }
+
+ *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
+
+ if (*gpa == INVALID_GPA)
+ return -1;
+
+ return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
+}
+
+int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const void *val, int bytes)
+{
+ int ret;
+
+ ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
+ if (ret < 0)
+ return 0;
+ kvm_page_track_write(vcpu, gpa, val, bytes);
+ return 1;
+}
+
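+/*
+ * Hooks shared by the emulated MMIO read and write paths:
+ * read_write_emulate handles ordinary guest memory, read_write_mmio
+ * dispatches to in-kernel MMIO devices, and read_write_exit_mmio sets
+ * up the exit to userspace for anything left unhandled.
+ */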
+struct read_write_emulator_ops {
+ int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
+ int bytes);
+ int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes);
+ int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
+ int bytes, void *val);
+ int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes);
+ bool write;
+};
+
+static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
+{
+ if (vcpu->mmio_read_completed) {
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
+ vcpu->mmio_fragments[0].gpa, val);
+ vcpu->mmio_read_completed = 0;
+ return 1;
+ }
+
+ return 0;
+}
+
+static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes)
+{
+ return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes);
+}
+
+static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes)
+{
+ return emulator_write_phys(vcpu, gpa, val, bytes);
+}
+
+static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
+{
+ trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
+ return vcpu_mmio_write(vcpu, gpa, bytes, val);
+}
+
+static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes)
+{
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
+ return X86EMUL_IO_NEEDED;
+}
+
+static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes)
+{
+ struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
+
+ memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
+ return X86EMUL_CONTINUE;
+}
+
+static const struct read_write_emulator_ops read_emulator = {
+ .read_write_prepare = read_prepare,
+ .read_write_emulate = read_emulate,
+ .read_write_mmio = vcpu_mmio_read,
+ .read_write_exit_mmio = read_exit_mmio,
+};
+
+static const struct read_write_emulator_ops write_emulator = {
+ .read_write_emulate = write_emulate,
+ .read_write_mmio = write_mmio,
+ .read_write_exit_mmio = write_exit_mmio,
+ .write = true,
+};
+
+static int emulator_read_write_onepage(unsigned long addr, void *val,
+ unsigned int bytes,
+ struct x86_exception *exception,
+ struct kvm_vcpu *vcpu,
+ const struct read_write_emulator_ops *ops)
+{
+ gpa_t gpa;
+ int handled, ret;
+ bool write = ops->write;
+ struct kvm_mmio_fragment *frag;
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+
+ /*
+ * If the exit was due to an NPF we may already have a GPA.
+ * If the GPA is present, use it to avoid the GVA-to-GPA table walk.
+ * Note, this cannot be used on string operations since a string
+ * operation using REP will only have the initial GPA from the NPF
+ * that occurred.
+ */
+ if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
+ (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
+ gpa = ctxt->gpa_val;
+ ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
+ } else {
+ ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
+ if (ret < 0)
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
+ if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes))
+ return X86EMUL_CONTINUE;
+
+ /*
+ * Is this MMIO handled locally?
+ */
+ handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
+ if (handled == bytes)
+ return X86EMUL_CONTINUE;
+
+ gpa += handled;
+ bytes -= handled;
+ val += handled;
+
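+ /* Queue the unhandled remainder as an MMIO fragment for userspace. */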
+ WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
+ frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
+ frag->gpa = gpa;
+ frag->data = val;
+ frag->len = bytes;
+ return X86EMUL_CONTINUE;
+}
+
+static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
+ void *val, unsigned int bytes,
+ struct x86_exception *exception,
+ const struct read_write_emulator_ops *ops)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ gpa_t gpa;
+ int rc;
+
+ if (ops->read_write_prepare &&
+ ops->read_write_prepare(vcpu, val, bytes))
+ return X86EMUL_CONTINUE;
+
+ vcpu->mmio_nr_fragments = 0;
+
+ /* Crossing a page boundary? */
+ if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
+ int now;
+
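+ /* Number of bytes up to the next page boundary. */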
+ now = -addr & ~PAGE_MASK;
+ rc = emulator_read_write_onepage(addr, val, now, exception,
+ vcpu, ops);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ addr += now;
+ if (ctxt->mode != X86EMUL_MODE_PROT64)
+ addr = (u32)addr;
+ val += now;
+ bytes -= now;
+ }
+
+ rc = emulator_read_write_onepage(addr, val, bytes, exception,
+ vcpu, ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (!vcpu->mmio_nr_fragments)
+ return rc;
+
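+ /*
+ * Some bytes could not be handled in the kernel: report the first
+ * fragment to userspace via KVM_EXIT_MMIO; the remaining fragments
+ * are completed when the vCPU is run again.
+ */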
+ gpa = vcpu->mmio_fragments[0].gpa;
+
+ vcpu->mmio_needed = 1;
+ vcpu->mmio_cur_fragment = 0;
+
+ vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
+ vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+ vcpu->run->mmio.phys_addr = gpa;
+
+ return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
+}
+
+static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
+ void *val,
+ unsigned int bytes,
+ struct x86_exception *exception)
+{
+ return emulator_read_write(ctxt, addr, val, bytes,
+ exception, &read_emulator);
+}
+
+static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
+ const void *val,
+ unsigned int bytes,
+ struct x86_exception *exception)
+{
+ return emulator_read_write(ctxt, addr, (void *)val, bytes,
+ exception, &write_emulator);
+}
+
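+/* Perform a cmpxchg of the given type directly on the userspace mapping. */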
+#define emulator_try_cmpxchg_user(t, ptr, old, new) \
+ (__try_cmpxchg_user((t __user *)(ptr), (t *)(old), *(t *)(new), efault ## t))
+
+static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
+ const void *old,
+ const void *new,
+ unsigned int bytes,
+ struct x86_exception *exception)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ u64 page_line_mask;
+ unsigned long hva;
+ gpa_t gpa;
+ int r;
+
+ /* A guest's cmpxchg8b has to be emulated atomically. */
+ if (bytes > 8 || (bytes & (bytes - 1)))
+ goto emul_write;
+
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
+
+ if (gpa == INVALID_GPA ||
+ (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ goto emul_write;
+
+ /*
+ * Emulate the atomic as a straight write to avoid #AC if SLD is
+ * enabled in the host and the access splits a cache line.
+ */
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+ page_line_mask = ~(cache_line_size() - 1);
+ else
+ page_line_mask = PAGE_MASK;
+
+ if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
+ goto emul_write;
+
+ hva = kvm_vcpu_gfn_to_hva(vcpu, gpa_to_gfn(gpa));
+ if (kvm_is_error_hva(hva))
+ goto emul_write;
+
+ hva += offset_in_page(gpa);
+
+ switch (bytes) {
+ case 1:
+ r = emulator_try_cmpxchg_user(u8, hva, old, new);
+ break;
+ case 2:
+ r = emulator_try_cmpxchg_user(u16, hva, old, new);
+ break;
+ case 4:
+ r = emulator_try_cmpxchg_user(u32, hva, old, new);
+ break;
+ case 8:
+ r = emulator_try_cmpxchg_user(u64, hva, old, new);
+ break;
+ default:
+ BUG();
+ }
+
+ if (r < 0)
+ return X86EMUL_UNHANDLEABLE;
+ if (r)
+ return X86EMUL_CMPXCHG_FAILED;
+
+ kvm_page_track_write(vcpu, gpa, new, bytes);
+
+ return X86EMUL_CONTINUE;
+
+emul_write:
+ pr_warn_once("emulating exchange as write\n");
+
+ return emulator_write_emulated(ctxt, addr, new, bytes, exception);
+}
+
+static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
+ unsigned short port, void *data,
+ unsigned int count, bool in)
+{
+ unsigned i;
+ int r;
+
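+ /*
+ * Try to handle every repetition through the in-kernel I/O bus; bail
+ * out to userspace if the very first access goes unhandled.
+ */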
+ WARN_ON_ONCE(vcpu->arch.pio.count);
+ for (i = 0; i < count; i++) {
+ if (in)
+ r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, port, size, data);
+ else
+ r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, port, size, data);
+
+ if (r) {
+ if (i == 0)
+ goto userspace_io;
+
+ /*
+ * Userspace must have unregistered the device while PIO
+ * was running. Drop writes / read as 0.
+ */
+ if (in)
+ memset(data, 0, size * (count - i));
+ break;
+ }
+
+ data += size;
+ }
+ return 1;
+
+userspace_io:
+ vcpu->arch.pio.port = port;
+ vcpu->arch.pio.in = in;
+ vcpu->arch.pio.count = count;
+ vcpu->arch.pio.size = size;
+
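+ /*
+ * Stage the data for userspace in the PIO scratch page: pre-zeroed
+ * for IN, copied from the guest for OUT.
+ */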
+ if (in)
+ memset(vcpu->arch.pio_data, 0, size * count);
+ else
+ memcpy(vcpu->arch.pio_data, data, size * count);
+
+ vcpu->run->exit_reason = KVM_EXIT_IO;
+ vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
+ vcpu->run->io.size = size;
+ vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+ vcpu->run->io.count = count;
+ vcpu->run->io.port = port;
+ return 0;
+}
+
+static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
+ unsigned short port, void *val, unsigned int count)
+{
+ int r = emulator_pio_in_out(vcpu, size, port, val, count, true);
+ if (r)
+ trace_kvm_pio(KVM_PIO_IN, port, size, count, val);
+
+ return r;
+}
+
+static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val)
+{
+ int size = vcpu->arch.pio.size;
+ unsigned int count = vcpu->arch.pio.count;
+ memcpy(val, vcpu->arch.pio_data, size * count);
+ trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data);
+ vcpu->arch.pio.count = 0;
+}
+
+static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port, void *val,
+ unsigned int count)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ if (vcpu->arch.pio.count) {
+ /*
+ * Complete a previous iteration that required userspace I/O.
+ * Note, @count isn't guaranteed to match pio.count as userspace
+ * can modify ECX before rerunning the vCPU. Ignore any such
+ * shenanigans as KVM doesn't support modifying the rep count,
+ * and the emulator ensures @count doesn't overflow the buffer.
+ */
+ complete_emulator_pio_in(vcpu, val);
+ return 1;
+ }
+
+ return emulator_pio_in(vcpu, size, port, val, count);
+}
+
+static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
+ unsigned short port, const void *val,
+ unsigned int count)
+{
+ trace_kvm_pio(KVM_PIO_OUT, port, size, count, val);
+ return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
+}
+
+static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port,
+ const void *val, unsigned int count)
+{
+ return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count);
+}
+
+static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ return static_call(kvm_x86_get_segment_base)(vcpu, seg);
+}
+
+static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
+{
+ kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
+}
+
+static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
+{
+ if (!need_emulate_wbinvd(vcpu))
+ return X86EMUL_CONTINUE;
+
+ if (static_call(kvm_x86_has_wbinvd_exit)()) {
+ int cpu = get_cpu();
+
+ cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
+ on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask,
+ wbinvd_ipi, NULL, 1);
+ put_cpu();
+ cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
+ } else
+ wbinvd();
+ return X86EMUL_CONTINUE;
+}
+
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+ kvm_emulate_wbinvd_noskip(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
+
+static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
+{
+ kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
+}
+
+static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
+ unsigned long *dest)
+{
+ kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
+}
+
+static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
+ unsigned long value)
+{
+ return kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
+}
+
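+/* Replace the low 32 bits of a CR value, preserving the upper bits. */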
+static u64 mk_cr_64(u64 curr_cr, u32 new_val)
+{
+ return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
+}
+
+static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ unsigned long value;
+
+ switch (cr) {
+ case 0:
+ value = kvm_read_cr0(vcpu);
+ break;
+ case 2:
+ value = vcpu->arch.cr2;
+ break;
+ case 3:
+ value = kvm_read_cr3(vcpu);
+ break;
+ case 4:
+ value = kvm_read_cr4(vcpu);
+ break;
+ case 8:
+ value = kvm_get_cr8(vcpu);
+ break;
+ default:
+ kvm_err("%s: unexpected cr %u\n", __func__, cr);
+ return 0;
+ }
+
+ return value;
+}
+
+static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ int res = 0;
+
+ switch (cr) {
+ case 0:
+ res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
+ break;
+ case 2:
+ vcpu->arch.cr2 = val;
+ break;
+ case 3:
+ res = kvm_set_cr3(vcpu, val);
+ break;
+ case 4:
+ res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+ break;
+ case 8:
+ res = kvm_set_cr8(vcpu, val);
+ break;
+ default:
+ kvm_err("%s: unexpected cr %u\n", __func__, cr);
+ res = -1;
+ }
+
+ return res;
+}
+
+static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
+{
+ return static_call(kvm_x86_get_cpl)(emul_to_vcpu(ctxt));
+}
+
+static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
+{
+ static_call(kvm_x86_get_gdt)(emul_to_vcpu(ctxt), dt);
+}
+
+static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
+{
+ static_call(kvm_x86_get_idt)(emul_to_vcpu(ctxt), dt);
+}
+
+static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
+{
+ static_call(kvm_x86_set_gdt)(emul_to_vcpu(ctxt), dt);
+}
+
+static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
+{
+ static_call(kvm_x86_set_idt)(emul_to_vcpu(ctxt), dt);
+}
+
+static unsigned long emulator_get_cached_segment_base(
+ struct x86_emulate_ctxt *ctxt, int seg)
+{
+ return get_segment_base(emul_to_vcpu(ctxt), seg);
+}
+
+static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
+ struct desc_struct *desc, u32 *base3,
+ int seg)
+{
+ struct kvm_segment var;
+
+ kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
+ *selector = var.selector;
+
+ if (var.unusable) {
+ memset(desc, 0, sizeof(*desc));
+ if (base3)
+ *base3 = 0;
+ return false;
+ }
+
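+ /* Descriptor limits are in units of 4K pages when the G bit is set. */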
+ if (var.g)
+ var.limit >>= 12;
+ set_desc_limit(desc, var.limit);
+ set_desc_base(desc, (unsigned long)var.base);
+#ifdef CONFIG_X86_64
+ if (base3)
+ *base3 = var.base >> 32;
+#endif
+ desc->type = var.type;
+ desc->s = var.s;
+ desc->dpl = var.dpl;
+ desc->p = var.present;
+ desc->avl = var.avl;
+ desc->l = var.l;
+ desc->d = var.db;
+ desc->g = var.g;
+
+ return true;
+}
+
+static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
+ struct desc_struct *desc, u32 base3,
+ int seg)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ struct kvm_segment var;
+
+ var.selector = selector;
+ var.base = get_desc_base(desc);
+#ifdef CONFIG_X86_64
+ var.base |= ((u64)base3) << 32;
+#endif
+ var.limit = get_desc_limit(desc);
+ if (desc->g)
+ var.limit = (var.limit << 12) | 0xfff;
+ var.type = desc->type;
+ var.dpl = desc->dpl;
+ var.db = desc->d;
+ var.s = desc->s;
+ var.l = desc->l;
+ var.g = desc->g;
+ var.avl = desc->avl;
+ var.present = desc->p;
+ var.unusable = !var.present;
+ var.padding = 0;
+
+ kvm_set_segment(vcpu, &var, seg);
+}
+
+static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt,
+ u32 msr_index, u64 *pdata)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ int r;
+
+ r = kvm_get_msr_with_filter(vcpu, msr_index, pdata);
+ if (r < 0)
+ return X86EMUL_UNHANDLEABLE;
+
+ if (r) {
+ if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
+ complete_emulated_rdmsr, r))
+ return X86EMUL_IO_NEEDED;
+
+ trace_kvm_msr_read_ex(msr_index);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
+ trace_kvm_msr_read(msr_index, *pdata);
+ return X86EMUL_CONTINUE;
+}
+
+static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt,
+ u32 msr_index, u64 data)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ int r;
+
+ r = kvm_set_msr_with_filter(vcpu, msr_index, data);
+ if (r < 0)
+ return X86EMUL_UNHANDLEABLE;
+
+ if (r) {
+ if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
+ complete_emulated_msr_access, r))
+ return X86EMUL_IO_NEEDED;
+
+ trace_kvm_msr_write_ex(msr_index, data);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
+ trace_kvm_msr_write(msr_index, data);
+ return X86EMUL_CONTINUE;
+}
+
+static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
+ u32 msr_index, u64 *pdata)
+{
+ return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
+}
+
+static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
+ u32 pmc)
+{
+ if (kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc))
+ return 0;
+ return -EINVAL;
+}
+
+static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
+ u32 pmc, u64 *pdata)
+{
+ return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata);
+}
+
+static void emulator_halt(struct x86_emulate_ctxt *ctxt)
+{
+ emul_to_vcpu(ctxt)->arch.halt_request = 1;
+}
+
+static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage)
+{
+ return static_call(kvm_x86_check_intercept)(emul_to_vcpu(ctxt), info, stage,
+ &ctxt->exception);
+}
+
+static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
+ u32 *eax, u32 *ebx, u32 *ecx, u32 *edx,
+ bool exact_only)
+{
+ return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only);
+}
+
+static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
+}
+
+static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
+}
+
+static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
+}
+
+static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
+{
+ return kvm_register_read_raw(emul_to_vcpu(ctxt), reg);
+}
+
+static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
+{
+ kvm_register_write_raw(emul_to_vcpu(ctxt), reg, val);
+}
+
+static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
+{
+ static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked);
+}
+
+static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt)
+{
+ return is_smm(emul_to_vcpu(ctxt));
+}
+
+static bool emulator_is_guest_mode(struct x86_emulate_ctxt *ctxt)
+{
+ return is_guest_mode(emul_to_vcpu(ctxt));
+}
+
+#ifndef CONFIG_KVM_SMM
+static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
+{
+ WARN_ON_ONCE(1);
+ return X86EMUL_UNHANDLEABLE;
+}
+#endif
+
+static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt)
+{
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt));
+}
+
+static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
+{
+ return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr);
+}
+
+static void emulator_vm_bugged(struct x86_emulate_ctxt *ctxt)
+{
+ struct kvm *kvm = emul_to_vcpu(ctxt)->kvm;
+
+ if (!kvm->vm_bugged)
+ kvm_vm_bugged(kvm);
+}
+
+static const struct x86_emulate_ops emulate_ops = {
+ .vm_bugged = emulator_vm_bugged,
+ .read_gpr = emulator_read_gpr,
+ .write_gpr = emulator_write_gpr,
+ .read_std = emulator_read_std,
+ .write_std = emulator_write_std,
+ .fetch = kvm_fetch_guest_virt,
+ .read_emulated = emulator_read_emulated,
+ .write_emulated = emulator_write_emulated,
+ .cmpxchg_emulated = emulator_cmpxchg_emulated,
+ .invlpg = emulator_invlpg,
+ .pio_in_emulated = emulator_pio_in_emulated,
+ .pio_out_emulated = emulator_pio_out_emulated,
+ .get_segment = emulator_get_segment,
+ .set_segment = emulator_set_segment,
+ .get_cached_segment_base = emulator_get_cached_segment_base,
+ .get_gdt = emulator_get_gdt,
+ .get_idt = emulator_get_idt,
+ .set_gdt = emulator_set_gdt,
+ .set_idt = emulator_set_idt,
+ .get_cr = emulator_get_cr,
+ .set_cr = emulator_set_cr,
+ .cpl = emulator_get_cpl,
+ .get_dr = emulator_get_dr,
+ .set_dr = emulator_set_dr,
+ .set_msr_with_filter = emulator_set_msr_with_filter,
+ .get_msr_with_filter = emulator_get_msr_with_filter,
+ .get_msr = emulator_get_msr,
+ .check_pmc = emulator_check_pmc,
+ .read_pmc = emulator_read_pmc,
+ .halt = emulator_halt,
+ .wbinvd = emulator_wbinvd,
+ .fix_hypercall = emulator_fix_hypercall,
+ .intercept = emulator_intercept,
+ .get_cpuid = emulator_get_cpuid,
+ .guest_has_movbe = emulator_guest_has_movbe,
+ .guest_has_fxsr = emulator_guest_has_fxsr,
+ .guest_has_rdpid = emulator_guest_has_rdpid,
+ .set_nmi_mask = emulator_set_nmi_mask,
+ .is_smm = emulator_is_smm,
+ .is_guest_mode = emulator_is_guest_mode,
+ .leave_smm = emulator_leave_smm,
+ .triple_fault = emulator_triple_fault,
+ .set_xcr = emulator_set_xcr,
+};
+
+static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
+{
+ u32 int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+ /*
+ * An "sti; sti" sequence only disables interrupts for the first
+ * instruction.  So, if the last instruction, be it emulated or
+ * not, left the system with the INT_STI flag enabled, it
+ * means that the last instruction was an sti.  We should not
+ * leave the flag on in this case.  The same goes for mov ss.
+ */
+ if (int_shadow & mask)
+ mask = 0;
+ if (unlikely(int_shadow || mask)) {
+ static_call(kvm_x86_set_interrupt_shadow)(vcpu, mask);
+ if (!mask)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
+}
+
+static void inject_emulated_exception(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+
+ if (ctxt->exception.vector == PF_VECTOR)
+ kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
+ else if (ctxt->exception.error_code_valid)
+ kvm_queue_exception_e(vcpu, ctxt->exception.vector,
+ ctxt->exception.error_code);
+ else
+ kvm_queue_exception(vcpu, ctxt->exception.vector);
+}
+
+static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt;
+
+ ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
+ if (!ctxt) {
+ pr_err("failed to allocate vcpu's emulator\n");
+ return NULL;
+ }
+
+ ctxt->vcpu = vcpu;
+ ctxt->ops = &emulate_ops;
+ vcpu->arch.emulate_ctxt = ctxt;
+
+ return ctxt;
+}
+
+static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+ int cs_db, cs_l;
+
+ static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
+
+ ctxt->gpa_available = false;
+ ctxt->eflags = kvm_get_rflags(vcpu);
+ ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
+
+ ctxt->eip = kvm_rip_read(vcpu);
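+ /* Derive the emulation mode from CR0.PE, EFLAGS.VM, CS.L and CS.D. */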
+ ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
+ (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
+ (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 :
+ cs_db ? X86EMUL_MODE_PROT32 :
+ X86EMUL_MODE_PROT16;
+ ctxt->interruptibility = 0;
+ ctxt->have_exception = false;
+ ctxt->exception.vector = -1;
+ ctxt->perm_ok = false;
+
+ init_decode_cache(ctxt);
+ vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
+}
+
+void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+ int ret;
+
+ init_emulate_ctxt(vcpu);
+
+ ctxt->op_bytes = 2;
+ ctxt->ad_bytes = 2;
+ ctxt->_eip = ctxt->eip + inc_eip;
+ ret = emulate_int_real(ctxt, irq);
+
+ if (ret != X86EMUL_CONTINUE) {
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ } else {
+ ctxt->eip = ctxt->_eip;
+ kvm_rip_write(vcpu, ctxt->eip);
+ kvm_set_rflags(vcpu, ctxt->eflags);
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
+
+static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
+ u8 ndata, u8 *insn_bytes, u8 insn_size)
+{
+ struct kvm_run *run = vcpu->run;
+ u64 info[5];
+ u8 info_start;
+
+ /*
+ * Zero the whole array used to retrieve the exit info, as casting to
+ * u32 for select entries will leave some chunks uninitialized.
+ */
+ memset(&info, 0, sizeof(info));
+
+ static_call(kvm_x86_get_exit_info)(vcpu, (u32 *)&info[0], &info[1],
+ &info[2], (u32 *)&info[3],
+ (u32 *)&info[4]);
+
+ run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
+
+ /*
+ * There's currently space for 13 entries, but 5 are used for the exit
+ * reason and info. Restrict to 4 to reduce the maintenance burden
+ * when expanding kvm_run.emulation_failure in the future.
+ */
+ if (WARN_ON_ONCE(ndata > 4))
+ ndata = 4;
+
+ /* Always include the flags as a 'data' entry. */
+ info_start = 1;
+ run->emulation_failure.flags = 0;
+
+ if (insn_size) {
+ BUILD_BUG_ON((sizeof(run->emulation_failure.insn_size) +
+ sizeof(run->emulation_failure.insn_bytes) != 16));
+ info_start += 2;
+ run->emulation_failure.flags |=
+ KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
+ run->emulation_failure.insn_size = insn_size;
+ memset(run->emulation_failure.insn_bytes, 0x90,
+ sizeof(run->emulation_failure.insn_bytes));
+ memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size);
+ }
+
+ memcpy(&run->internal.data[info_start], info, sizeof(info));
+ memcpy(&run->internal.data[info_start + ARRAY_SIZE(info)], data,
+ ndata * sizeof(data[0]));
+
+ run->emulation_failure.ndata = info_start + ARRAY_SIZE(info) + ndata;
+}
+
+static void prepare_emulation_ctxt_failure_exit(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+
+ prepare_emulation_failure_exit(vcpu, NULL, 0, ctxt->fetch.data,
+ ctxt->fetch.end - ctxt->fetch.data);
+}
+
+void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
+ u8 ndata)
+{
+ prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(__kvm_prepare_emulation_failure_exit);
+
+void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
+{
+ __kvm_prepare_emulation_failure_exit(vcpu, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit);
+
+static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ ++vcpu->stat.insn_emulation_fail;
+ trace_kvm_emulate_insn_failed(vcpu);
+
+ if (emulation_type & EMULTYPE_VMWARE_GP) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return 1;
+ }
+
+ if (kvm->arch.exit_on_emulation_error ||
+ (emulation_type & EMULTYPE_SKIP)) {
+ prepare_emulation_ctxt_failure_exit(vcpu);
+ return 0;
+ }
+
+ kvm_queue_exception(vcpu, UD_VECTOR);
+
+ if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) {
+ prepare_emulation_ctxt_failure_exit(vcpu);
+ return 0;
+ }
+
+ return 1;
+}
+
+static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ int emulation_type)
+{
+ gpa_t gpa = cr2_or_gpa;
+ kvm_pfn_t pfn;
+
+ if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
+ return false;
+
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
+ WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
+ return false;
+
+ if (!vcpu->arch.mmu->root_role.direct) {
+ /*
+ * Write permission should be allowed since only
+ * write accesses need to be emulated.
+ */
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
+
+ /*
+ * If the mapping is invalid in the guest, let the CPU
+ * retry it to regenerate the fault.
+ */
+ if (gpa == INVALID_GPA)
+ return true;
+ }
+
+ /*
+ * Do not retry the unhandleable instruction if it faults on the
+ * read-only host memory, otherwise it will go into an infinite loop:
+ * retry instruction -> write #PF -> emulation fail -> retry
+ * instruction -> ...
+ */
+ pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
+
+ /*
+ * If the instruction failed on the error pfn, it cannot be fixed;
+ * report the error to userspace.
+ */
+ if (is_error_noslot_pfn(pfn))
+ return false;
+
+ kvm_release_pfn_clean(pfn);
+
+ /* The instructions are well-emulated on direct mmu. */
+ if (vcpu->arch.mmu->root_role.direct) {
+ unsigned int indirect_shadow_pages;
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
+ write_unlock(&vcpu->kvm->mmu_lock);
+
+ if (indirect_shadow_pages)
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+
+ return true;
+ }
+
+ /*
+ * If emulation was due to an access to a shadowed page table
+ * and it failed, try to unshadow the page and re-enter the
+ * guest to let the CPU execute the instruction.
+ */
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+
+ /*
+ * If the access faults on its own page table, it cannot
+ * be fixed by unprotecting the shadow page; it should
+ * be reported to userspace.
+ */
+ return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
+}
+
+static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
+ gpa_t cr2_or_gpa, int emulation_type)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa;
+
+ last_retry_eip = vcpu->arch.last_retry_eip;
+ last_retry_addr = vcpu->arch.last_retry_addr;
+
+ /*
+ * If the emulation is caused by a #PF and the faulting instruction
+ * does not write page tables, the VM-exit was caused by shadow-page
+ * protection; we can zap the shadow page and retry the instruction
+ * directly.
+ *
+ * Note: if the guest uses a non-page-table-modifying instruction
+ * on the PDE that points to the instruction, then we will unmap
+ * the instruction and go into an infinite loop. So, we cache the
+ * last retried eip and the last fault address; if we hit the same
+ * eip and address again, we can break out of the potential infinite
+ * loop.
+ */
+ vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
+
+ if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
+ return false;
+
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
+ WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
+ return false;
+
+ if (x86_page_table_writing_insn(ctxt))
+ return false;
+
+ if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa)
+ return false;
+
+ vcpu->arch.last_retry_eip = ctxt->eip;
+ vcpu->arch.last_retry_addr = cr2_or_gpa;
+
+ if (!vcpu->arch.mmu->root_role.direct)
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
+
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+
+ return true;
+}
+
+static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
+static int complete_emulated_pio(struct kvm_vcpu *vcpu);
+
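+/*
+ * Check the four hardware breakpoints against @addr and @type: DR7 bits 0-7
+ * hold the per-breakpoint enable pairs and bits 16-31 the R/W+LEN fields
+ * (4 bits per breakpoint). Return a DR6-style mask of matching breakpoints.
+ */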
+static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
+ unsigned long *db)
+{
+ u32 dr6 = 0;
+ int i;
+ u32 enable, rwlen;
+
+ enable = dr7;
+ rwlen = dr7 >> 16;
+ for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
+ if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
+ dr6 |= (1 << i);
+ return dr6;
+}
+
+static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+ kvm_run->debug.arch.dr6 = DR6_BS | DR6_ACTIVE_LOW;
+ kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ return 0;
+ }
+ kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
+ return 1;
+}
+
+int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
+ int r;
+
+ r = static_call(kvm_x86_skip_emulated_instruction)(vcpu);
+ if (unlikely(!r))
+ return 0;
+
+ kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
+
+ /*
+ * rflags is the old, "raw" value of the flags. The new value has
+ * not been saved yet.
+ *
+ * This is correct even for TF set by the guest, because "the
+ * processor will not generate this exception after the instruction
+ * that sets the TF flag".
+ */
+ if (unlikely(rflags & X86_EFLAGS_TF))
+ r = kvm_vcpu_do_singlestep(vcpu);
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
+
+static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu)
+{
+ u32 shadow;
+
+ if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF)
+ return true;
+
+ /*
+ * Intel CPUs inhibit code #DBs when MOV/POP SS blocking is active,
+ * but AMD CPUs do not. MOV/POP SS blocking is rare, check that first
+ * to avoid the relatively expensive CPUID lookup.
+ */
+ shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+ return (shadow & KVM_X86_SHADOW_INT_MOV_SS) &&
+ guest_cpuid_is_intel(vcpu);
+}
+
+static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu,
+ int emulation_type, int *r)
+{
+ WARN_ON_ONCE(emulation_type & EMULTYPE_NO_DECODE);
+
+ /*
+ * Do not check for code breakpoints if hardware has already done the
+ * checks, as inferred from the emulation type. On NO_DECODE and SKIP,
+ * the instruction has passed all exception checks, and all intercepted
+ * exceptions that trigger emulation have lower priority than code
+ * breakpoints, i.e. the fact that the intercepted exception occurred
+ * means any code breakpoints have already been serviced.
+ *
+ * Note, KVM needs to check for code #DBs on EMULTYPE_TRAP_UD_FORCED as
+ * hardware has checked the RIP of the magic prefix, but not the RIP of
+ * the instruction being emulated. The intent of forced emulation is
+ * to behave as if KVM intercepted the instruction without an exception
+ * and without a prefix.
+ */
+ if (emulation_type & (EMULTYPE_NO_DECODE | EMULTYPE_SKIP |
+ EMULTYPE_TRAP_UD | EMULTYPE_VMWARE_GP | EMULTYPE_PF))
+ return false;
+
+ if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
+ (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
+ struct kvm_run *kvm_run = vcpu->run;
+ unsigned long eip = kvm_get_linear_rip(vcpu);
+ u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
+ vcpu->arch.guest_debug_dr7,
+ vcpu->arch.eff_db);
+
+ if (dr6 != 0) {
+ kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
+ kvm_run->debug.arch.pc = eip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ *r = 0;
+ return true;
+ }
+ }
+
+ if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
+ !kvm_is_code_breakpoint_inhibited(vcpu)) {
+ unsigned long eip = kvm_get_linear_rip(vcpu);
+ u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
+ vcpu->arch.dr7,
+ vcpu->arch.db);
+
+ if (dr6 != 0) {
+ kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
+ *r = 1;
+ return true;
+ }
+ }
+
+ return false;
+}
+
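+/*
+ * The VMware backdoor is accessed via I/O instructions, so when emulating a
+ * #GP on its behalf (EMULTYPE_VMWARE_GP), only the IN/OUT/INS/OUTS family
+ * and RDPMC are eligible; anything else gets the #GP reflected back.
+ */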
+static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
+{
+ switch (ctxt->opcode_len) {
+ case 1:
+ switch (ctxt->b) {
+ case 0xe4: /* IN */
+ case 0xe5:
+ case 0xec:
+ case 0xed:
+ case 0xe6: /* OUT */
+ case 0xe7:
+ case 0xee:
+ case 0xef:
+ case 0x6c: /* INS */
+ case 0x6d:
+ case 0x6e: /* OUTS */
+ case 0x6f:
+ return true;
+ }
+ break;
+ case 2:
+ switch (ctxt->b) {
+ case 0x33: /* RDPMC */
+ return true;
+ }
+ break;
+ }
+
+ return false;
+}
+
+/*
+ * Decode an instruction for emulation. The caller is responsible for handling
+ * code breakpoints. Note, manually detecting code breakpoints is unnecessary
+ * (and wrong) when emulating on an intercepted fault-like exception[*], as
+ * code breakpoints have higher priority and thus have already been done by
+ * hardware.
+ *
+ * [*] Except #MC, which is higher priority, but KVM should never emulate in
+ * response to a machine check.
+ */
+int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
+ void *insn, int insn_len)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+ int r;
+
+ init_emulate_ctxt(vcpu);
+
+ r = x86_decode_insn(ctxt, insn, insn_len, emulation_type);
+
+ trace_kvm_emulate_insn_start(vcpu);
+ ++vcpu->stat.insn_emulation;
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction);
+
+int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ int emulation_type, void *insn, int insn_len)
+{
+ int r;
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+ bool writeback = true;
+
+ if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len)))
+ return 1;
+
+ vcpu->arch.l1tf_flush_l1d = true;
+
+ if (!(emulation_type & EMULTYPE_NO_DECODE)) {
+ kvm_clear_exception_queue(vcpu);
+
+ /*
+ * Return immediately if RIP hits a code breakpoint, as such #DBs
+ * are fault-like and have higher priority than any faults on
+ * the code fetch itself.
+ */
+ if (kvm_vcpu_check_code_breakpoint(vcpu, emulation_type, &r))
+ return r;
+
+ r = x86_decode_emulated_instruction(vcpu, emulation_type,
+ insn, insn_len);
+ if (r != EMULATION_OK) {
+ if ((emulation_type & EMULTYPE_TRAP_UD) ||
+ (emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ if (reexecute_instruction(vcpu, cr2_or_gpa,
+ emulation_type))
+ return 1;
+
+ if (ctxt->have_exception &&
+ !(emulation_type & EMULTYPE_SKIP)) {
+ /*
+ * #UD should result in just EMULATION_FAILED, and trap-like
+ * exceptions should not be encountered during decode.
+ */
+ WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
+ exception_type(ctxt->exception.vector) == EXCPT_TRAP);
+ inject_emulated_exception(vcpu);
+ return 1;
+ }
+ return handle_emulation_failure(vcpu, emulation_type);
+ }
+ }
+
+ if ((emulation_type & EMULTYPE_VMWARE_GP) &&
+ !is_vmware_backdoor_opcode(ctxt)) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return 1;
+ }
+
+ /*
+ * EMULTYPE_SKIP without EMULTYPE_COMPLETE_USER_EXIT is intended for
+ * use *only* by vendor callbacks for kvm_skip_emulated_instruction().
+ * The caller is responsible for updating interruptibility state and
+ * injecting single-step #DBs.
+ */
+ if (emulation_type & EMULTYPE_SKIP) {
+ if (ctxt->mode != X86EMUL_MODE_PROT64)
+ ctxt->eip = (u32)ctxt->_eip;
+ else
+ ctxt->eip = ctxt->_eip;
+
+ if (emulation_type & EMULTYPE_COMPLETE_USER_EXIT) {
+ r = 1;
+ goto writeback;
+ }
+
+ kvm_rip_write(vcpu, ctxt->eip);
+ if (ctxt->eflags & X86_EFLAGS_RF)
+ kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
+ return 1;
+ }
+
+ if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
+ return 1;
+
+ /*
+ * This is needed for the VMware backdoor interface to work, as it
+ * changes register values during the I/O operation.
+ */
+ if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
+ vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
+ emulator_invalidate_register_cache(ctxt);
+ }
+
+restart:
+ if (emulation_type & EMULTYPE_PF) {
+ /* Save the faulting GPA (cr2) in the address field */
+ ctxt->exception.address = cr2_or_gpa;
+
+ /* With shadow page tables, cr2 contains a GVA or nGPA. */
+ if (vcpu->arch.mmu->root_role.direct) {
+ ctxt->gpa_available = true;
+ ctxt->gpa_val = cr2_or_gpa;
+ }
+ } else {
+ /* Sanitize the address out of an abundance of paranoia. */
+ ctxt->exception.address = 0;
+ }
+
+ r = x86_emulate_insn(ctxt);
+
+ if (r == EMULATION_INTERCEPTED)
+ return 1;
+
+ if (r == EMULATION_FAILED) {
+ if (reexecute_instruction(vcpu, cr2_or_gpa, emulation_type))
+ return 1;
+
+ return handle_emulation_failure(vcpu, emulation_type);
+ }
+
+ if (ctxt->have_exception) {
+ WARN_ON_ONCE(vcpu->mmio_needed && !vcpu->mmio_is_write);
+ vcpu->mmio_needed = false;
+ r = 1;
+ inject_emulated_exception(vcpu);
+ } else if (vcpu->arch.pio.count) {
+ if (!vcpu->arch.pio.in) {
+ /* FIXME: return into emulator if single-stepping. */
+ vcpu->arch.pio.count = 0;
+ } else {
+ writeback = false;
+ vcpu->arch.complete_userspace_io = complete_emulated_pio;
+ }
+ r = 0;
+ } else if (vcpu->mmio_needed) {
+ ++vcpu->stat.mmio_exits;
+
+ if (!vcpu->mmio_is_write)
+ writeback = false;
+ r = 0;
+ vcpu->arch.complete_userspace_io = complete_emulated_mmio;
+ } else if (vcpu->arch.complete_userspace_io) {
+ writeback = false;
+ r = 0;
+ } else if (r == EMULATION_RESTART)
+ goto restart;
+ else
+ r = 1;
+
+writeback:
+ if (writeback) {
+ unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
+ toggle_interruptibility(vcpu, ctxt->interruptibility);
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+
+ /*
+ * Note, EXCPT_DB is assumed to be fault-like as the emulator
+ * only supports code breakpoints and general detect #DB, both
+ * of which are fault-like.
+ */
+ if (!ctxt->have_exception ||
+ exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
+ kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
+ if (ctxt->is_branch)
+ kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+ kvm_rip_write(vcpu, ctxt->eip);
+ if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
+ r = kvm_vcpu_do_singlestep(vcpu);
+ static_call_cond(kvm_x86_update_emulated_instruction)(vcpu);
+ __kvm_set_rflags(vcpu, ctxt->eflags);
+ }
+
+ /*
+ * For STI, interrupts are shadowed; so KVM_REQ_EVENT will
+ * do nothing, and it will be requested again as soon as
+ * the shadow expires. But we still need to check here,
+ * because POPF has no interrupt shadow.
+ */
+ if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ } else
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
+
+ return r;
+}
+
+int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type)
+{
+ return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_instruction);
+
+int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
+ void *insn, int insn_len)
+{
+ return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
+
+static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pio.count = 0;
+ return 1;
+}
+
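+/*
+ * Don't skip the OUT instruction unless RIP still points at it; userspace
+ * may have changed the vCPU's state while handling the exit, in which case
+ * skipping would advance past an unrelated instruction.
+ */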
+static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pio.count = 0;
+
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip)))
+ return 1;
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
+ unsigned short port)
+{
+ unsigned long val = kvm_rax_read(vcpu);
+ int ret = emulator_pio_out(vcpu, size, port, &val, 1);
+
+ if (ret)
+ return ret;
+
+ /*
+ * Work around userspace that relies on the old KVM behavior of %rip being
+ * incremented prior to exiting to userspace to handle "OUT 0x7e".
+ */
+ if (port == 0x7e &&
+ kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
+ vcpu->arch.complete_userspace_io =
+ complete_fast_pio_out_port_0x7e;
+ kvm_skip_emulated_instruction(vcpu);
+ } else {
+ vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.complete_userspace_io = complete_fast_pio_out;
+ }
+ return 0;
+}
+
+static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
+{
+ unsigned long val;
+
+ /* We should only ever be called with arch.pio.count equal to 1 */
+ BUG_ON(vcpu->arch.pio.count != 1);
+
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) {
+ vcpu->arch.pio.count = 0;
+ return 1;
+ }
+
+ /* For size less than 4 we merge, else we zero extend */
+ val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
+
+ complete_emulator_pio_in(vcpu, &val);
+ kvm_rax_write(vcpu, val);
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
+ unsigned short port)
+{
+ unsigned long val;
+ int ret;
+
+ /* For size less than 4 we merge, else we zero extend */
+ val = (size < 4) ? kvm_rax_read(vcpu) : 0;
+
+ ret = emulator_pio_in(vcpu, size, port, &val, 1);
+ if (ret) {
+ kvm_rax_write(vcpu, val);
+ return ret;
+ }
+
+ vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.complete_userspace_io = complete_fast_pio_in;
+
+ return 0;
+}
+
+int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
+{
+ int ret;
+
+ if (in)
+ ret = kvm_fast_pio_in(vcpu, size, port);
+ else
+ ret = kvm_fast_pio_out(vcpu, size, port);
+ return ret && kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_fast_pio);
+
+static int kvmclock_cpu_down_prep(unsigned int cpu)
+{
+ __this_cpu_write(cpu_tsc_khz, 0);
+ return 0;
+}
+
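+/*
+ * Refresh this CPU's cached TSC frequency. Runs on the target CPU, via IPI
+ * or CPU-hotplug callback, so the update is ordered against guest entry.
+ * A NULL @data means there is no cpufreq notifier payload; re-query the
+ * current frequency instead.
+ */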
+static void tsc_khz_changed(void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ unsigned long khz;
+
+ WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_CONSTANT_TSC));
+
+ if (data)
+ khz = freq->new;
+ else
+ khz = cpufreq_quick_get(raw_smp_processor_id());
+ if (!khz)
+ khz = tsc_khz;
+ __this_cpu_write(cpu_tsc_khz, khz);
+}
+
+#ifdef CONFIG_X86_64
+static void kvm_hyperv_tsc_notifier(void)
+{
+ struct kvm *kvm;
+ int cpu;
+
+ mutex_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ kvm_make_mclock_inprogress_request(kvm);
+
+ /* no guest entries from this point */
+ hyperv_stop_tsc_emulation();
+
+ /* TSC frequency always matches when on Hyper-V */
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ for_each_present_cpu(cpu)
+ per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+ }
+ kvm_caps.max_guest_tsc_khz = tsc_khz;
+
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ __kvm_start_pvclock_update(kvm);
+ pvclock_update_vm_gtod_copy(kvm);
+ kvm_end_pvclock_update(kvm);
+ }
+
+ mutex_unlock(&kvm_lock);
+}
+#endif
+
+static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
+{
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+ int send_ipi = 0;
+ unsigned long i;
+
+ /*
+ * We allow guests to temporarily run on slowing clocks,
+ * provided we notify them after, or to run on accelerating
+ * clocks, provided we notify them before. Thus time never
+ * goes backwards.
+ *
+ * However, we have a problem. We can't atomically update
+ * the frequency of a given CPU from this function; it is
+ * merely a notifier, which can be called from any CPU.
+ * Changing the TSC frequency at arbitrary points in time
+ * requires a recomputation of local variables related to
+ * the TSC for each VCPU. We must flag these local variables
+ * to be updated and be sure the update takes place with the
+ * new frequency before any guests proceed.
+ *
+ * Unfortunately, the combination of hotplug CPU and frequency
+ * change creates an intractable locking scenario; the order
+ * of when these callouts happen is undefined with respect to
+ * CPU hotplug, and they can race with each other. As such,
+ * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
+ * undefined; you can actually have a CPU frequency change take
+ * place in between the computation of X and the setting of the
+ * variable. To protect against this problem, all updates of
+ * the per_cpu tsc_khz variable are done in an interrupt
+ * protected IPI, and all callers wishing to update the value
+ * must wait for a synchronous IPI to complete (which is trivial
+ * if the caller is on the CPU already). This establishes the
+ * necessary total order on variable updates.
+ *
+ * Note that because a guest time update may take place
+ * anytime after the setting of the VCPU's request bit, the
+ * correct TSC value must be set before the request. However,
+ * to ensure the update actually makes it to any guest which
+ * starts running in hardware virtualization between the set
+ * and the acquisition of the spinlock, we must also ping the
+ * CPU after setting the request bit.
+ *
+ */
+
+ smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
+
+ mutex_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (vcpu->cpu != cpu)
+ continue;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ if (vcpu->cpu != raw_smp_processor_id())
+ send_ipi = 1;
+ }
+ }
+ mutex_unlock(&kvm_lock);
+
+ if (freq->old < freq->new && send_ipi) {
+ /*
+ * We are upscaling the frequency. We must make sure the
+ * guest doesn't see stale kvmclock values while running
+ * with the new frequency; otherwise we risk that the guest
+ * sees time go backwards.
+ *
+ * If we are updating the frequency for another CPU
+ * (which might be in guest context), send an interrupt
+ * to kick that CPU out of guest context. The next time
+ * guest context is entered, kvmclock will be updated,
+ * so the guest will not see stale values.
+ */
+ smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
+ }
+}
+
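+/*
+ * Per the policy described in __kvmclock_cpufreq_notifier(), act on
+ * POSTCHANGE for slowdowns and on PRECHANGE for speedups, i.e. notify
+ * guests after the clock slows and before it accelerates, so guest time
+ * never appears to go backwards.
+ */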
+static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ int cpu;
+
+ if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
+ return 0;
+ if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
+ return 0;
+
+ for_each_cpu(cpu, freq->policy->cpus)
+ __kvmclock_cpufreq_notifier(freq, cpu);
+
+ return 0;
+}
+
+static struct notifier_block kvmclock_cpufreq_notifier_block = {
+ .notifier_call = kvmclock_cpufreq_notifier
+};
+
+static int kvmclock_cpu_online(unsigned int cpu)
+{
+ tsc_khz_changed(NULL);
+ return 0;
+}
+
+static void kvm_timer_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ max_tsc_khz = tsc_khz;
+
+ if (IS_ENABLED(CONFIG_CPU_FREQ)) {
+ struct cpufreq_policy *policy;
+ int cpu;
+
+ cpu = get_cpu();
+ policy = cpufreq_cpu_get(cpu);
+ if (policy) {
+ if (policy->cpuinfo.max_freq)
+ max_tsc_khz = policy->cpuinfo.max_freq;
+ cpufreq_cpu_put(policy);
+ }
+ put_cpu();
+ }
+ cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+
+ cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
+ kvmclock_cpu_online, kvmclock_cpu_down_prep);
+ }
+}
+
+#ifdef CONFIG_X86_64
+static void pvclock_gtod_update_fn(struct work_struct *work)
+{
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ mutex_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+ atomic_set(&kvm_guest_has_master_clock, 0);
+ mutex_unlock(&kvm_lock);
+}
+
+static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
+
+/*
+ * Indirection to move queue_work() out of the tk_core.seq write held
+ * region to prevent possible deadlocks against time accessors which
+ * are invoked with work related locks held.
+ */
+static void pvclock_irq_work_fn(struct irq_work *w)
+{
+ queue_work(system_long_wq, &pvclock_gtod_work);
+}
+
+static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn);
+
+/*
+ * Notification about pvclock gtod data update.
+ */
+static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
+ void *priv)
+{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ struct timekeeper *tk = priv;
+
+ update_pvclock_gtod(tk);
+
+ /*
+ * Disable master clock if host does not trust, or does not use,
+ * a TSC-based clocksource. Delegate queue_work() to irq_work as
+ * this is invoked with tk_core.seq write held.
+ */
+ if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
+ atomic_read(&kvm_guest_has_master_clock) != 0)
+ irq_work_queue(&pvclock_irq_work);
+ return 0;
+}
+
+static struct notifier_block pvclock_gtod_notifier = {
+ .notifier_call = pvclock_gtod_notify,
+};
+#endif
+
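+/*
+ * Copy the vendor module's runtime ops into kvm_x86_ops and retarget every
+ * kvm_x86_<op> static call accordingly: mandatory ops are sanity-checked
+ * for non-NULL, optional ops may be NULL, and _RET0 ops fall back to a
+ * stub that returns 0.
+ */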
+static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
+{
+ memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
+
+#define __KVM_X86_OP(func) \
+ static_call_update(kvm_x86_##func, kvm_x86_ops.func);
+#define KVM_X86_OP(func) \
+ WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
+#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL_RET0(func) \
+ static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
+ (void *)__static_call_return0);
+#include <asm/kvm-x86-ops.h>
+#undef __KVM_X86_OP
+
+ kvm_pmu_ops_update(ops->pmu_ops);
+}
+
+static int kvm_x86_check_processor_compatibility(void)
+{
+ int cpu = smp_processor_id();
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ /*
+ * Compatibility checks are done when loading KVM and when enabling
+ * hardware, e.g. during CPU hotplug, to ensure all online CPUs are
+ * compatible, i.e. KVM should never perform a compatibility check on
+ * an offline CPU.
+ */
+ WARN_ON(!cpu_online(cpu));
+
+ if (__cr4_reserved_bits(cpu_has, c) !=
+ __cr4_reserved_bits(cpu_has, &boot_cpu_data))
+ return -EIO;
+
+ return static_call(kvm_x86_check_processor_compatibility)();
+}
+
+static void kvm_x86_check_cpu_compat(void *ret)
+{
+ *(int *)ret = kvm_x86_check_processor_compatibility();
+}
+
+static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
+{
+ u64 host_pat;
+ int r, cpu;
+
+ if (kvm_x86_ops.hardware_enable) {
+ pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
+ return -EEXIST;
+ }
+
+ /*
+ * KVM explicitly assumes that the guest has an FPU and
+ * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU ioctl explicitly casts the
+ * vCPU's FPU state to an fxregs_state struct.
+ */
+ if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
+ pr_err("inadequate fpu\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n");
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * KVM assumes that PAT entry '0' encodes WB memtype and simply zeroes
+ * the PAT bits in SPTEs. Bail if PAT[0] is programmed to something
+ * other than WB. Note, EPT doesn't utilize the PAT, but don't bother
+ * with an exception. PAT[0] is set to WB on RESET and also by the
+ * kernel, i.e. failure indicates a kernel bug or broken firmware.
+ */
+ if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) ||
+ (host_pat & GENMASK(2, 0)) != 6) {
+ pr_err("host PAT[0] is not WB\n");
+ return -EIO;
+ }
+
+ x86_emulator_cache = kvm_alloc_emulator_cache();
+ if (!x86_emulator_cache) {
+ pr_err("failed to allocate cache for x86 emulator\n");
+ return -ENOMEM;
+ }
+
+ user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
+ if (!user_return_msrs) {
+ pr_err("failed to allocate percpu kvm_user_return_msrs\n");
+ r = -ENOMEM;
+ goto out_free_x86_emulator_cache;
+ }
+ kvm_nr_uret_msrs = 0;
+
+ r = kvm_mmu_vendor_module_init();
+ if (r)
+ goto out_free_percpu;
+
+ if (boot_cpu_has(X86_FEATURE_XSAVE)) {
+ host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
+ }
+
+ rdmsrl_safe(MSR_EFER, &host_efer);
+
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ rdmsrl(MSR_IA32_XSS, host_xss);
+
+ kvm_init_pmu_capability(ops->pmu_ops);
+
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, host_arch_capabilities);
+
+ r = ops->hardware_setup();
+ if (r != 0)
+ goto out_mmu_exit;
+
+ kvm_ops_update(ops);
+
+ for_each_online_cpu(cpu) {
+ smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &r, 1);
+ if (r < 0)
+ goto out_unwind_ops;
+ }
+
+ /*
+ * Point of no return! DO NOT add error paths below this point unless
+ * absolutely necessary, as most operations from this point forward
+ * require unwinding.
+ */
+ kvm_timer_init();
+
+ if (pi_inject_timer == -1)
+ pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER);
+#ifdef CONFIG_X86_64
+ pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
+
+ if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
+ set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
+#endif
+
+ kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
+
+ if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
+ kvm_caps.supported_xss = 0;
+
+#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
+ cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
+#undef __kvm_cpu_cap_has
+
+ if (kvm_caps.has_tsc_control) {
+ /*
+ * Make sure the user can only configure tsc_khz values that
+ * fit into a signed integer.
+ * A min value is not calculated because it will always
+ * be 1 on all machines.
+ */
+ u64 max = min(0x7fffffffULL,
+ __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz));
+ kvm_caps.max_guest_tsc_khz = max;
+ }
+ kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
+ kvm_init_msr_lists();
+ return 0;
+
+out_unwind_ops:
+ kvm_x86_ops.hardware_enable = NULL;
+ static_call(kvm_x86_hardware_unsetup)();
+out_mmu_exit:
+ kvm_mmu_vendor_module_exit();
+out_free_percpu:
+ free_percpu(user_return_msrs);
+out_free_x86_emulator_cache:
+ kmem_cache_destroy(x86_emulator_cache);
+ return r;
+}
+
+int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
+{
+ int r;
+
+ mutex_lock(&vendor_module_lock);
+ r = __kvm_x86_vendor_init(ops);
+ mutex_unlock(&vendor_module_lock);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_x86_vendor_init);
+
+void kvm_x86_vendor_exit(void)
+{
+ kvm_unregister_perf_callbacks();
+
+#ifdef CONFIG_X86_64
+ if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
+ clear_hv_tscchange_cb();
+#endif
+ kvm_lapic_exit();
+
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
+ }
+#ifdef CONFIG_X86_64
+ pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
+ irq_work_sync(&pvclock_irq_work);
+ cancel_work_sync(&pvclock_gtod_work);
+#endif
+ static_call(kvm_x86_hardware_unsetup)();
+ kvm_mmu_vendor_module_exit();
+ free_percpu(user_return_msrs);
+ kmem_cache_destroy(x86_emulator_cache);
+#ifdef CONFIG_KVM_XEN
+ static_key_deferred_flush(&kvm_xen_enabled);
+ WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
+#endif
+ mutex_lock(&vendor_module_lock);
+ kvm_x86_ops.hardware_enable = NULL;
+ mutex_unlock(&vendor_module_lock);
+}
+EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit);
+
+static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
+{
+ /*
+ * The vCPU has halted, e.g. executed HLT. Update the run state if the
+ * local APIC is in-kernel; the run loop will detect the non-runnable
+ * state and halt the vCPU. Exit to userspace if the local APIC is
+ * managed by userspace, in which case userspace is responsible for
+ * handling wake events.
+ */
+ ++vcpu->stat.halt_exits;
+ if (lapic_in_kernel(vcpu)) {
+ vcpu->arch.mp_state = state;
+ return 1;
+ } else {
+ vcpu->run->exit_reason = reason;
+ return 0;
+ }
+}
+
+int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
+{
+ return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip);
+
+int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+{
+ int ret = kvm_skip_emulated_instruction(vcpu);
+ /*
+ * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
+ * KVM_EXIT_DEBUG here.
+ */
+ return kvm_emulate_halt_noskip(vcpu) && ret;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_halt);
+
+int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
+{
+ int ret = kvm_skip_emulated_instruction(vcpu);
+
+ return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD,
+ KVM_EXIT_AP_RESET_HOLD) && ret;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold);
+
+#ifdef CONFIG_X86_64
+static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
+ unsigned long clock_type)
+{
+ struct kvm_clock_pairing clock_pairing;
+ struct timespec64 ts;
+ u64 cycle;
+ int ret;
+
+ if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
+ return -KVM_EOPNOTSUPP;
+
+ /*
+ * When the TSC is in permanent catchup mode, guests won't be able to use
+ * the pvclock_read_retry loop to get a consistent view of pvclock.
+ */
+ if (vcpu->arch.tsc_always_catchup)
+ return -KVM_EOPNOTSUPP;
+
+ if (!kvm_get_walltime_and_clockread(&ts, &cycle))
+ return -KVM_EOPNOTSUPP;
+
+ clock_pairing.sec = ts.tv_sec;
+ clock_pairing.nsec = ts.tv_nsec;
+ clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle);
+ clock_pairing.flags = 0;
+ memset(&clock_pairing.pad, 0, sizeof(clock_pairing.pad));
+
+ ret = 0;
+ if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
+ sizeof(struct kvm_clock_pairing)))
+ ret = -KVM_EFAULT;
+
+ return ret;
+}
+#endif
+
+/*
+ * kvm_pv_kick_cpu_op: Kick a vcpu.
+ *
+ * @apicid - APIC ID of the vCPU to be kicked.
+ */
+static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid)
+{
+ /*
+ * All other fields are unused for APIC_DM_REMRD, but may be consumed by
+ * common code, e.g. for tracing. Defer initialization to the compiler.
+ */
+ struct kvm_lapic_irq lapic_irq = {
+ .delivery_mode = APIC_DM_REMRD,
+ .dest_mode = APIC_DEST_PHYSICAL,
+ .shorthand = APIC_DEST_NOSHORT,
+ .dest_id = apicid,
+ };
+
+ kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
+}
+
+bool kvm_apicv_activated(struct kvm *kvm)
+{
+ return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
+}
+EXPORT_SYMBOL_GPL(kvm_apicv_activated);
+
+bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
+{
+ ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
+ ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu);
+
+ return (vm_reasons | vcpu_reasons) == 0;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated);
+
+static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
+ enum kvm_apicv_inhibit reason, bool set)
+{
+ if (set)
+ __set_bit(reason, inhibits);
+ else
+ __clear_bit(reason, inhibits);
+
+ trace_kvm_apicv_inhibit_changed(reason, set, *inhibits);
+}
+
+static void kvm_apicv_init(struct kvm *kvm)
+{
+ unsigned long *inhibits = &kvm->arch.apicv_inhibit_reasons;
+
+ init_rwsem(&kvm->arch.apicv_update_lock);
+
+ set_or_clear_apicv_inhibit(inhibits, APICV_INHIBIT_REASON_ABSENT, true);
+
+ if (!enable_apicv)
+ set_or_clear_apicv_inhibit(inhibits,
+ APICV_INHIBIT_REASON_DISABLE, true);
+}
+
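+/*
+ * Attempt a directed yield to the vCPU with the given APIC ID, e.g. the
+ * target of a PV kick or IPI. Bail if this pCPU isn't oversubscribed, or
+ * if the target can't be resolved, isn't ready to run, or is this vCPU.
+ */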
+static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
+{
+ struct kvm_vcpu *target = NULL;
+ struct kvm_apic_map *map;
+
+ vcpu->stat.directed_yield_attempted++;
+
+ if (single_task_running())
+ goto no_yield;
+
+ rcu_read_lock();
+ map = rcu_dereference(vcpu->kvm->arch.apic_map);
+
+ if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
+ target = map->phys_map[dest_id]->vcpu;
+
+ rcu_read_unlock();
+
+ if (!target || !READ_ONCE(target->ready))
+ goto no_yield;
+
+ /* Ignore requests to yield to self */
+ if (vcpu == target)
+ goto no_yield;
+
+ if (kvm_vcpu_yield_to(target) <= 0)
+ goto no_yield;
+
+ vcpu->stat.directed_yield_successful++;
+
+no_yield:
+ return;
+}
+
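+/*
+ * Completion callback for hypercalls that exit to userspace, e.g.
+ * KVM_HC_MAP_GPA_RANGE: propagate userspace's return value to the guest's
+ * RAX (truncated to 32 bits outside of 64-bit mode) and skip the hypercall
+ * instruction.
+ */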
+static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
+{
+ u64 ret = vcpu->run->hypercall.ret;
+
+ if (!is_64_bit_mode(vcpu))
+ ret = (u32)ret;
+ kvm_rax_write(vcpu, ret);
+ ++vcpu->stat.hypercalls;
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
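+/*
+ * KVM's native hypercall ABI: number in RAX, args in RBX/RCX/RDX/RSI, return
+ * value in RAX, all truncated to 32 bits for a 32-bit caller. Xen and
+ * Hyper-V hypercalls are dispatched to their own handlers when enabled.
+ */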
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
+{
+ unsigned long nr, a0, a1, a2, a3, ret;
+ int op_64_bit;
+
+ if (kvm_xen_hypercall_enabled(vcpu->kvm))
+ return kvm_xen_hypercall(vcpu);
+
+ if (kvm_hv_hypercall_enabled(vcpu))
+ return kvm_hv_hypercall(vcpu);
+
+ nr = kvm_rax_read(vcpu);
+ a0 = kvm_rbx_read(vcpu);
+ a1 = kvm_rcx_read(vcpu);
+ a2 = kvm_rdx_read(vcpu);
+ a3 = kvm_rsi_read(vcpu);
+
+ trace_kvm_hypercall(nr, a0, a1, a2, a3);
+
+ op_64_bit = is_64_bit_hypercall(vcpu);
+ if (!op_64_bit) {
+ nr &= 0xFFFFFFFF;
+ a0 &= 0xFFFFFFFF;
+ a1 &= 0xFFFFFFFF;
+ a2 &= 0xFFFFFFFF;
+ a3 &= 0xFFFFFFFF;
+ }
+
+ if (static_call(kvm_x86_get_cpl)(vcpu) != 0) {
+ ret = -KVM_EPERM;
+ goto out;
+ }
+
+ ret = -KVM_ENOSYS;
+
+ switch (nr) {
+ case KVM_HC_VAPIC_POLL_IRQ:
+ ret = 0;
+ break;
+ case KVM_HC_KICK_CPU:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
+ break;
+
+ kvm_pv_kick_cpu_op(vcpu->kvm, a1);
+ kvm_sched_yield(vcpu, a1);
+ ret = 0;
+ break;
+#ifdef CONFIG_X86_64
+ case KVM_HC_CLOCK_PAIRING:
+ ret = kvm_pv_clock_pairing(vcpu, a0, a1);
+ break;
+#endif
+ case KVM_HC_SEND_IPI:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
+ break;
+
+ ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
+ break;
+ case KVM_HC_SCHED_YIELD:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
+ break;
+
+ kvm_sched_yield(vcpu, a0);
+ ret = 0;
+ break;
+ case KVM_HC_MAP_GPA_RANGE: {
+ u64 gpa = a0, npages = a1, attrs = a2;
+
+ ret = -KVM_ENOSYS;
+ if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE)))
+ break;
+
+ if (!PAGE_ALIGNED(gpa) || !npages ||
+ gpa_to_gfn(gpa) + npages <= gpa_to_gfn(gpa)) {
+ ret = -KVM_EINVAL;
+ break;
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
+ vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ vcpu->run->hypercall.args[0] = gpa;
+ vcpu->run->hypercall.args[1] = npages;
+ vcpu->run->hypercall.args[2] = attrs;
+ vcpu->run->hypercall.flags = 0;
+ if (op_64_bit)
+ vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE;
+
+ WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ);
+ vcpu->arch.complete_userspace_io = complete_hypercall_exit;
+ return 0;
+ }
+ default:
+ ret = -KVM_ENOSYS;
+ break;
+ }
+out:
+ if (!op_64_bit)
+ ret = (u32)ret;
+ kvm_rax_write(vcpu, ret);
+
+ ++vcpu->stat.hypercalls;
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
+
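+/*
+ * The guest executed the "wrong" vendor's 3-byte hypercall instruction, e.g.
+ * VMCALL on an AMD host. With the quirk enabled, patch the instruction in
+ * guest memory to the host-native one instead of synthesizing a #UD.
+ */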
+static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ char instruction[3];
+ unsigned long rip = kvm_rip_read(vcpu);
+
+ /*
+ * If the quirk is disabled, synthesize a #UD and let the guest pick up
+ * the pieces.
+ */
+ if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) {
+ ctxt->exception.error_code_valid = false;
+ ctxt->exception.vector = UD_VECTOR;
+ ctxt->have_exception = true;
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
+ static_call(kvm_x86_patch_hypercall)(vcpu, instruction);
+
+ return emulator_write_emulated(ctxt, rip, instruction, 3,
+ &ctxt->exception);
+}
+
+static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
+{
+ return vcpu->run->request_interrupt_window &&
+ likely(!pic_in_kernel(vcpu->kvm));
+}
+
+/* Called within kvm->srcu read side. */
+static void post_kvm_run_save(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+
+ kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu);
+ kvm_run->cr8 = kvm_get_cr8(vcpu);
+ kvm_run->apic_base = kvm_get_apic_base(vcpu);
+
+ kvm_run->ready_for_interrupt_injection =
+ pic_in_kernel(vcpu->kvm) ||
+ kvm_vcpu_ready_for_interrupt_injection(vcpu);
+
+ if (is_smm(vcpu))
+ kvm_run->flags |= KVM_RUN_X86_SMM;
+}
+
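+/*
+ * Feed the current TPR and the priority of the highest pending interrupt to
+ * the vendor module so it can adjust the CR8/TPR-threshold intercept. Not
+ * needed if the local APIC is in userspace or APICv virtualizes the TPR.
+ */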
+static void update_cr8_intercept(struct kvm_vcpu *vcpu)
+{
+ int max_irr, tpr;
+
+ if (!kvm_x86_ops.update_cr8_intercept)
+ return;
+
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ if (vcpu->arch.apic->apicv_active)
+ return;
+
+ if (!vcpu->arch.apic->vapic_addr)
+ max_irr = kvm_lapic_find_highest_irr(vcpu);
+ else
+ max_irr = -1;
+
+ if (max_irr != -1)
+ max_irr >>= 4;
+
+ tpr = kvm_lapic_get_cr8(vcpu);
+
+ static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr);
+}
+
+
+int kvm_check_nested_events(struct kvm_vcpu *vcpu)
+{
+ if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+ kvm_x86_ops.nested_ops->triple_fault(vcpu);
+ return 1;
+ }
+
+ return kvm_x86_ops.nested_ops->check_events(vcpu);
+}
+
+static void kvm_inject_exception(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Suppress the error code if the vCPU is in Real Mode, as Real Mode
+ * exceptions don't report error codes. The presence of an error code
+ * is carried with the exception and only stripped when the exception
+ * is injected, as intercepted #PF VM-Exits for AMD's Paged Real Mode
+ * do report an error code despite the CPU being in Real Mode.
+ */
+ vcpu->arch.exception.has_error_code &= is_protmode(vcpu);
+
+ trace_kvm_inj_exception(vcpu->arch.exception.vector,
+ vcpu->arch.exception.has_error_code,
+ vcpu->arch.exception.error_code,
+ vcpu->arch.exception.injected);
+
+ static_call(kvm_x86_inject_exception)(vcpu);
+}
+
+/*
+ * Check for any event (interrupt or exception) that is ready to be injected,
+ * and if there is at least one event, inject the event with the highest
+ * priority. This handles both "pending" events, i.e. events that have never
+ * been injected into the guest, and "injected" events, i.e. events that were
+ * injected as part of a previous VM-Enter, but weren't successfully delivered
+ * and need to be re-injected.
+ *
+ * Note, this is not guaranteed to be invoked on a guest instruction boundary,
+ * i.e. doesn't guarantee that there's an event window in the guest. KVM must
+ * be able to inject exceptions in the "middle" of an instruction, and so must
+ * also be able to re-inject NMIs and IRQs in the middle of an instruction.
+ * I.e. for exceptions and re-injected events, NOT invoking this on instruction
+ * boundaries is necessary and correct.
+ *
+ * For simplicity, KVM uses a single path to inject all events (except events
+ * that are injected directly from L1 to L2) and doesn't explicitly track
+ * instruction boundaries for asynchronous events. However, because VM-Exits
+ * that can occur during instruction execution typically result in KVM skipping
+ * the instruction or injecting an exception, e.g. instruction and exception
+ * intercepts, and because pending exceptions have higher priority than pending
+ * interrupts, KVM still honors instruction boundaries in most scenarios.
+ *
+ * But, if a VM-Exit occurs during instruction execution, and KVM does NOT skip
+ * the instruction or inject an exception, then KVM can incorrectly inject a new
+ * asynchronous event if the event became pending after the CPU fetched the
+ * instruction (in the guest). E.g. if a page fault (#PF, #NPF, EPT violation)
+ * occurs and is resolved by KVM, a coincident NMI, SMI, IRQ, etc... can be
+ * injected on the restarted instruction instead of being deferred until the
+ * instruction completes.
+ *
+ * In practice, this virtualization hole is unlikely to be observed by the
+ * guest, and even less likely to cause functional problems. To detect the
+ * hole, the guest would have to trigger an event on a side effect of an early
+ * phase of instruction execution, e.g. on the instruction fetch from memory.
+ * And for it to be a functional problem, the guest would need to depend on the
+ * ordering between that side effect, the instruction completing, _and_ the
+ * delivery of the asynchronous event.
+ */
+static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
+ bool *req_immediate_exit)
+{
+ bool can_inject;
+ int r;
+
+ /*
+ * Process nested events first, as nested VM-Exit supersedes event
+ * re-injection. If there's an event queued for re-injection, it will
+ * be saved into the appropriate vmc{b,s}12 fields on nested VM-Exit.
+ */
+ if (is_guest_mode(vcpu))
+ r = kvm_check_nested_events(vcpu);
+ else
+ r = 0;
+
+ /*
+ * Re-inject exceptions and events *especially* if immediate entry+exit
+ * to/from L2 is needed, as any event that has already been injected
+ * into L2 needs to complete its lifecycle before injecting a new event.
+ *
+ * Don't re-inject an NMI or interrupt if there is a pending exception.
+ * This collision arises if an exception occurred while vectoring the
+ * injected event, KVM intercepted said exception, and KVM ultimately
+ * determined the fault belongs to the guest and queues the exception
+ * for injection back into the guest.
+ *
+ * "Injected" interrupts can also collide with pending exceptions if
+ * userspace ignores the "ready for injection" flag and blindly queues
+ * an interrupt. In that case, prioritizing the exception is correct,
+ * as the exception "occurred" before the exit to userspace. Trap-like
+ * exceptions, e.g. most #DBs, have higher priority than interrupts.
+ * And while fault-like exceptions, e.g. #GP and #PF, are the lowest
+ * priority, they're only generated (pended) during instruction
+ * execution, and interrupts are recognized at instruction boundaries.
+ * Thus a pending fault-like exception means the fault occurred on the
+ * *previous* instruction and must be serviced prior to recognizing any
+ * new events in order to fully complete the previous instruction.
+ */
+ if (vcpu->arch.exception.injected)
+ kvm_inject_exception(vcpu);
+ else if (kvm_is_exception_pending(vcpu))
+ ; /* see above */
+ else if (vcpu->arch.nmi_injected)
+ static_call(kvm_x86_inject_nmi)(vcpu);
+ else if (vcpu->arch.interrupt.injected)
+ static_call(kvm_x86_inject_irq)(vcpu, true);
+
+ /*
+ * Exceptions that morph to VM-Exits are handled above, and pending
+ * exceptions on top of injected exceptions that do not VM-Exit should
+ * either morph to #DF or, sadly, override the injected exception.
+ */
+ WARN_ON_ONCE(vcpu->arch.exception.injected &&
+ vcpu->arch.exception.pending);
+
+ /*
+ * Bail if immediate entry+exit to/from the guest is needed to complete
+ * nested VM-Enter or event re-injection so that a different pending
+ * event can be serviced (or if KVM needs to exit to userspace).
+ *
+ * Otherwise, continue processing events even if VM-Exit occurred. The
+ * VM-Exit will have cleared exceptions that were meant for L2, but
+ * there may now be events that can be injected into L1.
+ */
+ if (r < 0)
+ goto out;
+
+ /*
+ * A pending exception VM-Exit should either result in nested VM-Exit
+ * or force an immediate re-entry and exit to/from L2, and exception
+ * VM-Exits cannot be injected (flag should _never_ be set).
+ */
+ WARN_ON_ONCE(vcpu->arch.exception_vmexit.injected ||
+ vcpu->arch.exception_vmexit.pending);
+
+ /*
+ * New events, other than exceptions, cannot be injected if KVM needs
+ * to re-inject a previous event. See above comments on re-injecting
+ * for why pending exceptions get priority.
+ */
+ can_inject = !kvm_event_needs_reinjection(vcpu);
+
+ if (vcpu->arch.exception.pending) {
+ /*
+ * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS
+ * value pushed on the stack. Trap-like exceptions and all #DBs
+ * leave RF as-is (KVM follows Intel's behavior in this regard;
+ * AMD states that code breakpoint #DBs explicitly clear RF=0).
+ *
+ * Note, most versions of Intel's SDM and AMD's APM incorrectly
+ * describe the behavior of General Detect #DBs, which are
+ * fault-like. They do _not_ set RF, a la code breakpoints.
+ */
+ if (exception_type(vcpu->arch.exception.vector) == EXCPT_FAULT)
+ __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
+ X86_EFLAGS_RF);
+
+ if (vcpu->arch.exception.vector == DB_VECTOR) {
+ kvm_deliver_exception_payload(vcpu, &vcpu->arch.exception);
+ if (vcpu->arch.dr7 & DR7_GD) {
+ vcpu->arch.dr7 &= ~DR7_GD;
+ kvm_update_dr7(vcpu);
+ }
+ }
+
+ kvm_inject_exception(vcpu);
+
+ vcpu->arch.exception.pending = false;
+ vcpu->arch.exception.injected = true;
+
+ can_inject = false;
+ }
+
+ /* Don't inject interrupts if the user asked to avoid doing so */
+ if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ)
+ return 0;
+
+ /*
+ * Finally, inject interrupt events. If an event cannot be injected
+ * due to architectural conditions (e.g. IF=0) a window-open exit
+ * will re-request KVM_REQ_EVENT. Sometimes however an event is pending
+ * and can architecturally be injected, but we cannot do it right now:
+ * an interrupt could have arrived just now and we have to inject it
+ * as a vmexit, or there could already be an event in the queue, which is
+ * indicated by can_inject. In that case we request an immediate exit
+ * in order to make progress and get back here for another iteration.
+ * The kvm_x86_ops hooks communicate this by returning -EBUSY.
+ */
+#ifdef CONFIG_KVM_SMM
+ if (vcpu->arch.smi_pending) {
+ r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
+ if (r < 0)
+ goto out;
+ if (r) {
+ vcpu->arch.smi_pending = false;
+ ++vcpu->arch.smi_count;
+ enter_smm(vcpu);
+ can_inject = false;
+ } else
+ static_call(kvm_x86_enable_smi_window)(vcpu);
+ }
+#endif
+
+ if (vcpu->arch.nmi_pending) {
+ r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
+ if (r < 0)
+ goto out;
+ if (r) {
+ --vcpu->arch.nmi_pending;
+ vcpu->arch.nmi_injected = true;
+ static_call(kvm_x86_inject_nmi)(vcpu);
+ can_inject = false;
+ WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0);
+ }
+ if (vcpu->arch.nmi_pending)
+ static_call(kvm_x86_enable_nmi_window)(vcpu);
+ }
+
+ if (kvm_cpu_has_injectable_intr(vcpu)) {
+ r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY;
+ if (r < 0)
+ goto out;
+ if (r) {
+ int irq = kvm_cpu_get_interrupt(vcpu);
+
+ if (!WARN_ON_ONCE(irq == -1)) {
+ kvm_queue_interrupt(vcpu, irq, false);
+ static_call(kvm_x86_inject_irq)(vcpu, false);
+ WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
+ }
+ }
+ if (kvm_cpu_has_injectable_intr(vcpu))
+ static_call(kvm_x86_enable_irq_window)(vcpu);
+ }
+
+ if (is_guest_mode(vcpu) &&
+ kvm_x86_ops.nested_ops->has_events &&
+ kvm_x86_ops.nested_ops->has_events(vcpu))
+ *req_immediate_exit = true;
+
+ /*
+ * KVM must never queue a new exception while injecting an event; KVM
+ * is done emulating and should only propagate the to-be-injected event
+ * to the VMCS/VMCB. Queueing a new exception can put the vCPU into an
+ * infinite loop as KVM will bail from VM-Enter to inject the pending
+ * exception and start the cycle all over.
+ *
+ * Exempt triple faults as they have special handling and won't put the
+ * vCPU into an infinite loop. Triple fault can be queued when running
+ * VMX without unrestricted guest, as that requires KVM to emulate Real
+ * Mode events (see kvm_inject_realmode_interrupt()).
+ */
+ WARN_ON_ONCE(vcpu->arch.exception.pending ||
+ vcpu->arch.exception_vmexit.pending);
+ return 0;
+
+out:
+ if (r == -EBUSY) {
+ *req_immediate_exit = true;
+ r = 0;
+ }
+ return r;
+}
+
+static void process_nmi(struct kvm_vcpu *vcpu)
+{
+ unsigned int limit;
+
+ /*
+ * x86 is limited to one NMI pending, but because KVM can't react to
+ * incoming NMIs as quickly as bare metal, e.g. if the vCPU is
+ * scheduled out, KVM needs to play nice with two queued NMIs showing
+ * up at the same time. To handle this scenario, allow two NMIs to be
+ * (temporarily) pending so long as NMIs are not blocked and KVM is not
+ * waiting for a previous NMI injection to complete (which effectively
+ * blocks NMIs). KVM will immediately inject one of the two NMIs, and
+ * will request an NMI window to handle the second NMI.
+ */
+ if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
+ limit = 1;
+ else
+ limit = 2;
+
+ /*
+ * Adjust the limit to account for pending virtual NMIs, which aren't
+ * tracked in vcpu->arch.nmi_pending.
+ */
+ if (static_call(kvm_x86_is_vnmi_pending)(vcpu))
+ limit--;
+
+ vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
+ vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
+
+ if (vcpu->arch.nmi_pending &&
+ (static_call(kvm_x86_set_vnmi_pending)(vcpu)))
+ vcpu->arch.nmi_pending--;
+
+ if (vcpu->arch.nmi_pending)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
+/* Return total number of NMIs pending injection to the VM */
+int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.nmi_pending +
+ static_call(kvm_x86_is_vnmi_pending)(vcpu);
+}
+
+void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
+ unsigned long *vcpu_bitmap)
+{
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, vcpu_bitmap);
+}
+
+void kvm_make_scan_ioapic_request(struct kvm *kvm)
+{
+ kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
+}
+
+void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ bool activate;
+
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ down_read(&vcpu->kvm->arch.apicv_update_lock);
+ preempt_disable();
+
+ /* Do not activate APICv when the APIC is disabled. */
+ activate = kvm_vcpu_apicv_activated(vcpu) &&
+ (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED);
+
+ if (apic->apicv_active == activate)
+ goto out;
+
+ apic->apicv_active = activate;
+ kvm_apic_update_apicv(vcpu);
+ static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
+
+ /*
+ * When APICv gets disabled, we may still have injected interrupts
+ * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was
+ * still active when the interrupt got accepted. Make sure
+ * kvm_check_and_inject_events() is called to check for that.
+ */
+ if (!apic->apicv_active)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+out:
+ preempt_enable();
+ up_read(&vcpu->kvm->arch.apicv_update_lock);
+}
+EXPORT_SYMBOL_GPL(__kvm_vcpu_update_apicv);
+
+static void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
+{
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ /*
+ * Due to sharing page tables across vCPUs, the xAPIC memslot must be
+ * deleted if any vCPU has xAPIC virtualization and x2APIC enabled, but
+ * hardware doesn't support x2APIC virtualization. E.g. some AMD
+ * CPUs support AVIC but not x2APIC. KVM still allows enabling AVIC in
+ * this case so that KVM can use the AVIC doorbell to inject interrupts to
+ * running vCPUs, but KVM must not create SPTEs for the APIC base as
+ * the vCPU would incorrectly be able to access the vAPIC page via MMIO
+ * despite being in x2APIC mode. For simplicity, inhibiting the APIC
+ * access page is sticky.
+ */
+ if (apic_x2apic_mode(vcpu->arch.apic) &&
+ kvm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization)
+ kvm_inhibit_apic_access_page(vcpu);
+
+ __kvm_vcpu_update_apicv(vcpu);
+}
+
+void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
+ enum kvm_apicv_inhibit reason, bool set)
+{
+ unsigned long old, new;
+
+ lockdep_assert_held_write(&kvm->arch.apicv_update_lock);
+
+ if (!(kvm_x86_ops.required_apicv_inhibits & BIT(reason)))
+ return;
+
+ old = new = kvm->arch.apicv_inhibit_reasons;
+
+ set_or_clear_apicv_inhibit(&new, reason, set);
+
+ if (!!old != !!new) {
+ /*
+ * Kick all vCPUs before setting apicv_inhibit_reasons to avoid
+ * false positives in the sanity check WARN in svm_vcpu_run().
+ * This task will wait for all vCPUs to ack the kick IRQ before
+ * updating apicv_inhibit_reasons, and all other vCPUs will
+ * block on acquiring apicv_update_lock so that vCPUs can't
+ * redo svm_vcpu_run() without seeing the new inhibit state.
+ *
+ * Note, holding apicv_update_lock and taking it in the read
+ * side (handling the request) also prevents other vCPUs from
+ * servicing the request with a stale apicv_inhibit_reasons.
+ */
+ kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
+ kvm->arch.apicv_inhibit_reasons = new;
+ if (new) {
+ unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE);
+ int idx = srcu_read_lock(&kvm->srcu);
+
+ kvm_zap_gfn_range(kvm, gfn, gfn+1);
+ srcu_read_unlock(&kvm->srcu, idx);
+ }
+ } else {
+ kvm->arch.apicv_inhibit_reasons = new;
+ }
+}
+
+void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
+ enum kvm_apicv_inhibit reason, bool set)
+{
+ if (!enable_apicv)
+ return;
+
+ down_write(&kvm->arch.apicv_update_lock);
+ __kvm_set_or_clear_apicv_inhibit(kvm, reason, set);
+ up_write(&kvm->arch.apicv_update_lock);
+}
+EXPORT_SYMBOL_GPL(kvm_set_or_clear_apicv_inhibit);
+
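+/*
+ * Rebuild the bitmap of vectors that need an EOI exit, either from the
+ * userspace routing table (split irqchip) or by scanning the in-kernel
+ * IOAPIC. Loading the new exitmap is deferred until the vCPU leaves guest
+ * mode, as L2 runs with its own EOI-exit controls.
+ */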
+static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_apic_present(vcpu))
+ return;
+
+ bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
+
+ if (irqchip_split(vcpu->kvm))
+ kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
+ else {
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
+ if (ioapic_in_kernel(vcpu->kvm))
+ kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
+ }
+
+ if (is_guest_mode(vcpu))
+ vcpu->arch.load_eoi_exitmap_pending = true;
+ else
+ kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
+}
+
+static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
+{
+ u64 eoi_exit_bitmap[4];
+
+ if (!kvm_apic_hw_enabled(vcpu->arch.apic))
+ return;
+
+ if (to_hv_vcpu(vcpu)) {
+ bitmap_or((ulong *)eoi_exit_bitmap,
+ vcpu->arch.ioapic_handled_vectors,
+ to_hv_synic(vcpu)->vec_bitmap, 256);
+ static_call_cond(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
+ return;
+ }
+
+ static_call_cond(kvm_x86_load_eoi_exitmap)(
+ vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
+}
+
+void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
+{
+ static_call_cond(kvm_x86_guest_memory_reclaimed)(kvm);
+}
+
+static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
+{
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
+}
+
+void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
+{
+ smp_send_reschedule(vcpu->cpu);
+}
+EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
+
+/*
+ * Called within kvm->srcu read side.
+ * Returns 1 to let vcpu_run() continue the guest execution loop without
+ * exiting to userspace; otherwise, the value is returned to userspace.
+ */
+static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+{
+ int r;
+ bool req_int_win =
+ dm_request_for_irq_injection(vcpu) &&
+ kvm_cpu_accept_dm_intr(vcpu);
+ fastpath_t exit_fastpath;
+ bool req_immediate_exit = false;
+
+ if (kvm_request_pending(vcpu)) {
+ if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
+ r = -EIO;
+ goto out;
+ }
+
+ if (kvm_dirty_ring_check_request(vcpu)) {
+ r = 0;
+ goto out;
+ }
+
+ if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
+ if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
+ r = 0;
+ goto out;
+ }
+ }
+ if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
+ kvm_mmu_free_obsolete_roots(vcpu);
+ if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
+ __kvm_migrate_timers(vcpu);
+ if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
+ kvm_update_masterclock(vcpu->kvm);
+ if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
+ kvm_gen_kvmclock_update(vcpu);
+ if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
+ r = kvm_guest_time_update(vcpu);
+ if (unlikely(r))
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
+ kvm_mmu_sync_roots(vcpu);
+ if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
+ kvm_mmu_load_pgd(vcpu);
+
+ /*
+ * Note, the order matters here, as flushing "all" TLB entries
+ * also flushes the "current" TLB entries, i.e. servicing the
+ * flush "all" will clear any request to flush "current".
+ */
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
+ kvm_vcpu_flush_tlb_all(vcpu);
+
+ kvm_service_local_tlb_flush_requests(vcpu);
+
+ /*
+ * Fall back to a "full" guest flush if Hyper-V's precise
+ * flushing fails. Note, Hyper-V's flushing is per-vCPU, but
+ * the flushes are considered "remote" and not "local" because
+ * the requests can be initiated from other vCPUs.
+ */
+ if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu) &&
+ kvm_hv_vcpu_flush_tlb(vcpu))
+ kvm_vcpu_flush_tlb_guest(vcpu);
+
+ if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
+ r = 0;
+ goto out;
+ }
+ if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+ if (is_guest_mode(vcpu))
+ kvm_x86_ops.nested_ops->triple_fault(vcpu);
+
+ if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ vcpu->mmio_needed = 0;
+ r = 0;
+ goto out;
+ }
+ }
+ if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
+ /* Page is swapped out. Do synthetic halt */
+ vcpu->arch.apf.halted = true;
+ r = 1;
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
+ record_steal_time(vcpu);
+#ifdef CONFIG_KVM_SMM
+ if (kvm_check_request(KVM_REQ_SMI, vcpu))
+ process_smi(vcpu);
+#endif
+ if (kvm_check_request(KVM_REQ_NMI, vcpu))
+ process_nmi(vcpu);
+ if (kvm_check_request(KVM_REQ_PMU, vcpu))
+ kvm_pmu_handle_event(vcpu);
+ if (kvm_check_request(KVM_REQ_PMI, vcpu))
+ kvm_pmu_deliver_pmi(vcpu);
+ if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
+ BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
+ if (test_bit(vcpu->arch.pending_ioapic_eoi,
+ vcpu->arch.ioapic_handled_vectors)) {
+ vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
+ vcpu->run->eoi.vector =
+ vcpu->arch.pending_ioapic_eoi;
+ r = 0;
+ goto out;
+ }
+ }
+ if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
+ vcpu_scan_ioapic(vcpu);
+ if (kvm_check_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu))
+ vcpu_load_eoi_exitmap(vcpu);
+ if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
+ kvm_vcpu_reload_apic_access_page(vcpu);
+ if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
+ vcpu->run->system_event.ndata = 0;
+ r = 0;
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
+ vcpu->run->system_event.ndata = 0;
+ r = 0;
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) {
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+ vcpu->run->hyperv = hv_vcpu->exit;
+ r = 0;
+ goto out;
+ }
+
+ /*
+ * KVM_REQ_HV_STIMER has to be processed after
+ * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers
+ * depend on the guest clock being up-to-date
+ */
+ if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
+ kvm_hv_process_stimers(vcpu);
+ if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
+ kvm_vcpu_update_apicv(vcpu);
+ if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
+ kvm_check_async_pf_completion(vcpu);
+ if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
+ static_call(kvm_x86_msr_filter_changed)(vcpu);
+
+ if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
+ static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
+ }
+
+ if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
+ kvm_xen_has_interrupt(vcpu)) {
+ ++vcpu->stat.req_event;
+ r = kvm_apic_accept_events(vcpu);
+ if (r < 0) {
+ r = 0;
+ goto out;
+ }
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+ r = 1;
+ goto out;
+ }
+
+ r = kvm_check_and_inject_events(vcpu, &req_immediate_exit);
+ if (r < 0) {
+ r = 0;
+ goto out;
+ }
+ if (req_int_win)
+ static_call(kvm_x86_enable_irq_window)(vcpu);
+
+ if (kvm_lapic_enabled(vcpu)) {
+ update_cr8_intercept(vcpu);
+ kvm_lapic_sync_to_vapic(vcpu);
+ }
+ }
+
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r))
+ goto cancel_injection;
+
+ preempt_disable();
+
+ static_call(kvm_x86_prepare_switch_to_guest)(vcpu);
+
+ /*
+ * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt IPIs
+ * are then delayed until after guest entry, which ensures that they
+ * result in virtual interrupt delivery.
+ */
+ local_irq_disable();
+
+ /* Store vcpu->apicv_active before vcpu->mode. */
+ smp_store_release(&vcpu->mode, IN_GUEST_MODE);
+
+ kvm_vcpu_srcu_read_unlock(vcpu);
+
+ /*
+ * 1) We should set ->mode before checking ->requests. Please see
+ * the comment in kvm_vcpu_exiting_guest_mode().
+ *
+ * 2) For APICv, we should set ->mode before checking PID.ON. This
+ * pairs with the memory barrier implicit in pi_test_and_set_on
+ * (see vmx_deliver_posted_interrupt).
+ *
+ * 3) This also orders the write to mode from any reads to the page
+ * tables done while the VCPU is running. Please see the comment
+ * in kvm_flush_remote_tlbs.
+ */
+ smp_mb__after_srcu_read_unlock();
+
+ /*
+ * Process pending posted interrupts to handle the case where the
+ * notification IRQ arrived in the host, or was never sent (because the
+ * target vCPU wasn't running). Do this regardless of the vCPU's APICv
+ * status, as KVM doesn't update assigned devices when APICv is inhibited,
+ * i.e. they can post interrupts even if APICv is temporarily disabled.
+ */
+ if (kvm_lapic_enabled(vcpu))
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
+
+ if (kvm_vcpu_exit_request(vcpu)) {
+ vcpu->mode = OUTSIDE_GUEST_MODE;
+ smp_wmb();
+ local_irq_enable();
+ preempt_enable();
+ kvm_vcpu_srcu_read_lock(vcpu);
+ r = 1;
+ goto cancel_injection;
+ }
+
+ if (req_immediate_exit) {
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ static_call(kvm_x86_request_immediate_exit)(vcpu);
+ }
+
+ fpregs_assert_state_consistent();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ switch_fpu_return();
+
+ if (vcpu->arch.guest_fpu.xfd_err)
+ wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
+
+ if (unlikely(vcpu->arch.switch_db_regs)) {
+ set_debugreg(0, 7);
+ set_debugreg(vcpu->arch.eff_db[0], 0);
+ set_debugreg(vcpu->arch.eff_db[1], 1);
+ set_debugreg(vcpu->arch.eff_db[2], 2);
+ set_debugreg(vcpu->arch.eff_db[3], 3);
+ } else if (unlikely(hw_breakpoint_active())) {
+ set_debugreg(0, 7);
+ }
+
+ guest_timing_enter_irqoff();
+
+ for (;;) {
+ /*
+ * Assert that vCPU vs. VM APICv state is consistent. An APICv
+ * update must kick and wait for all vCPUs before toggling the
+ * per-VM state, and responding vCPUs must wait for the update
+ * to complete before servicing KVM_REQ_APICV_UPDATE.
+ */
+ WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
+ (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
+
+ exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
+ if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
+ break;
+
+ if (kvm_lapic_enabled(vcpu))
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
+
+ if (unlikely(kvm_vcpu_exit_request(vcpu))) {
+ exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
+ break;
+ }
+
+ /* Note, VM-Exits that go down the "slow" path are accounted below. */
+ ++vcpu->stat.exits;
+ }
+
+ /*
+ * Do this here, before restoring the host's debug registers. Since
+ * this runs before the vmexit is handled, a DR access vmexit can
+ * (a) read the correct value of the debug registers and (b) set
+ * KVM_DEBUGREG_WONT_EXIT again.
+ */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
+ WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
+ static_call(kvm_x86_sync_dirty_debug_regs)(vcpu);
+ kvm_update_dr0123(vcpu);
+ kvm_update_dr7(vcpu);
+ }
+
+ /*
+ * If the guest has used debug registers, at least dr7 will be
+ * disabled while returning to the host. If we don't have active
+ * breakpoints in the host, we don't care about the messed-up debug
+ * address registers. But if we have some of them active, restore
+ * the old state.
+ */
+ if (hw_breakpoint_active())
+ hw_breakpoint_restore();
+
+ vcpu->arch.last_vmentry_cpu = vcpu->cpu;
+ vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+
+ vcpu->mode = OUTSIDE_GUEST_MODE;
+ smp_wmb();
+
+ /*
+ * Sync xfd before calling handle_exit_irqoff() which may
+ * rely on the fact that guest_fpu::xfd is up-to-date (e.g.
+ * in #NM irqoff handler).
+ */
+ if (vcpu->arch.xfd_no_write_intercept)
+ fpu_sync_guest_vmexit_xfd_state();
+
+ static_call(kvm_x86_handle_exit_irqoff)(vcpu);
+
+ if (vcpu->arch.guest_fpu.xfd_err)
+ wrmsrl(MSR_IA32_XFD_ERR, 0);
+
+ /*
+ * Consume any pending interrupts, including the possible source of
+ * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
+ * An instruction is required after local_irq_enable() to fully unblock
+ * interrupts on processors that implement an interrupt shadow; the
+ * stat.exits increment will do nicely.
+ */
+ kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
+ local_irq_enable();
+ ++vcpu->stat.exits;
+ local_irq_disable();
+ kvm_after_interrupt(vcpu);
+
+ /*
+ * Wait until after servicing IRQs to account guest time so that any
+ * ticks that occurred while running the guest are properly accounted
+ * to the guest. Waiting until IRQs are enabled degrades the accuracy
+ * of accounting via context tracking, but the loss of accuracy is
+ * acceptable for all known use cases.
+ */
+ guest_timing_exit_irqoff();
+
+ local_irq_enable();
+ preempt_enable();
+
+ kvm_vcpu_srcu_read_lock(vcpu);
+
+ /*
+ * Profile KVM exit RIPs:
+ */
+ if (unlikely(prof_on == KVM_PROFILING)) {
+ unsigned long rip = kvm_rip_read(vcpu);
+ profile_hit(KVM_PROFILING, (void *)rip);
+ }
+
+ if (unlikely(vcpu->arch.tsc_always_catchup))
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+
+ if (vcpu->arch.apic_attention)
+ kvm_lapic_sync_from_vapic(vcpu);
+
+ r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath);
+ return r;
+
+cancel_injection:
+ if (req_immediate_exit)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ static_call(kvm_x86_cancel_injection)(vcpu);
+ if (unlikely(vcpu->arch.apic_attention))
+ kvm_lapic_sync_from_vapic(vcpu);
+out:
+ return r;
+}
+
+/* Called within kvm->srcu read side. */
+static inline int vcpu_block(struct kvm_vcpu *vcpu)
+{
+ bool hv_timer;
+
+ if (!kvm_arch_vcpu_runnable(vcpu)) {
+ /*
+ * Switch to the software timer before halt-polling/blocking as
+ * the guest's timer may be a break event for the vCPU, and the
+ * hypervisor timer runs only when the CPU is in guest mode.
+ * Switch before halt-polling so that KVM recognizes an expired
+ * timer before blocking.
+ */
+ hv_timer = kvm_lapic_hv_timer_in_use(vcpu);
+ if (hv_timer)
+ kvm_lapic_switch_to_sw_timer(vcpu);
+
+ kvm_vcpu_srcu_read_unlock(vcpu);
+ if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+ kvm_vcpu_halt(vcpu);
+ else
+ kvm_vcpu_block(vcpu);
+ kvm_vcpu_srcu_read_lock(vcpu);
+
+ if (hv_timer)
+ kvm_lapic_switch_to_hv_timer(vcpu);
+
+ /*
+ * If the vCPU is not runnable, a signal or another host event
+ * of some kind is pending; service it without changing the
+ * vCPU's activity state.
+ */
+ if (!kvm_arch_vcpu_runnable(vcpu))
+ return 1;
+ }
+
+ /*
+ * Evaluate nested events before exiting the halted state. This allows
+ * the halt state to be recorded properly in the VMCS12's activity
+ * state field (AMD does not have a similar field and a VM-Exit always
+ * causes a spurious wakeup from HLT).
+ */
+ if (is_guest_mode(vcpu)) {
+ if (kvm_check_nested_events(vcpu) < 0)
+ return 0;
+ }
+
+ if (kvm_apic_accept_events(vcpu) < 0)
+ return 0;
+ switch (vcpu->arch.mp_state) {
+ case KVM_MP_STATE_HALTED:
+ case KVM_MP_STATE_AP_RESET_HOLD:
+ vcpu->arch.pv.pv_unhalted = false;
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ fallthrough;
+ case KVM_MP_STATE_RUNNABLE:
+ vcpu->arch.apf.halted = false;
+ break;
+ case KVM_MP_STATE_INIT_RECEIVED:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+ return 1;
+}
+
+static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
+{
+ return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
+ !vcpu->arch.apf.halted);
+}
+
+/* Called within kvm->srcu read side. */
+static int vcpu_run(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ vcpu->arch.l1tf_flush_l1d = true;
+
+ for (;;) {
+ /*
+ * If another guest vCPU requests a PV TLB flush in the middle
+ * of instruction emulation, the rest of the emulation could
+ * use a stale page translation. Assume that any code after
+ * this point can start executing an instruction.
+ */
+ vcpu->arch.at_instruction_boundary = false;
+ if (kvm_vcpu_running(vcpu)) {
+ r = vcpu_enter_guest(vcpu);
+ } else {
+ r = vcpu_block(vcpu);
+ }
+
+ if (r <= 0)
+ break;
+
+ kvm_clear_request(KVM_REQ_UNBLOCK, vcpu);
+ if (kvm_xen_has_pending_events(vcpu))
+ kvm_xen_inject_pending_events(vcpu);
+
+ if (kvm_cpu_has_pending_timer(vcpu))
+ kvm_inject_pending_timer_irqs(vcpu);
+
+ if (dm_request_for_irq_injection(vcpu) &&
+ kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
+ r = 0;
+ vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
+ ++vcpu->stat.request_irq_exits;
+ break;
+ }
+
+ if (__xfer_to_guest_mode_work_pending()) {
+ kvm_vcpu_srcu_read_unlock(vcpu);
+ r = xfer_to_guest_mode_handle_work(vcpu);
+ kvm_vcpu_srcu_read_lock(vcpu);
+ if (r)
+ return r;
+ }
+ }
+
+ return r;
+}
+
+static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
+}
+
+static int complete_emulated_pio(struct kvm_vcpu *vcpu)
+{
+ BUG_ON(!vcpu->arch.pio.count);
+
+ return complete_emulated_io(vcpu);
+}
+
+/*
+ * Implements the following, as a state machine:
+ *
+ * read:
+ * for each fragment
+ * for each mmio piece in the fragment
+ * write gpa, len
+ * exit
+ * copy data
+ * execute insn
+ *
+ * write:
+ * for each fragment
+ * for each mmio piece in the fragment
+ * write gpa, len
+ * copy data
+ * exit
+ */
+static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+ struct kvm_mmio_fragment *frag;
+ unsigned len;
+
+ BUG_ON(!vcpu->mmio_needed);
+
+ /* Complete previous fragment */
+ frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
+ len = min(8u, frag->len);
+ if (!vcpu->mmio_is_write)
+ memcpy(frag->data, run->mmio.data, len);
+
+ if (frag->len <= 8) {
+ /* Switch to the next fragment. */
+ frag++;
+ vcpu->mmio_cur_fragment++;
+ } else {
+ /* Go forward to the next mmio piece. */
+ frag->data += len;
+ frag->gpa += len;
+ frag->len -= len;
+ }
+
+ if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
+ vcpu->mmio_needed = 0;
+
+ /* FIXME: return into emulator if single-stepping. */
+ if (vcpu->mmio_is_write)
+ return 1;
+ vcpu->mmio_read_completed = 1;
+ return complete_emulated_io(vcpu);
+ }
+
+ run->exit_reason = KVM_EXIT_MMIO;
+ run->mmio.phys_addr = frag->gpa;
+ if (vcpu->mmio_is_write)
+ memcpy(run->mmio.data, frag->data, min(8u, frag->len));
+ run->mmio.len = min(8u, frag->len);
+ run->mmio.is_write = vcpu->mmio_is_write;
+ vcpu->arch.complete_userspace_io = complete_emulated_mmio;
+ return 0;
+}
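+
+/*
+ * Userspace's half of this protocol, as an illustrative sketch: on a
+ * KVM_EXIT_MMIO, service the access described by the mmap'd kvm_run area
+ * and invoke KVM_RUN again so the state machine above can advance to the
+ * next piece/fragment. "device_read"/"device_write" are hypothetical
+ * device-model helpers, not a KVM API.
+ *
+ *	case KVM_EXIT_MMIO:
+ *		if (run->mmio.is_write)
+ *			device_write(run->mmio.phys_addr,
+ *				     run->mmio.data, run->mmio.len);
+ *		else
+ *			device_read(run->mmio.phys_addr,
+ *				    run->mmio.data, run->mmio.len);
+ *		break;
+ */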
+
+/* Swap (qemu) user FPU context for the guest FPU context. */
+static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
+{
+ /* Exclude PKRU, it's restored separately immediately after VM-Exit. */
+ fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
+ trace_kvm_fpu(1);
+}
+
+/* When vcpu_run ends, restore user space FPU context. */
+static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
+{
+ fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
+ ++vcpu->stat.fpu_reload;
+ trace_kvm_fpu(0);
+}
+
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
+{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception;
+ struct kvm_run *kvm_run = vcpu->run;
+ int r;
+
+ vcpu_load(vcpu);
+ kvm_sigset_activate(vcpu);
+ kvm_run->flags = 0;
+ kvm_load_guest_fpu(vcpu);
+
+ kvm_vcpu_srcu_read_lock(vcpu);
+ if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
+ if (kvm_run->immediate_exit) {
+ r = -EINTR;
+ goto out;
+ }
+
+ /*
+ * Don't bother switching APIC timer emulation from the
+ * hypervisor timer to the software timer, the only way for the
+ * APIC timer to be active is if userspace stuffed vCPU state,
+ * i.e. put the vCPU into a nonsensical state. Only an INIT
+ * will transition the vCPU out of UNINITIALIZED (without more
+ * state stuffing from userspace), which will reset the local
+ * APIC and thus cancel the timer or drop the IRQ (if the timer
+ * already expired).
+ */
+ kvm_vcpu_srcu_read_unlock(vcpu);
+ kvm_vcpu_block(vcpu);
+ kvm_vcpu_srcu_read_lock(vcpu);
+
+ if (kvm_apic_accept_events(vcpu) < 0) {
+ r = 0;
+ goto out;
+ }
+ r = -EAGAIN;
+ if (signal_pending(current)) {
+ r = -EINTR;
+ kvm_run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.signal_exits;
+ }
+ goto out;
+ }
+
+ if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) ||
+ (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (kvm_run->kvm_dirty_regs) {
+ r = sync_regs(vcpu);
+ if (r != 0)
+ goto out;
+ }
+
+ /* re-sync apic's tpr */
+ if (!lapic_in_kernel(vcpu)) {
+ if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
+ r = -EINVAL;
+ goto out;
+ }
+ }
+
+ /*
+ * If userspace set a pending exception and L2 is active, convert it to
+ * a pending VM-Exit if L1 wants to intercept the exception.
+ */
+ if (vcpu->arch.exception_from_userspace && is_guest_mode(vcpu) &&
+ kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, ex->vector,
+ ex->error_code)) {
+ kvm_queue_exception_vmexit(vcpu, ex->vector,
+ ex->has_error_code, ex->error_code,
+ ex->has_payload, ex->payload);
+ ex->injected = false;
+ ex->pending = false;
+ }
+ vcpu->arch.exception_from_userspace = false;
+
+ if (unlikely(vcpu->arch.complete_userspace_io)) {
+ int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
+ vcpu->arch.complete_userspace_io = NULL;
+ r = cui(vcpu);
+ if (r <= 0)
+ goto out;
+ } else {
+ WARN_ON_ONCE(vcpu->arch.pio.count);
+ WARN_ON_ONCE(vcpu->mmio_needed);
+ }
+
+ if (kvm_run->immediate_exit) {
+ r = -EINTR;
+ goto out;
+ }
+
+ r = static_call(kvm_x86_vcpu_pre_run)(vcpu);
+ if (r <= 0)
+ goto out;
+
+ r = vcpu_run(vcpu);
+
+out:
+ kvm_put_guest_fpu(vcpu);
+ if (kvm_run->kvm_valid_regs)
+ store_regs(vcpu);
+ post_kvm_run_save(vcpu);
+ kvm_vcpu_srcu_read_unlock(vcpu);
+
+ kvm_sigset_deactivate(vcpu);
+ vcpu_put(vcpu);
+ return r;
+}
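+
+/*
+ * An illustrative userspace sketch (not KVM code) of driving this ioctl:
+ * mmap the vCPU's kvm_run area, then issue KVM_RUN in a loop and dispatch
+ * on the exit reason filled in above. "kvm_fd" (the /dev/kvm fd) and
+ * "vcpu_fd" (from KVM_CREATE_VCPU) are assumed to exist.
+ *
+ *	int size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
+ *	struct kvm_run *run = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ *				   MAP_SHARED, vcpu_fd, 0);
+ *
+ *	for (;;) {
+ *		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0 && errno != EINTR)
+ *			err(1, "KVM_RUN");
+ *		switch (run->exit_reason) {
+ *		case KVM_EXIT_IO:	... emulate PIO ...	break;
+ *		case KVM_EXIT_MMIO:	... emulate MMIO ...	break;
+ *		case KVM_EXIT_SHUTDOWN:	return;
+ *		}
+ *	}
+ */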
+
+static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
+ /*
+ * We are here if userspace calls get_regs() in the middle of
+ * instruction emulation. Register state needs to be copied
+ * back from the emulation context to the vcpu. Userspace
+ * shouldn't usually do that, but some badly designed PV
+ * devices (the vmware backdoor interface) need this to work.
+ */
+ emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+ }
+ regs->rax = kvm_rax_read(vcpu);
+ regs->rbx = kvm_rbx_read(vcpu);
+ regs->rcx = kvm_rcx_read(vcpu);
+ regs->rdx = kvm_rdx_read(vcpu);
+ regs->rsi = kvm_rsi_read(vcpu);
+ regs->rdi = kvm_rdi_read(vcpu);
+ regs->rsp = kvm_rsp_read(vcpu);
+ regs->rbp = kvm_rbp_read(vcpu);
+#ifdef CONFIG_X86_64
+ regs->r8 = kvm_r8_read(vcpu);
+ regs->r9 = kvm_r9_read(vcpu);
+ regs->r10 = kvm_r10_read(vcpu);
+ regs->r11 = kvm_r11_read(vcpu);
+ regs->r12 = kvm_r12_read(vcpu);
+ regs->r13 = kvm_r13_read(vcpu);
+ regs->r14 = kvm_r14_read(vcpu);
+ regs->r15 = kvm_r15_read(vcpu);
+#endif
+
+ regs->rip = kvm_rip_read(vcpu);
+ regs->rflags = kvm_get_rflags(vcpu);
+}
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ vcpu_load(vcpu);
+ __get_regs(vcpu, regs);
+ vcpu_put(vcpu);
+ return 0;
+}
+
+static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+
+ kvm_rax_write(vcpu, regs->rax);
+ kvm_rbx_write(vcpu, regs->rbx);
+ kvm_rcx_write(vcpu, regs->rcx);
+ kvm_rdx_write(vcpu, regs->rdx);
+ kvm_rsi_write(vcpu, regs->rsi);
+ kvm_rdi_write(vcpu, regs->rdi);
+ kvm_rsp_write(vcpu, regs->rsp);
+ kvm_rbp_write(vcpu, regs->rbp);
+#ifdef CONFIG_X86_64
+ kvm_r8_write(vcpu, regs->r8);
+ kvm_r9_write(vcpu, regs->r9);
+ kvm_r10_write(vcpu, regs->r10);
+ kvm_r11_write(vcpu, regs->r11);
+ kvm_r12_write(vcpu, regs->r12);
+ kvm_r13_write(vcpu, regs->r13);
+ kvm_r14_write(vcpu, regs->r14);
+ kvm_r15_write(vcpu, regs->r15);
+#endif
+
+ kvm_rip_write(vcpu, regs->rip);
+ kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
+
+ vcpu->arch.exception.pending = false;
+ vcpu->arch.exception_vmexit.pending = false;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ vcpu_load(vcpu);
+ __set_regs(vcpu, regs);
+ vcpu_put(vcpu);
+ return 0;
+}
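+
+/*
+ * From userspace, the pair above is reached via the KVM_GET_REGS and
+ * KVM_SET_REGS vCPU ioctls. A minimal round-trip sketch, assuming an
+ * existing vcpu_fd, that e.g. skips over a two-byte instruction:
+ *
+ *	struct kvm_regs regs;
+ *
+ *	ioctl(vcpu_fd, KVM_GET_REGS, &regs);
+ *	regs.rip += 2;
+ *	ioctl(vcpu_fd, KVM_SET_REGS, &regs);
+ */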
+
+static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+ struct desc_ptr dt;
+
+ if (vcpu->arch.guest_state_protected)
+ goto skip_protected_regs;
+
+ kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
+ kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+ kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+ kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+ kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+ kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+
+ kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+ kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+
+ static_call(kvm_x86_get_idt)(vcpu, &dt);
+ sregs->idt.limit = dt.size;
+ sregs->idt.base = dt.address;
+ static_call(kvm_x86_get_gdt)(vcpu, &dt);
+ sregs->gdt.limit = dt.size;
+ sregs->gdt.base = dt.address;
+
+ sregs->cr2 = vcpu->arch.cr2;
+ sregs->cr3 = kvm_read_cr3(vcpu);
+
+skip_protected_regs:
+ sregs->cr0 = kvm_read_cr0(vcpu);
+ sregs->cr4 = kvm_read_cr4(vcpu);
+ sregs->cr8 = kvm_get_cr8(vcpu);
+ sregs->efer = vcpu->arch.efer;
+ sregs->apic_base = kvm_get_apic_base(vcpu);
+}
+
+static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+ __get_sregs_common(vcpu, sregs);
+
+ if (vcpu->arch.guest_state_protected)
+ return;
+
+ if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
+ set_bit(vcpu->arch.interrupt.nr,
+ (unsigned long *)sregs->interrupt_bitmap);
+}
+
+static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
+{
+ int i;
+
+ __get_sregs_common(vcpu, (struct kvm_sregs *)sregs2);
+
+ if (vcpu->arch.guest_state_protected)
+ return;
+
+ if (is_pae_paging(vcpu)) {
+ for (i = 0 ; i < 4 ; i++)
+ sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i);
+ sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
+ }
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ vcpu_load(vcpu);
+ __get_sregs(vcpu, sregs);
+ vcpu_put(vcpu);
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state)
+{
+ int r;
+
+ vcpu_load(vcpu);
+ if (kvm_mpx_supported())
+ kvm_load_guest_fpu(vcpu);
+
+ r = kvm_apic_accept_events(vcpu);
+ if (r < 0)
+ goto out;
+ r = 0;
+
+ if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED ||
+ vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) &&
+ vcpu->arch.pv.pv_unhalted)
+ mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
+ else
+ mp_state->mp_state = vcpu->arch.mp_state;
+
+out:
+ if (kvm_mpx_supported())
+ kvm_put_guest_fpu(vcpu);
+ vcpu_put(vcpu);
+ return r;
+}
+
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state)
+{
+ int ret = -EINVAL;
+
+ vcpu_load(vcpu);
+
+ switch (mp_state->mp_state) {
+ case KVM_MP_STATE_UNINITIALIZED:
+ case KVM_MP_STATE_HALTED:
+ case KVM_MP_STATE_AP_RESET_HOLD:
+ case KVM_MP_STATE_INIT_RECEIVED:
+ case KVM_MP_STATE_SIPI_RECEIVED:
+ if (!lapic_in_kernel(vcpu))
+ goto out;
+ break;
+
+ case KVM_MP_STATE_RUNNABLE:
+ break;
+
+ default:
+ goto out;
+ }
+
+ /*
+ * Pending INITs are reported using KVM_SET_VCPU_EVENTS, disallow
+ * forcing the guest into INIT/SIPI if those events are supposed to be
+ * blocked. KVM prioritizes SMI over INIT, so reject INIT/SIPI state
+ * if an SMI is pending as well.
+ */
+ if ((!kvm_apic_init_sipi_allowed(vcpu) || vcpu->arch.smi_pending) &&
+ (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
+ mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
+ goto out;
+
+ if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
+ vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+ set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
+ } else {
+ vcpu->arch.mp_state = mp_state->mp_state;
+ }
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ ret = 0;
+out:
+ vcpu_put(vcpu);
+ return ret;
+}
+
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
+ int reason, bool has_error_code, u32 error_code)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+ int ret;
+
+ init_emulate_ctxt(vcpu);
+
+ ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
+ has_error_code, error_code);
+ if (ret) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return 0;
+ }
+
+ kvm_rip_write(vcpu, ctxt->eip);
+ kvm_set_rflags(vcpu, ctxt->eflags);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(kvm_task_switch);
+
+static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+ if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
+ /*
+ * When EFER.LME and CR0.PG are set, the processor is in
+ * 64-bit mode (though maybe in a 32-bit code segment).
+ * CR4.PAE and EFER.LMA must be set.
+ */
+ if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA))
+ return false;
+ if (kvm_vcpu_is_illegal_gpa(vcpu, sregs->cr3))
+ return false;
+ } else {
+ /*
+ * Not in 64-bit mode: EFER.LMA is clear and the code
+ * segment cannot be 64-bit.
+ */
+ if (sregs->efer & EFER_LMA || sregs->cs.l)
+ return false;
+ }
+
+ return kvm_is_valid_cr4(vcpu, sregs->cr4) &&
+ kvm_is_valid_cr0(vcpu, sregs->cr0);
+}
+
+static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
+ int *mmu_reset_needed, bool update_pdptrs)
+{
+ struct msr_data apic_base_msr;
+ int idx;
+ struct desc_ptr dt;
+
+ if (!kvm_is_valid_sregs(vcpu, sregs))
+ return -EINVAL;
+
+ apic_base_msr.data = sregs->apic_base;
+ apic_base_msr.host_initiated = true;
+ if (kvm_set_apic_base(vcpu, &apic_base_msr))
+ return -EINVAL;
+
+ if (vcpu->arch.guest_state_protected)
+ return 0;
+
+ dt.size = sregs->idt.limit;
+ dt.address = sregs->idt.base;
+ static_call(kvm_x86_set_idt)(vcpu, &dt);
+ dt.size = sregs->gdt.limit;
+ dt.address = sregs->gdt.base;
+ static_call(kvm_x86_set_gdt)(vcpu, &dt);
+
+ vcpu->arch.cr2 = sregs->cr2;
+ *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
+ vcpu->arch.cr3 = sregs->cr3;
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+ static_call_cond(kvm_x86_post_set_cr3)(vcpu, sregs->cr3);
+
+ kvm_set_cr8(vcpu, sregs->cr8);
+
+ *mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
+ static_call(kvm_x86_set_efer)(vcpu, sregs->efer);
+
+ *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
+ static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
+ vcpu->arch.cr0 = sregs->cr0;
+
+ *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
+ static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
+
+ if (update_pdptrs) {
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ if (is_pae_paging(vcpu)) {
+ load_pdptrs(vcpu, kvm_read_cr3(vcpu));
+ *mmu_reset_needed = 1;
+ }
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ }
+
+ kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
+ kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+ kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+ kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+ kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+ kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+
+ kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+ kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+
+ update_cr8_intercept(vcpu);
+
+ /* Older userspace won't unhalt the vcpu on reset. */
+ if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
+ sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
+ !is_protmode(vcpu))
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+ return 0;
+}
+
+static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+ int pending_vec, max_bits;
+ int mmu_reset_needed = 0;
+ int ret = __set_sregs_common(vcpu, sregs, &mmu_reset_needed, true);
+
+ if (ret)
+ return ret;
+
+ if (mmu_reset_needed)
+ kvm_mmu_reset_context(vcpu);
+
+ max_bits = KVM_NR_INTERRUPTS;
+ pending_vec = find_first_bit(
+ (const unsigned long *)sregs->interrupt_bitmap, max_bits);
+
+ if (pending_vec < max_bits) {
+ kvm_queue_interrupt(vcpu, pending_vec, false);
+ pr_debug("Set back pending irq %d\n", pending_vec);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
+ return 0;
+}
+
+static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
+{
+ int mmu_reset_needed = 0;
+ bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
+ bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) &&
+ !(sregs2->efer & EFER_LMA);
+ int i, ret;
+
+ if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID)
+ return -EINVAL;
+
+ if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected))
+ return -EINVAL;
+
+ ret = __set_sregs_common(vcpu, (struct kvm_sregs *)sregs2,
+ &mmu_reset_needed, !valid_pdptrs);
+ if (ret)
+ return ret;
+
+ if (valid_pdptrs) {
+ for (i = 0; i < 4 ; i++)
+ kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]);
+
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+ mmu_reset_needed = 1;
+ vcpu->arch.pdptrs_from_userspace = true;
+ }
+ if (mmu_reset_needed)
+ kvm_mmu_reset_context(vcpu);
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ int ret;
+
+ vcpu_load(vcpu);
+ ret = __set_sregs(vcpu, sregs);
+ vcpu_put(vcpu);
+ return ret;
+}
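+
+/*
+ * Userspace sketch for the corresponding KVM_GET_SREGS/KVM_SET_SREGS
+ * ioctls, e.g. pointing CS at linear address 0 before starting
+ * real-mode guest code (vcpu_fd assumed):
+ *
+ *	struct kvm_sregs sregs;
+ *
+ *	ioctl(vcpu_fd, KVM_GET_SREGS, &sregs);
+ *	sregs.cs.base = 0;
+ *	sregs.cs.selector = 0;
+ *	ioctl(vcpu_fd, KVM_SET_SREGS, &sregs);
+ */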
+
+static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm)
+{
+ bool set = false;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ if (!enable_apicv)
+ return;
+
+ down_write(&kvm->arch.apicv_update_lock);
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) {
+ set = true;
+ break;
+ }
+ }
+ __kvm_set_or_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_BLOCKIRQ, set);
+ up_write(&kvm->arch.apicv_update_lock);
+}
+
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg)
+{
+ unsigned long rflags;
+ int i, r;
+
+ if (vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
+ vcpu_load(vcpu);
+
+ if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
+ r = -EBUSY;
+ if (kvm_is_exception_pending(vcpu))
+ goto out;
+ if (dbg->control & KVM_GUESTDBG_INJECT_DB)
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ else
+ kvm_queue_exception(vcpu, BP_VECTOR);
+ }
+
+ /*
+ * Read rflags as long as potentially injected trace flags are still
+ * filtered out.
+ */
+ rflags = kvm_get_rflags(vcpu);
+
+ vcpu->guest_debug = dbg->control;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
+ vcpu->guest_debug = 0;
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
+ for (i = 0; i < KVM_NR_DB_REGS; ++i)
+ vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
+ vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
+ } else {
+ for (i = 0; i < KVM_NR_DB_REGS; i++)
+ vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+ }
+ kvm_update_dr7(vcpu);
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ vcpu->arch.singlestep_rip = kvm_get_linear_rip(vcpu);
+
+ /*
+ * Trigger an rflags update that will inject or remove the trace
+ * flags.
+ */
+ kvm_set_rflags(vcpu, rflags);
+
+ static_call(kvm_x86_update_exception_bitmap)(vcpu);
+
+ kvm_arch_vcpu_guestdbg_update_apicv_inhibit(vcpu->kvm);
+
+ r = 0;
+
+out:
+ vcpu_put(vcpu);
+ return r;
+}
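+
+/*
+ * Illustrative userspace counterpart: KVM_SET_GUEST_DEBUG enabling
+ * single-step exits. KVM_GUESTDBG_ENABLE must be set or, per the code
+ * above, guest_debug is cleared wholesale.
+ *
+ *	struct kvm_guest_debug dbg = {
+ *		.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP,
+ *	};
+ *
+ *	ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
+ */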
+
+/*
+ * Translate a guest virtual address to a guest physical address.
+ */
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+ struct kvm_translation *tr)
+{
+ unsigned long vaddr = tr->linear_address;
+ gpa_t gpa;
+ int idx;
+
+ vcpu_load(vcpu);
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ tr->physical_address = gpa;
+ tr->valid = gpa != INVALID_GPA;
+ tr->writeable = 1;
+ tr->usermode = 0;
+
+ vcpu_put(vcpu);
+ return 0;
+}
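+
+/*
+ * Userspace sketch for KVM_TRANSLATE, which lands in the handler above
+ * ("gva" and "vcpu_fd" assumed):
+ *
+ *	struct kvm_translation tr = { .linear_address = gva };
+ *
+ *	ioctl(vcpu_fd, KVM_TRANSLATE, &tr);
+ *	if (tr.valid)
+ *		printf("gpa = 0x%llx\n",
+ *		       (unsigned long long)tr.physical_address);
+ */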
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+ struct fxregs_state *fxsave;
+
+ if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
+ return 0;
+
+ vcpu_load(vcpu);
+
+ fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
+ memcpy(fpu->fpr, fxsave->st_space, 128);
+ fpu->fcw = fxsave->cwd;
+ fpu->fsw = fxsave->swd;
+ fpu->ftwx = fxsave->twd;
+ fpu->last_opcode = fxsave->fop;
+ fpu->last_ip = fxsave->rip;
+ fpu->last_dp = fxsave->rdp;
+ memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space));
+
+ vcpu_put(vcpu);
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+ struct fxregs_state *fxsave;
+
+ if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
+ return 0;
+
+ vcpu_load(vcpu);
+
+ fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
+
+ memcpy(fxsave->st_space, fpu->fpr, 128);
+ fxsave->cwd = fpu->fcw;
+ fxsave->swd = fpu->fsw;
+ fxsave->twd = fpu->ftwx;
+ fxsave->fop = fpu->last_opcode;
+ fxsave->rip = fpu->last_ip;
+ fxsave->rdp = fpu->last_dp;
+ memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space));
+
+ vcpu_put(vcpu);
+ return 0;
+}
+
+static void store_regs(struct kvm_vcpu *vcpu)
+{
+ BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES);
+
+ if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS)
+ __get_regs(vcpu, &vcpu->run->s.regs.regs);
+
+ if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS)
+ __get_sregs(vcpu, &vcpu->run->s.regs.sregs);
+
+ if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS)
+ kvm_vcpu_ioctl_x86_get_vcpu_events(
+ vcpu, &vcpu->run->s.regs.events);
+}
+
+static int sync_regs(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
+ __set_regs(vcpu, &vcpu->run->s.regs.regs);
+ vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
+ }
+
+ if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
+ struct kvm_sregs sregs = vcpu->run->s.regs.sregs;
+
+ if (__set_sregs(vcpu, &sregs))
+ return -EINVAL;
+
+ vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
+ }
+
+ if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
+ struct kvm_vcpu_events events = vcpu->run->s.regs.events;
+
+ if (kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events))
+ return -EINVAL;
+
+ vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
+ }
+
+ return 0;
+}
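+
+/*
+ * store_regs()/sync_regs() back the KVM_CAP_SYNC_REGS fast path, which
+ * lets userspace read and modify register state through the shared
+ * kvm_run area instead of issuing separate ioctls. A minimal sketch
+ * ("insn_len" assumed); the dirty flag tells KVM to apply the change
+ * on the next KVM_RUN:
+ *
+ *	run->kvm_valid_regs = KVM_SYNC_X86_REGS;
+ *	ioctl(vcpu_fd, KVM_RUN, 0);
+ *	run->s.regs.regs.rip += insn_len;
+ *	run->kvm_dirty_regs = KVM_SYNC_X86_REGS;
+ */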
+
+int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
+{
+ if (kvm_check_tsc_unstable() && kvm->created_vcpus)
+ pr_warn_once("SMP vm created on host with unstable TSC; "
+ "guest TSC will not be reliable\n");
+
+ if (!kvm->arch.max_vcpu_ids)
+ kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS;
+
+ if (id >= kvm->arch.max_vcpu_ids)
+ return -EINVAL;
+
+ return static_call(kvm_x86_vcpu_precreate)(kvm);
+}
+
+int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
+{
+ struct page *page;
+ int r;
+
+ vcpu->arch.last_vmentry_cpu = -1;
+ vcpu->arch.regs_avail = ~0;
+ vcpu->arch.regs_dirty = ~0;
+
+ kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN);
+
+ if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ else
+ vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
+
+ r = kvm_mmu_create(vcpu);
+ if (r < 0)
+ return r;
+
+ if (irqchip_in_kernel(vcpu->kvm)) {
+ r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
+ if (r < 0)
+ goto fail_mmu_destroy;
+
+ /*
+ * Defer evaluating inhibits until the vCPU is first run, as
+ * this vCPU will not get notified of any changes until this
+ * vCPU is visible to other vCPUs (marked online and added to
+ * the set of vCPUs). Opportunistically mark APICv active as
+ * VMX in particular is highly unlikely to have inhibits.
+ * Ignore the current per-VM APICv state so that vCPU creation
+ * is guaranteed to run with a deterministic value, the request
+ * will ensure the vCPU gets the correct state before VM-Entry.
+ */
+ if (enable_apicv) {
+ vcpu->arch.apic->apicv_active = true;
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ }
+ } else {
+ static_branch_inc(&kvm_has_noapic_vcpu);
+ }
+
+ r = -ENOMEM;
+
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!page)
+ goto fail_free_lapic;
+ vcpu->arch.pio_data = page_address(page);
+
+ vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64),
+ GFP_KERNEL_ACCOUNT);
+ vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64),
+ GFP_KERNEL_ACCOUNT);
+ if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks)
+ goto fail_free_mce_banks;
+ vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+
+ if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
+ GFP_KERNEL_ACCOUNT))
+ goto fail_free_mce_banks;
+
+ if (!alloc_emulate_ctxt(vcpu))
+ goto free_wbinvd_dirty_mask;
+
+ if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) {
+ pr_err("failed to allocate vcpu's fpu\n");
+ goto free_emulate_ctxt;
+ }
+
+ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+ vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
+
+ vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
+
+ kvm_async_pf_hash_reset(vcpu);
+
+ vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap;
+ kvm_pmu_init(vcpu);
+
+ vcpu->arch.pending_external_vector = -1;
+ vcpu->arch.preempted_in_kernel = false;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ vcpu->arch.hv_root_tdp = INVALID_PAGE;
+#endif
+
+ r = static_call(kvm_x86_vcpu_create)(vcpu);
+ if (r)
+ goto free_guest_fpu;
+
+ vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
+ vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
+ kvm_xen_init_vcpu(vcpu);
+ kvm_vcpu_mtrr_init(vcpu);
+ vcpu_load(vcpu);
+ kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
+ kvm_vcpu_reset(vcpu, false);
+ kvm_init_mmu(vcpu);
+ vcpu_put(vcpu);
+ return 0;
+
+free_guest_fpu:
+ fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
+free_emulate_ctxt:
+ kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
+free_wbinvd_dirty_mask:
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
+fail_free_mce_banks:
+ kfree(vcpu->arch.mce_banks);
+ kfree(vcpu->arch.mci_ctl2_banks);
+ free_page((unsigned long)vcpu->arch.pio_data);
+fail_free_lapic:
+ kvm_free_lapic(vcpu);
+fail_mmu_destroy:
+ kvm_mmu_destroy(vcpu);
+ return r;
+}
+
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ if (mutex_lock_killable(&vcpu->mutex))
+ return;
+ vcpu_load(vcpu);
+ kvm_synchronize_tsc(vcpu, 0);
+ vcpu_put(vcpu);
+
+ /* poll control enabled by default */
+ vcpu->arch.msr_kvm_poll_control = 1;
+
+ mutex_unlock(&vcpu->mutex);
+
+ if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
+ schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
+ KVMCLOCK_SYNC_PERIOD);
+}
+
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+ int idx;
+
+ kvmclock_reset(vcpu);
+
+ static_call(kvm_x86_vcpu_free)(vcpu);
+
+ kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
+ fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
+
+ kvm_xen_destroy_vcpu(vcpu);
+ kvm_hv_vcpu_uninit(vcpu);
+ kvm_pmu_destroy(vcpu);
+ kfree(vcpu->arch.mce_banks);
+ kfree(vcpu->arch.mci_ctl2_banks);
+ kvm_free_lapic(vcpu);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_mmu_destroy(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ free_page((unsigned long)vcpu->arch.pio_data);
+ kvfree(vcpu->arch.cpuid_entries);
+ if (!lapic_in_kernel(vcpu))
+ static_branch_dec(&kvm_has_noapic_vcpu);
+}
+
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ struct kvm_cpuid_entry2 *cpuid_0x1;
+ unsigned long old_cr0 = kvm_read_cr0(vcpu);
+ unsigned long new_cr0;
+
+ /*
+ * Several of the "set" flows, e.g. ->set_cr0(), read other registers
+ * to handle side effects. RESET emulation hits those flows and relies
+ * on emulated/virtualized registers, including those that are loaded
+ * into hardware, to be zeroed at vCPU creation. Use CRs as a sentinel
+ * to detect improper or missing initialization.
+ */
+ WARN_ON_ONCE(!init_event &&
+ (old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu)));
+
+ /*
+ * SVM doesn't unconditionally VM-Exit on INIT and SHUTDOWN, thus it's
+ * possible to INIT the vCPU while L2 is active. Force the vCPU back
+ * into L1 as EFER.SVME is cleared on INIT (along with all other EFER
+ * bits), i.e. virtualization is disabled.
+ */
+ if (is_guest_mode(vcpu))
+ kvm_leave_nested(vcpu);
+
+ kvm_lapic_reset(vcpu, init_event);
+
+ WARN_ON_ONCE(is_guest_mode(vcpu) || is_smm(vcpu));
+ vcpu->arch.hflags = 0;
+
+ vcpu->arch.smi_pending = 0;
+ vcpu->arch.smi_count = 0;
+ atomic_set(&vcpu->arch.nmi_queued, 0);
+ vcpu->arch.nmi_pending = 0;
+ vcpu->arch.nmi_injected = false;
+ kvm_clear_interrupt_queue(vcpu);
+ kvm_clear_exception_queue(vcpu);
+
+ memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
+ kvm_update_dr0123(vcpu);
+ vcpu->arch.dr6 = DR6_ACTIVE_LOW;
+ vcpu->arch.dr7 = DR7_FIXED_1;
+ kvm_update_dr7(vcpu);
+
+ vcpu->arch.cr2 = 0;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ vcpu->arch.apf.msr_en_val = 0;
+ vcpu->arch.apf.msr_int_val = 0;
+ vcpu->arch.st.msr_val = 0;
+
+ kvmclock_reset(vcpu);
+
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_async_pf_hash_reset(vcpu);
+ vcpu->arch.apf.halted = false;
+
+ if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) {
+ struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
+
+ /*
+ * All paths that lead to INIT are required to load the guest's
+ * FPU state (because most paths are buried in KVM_RUN).
+ */
+ if (init_event)
+ kvm_put_guest_fpu(vcpu);
+
+ fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS);
+ fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR);
+
+ if (init_event)
+ kvm_load_guest_fpu(vcpu);
+ }
+
+ if (!init_event) {
+ kvm_pmu_reset(vcpu);
+ vcpu->arch.smbase = 0x30000;
+
+ vcpu->arch.msr_misc_features_enables = 0;
+ vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
+ MSR_IA32_MISC_ENABLE_BTS_UNAVAIL;
+
+ __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP);
+ __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true);
+ }
+
+ /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */
+ memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
+ kvm_register_mark_dirty(vcpu, VCPU_REGS_RSP);
+
+ /*
+ * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon)
+ * if no CPUID match is found. Note, it's impossible to get a match at
+ * RESET since KVM emulates RESET before exposing the vCPU to userspace,
+ * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry
+ * on RESET. But, go through the motions in case that's ever remedied.
+ */
+ cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1);
+ kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
+
+ static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
+
+ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
+ kvm_rip_write(vcpu, 0xfff0);
+
+ vcpu->arch.cr3 = 0;
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+
+ /*
+ * CR0.CD/NW are set on RESET, preserved on INIT. Note, some versions
+ * of Intel's SDM list CD/NW as being set on INIT, but they contradict
+ * (or qualify) that with a footnote stating that CD/NW are preserved.
+ */
+ new_cr0 = X86_CR0_ET;
+ if (init_event)
+ new_cr0 |= (old_cr0 & (X86_CR0_NW | X86_CR0_CD));
+ else
+ new_cr0 |= X86_CR0_NW | X86_CR0_CD;
+
+ static_call(kvm_x86_set_cr0)(vcpu, new_cr0);
+ static_call(kvm_x86_set_cr4)(vcpu, 0);
+ static_call(kvm_x86_set_efer)(vcpu, 0);
+ static_call(kvm_x86_update_exception_bitmap)(vcpu);
+
+ /*
+ * On the standard CR0/CR4/EFER modification paths, there are several
+ * complex conditions determining whether the MMU has to be reset and/or
+ * which PCIDs have to be flushed. However, CR0.WP and the paging-related
+ * bits in CR4 and EFER are irrelevant if CR0.PG was '0'; and a reset+flush
+ * is needed anyway if CR0.PG was '1' (which can only happen for INIT, as
+ * CR0 will be '0' prior to RESET). So we only need to check CR0.PG here.
+ */
+ if (old_cr0 & X86_CR0_PG) {
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ kvm_mmu_reset_context(vcpu);
+ }
+
+ /*
+ * Intel's SDM states that all TLB entries are flushed on INIT. AMD's
+ * APM states the TLBs are untouched by INIT, but it also states that
+ * the TLBs are flushed on "External initialization of the processor."
+ * Flush the guest TLB regardless of vendor, there is no meaningful
+ * benefit in relying on the guest to flush the TLB immediately after
+ * INIT. A spurious TLB flush is benign and likely negligible from a
+ * performance perspective.
+ */
+ if (init_event)
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_reset);
+
+void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
+{
+ struct kvm_segment cs;
+
+ kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ cs.selector = vector << 8;
+ cs.base = vector << 12;
+ kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
+ kvm_rip_write(vcpu, 0);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector);
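+
+/*
+ * E.g. a SIPI with vector 0x10 yields CS.selector = 0x1000 and
+ * CS.base = 0x10000 with RIP = 0, so the AP begins execution at
+ * physical address 0x10000, i.e. vector * 4 KiB, matching the usual
+ * INIT-SIPI start-up semantics.
+ */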
+
+int kvm_arch_hardware_enable(void)
+{
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ int ret;
+ u64 local_tsc;
+ u64 max_tsc = 0;
+ bool stable, backwards_tsc = false;
+
+ kvm_user_return_msr_cpu_online();
+
+ ret = kvm_x86_check_processor_compatibility();
+ if (ret)
+ return ret;
+
+ ret = static_call(kvm_x86_hardware_enable)();
+ if (ret != 0)
+ return ret;
+
+ local_tsc = rdtsc();
+ stable = !kvm_check_tsc_unstable();
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!stable && vcpu->cpu == smp_processor_id())
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ if (stable && vcpu->arch.last_host_tsc > local_tsc) {
+ backwards_tsc = true;
+ if (vcpu->arch.last_host_tsc > max_tsc)
+ max_tsc = vcpu->arch.last_host_tsc;
+ }
+ }
+ }
+
+ /*
+ * Sometimes, even reliable TSCs go backwards. This happens on
+ * platforms that reset TSC during suspend or hibernate actions, but
+ * maintain synchronization. We must compensate. Fortunately, we can
+ * detect that condition here, which happens early in CPU bringup,
+ * before any KVM threads can be running. Unfortunately, we can't
+ * bring the TSCs fully up to date with real time, as we aren't yet far
+ * enough into CPU bringup that we know how much real time has actually
+ * elapsed; our helper function, ktime_get_boottime_ns(), will be using
+ * boot variables that haven't been updated yet.
+ *
+ * So we simply find the maximum observed TSC above, then record the
+ * adjustment to TSC in each VCPU. When the VCPU later gets loaded,
+ * the adjustment will be applied. Note that we accumulate
+ * adjustments, in case multiple suspend cycles happen before some VCPU
+ * gets a chance to run again. In the event that no KVM threads get a
+ * chance to run, we will miss the entire elapsed period, as we'll have
+ * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
+ * lose cycle time. This isn't too big a deal, since the loss will be
+ * uniform across all VCPUs (not to mention the scenario is extremely
+ * unlikely). It is possible that a second hibernate recovery happens
+ * much faster than a first, causing the observed TSC here to be
+ * smaller; this would require additional padding adjustment, which is
+ * why we set last_host_tsc to the local tsc observed here.
+ *
+ * N.B. - this code below runs only on platforms with reliable TSC,
+ * as that is the only way backwards_tsc is set above. Also note
+ * that this runs for ALL vcpus, which is not a bug; all VCPUs should
+ * have the same delta_cyc adjustment applied if backwards_tsc
+ * is detected. Note further, this adjustment is only done once,
+ * as we reset last_host_tsc on all VCPUs to stop this from being
+ * called multiple times (one for each physical CPU bringup).
+ *
+ * Platforms with unreliable TSCs don't have to deal with this, they
+ * will be compensated by the logic in vcpu_load, which sets the TSC to
+ * catchup mode. This will catchup all VCPUs to real time, but cannot
+ * guarantee that they stay in perfect synchronization.
+ */
+ if (backwards_tsc) {
+ u64 delta_cyc = max_tsc - local_tsc;
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm->arch.backwards_tsc_observed = true;
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ vcpu->arch.tsc_offset_adjustment += delta_cyc;
+ vcpu->arch.last_host_tsc = local_tsc;
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+ }
+
+ /*
+ * We have to disable TSC offset matching; if you were
+ * booting a VM while issuing an S4 host suspend, you
+ * may have a problem. Solving this issue is left as an
+ * exercise to the reader.
+ */
+ kvm->arch.last_tsc_nsec = 0;
+ kvm->arch.last_tsc_write = 0;
+ }
+
+ }
+ return 0;
+}
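+
+/*
+ * Concretely: if the highest last_host_tsc observed above is 1,000,000
+ * but the freshly resumed host reads local_tsc = 400,000, then
+ * delta_cyc = 600,000 is accumulated into each vCPU's
+ * tsc_offset_adjustment, so the guest-visible TSC stays monotonic
+ * across the suspend cycle.
+ */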
+
+void kvm_arch_hardware_disable(void)
+{
+ static_call(kvm_x86_hardware_disable)();
+ drop_user_return_notifiers();
+}
+
+bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
+{
+ return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
+}
+
+bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
+{
+ return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
+}
+
+__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
+EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
+
+void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ vcpu->arch.l1tf_flush_l1d = true;
+ if (pmu->version && unlikely(pmu->event_count)) {
+ pmu->need_cleanup = true;
+ kvm_make_request(KVM_REQ_PMU, vcpu);
+ }
+ static_call(kvm_x86_sched_in)(vcpu, cpu);
+}
+
+void kvm_arch_free_vm(struct kvm *kvm)
+{
+ kfree(to_kvm_hv(kvm)->hv_pa_pg);
+ __kvm_arch_free_vm(kvm);
+}
+
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
+{
+ int ret;
+ unsigned long flags;
+
+ if (type)
+ return -EINVAL;
+
+ ret = kvm_page_track_init(kvm);
+ if (ret)
+ goto out;
+
+ kvm_mmu_init_vm(kvm);
+
+ ret = static_call(kvm_x86_vm_init)(kvm);
+ if (ret)
+ goto out_uninit_mmu;
+
+ INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
+ INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
+ atomic_set(&kvm->arch.noncoherent_dma_count, 0);
+
+ /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
+ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
+ /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
+ set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+ &kvm->arch.irq_sources_bitmap);
+
+ raw_spin_lock_init(&kvm->arch.tsc_write_lock);
+ mutex_init(&kvm->arch.apic_map_lock);
+ seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
+ kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
+
+ raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
+ pvclock_update_vm_gtod_copy(kvm);
+ raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+
+ kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
+ kvm->arch.guest_can_read_msr_platform_info = true;
+ kvm->arch.enable_pmu = enable_pmu;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ spin_lock_init(&kvm->arch.hv_root_tdp_lock);
+ kvm->arch.hv_root_tdp = INVALID_PAGE;
+#endif
+
+ INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
+ INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
+
+ kvm_apicv_init(kvm);
+ kvm_hv_init_vm(kvm);
+ kvm_xen_init_vm(kvm);
+
+ return 0;
+
+out_uninit_mmu:
+ kvm_mmu_uninit_vm(kvm);
+ kvm_page_track_cleanup(kvm);
+out:
+ return ret;
+}
+
+int kvm_arch_post_init_vm(struct kvm *kvm)
+{
+ return kvm_mmu_post_init_vm(kvm);
+}
+
+static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
+{
+ vcpu_load(vcpu);
+ kvm_mmu_unload(vcpu);
+ vcpu_put(vcpu);
+}
+
+static void kvm_unload_vcpu_mmus(struct kvm *kvm)
+{
+ unsigned long i;
+ struct kvm_vcpu *vcpu;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_unload_vcpu_mmu(vcpu);
+ }
+}
+
+void kvm_arch_sync_events(struct kvm *kvm)
+{
+ cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
+ cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
+ kvm_free_pit(kvm);
+}
+
+/**
+ * __x86_set_memory_region: Set up a KVM internal memory slot
+ *
+ * @kvm: the kvm pointer to the VM.
+ * @id: the slot ID to set up.
+ * @gpa: the GPA to install the slot (unused when @size == 0).
+ * @size: the size of the slot. Set to zero to uninstall a slot.
+ *
+ * This function helps to set up a KVM internal memory slot. Specify
+ * @size > 0 to install a new slot, or @size == 0 to uninstall an
+ * existing one. The return code can be one of the following:
+ *
+ * HVA: on success (uninstall will return a bogus HVA)
+ * -errno: on error
+ *
+ * The caller should always use IS_ERR() to check the return value
+ * before use. Note, the KVM internal memory slots are guaranteed to
+ * remain valid and unchanged until the VM is destroyed, i.e., the
+ * GPA->HVA translation will not change. However, the HVA is a user
+ * address, i.e. its accessibility is not guaranteed, and must be
+ * accessed via __copy_{to,from}_user().
+ */
+void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
+ u32 size)
+{
+ int i, r;
+ unsigned long hva, old_npages;
+ struct kvm_memslots *slots = kvm_memslots(kvm);
+ struct kvm_memory_slot *slot;
+
+ /* Called with kvm->slots_lock held. */
+ if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
+ return ERR_PTR_USR(-EINVAL);
+
+ slot = id_to_memslot(slots, id);
+ if (size) {
+ if (slot && slot->npages)
+ return ERR_PTR_USR(-EEXIST);
+
+ /*
+ * MAP_SHARED to prevent internal slot pages from being moved
+ * by fork()/COW.
+ */
+ hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS, 0);
+ if (IS_ERR_VALUE(hva))
+ return (void __user *)hva;
+ } else {
+ if (!slot || !slot->npages)
+ return NULL;
+
+ old_npages = slot->npages;
+ hva = slot->userspace_addr;
+ }
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ struct kvm_userspace_memory_region m;
+
+ m.slot = id | (i << 16);
+ m.flags = 0;
+ m.guest_phys_addr = gpa;
+ m.userspace_addr = hva;
+ m.memory_size = size;
+ r = __kvm_set_memory_region(kvm, &m);
+ if (r < 0)
+ return ERR_PTR_USR(r);
+ }
+
+ if (!size)
+ vm_munmap(hva, old_npages * PAGE_SIZE);
+
+ return (void __user *)hva;
+}
+EXPORT_SYMBOL_GPL(__x86_set_memory_region);
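+
+/*
+ * Typical in-KVM usage, sketched under the assumption of a caller that
+ * holds kvm->slots_lock; the GPA and size are illustrative:
+ *
+ *	void __user *hva;
+ *
+ *	hva = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT,
+ *				      0xfffbc000, 3 * PAGE_SIZE);
+ *	if (IS_ERR(hva))
+ *		return PTR_ERR(hva);
+ *	...
+ *	__x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
+ */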
+
+void kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+ kvm_mmu_pre_destroy_vm(kvm);
+}
+
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+ if (current->mm == kvm->mm) {
+ /*
+ * Free memory regions allocated on behalf of userspace,
+ * unless the memory map has changed due to process exit
+ * or fd copying.
+ */
+ mutex_lock(&kvm->slots_lock);
+ __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ 0, 0);
+ __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
+ 0, 0);
+ __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
+ mutex_unlock(&kvm->slots_lock);
+ }
+ kvm_unload_vcpu_mmus(kvm);
+ static_call_cond(kvm_x86_vm_destroy)(kvm);
+ kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
+ kvm_pic_destroy(kvm);
+ kvm_ioapic_destroy(kvm);
+ kvm_destroy_vcpus(kvm);
+ kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+ kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
+ kvm_mmu_uninit_vm(kvm);
+ kvm_page_track_cleanup(kvm);
+ kvm_xen_destroy_vm(kvm);
+ kvm_hv_destroy_vm(kvm);
+}
+
+static void memslot_rmap_free(struct kvm_memory_slot *slot)
+{
+ int i;
+
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ kvfree(slot->arch.rmap[i]);
+ slot->arch.rmap[i] = NULL;
+ }
+}
+
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+ int i;
+
+ memslot_rmap_free(slot);
+
+ for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
+ kvfree(slot->arch.lpage_info[i - 1]);
+ slot->arch.lpage_info[i - 1] = NULL;
+ }
+
+ kvm_page_track_free_memslot(slot);
+}
+
+int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages)
+{
+ const int sz = sizeof(*slot->arch.rmap[0]);
+ int i;
+
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ int level = i + 1;
+ int lpages = __kvm_mmu_slot_lpages(slot, npages, level);
+
+ if (slot->arch.rmap[i])
+ continue;
+
+ slot->arch.rmap[i] = __vcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
+ if (!slot->arch.rmap[i]) {
+ memslot_rmap_free(slot);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+static int kvm_alloc_memslot_metadata(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ unsigned long npages = slot->npages;
+ int i, r;
+
+ /*
+ * Clear out the previous array pointers for the KVM_MR_MOVE case. The
+ * old arrays will be freed by __kvm_set_memory_region() if installing
+ * the new memslot is successful.
+ */
+ memset(&slot->arch, 0, sizeof(slot->arch));
+
+ if (kvm_memslots_have_rmaps(kvm)) {
+ r = memslot_rmap_alloc(slot, npages);
+ if (r)
+ return r;
+ }
+
+ for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
+ struct kvm_lpage_info *linfo;
+ unsigned long ugfn;
+ int lpages;
+ int level = i + 1;
+
+ lpages = __kvm_mmu_slot_lpages(slot, npages, level);
+
+ linfo = __vcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
+ if (!linfo)
+ goto out_free;
+
+ slot->arch.lpage_info[i - 1] = linfo;
+
+ if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
+ linfo[0].disallow_lpage = 1;
+ if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
+ linfo[lpages - 1].disallow_lpage = 1;
+ ugfn = slot->userspace_addr >> PAGE_SHIFT;
+ /*
+ * If the gfn and userspace address are not aligned wrt each
+ * other, disable large page support for this slot.
+ */
+ if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
+ unsigned long j;
+
+ for (j = 0; j < lpages; ++j)
+ linfo[j].disallow_lpage = 1;
+ }
+ }
+
+ if (kvm_page_track_create_memslot(kvm, slot, npages))
+ goto out_free;
+
+ return 0;
+
+out_free:
+ memslot_rmap_free(slot);
+
+ for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
+ kvfree(slot->arch.lpage_info[i - 1]);
+ slot->arch.lpage_info[i - 1] = NULL;
+ }
+ return -ENOMEM;
+}
+
+void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ /*
+ * memslots->generation has been incremented, so the mmio
+ * generation may have reached its maximum value.
+ */
+ kvm_mmu_invalidate_mmio_sptes(kvm, gen);
+
+ /* Force re-initialization of steal_time cache */
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_vcpu_kick(vcpu);
+}
+
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+ const struct kvm_memory_slot *old,
+ struct kvm_memory_slot *new,
+ enum kvm_mr_change change)
+{
+ /*
+ * KVM doesn't support moving memslots when there are external page
+ * trackers attached to the VM, i.e. if KVMGT is in use.
+ */
+ if (change == KVM_MR_MOVE && kvm_page_track_has_external_user(kvm))
+ return -EINVAL;
+
+ if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) {
+ if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
+ return -EINVAL;
+
+ return kvm_alloc_memslot_metadata(kvm, new);
+ }
+
+ if (change == KVM_MR_FLAGS_ONLY)
+ memcpy(&new->arch, &old->arch, sizeof(old->arch));
+ else if (WARN_ON_ONCE(change != KVM_MR_DELETE))
+ return -EIO;
+
+ return 0;
+}
+
+static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable)
+{
+ int nr_slots;
+
+ if (!kvm_x86_ops.cpu_dirty_log_size)
+ return;
+
+ nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging);
+ if ((enable && nr_slots == 1) || !nr_slots)
+ kvm_make_all_cpus_request(kvm, KVM_REQ_UPDATE_CPU_DIRTY_LOGGING);
+}
+
+static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
+ struct kvm_memory_slot *old,
+ const struct kvm_memory_slot *new,
+ enum kvm_mr_change change)
+{
+ u32 old_flags = old ? old->flags : 0;
+ u32 new_flags = new ? new->flags : 0;
+ bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES;
+
+ /*
+ * Update CPU dirty logging if dirty logging is being toggled. This
+ * applies to all operations.
+ */
+ if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)
+ kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages);
+
+ /*
+ * Nothing more to do for RO slots (which can't be dirtied and can't be
+ * made writable) or CREATE/MOVE/DELETE of a slot.
+ *
+ * For a memslot with dirty logging disabled:
+ * CREATE: No dirty mappings can exist yet.
+ * MOVE/DELETE: The old mappings will already have been cleaned up by
+ * kvm_arch_flush_shadow_memslot().
+ *
+ * For a memslot with dirty logging enabled:
+ * CREATE: No shadow pages exist, thus nothing to write-protect
+ * and no dirty bits to clear.
+ * MOVE/DELETE: The old mappings will already have been cleaned up by
+ * kvm_arch_flush_shadow_memslot().
+ */
+ if ((change != KVM_MR_FLAGS_ONLY) || (new_flags & KVM_MEM_READONLY))
+ return;
+
+ /*
+ * READONLY and non-flags changes were filtered out above, and the only
+ * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty
+ * logging isn't being toggled on or off.
+ */
+ if (WARN_ON_ONCE(!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)))
+ return;
+
+ if (!log_dirty_pages) {
+ /*
+ * Dirty logging tracks sptes in 4k granularity, meaning that
+ * large sptes have to be split. If live migration succeeds,
+ * the guest in the source machine will be destroyed and large
+ * sptes will be created in the destination. However, if the
+ * guest continues to run in the source machine (for example if
+ * live migration fails), small sptes will remain around and
+ * cause bad performance.
+ *
+ * Scan sptes if dirty logging has been stopped, dropping those
+ * which can be collapsed into a single large-page spte. Later
+ * page faults will create the large-page sptes.
+ */
+ kvm_mmu_zap_collapsible_sptes(kvm, new);
+ } else {
+ /*
+ * Initially-all-set does not require write-protecting any page,
+ * because all pages are assumed to be dirty.
+ */
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+ return;
+
+ if (READ_ONCE(eager_page_split))
+ kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K);
+
+ if (kvm_x86_ops.cpu_dirty_log_size) {
+ kvm_mmu_slot_leaf_clear_dirty(kvm, new);
+ kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
+ } else {
+ kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K);
+ }
+
+ /*
+ * Unconditionally flush the TLBs after enabling dirty logging.
+ * A flush is almost always going to be necessary (see below),
+ * and unconditionally flushing allows the helpers to omit
+ * the subtly complex checks when removing write access.
+ *
+ * Do the flush outside of mmu_lock to reduce the amount of
+ * time mmu_lock is held. Flushing after dropping mmu_lock is
+ * safe as KVM only needs to guarantee the slot is fully
+ * write-protected before returning to userspace, i.e. before
+ * userspace can consume the dirty status.
+ *
+ * Flushing outside of mmu_lock requires KVM to be careful when
+ * making decisions based on writable status of an SPTE, e.g. a
+ * !writable SPTE doesn't guarantee a CPU can't perform writes.
+ *
+ * Specifically, KVM also write-protects guest page tables to
+ * monitor changes when using shadow paging, and must guarantee
+ * no CPUs can write to those page before mmu_lock is dropped.
+ * Because CPUs may have stale TLB entries at this point, a
+ * !writable SPTE doesn't guarantee CPUs can't perform writes.
+ *
+ * KVM also allows making SPTES writable outside of mmu_lock,
+ * e.g. to allow dirty logging without taking mmu_lock.
+ *
+ * To handle these scenarios, KVM uses a separate software-only
+ * bit (MMU-writable) to track if a SPTE is !writable due to
+ * a guest page table being write-protected (KVM clears the
+ * MMU-writable flag when write-protecting for shadow paging).
+ *
+ * The use of MMU-writable is also the primary motivation for
+ * the unconditional flush. Because KVM must guarantee that a
+ * CPU doesn't contain stale, writable TLB entries for a
+ * !MMU-writable SPTE, KVM must flush if it encounters any
+ * MMU-writable SPTE regardless of whether the actual hardware
+ * writable bit was set. I.e. KVM is almost guaranteed to need
+ * to flush, while unconditionally flushing allows the "remove
+ * write access" helpers to ignore MMU-writable entirely.
+ *
+ * See is_writable_pte() for more details (the case involving
+ * access-tracked SPTEs is particularly relevant).
+ */
+ kvm_flush_remote_tlbs_memslot(kvm, new);
+ }
+}
+
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *old,
+ const struct kvm_memory_slot *new,
+ enum kvm_mr_change change)
+{
+ if (change == KVM_MR_DELETE)
+ kvm_page_track_delete_slot(kvm, old);
+
+ if (!kvm->arch.n_requested_mmu_pages &&
+ (change == KVM_MR_CREATE || change == KVM_MR_DELETE)) {
+ unsigned long nr_mmu_pages;
+
+ nr_mmu_pages = kvm->nr_memslot_pages / KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO;
+ nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
+ kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+ }
+
+ kvm_mmu_slot_apply_flags(kvm, old, new, change);
+
+ /* Free the arrays associated with the old memslot. */
+ if (change == KVM_MR_MOVE)
+ kvm_arch_free_memslot(kvm, old);
+}
+
+static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ return (is_guest_mode(vcpu) &&
+ static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
+}
+
+static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
+{
+ if (!list_empty_careful(&vcpu->async_pf.done))
+ return true;
+
+ if (kvm_apic_has_pending_init_or_sipi(vcpu) &&
+ kvm_apic_init_sipi_allowed(vcpu))
+ return true;
+
+ if (vcpu->arch.pv.pv_unhalted)
+ return true;
+
+ if (kvm_is_exception_pending(vcpu))
+ return true;
+
+ if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+ (vcpu->arch.nmi_pending &&
+ static_call(kvm_x86_nmi_allowed)(vcpu, false)))
+ return true;
+
+#ifdef CONFIG_KVM_SMM
+ if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
+ (vcpu->arch.smi_pending &&
+ static_call(kvm_x86_smi_allowed)(vcpu, false)))
+ return true;
+#endif
+
+ if (kvm_test_request(KVM_REQ_PMI, vcpu))
+ return true;
+
+ if (kvm_arch_interrupt_allowed(vcpu) &&
+ (kvm_cpu_has_interrupt(vcpu) ||
+ kvm_guest_apic_has_interrupt(vcpu)))
+ return true;
+
+ if (kvm_hv_has_stimer_pending(vcpu))
+ return true;
+
+ if (is_guest_mode(vcpu) &&
+ kvm_x86_ops.nested_ops->has_events &&
+ kvm_x86_ops.nested_ops->has_events(vcpu))
+ return true;
+
+ if (kvm_xen_has_pending_events(vcpu))
+ return true;
+
+ return false;
+}
+
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
+}
+
+bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+ if (kvm_vcpu_apicv_active(vcpu) &&
+ static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
+ return true;
+
+ return false;
+}
+
+bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
+{
+ if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
+ return true;
+
+ if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+#ifdef CONFIG_KVM_SMM
+ kvm_test_request(KVM_REQ_SMI, vcpu) ||
+#endif
+ kvm_test_request(KVM_REQ_EVENT, vcpu))
+ return true;
+
+ return kvm_arch_dy_has_pending_interrupt(vcpu);
+}
+
+bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.guest_state_protected)
+ return true;
+
+ return vcpu->arch.preempted_in_kernel;
+}
+
+unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
+{
+ return kvm_rip_read(vcpu);
+}
+
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
+}
+
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+ return static_call(kvm_x86_interrupt_allowed)(vcpu, false);
+}
+
+unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
+{
+ /* Can't read the RIP when guest state is protected, just return 0 */
+ if (vcpu->arch.guest_state_protected)
+ return 0;
+
+ if (is_64_bit_mode(vcpu))
+ return kvm_rip_read(vcpu);
+ return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
+ kvm_rip_read(vcpu));
+}
+EXPORT_SYMBOL_GPL(kvm_get_linear_rip);
+
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
+{
+ return kvm_get_linear_rip(vcpu) == linear_rip;
+}
+EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
+
+unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
+{
+ unsigned long rflags;
+
+ rflags = static_call(kvm_x86_get_rflags)(vcpu);
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ rflags &= ~X86_EFLAGS_TF;
+ return rflags;
+}
+EXPORT_SYMBOL_GPL(kvm_get_rflags);
+
+static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
+ kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
+ rflags |= X86_EFLAGS_TF;
+ static_call(kvm_x86_set_rflags)(vcpu, rflags);
+}
+
+void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ __kvm_set_rflags(vcpu, rflags);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_set_rflags);
+
+static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
+{
+ BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU));
+
+ return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
+}
+
+static inline u32 kvm_async_pf_next_probe(u32 key)
+{
+ return (key + 1) & (ASYNC_PF_PER_VCPU - 1);
+}
+
+static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ u32 key = kvm_async_pf_hash_fn(gfn);
+
+ while (vcpu->arch.apf.gfns[key] != ~0)
+ key = kvm_async_pf_next_probe(key);
+
+ vcpu->arch.apf.gfns[key] = gfn;
+}
+
+static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ int i;
+ u32 key = kvm_async_pf_hash_fn(gfn);
+
+ for (i = 0; i < ASYNC_PF_PER_VCPU &&
+ (vcpu->arch.apf.gfns[key] != gfn &&
+ vcpu->arch.apf.gfns[key] != ~0); i++)
+ key = kvm_async_pf_next_probe(key);
+
+ return key;
+}
+
+bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
+}
+
+static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ u32 i, j, k;
+
+ i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
+
+ if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn))
+ return;
+
+ while (true) {
+ vcpu->arch.apf.gfns[i] = ~0;
+ do {
+ j = kvm_async_pf_next_probe(j);
+ if (vcpu->arch.apf.gfns[j] == ~0)
+ return;
+ k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
+ /*
+ * k lies cyclically in ]i,j]
+ * | i.k.j |
+ * |....j i.k.| or |.k..j i...|
+ */
+ } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
+ vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
+ i = j;
+ }
+}
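+
+/*
+ * Worked example for the deletion fix-up above, assuming a small 8-slot
+ * table: if three gfns all hash to slot 2, they occupy slots 2, 3 and 4.
+ * Simply clearing slot 2 would break lookups of the entries in slots 3
+ * and 4, so the loop walks forward from the hole and moves back any
+ * entry whose home slot k does not lie cyclically in ]i,j], i.e. any
+ * entry that could no longer be reached by probing from its hash.
+ */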
+
+static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu)
+{
+ u32 reason = KVM_PV_REASON_PAGE_NOT_PRESENT;
+
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason,
+ sizeof(reason));
+}
+
+static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token)
+{
+ unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
+
+ return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
+ &token, offset, sizeof(token));
+}
+
+static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu)
+{
+ unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
+ u32 val;
+
+ if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
+ &val, offset, sizeof(val)))
+ return false;
+
+ return !val;
+}
+
+static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_pv_async_pf_enabled(vcpu))
+ return false;
+
+ if (vcpu->arch.apf.send_user_only &&
+ static_call(kvm_x86_get_cpl)(vcpu) == 0)
+ return false;
+
+ if (is_guest_mode(vcpu)) {
+ /*
+ * L1 needs to opt into the special #PF vmexits that are
+ * used to deliver async page faults.
+ */
+ return vcpu->arch.apf.delivery_as_pf_vmexit;
+ } else {
+ /*
+ * Play it safe in case the guest temporarily disables paging.
+ * The real-mode IDT in particular is unlikely to have a #PF
+ * handler set up.
+ */
+ return is_paging(vcpu);
+ }
+}
+
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(!lapic_in_kernel(vcpu) ||
+ kvm_event_needs_reinjection(vcpu) ||
+ kvm_is_exception_pending(vcpu)))
+ return false;
+
+ if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
+ return false;
+
+ /*
+ * If interrupts are off we cannot even use an artificial
+ * halt state.
+ */
+ return kvm_arch_interrupt_allowed(vcpu);
+}
+
+bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+ struct x86_exception fault;
+
+ trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
+ kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
+
+ if (kvm_can_deliver_async_pf(vcpu) &&
+ !apf_put_user_notpresent(vcpu)) {
+ fault.vector = PF_VECTOR;
+ fault.error_code_valid = true;
+ fault.error_code = 0;
+ fault.nested_page_fault = false;
+ fault.address = work->arch.token;
+ fault.async_page_fault = true;
+ kvm_inject_page_fault(vcpu, &fault);
+ return true;
+ } else {
+ /*
+ * It is not possible to deliver a paravirtualized asynchronous
+ * page fault, but putting the guest in an artificial halt state
+ * can be beneficial nevertheless: if an interrupt arrives, we
+ * can deliver it in time and perhaps the guest will schedule
+ * another process. When the instruction that triggered the page
+ * fault is retried, hopefully the page will be ready in the host.
+ */
+ kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+ return false;
+ }
+}
+
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+ struct kvm_lapic_irq irq = {
+ .delivery_mode = APIC_DM_FIXED,
+ .vector = vcpu->arch.apf.vec
+ };
+
+ if (work->wakeup_all)
+ work->arch.token = ~0; /* broadcast wakeup */
+ else
+ kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
+ trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
+
+ if ((work->wakeup_all || work->notpresent_injected) &&
+ kvm_pv_async_pf_enabled(vcpu) &&
+ !apf_put_user_ready(vcpu, work->arch.token)) {
+ vcpu->arch.apf.pageready_pending = true;
+ kvm_apic_set_irq(vcpu, &irq, NULL);
+ }
+
+ vcpu->arch.apf.halted = false;
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+}
+
+void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu)
+{
+ kvm_make_request(KVM_REQ_APF_READY, vcpu);
+ if (!vcpu->arch.apf.pageready_pending)
+ kvm_vcpu_kick(vcpu);
+}
+
+bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_pv_async_pf_enabled(vcpu))
+ return true;
+ else
+ return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu);
+}
+
+void kvm_arch_start_assignment(struct kvm *kvm)
+{
+ if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
+ static_call_cond(kvm_x86_pi_start_assignment)(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
+
+void kvm_arch_end_assignment(struct kvm *kvm)
+{
+ atomic_dec(&kvm->arch.assigned_device_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
+
+bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
+{
+ return raw_atomic_read(&kvm->arch.assigned_device_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
+
+void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
+{
+ atomic_inc(&kvm->arch.noncoherent_dma_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
+
+void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
+{
+ atomic_dec(&kvm->arch.noncoherent_dma_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
+
+bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
+{
+ return atomic_read(&kvm->arch.noncoherent_dma_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
+
+bool kvm_arch_has_irq_bypass(void)
+{
+ return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP);
+}
+
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+ int ret;
+
+ irqfd->producer = prod;
+ kvm_arch_start_assignment(irqfd->kvm);
+ ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm,
+ prod->irq, irqfd->gsi, 1);
+
+ if (ret)
+ kvm_arch_end_assignment(irqfd->kvm);
+
+ return ret;
+}
+
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ int ret;
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+ WARN_ON(irqfd->producer != prod);
+ irqfd->producer = NULL;
+
+ /*
+ * When the producer of a consumer is unregistered, we change back
+ * to remapped mode, so we can re-use the current implementation
+ * when the irq is masked/disabled or the consumer side (KVM in
+ * this case) doesn't want to receive the interrupts.
+ */
+ ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+ if (ret)
+ printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
+ " failed: %d\n", irqfd->consumer.token, ret);
+
+ kvm_arch_end_assignment(irqfd->kvm);
+}
+
+int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ return static_call(kvm_x86_pi_update_irte)(kvm, host_irq, guest_irq, set);
+}
+
+bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
+ struct kvm_kernel_irq_routing_entry *new)
+{
+ if (new->type != KVM_IRQ_ROUTING_MSI)
+ return true;
+
+ return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));
+}
+
+bool kvm_vector_hashing_enabled(void)
+{
+ return vector_hashing;
+}
+
+bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
+{
+ return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
+}
+EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
+
+int kvm_spec_ctrl_test_value(u64 value)
+{
+ /*
+ * Test that setting IA32_SPEC_CTRL to the given value
+ * is allowed by the host processor.
+ */
+
+ u64 saved_value;
+ unsigned long flags;
+ int ret = 0;
+
+ local_irq_save(flags);
+
+ if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value))
+ ret = 1;
+ else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value))
+ ret = 1;
+ else
+ wrmsrl(MSR_IA32_SPEC_CTRL, saved_value);
+
+ local_irq_restore(flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
+
+void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
+{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+ struct x86_exception fault;
+ u64 access = error_code &
+ (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
+
+ if (!(error_code & PFERR_PRESENT_MASK) ||
+ mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != INVALID_GPA) {
+ /*
+ * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
+ * tables probably do not match the TLB. Just proceed
+ * with the error code that the processor gave.
+ */
+ fault.vector = PF_VECTOR;
+ fault.error_code_valid = true;
+ fault.error_code = error_code;
+ fault.nested_page_fault = false;
+ fault.address = gva;
+ fault.async_page_fault = false;
+ }
+ vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
+}
+EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
+
+/*
+ * Handles the kvm_read/write_guest_virt*() result and either injects a #PF
+ * or returns KVM_EXIT_INTERNAL_ERROR for cases not currently handled by
+ * KVM. The return value indicates whether an exit to userspace is needed.
+ */
+int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
+ struct x86_exception *e)
+{
+ if (r == X86EMUL_PROPAGATE_FAULT) {
+ if (KVM_BUG_ON(!e, vcpu->kvm))
+ return -EIO;
+
+ kvm_inject_emulated_page_fault(vcpu, e);
+ return 1;
+ }
+
+ /*
+ * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
+ * while handling a VMX instruction, KVM could've handled the request
+ * correctly by exiting to userspace and performing I/O, but there
+ * doesn't seem to be a real use-case behind such requests; just return
+ * KVM_EXIT_INTERNAL_ERROR for now.
+ */
+ kvm_prepare_emulation_failure_exit(vcpu);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_handle_memory_failure);
+
+int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
+{
+ bool pcid_enabled;
+ struct x86_exception e;
+ struct {
+ u64 pcid;
+ u64 gla;
+ } operand;
+ int r;
+
+ r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
+ if (r != X86EMUL_CONTINUE)
+ return kvm_handle_memory_failure(vcpu, r, &e);
+
+ if (operand.pcid >> 12 != 0) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ pcid_enabled = kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE);
+
+ switch (type) {
+ case INVPCID_TYPE_INDIV_ADDR:
+ if ((!pcid_enabled && (operand.pcid != 0)) ||
+ is_noncanonical_address(operand.gla, vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
+ return kvm_skip_emulated_instruction(vcpu);
+
+ case INVPCID_TYPE_SINGLE_CTXT:
+ if (!pcid_enabled && (operand.pcid != 0)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ kvm_invalidate_pcid(vcpu, operand.pcid);
+ return kvm_skip_emulated_instruction(vcpu);
+
+ case INVPCID_TYPE_ALL_NON_GLOBAL:
+ /*
+ * Currently, KVM doesn't mark global entries in the shadow
+ * page tables, so a non-global flush just degenerates to a
+ * global flush. If needed, we could optimize this later by
+ * keeping track of global entries in shadow page tables.
+ */
+
+ fallthrough;
+ case INVPCID_TYPE_ALL_INCL_GLOBAL:
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+
+ default:
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
+
+static int complete_sev_es_emulated_mmio(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+ struct kvm_mmio_fragment *frag;
+ unsigned int len;
+
+ BUG_ON(!vcpu->mmio_needed);
+
+ /* Complete previous fragment */
+ frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
+ len = min(8u, frag->len);
+ if (!vcpu->mmio_is_write)
+ memcpy(frag->data, run->mmio.data, len);
+
+ if (frag->len <= 8) {
+ /* Switch to the next fragment. */
+ frag++;
+ vcpu->mmio_cur_fragment++;
+ } else {
+ /* Go forward to the next mmio piece. */
+ frag->data += len;
+ frag->gpa += len;
+ frag->len -= len;
+ }
+
+ if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
+ vcpu->mmio_needed = 0;
+
+ // VMG change: at this point we're always done, as RIP has
+ // already been advanced.
+ return 1;
+ }
+
+ // More MMIO is needed
+ run->mmio.phys_addr = frag->gpa;
+ run->mmio.len = min(8u, frag->len);
+ run->mmio.is_write = vcpu->mmio_is_write;
+ if (run->mmio.is_write)
+ memcpy(run->mmio.data, frag->data, min(8u, frag->len));
+ run->exit_reason = KVM_EXIT_MMIO;
+
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
+
+ return 0;
+}
+
+int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
+ void *data)
+{
+ int handled;
+ struct kvm_mmio_fragment *frag;
+
+ if (!data)
+ return -EINVAL;
+
+ handled = write_emultor.read_write_mmio(vcpu, gpa, bytes, data);
+ if (handled == bytes)
+ return 1;
+
+ bytes -= handled;
+ gpa += handled;
+ data += handled;
+
+ /* TODO: Check if the number of fragments needs to be incremented. */
+ frag = vcpu->mmio_fragments;
+ vcpu->mmio_nr_fragments = 1;
+ frag->len = bytes;
+ frag->gpa = gpa;
+ frag->data = data;
+
+ vcpu->mmio_needed = 1;
+ vcpu->mmio_cur_fragment = 0;
+
+ vcpu->run->mmio.phys_addr = gpa;
+ vcpu->run->mmio.len = min(8u, frag->len);
+ vcpu->run->mmio.is_write = 1;
+ memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_write);
+
+int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
+ void *data)
+{
+ int handled;
+ struct kvm_mmio_fragment *frag;
+
+ if (!data)
+ return -EINVAL;
+
+ handled = read_emultor.read_write_mmio(vcpu, gpa, bytes, data);
+ if (handled == bytes)
+ return 1;
+
+ bytes -= handled;
+ gpa += handled;
+ data += handled;
+
+ /* TODO: Check if the number of fragments needs to be incremented. */
+ frag = vcpu->mmio_fragments;
+ vcpu->mmio_nr_fragments = 1;
+ frag->len = bytes;
+ frag->gpa = gpa;
+ frag->data = data;
+
+ vcpu->mmio_needed = 1;
+ vcpu->mmio_cur_fragment = 0;
+
+ vcpu->run->mmio.phys_addr = gpa;
+ vcpu->run->mmio.len = min(8u, frag->len);
+ vcpu->run->mmio.is_write = 0;
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read);
+
+static void advance_sev_es_emulated_pio(struct kvm_vcpu *vcpu, unsigned count, int size)
+{
+ vcpu->arch.sev_pio_count -= count;
+ vcpu->arch.sev_pio_data += count * size;
+}
+
+static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port);
+
+static int complete_sev_es_emulated_outs(struct kvm_vcpu *vcpu)
+{
+ int size = vcpu->arch.pio.size;
+ int port = vcpu->arch.pio.port;
+
+ vcpu->arch.pio.count = 0;
+ if (vcpu->arch.sev_pio_count)
+ return kvm_sev_es_outs(vcpu, size, port);
+ return 1;
+}
+
+static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port)
+{
+ for (;;) {
+ unsigned int count =
+ min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
+ int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count);
+
+ /* memcpy done already by emulator_pio_out. */
+ advance_sev_es_emulated_pio(vcpu, count, size);
+ if (!ret)
+ break;
+
+ /* Emulation done by the kernel. */
+ if (!vcpu->arch.sev_pio_count)
+ return 1;
+ }
+
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_outs;
+ return 0;
+}
+
+static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port);
+
+static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
+{
+ unsigned count = vcpu->arch.pio.count;
+ int size = vcpu->arch.pio.size;
+ int port = vcpu->arch.pio.port;
+
+ complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data);
+ advance_sev_es_emulated_pio(vcpu, count, size);
+ if (vcpu->arch.sev_pio_count)
+ return kvm_sev_es_ins(vcpu, size, port);
+ return 1;
+}
+
+static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port)
+{
+ for (;;) {
+ unsigned int count =
+ min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
+ if (!emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count))
+ break;
+
+ /* Emulation done by the kernel. */
+ advance_sev_es_emulated_pio(vcpu, count, size);
+ if (!vcpu->arch.sev_pio_count)
+ return 1;
+ }
+
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins;
+ return 0;
+}
+
+int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port, void *data, unsigned int count,
+ int in)
+{
+ vcpu->arch.sev_pio_data = data;
+ vcpu->arch.sev_pio_count = count;
+ return in ? kvm_sev_es_ins(vcpu, size, port)
+ : kvm_sev_es_outs(vcpu, size, port);
+}
+EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
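+
+/*
+ * Example of the chunking above: a rep OUTS of 3000 16-bit words is cut
+ * into PAGE_SIZE / 2 == 2048-word chunks, so the first pass emits 2048
+ * words and, if a userspace exit was needed, complete_sev_es_emulated_outs()
+ * re-enters kvm_sev_es_outs() for the remaining 952.
+ */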
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_kick_vcpu_slowpath);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_doorbell);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);
+
+static int __init kvm_x86_init(void)
+{
+ kvm_mmu_x86_module_init();
+ mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible();
+ return 0;
+}
+module_init(kvm_x86_init);
+
+static void __exit kvm_x86_exit(void)
+{
+ /*
+ * If module_init() is implemented, module_exit() must also be
+ * implemented to allow module unload.
+ */
+}
+module_exit(kvm_x86_exit);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
new file mode 100644
index 000000000..1e7be1f6a
--- /dev/null
+++ b/arch/x86/kvm/x86.h
@@ -0,0 +1,543 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_KVM_X86_H
+#define ARCH_X86_KVM_X86_H
+
+#include <linux/kvm_host.h>
+#include <asm/fpu/xstate.h>
+#include <asm/mce.h>
+#include <asm/pvclock.h>
+#include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
+
+struct kvm_caps {
+ /* control of guest tsc rate supported? */
+ bool has_tsc_control;
+ /* maximum supported tsc_khz for guests */
+ u32 max_guest_tsc_khz;
+ /* number of bits of the fractional part of the TSC scaling ratio */
+ u8 tsc_scaling_ratio_frac_bits;
+ /* maximum allowed value of TSC scaling ratio */
+ u64 max_tsc_scaling_ratio;
+ /* 1ull << kvm_caps.tsc_scaling_ratio_frac_bits */
+ u64 default_tsc_scaling_ratio;
+ /* bus lock detection supported? */
+ bool has_bus_lock_exit;
+ /* notify VM exit supported? */
+ bool has_notify_vmexit;
+
+ u64 supported_mce_cap;
+ u64 supported_xcr0;
+ u64 supported_xss;
+ u64 supported_perf_cap;
+};
+
+void kvm_spurious_fault(void);
+
+#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \
+({ \
+ bool failed = (consistency_check); \
+ if (failed) \
+ trace_kvm_nested_vmenter_failed(#consistency_check, 0); \
+ failed; \
+})
+
+/*
+ * The first...last VMX feature MSRs that are emulated by KVM. This may or may
+ * not cover all known VMX MSRs, as KVM doesn't emulate an MSR until there's an
+ * associated feature that KVM supports for nested virtualization.
+ */
+#define KVM_FIRST_EMULATED_VMX_MSR MSR_IA32_VMX_BASIC
+#define KVM_LAST_EMULATED_VMX_MSR MSR_IA32_VMX_VMFUNC
+
+#define KVM_DEFAULT_PLE_GAP 128
+#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
+#define KVM_DEFAULT_PLE_WINDOW_GROW 2
+#define KVM_DEFAULT_PLE_WINDOW_SHRINK 0
+#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX UINT_MAX
+#define KVM_SVM_DEFAULT_PLE_WINDOW_MAX USHRT_MAX
+#define KVM_SVM_DEFAULT_PLE_WINDOW 3000
+
+static inline unsigned int __grow_ple_window(unsigned int val,
+ unsigned int base, unsigned int modifier, unsigned int max)
+{
+ u64 ret = val;
+
+ if (modifier < 1)
+ return base;
+
+ if (modifier < base)
+ ret *= modifier;
+ else
+ ret += modifier;
+
+ return min(ret, (u64)max);
+}
+
+static inline unsigned int __shrink_ple_window(unsigned int val,
+ unsigned int base, unsigned int modifier, unsigned int min)
+{
+ if (modifier < 1)
+ return base;
+
+ if (modifier < base)
+ val /= modifier;
+ else
+ val -= modifier;
+
+ return max(val, min);
+}
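+
+/*
+ * Example with the VMX defaults above: growing a window of 4096 with
+ * KVM_DEFAULT_PLE_WINDOW_GROW == 2 multiplies it to 8192 (the modifier
+ * is smaller than the base, so it scales rather than adds), while the
+ * default shrink modifier of 0 resets the window straight back to @base.
+ */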
+
+#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL
+
+void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu);
+int kvm_check_nested_events(struct kvm_vcpu *vcpu);
+
+static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.last_vmentry_cpu != -1;
+}
+
+static inline bool kvm_is_exception_pending(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.exception.pending ||
+ vcpu->arch.exception_vmexit.pending ||
+ kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+}
+
+static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.exception.pending = false;
+ vcpu->arch.exception.injected = false;
+ vcpu->arch.exception_vmexit.pending = false;
+}
+
+static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
+ bool soft)
+{
+ vcpu->arch.interrupt.injected = true;
+ vcpu->arch.interrupt.soft = soft;
+ vcpu->arch.interrupt.nr = vector;
+}
+
+static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.interrupt.injected = false;
+}
+
+static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.exception.injected || vcpu->arch.interrupt.injected ||
+ vcpu->arch.nmi_injected;
+}
+
+static inline bool kvm_exception_is_soft(unsigned int nr)
+{
+ return (nr == BP_VECTOR) || (nr == OF_VECTOR);
+}
+
+static inline bool is_protmode(struct kvm_vcpu *vcpu)
+{
+ return kvm_is_cr0_bit_set(vcpu, X86_CR0_PE);
+}
+
+static inline bool is_long_mode(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ return !!(vcpu->arch.efer & EFER_LMA);
+#else
+ return false;
+#endif
+}
+
+static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
+{
+ int cs_db, cs_l;
+
+ WARN_ON_ONCE(vcpu->arch.guest_state_protected);
+
+ if (!is_long_mode(vcpu))
+ return false;
+ static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
+ return cs_l;
+}
+
+static inline bool is_64_bit_hypercall(struct kvm_vcpu *vcpu)
+{
+ /*
+ * If running with protected guest state, the CS register is not
+ * accessible. The hypercall register values will have had to be
+ * provided in 64-bit mode, so assume the guest is in 64-bit mode.
+ */
+ return vcpu->arch.guest_state_protected || is_64_bit_mode(vcpu);
+}
+
+static inline bool x86_exception_has_error_code(unsigned int vector)
+{
+ static u32 exception_has_error_code = BIT(DF_VECTOR) | BIT(TS_VECTOR) |
+ BIT(NP_VECTOR) | BIT(SS_VECTOR) | BIT(GP_VECTOR) |
+ BIT(PF_VECTOR) | BIT(AC_VECTOR);
+
+ return (1U << vector) & exception_has_error_code;
+}
+
+static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
+}
+
+static inline bool is_pae(struct kvm_vcpu *vcpu)
+{
+ return kvm_is_cr4_bit_set(vcpu, X86_CR4_PAE);
+}
+
+static inline bool is_pse(struct kvm_vcpu *vcpu)
+{
+ return kvm_is_cr4_bit_set(vcpu, X86_CR4_PSE);
+}
+
+static inline bool is_paging(struct kvm_vcpu *vcpu)
+{
+ return likely(kvm_is_cr0_bit_set(vcpu, X86_CR0_PG));
+}
+
+static inline bool is_pae_paging(struct kvm_vcpu *vcpu)
+{
+ return !is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu);
+}
+
+static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
+{
+ return kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 57 : 48;
+}
+
+static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu)
+{
+ return !__is_canonical_address(la, vcpu_virt_addr_bits(vcpu));
+}
+
+static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
+ gva_t gva, gfn_t gfn, unsigned access)
+{
+ u64 gen = kvm_memslots(vcpu->kvm)->generation;
+
+ if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
+ return;
+
+ /*
+ * If this is a shadow nested page table, the "GVA" is
+ * actually a nGPA.
+ */
+ vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 0 : gva & PAGE_MASK;
+ vcpu->arch.mmio_access = access;
+ vcpu->arch.mmio_gfn = gfn;
+ vcpu->arch.mmio_gen = gen;
+}
+
+static inline bool vcpu_match_mmio_gen(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.mmio_gen == kvm_memslots(vcpu->kvm)->generation;
+}
+
+/*
+ * Clear the mmio cache info for the given gva. If gva is MMIO_GVA_ANY, we
+ * clear all mmio cache info.
+ */
+#define MMIO_GVA_ANY (~(gva_t)0)
+
+static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ if (gva != MMIO_GVA_ANY && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
+ return;
+
+ vcpu->arch.mmio_gva = 0;
+}
+
+static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
+{
+ if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gva &&
+ vcpu->arch.mmio_gva == (gva & PAGE_MASK))
+ return true;
+
+ return false;
+}
+
+static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gfn &&
+ vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
+ return true;
+
+ return false;
+}
+
+static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg)
+{
+ unsigned long val = kvm_register_read_raw(vcpu, reg);
+
+ return is_64_bit_mode(vcpu) ? val : (u32)val;
+}
+
+static inline void kvm_register_write(struct kvm_vcpu *vcpu,
+ int reg, unsigned long val)
+{
+ if (!is_64_bit_mode(vcpu))
+ val = (u32)val;
+ return kvm_register_write_raw(vcpu, reg, val);
+}
+
+static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
+{
+ return !(kvm->arch.disabled_quirks & quirk);
+}
+
+void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
+
+u64 get_kvmclock_ns(struct kvm *kvm);
+
+int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
+ gva_t addr, void *val, unsigned int bytes,
+ struct x86_exception *exception);
+
+int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu,
+ gva_t addr, void *val, unsigned int bytes,
+ struct x86_exception *exception);
+
+int handle_ud(struct kvm_vcpu *vcpu);
+
+void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
+ struct kvm_queued_exception *ex);
+
+void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
+u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
+int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
+ int page_num);
+bool kvm_vector_hashing_enabled(void);
+void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code);
+int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
+ void *insn, int insn_len);
+int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ int emulation_type, void *insn, int insn_len);
+fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
+
+extern u64 host_xcr0;
+extern u64 host_xss;
+extern u64 host_arch_capabilities;
+
+extern struct kvm_caps kvm_caps;
+
+extern bool enable_pmu;
+
+/*
+ * Get a filtered version of KVM's supported XCR0 that strips out dynamic
+ * features for which the current process doesn't (yet) have permission to use.
+ * This is intended to be used only when enumerating support to userspace,
+ * e.g. in KVM_GET_SUPPORTED_CPUID and KVM_CAP_XSAVE2; it does NOT need to be
+ * used to check/restrict guest behavior, as KVM rejects KVM_SET_CPUID{2} if
+ * userspace attempts to enable unpermitted features.
+ */
+static inline u64 kvm_get_filtered_xcr0(void)
+{
+ u64 permitted_xcr0 = kvm_caps.supported_xcr0;
+
+ BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA);
+
+ if (permitted_xcr0 & XFEATURE_MASK_USER_DYNAMIC) {
+ permitted_xcr0 &= xstate_get_guest_group_perm();
+
+ /*
+ * Treat XTILE_CFG as unsupported if the current process isn't
+ * allowed to use XTILE_DATA, as attempting to set XTILE_CFG in
+ * XCR0 without setting XTILE_DATA is architecturally illegal.
+ */
+ if (!(permitted_xcr0 & XFEATURE_MASK_XTILE_DATA))
+ permitted_xcr0 &= ~XFEATURE_MASK_XTILE_CFG;
+ }
+ return permitted_xcr0;
+}
+
+static inline bool kvm_mpx_supported(void)
+{
+ return (kvm_caps.supported_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR))
+ == (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
+}
+
+extern unsigned int min_timer_period_us;
+
+extern bool enable_vmware_backdoor;
+
+extern int pi_inject_timer;
+
+extern bool report_ignored_msrs;
+
+extern bool eager_page_split;
+
+static inline void kvm_pr_unimpl_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "Unhandled WRMSR(0x%x) = 0x%llx\n", msr, data);
+}
+
+static inline void kvm_pr_unimpl_rdmsr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "Unhandled RDMSR(0x%x)\n", msr);
+}
+
+static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
+{
+ return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
+ vcpu->arch.virtual_tsc_shift);
+}
+
+/* Same "calling convention" as do_div:
+ * - divide (n << 32) by base
+ * - put result in n
+ * - return remainder
+ */
+#define do_shl32_div32(n, base) \
+ ({ \
+ u32 __quot, __rem; \
+ asm("divl %2" : "=a" (__quot), "=d" (__rem) \
+ : "rm" (base), "0" (0), "1" ((u32) n)); \
+ n = __quot; \
+ __rem; \
+ })
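+
+/*
+ * Usage sketch: n = 1, base = 3 leaves n == 0x55555555, i.e.
+ * (1 << 32) / 3, and returns the remainder 1. As with the underlying
+ * divl instruction, the quotient must fit in 32 bits, so callers need
+ * n < base.
+ */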
+
+static inline bool kvm_mwait_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.mwait_in_guest;
+}
+
+static inline bool kvm_hlt_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.hlt_in_guest;
+}
+
+static inline bool kvm_pause_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.pause_in_guest;
+}
+
+static inline bool kvm_cstate_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.cstate_in_guest;
+}
+
+static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm)
+{
+ return kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_ENABLED;
+}
+
+enum kvm_intr_type {
+ /* Values are arbitrary, but must be non-zero. */
+ KVM_HANDLING_IRQ = 1,
+ KVM_HANDLING_NMI,
+};
+
+static __always_inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
+ enum kvm_intr_type intr)
+{
+ WRITE_ONCE(vcpu->arch.handling_intr_from_guest, (u8)intr);
+}
+
+static __always_inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
+{
+ WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 0);
+}
+
+static inline bool kvm_handling_nmi_from_guest(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.handling_intr_from_guest == KVM_HANDLING_NMI;
+}
+
+static inline bool kvm_pat_valid(u64 data)
+{
+ if (data & 0xF8F8F8F8F8F8F8F8ull)
+ return false;
+ /* 0, 1, 4, 5, 6, 7 are valid values. */
+ return (data | ((data & 0x0202020202020202ull) << 1)) == data;
+}
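+
+/*
+ * The bit trick above rejects the two architecturally reserved memory
+ * types, 2 and 3: after the 0xF8 mask check each byte holds a 3-bit
+ * type, and the only invalid encodings are those with bit 1 set but
+ * bit 2 clear. OR-ing bit 1 shifted up into bit 2 only changes such a
+ * byte, so the equality fails exactly for types 2 and 3 and holds for
+ * 0, 1, 4, 5, 6 and 7.
+ */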
+
+static inline bool kvm_dr7_valid(u64 data)
+{
+ /* Bits [63:32] are reserved */
+ return !(data >> 32);
+}
+
+static inline bool kvm_dr6_valid(u64 data)
+{
+ /* Bits [63:32] are reserved */
+ return !(data >> 32);
+}
+
+/*
+ * Trigger a machine check on the host. We assume all the MSRs are already set
+ * up by the CPU and that we still run on the same CPU the MCE occurred on.
+ * We pass a fake environment to the machine check handler because we want
+ * the guest to always be treated like user space, no matter what context
+ * it used internally.
+ */
+static inline void kvm_machine_check(void)
+{
+#if defined(CONFIG_X86_MCE)
+ struct pt_regs regs = {
+ .cs = 3, /* Fake ring 3 no matter what the guest ran on */
+ .flags = X86_EFLAGS_IF,
+ };
+
+ do_machine_check(&regs);
+#endif
+}
+
+void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
+void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
+int kvm_spec_ctrl_test_value(u64 value);
+bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
+ struct x86_exception *e);
+int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva);
+bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type);
+
+/*
+ * Internal error codes that are used to indicate that MSR emulation encountered
+ * an error that should result in #GP in the guest, unless userspace
+ * handles it.
+ */
+#define KVM_MSR_RET_INVALID 2 /* in-kernel MSR emulation #GP condition */
+#define KVM_MSR_RET_FILTERED 3 /* #GP due to userspace MSR filter */
+
+#define __cr4_reserved_bits(__cpu_has, __c) \
+({ \
+ u64 __reserved_bits = CR4_RESERVED_BITS; \
+ \
+ if (!__cpu_has(__c, X86_FEATURE_XSAVE)) \
+ __reserved_bits |= X86_CR4_OSXSAVE; \
+ if (!__cpu_has(__c, X86_FEATURE_SMEP)) \
+ __reserved_bits |= X86_CR4_SMEP; \
+ if (!__cpu_has(__c, X86_FEATURE_SMAP)) \
+ __reserved_bits |= X86_CR4_SMAP; \
+ if (!__cpu_has(__c, X86_FEATURE_FSGSBASE)) \
+ __reserved_bits |= X86_CR4_FSGSBASE; \
+ if (!__cpu_has(__c, X86_FEATURE_PKU)) \
+ __reserved_bits |= X86_CR4_PKE; \
+ if (!__cpu_has(__c, X86_FEATURE_LA57)) \
+ __reserved_bits |= X86_CR4_LA57; \
+ if (!__cpu_has(__c, X86_FEATURE_UMIP)) \
+ __reserved_bits |= X86_CR4_UMIP; \
+ if (!__cpu_has(__c, X86_FEATURE_VMX)) \
+ __reserved_bits |= X86_CR4_VMXE; \
+ if (!__cpu_has(__c, X86_FEATURE_PCID)) \
+ __reserved_bits |= X86_CR4_PCIDE; \
+ __reserved_bits; \
+})
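+
+/*
+ * The "has" predicate is a parameter so the same table can vet both host
+ * support and guest CPUID; a sketch of the two typical invocations
+ * (the surrounding callers and variable names are assumed):
+ *
+ *	cr4_reserved = __cr4_reserved_bits(cpu_has, &boot_cpu_data);
+ *	guest_reserved = __cr4_reserved_bits(guest_cpuid_has, vcpu);
+ */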
+
+int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t src, unsigned int bytes,
+ void *dst);
+int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t src, unsigned int bytes,
+ void *dst);
+int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port, void *data, unsigned int count,
+ int in);
+
+#endif
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
new file mode 100644
index 000000000..40edf4d19
--- /dev/null
+++ b/arch/x86/kvm/xen.c
@@ -0,0 +1,2129 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
+ * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * KVM Xen emulation
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "x86.h"
+#include "xen.h"
+#include "hyperv.h"
+#include "lapic.h"
+
+#include <linux/eventfd.h>
+#include <linux/kvm_host.h>
+#include <linux/sched/stat.h>
+
+#include <trace/events/kvm.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+#include <xen/interface/version.h>
+#include <xen/interface/event_channel.h>
+#include <xen/interface/sched.h>
+
+#include <asm/xen/cpuid.h>
+
+#include "cpuid.h"
+#include "trace.h"
+
+static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm);
+static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r);
+
+DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
+
+static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
+{
+ struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+ struct pvclock_wall_clock *wc;
+ gpa_t gpa = gfn_to_gpa(gfn);
+ u32 *wc_sec_hi;
+ u32 wc_version;
+ u64 wall_nsec;
+ int ret = 0;
+ int idx = srcu_read_lock(&kvm->srcu);
+
+ if (gfn == KVM_XEN_INVALID_GFN) {
+ kvm_gpc_deactivate(gpc);
+ goto out;
+ }
+
+ do {
+ ret = kvm_gpc_activate(gpc, gpa, PAGE_SIZE);
+ if (ret)
+ goto out;
+
+ /*
+ * This code mirrors kvm_write_wall_clock() except that it writes
+ * directly through the pfn cache and doesn't mark the page dirty.
+ */
+ wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+
+ /* It could be invalid again already, so we need to check */
+ read_lock_irq(&gpc->lock);
+
+ if (gpc->valid)
+ break;
+
+ read_unlock_irq(&gpc->lock);
+ } while (1);
+
+ /* Paranoia checks on the 32-bit struct layout */
+ BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
+ BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924);
+ BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+#ifdef CONFIG_X86_64
+ /* Paranoia checks on the 64-bit struct layout */
+ BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00);
+ BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c);
+
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+ struct shared_info *shinfo = gpc->khva;
+
+ wc_sec_hi = &shinfo->wc_sec_hi;
+ wc = &shinfo->wc;
+ } else
+#endif
+ {
+ struct compat_shared_info *shinfo = gpc->khva;
+
+ wc_sec_hi = &shinfo->arch.wc_sec_hi;
+ wc = &shinfo->wc;
+ }
+
+ /* Increment and ensure an odd value */
+ wc_version = wc->version = (wc->version + 1) | 1;
+ smp_wmb();
+
+ wc->nsec = do_div(wall_nsec, 1000000000);
+ wc->sec = (u32)wall_nsec;
+ *wc_sec_hi = wall_nsec >> 32;
+ smp_wmb();
+
+ wc->version = wc_version + 1;
+ read_unlock_irq(&gpc->lock);
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);
+
+out:
+ srcu_read_unlock(&kvm->srcu, idx);
+ return ret;
+}
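+
+/*
+ * A guest-side reader pairs with the version dance above like a
+ * seqcount (a sketch of the usual pvclock-style loop, not code from
+ * this file): retry while the version is odd, i.e. an update is in
+ * flight, or changed across the reads.
+ *
+ *	do {
+ *		version = wc->version;
+ *		rmb();
+ *		sec = wc->sec;
+ *		nsec = wc->nsec;
+ *		rmb();
+ *	} while ((version & 1) || version != wc->version);
+ */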
+
+void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
+{
+ if (atomic_read(&vcpu->arch.xen.timer_pending) > 0) {
+ struct kvm_xen_evtchn e;
+
+ e.vcpu_id = vcpu->vcpu_id;
+ e.vcpu_idx = vcpu->vcpu_idx;
+ e.port = vcpu->arch.xen.timer_virq;
+ e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+ kvm_xen_set_evtchn(&e, vcpu->kvm);
+
+ vcpu->arch.xen.timer_expires = 0;
+ atomic_set(&vcpu->arch.xen.timer_pending, 0);
+ }
+}
+
+static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
+{
+ struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu,
+ arch.xen.timer);
+ if (atomic_read(&vcpu->arch.xen.timer_pending))
+ return HRTIMER_NORESTART;
+
+ atomic_inc(&vcpu->arch.xen.timer_pending);
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+ kvm_vcpu_kick(vcpu);
+
+ return HRTIMER_NORESTART;
+}
+
+static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns)
+{
+ atomic_set(&vcpu->arch.xen.timer_pending, 0);
+ vcpu->arch.xen.timer_expires = guest_abs;
+
+ if (delta_ns <= 0) {
+ xen_timer_callback(&vcpu->arch.xen.timer);
+ } else {
+ ktime_t ktime_now = ktime_get();
+ hrtimer_start(&vcpu->arch.xen.timer,
+ ktime_add_ns(ktime_now, delta_ns),
+ HRTIMER_MODE_ABS_HARD);
+ }
+}
+
+static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu)
+{
+ hrtimer_cancel(&vcpu->arch.xen.timer);
+ vcpu->arch.xen.timer_expires = 0;
+ atomic_set(&vcpu->arch.xen.timer_pending, 0);
+}
+
+static void kvm_xen_init_timer(struct kvm_vcpu *vcpu)
+{
+ hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_HARD);
+ vcpu->arch.xen.timer.function = xen_timer_callback;
+}
+
+static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
+{
+ struct kvm_vcpu_xen *vx = &v->arch.xen;
+ struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache;
+ struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache;
+ size_t user_len, user_len1, user_len2;
+ struct vcpu_runstate_info rs;
+ unsigned long flags;
+ size_t times_ofs;
+ uint8_t *update_bit = NULL;
+ uint64_t entry_time;
+ uint64_t *rs_times;
+ int *rs_state;
+
+ /*
+ * The only difference between the 32-bit and 64-bit versions of the
+ * runstate struct is the alignment of uint64_t in 32-bit, which
+ * means that the 64-bit version has an additional 4 bytes of
+ * padding after the first field 'state'. Let's be really
+ * paranoid about that, and match it against the internal data
+ * structures that we memcpy into it...
+ */
+ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
+ BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
+ BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
+#ifdef CONFIG_X86_64
+ /*
+ * The 64-bit structure has 4 bytes of padding before 'state_entry_time'
+ * so each subsequent field is shifted by 4, and it's 4 bytes longer.
+ */
+ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
+ offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
+ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
+ offsetof(struct compat_vcpu_runstate_info, time) + 4);
+ BUILD_BUG_ON(sizeof(struct vcpu_runstate_info) != 0x2c + 4);
+#endif
+ /*
+ * The state field is in the same place at the start of both structs,
+ * and is the same size (int) as vx->current_runstate.
+ */
+ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
+ offsetof(struct compat_vcpu_runstate_info, state));
+ BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) !=
+ sizeof(vx->current_runstate));
+ BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
+ sizeof(vx->current_runstate));
+
+ /*
+ * The state_entry_time field is 64 bits in both versions, and the
+ * XEN_RUNSTATE_UPDATE flag is in the top bit, which given that x86
+ * is little-endian means that it's in the last *byte* of the word.
+ * That detail is important later.
+ */
+ BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
+ sizeof(uint64_t));
+ BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
+ sizeof(uint64_t));
+ BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80);
+
+ /*
+ * The time array is four 64-bit quantities in both versions, matching
+ * the vx->runstate_times and immediately following state_entry_time.
+ */
+ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
+ offsetof(struct vcpu_runstate_info, time) - sizeof(uint64_t));
+ BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
+ offsetof(struct compat_vcpu_runstate_info, time) - sizeof(uint64_t));
+ BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
+ sizeof_field(struct compat_vcpu_runstate_info, time));
+ BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
+ sizeof(vx->runstate_times));
+
+ if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
+ user_len = sizeof(struct vcpu_runstate_info);
+ times_ofs = offsetof(struct vcpu_runstate_info,
+ state_entry_time);
+ } else {
+ user_len = sizeof(struct compat_vcpu_runstate_info);
+ times_ofs = offsetof(struct compat_vcpu_runstate_info,
+ state_entry_time);
+ }
+
+ /*
+ * There are basically no alignment constraints. The guest can set it
+ * up so it crosses from one page to the next, and at arbitrary byte
+ * alignment (and the 32-bit ABI doesn't align the 64-bit integers
+ * anyway, even if the overall struct had been 64-bit aligned).
+ */
+ if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) {
+ user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK);
+ user_len2 = user_len - user_len1;
+ } else {
+ user_len1 = user_len;
+ user_len2 = 0;
+ }
+ BUG_ON(user_len1 + user_len2 != user_len);
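+
+ /*
+ * Worked example with a hypothetical GPA: a 64-bit runstate area of
+ * 0x30 bytes starting at page offset 0xff8 crosses the boundary, so
+ * user_len1 = 0x1000 - 0xff8 = 8 bytes on the first page and
+ * user_len2 = 0x30 - 8 = 0x28 bytes on the second.
+ */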
+
+ retry:
+ /*
+ * Attempt to obtain the GPC lock on *both* (if there are two)
+ * gfn_to_pfn caches that cover the region.
+ */
+ if (atomic) {
+ local_irq_save(flags);
+ if (!read_trylock(&gpc1->lock)) {
+ local_irq_restore(flags);
+ return;
+ }
+ } else {
+ read_lock_irqsave(&gpc1->lock, flags);
+ }
+ while (!kvm_gpc_check(gpc1, user_len1)) {
+ read_unlock_irqrestore(&gpc1->lock, flags);
+
+ /* When invoked from kvm_sched_out() we cannot sleep */
+ if (atomic)
+ return;
+
+ if (kvm_gpc_refresh(gpc1, user_len1))
+ return;
+
+ read_lock_irqsave(&gpc1->lock, flags);
+ }
+
+ if (likely(!user_len2)) {
+ /*
+ * Set up three pointers directly to the runstate_info
+ * struct in the guest (via the GPC).
+ *
+ * • @rs_state → state field
+ * • @rs_times → state_entry_time field
+ * • @update_bit → last byte of state_entry_time, which
+ * contains the XEN_RUNSTATE_UPDATE bit.
+ */
+ rs_state = gpc1->khva;
+ rs_times = gpc1->khva + times_ofs;
+ if (v->kvm->arch.xen.runstate_update_flag)
+ update_bit = ((void *)(&rs_times[1])) - 1;
+ } else {
+ /*
+ * The guest's runstate_info is split across two pages and we
+ * need to hold and validate both GPCs simultaneously. We can
+ * declare a lock ordering GPC1 > GPC2 because nothing else
+ * takes them more than one at a time. Set a subclass on the
+ * gpc1 lock to make lockdep shut up about it.
+ */
+ lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_);
+ if (atomic) {
+ if (!read_trylock(&gpc2->lock)) {
+ read_unlock_irqrestore(&gpc1->lock, flags);
+ return;
+ }
+ } else {
+ read_lock(&gpc2->lock);
+ }
+
+ if (!kvm_gpc_check(gpc2, user_len2)) {
+ read_unlock(&gpc2->lock);
+ read_unlock_irqrestore(&gpc1->lock, flags);
+
+ /* When invoked from kvm_sched_out() we cannot sleep */
+ if (atomic)
+ return;
+
+ /*
+ * Use kvm_gpc_activate() here because the second GPC
+ * won't have been set up yet if the runstate area was
+ * configured in 32-bit mode and only extends onto the
+ * second page now that the guest has switched to
+ * 64-bit mode.
+ */
+ if (kvm_gpc_activate(gpc2, gpc1->gpa + user_len1,
+ user_len2))
+ return;
+
+ /*
+ * We dropped the lock on GPC1 so we have to go all the
+ * way back and revalidate that too.
+ */
+ goto retry;
+ }
+
+ /*
+ * In this case, the runstate_info struct will be assembled on
+ * the kernel stack (compat or not as appropriate) and will
+ * be copied to GPC1/GPC2 with a dual memcpy. Set up the three
+ * rs pointers accordingly.
+ */
+ rs_times = &rs.state_entry_time;
+
+ /*
+ * The rs_state pointer points to the start of what we'll
+ * copy to the guest, which in the case of a compat guest
+ * is the 32-bit field that the compiler thinks is padding.
+ */
+ rs_state = ((void *)rs_times) - times_ofs;
+
+ /*
+ * The update_bit is still directly in the guest memory,
+ * via one GPC or the other.
+ */
+ if (v->kvm->arch.xen.runstate_update_flag) {
+ if (user_len1 >= times_ofs + sizeof(uint64_t))
+ update_bit = gpc1->khva + times_ofs +
+ sizeof(uint64_t) - 1;
+ else
+ update_bit = gpc2->khva + times_ofs +
+ sizeof(uint64_t) - 1 - user_len1;
+ }
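+
+ /*
+ * Continuing the hypothetical split example: for a 64-bit guest
+ * times_ofs is 8, so the flag byte sits at offset 8 + 8 - 1 = 15
+ * within the struct. With user_len1 = 8 that lands on the second
+ * page, at gpc2->khva + 15 - 8 = gpc2->khva + 7.
+ */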
+
+#ifdef CONFIG_X86_64
+ /*
+ * Don't leak kernel memory through the padding in the 64-bit
+ * version of the struct.
+ */
+ memset(&rs, 0, offsetof(struct vcpu_runstate_info, state_entry_time));
+#endif
+ }
+
+ /*
+ * First, set the XEN_RUNSTATE_UPDATE bit in the top bit of the
+ * state_entry_time field, directly in the guest. We need to set
+ * that (and write-barrier) before writing to the rest of the
+ * structure, and clear it last. Just as Xen does, we address the
+ * single *byte* in which it resides because it might be in a
+ * different cache line to the rest of the 64-bit word, due to
+ * the (lack of) alignment constraints.
+ */
+ entry_time = vx->runstate_entry_time;
+ if (update_bit) {
+ entry_time |= XEN_RUNSTATE_UPDATE;
+ *update_bit = (vx->runstate_entry_time | XEN_RUNSTATE_UPDATE) >> 56;
+ smp_wmb();
+ }
+
+ /*
+ * Now assemble the actual structure, either on our kernel stack
+ * or directly in the guest according to how the rs_state and
+ * rs_times pointers were set up above.
+ */
+ *rs_state = vx->current_runstate;
+ rs_times[0] = entry_time;
+ memcpy(rs_times + 1, vx->runstate_times, sizeof(vx->runstate_times));
+
+ /* For the split case, we have to then copy it to the guest. */
+ if (user_len2) {
+ memcpy(gpc1->khva, rs_state, user_len1);
+ memcpy(gpc2->khva, ((void *)rs_state) + user_len1, user_len2);
+ }
+ smp_wmb();
+
+ /* Finally, clear the XEN_RUNSTATE_UPDATE bit. */
+ if (update_bit) {
+ entry_time &= ~XEN_RUNSTATE_UPDATE;
+ *update_bit = entry_time >> 56;
+ smp_wmb();
+ }
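+
+ /*
+ * For illustration, the guest side is expected to pair with the
+ * above update protocol roughly as Linux does in
+ * xen_get_runstate_snapshot():
+ *
+ * do {
+ * entry_time = runstate->state_entry_time;
+ * rmb();
+ * memcpy(&snap, runstate, sizeof(snap));
+ * rmb();
+ * } while (entry_time != runstate->state_entry_time ||
+ * (entry_time & XEN_RUNSTATE_UPDATE));
+ */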
+
+ if (user_len2)
+ read_unlock(&gpc2->lock);
+
+ read_unlock_irqrestore(&gpc1->lock, flags);
+
+ mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT);
+ if (user_len2)
+ mark_page_dirty_in_slot(v->kvm, gpc2->memslot, gpc2->gpa >> PAGE_SHIFT);
+}
+
+void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
+{
+ struct kvm_vcpu_xen *vx = &v->arch.xen;
+ u64 now = get_kvmclock_ns(v->kvm);
+ u64 delta_ns = now - vx->runstate_entry_time;
+ u64 run_delay = current->sched_info.run_delay;
+
+ if (unlikely(!vx->runstate_entry_time))
+ vx->current_runstate = RUNSTATE_offline;
+
+ /*
+ * Time waiting for the scheduler isn't "stolen" if the
+ * vCPU wasn't running anyway.
+ */
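+ /*
+ * Worked example: if 10ms of kvmclock time elapsed since the last
+ * transition and the task accrued 2ms of scheduler run-delay while
+ * in RUNSTATE_running, 2ms is charged to the runnable ("stolen")
+ * bucket and the remaining 8ms to the running bucket.
+ */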
+ if (vx->current_runstate == RUNSTATE_running) {
+ u64 steal_ns = run_delay - vx->last_steal;
+
+ delta_ns -= steal_ns;
+
+ vx->runstate_times[RUNSTATE_runnable] += steal_ns;
+ }
+ vx->last_steal = run_delay;
+
+ vx->runstate_times[vx->current_runstate] += delta_ns;
+ vx->current_runstate = state;
+ vx->runstate_entry_time = now;
+
+ if (vx->runstate_cache.active)
+ kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable);
+}
+
+static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
+{
+ struct kvm_lapic_irq irq = { };
+ int r;
+
+ irq.dest_id = v->vcpu_id;
+ irq.vector = v->arch.xen.upcall_vector;
+ irq.dest_mode = APIC_DEST_PHYSICAL;
+ irq.shorthand = APIC_DEST_NOSHORT;
+ irq.delivery_mode = APIC_DM_FIXED;
+ irq.level = 1;
+
+ /* The fast version will always work for physical unicast */
+ WARN_ON_ONCE(!kvm_irq_delivery_to_apic_fast(v->kvm, NULL, &irq, &r, NULL));
+}
+
+/*
+ * On event channel delivery, the vcpu_info may not have been accessible.
+ * In that case, there are bits in vcpu->arch.xen.evtchn_pending_sel which
+ * need to be propagated into the vcpu_info (and evtchn_upcall_pending set).
+ * Do so now that we can sleep in the context of the vCPU to bring the
+ * page in, and refresh the pfn cache for it.
+ */
+void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
+{
+ unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
+ struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
+ unsigned long flags;
+
+ if (!evtchn_pending_sel)
+ return;
+
+ /*
+ * Yes, this is an open-coded loop. But that's just what put_user()
+ * does anyway. Page it in and retry the instruction. We're just a
+ * little more honest about it.
+ */
+ read_lock_irqsave(&gpc->lock, flags);
+ while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info)))
+ return;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ }
+
+ /* Now gpc->khva is a valid kernel address for the vcpu_info */
+ if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
+ struct vcpu_info *vi = gpc->khva;
+
+ asm volatile(LOCK_PREFIX "orq %0, %1\n"
+ "notq %0\n"
+ LOCK_PREFIX "andq %0, %2\n"
+ : "=r" (evtchn_pending_sel),
+ "+m" (vi->evtchn_pending_sel),
+ "+m" (v->arch.xen.evtchn_pending_sel)
+ : "0" (evtchn_pending_sel));
+ WRITE_ONCE(vi->evtchn_upcall_pending, 1);
+ } else {
+ u32 evtchn_pending_sel32 = evtchn_pending_sel;
+ struct compat_vcpu_info *vi = gpc->khva;
+
+ asm volatile(LOCK_PREFIX "orl %0, %1\n"
+ "notl %0\n"
+ LOCK_PREFIX "andl %0, %2\n"
+ : "=r" (evtchn_pending_sel32),
+ "+m" (vi->evtchn_pending_sel),
+ "+m" (v->arch.xen.evtchn_pending_sel)
+ : "0" (evtchn_pending_sel32));
+ WRITE_ONCE(vi->evtchn_upcall_pending, 1);
+ }
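+
+ /*
+ * Either way, the asm is (for illustration) the atomic equivalent of:
+ *
+ * vi->evtchn_pending_sel |= evtchn_pending_sel;
+ * v->arch.xen.evtchn_pending_sel &= ~evtchn_pending_sel;
+ *
+ * with the LOCK prefixes ensuring that bits set concurrently in the
+ * in-kernel shadow word by kvm_xen_set_evtchn_fast() are not lost.
+ */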
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ /* For the per-vCPU lapic vector, deliver it as MSI. */
+ if (v->arch.xen.upcall_vector)
+ kvm_xen_inject_vcpu_vector(v);
+
+ mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+}
+
+int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
+{
+ struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
+ unsigned long flags;
+ u8 rc = 0;
+
+ /*
+ * If the global upcall vector (HVMIRQ_callback_vector) is set and
+ * the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending.
+ */
+
+ /* No need for compat handling here */
+ BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
+ offsetof(struct compat_vcpu_info, evtchn_upcall_pending));
+ BUILD_BUG_ON(sizeof(rc) !=
+ sizeof_field(struct vcpu_info, evtchn_upcall_pending));
+ BUILD_BUG_ON(sizeof(rc) !=
+ sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
+
+ read_lock_irqsave(&gpc->lock, flags);
+ while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ /*
+ * This function gets called from kvm_vcpu_block() after setting the
+ * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
+ * from a HLT. So we really mustn't sleep. If the page ended up absent
+ * at that point, just return 1 in order to trigger an immediate wake,
+ * and we'll end up getting called again from a context where we *can*
+ * fault in the page and wait for it.
+ */
+ if (in_atomic() || !task_is_running(current))
+ return 1;
+
+ if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) {
+ /*
+ * If this failed, userspace has screwed up the
+ * vcpu_info mapping. No interrupts for you.
+ */
+ return 0;
+ }
+ read_lock_irqsave(&gpc->lock, flags);
+ }
+
+ rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending;
+ read_unlock_irqrestore(&gpc->lock, flags);
+ return rc;
+}
+
+int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
+{
+ int r = -ENOENT;
+
+ switch (data->type) {
+ case KVM_XEN_ATTR_TYPE_LONG_MODE:
+ if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
+ r = -EINVAL;
+ } else {
+ mutex_lock(&kvm->arch.xen.xen_lock);
+ kvm->arch.xen.long_mode = !!data->u.long_mode;
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ r = 0;
+ }
+ break;
+
+ case KVM_XEN_ATTR_TYPE_SHARED_INFO:
+ mutex_lock(&kvm->arch.xen.xen_lock);
+ r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ break;
+
+ case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
+ if (data->u.vector && data->u.vector < 0x10)
+ r = -EINVAL;
+ else {
+ mutex_lock(&kvm->arch.xen.xen_lock);
+ kvm->arch.xen.upcall_vector = data->u.vector;
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ r = 0;
+ }
+ break;
+
+ case KVM_XEN_ATTR_TYPE_EVTCHN:
+ r = kvm_xen_setattr_evtchn(kvm, data);
+ break;
+
+ case KVM_XEN_ATTR_TYPE_XEN_VERSION:
+ mutex_lock(&kvm->arch.xen.xen_lock);
+ kvm->arch.xen.xen_version = data->u.xen_version;
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ r = 0;
+ break;
+
+ case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ mutex_lock(&kvm->arch.xen.xen_lock);
+ kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag;
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ r = 0;
+ break;
+
+ default:
+ break;
+ }
+
+ return r;
+}
+
+int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
+{
+ int r = -ENOENT;
+
+ mutex_lock(&kvm->arch.xen.xen_lock);
+
+ switch (data->type) {
+ case KVM_XEN_ATTR_TYPE_LONG_MODE:
+ data->u.long_mode = kvm->arch.xen.long_mode;
+ r = 0;
+ break;
+
+ case KVM_XEN_ATTR_TYPE_SHARED_INFO:
+ if (kvm->arch.xen.shinfo_cache.active)
+ data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
+ else
+ data->u.shared_info.gfn = KVM_XEN_INVALID_GFN;
+ r = 0;
+ break;
+
+ case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
+ data->u.vector = kvm->arch.xen.upcall_vector;
+ r = 0;
+ break;
+
+ case KVM_XEN_ATTR_TYPE_XEN_VERSION:
+ data->u.xen_version = kvm->arch.xen.xen_version;
+ r = 0;
+ break;
+
+ case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ data->u.runstate_update_flag = kvm->arch.xen.runstate_update_flag;
+ r = 0;
+ break;
+
+ default:
+ break;
+ }
+
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ return r;
+}
+
+int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
+{
+ int idx, r = -ENOENT;
+
+ mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ switch (data->type) {
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
+ /* No compat necessary here. */
+ BUILD_BUG_ON(sizeof(struct vcpu_info) !=
+ sizeof(struct compat_vcpu_info));
+ BUILD_BUG_ON(offsetof(struct vcpu_info, time) !=
+ offsetof(struct compat_vcpu_info, time));
+
+ if (data->u.gpa == KVM_XEN_INVALID_GPA) {
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
+ r = 0;
+ break;
+ }
+
+ r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache,
+ data->u.gpa, sizeof(struct vcpu_info));
+ if (!r)
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
+ if (data->u.gpa == KVM_XEN_INVALID_GPA) {
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache);
+ r = 0;
+ break;
+ }
+
+ r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_time_info_cache,
+ data->u.gpa,
+ sizeof(struct pvclock_vcpu_time_info));
+ if (!r)
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: {
+ size_t sz, sz1, sz2;
+
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ if (data->u.gpa == KVM_XEN_INVALID_GPA) {
+ r = 0;
+ deactivate_out:
+ kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
+ break;
+ }
+
+ /*
+ * If the guest switches to 64-bit mode after setting the runstate
+ * address, that's actually OK. kvm_xen_update_runstate_guest()
+ * will cope.
+ */
+ if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode)
+ sz = sizeof(struct vcpu_runstate_info);
+ else
+ sz = sizeof(struct compat_vcpu_runstate_info);
+
+ /* How much fits in the (first) page? */
+ sz1 = PAGE_SIZE - (data->u.gpa & ~PAGE_MASK);
+ r = kvm_gpc_activate(&vcpu->arch.xen.runstate_cache,
+ data->u.gpa, sz1);
+ if (r)
+ goto deactivate_out;
+
+ /* Either map the second page, or deactivate the second GPC */
+ if (sz1 >= sz) {
+ kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
+ } else {
+ sz2 = sz - sz1;
+ BUG_ON((data->u.gpa + sz1) & ~PAGE_MASK);
+ r = kvm_gpc_activate(&vcpu->arch.xen.runstate2_cache,
+ data->u.gpa + sz1, sz2);
+ if (r)
+ goto deactivate_out;
+ }
+
+ kvm_xen_update_runstate_guest(vcpu, false);
+ break;
+ }
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ if (data->u.runstate.state > RUNSTATE_offline) {
+ r = -EINVAL;
+ break;
+ }
+
+ kvm_xen_update_runstate(vcpu, data->u.runstate.state);
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ if (data->u.runstate.state > RUNSTATE_offline) {
+ r = -EINVAL;
+ break;
+ }
+ if (data->u.runstate.state_entry_time !=
+ (data->u.runstate.time_running +
+ data->u.runstate.time_runnable +
+ data->u.runstate.time_blocked +
+ data->u.runstate.time_offline)) {
+ r = -EINVAL;
+ break;
+ }
+ if (get_kvmclock_ns(vcpu->kvm) <
+ data->u.runstate.state_entry_time) {
+ r = -EINVAL;
+ break;
+ }
+
+ vcpu->arch.xen.current_runstate = data->u.runstate.state;
+ vcpu->arch.xen.runstate_entry_time =
+ data->u.runstate.state_entry_time;
+ vcpu->arch.xen.runstate_times[RUNSTATE_running] =
+ data->u.runstate.time_running;
+ vcpu->arch.xen.runstate_times[RUNSTATE_runnable] =
+ data->u.runstate.time_runnable;
+ vcpu->arch.xen.runstate_times[RUNSTATE_blocked] =
+ data->u.runstate.time_blocked;
+ vcpu->arch.xen.runstate_times[RUNSTATE_offline] =
+ data->u.runstate.time_offline;
+ vcpu->arch.xen.last_steal = current->sched_info.run_delay;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ if (data->u.runstate.state > RUNSTATE_offline &&
+ data->u.runstate.state != (u64)-1) {
+ r = -EINVAL;
+ break;
+ }
+ /* The adjustment must add up */
+ if (data->u.runstate.state_entry_time !=
+ (data->u.runstate.time_running +
+ data->u.runstate.time_runnable +
+ data->u.runstate.time_blocked +
+ data->u.runstate.time_offline)) {
+ r = -EINVAL;
+ break;
+ }
+
+ if (get_kvmclock_ns(vcpu->kvm) <
+ (vcpu->arch.xen.runstate_entry_time +
+ data->u.runstate.state_entry_time)) {
+ r = -EINVAL;
+ break;
+ }
+
+ vcpu->arch.xen.runstate_entry_time +=
+ data->u.runstate.state_entry_time;
+ vcpu->arch.xen.runstate_times[RUNSTATE_running] +=
+ data->u.runstate.time_running;
+ vcpu->arch.xen.runstate_times[RUNSTATE_runnable] +=
+ data->u.runstate.time_runnable;
+ vcpu->arch.xen.runstate_times[RUNSTATE_blocked] +=
+ data->u.runstate.time_blocked;
+ vcpu->arch.xen.runstate_times[RUNSTATE_offline] +=
+ data->u.runstate.time_offline;
+
+ if (data->u.runstate.state <= RUNSTATE_offline)
+ kvm_xen_update_runstate(vcpu, data->u.runstate.state);
+ else if (vcpu->arch.xen.runstate_cache.active)
+ kvm_xen_update_runstate_guest(vcpu, false);
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
+ if (data->u.vcpu_id >= KVM_MAX_VCPUS)
+ r = -EINVAL;
+ else {
+ vcpu->arch.xen.vcpu_id = data->u.vcpu_id;
+ r = 0;
+ }
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
+ if (data->u.timer.port &&
+ data->u.timer.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) {
+ r = -EINVAL;
+ break;
+ }
+
+ if (!vcpu->arch.xen.timer.function)
+ kvm_xen_init_timer(vcpu);
+
+ /* Stop the timer (if it's running) before changing the vector */
+ kvm_xen_stop_timer(vcpu);
+ vcpu->arch.xen.timer_virq = data->u.timer.port;
+
+ /* Start the timer if the new value has a valid vector+expiry. */
+ if (data->u.timer.port && data->u.timer.expires_ns)
+ kvm_xen_start_timer(vcpu, data->u.timer.expires_ns,
+ data->u.timer.expires_ns -
+ get_kvmclock_ns(vcpu->kvm));
+
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
+ if (data->u.vector && data->u.vector < 0x10)
+ r = -EINVAL;
+ else {
+ vcpu->arch.xen.upcall_vector = data->u.vector;
+ r = 0;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
+ return r;
+}
+
+int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
+{
+ int r = -ENOENT;
+
+ mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
+
+ switch (data->type) {
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
+ if (vcpu->arch.xen.vcpu_info_cache.active)
+ data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
+ else
+ data->u.gpa = KVM_XEN_INVALID_GPA;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
+ if (vcpu->arch.xen.vcpu_time_info_cache.active)
+ data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
+ else
+ data->u.gpa = KVM_XEN_INVALID_GPA;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ if (vcpu->arch.xen.runstate_cache.active) {
+ data->u.gpa = vcpu->arch.xen.runstate_cache.gpa;
+ r = 0;
+ }
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ data->u.runstate.state = vcpu->arch.xen.current_runstate;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ data->u.runstate.state = vcpu->arch.xen.current_runstate;
+ data->u.runstate.state_entry_time =
+ vcpu->arch.xen.runstate_entry_time;
+ data->u.runstate.time_running =
+ vcpu->arch.xen.runstate_times[RUNSTATE_running];
+ data->u.runstate.time_runnable =
+ vcpu->arch.xen.runstate_times[RUNSTATE_runnable];
+ data->u.runstate.time_blocked =
+ vcpu->arch.xen.runstate_times[RUNSTATE_blocked];
+ data->u.runstate.time_offline =
+ vcpu->arch.xen.runstate_times[RUNSTATE_offline];
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
+ r = -EINVAL;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
+ data->u.vcpu_id = vcpu->arch.xen.vcpu_id;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
+ data->u.timer.port = vcpu->arch.xen.timer_virq;
+ data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+ data->u.timer.expires_ns = vcpu->arch.xen.timer_expires;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
+ data->u.vector = vcpu->arch.xen.upcall_vector;
+ r = 0;
+ break;
+
+ default:
+ break;
+ }
+
+ mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
+ return r;
+}
+
+int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u32 page_num = data & ~PAGE_MASK;
+ u64 page_addr = data & PAGE_MASK;
+ bool lm = is_long_mode(vcpu);
+
+ /* Latch long_mode for shared_info pages etc. */
+ vcpu->kvm->arch.xen.long_mode = lm;
+
+ /*
+ * If Xen hypercall intercept is enabled, fill the hypercall
+ * page with VMCALL/VMMCALL instructions since that's what
+ * we catch. Else the VMM has provided the hypercall pages
+ * with instructions of its own choosing, so use those.
+ */
+ if (kvm_xen_hypercall_enabled(kvm)) {
+ u8 instructions[32];
+ int i;
+
+ if (page_num)
+ return 1;
+
+ /* mov imm32, %eax */
+ instructions[0] = 0xb8;
+
+ /* vmcall / vmmcall */
+ static_call(kvm_x86_patch_hypercall)(vcpu, instructions + 5);
+
+ /* ret */
+ instructions[8] = 0xc3;
+
+ /* int3 to pad */
+ memset(instructions + 9, 0xcc, sizeof(instructions) - 9);
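+
+ /*
+ * Each 32-byte slot then looks like this (VMX shown; on SVM the
+ * patched instruction is vmmcall, 0f 01 d9):
+ *
+ * b8 xx xx xx xx mov $i, %eax
+ * 0f 01 c1 vmcall
+ * c3 ret
+ * cc cc ... cc int3 padding to 32 bytes
+ *
+ * giving PAGE_SIZE / 32 = 128 hypercall entry points in the page.
+ */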
+
+ for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) {
+ *(u32 *)&instructions[1] = i;
+ if (kvm_vcpu_write_guest(vcpu,
+ page_addr + (i * sizeof(instructions)),
+ instructions, sizeof(instructions)))
+ return 1;
+ }
+ } else {
+ /*
+ * Note, truncation is a non-issue as 'lm' is guaranteed to be
+ * false for a 32-bit kernel, i.e. when hva_t is only 4 bytes.
+ */
+ hva_t blob_addr = lm ? kvm->arch.xen_hvm_config.blob_addr_64
+ : kvm->arch.xen_hvm_config.blob_addr_32;
+ u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
+ : kvm->arch.xen_hvm_config.blob_size_32;
+ u8 *page;
+ int ret;
+
+ if (page_num >= blob_size)
+ return 1;
+
+ blob_addr += page_num * PAGE_SIZE;
+
+ page = memdup_user((u8 __user *)blob_addr, PAGE_SIZE);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ ret = kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE);
+ kfree(page);
+ if (ret)
+ return 1;
+ }
+ return 0;
+}
+
+int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
+{
+ /* Only some feature flags need to be *enabled* by userspace */
+ u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
+ KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
+
+ if (xhc->flags & ~permitted_flags)
+ return -EINVAL;
+
+ /*
+ * With hypercall interception the kernel generates its own
+ * hypercall page so it must not be provided.
+ */
+ if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) &&
+ (xhc->blob_addr_32 || xhc->blob_addr_64 ||
+ xhc->blob_size_32 || xhc->blob_size_64))
+ return -EINVAL;
+
+ mutex_lock(&kvm->arch.xen.xen_lock);
+
+ if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
+ static_branch_inc(&kvm_xen_enabled.key);
+ else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
+ static_branch_slow_dec_deferred(&kvm_xen_enabled);
+
+ memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));
+
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ return 0;
+}
+
+static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
+{
+ kvm_rax_write(vcpu, result);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.xen.hypercall_rip)))
+ return 1;
+
+ return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
+}
+
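+/*
+ * With the 2-level event channel ABI, these limits work out to
+ * 64 * 64 = 4096 ports for 64-bit guests and 32 * 32 = 1024 ports for
+ * 32-bit (compat) guests.
+ */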
+static inline int max_evtchn_port(struct kvm *kvm)
+{
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
+ return EVTCHN_2L_NR_CHANNELS;
+ else
+ return COMPAT_EVTCHN_2L_NR_CHANNELS;
+}
+
+static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
+ evtchn_port_t *ports)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+ unsigned long *pending_bits;
+ unsigned long flags;
+ bool ret = true;
+ int idx, i;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ read_lock_irqsave(&gpc->lock, flags);
+ if (!kvm_gpc_check(gpc, PAGE_SIZE))
+ goto out_rcu;
+
+ ret = false;
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+ struct shared_info *shinfo = gpc->khva;
+ pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+ } else {
+ struct compat_shared_info *shinfo = gpc->khva;
+ pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+ }
+
+ for (i = 0; i < nr_ports; i++) {
+ if (test_bit(ports[i], pending_bits)) {
+ ret = true;
+ break;
+ }
+ }
+
+ out_rcu:
+ read_unlock_irqrestore(&gpc->lock, flags);
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return ret;
+}
+
+static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
+ u64 param, u64 *r)
+{
+ struct sched_poll sched_poll;
+ evtchn_port_t port, *ports;
+ struct x86_exception e;
+ int i;
+
+ if (!lapic_in_kernel(vcpu) ||
+ !(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND))
+ return false;
+
+ if (IS_ENABLED(CONFIG_64BIT) && !longmode) {
+ struct compat_sched_poll sp32;
+
+ /* Sanity check that the compat struct definition is correct */
+ BUILD_BUG_ON(sizeof(sp32) != 16);
+
+ if (kvm_read_guest_virt(vcpu, param, &sp32, sizeof(sp32), &e)) {
+ *r = -EFAULT;
+ return true;
+ }
+
+ /*
+ * This is a 32-bit pointer to an array of evtchn_port_t which
+ * are uint32_t, so once it's converted no further compat
+ * handling is needed.
+ */
+ sched_poll.ports = (void *)(unsigned long)(sp32.ports);
+ sched_poll.nr_ports = sp32.nr_ports;
+ sched_poll.timeout = sp32.timeout;
+ } else {
+ if (kvm_read_guest_virt(vcpu, param, &sched_poll,
+ sizeof(sched_poll), &e)) {
+ *r = -EFAULT;
+ return true;
+ }
+ }
+
+ if (unlikely(sched_poll.nr_ports > 1)) {
+ /* Xen (unofficially) limits the number of pollers to 128 */
+ if (sched_poll.nr_ports > 128) {
+ *r = -EINVAL;
+ return true;
+ }
+
+ ports = kmalloc_array(sched_poll.nr_ports,
+ sizeof(*ports), GFP_KERNEL);
+ if (!ports) {
+ *r = -ENOMEM;
+ return true;
+ }
+ } else
+ ports = &port;
+
+ if (kvm_read_guest_virt(vcpu, (gva_t)sched_poll.ports, ports,
+ sched_poll.nr_ports * sizeof(*ports), &e)) {
+ *r = -EFAULT;
+ return true;
+ }
+
+ for (i = 0; i < sched_poll.nr_ports; i++) {
+ if (ports[i] >= max_evtchn_port(vcpu->kvm)) {
+ *r = -EINVAL;
+ goto out;
+ }
+ }
+
+ if (sched_poll.nr_ports == 1)
+ vcpu->arch.xen.poll_evtchn = port;
+ else
+ vcpu->arch.xen.poll_evtchn = -1;
+
+ set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);
+
+ if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) {
+ vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
+
+ if (sched_poll.timeout)
+ mod_timer(&vcpu->arch.xen.poll_timer,
+ jiffies + nsecs_to_jiffies(sched_poll.timeout));
+
+ kvm_vcpu_halt(vcpu);
+
+ if (sched_poll.timeout)
+ del_timer(&vcpu->arch.xen.poll_timer);
+
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ }
+
+ vcpu->arch.xen.poll_evtchn = 0;
+ *r = 0;
+out:
+ /* Really, this is only needed in case of timeout */
+ clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);
+
+ if (unlikely(sched_poll.nr_ports > 1))
+ kfree(ports);
+ return true;
+}
+
+static void cancel_evtchn_poll(struct timer_list *t)
+{
+ struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.xen.poll_timer);
+
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+ kvm_vcpu_kick(vcpu);
+}
+
+static bool kvm_xen_hcall_sched_op(struct kvm_vcpu *vcpu, bool longmode,
+ int cmd, u64 param, u64 *r)
+{
+ switch (cmd) {
+ case SCHEDOP_poll:
+ if (kvm_xen_schedop_poll(vcpu, longmode, param, r))
+ return true;
+ fallthrough;
+ case SCHEDOP_yield:
+ kvm_vcpu_on_spin(vcpu, true);
+ *r = 0;
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+struct compat_vcpu_set_singleshot_timer {
+ uint64_t timeout_abs_ns;
+ uint32_t flags;
+} __attribute__((packed));
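+
+/*
+ * Note: the packed compat struct is 12 bytes, while the native
+ * struct vcpu_set_singleshot_timer is padded out to 16 bytes, which
+ * is why kvm_xen_hcall_vcpu_op() below picks the copy size by mode.
+ */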
+
+static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
+ int vcpu_id, u64 param, u64 *r)
+{
+ struct vcpu_set_singleshot_timer oneshot;
+ struct x86_exception e;
+ s64 delta;
+
+ if (!kvm_xen_timer_enabled(vcpu))
+ return false;
+
+ switch (cmd) {
+ case VCPUOP_set_singleshot_timer:
+ if (vcpu->arch.xen.vcpu_id != vcpu_id) {
+ *r = -EINVAL;
+ return true;
+ }
+
+ /*
+ * The only difference for 32-bit compat is the 4 bytes of
+ * padding after the interesting part of the structure. So
+ * for a faithful emulation of Xen we have to *try* to copy
+ * the padding and return -EFAULT if we can't. Otherwise we
+ * might as well just have copied the 12-byte 32-bit struct.
+ */
+ BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
+ offsetof(struct vcpu_set_singleshot_timer, timeout_abs_ns));
+ BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
+ sizeof_field(struct vcpu_set_singleshot_timer, timeout_abs_ns));
+ BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, flags) !=
+ offsetof(struct vcpu_set_singleshot_timer, flags));
+ BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, flags) !=
+ sizeof_field(struct vcpu_set_singleshot_timer, flags));
+
+ if (kvm_read_guest_virt(vcpu, param, &oneshot, longmode ? sizeof(oneshot) :
+ sizeof(struct compat_vcpu_set_singleshot_timer), &e)) {
+ *r = -EFAULT;
+ return true;
+ }
+
+ delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm);
+ if ((oneshot.flags & VCPU_SSHOTTMR_future) && delta < 0) {
+ *r = -ETIME;
+ return true;
+ }
+
+ kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta);
+ *r = 0;
+ return true;
+
+ case VCPUOP_stop_singleshot_timer:
+ if (vcpu->arch.xen.vcpu_id != vcpu_id) {
+ *r = -EINVAL;
+ return true;
+ }
+ kvm_xen_stop_timer(vcpu);
+ *r = 0;
+ return true;
+ }
+
+ return false;
+}
+
+static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout,
+ u64 *r)
+{
+ if (!kvm_xen_timer_enabled(vcpu))
+ return false;
+
+ if (timeout) {
+ uint64_t guest_now = get_kvmclock_ns(vcpu->kvm);
+ int64_t delta = timeout - guest_now;
+
+ /* Xen has a 'Linux workaround' in do_set_timer_op() which
+ * checks for negative absolute timeout values (caused by
+ * integer overflow), and for values about 13 days in the
+ * future (2^50ns) which would be caused by jiffies
+ * overflow. For those cases, it sets the timeout 100ms in
+ * the future (not *too* soon, since if a guest really did
+ * set a long timeout on purpose we don't want to keep
+ * churning CPU time by waking it up).
+ */
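+ /*
+ * For scale: 2^50 ns ≈ 1.126e15 ns ≈ 1.126e6 seconds, which is
+ * roughly 13.03 days; hence the "delta >> 50" check below.
+ */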
+ if (unlikely((int64_t)timeout < 0 ||
+ (delta > 0 && (uint32_t) (delta >> 50) != 0))) {
+ delta = 100 * NSEC_PER_MSEC;
+ timeout = guest_now + delta;
+ }
+
+ kvm_xen_start_timer(vcpu, timeout, delta);
+ } else {
+ kvm_xen_stop_timer(vcpu);
+ }
+
+ *r = 0;
+ return true;
+}
+
+int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
+{
+ bool longmode;
+ u64 input, params[6], r = -ENOSYS;
+ bool handled = false;
+ u8 cpl;
+
+ input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);
+
+ /* Hyper-V hypercalls get bit 31 set in EAX */
+ if ((input & 0x80000000) &&
+ kvm_hv_hypercall_enabled(vcpu))
+ return kvm_hv_hypercall(vcpu);
+
+ longmode = is_64_bit_hypercall(vcpu);
+ if (!longmode) {
+ params[0] = (u32)kvm_rbx_read(vcpu);
+ params[1] = (u32)kvm_rcx_read(vcpu);
+ params[2] = (u32)kvm_rdx_read(vcpu);
+ params[3] = (u32)kvm_rsi_read(vcpu);
+ params[4] = (u32)kvm_rdi_read(vcpu);
+ params[5] = (u32)kvm_rbp_read(vcpu);
+ }
+#ifdef CONFIG_X86_64
+ else {
+ params[0] = (u64)kvm_rdi_read(vcpu);
+ params[1] = (u64)kvm_rsi_read(vcpu);
+ params[2] = (u64)kvm_rdx_read(vcpu);
+ params[3] = (u64)kvm_r10_read(vcpu);
+ params[4] = (u64)kvm_r8_read(vcpu);
+ params[5] = (u64)kvm_r9_read(vcpu);
+ }
+#endif
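+
+ /*
+ * This follows the Xen hypercall ABI: the hypercall number is in
+ * %eax/%rax, and the arguments are in %ebx, %ecx, %edx, %esi, %edi,
+ * %ebp for 32-bit guests but %rdi, %rsi, %rdx, %r10, %r8, %r9 for
+ * 64-bit guests.
+ */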
+ cpl = static_call(kvm_x86_get_cpl)(vcpu);
+ trace_kvm_xen_hypercall(cpl, input, params[0], params[1], params[2],
+ params[3], params[4], params[5]);
+
+ /*
+ * Only allow hypercall acceleration for CPL0. The rare hypercalls that
+ * are permitted in guest userspace can be handled by the VMM.
+ */
+ if (unlikely(cpl > 0))
+ goto handle_in_userspace;
+
+ switch (input) {
+ case __HYPERVISOR_xen_version:
+ if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) {
+ r = vcpu->kvm->arch.xen.xen_version;
+ handled = true;
+ }
+ break;
+ case __HYPERVISOR_event_channel_op:
+ if (params[0] == EVTCHNOP_send)
+ handled = kvm_xen_hcall_evtchn_send(vcpu, params[1], &r);
+ break;
+ case __HYPERVISOR_sched_op:
+ handled = kvm_xen_hcall_sched_op(vcpu, longmode, params[0],
+ params[1], &r);
+ break;
+ case __HYPERVISOR_vcpu_op:
+ handled = kvm_xen_hcall_vcpu_op(vcpu, longmode, params[0], params[1],
+ params[2], &r);
+ break;
+ case __HYPERVISOR_set_timer_op: {
+ u64 timeout = params[0];
+ /* In 32-bit mode, the 64-bit timeout is in two 32-bit params. */
+ if (!longmode)
+ timeout |= params[1] << 32;
+ handled = kvm_xen_hcall_set_timer_op(vcpu, timeout, &r);
+ break;
+ }
+ default:
+ break;
+ }
+
+ if (handled)
+ return kvm_xen_hypercall_set_result(vcpu, r);
+
+handle_in_userspace:
+ vcpu->run->exit_reason = KVM_EXIT_XEN;
+ vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
+ vcpu->run->xen.u.hcall.longmode = longmode;
+ vcpu->run->xen.u.hcall.cpl = cpl;
+ vcpu->run->xen.u.hcall.input = input;
+ vcpu->run->xen.u.hcall.params[0] = params[0];
+ vcpu->run->xen.u.hcall.params[1] = params[1];
+ vcpu->run->xen.u.hcall.params[2] = params[2];
+ vcpu->run->xen.u.hcall.params[3] = params[3];
+ vcpu->run->xen.u.hcall.params[4] = params[4];
+ vcpu->run->xen.u.hcall.params[5] = params[5];
+ vcpu->arch.xen.hypercall_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.complete_userspace_io =
+ kvm_xen_hypercall_complete_userspace;
+
+ return 0;
+}
+
+static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
+{
+ int poll_evtchn = vcpu->arch.xen.poll_evtchn;
+
+ if ((poll_evtchn == port || poll_evtchn == -1) &&
+ test_and_clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask)) {
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
+}
+
+/*
+ * The return value from this function is propagated to the kvm_set_irq()
+ * API, so it returns:
+ * < 0 Interrupt was ignored (masked or not delivered for other reasons)
+ * = 0 Interrupt was coalesced (previous irq is still pending)
+ * > 0 Number of CPUs interrupt was delivered to
+ *
+ * It is also called directly from kvm_arch_set_irq_inatomic(), where the
+ * only check on its return value is a comparison with -EWOULDBLOCK.
+ */
+int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
+{
+ struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+ struct kvm_vcpu *vcpu;
+ unsigned long *pending_bits, *mask_bits;
+ unsigned long flags;
+ int port_word_bit;
+ bool kick_vcpu = false;
+ int vcpu_idx, idx, rc;
+
+ vcpu_idx = READ_ONCE(xe->vcpu_idx);
+ if (vcpu_idx >= 0)
+ vcpu = kvm_get_vcpu(kvm, vcpu_idx);
+ else {
+ vcpu = kvm_get_vcpu_by_id(kvm, xe->vcpu_id);
+ if (!vcpu)
+ return -EINVAL;
+ WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx);
+ }
+
+ if (!vcpu->arch.xen.vcpu_info_cache.active)
+ return -EINVAL;
+
+ if (xe->port >= max_evtchn_port(kvm))
+ return -EINVAL;
+
+ rc = -EWOULDBLOCK;
+
+ idx = srcu_read_lock(&kvm->srcu);
+
+ read_lock_irqsave(&gpc->lock, flags);
+ if (!kvm_gpc_check(gpc, PAGE_SIZE))
+ goto out_rcu;
+
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+ struct shared_info *shinfo = gpc->khva;
+ pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+ mask_bits = (unsigned long *)&shinfo->evtchn_mask;
+ port_word_bit = xe->port / 64;
+ } else {
+ struct compat_shared_info *shinfo = gpc->khva;
+ pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+ mask_bits = (unsigned long *)&shinfo->evtchn_mask;
+ port_word_bit = xe->port / 32;
+ }
+
+ /*
+ * If this port wasn't already set, and if it isn't masked, then
+ * we try to set the corresponding bit in the in-kernel shadow of
+ * evtchn_pending_sel for the target vCPU. And if *that* wasn't
+ * already set, then we kick the vCPU in question to write to the
+ * *real* evtchn_pending_sel in its own guest vcpu_info struct.
+ */
+ if (test_and_set_bit(xe->port, pending_bits)) {
+ rc = 0; /* It was already raised */
+ } else if (test_bit(xe->port, mask_bits)) {
+ rc = -ENOTCONN; /* Masked */
+ kvm_xen_check_poller(vcpu, xe->port);
+ } else {
+ rc = 1; /* Delivered to the bitmap in shared_info. */
+ /* Now switch to the vCPU's vcpu_info to set the index and pending_sel */
+ read_unlock_irqrestore(&gpc->lock, flags);
+ gpc = &vcpu->arch.xen.vcpu_info_cache;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
+ /*
+ * Could not access the vcpu_info. Set the bit in-kernel
+ * and prod the vCPU to deliver it for itself.
+ */
+ if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
+ kick_vcpu = true;
+ goto out_rcu;
+ }
+
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+ struct vcpu_info *vcpu_info = gpc->khva;
+ if (!test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) {
+ WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
+ kick_vcpu = true;
+ }
+ } else {
+ struct compat_vcpu_info *vcpu_info = gpc->khva;
+ if (!test_and_set_bit(port_word_bit,
+ (unsigned long *)&vcpu_info->evtchn_pending_sel)) {
+ WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
+ kick_vcpu = true;
+ }
+ }
+
+ /* For the per-vCPU lapic vector, deliver it as MSI. */
+ if (kick_vcpu && vcpu->arch.xen.upcall_vector) {
+ kvm_xen_inject_vcpu_vector(vcpu);
+ kick_vcpu = false;
+ }
+ }
+
+ out_rcu:
+ read_unlock_irqrestore(&gpc->lock, flags);
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ if (kick_vcpu) {
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
+
+ return rc;
+}
+
+static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
+{
+ bool mm_borrowed = false;
+ int rc;
+
+ rc = kvm_xen_set_evtchn_fast(xe, kvm);
+ if (rc != -EWOULDBLOCK)
+ return rc;
+
+ if (current->mm != kvm->mm) {
+ /*
+ * If not on a thread which already belongs to this KVM,
+ * we'd better be in the irqfd workqueue.
+ */
+ if (WARN_ON_ONCE(current->mm))
+ return -EINVAL;
+
+ kthread_use_mm(kvm->mm);
+ mm_borrowed = true;
+ }
+
+ mutex_lock(&kvm->arch.xen.xen_lock);
+
+ /*
+ * It is theoretically possible for the page to be unmapped
+ * and the MMU notifier to invalidate the shared_info before
+ * we even get to use it. In that case, this looks like an
+ * infinite loop. It was tempting to do it via the userspace
+ * HVA instead... but that just *hides* the fact that it's
+ * an infinite loop, because if a fault occurs and it waits
+ * for the page to come back, it can *still* immediately
+ * fault and have to wait again, repeatedly.
+ *
+ * Conversely, the page could also have been reinstated by
+ * another thread before we even obtain the mutex above, so
+ * check again *first* before remapping it.
+ */
+ do {
+ struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+ int idx;
+
+ rc = kvm_xen_set_evtchn_fast(xe, kvm);
+ if (rc != -EWOULDBLOCK)
+ break;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ rc = kvm_gpc_refresh(gpc, PAGE_SIZE);
+ srcu_read_unlock(&kvm->srcu, idx);
+ } while (!rc);
+
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+
+ if (mm_borrowed)
+ kthread_unuse_mm(kvm->mm);
+
+ return rc;
+}
+
+/* This is the version called from kvm_set_irq() as the .set function */
+static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
+{
+ if (!level)
+ return -EINVAL;
+
+ return kvm_xen_set_evtchn(&e->xen_evtchn, kvm);
+}
+
+/*
+ * Set up an event channel interrupt from the KVM IRQ routing table.
+ * Used for e.g. PIRQ from passed through physical devices.
+ */
+int kvm_xen_setup_evtchn(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+{
+ struct kvm_vcpu *vcpu;
+
+ if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
+ return -EINVAL;
+
+ /* We only support 2 level event channels for now */
+ if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+ return -EINVAL;
+
+ /*
+ * Xen gives us interesting mappings from vCPU index to APIC ID,
+ * which means kvm_get_vcpu_by_id() has to iterate over all vCPUs
+ * to find it. Do that once at setup time, instead of every time.
+ * But beware that on live update / live migration, the routing
+ * table might be reinstated before the vCPU threads have finished
+ * recreating their vCPUs.
+ */
+ vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu);
+ if (vcpu)
+ e->xen_evtchn.vcpu_idx = vcpu->vcpu_idx;
+ else
+ e->xen_evtchn.vcpu_idx = -1;
+
+ e->xen_evtchn.port = ue->u.xen_evtchn.port;
+ e->xen_evtchn.vcpu_id = ue->u.xen_evtchn.vcpu;
+ e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
+ e->set = evtchn_set_fn;
+
+ return 0;
+}
+
+/*
+ * Explicit event sending from userspace with KVM_XEN_HVM_EVTCHN_SEND ioctl.
+ */
+int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *uxe)
+{
+ struct kvm_xen_evtchn e;
+ int ret;
+
+ if (!uxe->port || uxe->port >= max_evtchn_port(kvm))
+ return -EINVAL;
+
+ /* We only support 2 level event channels for now */
+ if (uxe->priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+ return -EINVAL;
+
+ e.port = uxe->port;
+ e.vcpu_id = uxe->vcpu;
+ e.vcpu_idx = -1;
+ e.priority = uxe->priority;
+
+ ret = kvm_xen_set_evtchn(&e, kvm);
+
+ /*
+ * None of that 'return 1 if it actually got delivered' nonsense.
+ * We don't care if it was masked (-ENOTCONN) either.
+ */
+ if (ret > 0 || ret == -ENOTCONN)
+ ret = 0;
+
+ return ret;
+}
+
+/*
+ * Support for *outbound* event channel events via the EVTCHNOP_send hypercall.
+ */
+struct evtchnfd {
+ u32 send_port;
+ u32 type;
+ union {
+ struct kvm_xen_evtchn port;
+ struct {
+ u32 port; /* zero */
+ struct eventfd_ctx *ctx;
+ } eventfd;
+ } deliver;
+};
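+
+/*
+ * In the union above, deliver.port.port == 0 selects eventfd delivery;
+ * a non-zero value means the event is re-injected as a local port via
+ * kvm_xen_set_evtchn().
+ */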
+
+/*
+ * Update target vCPU or priority for a registered sending channel.
+ */
+static int kvm_xen_eventfd_update(struct kvm *kvm,
+ struct kvm_xen_hvm_attr *data)
+{
+ u32 port = data->u.evtchn.send_port;
+ struct evtchnfd *evtchnfd;
+ int ret;
+
+ /* Protect writes to evtchnfd as well as the idr lookup. */
+ mutex_lock(&kvm->arch.xen.xen_lock);
+ evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port);
+
+ ret = -ENOENT;
+ if (!evtchnfd)
+ goto out_unlock;
+
+ /* For an UPDATE, nothing may change except the priority/vcpu */
+ ret = -EINVAL;
+ if (evtchnfd->type != data->u.evtchn.type)
+ goto out_unlock;
+
+ /*
+ * Port cannot change, and if it's zero that was an eventfd
+ * which can't be changed either.
+ */
+ if (!evtchnfd->deliver.port.port ||
+ evtchnfd->deliver.port.port != data->u.evtchn.deliver.port.port)
+ goto out_unlock;
+
+ /* We only support 2 level event channels for now */
+ if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+ goto out_unlock;
+
+ evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
+ if (evtchnfd->deliver.port.vcpu_id != data->u.evtchn.deliver.port.vcpu) {
+ evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
+ evtchnfd->deliver.port.vcpu_idx = -1;
+ }
+ ret = 0;
+out_unlock:
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ return ret;
+}
+
+/*
+ * Configure the target (eventfd or local port delivery) for sending on
+ * a given event channel.
+ */
+static int kvm_xen_eventfd_assign(struct kvm *kvm,
+ struct kvm_xen_hvm_attr *data)
+{
+ u32 port = data->u.evtchn.send_port;
+ struct eventfd_ctx *eventfd = NULL;
+ struct evtchnfd *evtchnfd;
+ int ret = -EINVAL;
+
+ evtchnfd = kzalloc(sizeof(struct evtchnfd), GFP_KERNEL);
+ if (!evtchnfd)
+ return -ENOMEM;
+
+ switch (data->u.evtchn.type) {
+ case EVTCHNSTAT_ipi:
+ /* IPI must map back to the same port# */
+ if (data->u.evtchn.deliver.port.port != data->u.evtchn.send_port)
+ goto out_noeventfd; /* -EINVAL */
+ break;
+
+ case EVTCHNSTAT_interdomain:
+ if (data->u.evtchn.deliver.port.port) {
+ if (data->u.evtchn.deliver.port.port >= max_evtchn_port(kvm))
+ goto out_noeventfd; /* -EINVAL */
+ } else {
+ eventfd = eventfd_ctx_fdget(data->u.evtchn.deliver.eventfd.fd);
+ if (IS_ERR(eventfd)) {
+ ret = PTR_ERR(eventfd);
+ goto out_noeventfd;
+ }
+ }
+ break;
+
+ case EVTCHNSTAT_virq:
+ case EVTCHNSTAT_closed:
+ case EVTCHNSTAT_unbound:
+ case EVTCHNSTAT_pirq:
+ default: /* Unknown event channel type */
+ goto out; /* -EINVAL */
+ }
+
+ evtchnfd->send_port = data->u.evtchn.send_port;
+ evtchnfd->type = data->u.evtchn.type;
+ if (eventfd) {
+ evtchnfd->deliver.eventfd.ctx = eventfd;
+ } else {
+ /* We only support 2 level event channels for now */
+ if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+ goto out; /* -EINVAL; */
+
+ evtchnfd->deliver.port.port = data->u.evtchn.deliver.port.port;
+ evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
+ evtchnfd->deliver.port.vcpu_idx = -1;
+ evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
+ }
+
+ mutex_lock(&kvm->arch.xen.xen_lock);
+ ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1,
+ GFP_KERNEL);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ if (ret >= 0)
+ return 0;
+
+ if (ret == -ENOSPC)
+ ret = -EEXIST;
+out:
+ if (eventfd)
+ eventfd_ctx_put(eventfd);
+out_noeventfd:
+ kfree(evtchnfd);
+ return ret;
+}
+
+static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port)
+{
+ struct evtchnfd *evtchnfd;
+
+ mutex_lock(&kvm->arch.xen.xen_lock);
+ evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+
+ if (!evtchnfd)
+ return -ENOENT;
+
+ synchronize_srcu(&kvm->srcu);
+ if (!evtchnfd->deliver.port.port)
+ eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
+ kfree(evtchnfd);
+ return 0;
+}
+
+static int kvm_xen_eventfd_reset(struct kvm *kvm)
+{
+ struct evtchnfd *evtchnfd, **all_evtchnfds;
+ int i;
+ int n = 0;
+
+ mutex_lock(&kvm->arch.xen.xen_lock);
+
+ /*
+ * Because synchronize_srcu() cannot be called inside the
+ * critical section, first collect all the evtchnfd objects
+ * in an array as they are removed from evtchn_ports.
+ */
+ idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i)
+ n++;
+
+ all_evtchnfds = kmalloc_array(n, sizeof(struct evtchnfd *), GFP_KERNEL);
+ if (!all_evtchnfds) {
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ return -ENOMEM;
+ }
+
+ n = 0;
+ idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
+ all_evtchnfds[n++] = evtchnfd;
+ idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port);
+ }
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+
+ synchronize_srcu(&kvm->srcu);
+
+ while (n--) {
+ evtchnfd = all_evtchnfds[n];
+ if (!evtchnfd->deliver.port.port)
+ eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
+ kfree(evtchnfd);
+ }
+ kfree(all_evtchnfds);
+
+ return 0;
+}
+
+static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
+{
+ u32 port = data->u.evtchn.send_port;
+
+ if (data->u.evtchn.flags == KVM_XEN_EVTCHN_RESET)
+ return kvm_xen_eventfd_reset(kvm);
+
+ if (!port || port >= max_evtchn_port(kvm))
+ return -EINVAL;
+
+ if (data->u.evtchn.flags == KVM_XEN_EVTCHN_DEASSIGN)
+ return kvm_xen_eventfd_deassign(kvm, port);
+ if (data->u.evtchn.flags == KVM_XEN_EVTCHN_UPDATE)
+ return kvm_xen_eventfd_update(kvm, data);
+ if (data->u.evtchn.flags)
+ return -EINVAL;
+
+ return kvm_xen_eventfd_assign(kvm, data);
+}
+
+static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r)
+{
+ struct evtchnfd *evtchnfd;
+ struct evtchn_send send;
+ struct x86_exception e;
+
+ /* Sanity check: this structure is the same for 32-bit and 64-bit */
+ BUILD_BUG_ON(sizeof(send) != 4);
+ if (kvm_read_guest_virt(vcpu, param, &send, sizeof(send), &e)) {
+ *r = -EFAULT;
+ return true;
+ }
+
+ /*
+ * evtchnfd is protected by kvm->srcu; the idr lookup instead
+ * is protected by RCU.
+ */
+ rcu_read_lock();
+ evtchnfd = idr_find(&vcpu->kvm->arch.xen.evtchn_ports, send.port);
+ rcu_read_unlock();
+ if (!evtchnfd)
+ return false;
+
+ if (evtchnfd->deliver.port.port) {
+ int ret = kvm_xen_set_evtchn(&evtchnfd->deliver.port, vcpu->kvm);
+ if (ret < 0 && ret != -ENOTCONN)
+ return false;
+ } else {
+ eventfd_signal(evtchnfd->deliver.eventfd.ctx, 1);
+ }
+
+ *r = 0;
+ return true;
+}
+
+void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx;
+ vcpu->arch.xen.poll_evtchn = 0;
+
+ timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
+
+ kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm, NULL,
+ KVM_HOST_USES_PFN);
+ kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm, NULL,
+ KVM_HOST_USES_PFN);
+ kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm, NULL,
+ KVM_HOST_USES_PFN);
+ kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm, NULL,
+ KVM_HOST_USES_PFN);
+}
+
+void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
+{
+ if (kvm_xen_timer_enabled(vcpu))
+ kvm_xen_stop_timer(vcpu);
+
+ kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache);
+
+ del_timer_sync(&vcpu->arch.xen.poll_timer);
+}
+
+void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *entry;
+ u32 function;
+
+ if (!vcpu->arch.xen.cpuid.base)
+ return;
+
+ function = vcpu->arch.xen.cpuid.base | XEN_CPUID_LEAF(3);
+ if (function > vcpu->arch.xen.cpuid.limit)
+ return;
+
+ entry = kvm_find_cpuid_entry_index(vcpu, function, 1);
+ if (entry) {
+ entry->ecx = vcpu->arch.hv_clock.tsc_to_system_mul;
+ entry->edx = vcpu->arch.hv_clock.tsc_shift;
+ }
+
+ entry = kvm_find_cpuid_entry_index(vcpu, function, 2);
+ if (entry)
+ entry->eax = vcpu->arch.hw_tsc_khz;
+}
+
+void kvm_xen_init_vm(struct kvm *kvm)
+{
+ mutex_init(&kvm->arch.xen.xen_lock);
+ idr_init(&kvm->arch.xen.evtchn_ports);
+ kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm, NULL, KVM_HOST_USES_PFN);
+}
+
+void kvm_xen_destroy_vm(struct kvm *kvm)
+{
+ struct evtchnfd *evtchnfd;
+ int i;
+
+ kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
+
+ idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
+ if (!evtchnfd->deliver.port.port)
+ eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
+ kfree(evtchnfd);
+ }
+ idr_destroy(&kvm->arch.xen.evtchn_ports);
+
+ if (kvm->arch.xen_hvm_config.msr)
+ static_branch_slow_dec_deferred(&kvm_xen_enabled);
+}
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
new file mode 100644
index 000000000..f8f1fe22d
--- /dev/null
+++ b/arch/x86/kvm/xen.h
@@ -0,0 +1,224 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
+ * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * KVM Xen emulation
+ */
+
+#ifndef __ARCH_X86_KVM_XEN_H__
+#define __ARCH_X86_KVM_XEN_H__
+
+#include <asm/xen/hypervisor.h>
+
+#ifdef CONFIG_KVM_XEN
+#include <linux/jump_label_ratelimit.h>
+
+extern struct static_key_false_deferred kvm_xen_enabled;
+
+int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu);
+void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu);
+int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
+int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
+int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *evt);
+int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data);
+int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
+void kvm_xen_init_vm(struct kvm *kvm);
+void kvm_xen_destroy_vm(struct kvm *kvm);
+void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu);
+void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu);
+int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe,
+ struct kvm *kvm);
+int kvm_xen_setup_evtchn(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue);
+void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu);
+
+static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
+{
+ return static_branch_unlikely(&kvm_xen_enabled.key) &&
+ kvm->arch.xen_hvm_config.msr;
+}
+
+static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm)
+{
+ return static_branch_unlikely(&kvm_xen_enabled.key) &&
+ (kvm->arch.xen_hvm_config.flags &
+ KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL);
+}
+
+static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ if (static_branch_unlikely(&kvm_xen_enabled.key) &&
+ vcpu->arch.xen.vcpu_info_cache.active &&
+ vcpu->kvm->arch.xen.upcall_vector)
+ return __kvm_xen_has_interrupt(vcpu);
+
+ return 0;
+}
+
+static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu)
+{
+ return static_branch_unlikely(&kvm_xen_enabled.key) &&
+ vcpu->arch.xen.evtchn_pending_sel;
+}
+
+static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu)
+{
+ return !!vcpu->arch.xen.timer_virq;
+}
+
+static inline int kvm_xen_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ if (kvm_xen_hypercall_enabled(vcpu->kvm) && kvm_xen_timer_enabled(vcpu))
+ return atomic_read(&vcpu->arch.xen.timer_pending);
+
+ return 0;
+}
+
+void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu);
+#else
+static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
+{
+ return 1;
+}
+
+static inline void kvm_xen_init_vm(struct kvm *kvm)
+{
+}
+
+static inline void kvm_xen_destroy_vm(struct kvm *kvm)
+{
+}
+
+static inline void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
+{
+ return false;
+}
+
+static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm)
+{
+ return false;
+}
+
+static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+static inline void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+
+static inline int kvm_xen_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+static inline void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+
+static inline void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu)
+{
+}
+#endif
+
+int kvm_xen_hypercall(struct kvm_vcpu *vcpu);
+
+#include <asm/pvclock-abi.h>
+#include <asm/xen/interface.h>
+#include <xen/interface/vcpu.h>
+
+void kvm_xen_update_runstate(struct kvm_vcpu *vcpu, int state);
+
+static inline void kvm_xen_runstate_set_running(struct kvm_vcpu *vcpu)
+{
+ kvm_xen_update_runstate(vcpu, RUNSTATE_running);
+}
+
+static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu)
+{
+ /*
+ * If the vCPU wasn't preempted but took a normal exit for
+ * some reason (hypercalls, I/O, etc.), that is accounted as
+ * still RUNSTATE_running, as the VMM is still operating on
+ * behalf of the vCPU. Only if the VMM does actually block
+ * does it need to enter RUNSTATE_blocked.
+ */
+ if (WARN_ON_ONCE(!vcpu->preempted))
+ return;
+
+ kvm_xen_update_runstate(vcpu, RUNSTATE_runnable);
+}
+
+/* 32-bit compatibility definitions, also used natively in 32-bit build */
+struct compat_arch_vcpu_info {
+ unsigned int cr2;
+ unsigned int pad[5];
+};
+
+struct compat_vcpu_info {
+ uint8_t evtchn_upcall_pending;
+ uint8_t evtchn_upcall_mask;
+ uint16_t pad;
+ uint32_t evtchn_pending_sel;
+ struct compat_arch_vcpu_info arch;
+ struct pvclock_vcpu_time_info time;
+}; /* 64 bytes (x86) */
+
+struct compat_arch_shared_info {
+ unsigned int max_pfn;
+ unsigned int pfn_to_mfn_frame_list_list;
+ unsigned int nmi_reason;
+ unsigned int p2m_cr3;
+ unsigned int p2m_vaddr;
+ unsigned int p2m_generation;
+ uint32_t wc_sec_hi;
+};
+
+struct compat_shared_info {
+ struct compat_vcpu_info vcpu_info[MAX_VIRT_CPUS];
+ uint32_t evtchn_pending[32];
+ uint32_t evtchn_mask[32];
+ struct pvclock_wall_clock wc;
+ struct compat_arch_shared_info arch;
+};
+
+#define COMPAT_EVTCHN_2L_NR_CHANNELS (8 * \
+ sizeof_field(struct compat_shared_info, \
+ evtchn_pending))
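+/* 8 bits/byte * (32 * 4)-byte bitmap = 1024 event channels for compat guests */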
+struct compat_vcpu_runstate_info {
+ int state;
+ uint64_t state_entry_time;
+ uint64_t time[4];
+} __attribute__((packed));
+
+struct compat_sched_poll {
+ /* This is actually a guest virtual address which points to ports. */
+ uint32_t ports;
+ unsigned int nr_ports;
+ uint64_t timeout;
+};
+
+#endif /* __ARCH_X86_KVM_XEN_H__ */