summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-06 01:02:30 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-06 01:02:30 +0000
commit76cb841cb886eef6b3bee341a2266c76578724ad (patch)
treef5892e5ba6cc11949952a6ce4ecbe6d516d6ce58 /arch/x86/kvm
parentInitial commit. (diff)
downloadlinux-c109f8d9e922037b3fa45f46d78384d49db8ad76.tar.xz
linux-c109f8d9e922037b3fa45f46d78384d49db8ad76.zip
Adding upstream version 4.19.249.upstream/4.19.249upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r--arch/x86/kvm/Kconfig103
-rw-r--r--arch/x86/kvm/Makefile24
-rw-r--r--arch/x86/kvm/cpuid.c983
-rw-r--r--arch/x86/kvm/cpuid.h182
-rw-r--r--arch/x86/kvm/debugfs.c69
-rw-r--r--arch/x86/kvm/emulate.c5849
-rw-r--r--arch/x86/kvm/hyperv.c1683
-rw-r--r--arch/x86/kvm/hyperv.h99
-rw-r--r--arch/x86/kvm/i8254.c737
-rw-r--r--arch/x86/kvm/i8254.h66
-rw-r--r--arch/x86/kvm/i8259.c655
-rw-r--r--arch/x86/kvm/ioapic.c685
-rw-r--r--arch/x86/kvm/ioapic.h138
-rw-r--r--arch/x86/kvm/irq.c164
-rw-r--r--arch/x86/kvm/irq.h130
-rw-r--r--arch/x86/kvm/irq_comm.c441
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h113
-rw-r--r--arch/x86/kvm/lapic.c2710
-rw-r--r--arch/x86/kvm/lapic.h237
-rw-r--r--arch/x86/kvm/mmu.c6292
-rw-r--r--arch/x86/kvm/mmu.h223
-rw-r--r--arch/x86/kvm/mmu_audit.c306
-rw-r--r--arch/x86/kvm/mmutrace.h395
-rw-r--r--arch/x86/kvm/mtrr.c727
-rw-r--r--arch/x86/kvm/page_track.c267
-rw-r--r--arch/x86/kvm/paging_tmpl.h1083
-rw-r--r--arch/x86/kvm/pmu.c349
-rw-r--r--arch/x86/kvm/pmu.h136
-rw-r--r--arch/x86/kvm/pmu_amd.c317
-rw-r--r--arch/x86/kvm/pmu_intel.c374
-rw-r--r--arch/x86/kvm/svm.c7323
-rw-r--r--arch/x86/kvm/trace.h1429
-rw-r--r--arch/x86/kvm/tss.h60
-rw-r--r--arch/x86/kvm/vmx.c14715
-rw-r--r--arch/x86/kvm/vmx_evmcs.h324
-rw-r--r--arch/x86/kvm/vmx_shadow_fields.h77
-rw-r--r--arch/x86/kvm/x86.c9835
-rw-r--r--arch/x86/kvm/x86.h365
38 files changed, 59665 insertions, 0 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
new file mode 100644
index 000000000..1bbec387d
--- /dev/null
+++ b/arch/x86/kvm/Kconfig
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# KVM configuration
+#
+
+source "virt/kvm/Kconfig"
+
+menuconfig VIRTUALIZATION
+ bool "Virtualization"
+ depends on HAVE_KVM || X86
+ default y
+ ---help---
+ Say Y here to get to see options for using your Linux host to run other
+ operating systems inside virtual machines (guests).
+ This option alone does not add any kernel code.
+
+ If you say N, all options in this submenu will be skipped and disabled.
+
+if VIRTUALIZATION
+
+config KVM
+ tristate "Kernel-based Virtual Machine (KVM) support"
+ depends on HAVE_KVM
+ depends on HIGH_RES_TIMERS
+ # for TASKSTATS/TASK_DELAY_ACCT:
+ depends on NET && MULTIUSER
+ depends on X86_LOCAL_APIC
+ select PREEMPT_NOTIFIERS
+ select MMU_NOTIFIER
+ select ANON_INODES
+ select HAVE_KVM_IRQCHIP
+ select HAVE_KVM_IRQFD
+ select IRQ_BYPASS_MANAGER
+ select HAVE_KVM_IRQ_BYPASS
+ select HAVE_KVM_IRQ_ROUTING
+ select HAVE_KVM_EVENTFD
+ select KVM_ASYNC_PF
+ select USER_RETURN_NOTIFIER
+ select KVM_MMIO
+ select TASKSTATS
+ select TASK_DELAY_ACCT
+ select PERF_EVENTS
+ select HAVE_KVM_MSI
+ select HAVE_KVM_CPU_RELAX_INTERCEPT
+ select KVM_GENERIC_DIRTYLOG_READ_PROTECT
+ select KVM_VFIO
+ select SRCU
+ ---help---
+ Support hosting fully virtualized guest machines using hardware
+ virtualization extensions. You will need a fairly recent
+ processor equipped with virtualization extensions. You will also
+ need to select one or more of the processor modules below.
+
+ This module provides access to the hardware capabilities through
+ a character device node named /dev/kvm.
+
+ To compile this as a module, choose M here: the module
+ will be called kvm.
+
+ If unsure, say N.
+
+config KVM_INTEL
+ tristate "KVM for Intel processors support"
+ depends on KVM
+ # for perf_guest_get_msrs():
+ depends on CPU_SUP_INTEL
+ ---help---
+ Provides support for KVM on Intel processors equipped with the VT
+ extensions.
+
+ To compile this as a module, choose M here: the module
+ will be called kvm-intel.
+
+config KVM_AMD
+ tristate "KVM for AMD processors support"
+ depends on KVM
+ ---help---
+ Provides support for KVM on AMD processors equipped with the AMD-V
+ (SVM) extensions.
+
+ To compile this as a module, choose M here: the module
+ will be called kvm-amd.
+
+config KVM_AMD_SEV
+ def_bool y
+ bool "AMD Secure Encrypted Virtualization (SEV) support"
+ depends on KVM_AMD && X86_64
+ depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
+ ---help---
+ Provides support for launching Encrypted VMs on AMD processors.
+
+config KVM_MMU_AUDIT
+ bool "Audit KVM MMU"
+ depends on KVM && TRACEPOINTS
+ ---help---
+ This option adds a R/W kVM module parameter 'mmu_audit', which allows
+ auditing of KVM MMU events at runtime.
+
+# OK, it's a little counter-intuitive to do this, but it puts it neatly under
+# the virtualization menu.
+source drivers/vhost/Kconfig
+
+endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
new file mode 100644
index 000000000..dc4f2fdf5
--- /dev/null
+++ b/arch/x86/kvm/Makefile
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0
+
+ccflags-y += -Iarch/x86/kvm
+
+CFLAGS_x86.o := -I.
+CFLAGS_svm.o := -I.
+CFLAGS_vmx.o := -I.
+
+KVM := ../../../virt/kvm
+
+kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
+ $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
+kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
+
+kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
+ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
+ hyperv.o page_track.o debugfs.o
+
+kvm-intel-y += vmx.o pmu_intel.o
+kvm-amd-y += svm.o pmu_amd.o
+
+obj-$(CONFIG_KVM) += kvm.o
+obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
+obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
new file mode 100644
index 000000000..0489ffc3d
--- /dev/null
+++ b/arch/x86/kvm/cpuid.c
@@ -0,0 +1,983 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ * cpuid support routines
+ *
+ * derived from arch/x86/kvm/x86.c
+ *
+ * Copyright 2011 Red Hat, Inc. and/or its affiliates.
+ * Copyright IBM Corporation, 2008
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/export.h>
+#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
+#include <linux/sched/stat.h>
+
+#include <asm/processor.h>
+#include <asm/user.h>
+#include <asm/fpu/xstate.h>
+#include "cpuid.h"
+#include "lapic.h"
+#include "mmu.h"
+#include "trace.h"
+#include "pmu.h"
+
+static u32 xstate_required_size(u64 xstate_bv, bool compacted)
+{
+ int feature_bit = 0;
+ u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
+
+ xstate_bv &= XFEATURE_MASK_EXTEND;
+ while (xstate_bv) {
+ if (xstate_bv & 0x1) {
+ u32 eax, ebx, ecx, edx, offset;
+ cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx);
+ offset = compacted ? ret : ebx;
+ ret = max(ret, offset + eax);
+ }
+
+ xstate_bv >>= 1;
+ feature_bit++;
+ }
+
+ return ret;
+}
+
+bool kvm_mpx_supported(void)
+{
+ return ((host_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR))
+ && kvm_x86_ops->mpx_supported());
+}
+EXPORT_SYMBOL_GPL(kvm_mpx_supported);
+
+u64 kvm_supported_xcr0(void)
+{
+ u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0;
+
+ if (!kvm_mpx_supported())
+ xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
+
+ return xcr0;
+}
+
+#define F(x) bit(X86_FEATURE_##x)
+
+/* For scattered features from cpufeatures.h; we currently expose none */
+#define KF(x) bit(KVM_CPUID_BIT_##x)
+
+int kvm_update_cpuid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ if (!best)
+ return 0;
+
+ /* Update OSXSAVE bit */
+ if (boot_cpu_has(X86_FEATURE_XSAVE) && best->function == 0x1) {
+ best->ecx &= ~F(OSXSAVE);
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
+ best->ecx |= F(OSXSAVE);
+ }
+
+ best->edx &= ~F(APIC);
+ if (vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE)
+ best->edx |= F(APIC);
+
+ if (apic) {
+ if (best->ecx & F(TSC_DEADLINE_TIMER))
+ apic->lapic_timer.timer_mode_mask = 3 << 17;
+ else
+ apic->lapic_timer.timer_mode_mask = 1 << 17;
+ }
+
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ if (best) {
+ /* Update OSPKE bit */
+ if (boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) {
+ best->ecx &= ~F(OSPKE);
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_PKE))
+ best->ecx |= F(OSPKE);
+ }
+ }
+
+ best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
+ if (!best) {
+ vcpu->arch.guest_supported_xcr0 = 0;
+ vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
+ } else {
+ vcpu->arch.guest_supported_xcr0 =
+ (best->eax | ((u64)best->edx << 32)) &
+ kvm_supported_xcr0();
+ vcpu->arch.guest_xstate_size = best->ebx =
+ xstate_required_size(vcpu->arch.xcr0, false);
+ }
+
+ best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
+ if (best && (best->eax & (F(XSAVES) | F(XSAVEC))))
+ best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
+
+ /*
+ * The existing code assumes virtual address is 48-bit or 57-bit in the
+ * canonical address checks; exit if it is ever changed.
+ */
+ best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+ if (best) {
+ int vaddr_bits = (best->eax & 0xff00) >> 8;
+
+ if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0)
+ return -EINVAL;
+ }
+
+ best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0);
+ if (kvm_hlt_in_guest(vcpu->kvm) && best &&
+ (best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
+ best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
+
+ /* Update physical-address width */
+ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+ kvm_mmu_reset_context(vcpu);
+
+ kvm_pmu_refresh(vcpu);
+ return 0;
+}
+
+static int is_efer_nx(void)
+{
+ unsigned long long efer = 0;
+
+ rdmsrl_safe(MSR_EFER, &efer);
+ return efer & EFER_NX;
+}
+
+static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_cpuid_entry2 *e, *entry;
+
+ entry = NULL;
+ for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+ e = &vcpu->arch.cpuid_entries[i];
+ if (e->function == 0x80000001) {
+ entry = e;
+ break;
+ }
+ }
+ if (entry && (entry->edx & F(NX)) && !is_efer_nx()) {
+ entry->edx &= ~F(NX);
+ printk(KERN_INFO "kvm: guest NX capability removed\n");
+ }
+}
+
+int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
+ if (!best || best->eax < 0x80000008)
+ goto not_found;
+ best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+ if (best)
+ return best->eax & 0xff;
+not_found:
+ return 36;
+}
+EXPORT_SYMBOL_GPL(cpuid_query_maxphyaddr);
+
+/* when an old userspace process fills a new kernel module */
+int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid *cpuid,
+ struct kvm_cpuid_entry __user *entries)
+{
+ int r, i;
+ struct kvm_cpuid_entry *cpuid_entries = NULL;
+
+ r = -E2BIG;
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ goto out;
+ r = -ENOMEM;
+ if (cpuid->nent) {
+ cpuid_entries =
+ vmalloc(array_size(sizeof(struct kvm_cpuid_entry),
+ cpuid->nent));
+ if (!cpuid_entries)
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(cpuid_entries, entries,
+ cpuid->nent * sizeof(struct kvm_cpuid_entry)))
+ goto out;
+ }
+ for (i = 0; i < cpuid->nent; i++) {
+ vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
+ vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
+ vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
+ vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
+ vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
+ vcpu->arch.cpuid_entries[i].index = 0;
+ vcpu->arch.cpuid_entries[i].flags = 0;
+ vcpu->arch.cpuid_entries[i].padding[0] = 0;
+ vcpu->arch.cpuid_entries[i].padding[1] = 0;
+ vcpu->arch.cpuid_entries[i].padding[2] = 0;
+ }
+ vcpu->arch.cpuid_nent = cpuid->nent;
+ cpuid_fix_nx_cap(vcpu);
+ kvm_apic_set_version(vcpu);
+ kvm_x86_ops->cpuid_update(vcpu);
+ r = kvm_update_cpuid(vcpu);
+
+out:
+ vfree(cpuid_entries);
+ return r;
+}
+
+int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
+{
+ int r;
+
+ r = -E2BIG;
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
+ cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
+ goto out;
+ vcpu->arch.cpuid_nent = cpuid->nent;
+ kvm_apic_set_version(vcpu);
+ kvm_x86_ops->cpuid_update(vcpu);
+ r = kvm_update_cpuid(vcpu);
+out:
+ return r;
+}
+
+int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
+{
+ int r;
+
+ r = -E2BIG;
+ if (cpuid->nent < vcpu->arch.cpuid_nent)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
+ vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
+ goto out;
+ return 0;
+
+out:
+ cpuid->nent = vcpu->arch.cpuid_nent;
+ return r;
+}
+
+static void cpuid_mask(u32 *word, int wordnum)
+{
+ *word &= boot_cpu_data.x86_capability[wordnum];
+}
+
+static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+ u32 index)
+{
+ entry->function = function;
+ entry->index = index;
+ cpuid_count(entry->function, entry->index,
+ &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
+ entry->flags = 0;
+}
+
+static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
+ u32 func, u32 index, int *nent, int maxnent)
+{
+ switch (func) {
+ case 0:
+ entry->eax = 7;
+ ++*nent;
+ break;
+ case 1:
+ entry->ecx = F(MOVBE);
+ ++*nent;
+ break;
+ case 7:
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ if (index == 0)
+ entry->ecx = F(RDPID);
+ ++*nent;
+ default:
+ break;
+ }
+
+ entry->function = func;
+ entry->index = index;
+
+ return 0;
+}
+
+static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+ u32 index, int *nent, int maxnent)
+{
+ int r;
+ unsigned f_nx = is_efer_nx() ? F(NX) : 0;
+#ifdef CONFIG_X86_64
+ unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
+ ? F(GBPAGES) : 0;
+ unsigned f_lm = F(LM);
+#else
+ unsigned f_gbpages = 0;
+ unsigned f_lm = 0;
+#endif
+ unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
+ unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
+ unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0;
+ unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
+ unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
+ unsigned f_la57 = 0;
+
+ /* cpuid 1.edx */
+ const u32 kvm_cpuid_1_edx_x86_features =
+ F(FPU) | F(VME) | F(DE) | F(PSE) |
+ F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+ F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
+ F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+ F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) |
+ 0 /* Reserved, DS, ACPI */ | F(MMX) |
+ F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
+ 0 /* HTT, TM, Reserved, PBE */;
+ /* cpuid 0x80000001.edx */
+ const u32 kvm_cpuid_8000_0001_edx_x86_features =
+ F(FPU) | F(VME) | F(DE) | F(PSE) |
+ F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+ F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
+ F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+ F(PAT) | F(PSE36) | 0 /* Reserved */ |
+ f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
+ F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
+ 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
+ /* cpuid 1.ecx */
+ const u32 kvm_cpuid_1_ecx_x86_features =
+ /* NOTE: MONITOR (and MWAIT) are emulated as NOP,
+ * but *not* advertised to guests via CPUID ! */
+ F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
+ 0 /* DS-CPL, VMX, SMX, EST */ |
+ 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
+ F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
+ F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
+ F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
+ 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
+ F(F16C) | F(RDRAND);
+ /* cpuid 0x80000001.ecx */
+ const u32 kvm_cpuid_8000_0001_ecx_x86_features =
+ F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
+ F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
+ F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
+ 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) |
+ F(TOPOEXT) | F(PERFCTR_CORE);
+
+ /* cpuid 0x80000008.ebx */
+ const u32 kvm_cpuid_8000_0008_ebx_x86_features =
+ F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
+ F(AMD_SSB_NO) | F(AMD_STIBP);
+
+ /* cpuid 0xC0000001.edx */
+ const u32 kvm_cpuid_C000_0001_edx_x86_features =
+ F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
+ F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
+ F(PMM) | F(PMM_EN);
+
+ /* cpuid 7.0.ebx */
+ const u32 kvm_cpuid_7_0_ebx_x86_features =
+ F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
+ F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
+ F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
+ F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
+ F(SHA_NI) | F(AVX512BW) | F(AVX512VL);
+
+ /* cpuid 0xD.1.eax */
+ const u32 kvm_cpuid_D_1_eax_x86_features =
+ F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves;
+
+ /* cpuid 7.0.ecx*/
+ const u32 kvm_cpuid_7_0_ecx_x86_features =
+ F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
+ F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
+ F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
+ F(CLDEMOTE);
+
+ /* cpuid 7.0.edx*/
+ const u32 kvm_cpuid_7_0_edx_x86_features =
+ F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
+ F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
+ F(MD_CLEAR);
+
+ /* all calls to cpuid_count() should be made on the same cpu */
+ get_cpu();
+
+ r = -E2BIG;
+
+ if (WARN_ON(*nent >= maxnent))
+ goto out;
+
+ do_cpuid_1_ent(entry, function, index);
+ ++*nent;
+
+ switch (function) {
+ case 0:
+ entry->eax = min(entry->eax, (u32)0xd);
+ break;
+ case 1:
+ entry->edx &= kvm_cpuid_1_edx_x86_features;
+ cpuid_mask(&entry->edx, CPUID_1_EDX);
+ entry->ecx &= kvm_cpuid_1_ecx_x86_features;
+ cpuid_mask(&entry->ecx, CPUID_1_ECX);
+ /* we support x2apic emulation even if host does not support
+ * it since we emulate x2apic in software */
+ entry->ecx |= F(X2APIC);
+ break;
+ /* function 2 entries are STATEFUL. That is, repeated cpuid commands
+ * may return different values. This forces us to get_cpu() before
+ * issuing the first command, and also to emulate this annoying behavior
+ * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
+ case 2: {
+ int t, times = entry->eax & 0xff;
+
+ entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+ entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+ for (t = 1; t < times; ++t) {
+ if (*nent >= maxnent)
+ goto out;
+
+ do_cpuid_1_ent(&entry[t], function, 0);
+ entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+ ++*nent;
+ }
+ break;
+ }
+ /* function 4 has additional index. */
+ case 4: {
+ int i, cache_type;
+
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ /* read more entries until cache_type is zero */
+ for (i = 1; ; ++i) {
+ if (*nent >= maxnent)
+ goto out;
+
+ cache_type = entry[i - 1].eax & 0x1f;
+ if (!cache_type)
+ break;
+ do_cpuid_1_ent(&entry[i], function, i);
+ entry[i].flags |=
+ KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ ++*nent;
+ }
+ break;
+ }
+ case 6: /* Thermal management */
+ entry->eax = 0x4; /* allow ARAT */
+ entry->ebx = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ break;
+ case 7: {
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ /* Mask ebx against host capability word 9 */
+ if (index == 0) {
+ entry->ebx &= kvm_cpuid_7_0_ebx_x86_features;
+ cpuid_mask(&entry->ebx, CPUID_7_0_EBX);
+ // TSC_ADJUST is emulated
+ entry->ebx |= F(TSC_ADJUST);
+ entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
+ f_la57 = entry->ecx & F(LA57);
+ cpuid_mask(&entry->ecx, CPUID_7_ECX);
+ /* Set LA57 based on hardware capability. */
+ entry->ecx |= f_la57;
+ entry->ecx |= f_umip;
+ /* PKU is not yet implemented for shadow paging. */
+ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
+ entry->ecx &= ~F(PKU);
+
+ entry->edx &= kvm_cpuid_7_0_edx_x86_features;
+ cpuid_mask(&entry->edx, CPUID_7_EDX);
+ if (boot_cpu_has(X86_FEATURE_IBPB) &&
+ boot_cpu_has(X86_FEATURE_IBRS))
+ entry->edx |= F(SPEC_CTRL);
+ if (boot_cpu_has(X86_FEATURE_STIBP))
+ entry->edx |= F(INTEL_STIBP);
+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ entry->edx |= F(SPEC_CTRL_SSBD);
+ /*
+ * We emulate ARCH_CAPABILITIES in software even
+ * if the host doesn't support it.
+ */
+ entry->edx |= F(ARCH_CAPABILITIES);
+ } else {
+ entry->ebx = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ }
+ entry->eax = 0;
+ break;
+ }
+ case 9:
+ break;
+ case 0xa: { /* Architectural Performance Monitoring */
+ struct x86_pmu_capability cap;
+ union cpuid10_eax eax;
+ union cpuid10_edx edx;
+
+ if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ perf_get_x86_pmu_capability(&cap);
+
+ /*
+ * Only support guest architectural pmu on a host
+ * with architectural pmu.
+ */
+ if (!cap.version)
+ memset(&cap, 0, sizeof(cap));
+
+ eax.split.version_id = min(cap.version, 2);
+ eax.split.num_counters = cap.num_counters_gp;
+ eax.split.bit_width = cap.bit_width_gp;
+ eax.split.mask_length = cap.events_mask_len;
+
+ edx.split.num_counters_fixed = cap.num_counters_fixed;
+ edx.split.bit_width_fixed = cap.bit_width_fixed;
+ edx.split.reserved = 0;
+
+ entry->eax = eax.full;
+ entry->ebx = cap.events_mask;
+ entry->ecx = 0;
+ entry->edx = edx.full;
+ break;
+ }
+ /* function 0xb has additional index. */
+ case 0xb: {
+ int i, level_type;
+
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ /* read more entries until level_type is zero */
+ for (i = 1; ; ++i) {
+ if (*nent >= maxnent)
+ goto out;
+
+ level_type = entry[i - 1].ecx & 0xff00;
+ if (!level_type)
+ break;
+ do_cpuid_1_ent(&entry[i], function, i);
+ entry[i].flags |=
+ KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ ++*nent;
+ }
+ break;
+ }
+ case 0xd: {
+ int idx, i;
+ u64 supported = kvm_supported_xcr0();
+
+ entry->eax &= supported;
+ entry->ebx = xstate_required_size(supported, false);
+ entry->ecx = entry->ebx;
+ entry->edx &= supported >> 32;
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ if (!supported)
+ break;
+
+ for (idx = 1, i = 1; idx < 64; ++idx) {
+ u64 mask = ((u64)1 << idx);
+ if (*nent >= maxnent)
+ goto out;
+
+ do_cpuid_1_ent(&entry[i], function, idx);
+ if (idx == 1) {
+ entry[i].eax &= kvm_cpuid_D_1_eax_x86_features;
+ cpuid_mask(&entry[i].eax, CPUID_D_1_EAX);
+ entry[i].ebx = 0;
+ if (entry[i].eax & (F(XSAVES)|F(XSAVEC)))
+ entry[i].ebx =
+ xstate_required_size(supported,
+ true);
+ } else {
+ if (entry[i].eax == 0 || !(supported & mask))
+ continue;
+ if (WARN_ON_ONCE(entry[i].ecx & 1))
+ continue;
+ }
+ entry[i].ecx = 0;
+ entry[i].edx = 0;
+ entry[i].flags |=
+ KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ ++*nent;
+ ++i;
+ }
+ break;
+ }
+ case KVM_CPUID_SIGNATURE: {
+ static const char signature[12] = "KVMKVMKVM\0\0";
+ const u32 *sigptr = (const u32 *)signature;
+ entry->eax = KVM_CPUID_FEATURES;
+ entry->ebx = sigptr[0];
+ entry->ecx = sigptr[1];
+ entry->edx = sigptr[2];
+ break;
+ }
+ case KVM_CPUID_FEATURES:
+ entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
+ (1 << KVM_FEATURE_NOP_IO_DELAY) |
+ (1 << KVM_FEATURE_CLOCKSOURCE2) |
+ (1 << KVM_FEATURE_ASYNC_PF) |
+ (1 << KVM_FEATURE_PV_EOI) |
+ (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
+ (1 << KVM_FEATURE_PV_UNHALT) |
+ (1 << KVM_FEATURE_PV_TLB_FLUSH) |
+ (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) |
+ (1 << KVM_FEATURE_PV_SEND_IPI);
+
+ if (sched_info_on())
+ entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
+
+ entry->ebx = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ break;
+ case 0x80000000:
+ entry->eax = min(entry->eax, 0x8000001f);
+ break;
+ case 0x80000001:
+ entry->edx &= kvm_cpuid_8000_0001_edx_x86_features;
+ cpuid_mask(&entry->edx, CPUID_8000_0001_EDX);
+ entry->ecx &= kvm_cpuid_8000_0001_ecx_x86_features;
+ cpuid_mask(&entry->ecx, CPUID_8000_0001_ECX);
+ break;
+ case 0x80000007: /* Advanced power management */
+ /* invariant TSC is CPUID.80000007H:EDX[8] */
+ entry->edx &= (1 << 8);
+ /* mask against host */
+ entry->edx &= boot_cpu_data.x86_power;
+ entry->eax = entry->ebx = entry->ecx = 0;
+ break;
+ case 0x80000008: {
+ unsigned g_phys_as = (entry->eax >> 16) & 0xff;
+ unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
+ unsigned phys_as = entry->eax & 0xff;
+
+ /*
+ * Use bare metal's MAXPHADDR if the CPU doesn't report guest
+ * MAXPHYADDR separately, or if TDP (NPT) is disabled, as the
+ * guest version "applies only to guests using nested paging".
+ */
+ if (!g_phys_as || !tdp_enabled)
+ g_phys_as = phys_as;
+
+ entry->eax = g_phys_as | (virt_as << 8);
+ entry->edx = 0;
+ /*
+ * IBRS, IBPB and VIRT_SSBD aren't necessarily present in
+ * hardware cpuid
+ */
+ if (boot_cpu_has(X86_FEATURE_AMD_IBPB))
+ entry->ebx |= F(AMD_IBPB);
+ if (boot_cpu_has(X86_FEATURE_AMD_IBRS))
+ entry->ebx |= F(AMD_IBRS);
+ if (boot_cpu_has(X86_FEATURE_VIRT_SSBD))
+ entry->ebx |= F(VIRT_SSBD);
+ entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
+ cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
+ /*
+ * The preference is to use SPEC CTRL MSR instead of the
+ * VIRT_SPEC MSR.
+ */
+ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
+ !boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ entry->ebx |= F(VIRT_SSBD);
+ break;
+ }
+ case 0x80000019:
+ entry->ecx = entry->edx = 0;
+ break;
+ case 0x8000001a:
+ break;
+ case 0x8000001d:
+ break;
+ /*Add support for Centaur's CPUID instruction*/
+ case 0xC0000000:
+ /*Just support up to 0xC0000004 now*/
+ entry->eax = min(entry->eax, 0xC0000004);
+ break;
+ case 0xC0000001:
+ entry->edx &= kvm_cpuid_C000_0001_edx_x86_features;
+ cpuid_mask(&entry->edx, CPUID_C000_0001_EDX);
+ break;
+ case 3: /* Processor serial number */
+ case 5: /* MONITOR/MWAIT */
+ case 0xC0000002:
+ case 0xC0000003:
+ case 0xC0000004:
+ default:
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ kvm_x86_ops->set_supported_cpuid(function, entry);
+
+ r = 0;
+
+out:
+ put_cpu();
+
+ return r;
+}
+
+static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 func,
+ u32 idx, int *nent, int maxnent, unsigned int type)
+{
+ if (*nent >= maxnent)
+ return -E2BIG;
+
+ if (type == KVM_GET_EMULATED_CPUID)
+ return __do_cpuid_ent_emulated(entry, func, idx, nent, maxnent);
+
+ return __do_cpuid_ent(entry, func, idx, nent, maxnent);
+}
+
+#undef F
+
+struct kvm_cpuid_param {
+ u32 func;
+ u32 idx;
+ bool has_leaf_count;
+ bool (*qualifier)(const struct kvm_cpuid_param *param);
+};
+
+static bool is_centaur_cpu(const struct kvm_cpuid_param *param)
+{
+ return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR;
+}
+
+static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries,
+ __u32 num_entries, unsigned int ioctl_type)
+{
+ int i;
+ __u32 pad[3];
+
+ if (ioctl_type != KVM_GET_EMULATED_CPUID)
+ return false;
+
+ /*
+ * We want to make sure that ->padding is being passed clean from
+ * userspace in case we want to use it for something in the future.
+ *
+ * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID and so we
+ * have to give ourselves satisfied only with the emulated side. /me
+ * sheds a tear.
+ */
+ for (i = 0; i < num_entries; i++) {
+ if (copy_from_user(pad, entries[i].padding, sizeof(pad)))
+ return true;
+
+ if (pad[0] || pad[1] || pad[2])
+ return true;
+ }
+ return false;
+}
+
+int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries,
+ unsigned int type)
+{
+ struct kvm_cpuid_entry2 *cpuid_entries;
+ int limit, nent = 0, r = -E2BIG, i;
+ u32 func;
+ static const struct kvm_cpuid_param param[] = {
+ { .func = 0, .has_leaf_count = true },
+ { .func = 0x80000000, .has_leaf_count = true },
+ { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true },
+ { .func = KVM_CPUID_SIGNATURE },
+ { .func = KVM_CPUID_FEATURES },
+ };
+
+ if (cpuid->nent < 1)
+ goto out;
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ cpuid->nent = KVM_MAX_CPUID_ENTRIES;
+
+ if (sanity_check_entries(entries, cpuid->nent, type))
+ return -EINVAL;
+
+ r = -ENOMEM;
+ cpuid_entries = vzalloc(array_size(sizeof(struct kvm_cpuid_entry2),
+ cpuid->nent));
+ if (!cpuid_entries)
+ goto out;
+
+ r = 0;
+ for (i = 0; i < ARRAY_SIZE(param); i++) {
+ const struct kvm_cpuid_param *ent = &param[i];
+
+ if (ent->qualifier && !ent->qualifier(ent))
+ continue;
+
+ r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx,
+ &nent, cpuid->nent, type);
+
+ if (r)
+ goto out_free;
+
+ if (!ent->has_leaf_count)
+ continue;
+
+ limit = cpuid_entries[nent - 1].eax;
+ for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func)
+ r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx,
+ &nent, cpuid->nent, type);
+
+ if (r)
+ goto out_free;
+ }
+
+ r = -EFAULT;
+ if (copy_to_user(entries, cpuid_entries,
+ nent * sizeof(struct kvm_cpuid_entry2)))
+ goto out_free;
+ cpuid->nent = nent;
+ r = 0;
+
+out_free:
+ vfree(cpuid_entries);
+out:
+ return r;
+}
+
+static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
+{
+ struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
+ struct kvm_cpuid_entry2 *ej;
+ int j = i;
+ int nent = vcpu->arch.cpuid_nent;
+
+ e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
+ /* when no next entry is found, the current entry[i] is reselected */
+ do {
+ j = (j + 1) % nent;
+ ej = &vcpu->arch.cpuid_entries[j];
+ } while (ej->function != e->function);
+
+ ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+
+ return j;
+}
+
+/* find an entry with matching function, matching index (if needed), and that
+ * should be read next (if it's stateful) */
+static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
+ u32 function, u32 index)
+{
+ if (e->function != function)
+ return 0;
+ if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
+ return 0;
+ if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
+ !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
+ return 0;
+ return 1;
+}
+
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+ u32 function, u32 index)
+{
+ int i;
+ struct kvm_cpuid_entry2 *best = NULL;
+
+ for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+ struct kvm_cpuid_entry2 *e;
+
+ e = &vcpu->arch.cpuid_entries[i];
+ if (is_matching_cpuid_entry(e, function, index)) {
+ if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
+ move_to_next_stateful_cpuid_entry(vcpu, i);
+ best = e;
+ break;
+ }
+ }
+ return best;
+}
+EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
+
+/*
+ * If no match is found, check whether we exceed the vCPU's limit
+ * and return the content of the highest valid _standard_ leaf instead.
+ * This is to satisfy the CPUID specification.
+ */
+static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
+ u32 function, u32 index)
+{
+ struct kvm_cpuid_entry2 *maxlevel;
+
+ maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
+ if (!maxlevel || maxlevel->eax >= function)
+ return NULL;
+ if (function & 0x80000000) {
+ maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0);
+ if (!maxlevel)
+ return NULL;
+ }
+ return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
+}
+
+bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx, bool check_limit)
+{
+ u32 function = *eax, index = *ecx;
+ struct kvm_cpuid_entry2 *best;
+ bool entry_found = true;
+
+ best = kvm_find_cpuid_entry(vcpu, function, index);
+
+ if (!best) {
+ entry_found = false;
+ if (!check_limit)
+ goto out;
+
+ best = check_cpuid_limit(vcpu, function, index);
+ }
+
+out:
+ if (best) {
+ *eax = best->eax;
+ *ebx = best->ebx;
+ *ecx = best->ecx;
+ *edx = best->edx;
+ } else
+ *eax = *ebx = *ecx = *edx = 0;
+ trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found);
+ return entry_found;
+}
+EXPORT_SYMBOL_GPL(kvm_cpuid);
+
+int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+{
+ u32 eax, ebx, ecx, edx;
+
+ if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0))
+ return 1;
+
+ eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, true);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, eax);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, ebx);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, ecx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, edx);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
new file mode 100644
index 000000000..7dec43b2c
--- /dev/null
+++ b/arch/x86/kvm/cpuid.h
@@ -0,0 +1,182 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_KVM_CPUID_H
+#define ARCH_X86_KVM_CPUID_H
+
+#include "x86.h"
+#include <asm/cpu.h>
+#include <asm/processor.h>
+
+int kvm_update_cpuid(struct kvm_vcpu *vcpu);
+bool kvm_mpx_supported(void);
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+ u32 function, u32 index);
+int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries,
+ unsigned int type);
+int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid *cpuid,
+ struct kvm_cpuid_entry __user *entries);
+int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries);
+int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries);
+bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx, bool check_limit);
+
+int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
+
+static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.maxphyaddr;
+}
+
+struct cpuid_reg {
+ u32 function;
+ u32 index;
+ int reg;
+};
+
+static const struct cpuid_reg reverse_cpuid[] = {
+ [CPUID_1_EDX] = { 1, 0, CPUID_EDX},
+ [CPUID_8000_0001_EDX] = {0x80000001, 0, CPUID_EDX},
+ [CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX},
+ [CPUID_1_ECX] = { 1, 0, CPUID_ECX},
+ [CPUID_C000_0001_EDX] = {0xc0000001, 0, CPUID_EDX},
+ [CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX},
+ [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX},
+ [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX},
+ [CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX},
+ [CPUID_6_EAX] = { 6, 0, CPUID_EAX},
+ [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
+ [CPUID_7_ECX] = { 7, 0, CPUID_ECX},
+ [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
+ [CPUID_7_EDX] = { 7, 0, CPUID_EDX},
+};
+
+static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
+{
+ unsigned x86_leaf = x86_feature / 32;
+
+ BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid));
+ BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
+
+ return reverse_cpuid[x86_leaf];
+}
+
+static __always_inline int *guest_cpuid_get_register(struct kvm_vcpu *vcpu, unsigned x86_feature)
+{
+ struct kvm_cpuid_entry2 *entry;
+ const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature);
+
+ entry = kvm_find_cpuid_entry(vcpu, cpuid.function, cpuid.index);
+ if (!entry)
+ return NULL;
+
+ switch (cpuid.reg) {
+ case CPUID_EAX:
+ return &entry->eax;
+ case CPUID_EBX:
+ return &entry->ebx;
+ case CPUID_ECX:
+ return &entry->ecx;
+ case CPUID_EDX:
+ return &entry->edx;
+ default:
+ BUILD_BUG();
+ return NULL;
+ }
+}
+
+static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_feature)
+{
+ int *reg;
+
+ if (x86_feature == X86_FEATURE_XSAVE &&
+ !static_cpu_has(X86_FEATURE_XSAVE))
+ return false;
+
+ reg = guest_cpuid_get_register(vcpu, x86_feature);
+ if (!reg)
+ return false;
+
+ return *reg & bit(x86_feature);
+}
+
+static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x86_feature)
+{
+ int *reg;
+
+ reg = guest_cpuid_get_register(vcpu, x86_feature);
+ if (reg)
+ *reg &= ~bit(x86_feature);
+}
+
+static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0, 0);
+ return best && best->ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx;
+}
+
+static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ if (!best)
+ return -1;
+
+ return x86_family(best->eax);
+}
+
+static inline int guest_cpuid_model(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ if (!best)
+ return -1;
+
+ return x86_model(best->eax);
+}
+
+static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ if (!best)
+ return -1;
+
+ return x86_stepping(best->eax);
+}
+
+static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu)
+{
+ return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD));
+}
+
+static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu)
+{
+ return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB));
+}
+
+static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT;
+}
+
+static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.msr_misc_features_enables &
+ MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
+}
+
+#endif
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
new file mode 100644
index 000000000..c19c7ede9
--- /dev/null
+++ b/arch/x86/kvm/debugfs.c
@@ -0,0 +1,69 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * Copyright 2016 Red Hat, Inc. and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+#include <linux/kvm_host.h>
+#include <linux/debugfs.h>
+
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+ return true;
+}
+
+static int vcpu_get_tsc_offset(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->arch.tsc_offset;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_offset_fops, vcpu_get_tsc_offset, NULL, "%lld\n");
+
+static int vcpu_get_tsc_scaling_ratio(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->arch.tsc_scaling_ratio;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, "%llu\n");
+
+static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val)
+{
+ *val = kvm_tsc_scaling_ratio_frac_bits;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n");
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+ struct dentry *ret;
+
+ ret = debugfs_create_file("tsc-offset", 0444,
+ vcpu->debugfs_dentry,
+ vcpu, &vcpu_tsc_offset_fops);
+ if (!ret)
+ return -ENOMEM;
+
+ if (kvm_has_tsc_control) {
+ ret = debugfs_create_file("tsc-scaling-ratio", 0444,
+ vcpu->debugfs_dentry,
+ vcpu, &vcpu_tsc_scaling_fops);
+ if (!ret)
+ return -ENOMEM;
+ ret = debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444,
+ vcpu->debugfs_dentry,
+ vcpu, &vcpu_tsc_scaling_frac_fops);
+ if (!ret)
+ return -ENOMEM;
+
+ }
+
+ return 0;
+}
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
new file mode 100644
index 000000000..63754d248
--- /dev/null
+++ b/arch/x86/kvm/emulate.c
@@ -0,0 +1,5849 @@
+/******************************************************************************
+ * emulate.c
+ *
+ * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
+ *
+ * Copyright (c) 2005 Keir Fraser
+ *
+ * Linux coding style, mod r/m decoder, segment base fixes, real-mode
+ * privileged instructions:
+ *
+ * Copyright (C) 2006 Qumranet
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
+ */
+
+#include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
+#include <asm/kvm_emulate.h>
+#include <linux/stringify.h>
+#include <asm/debugreg.h>
+#include <asm/nospec-branch.h>
+
+#include "x86.h"
+#include "tss.h"
+#include "mmu.h"
+#include "pmu.h"
+
+/*
+ * Operand types
+ */
+#define OpNone 0ull
+#define OpImplicit 1ull /* No generic decode */
+#define OpReg 2ull /* Register */
+#define OpMem 3ull /* Memory */
+#define OpAcc 4ull /* Accumulator: AL/AX/EAX/RAX */
+#define OpDI 5ull /* ES:DI/EDI/RDI */
+#define OpMem64 6ull /* Memory, 64-bit */
+#define OpImmUByte 7ull /* Zero-extended 8-bit immediate */
+#define OpDX 8ull /* DX register */
+#define OpCL 9ull /* CL register (for shifts) */
+#define OpImmByte 10ull /* 8-bit sign extended immediate */
+#define OpOne 11ull /* Implied 1 */
+#define OpImm 12ull /* Sign extended up to 32-bit immediate */
+#define OpMem16 13ull /* Memory operand (16-bit). */
+#define OpMem32 14ull /* Memory operand (32-bit). */
+#define OpImmU 15ull /* Immediate operand, zero extended */
+#define OpSI 16ull /* SI/ESI/RSI */
+#define OpImmFAddr 17ull /* Immediate far address */
+#define OpMemFAddr 18ull /* Far address in memory */
+#define OpImmU16 19ull /* Immediate operand, 16 bits, zero extended */
+#define OpES 20ull /* ES */
+#define OpCS 21ull /* CS */
+#define OpSS 22ull /* SS */
+#define OpDS 23ull /* DS */
+#define OpFS 24ull /* FS */
+#define OpGS 25ull /* GS */
+#define OpMem8 26ull /* 8-bit zero extended memory operand */
+#define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */
+#define OpXLat 28ull /* memory at BX/EBX/RBX + zero-extended AL */
+#define OpAccLo 29ull /* Low part of extended acc (AX/AX/EAX/RAX) */
+#define OpAccHi 30ull /* High part of extended acc (-/DX/EDX/RDX) */
+
+#define OpBits 5 /* Width of operand field */
+#define OpMask ((1ull << OpBits) - 1)
+
+/*
+ * Opcode effective-address decode tables.
+ * Note that we only emulate instructions that have at least one memory
+ * operand (excluding implicit stack references). We assume that stack
+ * references and instruction fetches will never occur in special memory
+ * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
+ * not be handled.
+ */
+
+/* Operand sizes: 8-bit operands or specified/overridden size. */
+#define ByteOp (1<<0) /* 8-bit operands. */
+/* Destination operand type. */
+#define DstShift 1
+#define ImplicitOps (OpImplicit << DstShift)
+#define DstReg (OpReg << DstShift)
+#define DstMem (OpMem << DstShift)
+#define DstAcc (OpAcc << DstShift)
+#define DstDI (OpDI << DstShift)
+#define DstMem64 (OpMem64 << DstShift)
+#define DstMem16 (OpMem16 << DstShift)
+#define DstImmUByte (OpImmUByte << DstShift)
+#define DstDX (OpDX << DstShift)
+#define DstAccLo (OpAccLo << DstShift)
+#define DstMask (OpMask << DstShift)
+/* Source operand type. */
+#define SrcShift 6
+#define SrcNone (OpNone << SrcShift)
+#define SrcReg (OpReg << SrcShift)
+#define SrcMem (OpMem << SrcShift)
+#define SrcMem16 (OpMem16 << SrcShift)
+#define SrcMem32 (OpMem32 << SrcShift)
+#define SrcImm (OpImm << SrcShift)
+#define SrcImmByte (OpImmByte << SrcShift)
+#define SrcOne (OpOne << SrcShift)
+#define SrcImmUByte (OpImmUByte << SrcShift)
+#define SrcImmU (OpImmU << SrcShift)
+#define SrcSI (OpSI << SrcShift)
+#define SrcXLat (OpXLat << SrcShift)
+#define SrcImmFAddr (OpImmFAddr << SrcShift)
+#define SrcMemFAddr (OpMemFAddr << SrcShift)
+#define SrcAcc (OpAcc << SrcShift)
+#define SrcImmU16 (OpImmU16 << SrcShift)
+#define SrcImm64 (OpImm64 << SrcShift)
+#define SrcDX (OpDX << SrcShift)
+#define SrcMem8 (OpMem8 << SrcShift)
+#define SrcAccHi (OpAccHi << SrcShift)
+#define SrcMask (OpMask << SrcShift)
+#define BitOp (1<<11)
+#define MemAbs (1<<12) /* Memory operand is absolute displacement */
+#define String (1<<13) /* String instruction (rep capable) */
+#define Stack (1<<14) /* Stack instruction (push/pop) */
+#define GroupMask (7<<15) /* Opcode uses one of the group mechanisms */
+#define Group (1<<15) /* Bits 3:5 of modrm byte extend opcode */
+#define GroupDual (2<<15) /* Alternate decoding of mod == 3 */
+#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */
+#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
+#define Escape (5<<15) /* Escape to coprocessor instruction */
+#define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */
+#define ModeDual (7<<15) /* Different instruction for 32/64 bit */
+#define Sse (1<<18) /* SSE Vector instruction */
+/* Generic ModRM decode. */
+#define ModRM (1<<19)
+/* Destination is only written; never read. */
+#define Mov (1<<20)
+/* Misc flags */
+#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */
+#define EmulateOnUD (1<<22) /* Emulate if unsupported by the host */
+#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
+#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
+#define Undefined (1<<25) /* No Such Instruction */
+#define Lock (1<<26) /* lock prefix is allowed for the instruction */
+#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
+#define No64 (1<<28)
+#define PageTable (1 << 29) /* instruction used to write page table */
+#define NotImpl (1 << 30) /* instruction is not implemented */
+/* Source 2 operand type */
+#define Src2Shift (31)
+#define Src2None (OpNone << Src2Shift)
+#define Src2Mem (OpMem << Src2Shift)
+#define Src2CL (OpCL << Src2Shift)
+#define Src2ImmByte (OpImmByte << Src2Shift)
+#define Src2One (OpOne << Src2Shift)
+#define Src2Imm (OpImm << Src2Shift)
+#define Src2ES (OpES << Src2Shift)
+#define Src2CS (OpCS << Src2Shift)
+#define Src2SS (OpSS << Src2Shift)
+#define Src2DS (OpDS << Src2Shift)
+#define Src2FS (OpFS << Src2Shift)
+#define Src2GS (OpGS << Src2Shift)
+#define Src2Mask (OpMask << Src2Shift)
+#define Mmx ((u64)1 << 40) /* MMX Vector instruction */
+#define AlignMask ((u64)7 << 41)
+#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */
+#define Unaligned ((u64)2 << 41) /* Explicitly unaligned (e.g. MOVDQU) */
+#define Avx ((u64)3 << 41) /* Advanced Vector Extensions */
+#define Aligned16 ((u64)4 << 41) /* Aligned to 16 byte boundary (e.g. FXSAVE) */
+#define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */
+#define NoWrite ((u64)1 << 45) /* No writeback */
+#define SrcWrite ((u64)1 << 46) /* Write back src operand */
+#define NoMod ((u64)1 << 47) /* Mod field is ignored */
+#define Intercept ((u64)1 << 48) /* Has valid intercept field */
+#define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */
+#define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */
+#define NearBranch ((u64)1 << 52) /* Near branches */
+#define No16 ((u64)1 << 53) /* No 16 bit operand */
+#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */
+#define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operand */
+
+#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
+
+#define X2(x...) x, x
+#define X3(x...) X2(x), x
+#define X4(x...) X2(x), X2(x)
+#define X5(x...) X4(x), x
+#define X6(x...) X4(x), X2(x)
+#define X7(x...) X4(x), X3(x)
+#define X8(x...) X4(x), X4(x)
+#define X16(x...) X8(x), X8(x)
+
+#define NR_FASTOP (ilog2(sizeof(ulong)) + 1)
+#define FASTOP_SIZE 8
+
+/*
+ * fastop functions have a special calling convention:
+ *
+ * dst: rax (in/out)
+ * src: rdx (in/out)
+ * src2: rcx (in)
+ * flags: rflags (in/out)
+ * ex: rsi (in:fastop pointer, out:zero if exception)
+ *
+ * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
+ * different operand sizes can be reached by calculation, rather than a jump
+ * table (which would be bigger than the code).
+ *
+ * fastop functions are declared as taking a never-defined fastop parameter,
+ * so they can't be called from C directly.
+ */
+
+struct fastop;
+
+struct opcode {
+ u64 flags : 56;
+ u64 intercept : 8;
+ union {
+ int (*execute)(struct x86_emulate_ctxt *ctxt);
+ const struct opcode *group;
+ const struct group_dual *gdual;
+ const struct gprefix *gprefix;
+ const struct escape *esc;
+ const struct instr_dual *idual;
+ const struct mode_dual *mdual;
+ void (*fastop)(struct fastop *fake);
+ } u;
+ int (*check_perm)(struct x86_emulate_ctxt *ctxt);
+};
+
+struct group_dual {
+ struct opcode mod012[8];
+ struct opcode mod3[8];
+};
+
+struct gprefix {
+ struct opcode pfx_no;
+ struct opcode pfx_66;
+ struct opcode pfx_f2;
+ struct opcode pfx_f3;
+};
+
+struct escape {
+ struct opcode op[8];
+ struct opcode high[64];
+};
+
+struct instr_dual {
+ struct opcode mod012;
+ struct opcode mod3;
+};
+
+struct mode_dual {
+ struct opcode mode32;
+ struct opcode mode64;
+};
+
+#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
+
+enum x86_transfer_type {
+ X86_TRANSFER_NONE,
+ X86_TRANSFER_CALL_JMP,
+ X86_TRANSFER_RET,
+ X86_TRANSFER_TASK_SWITCH,
+};
+
+static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ if (!(ctxt->regs_valid & (1 << nr))) {
+ ctxt->regs_valid |= 1 << nr;
+ ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr);
+ }
+ return ctxt->_regs[nr];
+}
+
+static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ ctxt->regs_valid |= 1 << nr;
+ ctxt->regs_dirty |= 1 << nr;
+ return &ctxt->_regs[nr];
+}
+
+static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ reg_read(ctxt, nr);
+ return reg_write(ctxt, nr);
+}
+
+static void writeback_registers(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned reg;
+
+ for_each_set_bit(reg, (ulong *)&ctxt->regs_dirty, 16)
+ ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]);
+}
+
+static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->regs_dirty = 0;
+ ctxt->regs_valid = 0;
+}
+
+/*
+ * These EFLAGS bits are restored from saved value during emulation, and
+ * any changes are written back to the saved value after emulation.
+ */
+#define EFLAGS_MASK (X86_EFLAGS_OF|X86_EFLAGS_SF|X86_EFLAGS_ZF|X86_EFLAGS_AF|\
+ X86_EFLAGS_PF|X86_EFLAGS_CF)
+
+#ifdef CONFIG_X86_64
+#define ON64(x) x
+#else
+#define ON64(x)
+#endif
+
+static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
+
+#define FOP_FUNC(name) \
+ ".align " __stringify(FASTOP_SIZE) " \n\t" \
+ ".type " name ", @function \n\t" \
+ name ":\n\t"
+
+#define FOP_RET "ret \n\t"
+
+#define FOP_START(op) \
+ extern void em_##op(struct fastop *fake); \
+ asm(".pushsection .text, \"ax\" \n\t" \
+ ".global em_" #op " \n\t" \
+ FOP_FUNC("em_" #op)
+
+#define FOP_END \
+ ".popsection")
+
+#define FOPNOP() \
+ FOP_FUNC(__stringify(__UNIQUE_ID(nop))) \
+ FOP_RET
+
+#define FOP1E(op, dst) \
+ FOP_FUNC(#op "_" #dst) \
+ "10: " #op " %" #dst " \n\t" FOP_RET
+
+#define FOP1EEX(op, dst) \
+ FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception)
+
+#define FASTOP1(op) \
+ FOP_START(op) \
+ FOP1E(op##b, al) \
+ FOP1E(op##w, ax) \
+ FOP1E(op##l, eax) \
+ ON64(FOP1E(op##q, rax)) \
+ FOP_END
+
+/* 1-operand, using src2 (for MUL/DIV r/m) */
+#define FASTOP1SRC2(op, name) \
+ FOP_START(name) \
+ FOP1E(op, cl) \
+ FOP1E(op, cx) \
+ FOP1E(op, ecx) \
+ ON64(FOP1E(op, rcx)) \
+ FOP_END
+
+/* 1-operand, using src2 (for MUL/DIV r/m), with exceptions */
+#define FASTOP1SRC2EX(op, name) \
+ FOP_START(name) \
+ FOP1EEX(op, cl) \
+ FOP1EEX(op, cx) \
+ FOP1EEX(op, ecx) \
+ ON64(FOP1EEX(op, rcx)) \
+ FOP_END
+
+#define FOP2E(op, dst, src) \
+ FOP_FUNC(#op "_" #dst "_" #src) \
+ #op " %" #src ", %" #dst " \n\t" FOP_RET
+
+#define FASTOP2(op) \
+ FOP_START(op) \
+ FOP2E(op##b, al, dl) \
+ FOP2E(op##w, ax, dx) \
+ FOP2E(op##l, eax, edx) \
+ ON64(FOP2E(op##q, rax, rdx)) \
+ FOP_END
+
+/* 2 operand, word only */
+#define FASTOP2W(op) \
+ FOP_START(op) \
+ FOPNOP() \
+ FOP2E(op##w, ax, dx) \
+ FOP2E(op##l, eax, edx) \
+ ON64(FOP2E(op##q, rax, rdx)) \
+ FOP_END
+
+/* 2 operand, src is CL */
+#define FASTOP2CL(op) \
+ FOP_START(op) \
+ FOP2E(op##b, al, cl) \
+ FOP2E(op##w, ax, cl) \
+ FOP2E(op##l, eax, cl) \
+ ON64(FOP2E(op##q, rax, cl)) \
+ FOP_END
+
+/* 2 operand, src and dest are reversed */
+#define FASTOP2R(op, name) \
+ FOP_START(name) \
+ FOP2E(op##b, dl, al) \
+ FOP2E(op##w, dx, ax) \
+ FOP2E(op##l, edx, eax) \
+ ON64(FOP2E(op##q, rdx, rax)) \
+ FOP_END
+
+#define FOP3E(op, dst, src, src2) \
+ FOP_FUNC(#op "_" #dst "_" #src "_" #src2) \
+ #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET
+
+/* 3-operand, word-only, src2=cl */
+#define FASTOP3WCL(op) \
+ FOP_START(op) \
+ FOPNOP() \
+ FOP3E(op##w, ax, dx, cl) \
+ FOP3E(op##l, eax, edx, cl) \
+ ON64(FOP3E(op##q, rax, rdx, cl)) \
+ FOP_END
+
+/* Special case for SETcc - 1 instruction per cc */
+#define FOP_SETCC(op) \
+ ".align 4 \n\t" \
+ ".type " #op ", @function \n\t" \
+ #op ": \n\t" \
+ #op " %al \n\t" \
+ FOP_RET
+
+asm(".pushsection .fixup, \"ax\"\n"
+ ".global kvm_fastop_exception \n"
+ "kvm_fastop_exception: xor %esi, %esi; ret\n"
+ ".popsection");
+
+FOP_START(setcc)
+FOP_SETCC(seto)
+FOP_SETCC(setno)
+FOP_SETCC(setc)
+FOP_SETCC(setnc)
+FOP_SETCC(setz)
+FOP_SETCC(setnz)
+FOP_SETCC(setbe)
+FOP_SETCC(setnbe)
+FOP_SETCC(sets)
+FOP_SETCC(setns)
+FOP_SETCC(setp)
+FOP_SETCC(setnp)
+FOP_SETCC(setl)
+FOP_SETCC(setnl)
+FOP_SETCC(setle)
+FOP_SETCC(setnle)
+FOP_END;
+
+FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET
+FOP_END;
+
+/*
+ * XXX: inoutclob user must know where the argument is being expanded.
+ * Relying on CONFIG_CC_HAS_ASM_GOTO would allow us to remove _fault.
+ */
+#define asm_safe(insn, inoutclob...) \
+({ \
+ int _fault = 0; \
+ \
+ asm volatile("1:" insn "\n" \
+ "2:\n" \
+ ".pushsection .fixup, \"ax\"\n" \
+ "3: movl $1, %[_fault]\n" \
+ " jmp 2b\n" \
+ ".popsection\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : [_fault] "+qm"(_fault) inoutclob ); \
+ \
+ _fault ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; \
+})
+
+static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
+ enum x86_intercept intercept,
+ enum x86_intercept_stage stage)
+{
+ struct x86_instruction_info info = {
+ .intercept = intercept,
+ .rep_prefix = ctxt->rep_prefix,
+ .modrm_mod = ctxt->modrm_mod,
+ .modrm_reg = ctxt->modrm_reg,
+ .modrm_rm = ctxt->modrm_rm,
+ .src_val = ctxt->src.val64,
+ .dst_val = ctxt->dst.val64,
+ .src_bytes = ctxt->src.bytes,
+ .dst_bytes = ctxt->dst.bytes,
+ .ad_bytes = ctxt->ad_bytes,
+ .next_rip = ctxt->eip,
+ };
+
+ return ctxt->ops->intercept(ctxt, &info, stage);
+}
+
+static void assign_masked(ulong *dest, ulong src, ulong mask)
+{
+ *dest = (*dest & ~mask) | (src & mask);
+}
+
+static void assign_register(unsigned long *reg, u64 val, int bytes)
+{
+ /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
+ switch (bytes) {
+ case 1:
+ *(u8 *)reg = (u8)val;
+ break;
+ case 2:
+ *(u16 *)reg = (u16)val;
+ break;
+ case 4:
+ *reg = (u32)val;
+ break; /* 64b: zero-extend */
+ case 8:
+ *reg = val;
+ break;
+ }
+}
+
+static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
+{
+ return (1UL << (ctxt->ad_bytes << 3)) - 1;
+}
+
+static ulong stack_mask(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel;
+ struct desc_struct ss;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return ~0UL;
+ ctxt->ops->get_segment(ctxt, &sel, &ss, NULL, VCPU_SREG_SS);
+ return ~0U >> ((ss.d ^ 1) * 16); /* d=0: 0xffff; d=1: 0xffffffff */
+}
+
+static int stack_size(struct x86_emulate_ctxt *ctxt)
+{
+ return (__fls(stack_mask(ctxt)) + 1) >> 3;
+}
+
+/* Access/update address held in a register, based on addressing mode. */
+static inline unsigned long
+address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
+{
+ if (ctxt->ad_bytes == sizeof(unsigned long))
+ return reg;
+ else
+ return reg & ad_mask(ctxt);
+}
+
+static inline unsigned long
+register_address(struct x86_emulate_ctxt *ctxt, int reg)
+{
+ return address_mask(ctxt, reg_read(ctxt, reg));
+}
+
+static void masked_increment(ulong *reg, ulong mask, int inc)
+{
+ assign_masked(reg, *reg + inc, mask);
+}
+
+static inline void
+register_address_increment(struct x86_emulate_ctxt *ctxt, int reg, int inc)
+{
+ ulong *preg = reg_rmw(ctxt, reg);
+
+ assign_register(preg, *preg + inc, ctxt->ad_bytes);
+}
+
+static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
+{
+ masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc);
+}
+
+static u32 desc_limit_scaled(struct desc_struct *desc)
+{
+ u32 limit = get_desc_limit(desc);
+
+ return desc->g ? (limit << 12) | 0xfff : limit;
+}
+
+static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
+{
+ if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
+ return 0;
+
+ return ctxt->ops->get_cached_segment_base(ctxt, seg);
+}
+
+static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
+ u32 error, bool valid)
+{
+ WARN_ON(vec > 0x1f);
+ ctxt->exception.vector = vec;
+ ctxt->exception.error_code = error;
+ ctxt->exception.error_code_valid = valid;
+ return X86EMUL_PROPAGATE_FAULT;
+}
+
+static int emulate_db(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, DB_VECTOR, 0, false);
+}
+
+static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
+{
+ return emulate_exception(ctxt, GP_VECTOR, err, true);
+}
+
+static int emulate_ss(struct x86_emulate_ctxt *ctxt, int err)
+{
+ return emulate_exception(ctxt, SS_VECTOR, err, true);
+}
+
+static int emulate_ud(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, UD_VECTOR, 0, false);
+}
+
+static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
+{
+ return emulate_exception(ctxt, TS_VECTOR, err, true);
+}
+
+static int emulate_de(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, DE_VECTOR, 0, false);
+}
+
+static int emulate_nm(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, NM_VECTOR, 0, false);
+}
+
+static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg)
+{
+ u16 selector;
+ struct desc_struct desc;
+
+ ctxt->ops->get_segment(ctxt, &selector, &desc, NULL, seg);
+ return selector;
+}
+
+static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector,
+ unsigned seg)
+{
+ u16 dummy;
+ u32 base3;
+ struct desc_struct desc;
+
+ ctxt->ops->get_segment(ctxt, &dummy, &desc, &base3, seg);
+ ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg);
+}
+
+/*
+ * x86 defines three classes of vector instructions: explicitly
+ * aligned, explicitly unaligned, and the rest, which change behaviour
+ * depending on whether they're AVX encoded or not.
+ *
+ * Also included is CMPXCHG16B which is not a vector instruction, yet it is
+ * subject to the same check. FXSAVE and FXRSTOR are checked here too as their
+ * 512 bytes of data must be aligned to a 16 byte boundary.
+ */
+static unsigned insn_alignment(struct x86_emulate_ctxt *ctxt, unsigned size)
+{
+ u64 alignment = ctxt->d & AlignMask;
+
+ if (likely(size < 16))
+ return 1;
+
+ switch (alignment) {
+ case Unaligned:
+ case Avx:
+ return 1;
+ case Aligned16:
+ return 16;
+ case Aligned:
+ default:
+ return size;
+ }
+}
+
+static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ unsigned *max_size, unsigned size,
+ bool write, bool fetch,
+ enum x86emul_mode mode, ulong *linear)
+{
+ struct desc_struct desc;
+ bool usable;
+ ulong la;
+ u32 lim;
+ u16 sel;
+ u8 va_bits;
+
+ la = seg_base(ctxt, addr.seg) + addr.ea;
+ *max_size = 0;
+ switch (mode) {
+ case X86EMUL_MODE_PROT64:
+ *linear = la;
+ va_bits = ctxt_virt_addr_bits(ctxt);
+ if (get_canonical(la, va_bits) != la)
+ goto bad;
+
+ *max_size = min_t(u64, ~0u, (1ull << va_bits) - la);
+ if (size > *max_size)
+ goto bad;
+ break;
+ default:
+ *linear = la = (u32)la;
+ usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL,
+ addr.seg);
+ if (!usable)
+ goto bad;
+ /* code segment in protected mode or read-only data segment */
+ if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8))
+ || !(desc.type & 2)) && write)
+ goto bad;
+ /* unreadable code segment */
+ if (!fetch && (desc.type & 8) && !(desc.type & 2))
+ goto bad;
+ lim = desc_limit_scaled(&desc);
+ if (!(desc.type & 8) && (desc.type & 4)) {
+ /* expand-down segment */
+ if (addr.ea <= lim)
+ goto bad;
+ lim = desc.d ? 0xffffffff : 0xffff;
+ }
+ if (addr.ea > lim)
+ goto bad;
+ if (lim == 0xffffffff)
+ *max_size = ~0u;
+ else {
+ *max_size = (u64)lim + 1 - addr.ea;
+ if (size > *max_size)
+ goto bad;
+ }
+ break;
+ }
+ if (la & (insn_alignment(ctxt, size) - 1))
+ return emulate_gp(ctxt, 0);
+ return X86EMUL_CONTINUE;
+bad:
+ if (addr.seg == VCPU_SREG_SS)
+ return emulate_ss(ctxt, 0);
+ else
+ return emulate_gp(ctxt, 0);
+}
+
+static int linearize(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ unsigned size, bool write,
+ ulong *linear)
+{
+ unsigned max_size;
+ return __linearize(ctxt, addr, &max_size, size, write, false,
+ ctxt->mode, linear);
+}
+
+static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst,
+ enum x86emul_mode mode)
+{
+ ulong linear;
+ int rc;
+ unsigned max_size;
+ struct segmented_address addr = { .seg = VCPU_SREG_CS,
+ .ea = dst };
+
+ if (ctxt->op_bytes != sizeof(unsigned long))
+ addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1);
+ rc = __linearize(ctxt, addr, &max_size, 1, false, true, mode, &linear);
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->_eip = addr.ea;
+ return rc;
+}
+
+static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst)
+{
+ return assign_eip(ctxt, dst, ctxt->mode);
+}
+
+static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst,
+ const struct desc_struct *cs_desc)
+{
+ enum x86emul_mode mode = ctxt->mode;
+ int rc;
+
+#ifdef CONFIG_X86_64
+ if (ctxt->mode >= X86EMUL_MODE_PROT16) {
+ if (cs_desc->l) {
+ u64 efer = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (efer & EFER_LMA)
+ mode = X86EMUL_MODE_PROT64;
+ } else
+ mode = X86EMUL_MODE_PROT32; /* temporary value */
+ }
+#endif
+ if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32)
+ mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+ rc = assign_eip(ctxt, dst, mode);
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->mode = mode;
+ return rc;
+}
+
+static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
+{
+ return assign_eip_near(ctxt, ctxt->_eip + rel);
+}
+
+static int linear_read_system(struct x86_emulate_ctxt *ctxt, ulong linear,
+ void *data, unsigned size)
+{
+ return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception, true);
+}
+
+static int linear_write_system(struct x86_emulate_ctxt *ctxt,
+ ulong linear, void *data,
+ unsigned int size)
+{
+ return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception, true);
+}
+
+static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, false, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception, false);
+}
+
+static int segmented_write_std(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ void *data,
+ unsigned int size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, true, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception, false);
+}
+
+/*
+ * Prefetch the remaining bytes of the instruction without crossing page
+ * boundary if they are not in fetch_cache yet.
+ */
+static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size)
+{
+ int rc;
+ unsigned size, max_size;
+ unsigned long linear;
+ int cur_size = ctxt->fetch.end - ctxt->fetch.data;
+ struct segmented_address addr = { .seg = VCPU_SREG_CS,
+ .ea = ctxt->eip + cur_size };
+
+ /*
+ * We do not know exactly how many bytes will be needed, and
+ * __linearize is expensive, so fetch as much as possible. We
+ * just have to avoid going beyond the 15 byte limit, the end
+ * of the segment, or the end of the page.
+ *
+ * __linearize is called with size 0 so that it does not do any
+ * boundary check itself. Instead, we use max_size to check
+ * against op_size.
+ */
+ rc = __linearize(ctxt, addr, &max_size, 0, false, true, ctxt->mode,
+ &linear);
+ if (unlikely(rc != X86EMUL_CONTINUE))
+ return rc;
+
+ size = min_t(unsigned, 15UL ^ cur_size, max_size);
+ size = min_t(unsigned, size, PAGE_SIZE - offset_in_page(linear));
+
+ /*
+ * One instruction can only straddle two pages,
+ * and one has been loaded at the beginning of
+ * x86_decode_insn. So, if not enough bytes
+ * still, we must have hit the 15-byte boundary.
+ */
+ if (unlikely(size < op_size))
+ return emulate_gp(ctxt, 0);
+
+ rc = ctxt->ops->fetch(ctxt, linear, ctxt->fetch.end,
+ size, &ctxt->exception);
+ if (unlikely(rc != X86EMUL_CONTINUE))
+ return rc;
+ ctxt->fetch.end += size;
+ return X86EMUL_CONTINUE;
+}
+
+static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt,
+ unsigned size)
+{
+ unsigned done_size = ctxt->fetch.end - ctxt->fetch.ptr;
+
+ if (unlikely(done_size < size))
+ return __do_insn_fetch_bytes(ctxt, size - done_size);
+ else
+ return X86EMUL_CONTINUE;
+}
+
+/* Fetch next part of the instruction being emulated. */
+#define insn_fetch(_type, _ctxt) \
+({ _type _x; \
+ \
+ rc = do_insn_fetch_bytes(_ctxt, sizeof(_type)); \
+ if (rc != X86EMUL_CONTINUE) \
+ goto done; \
+ ctxt->_eip += sizeof(_type); \
+ memcpy(&_x, ctxt->fetch.ptr, sizeof(_type)); \
+ ctxt->fetch.ptr += sizeof(_type); \
+ _x; \
+})
+
+#define insn_fetch_arr(_arr, _size, _ctxt) \
+({ \
+ rc = do_insn_fetch_bytes(_ctxt, _size); \
+ if (rc != X86EMUL_CONTINUE) \
+ goto done; \
+ ctxt->_eip += (_size); \
+ memcpy(_arr, ctxt->fetch.ptr, _size); \
+ ctxt->fetch.ptr += (_size); \
+})
+
+/*
+ * Given the 'reg' portion of a ModRM byte, and a register block, return a
+ * pointer into the block that addresses the relevant register.
+ * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
+ */
+static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg,
+ int byteop)
+{
+ void *p;
+ int highbyte_regs = (ctxt->rex_prefix == 0) && byteop;
+
+ if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
+ p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1;
+ else
+ p = reg_rmw(ctxt, modrm_reg);
+ return p;
+}
+
+static int read_descriptor(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ u16 *size, unsigned long *address, int op_bytes)
+{
+ int rc;
+
+ if (op_bytes == 2)
+ op_bytes = 3;
+ *address = 0;
+ rc = segmented_read_std(ctxt, addr, size, 2);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ addr.ea += 2;
+ rc = segmented_read_std(ctxt, addr, address, op_bytes);
+ return rc;
+}
+
+FASTOP2(add);
+FASTOP2(or);
+FASTOP2(adc);
+FASTOP2(sbb);
+FASTOP2(and);
+FASTOP2(sub);
+FASTOP2(xor);
+FASTOP2(cmp);
+FASTOP2(test);
+
+FASTOP1SRC2(mul, mul_ex);
+FASTOP1SRC2(imul, imul_ex);
+FASTOP1SRC2EX(div, div_ex);
+FASTOP1SRC2EX(idiv, idiv_ex);
+
+FASTOP3WCL(shld);
+FASTOP3WCL(shrd);
+
+FASTOP2W(imul);
+
+FASTOP1(not);
+FASTOP1(neg);
+FASTOP1(inc);
+FASTOP1(dec);
+
+FASTOP2CL(rol);
+FASTOP2CL(ror);
+FASTOP2CL(rcl);
+FASTOP2CL(rcr);
+FASTOP2CL(shl);
+FASTOP2CL(shr);
+FASTOP2CL(sar);
+
+FASTOP2W(bsf);
+FASTOP2W(bsr);
+FASTOP2W(bt);
+FASTOP2W(bts);
+FASTOP2W(btr);
+FASTOP2W(btc);
+
+FASTOP2(xadd);
+
+FASTOP2R(cmp, cmp_r);
+
+static int em_bsf_c(struct x86_emulate_ctxt *ctxt)
+{
+ /* If src is zero, do not writeback, but update flags */
+ if (ctxt->src.val == 0)
+ ctxt->dst.type = OP_NONE;
+ return fastop(ctxt, em_bsf);
+}
+
+static int em_bsr_c(struct x86_emulate_ctxt *ctxt)
+{
+ /* If src is zero, do not writeback, but update flags */
+ if (ctxt->src.val == 0)
+ ctxt->dst.type = OP_NONE;
+ return fastop(ctxt, em_bsr);
+}
+
+static __always_inline u8 test_cc(unsigned int condition, unsigned long flags)
+{
+ u8 rc;
+ void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf);
+
+ flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;
+ asm("push %[flags]; popf; " CALL_NOSPEC
+ : "=a"(rc) : [thunk_target]"r"(fop), [flags]"r"(flags));
+ return rc;
+}
+
+static void fetch_register_operand(struct operand *op)
+{
+ switch (op->bytes) {
+ case 1:
+ op->val = *(u8 *)op->addr.reg;
+ break;
+ case 2:
+ op->val = *(u16 *)op->addr.reg;
+ break;
+ case 4:
+ op->val = *(u32 *)op->addr.reg;
+ break;
+ case 8:
+ op->val = *(u64 *)op->addr.reg;
+ break;
+ }
+}
+
+static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)
+{
+ switch (reg) {
+ case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break;
+ case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break;
+ case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break;
+ case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break;
+ case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break;
+ case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break;
+ case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break;
+ case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break;
+ case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break;
+ case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break;
+ case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break;
+ case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break;
+ case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break;
+ case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break;
+ case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break;
+#endif
+ default: BUG();
+ }
+}
+
+static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
+ int reg)
+{
+ switch (reg) {
+ case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break;
+ case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break;
+ case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break;
+ case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break;
+ case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break;
+ case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break;
+ case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break;
+ case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break;
+ case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break;
+ case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break;
+ case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break;
+ case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break;
+ case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break;
+ case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break;
+ case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break;
+#endif
+ default: BUG();
+ }
+}
+
+static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
+{
+ switch (reg) {
+ case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;
+ case 1: asm("movq %%mm1, %0" : "=m"(*data)); break;
+ case 2: asm("movq %%mm2, %0" : "=m"(*data)); break;
+ case 3: asm("movq %%mm3, %0" : "=m"(*data)); break;
+ case 4: asm("movq %%mm4, %0" : "=m"(*data)); break;
+ case 5: asm("movq %%mm5, %0" : "=m"(*data)); break;
+ case 6: asm("movq %%mm6, %0" : "=m"(*data)); break;
+ case 7: asm("movq %%mm7, %0" : "=m"(*data)); break;
+ default: BUG();
+ }
+}
+
+static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
+{
+ switch (reg) {
+ case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;
+ case 1: asm("movq %0, %%mm1" : : "m"(*data)); break;
+ case 2: asm("movq %0, %%mm2" : : "m"(*data)); break;
+ case 3: asm("movq %0, %%mm3" : : "m"(*data)); break;
+ case 4: asm("movq %0, %%mm4" : : "m"(*data)); break;
+ case 5: asm("movq %0, %%mm5" : : "m"(*data)); break;
+ case 6: asm("movq %0, %%mm6" : : "m"(*data)); break;
+ case 7: asm("movq %0, %%mm7" : : "m"(*data)); break;
+ default: BUG();
+ }
+}
+
+static int em_fninit(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ asm volatile("fninit");
+ return X86EMUL_CONTINUE;
+}
+
+static int em_fnstcw(struct x86_emulate_ctxt *ctxt)
+{
+ u16 fcw;
+
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ asm volatile("fnstcw %0": "+m"(fcw));
+
+ ctxt->dst.val = fcw;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
+{
+ u16 fsw;
+
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ asm volatile("fnstsw %0": "+m"(fsw));
+
+ ctxt->dst.val = fsw;
+
+ return X86EMUL_CONTINUE;
+}
+
+static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
+ struct operand *op)
+{
+ unsigned reg = ctxt->modrm_reg;
+
+ if (!(ctxt->d & ModRM))
+ reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
+
+ if (ctxt->d & Sse) {
+ op->type = OP_XMM;
+ op->bytes = 16;
+ op->addr.xmm = reg;
+ read_sse_reg(ctxt, &op->vec_val, reg);
+ return;
+ }
+ if (ctxt->d & Mmx) {
+ reg &= 7;
+ op->type = OP_MM;
+ op->bytes = 8;
+ op->addr.mm = reg;
+ return;
+ }
+
+ op->type = OP_REG;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.reg = decode_register(ctxt, reg, ctxt->d & ByteOp);
+
+ fetch_register_operand(op);
+ op->orig_val = op->val;
+}
+
+static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg)
+{
+ if (base_reg == VCPU_REGS_RSP || base_reg == VCPU_REGS_RBP)
+ ctxt->modrm_seg = VCPU_SREG_SS;
+}
+
+static int decode_modrm(struct x86_emulate_ctxt *ctxt,
+ struct operand *op)
+{
+ u8 sib;
+ int index_reg, base_reg, scale;
+ int rc = X86EMUL_CONTINUE;
+ ulong modrm_ea = 0;
+
+ ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */
+ index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */
+ base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */
+
+ ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6;
+ ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
+ ctxt->modrm_rm = base_reg | (ctxt->modrm & 0x07);
+ ctxt->modrm_seg = VCPU_SREG_DS;
+
+ if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) {
+ op->type = OP_REG;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.reg = decode_register(ctxt, ctxt->modrm_rm,
+ ctxt->d & ByteOp);
+ if (ctxt->d & Sse) {
+ op->type = OP_XMM;
+ op->bytes = 16;
+ op->addr.xmm = ctxt->modrm_rm;
+ read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm);
+ return rc;
+ }
+ if (ctxt->d & Mmx) {
+ op->type = OP_MM;
+ op->bytes = 8;
+ op->addr.mm = ctxt->modrm_rm & 7;
+ return rc;
+ }
+ fetch_register_operand(op);
+ return rc;
+ }
+
+ op->type = OP_MEM;
+
+ if (ctxt->ad_bytes == 2) {
+ unsigned bx = reg_read(ctxt, VCPU_REGS_RBX);
+ unsigned bp = reg_read(ctxt, VCPU_REGS_RBP);
+ unsigned si = reg_read(ctxt, VCPU_REGS_RSI);
+ unsigned di = reg_read(ctxt, VCPU_REGS_RDI);
+
+ /* 16-bit ModR/M decode. */
+ switch (ctxt->modrm_mod) {
+ case 0:
+ if (ctxt->modrm_rm == 6)
+ modrm_ea += insn_fetch(u16, ctxt);
+ break;
+ case 1:
+ modrm_ea += insn_fetch(s8, ctxt);
+ break;
+ case 2:
+ modrm_ea += insn_fetch(u16, ctxt);
+ break;
+ }
+ switch (ctxt->modrm_rm) {
+ case 0:
+ modrm_ea += bx + si;
+ break;
+ case 1:
+ modrm_ea += bx + di;
+ break;
+ case 2:
+ modrm_ea += bp + si;
+ break;
+ case 3:
+ modrm_ea += bp + di;
+ break;
+ case 4:
+ modrm_ea += si;
+ break;
+ case 5:
+ modrm_ea += di;
+ break;
+ case 6:
+ if (ctxt->modrm_mod != 0)
+ modrm_ea += bp;
+ break;
+ case 7:
+ modrm_ea += bx;
+ break;
+ }
+ if (ctxt->modrm_rm == 2 || ctxt->modrm_rm == 3 ||
+ (ctxt->modrm_rm == 6 && ctxt->modrm_mod != 0))
+ ctxt->modrm_seg = VCPU_SREG_SS;
+ modrm_ea = (u16)modrm_ea;
+ } else {
+ /* 32/64-bit ModR/M decode. */
+ if ((ctxt->modrm_rm & 7) == 4) {
+ sib = insn_fetch(u8, ctxt);
+ index_reg |= (sib >> 3) & 7;
+ base_reg |= sib & 7;
+ scale = sib >> 6;
+
+ if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
+ modrm_ea += insn_fetch(s32, ctxt);
+ else {
+ modrm_ea += reg_read(ctxt, base_reg);
+ adjust_modrm_seg(ctxt, base_reg);
+ /* Increment ESP on POP [ESP] */
+ if ((ctxt->d & IncSP) &&
+ base_reg == VCPU_REGS_RSP)
+ modrm_ea += ctxt->op_bytes;
+ }
+ if (index_reg != 4)
+ modrm_ea += reg_read(ctxt, index_reg) << scale;
+ } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
+ modrm_ea += insn_fetch(s32, ctxt);
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ ctxt->rip_relative = 1;
+ } else {
+ base_reg = ctxt->modrm_rm;
+ modrm_ea += reg_read(ctxt, base_reg);
+ adjust_modrm_seg(ctxt, base_reg);
+ }
+ switch (ctxt->modrm_mod) {
+ case 1:
+ modrm_ea += insn_fetch(s8, ctxt);
+ break;
+ case 2:
+ modrm_ea += insn_fetch(s32, ctxt);
+ break;
+ }
+ }
+ op->addr.mem.ea = modrm_ea;
+ if (ctxt->ad_bytes != 8)
+ ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea;
+
+done:
+ return rc;
+}
+
+static int decode_abs(struct x86_emulate_ctxt *ctxt,
+ struct operand *op)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ op->type = OP_MEM;
+ switch (ctxt->ad_bytes) {
+ case 2:
+ op->addr.mem.ea = insn_fetch(u16, ctxt);
+ break;
+ case 4:
+ op->addr.mem.ea = insn_fetch(u32, ctxt);
+ break;
+ case 8:
+ op->addr.mem.ea = insn_fetch(u64, ctxt);
+ break;
+ }
+done:
+ return rc;
+}
+
+static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt)
+{
+ long sv = 0, mask;
+
+ if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) {
+ mask = ~((long)ctxt->dst.bytes * 8 - 1);
+
+ if (ctxt->src.bytes == 2)
+ sv = (s16)ctxt->src.val & (s16)mask;
+ else if (ctxt->src.bytes == 4)
+ sv = (s32)ctxt->src.val & (s32)mask;
+ else
+ sv = (s64)ctxt->src.val & (s64)mask;
+
+ ctxt->dst.addr.mem.ea = address_mask(ctxt,
+ ctxt->dst.addr.mem.ea + (sv >> 3));
+ }
+
+ /* only subword offset */
+ ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
+}
+
+static int read_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *dest, unsigned size)
+{
+ int rc;
+ struct read_cache *mc = &ctxt->mem_read;
+
+ if (mc->pos < mc->end)
+ goto read_cached;
+
+ WARN_ON((mc->end + size) >= sizeof(mc->data));
+
+ rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size,
+ &ctxt->exception);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ mc->end += size;
+
+read_cached:
+ memcpy(dest, mc->data + mc->pos, size);
+ mc->pos += size;
+ return X86EMUL_CONTINUE;
+}
+
+static int segmented_read(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, false, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return read_emulated(ctxt, linear, data, size);
+}
+
+static int segmented_write(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ const void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, true, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->write_emulated(ctxt, linear, data, size,
+ &ctxt->exception);
+}
+
+static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ const void *orig_data, const void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, true, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->cmpxchg_emulated(ctxt, linear, orig_data, data,
+ size, &ctxt->exception);
+}
+
+static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned int size, unsigned short port,
+ void *dest)
+{
+ struct read_cache *rc = &ctxt->io_read;
+
+ if (rc->pos == rc->end) { /* refill pio read ahead */
+ unsigned int in_page, n;
+ unsigned int count = ctxt->rep_prefix ?
+ address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1;
+ in_page = (ctxt->eflags & X86_EFLAGS_DF) ?
+ offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) :
+ PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI));
+ n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count);
+ if (n == 0)
+ n = 1;
+ rc->pos = rc->end = 0;
+ if (!ctxt->ops->pio_in_emulated(ctxt, size, port, rc->data, n))
+ return 0;
+ rc->end = n * size;
+ }
+
+ if (ctxt->rep_prefix && (ctxt->d & String) &&
+ !(ctxt->eflags & X86_EFLAGS_DF)) {
+ ctxt->dst.data = rc->data + rc->pos;
+ ctxt->dst.type = OP_MEM_STR;
+ ctxt->dst.count = (rc->end - rc->pos) / size;
+ rc->pos = rc->end;
+ } else {
+ memcpy(dest, rc->data + rc->pos, size);
+ rc->pos += size;
+ }
+ return 1;
+}
+
+static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 index, struct desc_struct *desc)
+{
+ struct desc_ptr dt;
+ ulong addr;
+
+ ctxt->ops->get_idt(ctxt, &dt);
+
+ if (dt.size < index * 8 + 7)
+ return emulate_gp(ctxt, index << 3 | 0x2);
+
+ addr = dt.address + index * 8;
+ return linear_read_system(ctxt, addr, desc, sizeof *desc);
+}
+
+static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
+ u16 selector, struct desc_ptr *dt)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ u32 base3 = 0;
+
+ if (selector & 1 << 2) {
+ struct desc_struct desc;
+ u16 sel;
+
+ memset (dt, 0, sizeof *dt);
+ if (!ops->get_segment(ctxt, &sel, &desc, &base3,
+ VCPU_SREG_LDTR))
+ return;
+
+ dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
+ dt->address = get_desc_base(&desc) | ((u64)base3 << 32);
+ } else
+ ops->get_gdt(ctxt, dt);
+}
+
+static int get_descriptor_ptr(struct x86_emulate_ctxt *ctxt,
+ u16 selector, ulong *desc_addr_p)
+{
+ struct desc_ptr dt;
+ u16 index = selector >> 3;
+ ulong addr;
+
+ get_descriptor_table_ptr(ctxt, selector, &dt);
+
+ if (dt.size < index * 8 + 7)
+ return emulate_gp(ctxt, selector & 0xfffc);
+
+ addr = dt.address + index * 8;
+
+#ifdef CONFIG_X86_64
+ if (addr >> 32 != 0) {
+ u64 efer = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (!(efer & EFER_LMA))
+ addr &= (u32)-1;
+ }
+#endif
+
+ *desc_addr_p = addr;
+ return X86EMUL_CONTINUE;
+}
+
+/* allowed just for 8 bytes segments */
+static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, struct desc_struct *desc,
+ ulong *desc_addr_p)
+{
+ int rc;
+
+ rc = get_descriptor_ptr(ctxt, selector, desc_addr_p);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return linear_read_system(ctxt, *desc_addr_p, desc, sizeof(*desc));
+}
+
+/* allowed just for 8 bytes segments */
+static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, struct desc_struct *desc)
+{
+ int rc;
+ ulong addr;
+
+ rc = get_descriptor_ptr(ctxt, selector, &addr);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return linear_write_system(ctxt, addr, desc, sizeof *desc);
+}
+
+static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, int seg, u8 cpl,
+ enum x86_transfer_type transfer,
+ struct desc_struct *desc)
+{
+ struct desc_struct seg_desc, old_desc;
+ u8 dpl, rpl;
+ unsigned err_vec = GP_VECTOR;
+ u32 err_code = 0;
+ bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
+ ulong desc_addr;
+ int ret;
+ u16 dummy;
+ u32 base3 = 0;
+
+ memset(&seg_desc, 0, sizeof seg_desc);
+
+ if (ctxt->mode == X86EMUL_MODE_REAL) {
+ /* set real mode segment descriptor (keep limit etc. for
+ * unreal mode) */
+ ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg);
+ set_desc_base(&seg_desc, selector << 4);
+ goto load;
+ } else if (seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) {
+ /* VM86 needs a clean new segment descriptor */
+ set_desc_base(&seg_desc, selector << 4);
+ set_desc_limit(&seg_desc, 0xffff);
+ seg_desc.type = 3;
+ seg_desc.p = 1;
+ seg_desc.s = 1;
+ seg_desc.dpl = 3;
+ goto load;
+ }
+
+ rpl = selector & 3;
+
+ /* TR should be in GDT only */
+ if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
+ goto exception;
+
+ /* NULL selector is not valid for TR, CS and (except for long mode) SS */
+ if (null_selector) {
+ if (seg == VCPU_SREG_CS || seg == VCPU_SREG_TR)
+ goto exception;
+
+ if (seg == VCPU_SREG_SS) {
+ if (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)
+ goto exception;
+
+ /*
+ * ctxt->ops->set_segment expects the CPL to be in
+ * SS.DPL, so fake an expand-up 32-bit data segment.
+ */
+ seg_desc.type = 3;
+ seg_desc.p = 1;
+ seg_desc.s = 1;
+ seg_desc.dpl = cpl;
+ seg_desc.d = 1;
+ seg_desc.g = 1;
+ }
+
+ /* Skip all following checks */
+ goto load;
+ }
+
+ ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ err_code = selector & 0xfffc;
+ err_vec = (transfer == X86_TRANSFER_TASK_SWITCH) ? TS_VECTOR :
+ GP_VECTOR;
+
+ /* can't load system descriptor into segment selector */
+ if (seg <= VCPU_SREG_GS && !seg_desc.s) {
+ if (transfer == X86_TRANSFER_CALL_JMP)
+ return X86EMUL_UNHANDLEABLE;
+ goto exception;
+ }
+
+ dpl = seg_desc.dpl;
+
+ switch (seg) {
+ case VCPU_SREG_SS:
+ /*
+ * segment is not a writable data segment or segment
+ * selector's RPL != CPL or segment selector's RPL != CPL
+ */
+ if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl)
+ goto exception;
+ break;
+ case VCPU_SREG_CS:
+ if (!(seg_desc.type & 8))
+ goto exception;
+
+ if (seg_desc.type & 4) {
+ /* conforming */
+ if (dpl > cpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (rpl > cpl || dpl != cpl)
+ goto exception;
+ }
+ /* in long-mode d/b must be clear if l is set */
+ if (seg_desc.d && seg_desc.l) {
+ u64 efer = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (efer & EFER_LMA)
+ goto exception;
+ }
+
+ /* CS(RPL) <- CPL */
+ selector = (selector & 0xfffc) | cpl;
+ break;
+ case VCPU_SREG_TR:
+ if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
+ goto exception;
+ if (!seg_desc.p) {
+ err_vec = NP_VECTOR;
+ goto exception;
+ }
+ old_desc = seg_desc;
+ seg_desc.type |= 2; /* busy */
+ ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
+ sizeof(seg_desc), &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ break;
+ case VCPU_SREG_LDTR:
+ if (seg_desc.s || seg_desc.type != 2)
+ goto exception;
+ break;
+ default: /* DS, ES, FS, or GS */
+ /*
+ * segment is not a data or readable code segment or
+ * ((segment is a data or nonconforming code segment)
+ * and (both RPL and CPL > DPL))
+ */
+ if ((seg_desc.type & 0xa) == 0x8 ||
+ (((seg_desc.type & 0xc) != 0xc) &&
+ (rpl > dpl && cpl > dpl)))
+ goto exception;
+ break;
+ }
+
+ if (!seg_desc.p) {
+ err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+ goto exception;
+ }
+
+ if (seg_desc.s) {
+ /* mark segment as accessed */
+ if (!(seg_desc.type & 1)) {
+ seg_desc.type |= 1;
+ ret = write_segment_descriptor(ctxt, selector,
+ &seg_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ }
+ } else if (ctxt->mode == X86EMUL_MODE_PROT64) {
+ ret = linear_read_system(ctxt, desc_addr+8, &base3, sizeof(base3));
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ if (emul_is_noncanonical_address(get_desc_base(&seg_desc) |
+ ((u64)base3 << 32), ctxt))
+ return emulate_gp(ctxt, 0);
+ }
+load:
+ ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg);
+ if (desc)
+ *desc = seg_desc;
+ return X86EMUL_CONTINUE;
+exception:
+ return emulate_exception(ctxt, err_vec, err_code, true);
+}
+
+static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, int seg)
+{
+ u8 cpl = ctxt->ops->cpl(ctxt);
+
+ /*
+ * None of MOV, POP and LSS can load a NULL selector in CPL=3, but
+ * they can load it at CPL<3 (Intel's manual says only LSS can,
+ * but it's wrong).
+ *
+ * However, the Intel manual says that putting IST=1/DPL=3 in
+ * an interrupt gate will result in SS=3 (the AMD manual instead
+ * says it doesn't), so allow SS=3 in __load_segment_descriptor
+ * and only forbid it here.
+ */
+ if (seg == VCPU_SREG_SS && selector == 3 &&
+ ctxt->mode == X86EMUL_MODE_PROT64)
+ return emulate_exception(ctxt, GP_VECTOR, 0, true);
+
+ return __load_segment_descriptor(ctxt, selector, seg, cpl,
+ X86_TRANSFER_NONE, NULL);
+}
+
+static void write_register_operand(struct operand *op)
+{
+ return assign_register(op->addr.reg, op->val, op->bytes);
+}
+
+static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
+{
+ switch (op->type) {
+ case OP_REG:
+ write_register_operand(op);
+ break;
+ case OP_MEM:
+ if (ctxt->lock_prefix)
+ return segmented_cmpxchg(ctxt,
+ op->addr.mem,
+ &op->orig_val,
+ &op->val,
+ op->bytes);
+ else
+ return segmented_write(ctxt,
+ op->addr.mem,
+ &op->val,
+ op->bytes);
+ break;
+ case OP_MEM_STR:
+ return segmented_write(ctxt,
+ op->addr.mem,
+ op->data,
+ op->bytes * op->count);
+ break;
+ case OP_XMM:
+ write_sse_reg(ctxt, &op->vec_val, op->addr.xmm);
+ break;
+ case OP_MM:
+ write_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
+ break;
+ case OP_NONE:
+ /* no writeback */
+ break;
+ default:
+ break;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)
+{
+ struct segmented_address addr;
+
+ rsp_increment(ctxt, -bytes);
+ addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
+ addr.seg = VCPU_SREG_SS;
+
+ return segmented_write(ctxt, addr, data, bytes);
+}
+
+static int em_push(struct x86_emulate_ctxt *ctxt)
+{
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return push(ctxt, &ctxt->src.val, ctxt->op_bytes);
+}
+
+static int emulate_pop(struct x86_emulate_ctxt *ctxt,
+ void *dest, int len)
+{
+ int rc;
+ struct segmented_address addr;
+
+ addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
+ addr.seg = VCPU_SREG_SS;
+ rc = segmented_read(ctxt, addr, dest, len);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rsp_increment(ctxt, len);
+ return rc;
+}
+
+static int em_pop(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
+}
+
+static int emulate_popf(struct x86_emulate_ctxt *ctxt,
+ void *dest, int len)
+{
+ int rc;
+ unsigned long val, change_mask;
+ int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
+ int cpl = ctxt->ops->cpl(ctxt);
+
+ rc = emulate_pop(ctxt, &val, len);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ change_mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF |
+ X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_NT |
+ X86_EFLAGS_AC | X86_EFLAGS_ID;
+
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_PROT64:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT16:
+ if (cpl == 0)
+ change_mask |= X86_EFLAGS_IOPL;
+ if (cpl <= iopl)
+ change_mask |= X86_EFLAGS_IF;
+ break;
+ case X86EMUL_MODE_VM86:
+ if (iopl < 3)
+ return emulate_gp(ctxt, 0);
+ change_mask |= X86_EFLAGS_IF;
+ break;
+ default: /* real mode */
+ change_mask |= (X86_EFLAGS_IOPL | X86_EFLAGS_IF);
+ break;
+ }
+
+ *(unsigned long *)dest =
+ (ctxt->eflags & ~change_mask) | (val & change_mask);
+
+ return rc;
+}
+
+static int em_popf(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.type = OP_REG;
+ ctxt->dst.addr.reg = &ctxt->eflags;
+ ctxt->dst.bytes = ctxt->op_bytes;
+ return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
+}
+
+static int em_enter(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned frame_size = ctxt->src.val;
+ unsigned nesting_level = ctxt->src2.val & 31;
+ ulong rbp;
+
+ if (nesting_level)
+ return X86EMUL_UNHANDLEABLE;
+
+ rbp = reg_read(ctxt, VCPU_REGS_RBP);
+ rc = push(ctxt, &rbp, stack_size(ctxt));
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ assign_masked(reg_rmw(ctxt, VCPU_REGS_RBP), reg_read(ctxt, VCPU_REGS_RSP),
+ stack_mask(ctxt));
+ assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP),
+ reg_read(ctxt, VCPU_REGS_RSP) - frame_size,
+ stack_mask(ctxt));
+ return X86EMUL_CONTINUE;
+}
+
+static int em_leave(struct x86_emulate_ctxt *ctxt)
+{
+ assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP), reg_read(ctxt, VCPU_REGS_RBP),
+ stack_mask(ctxt));
+ return emulate_pop(ctxt, reg_rmw(ctxt, VCPU_REGS_RBP), ctxt->op_bytes);
+}
+
+static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
+{
+ int seg = ctxt->src2.val;
+
+ ctxt->src.val = get_segment_selector(ctxt, seg);
+ if (ctxt->op_bytes == 4) {
+ rsp_increment(ctxt, -2);
+ ctxt->op_bytes = 2;
+ }
+
+ return em_push(ctxt);
+}
+
+static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
+{
+ int seg = ctxt->src2.val;
+ unsigned long selector;
+ int rc;
+
+ rc = emulate_pop(ctxt, &selector, 2);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (ctxt->modrm_reg == VCPU_SREG_SS)
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+ if (ctxt->op_bytes > 2)
+ rsp_increment(ctxt, ctxt->op_bytes - 2);
+
+ rc = load_segment_descriptor(ctxt, (u16)selector, seg);
+ return rc;
+}
+
+static int em_pusha(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long old_esp = reg_read(ctxt, VCPU_REGS_RSP);
+ int rc = X86EMUL_CONTINUE;
+ int reg = VCPU_REGS_RAX;
+
+ while (reg <= VCPU_REGS_RDI) {
+ (reg == VCPU_REGS_RSP) ?
+ (ctxt->src.val = old_esp) : (ctxt->src.val = reg_read(ctxt, reg));
+
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ++reg;
+ }
+
+ return rc;
+}
+
+static int em_pushf(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->src.val = (unsigned long)ctxt->eflags & ~X86_EFLAGS_VM;
+ return em_push(ctxt);
+}
+
+static int em_popa(struct x86_emulate_ctxt *ctxt)
+{
+ int rc = X86EMUL_CONTINUE;
+ int reg = VCPU_REGS_RDI;
+ u32 val;
+
+ while (reg >= VCPU_REGS_RAX) {
+ if (reg == VCPU_REGS_RSP) {
+ rsp_increment(ctxt, ctxt->op_bytes);
+ --reg;
+ }
+
+ rc = emulate_pop(ctxt, &val, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ break;
+ assign_register(reg_rmw(ctxt, reg), val, ctxt->op_bytes);
+ --reg;
+ }
+ return rc;
+}
+
+static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ int rc;
+ struct desc_ptr dt;
+ gva_t cs_addr;
+ gva_t eip_addr;
+ u16 cs, eip;
+
+ /* TODO: Add limit checks */
+ ctxt->src.val = ctxt->eflags;
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->eflags &= ~(X86_EFLAGS_IF | X86_EFLAGS_TF | X86_EFLAGS_AC);
+
+ ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->src.val = ctxt->_eip;
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ops->get_idt(ctxt, &dt);
+
+ eip_addr = dt.address + (irq << 2);
+ cs_addr = dt.address + (irq << 2) + 2;
+
+ rc = linear_read_system(ctxt, cs_addr, &cs, 2);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = linear_read_system(ctxt, eip_addr, &eip, 2);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = load_segment_descriptor(ctxt, cs, VCPU_SREG_CS);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->_eip = eip;
+
+ return rc;
+}
+
+int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
+{
+ int rc;
+
+ invalidate_registers(ctxt);
+ rc = __emulate_int_real(ctxt, irq);
+ if (rc == X86EMUL_CONTINUE)
+ writeback_registers(ctxt);
+ return rc;
+}
+
+static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq)
+{
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_REAL:
+ return __emulate_int_real(ctxt, irq);
+ case X86EMUL_MODE_VM86:
+ case X86EMUL_MODE_PROT16:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT64:
+ default:
+ /* Protected mode interrupts unimplemented yet */
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
+{
+ int rc = X86EMUL_CONTINUE;
+ unsigned long temp_eip = 0;
+ unsigned long temp_eflags = 0;
+ unsigned long cs = 0;
+ unsigned long mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_TF |
+ X86_EFLAGS_IF | X86_EFLAGS_DF | X86_EFLAGS_OF |
+ X86_EFLAGS_IOPL | X86_EFLAGS_NT | X86_EFLAGS_RF |
+ X86_EFLAGS_AC | X86_EFLAGS_ID |
+ X86_EFLAGS_FIXED;
+ unsigned long vm86_mask = X86_EFLAGS_VM | X86_EFLAGS_VIF |
+ X86_EFLAGS_VIP;
+
+ /* TODO: Add stack limit check */
+
+ rc = emulate_pop(ctxt, &temp_eip, ctxt->op_bytes);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (temp_eip & ~0xffff)
+ return emulate_gp(ctxt, 0);
+
+ rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = emulate_pop(ctxt, &temp_eflags, ctxt->op_bytes);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->_eip = temp_eip;
+
+ if (ctxt->op_bytes == 4)
+ ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
+ else if (ctxt->op_bytes == 2) {
+ ctxt->eflags &= ~0xffff;
+ ctxt->eflags |= temp_eflags;
+ }
+
+ ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
+ ctxt->eflags |= X86_EFLAGS_FIXED;
+ ctxt->ops->set_nmi_mask(ctxt, false);
+
+ return rc;
+}
+
+static int em_iret(struct x86_emulate_ctxt *ctxt)
+{
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_REAL:
+ return emulate_iret_real(ctxt);
+ case X86EMUL_MODE_VM86:
+ case X86EMUL_MODE_PROT16:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT64:
+ default:
+ /* iret from protected mode unimplemented yet */
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned short sel;
+ struct desc_struct new_desc;
+ u8 cpl = ctxt->ops->cpl(ctxt);
+
+ memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
+
+ rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_CALL_JMP,
+ &new_desc);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
+ /* Error handling is not implemented. */
+ if (rc != X86EMUL_CONTINUE)
+ return X86EMUL_UNHANDLEABLE;
+
+ return rc;
+}
+
+static int em_jmp_abs(struct x86_emulate_ctxt *ctxt)
+{
+ return assign_eip_near(ctxt, ctxt->src.val);
+}
+
+static int em_call_near_abs(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ long int old_eip;
+
+ old_eip = ctxt->_eip;
+ rc = assign_eip_near(ctxt, ctxt->src.val);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ ctxt->src.val = old_eip;
+ rc = em_push(ctxt);
+ return rc;
+}
+
+static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
+{
+ u64 old = ctxt->dst.orig_val64;
+
+ if (ctxt->dst.bytes == 16)
+ return X86EMUL_UNHANDLEABLE;
+
+ if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
+ ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
+ *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32);
+ ctxt->eflags &= ~X86_EFLAGS_ZF;
+ } else {
+ ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) |
+ (u32) reg_read(ctxt, VCPU_REGS_RBX);
+
+ ctxt->eflags |= X86_EFLAGS_ZF;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static int em_ret(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned long eip;
+
+ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return assign_eip_near(ctxt, eip);
+}
+
+static int em_ret_far(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned long eip, cs;
+ int cpl = ctxt->ops->cpl(ctxt);
+ struct desc_struct new_desc;
+
+ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ /* Outer-privilege level return is not implemented */
+ if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl)
+ return X86EMUL_UNHANDLEABLE;
+ rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_RET,
+ &new_desc);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = assign_eip_far(ctxt, eip, &new_desc);
+ /* Error handling is not implemented. */
+ if (rc != X86EMUL_CONTINUE)
+ return X86EMUL_UNHANDLEABLE;
+
+ return rc;
+}
+
+static int em_ret_far_imm(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+
+ rc = em_ret_far(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rsp_increment(ctxt, ctxt->src.val);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
+{
+ /* Save real source value, then compare EAX against destination. */
+ ctxt->dst.orig_val = ctxt->dst.val;
+ ctxt->dst.val = reg_read(ctxt, VCPU_REGS_RAX);
+ ctxt->src.orig_val = ctxt->src.val;
+ ctxt->src.val = ctxt->dst.orig_val;
+ fastop(ctxt, em_cmp);
+
+ if (ctxt->eflags & X86_EFLAGS_ZF) {
+ /* Success: write back to memory; no update of EAX */
+ ctxt->src.type = OP_NONE;
+ ctxt->dst.val = ctxt->src.orig_val;
+ } else {
+ /* Failure: write the value we saw to EAX. */
+ ctxt->src.type = OP_REG;
+ ctxt->src.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ ctxt->src.val = ctxt->dst.orig_val;
+ /* Create write-cycle to dest by writing the same value */
+ ctxt->dst.val = ctxt->dst.orig_val;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lseg(struct x86_emulate_ctxt *ctxt)
+{
+ int seg = ctxt->src2.val;
+ unsigned short sel;
+ int rc;
+
+ memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
+
+ rc = load_segment_descriptor(ctxt, sel, seg);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->dst.val = ctxt->src.val;
+ return rc;
+}
+
+static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
+{
+#ifdef CONFIG_X86_64
+ u32 eax, ebx, ecx, edx;
+
+ eax = 0x80000001;
+ ecx = 0;
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
+ return edx & bit(X86_FEATURE_LM);
+#else
+ return false;
+#endif
+}
+
+#define GET_SMSTATE(type, smbase, offset) \
+ ({ \
+ type __val; \
+ int r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val, \
+ sizeof(__val)); \
+ if (r != X86EMUL_CONTINUE) \
+ return X86EMUL_UNHANDLEABLE; \
+ __val; \
+ })
+
+static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags)
+{
+ desc->g = (flags >> 23) & 1;
+ desc->d = (flags >> 22) & 1;
+ desc->l = (flags >> 21) & 1;
+ desc->avl = (flags >> 20) & 1;
+ desc->p = (flags >> 15) & 1;
+ desc->dpl = (flags >> 13) & 3;
+ desc->s = (flags >> 12) & 1;
+ desc->type = (flags >> 8) & 15;
+}
+
+static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
+{
+ struct desc_struct desc;
+ int offset;
+ u16 selector;
+
+ selector = GET_SMSTATE(u32, smbase, 0x7fa8 + n * 4);
+
+ if (n < 3)
+ offset = 0x7f84 + n * 12;
+ else
+ offset = 0x7f2c + (n - 3) * 12;
+
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8));
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4));
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, offset));
+ ctxt->ops->set_segment(ctxt, selector, &desc, 0, n);
+ return X86EMUL_CONTINUE;
+}
+
+#ifdef CONFIG_X86_64
+static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
+{
+ struct desc_struct desc;
+ int offset;
+ u16 selector;
+ u32 base3;
+
+ offset = 0x7e00 + n * 16;
+
+ selector = GET_SMSTATE(u16, smbase, offset);
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smbase, offset + 2) << 8);
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4));
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8));
+ base3 = GET_SMSTATE(u32, smbase, offset + 12);
+
+ ctxt->ops->set_segment(ctxt, selector, &desc, base3, n);
+ return X86EMUL_CONTINUE;
+}
+#endif
+
+static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
+ u64 cr0, u64 cr3, u64 cr4)
+{
+ int bad;
+ u64 pcid;
+
+ /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */
+ pcid = 0;
+ if (cr4 & X86_CR4_PCIDE) {
+ pcid = cr3 & 0xfff;
+ cr3 &= ~0xfff;
+ }
+
+ bad = ctxt->ops->set_cr(ctxt, 3, cr3);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ /*
+ * First enable PAE, long mode needs it before CR0.PG = 1 is set.
+ * Then enable protected mode. However, PCID cannot be enabled
+ * if EFER.LMA=0, so set it separately.
+ */
+ bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ bad = ctxt->ops->set_cr(ctxt, 0, cr0);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ if (cr4 & X86_CR4_PCIDE) {
+ bad = ctxt->ops->set_cr(ctxt, 4, cr4);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+ if (pcid) {
+ bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+ struct desc_struct desc;
+ struct desc_ptr dt;
+ u16 selector;
+ u32 val, cr0, cr3, cr4;
+ int i;
+
+ cr0 = GET_SMSTATE(u32, smbase, 0x7ffc);
+ cr3 = GET_SMSTATE(u32, smbase, 0x7ff8);
+ ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED;
+ ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0);
+
+ for (i = 0; i < 8; i++)
+ *reg_write(ctxt, i) = GET_SMSTATE(u32, smbase, 0x7fd0 + i * 4);
+
+ val = GET_SMSTATE(u32, smbase, 0x7fcc);
+ ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
+ val = GET_SMSTATE(u32, smbase, 0x7fc8);
+ ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
+
+ selector = GET_SMSTATE(u32, smbase, 0x7fc4);
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f64));
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f60));
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f5c));
+ ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR);
+
+ selector = GET_SMSTATE(u32, smbase, 0x7fc0);
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f80));
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f7c));
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f78));
+ ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR);
+
+ dt.address = GET_SMSTATE(u32, smbase, 0x7f74);
+ dt.size = GET_SMSTATE(u32, smbase, 0x7f70);
+ ctxt->ops->set_gdt(ctxt, &dt);
+
+ dt.address = GET_SMSTATE(u32, smbase, 0x7f58);
+ dt.size = GET_SMSTATE(u32, smbase, 0x7f54);
+ ctxt->ops->set_idt(ctxt, &dt);
+
+ for (i = 0; i < 6; i++) {
+ int r = rsm_load_seg_32(ctxt, smbase, i);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+ }
+
+ cr4 = GET_SMSTATE(u32, smbase, 0x7f14);
+
+ ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8));
+
+ return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
+}
+
+#ifdef CONFIG_X86_64
+static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+ struct desc_struct desc;
+ struct desc_ptr dt;
+ u64 val, cr0, cr3, cr4;
+ u32 base3;
+ u16 selector;
+ int i, r;
+
+ for (i = 0; i < 16; i++)
+ *reg_write(ctxt, i) = GET_SMSTATE(u64, smbase, 0x7ff8 - i * 8);
+
+ ctxt->_eip = GET_SMSTATE(u64, smbase, 0x7f78);
+ ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7f70) | X86_EFLAGS_FIXED;
+
+ val = GET_SMSTATE(u32, smbase, 0x7f68);
+ ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
+ val = GET_SMSTATE(u32, smbase, 0x7f60);
+ ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
+
+ cr0 = GET_SMSTATE(u64, smbase, 0x7f58);
+ cr3 = GET_SMSTATE(u64, smbase, 0x7f50);
+ cr4 = GET_SMSTATE(u64, smbase, 0x7f48);
+ ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00));
+ val = GET_SMSTATE(u64, smbase, 0x7ed0);
+ ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA);
+
+ selector = GET_SMSTATE(u32, smbase, 0x7e90);
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e92) << 8);
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e94));
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e98));
+ base3 = GET_SMSTATE(u32, smbase, 0x7e9c);
+ ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR);
+
+ dt.size = GET_SMSTATE(u32, smbase, 0x7e84);
+ dt.address = GET_SMSTATE(u64, smbase, 0x7e88);
+ ctxt->ops->set_idt(ctxt, &dt);
+
+ selector = GET_SMSTATE(u32, smbase, 0x7e70);
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e72) << 8);
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e74));
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e78));
+ base3 = GET_SMSTATE(u32, smbase, 0x7e7c);
+ ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR);
+
+ dt.size = GET_SMSTATE(u32, smbase, 0x7e64);
+ dt.address = GET_SMSTATE(u64, smbase, 0x7e68);
+ ctxt->ops->set_gdt(ctxt, &dt);
+
+ r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+
+ for (i = 0; i < 6; i++) {
+ r = rsm_load_seg_64(ctxt, smbase, i);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+ }
+
+ return X86EMUL_CONTINUE;
+}
+#endif
+
+static int em_rsm(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long cr0, cr4, efer;
+ u64 smbase;
+ int ret;
+
+ if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0)
+ return emulate_ud(ctxt);
+
+ /*
+ * Get back to real mode, to prepare a safe state in which to load
+ * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU
+ * supports long mode.
+ */
+ if (emulator_has_longmode(ctxt)) {
+ struct desc_struct cs_desc;
+
+ /* Zero CR4.PCIDE before CR0.PG. */
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ if (cr4 & X86_CR4_PCIDE)
+ ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
+
+ /* A 32-bit code segment is required to clear EFER.LMA. */
+ memset(&cs_desc, 0, sizeof(cs_desc));
+ cs_desc.type = 0xb;
+ cs_desc.s = cs_desc.g = cs_desc.p = 1;
+ ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS);
+ }
+
+ /* For the 64-bit case, this will clear EFER.LMA. */
+ cr0 = ctxt->ops->get_cr(ctxt, 0);
+ if (cr0 & X86_CR0_PE)
+ ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
+
+ if (emulator_has_longmode(ctxt)) {
+ /* Clear CR4.PAE before clearing EFER.LME. */
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ if (cr4 & X86_CR4_PAE)
+ ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
+
+ /* And finally go back to 32-bit mode. */
+ efer = 0;
+ ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
+ }
+
+ smbase = ctxt->ops->get_smbase(ctxt);
+
+ /*
+ * Give pre_leave_smm() a chance to make ISA-specific changes to the
+ * vCPU state (e.g. enter guest mode) before loading state from the SMM
+ * state-save area.
+ */
+ if (ctxt->ops->pre_leave_smm(ctxt, smbase))
+ return X86EMUL_UNHANDLEABLE;
+
+#ifdef CONFIG_X86_64
+ if (emulator_has_longmode(ctxt))
+ ret = rsm_load_state_64(ctxt, smbase + 0x8000);
+ else
+#endif
+ ret = rsm_load_state_32(ctxt, smbase + 0x8000);
+
+ if (ret != X86EMUL_CONTINUE) {
+ /* FIXME: should triple fault */
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
+ ctxt->ops->set_nmi_mask(ctxt, false);
+
+ ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) &
+ ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK));
+ return X86EMUL_CONTINUE;
+}
+
+static void
+setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
+ struct desc_struct *cs, struct desc_struct *ss)
+{
+ cs->l = 0; /* will be adjusted later */
+ set_desc_base(cs, 0); /* flat segment */
+ cs->g = 1; /* 4kb granularity */
+ set_desc_limit(cs, 0xfffff); /* 4GB limit */
+ cs->type = 0x0b; /* Read, Execute, Accessed */
+ cs->s = 1;
+ cs->dpl = 0; /* will be adjusted later */
+ cs->p = 1;
+ cs->d = 1;
+ cs->avl = 0;
+
+ set_desc_base(ss, 0); /* flat segment */
+ set_desc_limit(ss, 0xfffff); /* 4GB limit */
+ ss->g = 1; /* 4kb granularity */
+ ss->s = 1;
+ ss->type = 0x03; /* Read/Write, Accessed */
+ ss->d = 1; /* 32bit stack segment */
+ ss->dpl = 0;
+ ss->p = 1;
+ ss->l = 0;
+ ss->avl = 0;
+}
+
+static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
+{
+ u32 eax, ebx, ecx, edx;
+
+ eax = ecx = 0;
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
+ return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
+ && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
+ && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
+}
+
+static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ u32 eax, ebx, ecx, edx;
+
+ /*
+ * syscall should always be enabled in longmode - so only become
+ * vendor specific (cpuid) if other modes are active...
+ */
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return true;
+
+ eax = 0x00000000;
+ ecx = 0x00000000;
+ ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
+ /*
+ * Intel ("GenuineIntel")
+ * remark: Intel CPUs only support "syscall" in 64bit
+ * longmode. Also an 64bit guest with a
+ * 32bit compat-app running will #UD !! While this
+ * behaviour can be fixed (by emulating) into AMD
+ * response - CPUs of AMD can't behave like Intel.
+ */
+ if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx)
+ return false;
+
+ /* AMD ("AuthenticAMD") */
+ if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
+ return true;
+
+ /* AMD ("AMDisbetter!") */
+ if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx)
+ return true;
+
+ /* default: (not Intel, not AMD), apply Intel's stricter rules... */
+ return false;
+}
+
+static int em_syscall(struct x86_emulate_ctxt *ctxt)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct cs, ss;
+ u64 msr_data;
+ u16 cs_sel, ss_sel;
+ u64 efer = 0;
+
+ /* syscall is not available in real mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL ||
+ ctxt->mode == X86EMUL_MODE_VM86)
+ return emulate_ud(ctxt);
+
+ if (!(em_syscall_is_enabled(ctxt)))
+ return emulate_ud(ctxt);
+
+ ops->get_msr(ctxt, MSR_EFER, &efer);
+ setup_syscalls_segments(ctxt, &cs, &ss);
+
+ if (!(efer & EFER_SCE))
+ return emulate_ud(ctxt);
+
+ ops->get_msr(ctxt, MSR_STAR, &msr_data);
+ msr_data >>= 32;
+ cs_sel = (u16)(msr_data & 0xfffc);
+ ss_sel = (u16)(msr_data + 8);
+
+ if (efer & EFER_LMA) {
+ cs.d = 0;
+ cs.l = 1;
+ }
+ ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
+ ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
+
+ *reg_write(ctxt, VCPU_REGS_RCX) = ctxt->_eip;
+ if (efer & EFER_LMA) {
+#ifdef CONFIG_X86_64
+ *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags;
+
+ ops->get_msr(ctxt,
+ ctxt->mode == X86EMUL_MODE_PROT64 ?
+ MSR_LSTAR : MSR_CSTAR, &msr_data);
+ ctxt->_eip = msr_data;
+
+ ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
+ ctxt->eflags &= ~msr_data;
+ ctxt->eflags |= X86_EFLAGS_FIXED;
+#endif
+ } else {
+ /* legacy mode */
+ ops->get_msr(ctxt, MSR_STAR, &msr_data);
+ ctxt->_eip = (u32)msr_data;
+
+ ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
+ }
+
+ ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_sysenter(struct x86_emulate_ctxt *ctxt)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct cs, ss;
+ u64 msr_data;
+ u16 cs_sel, ss_sel;
+ u64 efer = 0;
+
+ ops->get_msr(ctxt, MSR_EFER, &efer);
+ /* inject #GP if in real mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL)
+ return emulate_gp(ctxt, 0);
+
+ /*
+ * Not recognized on AMD in compat mode (but is recognized in legacy
+ * mode).
+ */
+ if ((ctxt->mode != X86EMUL_MODE_PROT64) && (efer & EFER_LMA)
+ && !vendor_intel(ctxt))
+ return emulate_ud(ctxt);
+
+ /* sysenter/sysexit have not been tested in 64bit mode. */
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return X86EMUL_UNHANDLEABLE;
+
+ setup_syscalls_segments(ctxt, &cs, &ss);
+
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
+ if ((msr_data & 0xfffc) == 0x0)
+ return emulate_gp(ctxt, 0);
+
+ ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
+ cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK;
+ ss_sel = cs_sel + 8;
+ if (efer & EFER_LMA) {
+ cs.d = 0;
+ cs.l = 1;
+ }
+
+ ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
+ ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
+
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data);
+ ctxt->_eip = (efer & EFER_LMA) ? msr_data : (u32)msr_data;
+
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
+ *reg_write(ctxt, VCPU_REGS_RSP) = (efer & EFER_LMA) ? msr_data :
+ (u32)msr_data;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_sysexit(struct x86_emulate_ctxt *ctxt)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct cs, ss;
+ u64 msr_data, rcx, rdx;
+ int usermode;
+ u16 cs_sel = 0, ss_sel = 0;
+
+ /* inject #GP if in real mode or Virtual 8086 mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL ||
+ ctxt->mode == X86EMUL_MODE_VM86)
+ return emulate_gp(ctxt, 0);
+
+ setup_syscalls_segments(ctxt, &cs, &ss);
+
+ if ((ctxt->rex_prefix & 0x8) != 0x0)
+ usermode = X86EMUL_MODE_PROT64;
+ else
+ usermode = X86EMUL_MODE_PROT32;
+
+ rcx = reg_read(ctxt, VCPU_REGS_RCX);
+ rdx = reg_read(ctxt, VCPU_REGS_RDX);
+
+ cs.dpl = 3;
+ ss.dpl = 3;
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
+ switch (usermode) {
+ case X86EMUL_MODE_PROT32:
+ cs_sel = (u16)(msr_data + 16);
+ if ((msr_data & 0xfffc) == 0x0)
+ return emulate_gp(ctxt, 0);
+ ss_sel = (u16)(msr_data + 24);
+ rcx = (u32)rcx;
+ rdx = (u32)rdx;
+ break;
+ case X86EMUL_MODE_PROT64:
+ cs_sel = (u16)(msr_data + 32);
+ if (msr_data == 0x0)
+ return emulate_gp(ctxt, 0);
+ ss_sel = cs_sel + 8;
+ cs.d = 0;
+ cs.l = 1;
+ if (emul_is_noncanonical_address(rcx, ctxt) ||
+ emul_is_noncanonical_address(rdx, ctxt))
+ return emulate_gp(ctxt, 0);
+ break;
+ }
+ cs_sel |= SEGMENT_RPL_MASK;
+ ss_sel |= SEGMENT_RPL_MASK;
+
+ ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
+ ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
+
+ ctxt->_eip = rdx;
+ *reg_write(ctxt, VCPU_REGS_RSP) = rcx;
+
+ return X86EMUL_CONTINUE;
+}
+
+static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
+{
+ int iopl;
+ if (ctxt->mode == X86EMUL_MODE_REAL)
+ return false;
+ if (ctxt->mode == X86EMUL_MODE_VM86)
+ return true;
+ iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
+ return ctxt->ops->cpl(ctxt) > iopl;
+}
+
+#define VMWARE_PORT_VMPORT (0x5658)
+#define VMWARE_PORT_VMRPC (0x5659)
+
+static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
+ u16 port, u16 len)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct tr_seg;
+ u32 base3;
+ int r;
+ u16 tr, io_bitmap_ptr, perm, bit_idx = port & 0x7;
+ unsigned mask = (1 << len) - 1;
+ unsigned long base;
+
+ /*
+ * VMware allows access to these ports even if denied
+ * by TSS I/O permission bitmap. Mimic behavior.
+ */
+ if (enable_vmware_backdoor &&
+ ((port == VMWARE_PORT_VMPORT) || (port == VMWARE_PORT_VMRPC)))
+ return true;
+
+ ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR);
+ if (!tr_seg.p)
+ return false;
+ if (desc_limit_scaled(&tr_seg) < 103)
+ return false;
+ base = get_desc_base(&tr_seg);
+#ifdef CONFIG_X86_64
+ base |= ((u64)base3) << 32;
+#endif
+ r = ops->read_std(ctxt, base + 102, &io_bitmap_ptr, 2, NULL, true);
+ if (r != X86EMUL_CONTINUE)
+ return false;
+ if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
+ return false;
+ r = ops->read_std(ctxt, base + io_bitmap_ptr + port/8, &perm, 2, NULL, true);
+ if (r != X86EMUL_CONTINUE)
+ return false;
+ if ((perm >> bit_idx) & mask)
+ return false;
+ return true;
+}
+
+static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
+ u16 port, u16 len)
+{
+ if (ctxt->perm_ok)
+ return true;
+
+ if (emulator_bad_iopl(ctxt))
+ if (!emulator_io_port_access_allowed(ctxt, port, len))
+ return false;
+
+ ctxt->perm_ok = true;
+
+ return true;
+}
+
+static void string_registers_quirk(struct x86_emulate_ctxt *ctxt)
+{
+ /*
+ * Intel CPUs mask the counter and pointers in quite strange
+ * manner when ECX is zero due to REP-string optimizations.
+ */
+#ifdef CONFIG_X86_64
+ if (ctxt->ad_bytes != 4 || !vendor_intel(ctxt))
+ return;
+
+ *reg_write(ctxt, VCPU_REGS_RCX) = 0;
+
+ switch (ctxt->b) {
+ case 0xa4: /* movsb */
+ case 0xa5: /* movsd/w */
+ *reg_rmw(ctxt, VCPU_REGS_RSI) &= (u32)-1;
+ /* fall through */
+ case 0xaa: /* stosb */
+ case 0xab: /* stosd/w */
+ *reg_rmw(ctxt, VCPU_REGS_RDI) &= (u32)-1;
+ }
+#endif
+}
+
+static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
+ struct tss_segment_16 *tss)
+{
+ tss->ip = ctxt->_eip;
+ tss->flag = ctxt->eflags;
+ tss->ax = reg_read(ctxt, VCPU_REGS_RAX);
+ tss->cx = reg_read(ctxt, VCPU_REGS_RCX);
+ tss->dx = reg_read(ctxt, VCPU_REGS_RDX);
+ tss->bx = reg_read(ctxt, VCPU_REGS_RBX);
+ tss->sp = reg_read(ctxt, VCPU_REGS_RSP);
+ tss->bp = reg_read(ctxt, VCPU_REGS_RBP);
+ tss->si = reg_read(ctxt, VCPU_REGS_RSI);
+ tss->di = reg_read(ctxt, VCPU_REGS_RDI);
+
+ tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
+ tss->ldt = get_segment_selector(ctxt, VCPU_SREG_LDTR);
+}
+
+static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
+ struct tss_segment_16 *tss)
+{
+ int ret;
+ u8 cpl;
+
+ ctxt->_eip = tss->ip;
+ ctxt->eflags = tss->flag | 2;
+ *reg_write(ctxt, VCPU_REGS_RAX) = tss->ax;
+ *reg_write(ctxt, VCPU_REGS_RCX) = tss->cx;
+ *reg_write(ctxt, VCPU_REGS_RDX) = tss->dx;
+ *reg_write(ctxt, VCPU_REGS_RBX) = tss->bx;
+ *reg_write(ctxt, VCPU_REGS_RSP) = tss->sp;
+ *reg_write(ctxt, VCPU_REGS_RBP) = tss->bp;
+ *reg_write(ctxt, VCPU_REGS_RSI) = tss->si;
+ *reg_write(ctxt, VCPU_REGS_RDI) = tss->di;
+
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ set_segment_selector(ctxt, tss->ldt, VCPU_SREG_LDTR);
+ set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
+ set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
+ set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
+ set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
+
+ cpl = tss->cs & 3;
+
+ /*
+ * Now load segment descriptors. If fault happens at this stage
+ * it is handled in a context of new task
+ */
+ ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int task_switch_16(struct x86_emulate_ctxt *ctxt,
+ u16 tss_selector, u16 old_tss_sel,
+ ulong old_tss_base, struct desc_struct *new_desc)
+{
+ struct tss_segment_16 tss_seg;
+ int ret;
+ u32 new_tss_base = get_desc_base(new_desc);
+
+ ret = linear_read_system(ctxt, old_tss_base, &tss_seg, sizeof tss_seg);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ save_state_to_tss16(ctxt, &tss_seg);
+
+ ret = linear_write_system(ctxt, old_tss_base, &tss_seg, sizeof tss_seg);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ ret = linear_read_system(ctxt, new_tss_base, &tss_seg, sizeof tss_seg);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ if (old_tss_sel != 0xffff) {
+ tss_seg.prev_task_link = old_tss_sel;
+
+ ret = linear_write_system(ctxt, new_tss_base,
+ &tss_seg.prev_task_link,
+ sizeof tss_seg.prev_task_link);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ }
+
+ return load_state_from_tss16(ctxt, &tss_seg);
+}
+
+static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
+ struct tss_segment_32 *tss)
+{
+ /* CR3 and ldt selector are not saved intentionally */
+ tss->eip = ctxt->_eip;
+ tss->eflags = ctxt->eflags;
+ tss->eax = reg_read(ctxt, VCPU_REGS_RAX);
+ tss->ecx = reg_read(ctxt, VCPU_REGS_RCX);
+ tss->edx = reg_read(ctxt, VCPU_REGS_RDX);
+ tss->ebx = reg_read(ctxt, VCPU_REGS_RBX);
+ tss->esp = reg_read(ctxt, VCPU_REGS_RSP);
+ tss->ebp = reg_read(ctxt, VCPU_REGS_RBP);
+ tss->esi = reg_read(ctxt, VCPU_REGS_RSI);
+ tss->edi = reg_read(ctxt, VCPU_REGS_RDI);
+
+ tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
+ tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS);
+ tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS);
+}
+
+static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
+ struct tss_segment_32 *tss)
+{
+ int ret;
+ u8 cpl;
+
+ if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
+ return emulate_gp(ctxt, 0);
+ ctxt->_eip = tss->eip;
+ ctxt->eflags = tss->eflags | 2;
+
+ /* General purpose registers */
+ *reg_write(ctxt, VCPU_REGS_RAX) = tss->eax;
+ *reg_write(ctxt, VCPU_REGS_RCX) = tss->ecx;
+ *reg_write(ctxt, VCPU_REGS_RDX) = tss->edx;
+ *reg_write(ctxt, VCPU_REGS_RBX) = tss->ebx;
+ *reg_write(ctxt, VCPU_REGS_RSP) = tss->esp;
+ *reg_write(ctxt, VCPU_REGS_RBP) = tss->ebp;
+ *reg_write(ctxt, VCPU_REGS_RSI) = tss->esi;
+ *reg_write(ctxt, VCPU_REGS_RDI) = tss->edi;
+
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors. This is important because CPL checks will
+ * use CS.RPL.
+ */
+ set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
+ set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
+ set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
+ set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
+ set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
+ set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS);
+ set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS);
+
+ /*
+ * If we're switching between Protected Mode and VM86, we need to make
+ * sure to update the mode before loading the segment descriptors so
+ * that the selectors are interpreted correctly.
+ */
+ if (ctxt->eflags & X86_EFLAGS_VM) {
+ ctxt->mode = X86EMUL_MODE_VM86;
+ cpl = 3;
+ } else {
+ ctxt->mode = X86EMUL_MODE_PROT32;
+ cpl = tss->cs & 3;
+ }
+
+ /*
+ * Now load segment descriptors. If fault happenes at this stage
+ * it is handled in a context of new task
+ */
+ ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR,
+ cpl, X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+
+ return ret;
+}
+
+static int task_switch_32(struct x86_emulate_ctxt *ctxt,
+ u16 tss_selector, u16 old_tss_sel,
+ ulong old_tss_base, struct desc_struct *new_desc)
+{
+ struct tss_segment_32 tss_seg;
+ int ret;
+ u32 new_tss_base = get_desc_base(new_desc);
+ u32 eip_offset = offsetof(struct tss_segment_32, eip);
+ u32 ldt_sel_offset = offsetof(struct tss_segment_32, ldt_selector);
+
+ ret = linear_read_system(ctxt, old_tss_base, &tss_seg, sizeof tss_seg);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ save_state_to_tss32(ctxt, &tss_seg);
+
+ /* Only GP registers and segment selectors are saved */
+ ret = linear_write_system(ctxt, old_tss_base + eip_offset, &tss_seg.eip,
+ ldt_sel_offset - eip_offset);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ ret = linear_read_system(ctxt, new_tss_base, &tss_seg, sizeof tss_seg);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ if (old_tss_sel != 0xffff) {
+ tss_seg.prev_task_link = old_tss_sel;
+
+ ret = linear_write_system(ctxt, new_tss_base,
+ &tss_seg.prev_task_link,
+ sizeof tss_seg.prev_task_link);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ }
+
+ return load_state_from_tss32(ctxt, &tss_seg);
+}
+
+static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
+ u16 tss_selector, int idt_index, int reason,
+ bool has_error_code, u32 error_code)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct curr_tss_desc, next_tss_desc;
+ int ret;
+ u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR);
+ ulong old_tss_base =
+ ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
+ u32 desc_limit;
+ ulong desc_addr, dr7;
+
+ /* FIXME: old_tss_base == ~0 ? */
+
+ ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc, &desc_addr);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc, &desc_addr);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ /* FIXME: check that next_tss_desc is tss */
+
+ /*
+ * Check privileges. The three cases are task switch caused by...
+ *
+ * 1. jmp/call/int to task gate: Check against DPL of the task gate
+ * 2. Exception/IRQ/iret: No check is performed
+ * 3. jmp/call to TSS/task-gate: No check is performed since the
+ * hardware checks it before exiting.
+ */
+ if (reason == TASK_SWITCH_GATE) {
+ if (idt_index != -1) {
+ /* Software interrupts */
+ struct desc_struct task_gate_desc;
+ int dpl;
+
+ ret = read_interrupt_descriptor(ctxt, idt_index,
+ &task_gate_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ dpl = task_gate_desc.dpl;
+ if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
+ return emulate_gp(ctxt, (idt_index << 3) | 0x2);
+ }
+ }
+
+ desc_limit = desc_limit_scaled(&next_tss_desc);
+ if (!next_tss_desc.p ||
+ ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
+ desc_limit < 0x2b)) {
+ return emulate_ts(ctxt, tss_selector & 0xfffc);
+ }
+
+ if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
+ curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
+ write_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
+ }
+
+ if (reason == TASK_SWITCH_IRET)
+ ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
+
+ /* set back link to prev task only if NT bit is set in eflags
+ note that old_tss_sel is not used after this point */
+ if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+ old_tss_sel = 0xffff;
+
+ if (next_tss_desc.type & 8)
+ ret = task_switch_32(ctxt, tss_selector, old_tss_sel,
+ old_tss_base, &next_tss_desc);
+ else
+ ret = task_switch_16(ctxt, tss_selector, old_tss_sel,
+ old_tss_base, &next_tss_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE)
+ ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT;
+
+ if (reason != TASK_SWITCH_IRET) {
+ next_tss_desc.type |= (1 << 1); /* set busy flag */
+ write_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
+ }
+
+ ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS);
+ ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR);
+
+ if (has_error_code) {
+ ctxt->op_bytes = ctxt->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
+ ctxt->lock_prefix = 0;
+ ctxt->src.val = (unsigned long) error_code;
+ ret = em_push(ctxt);
+ }
+
+ ops->get_dr(ctxt, 7, &dr7);
+ ops->set_dr(ctxt, 7, dr7 & ~(DR_LOCAL_ENABLE_MASK | DR_LOCAL_SLOWDOWN));
+
+ return ret;
+}
+
+int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
+ u16 tss_selector, int idt_index, int reason,
+ bool has_error_code, u32 error_code)
+{
+ int rc;
+
+ invalidate_registers(ctxt);
+ ctxt->_eip = ctxt->eip;
+ ctxt->dst.type = OP_NONE;
+
+ rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason,
+ has_error_code, error_code);
+
+ if (rc == X86EMUL_CONTINUE) {
+ ctxt->eip = ctxt->_eip;
+ writeback_registers(ctxt);
+ }
+
+ return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
+}
+
+static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
+ struct operand *op)
+{
+ int df = (ctxt->eflags & X86_EFLAGS_DF) ? -op->count : op->count;
+
+ register_address_increment(ctxt, reg, df * op->bytes);
+ op->addr.mem.ea = register_address(ctxt, reg);
+}
+
+static int em_das(struct x86_emulate_ctxt *ctxt)
+{
+ u8 al, old_al;
+ bool af, cf, old_cf;
+
+ cf = ctxt->eflags & X86_EFLAGS_CF;
+ al = ctxt->dst.val;
+
+ old_al = al;
+ old_cf = cf;
+ cf = false;
+ af = ctxt->eflags & X86_EFLAGS_AF;
+ if ((al & 0x0f) > 9 || af) {
+ al -= 6;
+ cf = old_cf | (al >= 250);
+ af = true;
+ } else {
+ af = false;
+ }
+ if (old_al > 0x99 || old_cf) {
+ al -= 0x60;
+ cf = true;
+ }
+
+ ctxt->dst.val = al;
+ /* Set PF, ZF, SF */
+ ctxt->src.type = OP_IMM;
+ ctxt->src.val = 0;
+ ctxt->src.bytes = 1;
+ fastop(ctxt, em_or);
+ ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
+ if (cf)
+ ctxt->eflags |= X86_EFLAGS_CF;
+ if (af)
+ ctxt->eflags |= X86_EFLAGS_AF;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_aam(struct x86_emulate_ctxt *ctxt)
+{
+ u8 al, ah;
+
+ if (ctxt->src.val == 0)
+ return emulate_de(ctxt);
+
+ al = ctxt->dst.val & 0xff;
+ ah = al / ctxt->src.val;
+ al %= ctxt->src.val;
+
+ ctxt->dst.val = (ctxt->dst.val & 0xffff0000) | al | (ah << 8);
+
+ /* Set PF, ZF, SF */
+ ctxt->src.type = OP_IMM;
+ ctxt->src.val = 0;
+ ctxt->src.bytes = 1;
+ fastop(ctxt, em_or);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_aad(struct x86_emulate_ctxt *ctxt)
+{
+ u8 al = ctxt->dst.val & 0xff;
+ u8 ah = (ctxt->dst.val >> 8) & 0xff;
+
+ al = (al + (ah * ctxt->src.val)) & 0xff;
+
+ ctxt->dst.val = (ctxt->dst.val & 0xffff0000) | al;
+
+ /* Set PF, ZF, SF */
+ ctxt->src.type = OP_IMM;
+ ctxt->src.val = 0;
+ ctxt->src.bytes = 1;
+ fastop(ctxt, em_or);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_call(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ long rel = ctxt->src.val;
+
+ ctxt->src.val = (unsigned long)ctxt->_eip;
+ rc = jmp_rel(ctxt, rel);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return em_push(ctxt);
+}
+
+static int em_call_far(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel, old_cs;
+ ulong old_eip;
+ int rc;
+ struct desc_struct old_desc, new_desc;
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ int cpl = ctxt->ops->cpl(ctxt);
+ enum x86emul_mode prev_mode = ctxt->mode;
+
+ old_eip = ctxt->_eip;
+ ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS);
+
+ memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
+ rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_CALL_JMP, &new_desc);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
+ if (rc != X86EMUL_CONTINUE)
+ goto fail;
+
+ ctxt->src.val = old_cs;
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto fail;
+
+ ctxt->src.val = old_eip;
+ rc = em_push(ctxt);
+ /* If we failed, we tainted the memory, but the very least we should
+ restore cs */
+ if (rc != X86EMUL_CONTINUE) {
+ pr_warn_once("faulting far call emulation tainted memory\n");
+ goto fail;
+ }
+ return rc;
+fail:
+ ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS);
+ ctxt->mode = prev_mode;
+ return rc;
+
+}
+
+static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned long eip;
+
+ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = assign_eip_near(ctxt, eip);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rsp_increment(ctxt, ctxt->src.val);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_xchg(struct x86_emulate_ctxt *ctxt)
+{
+ /* Write back the register source. */
+ ctxt->src.val = ctxt->dst.val;
+ write_register_operand(&ctxt->src);
+
+ /* Write back the memory destination with implicit LOCK prefix. */
+ ctxt->dst.val = ctxt->src.orig_val;
+ ctxt->lock_prefix = 1;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.val = ctxt->src2.val;
+ return fastop(ctxt, em_imul);
+}
+
+static int em_cwd(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.type = OP_REG;
+ ctxt->dst.bytes = ctxt->src.bytes;
+ ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
+ ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_rdpid(struct x86_emulate_ctxt *ctxt)
+{
+ u64 tsc_aux = 0;
+
+ if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux))
+ return emulate_ud(ctxt);
+ ctxt->dst.val = tsc_aux;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 tsc = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc);
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32)tsc;
+ *reg_write(ctxt, VCPU_REGS_RDX) = tsc >> 32;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 pmc;
+
+ if (ctxt->ops->read_pmc(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &pmc))
+ return emulate_gp(ctxt, 0);
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32)pmc;
+ *reg_write(ctxt, VCPU_REGS_RDX) = pmc >> 32;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_mov(struct x86_emulate_ctxt *ctxt)
+{
+ memcpy(ctxt->dst.valptr, ctxt->src.valptr, sizeof(ctxt->src.valptr));
+ return X86EMUL_CONTINUE;
+}
+
+#define FFL(x) bit(X86_FEATURE_##x)
+
+static int em_movbe(struct x86_emulate_ctxt *ctxt)
+{
+ u32 ebx, ecx, edx, eax = 1;
+ u16 tmp;
+
+ /*
+ * Check MOVBE is set in the guest-visible CPUID leaf.
+ */
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
+ if (!(ecx & FFL(MOVBE)))
+ return emulate_ud(ctxt);
+
+ switch (ctxt->op_bytes) {
+ case 2:
+ /*
+ * From MOVBE definition: "...When the operand size is 16 bits,
+ * the upper word of the destination register remains unchanged
+ * ..."
+ *
+ * Both casting ->valptr and ->val to u16 breaks strict aliasing
+ * rules so we have to do the operation almost per hand.
+ */
+ tmp = (u16)ctxt->src.val;
+ ctxt->dst.val &= ~0xffffUL;
+ ctxt->dst.val |= (unsigned long)swab16(tmp);
+ break;
+ case 4:
+ ctxt->dst.val = swab32((u32)ctxt->src.val);
+ break;
+ case 8:
+ ctxt->dst.val = swab64(ctxt->src.val);
+ break;
+ default:
+ BUG();
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static int em_cr_write(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val))
+ return emulate_gp(ctxt, 0);
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_dr_write(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long val;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ val = ctxt->src.val & ~0ULL;
+ else
+ val = ctxt->src.val & ~0U;
+
+ /* #UD condition is already handled. */
+ if (ctxt->ops->set_dr(ctxt, ctxt->modrm_reg, val) < 0)
+ return emulate_gp(ctxt, 0);
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
+{
+ u64 msr_data;
+
+ msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX)
+ | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
+ if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
+{
+ u64 msr_data;
+
+ if (ctxt->ops->get_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &msr_data))
+ return emulate_gp(ctxt, 0);
+
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data;
+ *reg_write(ctxt, VCPU_REGS_RDX) = msr_data >> 32;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_store_sreg(struct x86_emulate_ctxt *ctxt, int segment)
+{
+ if (segment > VCPU_SREG_GS &&
+ (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+ ctxt->ops->cpl(ctxt) > 0)
+ return emulate_gp(ctxt, 0);
+
+ ctxt->dst.val = get_segment_selector(ctxt, segment);
+ if (ctxt->dst.bytes == 4 && ctxt->dst.type == OP_MEM)
+ ctxt->dst.bytes = 2;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->modrm_reg > VCPU_SREG_GS)
+ return emulate_ud(ctxt);
+
+ return em_store_sreg(ctxt, ctxt->modrm_reg);
+}
+
+static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel = ctxt->src.val;
+
+ if (ctxt->modrm_reg == VCPU_SREG_CS || ctxt->modrm_reg > VCPU_SREG_GS)
+ return emulate_ud(ctxt);
+
+ if (ctxt->modrm_reg == VCPU_SREG_SS)
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
+}
+
+static int em_sldt(struct x86_emulate_ctxt *ctxt)
+{
+ return em_store_sreg(ctxt, VCPU_SREG_LDTR);
+}
+
+static int em_lldt(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel = ctxt->src.val;
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR);
+}
+
+static int em_str(struct x86_emulate_ctxt *ctxt)
+{
+ return em_store_sreg(ctxt, VCPU_SREG_TR);
+}
+
+static int em_ltr(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel = ctxt->src.val;
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return load_segment_descriptor(ctxt, sel, VCPU_SREG_TR);
+}
+
+static int em_invlpg(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, ctxt->src.addr.mem, 1, false, &linear);
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->ops->invlpg(ctxt, linear);
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_clts(struct x86_emulate_ctxt *ctxt)
+{
+ ulong cr0;
+
+ cr0 = ctxt->ops->get_cr(ctxt, 0);
+ cr0 &= ~X86_CR0_TS;
+ ctxt->ops->set_cr(ctxt, 0, cr0);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_hypercall(struct x86_emulate_ctxt *ctxt)
+{
+ int rc = ctxt->ops->fix_hypercall(ctxt);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ /* Let the processor re-execute the fixed hypercall */
+ ctxt->_eip = ctxt->eip;
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt,
+ void (*get)(struct x86_emulate_ctxt *ctxt,
+ struct desc_ptr *ptr))
+{
+ struct desc_ptr desc_ptr;
+
+ if ((ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+ ctxt->ops->cpl(ctxt) > 0)
+ return emulate_gp(ctxt, 0);
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
+ get(ctxt, &desc_ptr);
+ if (ctxt->op_bytes == 2) {
+ ctxt->op_bytes = 4;
+ desc_ptr.address &= 0x00ffffff;
+ }
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return segmented_write_std(ctxt, ctxt->dst.addr.mem,
+ &desc_ptr, 2 + ctxt->op_bytes);
+}
+
+static int em_sgdt(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_store_desc_ptr(ctxt, ctxt->ops->get_gdt);
+}
+
+static int em_sidt(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt);
+}
+
+static int em_lgdt_lidt(struct x86_emulate_ctxt *ctxt, bool lgdt)
+{
+ struct desc_ptr desc_ptr;
+ int rc;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
+ rc = read_descriptor(ctxt, ctxt->src.addr.mem,
+ &desc_ptr.size, &desc_ptr.address,
+ ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ if (ctxt->mode == X86EMUL_MODE_PROT64 &&
+ emul_is_noncanonical_address(desc_ptr.address, ctxt))
+ return emulate_gp(ctxt, 0);
+ if (lgdt)
+ ctxt->ops->set_gdt(ctxt, &desc_ptr);
+ else
+ ctxt->ops->set_idt(ctxt, &desc_ptr);
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lgdt(struct x86_emulate_ctxt *ctxt)
+{
+ return em_lgdt_lidt(ctxt, true);
+}
+
+static int em_lidt(struct x86_emulate_ctxt *ctxt)
+{
+ return em_lgdt_lidt(ctxt, false);
+}
+
+static int em_smsw(struct x86_emulate_ctxt *ctxt)
+{
+ if ((ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+ ctxt->ops->cpl(ctxt) > 0)
+ return emulate_gp(ctxt, 0);
+
+ if (ctxt->dst.type == OP_MEM)
+ ctxt->dst.bytes = 2;
+ ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lmsw(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul)
+ | (ctxt->src.val & 0x0f));
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_loop(struct x86_emulate_ctxt *ctxt)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ register_address_increment(ctxt, VCPU_REGS_RCX, -1);
+ if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) &&
+ (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags)))
+ rc = jmp_rel(ctxt, ctxt->src.val);
+
+ return rc;
+}
+
+static int em_jcxz(struct x86_emulate_ctxt *ctxt)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0)
+ rc = jmp_rel(ctxt, ctxt->src.val);
+
+ return rc;
+}
+
+static int em_in(struct x86_emulate_ctxt *ctxt)
+{
+ if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val,
+ &ctxt->dst.val))
+ return X86EMUL_IO_NEEDED;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_out(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val,
+ &ctxt->src.val, 1);
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_cli(struct x86_emulate_ctxt *ctxt)
+{
+ if (emulator_bad_iopl(ctxt))
+ return emulate_gp(ctxt, 0);
+
+ ctxt->eflags &= ~X86_EFLAGS_IF;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_sti(struct x86_emulate_ctxt *ctxt)
+{
+ if (emulator_bad_iopl(ctxt))
+ return emulate_gp(ctxt, 0);
+
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
+ ctxt->eflags |= X86_EFLAGS_IF;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_cpuid(struct x86_emulate_ctxt *ctxt)
+{
+ u32 eax, ebx, ecx, edx;
+ u64 msr = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_MISC_FEATURES_ENABLES, &msr);
+ if (msr & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
+ ctxt->ops->cpl(ctxt)) {
+ return emulate_gp(ctxt, 0);
+ }
+
+ eax = reg_read(ctxt, VCPU_REGS_RAX);
+ ecx = reg_read(ctxt, VCPU_REGS_RCX);
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true);
+ *reg_write(ctxt, VCPU_REGS_RAX) = eax;
+ *reg_write(ctxt, VCPU_REGS_RBX) = ebx;
+ *reg_write(ctxt, VCPU_REGS_RCX) = ecx;
+ *reg_write(ctxt, VCPU_REGS_RDX) = edx;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_sahf(struct x86_emulate_ctxt *ctxt)
+{
+ u32 flags;
+
+ flags = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
+ X86_EFLAGS_SF;
+ flags &= *reg_rmw(ctxt, VCPU_REGS_RAX) >> 8;
+
+ ctxt->eflags &= ~0xffUL;
+ ctxt->eflags |= flags | X86_EFLAGS_FIXED;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lahf(struct x86_emulate_ctxt *ctxt)
+{
+ *reg_rmw(ctxt, VCPU_REGS_RAX) &= ~0xff00UL;
+ *reg_rmw(ctxt, VCPU_REGS_RAX) |= (ctxt->eflags & 0xff) << 8;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_bswap(struct x86_emulate_ctxt *ctxt)
+{
+ switch (ctxt->op_bytes) {
+#ifdef CONFIG_X86_64
+ case 8:
+ asm("bswap %0" : "+r"(ctxt->dst.val));
+ break;
+#endif
+ default:
+ asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val));
+ break;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static int em_clflush(struct x86_emulate_ctxt *ctxt)
+{
+ /* emulating clflush regardless of cpuid */
+ return X86EMUL_CONTINUE;
+}
+
+static int em_clflushopt(struct x86_emulate_ctxt *ctxt)
+{
+ /* emulating clflushopt regardless of cpuid */
+ return X86EMUL_CONTINUE;
+}
+
+static int em_movsxd(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.val = (s32) ctxt->src.val;
+ return X86EMUL_CONTINUE;
+}
+
+static int check_fxsr(struct x86_emulate_ctxt *ctxt)
+{
+ u32 eax = 1, ebx, ecx = 0, edx;
+
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
+ if (!(edx & FFL(FXSR)))
+ return emulate_ud(ctxt);
+
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ /*
+ * Don't emulate a case that should never be hit, instead of working
+ * around a lack of fxsave64/fxrstor64 on old compilers.
+ */
+ if (ctxt->mode >= X86EMUL_MODE_PROT64)
+ return X86EMUL_UNHANDLEABLE;
+
+ return X86EMUL_CONTINUE;
+}
+
+/*
+ * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but does save
+ * and restore MXCSR.
+ */
+static size_t __fxstate_size(int nregs)
+{
+ return offsetof(struct fxregs_state, xmm_space[0]) + nregs * 16;
+}
+
+static inline size_t fxstate_size(struct x86_emulate_ctxt *ctxt)
+{
+ bool cr4_osfxsr;
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return __fxstate_size(16);
+
+ cr4_osfxsr = ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR;
+ return __fxstate_size(cr4_osfxsr ? 8 : 0);
+}
+
+/*
+ * FXSAVE and FXRSTOR have 4 different formats depending on execution mode,
+ * 1) 16 bit mode
+ * 2) 32 bit mode
+ * - like (1), but FIP and FDP (foo) are only 16 bit. At least Intel CPUs
+ * preserve whole 32 bit values, though, so (1) and (2) are the same wrt.
+ * save and restore
+ * 3) 64-bit mode with REX.W prefix
+ * - like (2), but XMM 8-15 are being saved and restored
+ * 4) 64-bit mode without REX.W prefix
+ * - like (3), but FIP and FDP are 64 bit
+ *
+ * Emulation uses (3) for (1) and (2) and preserves XMM 8-15 to reach the
+ * desired result. (4) is not emulated.
+ *
+ * Note: Guest and host CPUID.(EAX=07H,ECX=0H):EBX[bit 13] (deprecate FPU CS
+ * and FPU DS) should match.
+ */
+static int em_fxsave(struct x86_emulate_ctxt *ctxt)
+{
+ struct fxregs_state fx_state;
+ int rc;
+
+ rc = check_fxsr(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state));
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state,
+ fxstate_size(ctxt));
+}
+
+/*
+ * FXRSTOR might restore XMM registers not provided by the guest. Fill
+ * in the host registers (via FXSAVE) instead, so they won't be modified.
+ * (preemption has to stay disabled until FXRSTOR).
+ *
+ * Use noinline to keep the stack for other functions called by callers small.
+ */
+static noinline int fxregs_fixup(struct fxregs_state *fx_state,
+ const size_t used_size)
+{
+ struct fxregs_state fx_tmp;
+ int rc;
+
+ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_tmp));
+ memcpy((void *)fx_state + used_size, (void *)&fx_tmp + used_size,
+ __fxstate_size(16) - used_size);
+
+ return rc;
+}
+
+static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
+{
+ struct fxregs_state fx_state;
+ int rc;
+ size_t size;
+
+ rc = check_fxsr(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ size = fxstate_size(ctxt);
+ rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (size < __fxstate_size(16)) {
+ rc = fxregs_fixup(&fx_state, size);
+ if (rc != X86EMUL_CONTINUE)
+ goto out;
+ }
+
+ if (fx_state.mxcsr >> 16) {
+ rc = emulate_gp(ctxt, 0);
+ goto out;
+ }
+
+ if (rc == X86EMUL_CONTINUE)
+ rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));
+
+out:
+ return rc;
+}
+
+static bool valid_cr(int nr)
+{
+ switch (nr) {
+ case 0:
+ case 2 ... 4:
+ case 8:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int check_cr_read(struct x86_emulate_ctxt *ctxt)
+{
+ if (!valid_cr(ctxt->modrm_reg))
+ return emulate_ud(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_cr_write(struct x86_emulate_ctxt *ctxt)
+{
+ u64 new_val = ctxt->src.val64;
+ int cr = ctxt->modrm_reg;
+ u64 efer = 0;
+
+ static u64 cr_reserved_bits[] = {
+ 0xffffffff00000000ULL,
+ 0, 0, 0, /* CR3 checked later */
+ CR4_RESERVED_BITS,
+ 0, 0, 0,
+ CR8_RESERVED_BITS,
+ };
+
+ if (!valid_cr(cr))
+ return emulate_ud(ctxt);
+
+ if (new_val & cr_reserved_bits[cr])
+ return emulate_gp(ctxt, 0);
+
+ switch (cr) {
+ case 0: {
+ u64 cr4;
+ if (((new_val & X86_CR0_PG) && !(new_val & X86_CR0_PE)) ||
+ ((new_val & X86_CR0_NW) && !(new_val & X86_CR0_CD)))
+ return emulate_gp(ctxt, 0);
+
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+
+ if ((new_val & X86_CR0_PG) && (efer & EFER_LME) &&
+ !(cr4 & X86_CR4_PAE))
+ return emulate_gp(ctxt, 0);
+
+ break;
+ }
+ case 3: {
+ u64 rsvd = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (efer & EFER_LMA) {
+ u64 maxphyaddr;
+ u32 eax, ebx, ecx, edx;
+
+ eax = 0x80000008;
+ ecx = 0;
+ if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx,
+ &edx, false))
+ maxphyaddr = eax & 0xff;
+ else
+ maxphyaddr = 36;
+ rsvd = rsvd_bits(maxphyaddr, 63);
+ if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PCIDE)
+ rsvd &= ~X86_CR3_PCID_NOFLUSH;
+ }
+
+ if (new_val & rsvd)
+ return emulate_gp(ctxt, 0);
+
+ break;
+ }
+ case 4: {
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+
+ if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE))
+ return emulate_gp(ctxt, 0);
+
+ break;
+ }
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long dr7;
+
+ ctxt->ops->get_dr(ctxt, 7, &dr7);
+
+ /* Check if DR7.Global_Enable is set */
+ return dr7 & (1 << 13);
+}
+
+static int check_dr_read(struct x86_emulate_ctxt *ctxt)
+{
+ int dr = ctxt->modrm_reg;
+ u64 cr4;
+
+ if (dr > 7)
+ return emulate_ud(ctxt);
+
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5))
+ return emulate_ud(ctxt);
+
+ if (check_dr7_gd(ctxt)) {
+ ulong dr6;
+
+ ctxt->ops->get_dr(ctxt, 6, &dr6);
+ dr6 &= ~15;
+ dr6 |= DR6_BD | DR6_RTM;
+ ctxt->ops->set_dr(ctxt, 6, dr6);
+ return emulate_db(ctxt);
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_dr_write(struct x86_emulate_ctxt *ctxt)
+{
+ u64 new_val = ctxt->src.val64;
+ int dr = ctxt->modrm_reg;
+
+ if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL))
+ return emulate_gp(ctxt, 0);
+
+ return check_dr_read(ctxt);
+}
+
+static int check_svme(struct x86_emulate_ctxt *ctxt)
+{
+ u64 efer = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+
+ if (!(efer & EFER_SVME))
+ return emulate_ud(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
+{
+ u64 rax = reg_read(ctxt, VCPU_REGS_RAX);
+
+ /* Valid physical address? */
+ if (rax & 0xffff000000000000ULL)
+ return emulate_gp(ctxt, 0);
+
+ return check_svme(ctxt);
+}
+
+static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
+
+ if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt))
+ return emulate_ud(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
+ u64 rcx = reg_read(ctxt, VCPU_REGS_RCX);
+
+ /*
+ * VMware allows access to these Pseduo-PMCs even when read via RDPMC
+ * in Ring3 when CR4.PCE=0.
+ */
+ if (enable_vmware_backdoor && is_vmware_backdoor_pmc(rcx))
+ return X86EMUL_CONTINUE;
+
+ if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
+ ctxt->ops->check_pmc(ctxt, rcx))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_perm_in(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.bytes = min(ctxt->dst.bytes, 4u);
+ if (!emulator_io_permited(ctxt, ctxt->src.val, ctxt->dst.bytes))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_perm_out(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->src.bytes = min(ctxt->src.bytes, 4u);
+ if (!emulator_io_permited(ctxt, ctxt->dst.val, ctxt->src.bytes))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+#define D(_y) { .flags = (_y) }
+#define DI(_y, _i) { .flags = (_y)|Intercept, .intercept = x86_intercept_##_i }
+#define DIP(_y, _i, _p) { .flags = (_y)|Intercept|CheckPerm, \
+ .intercept = x86_intercept_##_i, .check_perm = (_p) }
+#define N D(NotImpl)
+#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
+#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
+#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
+#define ID(_f, _i) { .flags = ((_f) | InstrDual | ModRM), .u.idual = (_i) }
+#define MD(_f, _m) { .flags = ((_f) | ModeDual), .u.mdual = (_m) }
+#define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) }
+#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
+#define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) }
+#define II(_f, _e, _i) \
+ { .flags = (_f)|Intercept, .u.execute = (_e), .intercept = x86_intercept_##_i }
+#define IIP(_f, _e, _i, _p) \
+ { .flags = (_f)|Intercept|CheckPerm, .u.execute = (_e), \
+ .intercept = x86_intercept_##_i, .check_perm = (_p) }
+#define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) }
+
+#define D2bv(_f) D((_f) | ByteOp), D(_f)
+#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
+#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
+#define F2bv(_f, _e) F((_f) | ByteOp, _e), F(_f, _e)
+#define I2bvIP(_f, _e, _i, _p) \
+ IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p)
+
+#define F6ALU(_f, _e) F2bv((_f) | DstMem | SrcReg | ModRM, _e), \
+ F2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
+ F2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
+
+static const struct opcode group7_rm0[] = {
+ N,
+ I(SrcNone | Priv | EmulateOnUD, em_hypercall),
+ N, N, N, N, N, N,
+};
+
+static const struct opcode group7_rm1[] = {
+ DI(SrcNone | Priv, monitor),
+ DI(SrcNone | Priv, mwait),
+ N, N, N, N, N, N,
+};
+
+static const struct opcode group7_rm3[] = {
+ DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa),
+ II(SrcNone | Prot | EmulateOnUD, em_hypercall, vmmcall),
+ DIP(SrcNone | Prot | Priv, vmload, check_svme_pa),
+ DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa),
+ DIP(SrcNone | Prot | Priv, stgi, check_svme),
+ DIP(SrcNone | Prot | Priv, clgi, check_svme),
+ DIP(SrcNone | Prot | Priv, skinit, check_svme),
+ DIP(SrcNone | Prot | Priv, invlpga, check_svme),
+};
+
+static const struct opcode group7_rm7[] = {
+ N,
+ DIP(SrcNone, rdtscp, check_rdtsc),
+ N, N, N, N, N, N,
+};
+
+static const struct opcode group1[] = {
+ F(Lock, em_add),
+ F(Lock | PageTable, em_or),
+ F(Lock, em_adc),
+ F(Lock, em_sbb),
+ F(Lock | PageTable, em_and),
+ F(Lock, em_sub),
+ F(Lock, em_xor),
+ F(NoWrite, em_cmp),
+};
+
+static const struct opcode group1A[] = {
+ I(DstMem | SrcNone | Mov | Stack | IncSP | TwoMemOp, em_pop), N, N, N, N, N, N, N,
+};
+
+static const struct opcode group2[] = {
+ F(DstMem | ModRM, em_rol),
+ F(DstMem | ModRM, em_ror),
+ F(DstMem | ModRM, em_rcl),
+ F(DstMem | ModRM, em_rcr),
+ F(DstMem | ModRM, em_shl),
+ F(DstMem | ModRM, em_shr),
+ F(DstMem | ModRM, em_shl),
+ F(DstMem | ModRM, em_sar),
+};
+
+static const struct opcode group3[] = {
+ F(DstMem | SrcImm | NoWrite, em_test),
+ F(DstMem | SrcImm | NoWrite, em_test),
+ F(DstMem | SrcNone | Lock, em_not),
+ F(DstMem | SrcNone | Lock, em_neg),
+ F(DstXacc | Src2Mem, em_mul_ex),
+ F(DstXacc | Src2Mem, em_imul_ex),
+ F(DstXacc | Src2Mem, em_div_ex),
+ F(DstXacc | Src2Mem, em_idiv_ex),
+};
+
+static const struct opcode group4[] = {
+ F(ByteOp | DstMem | SrcNone | Lock, em_inc),
+ F(ByteOp | DstMem | SrcNone | Lock, em_dec),
+ N, N, N, N, N, N,
+};
+
+static const struct opcode group5[] = {
+ F(DstMem | SrcNone | Lock, em_inc),
+ F(DstMem | SrcNone | Lock, em_dec),
+ I(SrcMem | NearBranch, em_call_near_abs),
+ I(SrcMemFAddr | ImplicitOps, em_call_far),
+ I(SrcMem | NearBranch, em_jmp_abs),
+ I(SrcMemFAddr | ImplicitOps, em_jmp_far),
+ I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined),
+};
+
+static const struct opcode group6[] = {
+ II(Prot | DstMem, em_sldt, sldt),
+ II(Prot | DstMem, em_str, str),
+ II(Prot | Priv | SrcMem16, em_lldt, lldt),
+ II(Prot | Priv | SrcMem16, em_ltr, ltr),
+ N, N, N, N,
+};
+
+static const struct group_dual group7 = { {
+ II(Mov | DstMem, em_sgdt, sgdt),
+ II(Mov | DstMem, em_sidt, sidt),
+ II(SrcMem | Priv, em_lgdt, lgdt),
+ II(SrcMem | Priv, em_lidt, lidt),
+ II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
+ II(SrcMem16 | Mov | Priv, em_lmsw, lmsw),
+ II(SrcMem | ByteOp | Priv | NoAccess, em_invlpg, invlpg),
+}, {
+ EXT(0, group7_rm0),
+ EXT(0, group7_rm1),
+ N, EXT(0, group7_rm3),
+ II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
+ II(SrcMem16 | Mov | Priv, em_lmsw, lmsw),
+ EXT(0, group7_rm7),
+} };
+
+static const struct opcode group8[] = {
+ N, N, N, N,
+ F(DstMem | SrcImmByte | NoWrite, em_bt),
+ F(DstMem | SrcImmByte | Lock | PageTable, em_bts),
+ F(DstMem | SrcImmByte | Lock, em_btr),
+ F(DstMem | SrcImmByte | Lock | PageTable, em_btc),
+};
+
+/*
+ * The "memory" destination is actually always a register, since we come
+ * from the register case of group9.
+ */
+static const struct gprefix pfx_0f_c7_7 = {
+ N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdtscp),
+};
+
+
+static const struct group_dual group9 = { {
+ N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
+}, {
+ N, N, N, N, N, N, N,
+ GP(0, &pfx_0f_c7_7),
+} };
+
+static const struct opcode group11[] = {
+ I(DstMem | SrcImm | Mov | PageTable, em_mov),
+ X7(D(Undefined)),
+};
+
+static const struct gprefix pfx_0f_ae_7 = {
+ I(SrcMem | ByteOp, em_clflush), I(SrcMem | ByteOp, em_clflushopt), N, N,
+};
+
+static const struct group_dual group15 = { {
+ I(ModRM | Aligned16, em_fxsave),
+ I(ModRM | Aligned16, em_fxrstor),
+ N, N, N, N, N, GP(0, &pfx_0f_ae_7),
+}, {
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct gprefix pfx_0f_6f_0f_7f = {
+ I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
+};
+
+static const struct instr_dual instr_dual_0f_2b = {
+ I(0, em_mov), N
+};
+
+static const struct gprefix pfx_0f_2b = {
+ ID(0, &instr_dual_0f_2b), ID(0, &instr_dual_0f_2b), N, N,
+};
+
+static const struct gprefix pfx_0f_10_0f_11 = {
+ I(Unaligned, em_mov), I(Unaligned, em_mov), N, N,
+};
+
+static const struct gprefix pfx_0f_28_0f_29 = {
+ I(Aligned, em_mov), I(Aligned, em_mov), N, N,
+};
+
+static const struct gprefix pfx_0f_e7 = {
+ N, I(Sse, em_mov), N, N,
+};
+
+static const struct escape escape_d9 = { {
+ N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstcw),
+}, {
+ /* 0xC0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xC8 - 0xCF */
+ N, N, N, N, N, N, N, N,
+ /* 0xD0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xD8 - 0xDF */
+ N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xE7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xE8 - 0xEF */
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xF7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xF8 - 0xFF */
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct escape escape_db = { {
+ N, N, N, N, N, N, N, N,
+}, {
+ /* 0xC0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xC8 - 0xCF */
+ N, N, N, N, N, N, N, N,
+ /* 0xD0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xD8 - 0xDF */
+ N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xE7 */
+ N, N, N, I(ImplicitOps, em_fninit), N, N, N, N,
+ /* 0xE8 - 0xEF */
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xF7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xF8 - 0xFF */
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct escape escape_dd = { {
+ N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstsw),
+}, {
+ /* 0xC0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xC8 - 0xCF */
+ N, N, N, N, N, N, N, N,
+ /* 0xD0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xD8 - 0xDF */
+ N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xE7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xE8 - 0xEF */
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xF7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xF8 - 0xFF */
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct instr_dual instr_dual_0f_c3 = {
+ I(DstMem | SrcReg | ModRM | No16 | Mov, em_mov), N
+};
+
+static const struct mode_dual mode_dual_63 = {
+ N, I(DstReg | SrcMem32 | ModRM | Mov, em_movsxd)
+};
+
+static const struct opcode opcode_table[256] = {
+ /* 0x00 - 0x07 */
+ F6ALU(Lock, em_add),
+ I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
+ I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg),
+ /* 0x08 - 0x0F */
+ F6ALU(Lock | PageTable, em_or),
+ I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg),
+ N,
+ /* 0x10 - 0x17 */
+ F6ALU(Lock, em_adc),
+ I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg),
+ I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg),
+ /* 0x18 - 0x1F */
+ F6ALU(Lock, em_sbb),
+ I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg),
+ I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg),
+ /* 0x20 - 0x27 */
+ F6ALU(Lock | PageTable, em_and), N, N,
+ /* 0x28 - 0x2F */
+ F6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
+ /* 0x30 - 0x37 */
+ F6ALU(Lock, em_xor), N, N,
+ /* 0x38 - 0x3F */
+ F6ALU(NoWrite, em_cmp), N, N,
+ /* 0x40 - 0x4F */
+ X8(F(DstReg, em_inc)), X8(F(DstReg, em_dec)),
+ /* 0x50 - 0x57 */
+ X8(I(SrcReg | Stack, em_push)),
+ /* 0x58 - 0x5F */
+ X8(I(DstReg | Stack, em_pop)),
+ /* 0x60 - 0x67 */
+ I(ImplicitOps | Stack | No64, em_pusha),
+ I(ImplicitOps | Stack | No64, em_popa),
+ N, MD(ModRM, &mode_dual_63),
+ N, N, N, N,
+ /* 0x68 - 0x6F */
+ I(SrcImm | Mov | Stack, em_push),
+ I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
+ I(SrcImmByte | Mov | Stack, em_push),
+ I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
+ I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */
+ I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
+ /* 0x70 - 0x7F */
+ X16(D(SrcImmByte | NearBranch)),
+ /* 0x80 - 0x87 */
+ G(ByteOp | DstMem | SrcImm, group1),
+ G(DstMem | SrcImm, group1),
+ G(ByteOp | DstMem | SrcImm | No64, group1),
+ G(DstMem | SrcImmByte, group1),
+ F2bv(DstMem | SrcReg | ModRM | NoWrite, em_test),
+ I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
+ /* 0x88 - 0x8F */
+ I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov),
+ I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
+ I(DstMem | SrcNone | ModRM | Mov | PageTable, em_mov_rm_sreg),
+ D(ModRM | SrcMem | NoAccess | DstReg),
+ I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm),
+ G(0, group1A),
+ /* 0x90 - 0x97 */
+ DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)),
+ /* 0x98 - 0x9F */
+ D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
+ I(SrcImmFAddr | No64, em_call_far), N,
+ II(ImplicitOps | Stack, em_pushf, pushf),
+ II(ImplicitOps | Stack, em_popf, popf),
+ I(ImplicitOps, em_sahf), I(ImplicitOps, em_lahf),
+ /* 0xA0 - 0xA7 */
+ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
+ I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
+ I2bv(SrcSI | DstDI | Mov | String | TwoMemOp, em_mov),
+ F2bv(SrcSI | DstDI | String | NoWrite | TwoMemOp, em_cmp_r),
+ /* 0xA8 - 0xAF */
+ F2bv(DstAcc | SrcImm | NoWrite, em_test),
+ I2bv(SrcAcc | DstDI | Mov | String, em_mov),
+ I2bv(SrcSI | DstAcc | Mov | String, em_mov),
+ F2bv(SrcAcc | DstDI | String | NoWrite, em_cmp_r),
+ /* 0xB0 - 0xB7 */
+ X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
+ /* 0xB8 - 0xBF */
+ X8(I(DstReg | SrcImm64 | Mov, em_mov)),
+ /* 0xC0 - 0xC7 */
+ G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2),
+ I(ImplicitOps | NearBranch | SrcImmU16, em_ret_near_imm),
+ I(ImplicitOps | NearBranch, em_ret),
+ I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg),
+ I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),
+ G(ByteOp, group11), G(0, group11),
+ /* 0xC8 - 0xCF */
+ I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave),
+ I(ImplicitOps | SrcImmU16, em_ret_far_imm),
+ I(ImplicitOps, em_ret_far),
+ D(ImplicitOps), DI(SrcImmByte, intn),
+ D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
+ /* 0xD0 - 0xD7 */
+ G(Src2One | ByteOp, group2), G(Src2One, group2),
+ G(Src2CL | ByteOp, group2), G(Src2CL, group2),
+ I(DstAcc | SrcImmUByte | No64, em_aam),
+ I(DstAcc | SrcImmUByte | No64, em_aad),
+ F(DstAcc | ByteOp | No64, em_salc),
+ I(DstAcc | SrcXLat | ByteOp, em_mov),
+ /* 0xD8 - 0xDF */
+ N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N,
+ /* 0xE0 - 0xE7 */
+ X3(I(SrcImmByte | NearBranch, em_loop)),
+ I(SrcImmByte | NearBranch, em_jcxz),
+ I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in),
+ I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out),
+ /* 0xE8 - 0xEF */
+ I(SrcImm | NearBranch, em_call), D(SrcImm | ImplicitOps | NearBranch),
+ I(SrcImmFAddr | No64, em_jmp_far),
+ D(SrcImmByte | ImplicitOps | NearBranch),
+ I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in),
+ I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out),
+ /* 0xF0 - 0xF7 */
+ N, DI(ImplicitOps, icebp), N, N,
+ DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
+ G(ByteOp, group3), G(0, group3),
+ /* 0xF8 - 0xFF */
+ D(ImplicitOps), D(ImplicitOps),
+ I(ImplicitOps, em_cli), I(ImplicitOps, em_sti),
+ D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
+};
+
+static const struct opcode twobyte_table[256] = {
+ /* 0x00 - 0x0F */
+ G(0, group6), GD(0, &group7), N, N,
+ N, I(ImplicitOps | EmulateOnUD, em_syscall),
+ II(ImplicitOps | Priv, em_clts, clts), N,
+ DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
+ N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N,
+ /* 0x10 - 0x1F */
+ GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_10_0f_11),
+ GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_10_0f_11),
+ N, N, N, N, N, N,
+ D(ImplicitOps | ModRM | SrcMem | NoAccess),
+ N, N, N, N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess),
+ /* 0x20 - 0x2F */
+ DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read),
+ DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read),
+ IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_cr_write, cr_write,
+ check_cr_write),
+ IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write,
+ check_dr_write),
+ N, N, N, N,
+ GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29),
+ GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29),
+ N, GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_2b),
+ N, N, N, N,
+ /* 0x30 - 0x3F */
+ II(ImplicitOps | Priv, em_wrmsr, wrmsr),
+ IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
+ II(ImplicitOps | Priv, em_rdmsr, rdmsr),
+ IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc),
+ I(ImplicitOps | EmulateOnUD, em_sysenter),
+ I(ImplicitOps | Priv | EmulateOnUD, em_sysexit),
+ N, N,
+ N, N, N, N, N, N, N, N,
+ /* 0x40 - 0x4F */
+ X16(D(DstReg | SrcMem | ModRM)),
+ /* 0x50 - 0x5F */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ /* 0x60 - 0x6F */
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, GP(SrcMem | DstReg | ModRM | Mov, &pfx_0f_6f_0f_7f),
+ /* 0x70 - 0x7F */
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f),
+ /* 0x80 - 0x8F */
+ X16(D(SrcImm | NearBranch)),
+ /* 0x90 - 0x9F */
+ X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
+ /* 0xA0 - 0xA7 */
+ I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
+ II(ImplicitOps, em_cpuid, cpuid),
+ F(DstMem | SrcReg | ModRM | BitOp | NoWrite, em_bt),
+ F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shld),
+ F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
+ /* 0xA8 - 0xAF */
+ I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
+ II(EmulateOnUD | ImplicitOps, em_rsm, rsm),
+ F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
+ F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
+ F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
+ GD(0, &group15), F(DstReg | SrcMem | ModRM, em_imul),
+ /* 0xB0 - 0xB7 */
+ I2bv(DstMem | SrcReg | ModRM | Lock | PageTable | SrcWrite, em_cmpxchg),
+ I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
+ F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
+ I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
+ I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
+ D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+ /* 0xB8 - 0xBF */
+ N, N,
+ G(BitOp, group8),
+ F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
+ I(DstReg | SrcMem | ModRM, em_bsf_c),
+ I(DstReg | SrcMem | ModRM, em_bsr_c),
+ D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+ /* 0xC0 - 0xC7 */
+ F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd),
+ N, ID(0, &instr_dual_0f_c3),
+ N, N, N, GD(0, &group9),
+ /* 0xC8 - 0xCF */
+ X8(I(DstReg, em_bswap)),
+ /* 0xD0 - 0xDF */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xEF */
+ N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_e7),
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xFF */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
+};
+
+static const struct instr_dual instr_dual_0f_38_f0 = {
+ I(DstReg | SrcMem | Mov, em_movbe), N
+};
+
+static const struct instr_dual instr_dual_0f_38_f1 = {
+ I(DstMem | SrcReg | Mov, em_movbe), N
+};
+
+static const struct gprefix three_byte_0f_38_f0 = {
+ ID(0, &instr_dual_0f_38_f0), N, N, N
+};
+
+static const struct gprefix three_byte_0f_38_f1 = {
+ ID(0, &instr_dual_0f_38_f1), N, N, N
+};
+
+/*
+ * Insns below are selected by the prefix which indexed by the third opcode
+ * byte.
+ */
+static const struct opcode opcode_map_0f_38[256] = {
+ /* 0x00 - 0x7f */
+ X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
+ /* 0x80 - 0xef */
+ X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
+ /* 0xf0 - 0xf1 */
+ GP(EmulateOnUD | ModRM, &three_byte_0f_38_f0),
+ GP(EmulateOnUD | ModRM, &three_byte_0f_38_f1),
+ /* 0xf2 - 0xff */
+ N, N, X4(N), X8(N)
+};
+
+#undef D
+#undef N
+#undef G
+#undef GD
+#undef I
+#undef GP
+#undef EXT
+#undef MD
+#undef ID
+
+#undef D2bv
+#undef D2bvIP
+#undef I2bv
+#undef I2bvIP
+#undef I6ALU
+
+static unsigned imm_size(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned size;
+
+ size = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ if (size == 8)
+ size = 4;
+ return size;
+}
+
+static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
+ unsigned size, bool sign_extension)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ op->type = OP_IMM;
+ op->bytes = size;
+ op->addr.mem.ea = ctxt->_eip;
+ /* NB. Immediates are sign-extended as necessary. */
+ switch (op->bytes) {
+ case 1:
+ op->val = insn_fetch(s8, ctxt);
+ break;
+ case 2:
+ op->val = insn_fetch(s16, ctxt);
+ break;
+ case 4:
+ op->val = insn_fetch(s32, ctxt);
+ break;
+ case 8:
+ op->val = insn_fetch(s64, ctxt);
+ break;
+ }
+ if (!sign_extension) {
+ switch (op->bytes) {
+ case 1:
+ op->val &= 0xff;
+ break;
+ case 2:
+ op->val &= 0xffff;
+ break;
+ case 4:
+ op->val &= 0xffffffff;
+ break;
+ }
+ }
+done:
+ return rc;
+}
+
+static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
+ unsigned d)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ switch (d) {
+ case OpReg:
+ decode_register_operand(ctxt, op);
+ break;
+ case OpImmUByte:
+ rc = decode_imm(ctxt, op, 1, false);
+ break;
+ case OpMem:
+ ctxt->memop.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ mem_common:
+ *op = ctxt->memop;
+ ctxt->memopp = op;
+ if (ctxt->d & BitOp)
+ fetch_bit_operand(ctxt);
+ op->orig_val = op->val;
+ break;
+ case OpMem64:
+ ctxt->memop.bytes = (ctxt->op_bytes == 8) ? 16 : 8;
+ goto mem_common;
+ case OpAcc:
+ op->type = OP_REG;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ fetch_register_operand(op);
+ op->orig_val = op->val;
+ break;
+ case OpAccLo:
+ op->type = OP_REG;
+ op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ fetch_register_operand(op);
+ op->orig_val = op->val;
+ break;
+ case OpAccHi:
+ if (ctxt->d & ByteOp) {
+ op->type = OP_NONE;
+ break;
+ }
+ op->type = OP_REG;
+ op->bytes = ctxt->op_bytes;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
+ fetch_register_operand(op);
+ op->orig_val = op->val;
+ break;
+ case OpDI:
+ op->type = OP_MEM;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.mem.ea =
+ register_address(ctxt, VCPU_REGS_RDI);
+ op->addr.mem.seg = VCPU_SREG_ES;
+ op->val = 0;
+ op->count = 1;
+ break;
+ case OpDX:
+ op->type = OP_REG;
+ op->bytes = 2;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
+ fetch_register_operand(op);
+ break;
+ case OpCL:
+ op->type = OP_IMM;
+ op->bytes = 1;
+ op->val = reg_read(ctxt, VCPU_REGS_RCX) & 0xff;
+ break;
+ case OpImmByte:
+ rc = decode_imm(ctxt, op, 1, true);
+ break;
+ case OpOne:
+ op->type = OP_IMM;
+ op->bytes = 1;
+ op->val = 1;
+ break;
+ case OpImm:
+ rc = decode_imm(ctxt, op, imm_size(ctxt), true);
+ break;
+ case OpImm64:
+ rc = decode_imm(ctxt, op, ctxt->op_bytes, true);
+ break;
+ case OpMem8:
+ ctxt->memop.bytes = 1;
+ if (ctxt->memop.type == OP_REG) {
+ ctxt->memop.addr.reg = decode_register(ctxt,
+ ctxt->modrm_rm, true);
+ fetch_register_operand(&ctxt->memop);
+ }
+ goto mem_common;
+ case OpMem16:
+ ctxt->memop.bytes = 2;
+ goto mem_common;
+ case OpMem32:
+ ctxt->memop.bytes = 4;
+ goto mem_common;
+ case OpImmU16:
+ rc = decode_imm(ctxt, op, 2, false);
+ break;
+ case OpImmU:
+ rc = decode_imm(ctxt, op, imm_size(ctxt), false);
+ break;
+ case OpSI:
+ op->type = OP_MEM;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.mem.ea =
+ register_address(ctxt, VCPU_REGS_RSI);
+ op->addr.mem.seg = ctxt->seg_override;
+ op->val = 0;
+ op->count = 1;
+ break;
+ case OpXLat:
+ op->type = OP_MEM;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.mem.ea =
+ address_mask(ctxt,
+ reg_read(ctxt, VCPU_REGS_RBX) +
+ (reg_read(ctxt, VCPU_REGS_RAX) & 0xff));
+ op->addr.mem.seg = ctxt->seg_override;
+ op->val = 0;
+ break;
+ case OpImmFAddr:
+ op->type = OP_IMM;
+ op->addr.mem.ea = ctxt->_eip;
+ op->bytes = ctxt->op_bytes + 2;
+ insn_fetch_arr(op->valptr, op->bytes, ctxt);
+ break;
+ case OpMemFAddr:
+ ctxt->memop.bytes = ctxt->op_bytes + 2;
+ goto mem_common;
+ case OpES:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_ES;
+ break;
+ case OpCS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_CS;
+ break;
+ case OpSS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_SS;
+ break;
+ case OpDS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_DS;
+ break;
+ case OpFS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_FS;
+ break;
+ case OpGS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_GS;
+ break;
+ case OpImplicit:
+ /* Special instructions do their own operand decoding. */
+ default:
+ op->type = OP_NONE; /* Disable writeback. */
+ break;
+ }
+
+done:
+ return rc;
+}
+
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
+{
+ int rc = X86EMUL_CONTINUE;
+ int mode = ctxt->mode;
+ int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
+ bool op_prefix = false;
+ bool has_seg_override = false;
+ struct opcode opcode;
+ u16 dummy;
+ struct desc_struct desc;
+
+ ctxt->memop.type = OP_NONE;
+ ctxt->memopp = NULL;
+ ctxt->_eip = ctxt->eip;
+ ctxt->fetch.ptr = ctxt->fetch.data;
+ ctxt->fetch.end = ctxt->fetch.data + insn_len;
+ ctxt->opcode_len = 1;
+ ctxt->intercept = x86_intercept_none;
+ if (insn_len > 0)
+ memcpy(ctxt->fetch.data, insn, insn_len);
+ else {
+ rc = __do_insn_fetch_bytes(ctxt, 1);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ }
+
+ switch (mode) {
+ case X86EMUL_MODE_REAL:
+ case X86EMUL_MODE_VM86:
+ def_op_bytes = def_ad_bytes = 2;
+ ctxt->ops->get_segment(ctxt, &dummy, &desc, NULL, VCPU_SREG_CS);
+ if (desc.d)
+ def_op_bytes = def_ad_bytes = 4;
+ break;
+ case X86EMUL_MODE_PROT16:
+ def_op_bytes = def_ad_bytes = 2;
+ break;
+ case X86EMUL_MODE_PROT32:
+ def_op_bytes = def_ad_bytes = 4;
+ break;
+#ifdef CONFIG_X86_64
+ case X86EMUL_MODE_PROT64:
+ def_op_bytes = 4;
+ def_ad_bytes = 8;
+ break;
+#endif
+ default:
+ return EMULATION_FAILED;
+ }
+
+ ctxt->op_bytes = def_op_bytes;
+ ctxt->ad_bytes = def_ad_bytes;
+
+ /* Legacy prefixes. */
+ for (;;) {
+ switch (ctxt->b = insn_fetch(u8, ctxt)) {
+ case 0x66: /* operand-size override */
+ op_prefix = true;
+ /* switch between 2/4 bytes */
+ ctxt->op_bytes = def_op_bytes ^ 6;
+ break;
+ case 0x67: /* address-size override */
+ if (mode == X86EMUL_MODE_PROT64)
+ /* switch between 4/8 bytes */
+ ctxt->ad_bytes = def_ad_bytes ^ 12;
+ else
+ /* switch between 2/4 bytes */
+ ctxt->ad_bytes = def_ad_bytes ^ 6;
+ break;
+ case 0x26: /* ES override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_ES;
+ break;
+ case 0x2e: /* CS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_CS;
+ break;
+ case 0x36: /* SS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_SS;
+ break;
+ case 0x3e: /* DS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_DS;
+ break;
+ case 0x64: /* FS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_FS;
+ break;
+ case 0x65: /* GS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_GS;
+ break;
+ case 0x40 ... 0x4f: /* REX */
+ if (mode != X86EMUL_MODE_PROT64)
+ goto done_prefixes;
+ ctxt->rex_prefix = ctxt->b;
+ continue;
+ case 0xf0: /* LOCK */
+ ctxt->lock_prefix = 1;
+ break;
+ case 0xf2: /* REPNE/REPNZ */
+ case 0xf3: /* REP/REPE/REPZ */
+ ctxt->rep_prefix = ctxt->b;
+ break;
+ default:
+ goto done_prefixes;
+ }
+
+ /* Any legacy prefix after a REX prefix nullifies its effect. */
+
+ ctxt->rex_prefix = 0;
+ }
+
+done_prefixes:
+
+ /* REX prefix. */
+ if (ctxt->rex_prefix & 8)
+ ctxt->op_bytes = 8; /* REX.W */
+
+ /* Opcode byte(s). */
+ opcode = opcode_table[ctxt->b];
+ /* Two-byte opcode? */
+ if (ctxt->b == 0x0f) {
+ ctxt->opcode_len = 2;
+ ctxt->b = insn_fetch(u8, ctxt);
+ opcode = twobyte_table[ctxt->b];
+
+ /* 0F_38 opcode map */
+ if (ctxt->b == 0x38) {
+ ctxt->opcode_len = 3;
+ ctxt->b = insn_fetch(u8, ctxt);
+ opcode = opcode_map_0f_38[ctxt->b];
+ }
+ }
+ ctxt->d = opcode.flags;
+
+ if (ctxt->d & ModRM)
+ ctxt->modrm = insn_fetch(u8, ctxt);
+
+ /* vex-prefix instructions are not implemented */
+ if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) &&
+ (mode == X86EMUL_MODE_PROT64 || (ctxt->modrm & 0xc0) == 0xc0)) {
+ ctxt->d = NotImpl;
+ }
+
+ while (ctxt->d & GroupMask) {
+ switch (ctxt->d & GroupMask) {
+ case Group:
+ goffset = (ctxt->modrm >> 3) & 7;
+ opcode = opcode.u.group[goffset];
+ break;
+ case GroupDual:
+ goffset = (ctxt->modrm >> 3) & 7;
+ if ((ctxt->modrm >> 6) == 3)
+ opcode = opcode.u.gdual->mod3[goffset];
+ else
+ opcode = opcode.u.gdual->mod012[goffset];
+ break;
+ case RMExt:
+ goffset = ctxt->modrm & 7;
+ opcode = opcode.u.group[goffset];
+ break;
+ case Prefix:
+ if (ctxt->rep_prefix && op_prefix)
+ return EMULATION_FAILED;
+ simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix;
+ switch (simd_prefix) {
+ case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
+ case 0x66: opcode = opcode.u.gprefix->pfx_66; break;
+ case 0xf2: opcode = opcode.u.gprefix->pfx_f2; break;
+ case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break;
+ }
+ break;
+ case Escape:
+ if (ctxt->modrm > 0xbf) {
+ size_t size = ARRAY_SIZE(opcode.u.esc->high);
+ u32 index = array_index_nospec(
+ ctxt->modrm - 0xc0, size);
+
+ opcode = opcode.u.esc->high[index];
+ } else {
+ opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7];
+ }
+ break;
+ case InstrDual:
+ if ((ctxt->modrm >> 6) == 3)
+ opcode = opcode.u.idual->mod3;
+ else
+ opcode = opcode.u.idual->mod012;
+ break;
+ case ModeDual:
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ opcode = opcode.u.mdual->mode64;
+ else
+ opcode = opcode.u.mdual->mode32;
+ break;
+ default:
+ return EMULATION_FAILED;
+ }
+
+ ctxt->d &= ~(u64)GroupMask;
+ ctxt->d |= opcode.flags;
+ }
+
+ /* Unrecognised? */
+ if (ctxt->d == 0)
+ return EMULATION_FAILED;
+
+ ctxt->execute = opcode.u.execute;
+
+ if (unlikely(ctxt->ud) && likely(!(ctxt->d & EmulateOnUD)))
+ return EMULATION_FAILED;
+
+ if (unlikely(ctxt->d &
+ (NotImpl|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm|NearBranch|
+ No16))) {
+ /*
+ * These are copied unconditionally here, and checked unconditionally
+ * in x86_emulate_insn.
+ */
+ ctxt->check_perm = opcode.check_perm;
+ ctxt->intercept = opcode.intercept;
+
+ if (ctxt->d & NotImpl)
+ return EMULATION_FAILED;
+
+ if (mode == X86EMUL_MODE_PROT64) {
+ if (ctxt->op_bytes == 4 && (ctxt->d & Stack))
+ ctxt->op_bytes = 8;
+ else if (ctxt->d & NearBranch)
+ ctxt->op_bytes = 8;
+ }
+
+ if (ctxt->d & Op3264) {
+ if (mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
+ else
+ ctxt->op_bytes = 4;
+ }
+
+ if ((ctxt->d & No16) && ctxt->op_bytes == 2)
+ ctxt->op_bytes = 4;
+
+ if (ctxt->d & Sse)
+ ctxt->op_bytes = 16;
+ else if (ctxt->d & Mmx)
+ ctxt->op_bytes = 8;
+ }
+
+ /* ModRM and SIB bytes. */
+ if (ctxt->d & ModRM) {
+ rc = decode_modrm(ctxt, &ctxt->memop);
+ if (!has_seg_override) {
+ has_seg_override = true;
+ ctxt->seg_override = ctxt->modrm_seg;
+ }
+ } else if (ctxt->d & MemAbs)
+ rc = decode_abs(ctxt, &ctxt->memop);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ if (!has_seg_override)
+ ctxt->seg_override = VCPU_SREG_DS;
+
+ ctxt->memop.addr.mem.seg = ctxt->seg_override;
+
+ /*
+ * Decode and fetch the source operand: register, memory
+ * or immediate.
+ */
+ rc = decode_operand(ctxt, &ctxt->src, (ctxt->d >> SrcShift) & OpMask);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ /*
+ * Decode and fetch the second source operand: register, memory
+ * or immediate.
+ */
+ rc = decode_operand(ctxt, &ctxt->src2, (ctxt->d >> Src2Shift) & OpMask);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ /* Decode and fetch the destination operand: register or memory. */
+ rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask);
+
+ if (ctxt->rip_relative && likely(ctxt->memopp))
+ ctxt->memopp->addr.mem.ea = address_mask(ctxt,
+ ctxt->memopp->addr.mem.ea + ctxt->_eip);
+
+done:
+ if (rc == X86EMUL_PROPAGATE_FAULT)
+ ctxt->have_exception = true;
+ return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
+}
+
+bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt)
+{
+ return ctxt->d & PageTable;
+}
+
+static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
+{
+ /* The second termination condition only applies for REPE
+ * and REPNE. Test if the repeat string operation prefix is
+ * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
+ * corresponding termination condition according to:
+ * - if REPE/REPZ and ZF = 0 then done
+ * - if REPNE/REPNZ and ZF = 1 then done
+ */
+ if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) ||
+ (ctxt->b == 0xae) || (ctxt->b == 0xaf))
+ && (((ctxt->rep_prefix == REPE_PREFIX) &&
+ ((ctxt->eflags & X86_EFLAGS_ZF) == 0))
+ || ((ctxt->rep_prefix == REPNE_PREFIX) &&
+ ((ctxt->eflags & X86_EFLAGS_ZF) == X86_EFLAGS_ZF))))
+ return true;
+
+ return false;
+}
+
+static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+
+ rc = asm_safe("fwait");
+
+ if (unlikely(rc != X86EMUL_CONTINUE))
+ return emulate_exception(ctxt, MF_VECTOR, 0, false);
+
+ return X86EMUL_CONTINUE;
+}
+
+static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
+ struct operand *op)
+{
+ if (op->type == OP_MM)
+ read_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
+}
+
+static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
+{
+ ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
+
+ if (!(ctxt->d & ByteOp))
+ fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
+
+ asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n"
+ : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
+ [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT
+ : "c"(ctxt->src2.val));
+
+ ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
+ if (!fop) /* exception is returned in fop variable */
+ return emulate_de(ctxt);
+ return X86EMUL_CONTINUE;
+}
+
+void init_decode_cache(struct x86_emulate_ctxt *ctxt)
+{
+ memset(&ctxt->rip_relative, 0,
+ (void *)&ctxt->modrm - (void *)&ctxt->rip_relative);
+
+ ctxt->io_read.pos = 0;
+ ctxt->io_read.end = 0;
+ ctxt->mem_read.end = 0;
+}
+
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ int rc = X86EMUL_CONTINUE;
+ int saved_dst_type = ctxt->dst.type;
+ unsigned emul_flags;
+
+ ctxt->mem_read.pos = 0;
+
+ /* LOCK prefix is allowed only with some instructions */
+ if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ if ((ctxt->d & SrcMask) == SrcMemFAddr && ctxt->src.type != OP_MEM) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ emul_flags = ctxt->ops->get_hflags(ctxt);
+ if (unlikely(ctxt->d &
+ (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) {
+ if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
+ (ctxt->d & Undefined)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)))
+ || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
+ rc = emulate_nm(ctxt);
+ goto done;
+ }
+
+ if (ctxt->d & Mmx) {
+ rc = flush_pending_x87_faults(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ /*
+ * Now that we know the fpu is exception safe, we can fetch
+ * operands from it.
+ */
+ fetch_possible_mmx_operand(ctxt, &ctxt->src);
+ fetch_possible_mmx_operand(ctxt, &ctxt->src2);
+ if (!(ctxt->d & Mov))
+ fetch_possible_mmx_operand(ctxt, &ctxt->dst);
+ }
+
+ if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_PRE_EXCEPT);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ /* Instruction can only be executed in protected mode */
+ if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ /* Privileged instruction can be executed only in CPL=0 */
+ if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
+ if (ctxt->d & PrivUD)
+ rc = emulate_ud(ctxt);
+ else
+ rc = emulate_gp(ctxt, 0);
+ goto done;
+ }
+
+ /* Do instruction specific permission checks */
+ if (ctxt->d & CheckPerm) {
+ rc = ctxt->check_perm(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_POST_EXCEPT);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (ctxt->rep_prefix && (ctxt->d & String)) {
+ /* All REP prefixes have the same first termination condition */
+ if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
+ string_registers_quirk(ctxt);
+ ctxt->eip = ctxt->_eip;
+ ctxt->eflags &= ~X86_EFLAGS_RF;
+ goto done;
+ }
+ }
+ }
+
+ if ((ctxt->src.type == OP_MEM) && !(ctxt->d & NoAccess)) {
+ rc = segmented_read(ctxt, ctxt->src.addr.mem,
+ ctxt->src.valptr, ctxt->src.bytes);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ ctxt->src.orig_val64 = ctxt->src.val64;
+ }
+
+ if (ctxt->src2.type == OP_MEM) {
+ rc = segmented_read(ctxt, ctxt->src2.addr.mem,
+ &ctxt->src2.val, ctxt->src2.bytes);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if ((ctxt->d & DstMask) == ImplicitOps)
+ goto special_insn;
+
+
+ if ((ctxt->dst.type == OP_MEM) && !(ctxt->d & Mov)) {
+ /* optimisation - avoid slow emulated read if Mov */
+ rc = segmented_read(ctxt, ctxt->dst.addr.mem,
+ &ctxt->dst.val, ctxt->dst.bytes);
+ if (rc != X86EMUL_CONTINUE) {
+ if (!(ctxt->d & NoWrite) &&
+ rc == X86EMUL_PROPAGATE_FAULT &&
+ ctxt->exception.vector == PF_VECTOR)
+ ctxt->exception.error_code |= PFERR_WRITE_MASK;
+ goto done;
+ }
+ }
+ /* Copy full 64-bit value for CMPXCHG8B. */
+ ctxt->dst.orig_val64 = ctxt->dst.val64;
+
+special_insn:
+
+ if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_POST_MEMACCESS);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (ctxt->rep_prefix && (ctxt->d & String))
+ ctxt->eflags |= X86_EFLAGS_RF;
+ else
+ ctxt->eflags &= ~X86_EFLAGS_RF;
+
+ if (ctxt->execute) {
+ if (ctxt->d & Fastop) {
+ void (*fop)(struct fastop *) = (void *)ctxt->execute;
+ rc = fastop(ctxt, fop);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ goto writeback;
+ }
+ rc = ctxt->execute(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ goto writeback;
+ }
+
+ if (ctxt->opcode_len == 2)
+ goto twobyte_insn;
+ else if (ctxt->opcode_len == 3)
+ goto threebyte_insn;
+
+ switch (ctxt->b) {
+ case 0x70 ... 0x7f: /* jcc (short) */
+ if (test_cc(ctxt->b, ctxt->eflags))
+ rc = jmp_rel(ctxt, ctxt->src.val);
+ break;
+ case 0x8d: /* lea r16/r32, m */
+ ctxt->dst.val = ctxt->src.addr.mem.ea;
+ break;
+ case 0x90 ... 0x97: /* nop / xchg reg, rax */
+ if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX))
+ ctxt->dst.type = OP_NONE;
+ else
+ rc = em_xchg(ctxt);
+ break;
+ case 0x98: /* cbw/cwde/cdqe */
+ switch (ctxt->op_bytes) {
+ case 2: ctxt->dst.val = (s8)ctxt->dst.val; break;
+ case 4: ctxt->dst.val = (s16)ctxt->dst.val; break;
+ case 8: ctxt->dst.val = (s32)ctxt->dst.val; break;
+ }
+ break;
+ case 0xcc: /* int3 */
+ rc = emulate_int(ctxt, 3);
+ break;
+ case 0xcd: /* int n */
+ rc = emulate_int(ctxt, ctxt->src.val);
+ break;
+ case 0xce: /* into */
+ if (ctxt->eflags & X86_EFLAGS_OF)
+ rc = emulate_int(ctxt, 4);
+ break;
+ case 0xe9: /* jmp rel */
+ case 0xeb: /* jmp rel short */
+ rc = jmp_rel(ctxt, ctxt->src.val);
+ ctxt->dst.type = OP_NONE; /* Disable writeback. */
+ break;
+ case 0xf4: /* hlt */
+ ctxt->ops->halt(ctxt);
+ break;
+ case 0xf5: /* cmc */
+ /* complement carry flag from eflags reg */
+ ctxt->eflags ^= X86_EFLAGS_CF;
+ break;
+ case 0xf8: /* clc */
+ ctxt->eflags &= ~X86_EFLAGS_CF;
+ break;
+ case 0xf9: /* stc */
+ ctxt->eflags |= X86_EFLAGS_CF;
+ break;
+ case 0xfc: /* cld */
+ ctxt->eflags &= ~X86_EFLAGS_DF;
+ break;
+ case 0xfd: /* std */
+ ctxt->eflags |= X86_EFLAGS_DF;
+ break;
+ default:
+ goto cannot_emulate;
+ }
+
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+writeback:
+ if (ctxt->d & SrcWrite) {
+ BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR);
+ rc = writeback(ctxt, &ctxt->src);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+ if (!(ctxt->d & NoWrite)) {
+ rc = writeback(ctxt, &ctxt->dst);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ /*
+ * restore dst type in case the decoding will be reused
+ * (happens for string instruction )
+ */
+ ctxt->dst.type = saved_dst_type;
+
+ if ((ctxt->d & SrcMask) == SrcSI)
+ string_addr_inc(ctxt, VCPU_REGS_RSI, &ctxt->src);
+
+ if ((ctxt->d & DstMask) == DstDI)
+ string_addr_inc(ctxt, VCPU_REGS_RDI, &ctxt->dst);
+
+ if (ctxt->rep_prefix && (ctxt->d & String)) {
+ unsigned int count;
+ struct read_cache *r = &ctxt->io_read;
+ if ((ctxt->d & SrcMask) == SrcSI)
+ count = ctxt->src.count;
+ else
+ count = ctxt->dst.count;
+ register_address_increment(ctxt, VCPU_REGS_RCX, -count);
+
+ if (!string_insn_completed(ctxt)) {
+ /*
+ * Re-enter guest when pio read ahead buffer is empty
+ * or, if it is not used, after each 1024 iteration.
+ */
+ if ((r->end != 0 || reg_read(ctxt, VCPU_REGS_RCX) & 0x3ff) &&
+ (r->end == 0 || r->end != r->pos)) {
+ /*
+ * Reset read cache. Usually happens before
+ * decode, but since instruction is restarted
+ * we have to do it here.
+ */
+ ctxt->mem_read.end = 0;
+ writeback_registers(ctxt);
+ return EMULATION_RESTART;
+ }
+ goto done; /* skip rip writeback */
+ }
+ ctxt->eflags &= ~X86_EFLAGS_RF;
+ }
+
+ ctxt->eip = ctxt->_eip;
+
+done:
+ if (rc == X86EMUL_PROPAGATE_FAULT) {
+ WARN_ON(ctxt->exception.vector > 0x1f);
+ ctxt->have_exception = true;
+ }
+ if (rc == X86EMUL_INTERCEPTED)
+ return EMULATION_INTERCEPTED;
+
+ if (rc == X86EMUL_CONTINUE)
+ writeback_registers(ctxt);
+
+ return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
+
+twobyte_insn:
+ switch (ctxt->b) {
+ case 0x09: /* wbinvd */
+ (ctxt->ops->wbinvd)(ctxt);
+ break;
+ case 0x08: /* invd */
+ case 0x0d: /* GrpP (prefetch) */
+ case 0x18: /* Grp16 (prefetch/nop) */
+ case 0x1f: /* nop */
+ break;
+ case 0x20: /* mov cr, reg */
+ ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg);
+ break;
+ case 0x21: /* mov from dr to reg */
+ ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
+ break;
+ case 0x40 ... 0x4f: /* cmov */
+ if (test_cc(ctxt->b, ctxt->eflags))
+ ctxt->dst.val = ctxt->src.val;
+ else if (ctxt->op_bytes != 4)
+ ctxt->dst.type = OP_NONE; /* no writeback */
+ break;
+ case 0x80 ... 0x8f: /* jnz rel, etc*/
+ if (test_cc(ctxt->b, ctxt->eflags))
+ rc = jmp_rel(ctxt, ctxt->src.val);
+ break;
+ case 0x90 ... 0x9f: /* setcc r/m8 */
+ ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
+ break;
+ case 0xb6 ... 0xb7: /* movzx */
+ ctxt->dst.bytes = ctxt->op_bytes;
+ ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val
+ : (u16) ctxt->src.val;
+ break;
+ case 0xbe ... 0xbf: /* movsx */
+ ctxt->dst.bytes = ctxt->op_bytes;
+ ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val :
+ (s16) ctxt->src.val;
+ break;
+ default:
+ goto cannot_emulate;
+ }
+
+threebyte_insn:
+
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ goto writeback;
+
+cannot_emulate:
+ return EMULATION_FAILED;
+}
+
+void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt)
+{
+ invalidate_registers(ctxt);
+}
+
+void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt)
+{
+ writeback_registers(ctxt);
+}
+
+bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->rep_prefix && (ctxt->d & String))
+ return false;
+
+ if (ctxt->d & TwoMemOp)
+ return false;
+
+ return true;
+}
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
new file mode 100644
index 000000000..ca5a6c3f8
--- /dev/null
+++ b/arch/x86/kvm/hyperv.c
@@ -0,0 +1,1683 @@
+/*
+ * KVM Microsoft Hyper-V emulation
+ *
+ * derived from arch/x86/kvm/x86.c
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com>
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Amit Shah <amit.shah@qumranet.com>
+ * Ben-Ami Yassour <benami@il.ibm.com>
+ * Andrey Smetanin <asmetanin@virtuozzo.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "x86.h"
+#include "lapic.h"
+#include "ioapic.h"
+#include "hyperv.h"
+
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/sched/cputime.h>
+#include <linux/eventfd.h>
+
+#include <asm/apicdef.h>
+#include <trace/events/kvm.h>
+
+#include "trace.h"
+
+static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint)
+{
+ return atomic64_read(&synic->sint[sint]);
+}
+
+static inline int synic_get_sint_vector(u64 sint_value)
+{
+ if (sint_value & HV_SYNIC_SINT_MASKED)
+ return -1;
+ return sint_value & HV_SYNIC_SINT_VECTOR_MASK;
+}
+
+static bool synic_has_vector_connected(struct kvm_vcpu_hv_synic *synic,
+ int vector)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+ if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector)
+ return true;
+ }
+ return false;
+}
+
+static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic,
+ int vector)
+{
+ int i;
+ u64 sint_value;
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+ sint_value = synic_read_sint(synic, i);
+ if (synic_get_sint_vector(sint_value) == vector &&
+ sint_value & HV_SYNIC_SINT_AUTO_EOI)
+ return true;
+ }
+ return false;
+}
+
+static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
+ int vector)
+{
+ if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
+ return;
+
+ if (synic_has_vector_connected(synic, vector))
+ __set_bit(vector, synic->vec_bitmap);
+ else
+ __clear_bit(vector, synic->vec_bitmap);
+
+ if (synic_has_vector_auto_eoi(synic, vector))
+ __set_bit(vector, synic->auto_eoi_bitmap);
+ else
+ __clear_bit(vector, synic->auto_eoi_bitmap);
+}
+
+static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
+ u64 data, bool host)
+{
+ int vector, old_vector;
+ bool masked;
+
+ vector = data & HV_SYNIC_SINT_VECTOR_MASK;
+ masked = data & HV_SYNIC_SINT_MASKED;
+
+ /*
+ * Valid vectors are 16-255, however, nested Hyper-V attempts to write
+ * default '0x10000' value on boot and this should not #GP. We need to
+ * allow zero-initing the register from host as well.
+ */
+ if (vector < HV_SYNIC_FIRST_VALID_VECTOR && !host && !masked)
+ return 1;
+ /*
+ * Guest may configure multiple SINTs to use the same vector, so
+ * we maintain a bitmap of vectors handled by synic, and a
+ * bitmap of vectors with auto-eoi behavior. The bitmaps are
+ * updated here, and atomically queried on fast paths.
+ */
+ old_vector = synic_read_sint(synic, sint) & HV_SYNIC_SINT_VECTOR_MASK;
+
+ atomic64_set(&synic->sint[sint], data);
+
+ synic_update_vector(synic, old_vector);
+
+ synic_update_vector(synic, vector);
+
+ /* Load SynIC vectors into EOI exit bitmap */
+ kvm_make_request(KVM_REQ_SCAN_IOAPIC, synic_to_vcpu(synic));
+ return 0;
+}
+
+static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx)
+{
+ struct kvm_vcpu *vcpu = NULL;
+ int i;
+
+ if (vpidx >= KVM_MAX_VCPUS)
+ return NULL;
+
+ vcpu = kvm_get_vcpu(kvm, vpidx);
+ if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
+ return vcpu;
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ if (vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
+ return vcpu;
+ return NULL;
+}
+
+static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vcpu_hv_synic *synic;
+
+ vcpu = get_vcpu_by_vpidx(kvm, vpidx);
+ if (!vcpu)
+ return NULL;
+ synic = vcpu_to_synic(vcpu);
+ return (synic->active) ? synic : NULL;
+}
+
+static void synic_clear_sint_msg_pending(struct kvm_vcpu_hv_synic *synic,
+ u32 sint)
+{
+ struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct page *page;
+ gpa_t gpa;
+ struct hv_message *msg;
+ struct hv_message_page *msg_page;
+
+ gpa = synic->msg_page & PAGE_MASK;
+ page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
+ if (is_error_page(page)) {
+ vcpu_err(vcpu, "Hyper-V SynIC can't get msg page, gpa 0x%llx\n",
+ gpa);
+ return;
+ }
+ msg_page = kmap_atomic(page);
+
+ msg = &msg_page->sint_message[sint];
+ msg->header.message_flags.msg_pending = 0;
+
+ kunmap_atomic(msg_page);
+ kvm_release_page_dirty(page);
+ kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
+}
+
+static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv_stimer *stimer;
+ int gsi, idx, stimers_pending;
+
+ trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint);
+
+ if (synic->msg_page & HV_SYNIC_SIMP_ENABLE)
+ synic_clear_sint_msg_pending(synic, sint);
+
+ /* Try to deliver pending Hyper-V SynIC timers messages */
+ stimers_pending = 0;
+ for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) {
+ stimer = &hv_vcpu->stimer[idx];
+ if (stimer->msg_pending &&
+ (stimer->config & HV_STIMER_ENABLE) &&
+ HV_STIMER_SINT(stimer->config) == sint) {
+ set_bit(stimer->index,
+ hv_vcpu->stimer_pending_bitmap);
+ stimers_pending++;
+ }
+ }
+ if (stimers_pending)
+ kvm_make_request(KVM_REQ_HV_STIMER, vcpu);
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ gsi = atomic_read(&synic->sint_to_gsi[sint]);
+ if (gsi != -1)
+ kvm_notify_acked_gsi(kvm, gsi);
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr)
+{
+ struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+
+ hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC;
+ hv_vcpu->exit.u.synic.msr = msr;
+ hv_vcpu->exit.u.synic.control = synic->control;
+ hv_vcpu->exit.u.synic.evt_page = synic->evt_page;
+ hv_vcpu->exit.u.synic.msg_page = synic->msg_page;
+
+ kvm_make_request(KVM_REQ_HV_EXIT, vcpu);
+}
+
+static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
+ u32 msr, u64 data, bool host)
+{
+ struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ int ret;
+
+ if (!synic->active && (!host || data))
+ return 1;
+
+ trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host);
+
+ ret = 0;
+ switch (msr) {
+ case HV_X64_MSR_SCONTROL:
+ synic->control = data;
+ if (!host)
+ synic_exit(synic, msr);
+ break;
+ case HV_X64_MSR_SVERSION:
+ if (!host) {
+ ret = 1;
+ break;
+ }
+ synic->version = data;
+ break;
+ case HV_X64_MSR_SIEFP:
+ if ((data & HV_SYNIC_SIEFP_ENABLE) && !host &&
+ !synic->dont_zero_synic_pages)
+ if (kvm_clear_guest(vcpu->kvm,
+ data & PAGE_MASK, PAGE_SIZE)) {
+ ret = 1;
+ break;
+ }
+ synic->evt_page = data;
+ if (!host)
+ synic_exit(synic, msr);
+ break;
+ case HV_X64_MSR_SIMP:
+ if ((data & HV_SYNIC_SIMP_ENABLE) && !host &&
+ !synic->dont_zero_synic_pages)
+ if (kvm_clear_guest(vcpu->kvm,
+ data & PAGE_MASK, PAGE_SIZE)) {
+ ret = 1;
+ break;
+ }
+ synic->msg_page = data;
+ if (!host)
+ synic_exit(synic, msr);
+ break;
+ case HV_X64_MSR_EOM: {
+ int i;
+
+ if (!synic->active)
+ break;
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
+ kvm_hv_notify_acked_sint(vcpu, i);
+ break;
+ }
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ ret = synic_set_sint(synic, msr - HV_X64_MSR_SINT0, data, host);
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+ return ret;
+}
+
+static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata,
+ bool host)
+{
+ int ret;
+
+ if (!synic->active && !host)
+ return 1;
+
+ ret = 0;
+ switch (msr) {
+ case HV_X64_MSR_SCONTROL:
+ *pdata = synic->control;
+ break;
+ case HV_X64_MSR_SVERSION:
+ *pdata = synic->version;
+ break;
+ case HV_X64_MSR_SIEFP:
+ *pdata = synic->evt_page;
+ break;
+ case HV_X64_MSR_SIMP:
+ *pdata = synic->msg_page;
+ break;
+ case HV_X64_MSR_EOM:
+ *pdata = 0;
+ break;
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ *pdata = atomic64_read(&synic->sint[msr - HV_X64_MSR_SINT0]);
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+ return ret;
+}
+
+static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
+{
+ struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct kvm_lapic_irq irq;
+ int ret, vector;
+
+ if (sint >= ARRAY_SIZE(synic->sint))
+ return -EINVAL;
+
+ vector = synic_get_sint_vector(synic_read_sint(synic, sint));
+ if (vector < 0)
+ return -ENOENT;
+
+ memset(&irq, 0, sizeof(irq));
+ irq.shorthand = APIC_DEST_SELF;
+ irq.dest_mode = APIC_DEST_PHYSICAL;
+ irq.delivery_mode = APIC_DM_FIXED;
+ irq.vector = vector;
+ irq.level = 1;
+
+ ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq, NULL);
+ trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret);
+ return ret;
+}
+
+int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint)
+{
+ struct kvm_vcpu_hv_synic *synic;
+
+ synic = synic_get(kvm, vpidx);
+ if (!synic)
+ return -EINVAL;
+
+ return synic_set_irq(synic, sint);
+}
+
+void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector)
+{
+ struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+ int i;
+
+ trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector);
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
+ if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector)
+ kvm_hv_notify_acked_sint(vcpu, i);
+}
+
+static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vpidx, u32 sint, int gsi)
+{
+ struct kvm_vcpu_hv_synic *synic;
+
+ synic = synic_get(kvm, vpidx);
+ if (!synic)
+ return -EINVAL;
+
+ if (sint >= ARRAY_SIZE(synic->sint_to_gsi))
+ return -EINVAL;
+
+ atomic_set(&synic->sint_to_gsi[sint], gsi);
+ return 0;
+}
+
+void kvm_hv_irq_routing_update(struct kvm *kvm)
+{
+ struct kvm_irq_routing_table *irq_rt;
+ struct kvm_kernel_irq_routing_entry *e;
+ u32 gsi;
+
+ irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu,
+ lockdep_is_held(&kvm->irq_lock));
+
+ for (gsi = 0; gsi < irq_rt->nr_rt_entries; gsi++) {
+ hlist_for_each_entry(e, &irq_rt->map[gsi], link) {
+ if (e->type == KVM_IRQ_ROUTING_HV_SINT)
+ kvm_hv_set_sint_gsi(kvm, e->hv_sint.vcpu,
+ e->hv_sint.sint, gsi);
+ }
+ }
+}
+
+static void synic_init(struct kvm_vcpu_hv_synic *synic)
+{
+ int i;
+
+ memset(synic, 0, sizeof(*synic));
+ synic->version = HV_SYNIC_VERSION_1;
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+ atomic64_set(&synic->sint[i], HV_SYNIC_SINT_MASKED);
+ atomic_set(&synic->sint_to_gsi[i], -1);
+ }
+}
+
+static u64 get_time_ref_counter(struct kvm *kvm)
+{
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_vcpu *vcpu;
+ u64 tsc;
+
+ /*
+ * The guest has not set up the TSC page or the clock isn't
+ * stable, fall back to get_kvmclock_ns.
+ */
+ if (!hv->tsc_ref.tsc_sequence)
+ return div_u64(get_kvmclock_ns(kvm), 100);
+
+ vcpu = kvm_get_vcpu(kvm, 0);
+ tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64)
+ + hv->tsc_ref.tsc_offset;
+}
+
+static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
+ bool vcpu_kick)
+{
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+
+ set_bit(stimer->index,
+ vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+ kvm_make_request(KVM_REQ_HV_STIMER, vcpu);
+ if (vcpu_kick)
+ kvm_vcpu_kick(vcpu);
+}
+
+static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+
+ trace_kvm_hv_stimer_cleanup(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index);
+
+ hrtimer_cancel(&stimer->timer);
+ clear_bit(stimer->index,
+ vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+ stimer->msg_pending = false;
+ stimer->exp_time = 0;
+}
+
+static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer)
+{
+ struct kvm_vcpu_hv_stimer *stimer;
+
+ stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer);
+ trace_kvm_hv_stimer_callback(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index);
+ stimer_mark_pending(stimer, true);
+
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * stimer_start() assumptions:
+ * a) stimer->count is not equal to 0
+ * b) stimer->config has HV_STIMER_ENABLE flag
+ */
+static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
+{
+ u64 time_now;
+ ktime_t ktime_now;
+
+ time_now = get_time_ref_counter(stimer_to_vcpu(stimer)->kvm);
+ ktime_now = ktime_get();
+
+ if (stimer->config & HV_STIMER_PERIODIC) {
+ if (stimer->exp_time) {
+ if (time_now >= stimer->exp_time) {
+ u64 remainder;
+
+ div64_u64_rem(time_now - stimer->exp_time,
+ stimer->count, &remainder);
+ stimer->exp_time =
+ time_now + (stimer->count - remainder);
+ }
+ } else
+ stimer->exp_time = time_now + stimer->count;
+
+ trace_kvm_hv_stimer_start_periodic(
+ stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index,
+ time_now, stimer->exp_time);
+
+ hrtimer_start(&stimer->timer,
+ ktime_add_ns(ktime_now,
+ 100 * (stimer->exp_time - time_now)),
+ HRTIMER_MODE_ABS);
+ return 0;
+ }
+ stimer->exp_time = stimer->count;
+ if (time_now >= stimer->count) {
+ /*
+ * Expire timer according to Hypervisor Top-Level Functional
+ * specification v4(15.3.1):
+ * "If a one shot is enabled and the specified count is in
+ * the past, it will expire immediately."
+ */
+ stimer_mark_pending(stimer, false);
+ return 0;
+ }
+
+ trace_kvm_hv_stimer_start_one_shot(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index,
+ time_now, stimer->count);
+
+ hrtimer_start(&stimer->timer,
+ ktime_add_ns(ktime_now, 100 * (stimer->count - time_now)),
+ HRTIMER_MODE_ABS);
+ return 0;
+}
+
+static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
+ bool host)
+{
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+
+ if (!synic->active && (!host || config))
+ return 1;
+
+ trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index, config, host);
+
+ stimer_cleanup(stimer);
+ if ((stimer->config & HV_STIMER_ENABLE) && HV_STIMER_SINT(config) == 0)
+ config &= ~HV_STIMER_ENABLE;
+ stimer->config = config;
+ stimer_mark_pending(stimer, false);
+ return 0;
+}
+
+static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
+ bool host)
+{
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+
+ if (!synic->active && (!host || count))
+ return 1;
+
+ trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index, count, host);
+
+ stimer_cleanup(stimer);
+ stimer->count = count;
+ if (stimer->count == 0)
+ stimer->config &= ~HV_STIMER_ENABLE;
+ else if (stimer->config & HV_STIMER_AUTOENABLE)
+ stimer->config |= HV_STIMER_ENABLE;
+ stimer_mark_pending(stimer, false);
+ return 0;
+}
+
+static int stimer_get_config(struct kvm_vcpu_hv_stimer *stimer, u64 *pconfig)
+{
+ *pconfig = stimer->config;
+ return 0;
+}
+
+static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount)
+{
+ *pcount = stimer->count;
+ return 0;
+}
+
+static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint,
+ struct hv_message *src_msg)
+{
+ struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct page *page;
+ gpa_t gpa;
+ struct hv_message *dst_msg;
+ int r;
+ struct hv_message_page *msg_page;
+
+ if (!(synic->msg_page & HV_SYNIC_SIMP_ENABLE))
+ return -ENOENT;
+
+ gpa = synic->msg_page & PAGE_MASK;
+ page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
+ if (is_error_page(page))
+ return -EFAULT;
+
+ msg_page = kmap_atomic(page);
+ dst_msg = &msg_page->sint_message[sint];
+ if (sync_cmpxchg(&dst_msg->header.message_type, HVMSG_NONE,
+ src_msg->header.message_type) != HVMSG_NONE) {
+ dst_msg->header.message_flags.msg_pending = 1;
+ r = -EAGAIN;
+ } else {
+ memcpy(&dst_msg->u.payload, &src_msg->u.payload,
+ src_msg->header.payload_size);
+ dst_msg->header.message_type = src_msg->header.message_type;
+ dst_msg->header.payload_size = src_msg->header.payload_size;
+ r = synic_set_irq(synic, sint);
+ if (r >= 1)
+ r = 0;
+ else if (r == 0)
+ r = -EFAULT;
+ }
+ kunmap_atomic(msg_page);
+ kvm_release_page_dirty(page);
+ kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
+ return r;
+}
+
+static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct hv_message *msg = &stimer->msg;
+ struct hv_timer_message_payload *payload =
+ (struct hv_timer_message_payload *)&msg->u.payload;
+
+ payload->expiration_time = stimer->exp_time;
+ payload->delivery_time = get_time_ref_counter(vcpu->kvm);
+ return synic_deliver_msg(vcpu_to_synic(vcpu),
+ HV_STIMER_SINT(stimer->config), msg);
+}
+
+static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
+{
+ int r;
+
+ stimer->msg_pending = true;
+ r = stimer_send_msg(stimer);
+ trace_kvm_hv_stimer_expiration(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index, r);
+ if (!r) {
+ stimer->msg_pending = false;
+ if (!(stimer->config & HV_STIMER_PERIODIC))
+ stimer->config &= ~HV_STIMER_ENABLE;
+ }
+}
+
+void kvm_hv_process_stimers(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv_stimer *stimer;
+ u64 time_now, exp_time;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+ if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) {
+ stimer = &hv_vcpu->stimer[i];
+ if (stimer->config & HV_STIMER_ENABLE) {
+ exp_time = stimer->exp_time;
+
+ if (exp_time) {
+ time_now =
+ get_time_ref_counter(vcpu->kvm);
+ if (time_now >= exp_time)
+ stimer_expiration(stimer);
+ }
+
+ if ((stimer->config & HV_STIMER_ENABLE) &&
+ stimer->count) {
+ if (!stimer->msg_pending)
+ stimer_start(stimer);
+ } else
+ stimer_cleanup(stimer);
+ }
+ }
+}
+
+void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+ stimer_cleanup(&hv_vcpu->stimer[i]);
+}
+
+bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE))
+ return false;
+ return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
+}
+EXPORT_SYMBOL_GPL(kvm_hv_assist_page_enabled);
+
+bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu,
+ struct hv_vp_assist_page *assist_page)
+{
+ if (!kvm_hv_assist_page_enabled(vcpu))
+ return false;
+ return !kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data,
+ assist_page, sizeof(*assist_page));
+}
+EXPORT_SYMBOL_GPL(kvm_hv_get_assist_page);
+
+static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct hv_message *msg = &stimer->msg;
+ struct hv_timer_message_payload *payload =
+ (struct hv_timer_message_payload *)&msg->u.payload;
+
+ memset(&msg->header, 0, sizeof(msg->header));
+ msg->header.message_type = HVMSG_TIMER_EXPIRED;
+ msg->header.payload_size = sizeof(*payload);
+
+ payload->timer_index = stimer->index;
+ payload->expiration_time = 0;
+ payload->delivery_time = 0;
+}
+
+static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index)
+{
+ memset(stimer, 0, sizeof(*stimer));
+ stimer->index = timer_index;
+ hrtimer_init(&stimer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ stimer->timer.function = stimer_timer_callback;
+ stimer_prepare_msg(stimer);
+}
+
+void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ int i;
+
+ synic_init(&hv_vcpu->synic);
+
+ bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
+ for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+ stimer_init(&hv_vcpu->stimer[i], i);
+}
+
+void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+
+ hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu);
+}
+
+int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
+{
+ struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+
+ /*
+ * Hyper-V SynIC auto EOI SINT's are
+ * not compatible with APICV, so deactivate APICV
+ */
+ kvm_vcpu_deactivate_apicv(vcpu);
+ synic->active = true;
+ synic->dont_zero_synic_pages = dont_zero_synic_pages;
+ return 0;
+}
+
+static bool kvm_hv_msr_partition_wide(u32 msr)
+{
+ bool r = false;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ case HV_X64_MSR_HYPERCALL:
+ case HV_X64_MSR_REFERENCE_TSC:
+ case HV_X64_MSR_TIME_REF_COUNT:
+ case HV_X64_MSR_CRASH_CTL:
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_RESET:
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ r = true;
+ break;
+ }
+
+ return r;
+}
+
+static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu,
+ u32 index, u64 *pdata)
+{
+ struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ size_t size = ARRAY_SIZE(hv->hv_crash_param);
+
+ if (WARN_ON_ONCE(index >= size))
+ return -EINVAL;
+
+ *pdata = hv->hv_crash_param[array_index_nospec(index, size)];
+ return 0;
+}
+
+static int kvm_hv_msr_get_crash_ctl(struct kvm_vcpu *vcpu, u64 *pdata)
+{
+ struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+
+ *pdata = hv->hv_crash_ctl;
+ return 0;
+}
+
+static int kvm_hv_msr_set_crash_ctl(struct kvm_vcpu *vcpu, u64 data, bool host)
+{
+ struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+
+ if (host)
+ hv->hv_crash_ctl = data & HV_X64_MSR_CRASH_CTL_NOTIFY;
+
+ if (!host && (data & HV_X64_MSR_CRASH_CTL_NOTIFY)) {
+
+ vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n",
+ hv->hv_crash_param[0],
+ hv->hv_crash_param[1],
+ hv->hv_crash_param[2],
+ hv->hv_crash_param[3],
+ hv->hv_crash_param[4]);
+
+ /* Send notification about crash to user space */
+ kvm_make_request(KVM_REQ_HV_CRASH, vcpu);
+ }
+
+ return 0;
+}
+
+static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
+ u32 index, u64 data)
+{
+ struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ size_t size = ARRAY_SIZE(hv->hv_crash_param);
+
+ if (WARN_ON_ONCE(index >= size))
+ return -EINVAL;
+
+ hv->hv_crash_param[array_index_nospec(index, size)] = data;
+ return 0;
+}
+
+/*
+ * The kvmclock and Hyper-V TSC page use similar formulas, and converting
+ * between them is possible:
+ *
+ * kvmclock formula:
+ * nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32)
+ * + system_time
+ *
+ * Hyper-V formula:
+ * nsec/100 = ticks * scale / 2^64 + offset
+ *
+ * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula.
+ * By dividing the kvmclock formula by 100 and equating what's left we get:
+ * ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * scale / 2^64 = tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * scale = tsc_to_system_mul * 2^(32+tsc_shift) / 100
+ *
+ * Now expand the kvmclock formula and divide by 100:
+ * nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32)
+ * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32)
+ * + system_time
+ * nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * + system_time / 100
+ *
+ * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64:
+ * nsec/100 = ticks * scale / 2^64
+ * - tsc_timestamp * scale / 2^64
+ * + system_time / 100
+ *
+ * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out:
+ * offset = system_time / 100 - tsc_timestamp * scale / 2^64
+ *
+ * These two equivalencies are implemented in this function.
+ */
+static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
+ HV_REFERENCE_TSC_PAGE *tsc_ref)
+{
+ u64 max_mul;
+
+ if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT))
+ return false;
+
+ /*
+ * check if scale would overflow, if so we use the time ref counter
+ * tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64
+ * tsc_to_system_mul / 100 >= 2^(32-tsc_shift)
+ * tsc_to_system_mul >= 100 * 2^(32-tsc_shift)
+ */
+ max_mul = 100ull << (32 - hv_clock->tsc_shift);
+ if (hv_clock->tsc_to_system_mul >= max_mul)
+ return false;
+
+ /*
+ * Otherwise compute the scale and offset according to the formulas
+ * derived above.
+ */
+ tsc_ref->tsc_scale =
+ mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift),
+ hv_clock->tsc_to_system_mul,
+ 100);
+
+ tsc_ref->tsc_offset = hv_clock->system_time;
+ do_div(tsc_ref->tsc_offset, 100);
+ tsc_ref->tsc_offset -=
+ mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64);
+ return true;
+}
+
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+ struct pvclock_vcpu_time_info *hv_clock)
+{
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+ u32 tsc_seq;
+ u64 gfn;
+
+ BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence));
+ BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0);
+
+ if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+ return;
+
+ mutex_lock(&kvm->arch.hyperv.hv_lock);
+ if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+ goto out_unlock;
+
+ gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+ /*
+ * Because the TSC parameters only vary when there is a
+ * change in the master clock, do not bother with caching.
+ */
+ if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
+ &tsc_seq, sizeof(tsc_seq))))
+ goto out_unlock;
+
+ /*
+ * While we're computing and writing the parameters, force the
+ * guest to use the time reference count MSR.
+ */
+ hv->tsc_ref.tsc_sequence = 0;
+ if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+ &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+ goto out_unlock;
+
+ if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref))
+ goto out_unlock;
+
+ /* Ensure sequence is zero before writing the rest of the struct. */
+ smp_wmb();
+ if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
+ goto out_unlock;
+
+ /*
+ * Now switch to the TSC page mechanism by writing the sequence.
+ */
+ tsc_seq++;
+ if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0)
+ tsc_seq = 1;
+
+ /* Write the struct entirely before the non-zero sequence. */
+ smp_wmb();
+
+ hv->tsc_ref.tsc_sequence = tsc_seq;
+ kvm_write_guest(kvm, gfn_to_gpa(gfn),
+ &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence));
+out_unlock:
+ mutex_unlock(&kvm->arch.hyperv.hv_lock);
+}
+
+static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
+ bool host)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ hv->hv_guest_os_id = data;
+ /* setting guest os id to zero disables hypercall page */
+ if (!hv->hv_guest_os_id)
+ hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
+ break;
+ case HV_X64_MSR_HYPERCALL: {
+ u64 gfn;
+ unsigned long addr;
+ u8 instructions[4];
+
+ /* if guest os id is not set hypercall should remain disabled */
+ if (!hv->hv_guest_os_id)
+ break;
+ if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
+ hv->hv_hypercall = data;
+ break;
+ }
+ gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return 1;
+ kvm_x86_ops->patch_hypercall(vcpu, instructions);
+ ((unsigned char *)instructions)[3] = 0xc3; /* ret */
+ if (__copy_to_user((void __user *)addr, instructions, 4))
+ return 1;
+ hv->hv_hypercall = data;
+ mark_page_dirty(kvm, gfn);
+ break;
+ }
+ case HV_X64_MSR_REFERENCE_TSC:
+ hv->hv_tsc_page = data;
+ if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+ break;
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ return kvm_hv_msr_set_crash_data(vcpu,
+ msr - HV_X64_MSR_CRASH_P0,
+ data);
+ case HV_X64_MSR_CRASH_CTL:
+ return kvm_hv_msr_set_crash_ctl(vcpu, data, host);
+ case HV_X64_MSR_RESET:
+ if (data == 1) {
+ vcpu_debug(vcpu, "hyper-v reset requested\n");
+ kvm_make_request(KVM_REQ_HV_RESET, vcpu);
+ }
+ break;
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ hv->hv_reenlightenment_control = data;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ hv->hv_tsc_emulation_control = data;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ hv->hv_tsc_emulation_status = data;
+ break;
+ case HV_X64_MSR_TIME_REF_COUNT:
+ /* read-only, but still ignore it if host-initiated */
+ if (!host)
+ return 1;
+ break;
+ default:
+ vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
+ msr, data);
+ return 1;
+ }
+ return 0;
+}
+
+/* Calculate cpu time spent by current task in 100ns units */
+static u64 current_task_runtime_100ns(void)
+{
+ u64 utime, stime;
+
+ task_cputime_adjusted(current, &utime, &stime);
+
+ return div_u64(utime + stime, 100);
+}
+
+static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
+{
+ struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+
+ switch (msr) {
+ case HV_X64_MSR_VP_INDEX: {
+ struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ int vcpu_idx = kvm_vcpu_get_idx(vcpu);
+ u32 new_vp_index = (u32)data;
+
+ if (!host || new_vp_index >= KVM_MAX_VCPUS)
+ return 1;
+
+ if (new_vp_index == hv_vcpu->vp_index)
+ return 0;
+
+ /*
+ * The VP index is initialized to vcpu_index by
+ * kvm_hv_vcpu_postcreate so they initially match. Now the
+ * VP index is changing, adjust num_mismatched_vp_indexes if
+ * it now matches or no longer matches vcpu_idx.
+ */
+ if (hv_vcpu->vp_index == vcpu_idx)
+ atomic_inc(&hv->num_mismatched_vp_indexes);
+ else if (new_vp_index == vcpu_idx)
+ atomic_dec(&hv->num_mismatched_vp_indexes);
+
+ hv_vcpu->vp_index = new_vp_index;
+ break;
+ }
+ case HV_X64_MSR_VP_ASSIST_PAGE: {
+ u64 gfn;
+ unsigned long addr;
+
+ if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) {
+ hv_vcpu->hv_vapic = data;
+ if (kvm_lapic_enable_pv_eoi(vcpu, 0, 0))
+ return 1;
+ break;
+ }
+ gfn = data >> HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT;
+ addr = kvm_vcpu_gfn_to_hva(vcpu, gfn);
+ if (kvm_is_error_hva(addr))
+ return 1;
+ if (__clear_user((void __user *)addr, PAGE_SIZE))
+ return 1;
+ hv_vcpu->hv_vapic = data;
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ if (kvm_lapic_enable_pv_eoi(vcpu,
+ gfn_to_gpa(gfn) | KVM_MSR_ENABLED,
+ sizeof(struct hv_vp_assist_page)))
+ return 1;
+ break;
+ }
+ case HV_X64_MSR_EOI:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
+ case HV_X64_MSR_ICR:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
+ case HV_X64_MSR_TPR:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
+ case HV_X64_MSR_VP_RUNTIME:
+ if (!host)
+ return 1;
+ hv_vcpu->runtime_offset = data - current_task_runtime_100ns();
+ break;
+ case HV_X64_MSR_SCONTROL:
+ case HV_X64_MSR_SVERSION:
+ case HV_X64_MSR_SIEFP:
+ case HV_X64_MSR_SIMP:
+ case HV_X64_MSR_EOM:
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ return synic_set_msr(vcpu_to_synic(vcpu), msr, data, host);
+ case HV_X64_MSR_STIMER0_CONFIG:
+ case HV_X64_MSR_STIMER1_CONFIG:
+ case HV_X64_MSR_STIMER2_CONFIG:
+ case HV_X64_MSR_STIMER3_CONFIG: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
+
+ return stimer_set_config(vcpu_to_stimer(vcpu, timer_index),
+ data, host);
+ }
+ case HV_X64_MSR_STIMER0_COUNT:
+ case HV_X64_MSR_STIMER1_COUNT:
+ case HV_X64_MSR_STIMER2_COUNT:
+ case HV_X64_MSR_STIMER3_COUNT: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
+
+ return stimer_set_count(vcpu_to_stimer(vcpu, timer_index),
+ data, host);
+ }
+ case HV_X64_MSR_TSC_FREQUENCY:
+ case HV_X64_MSR_APIC_FREQUENCY:
+ /* read-only, but still ignore it if host-initiated */
+ if (!host)
+ return 1;
+ break;
+ default:
+ vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
+ msr, data);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ u64 data = 0;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ data = hv->hv_guest_os_id;
+ break;
+ case HV_X64_MSR_HYPERCALL:
+ data = hv->hv_hypercall;
+ break;
+ case HV_X64_MSR_TIME_REF_COUNT:
+ data = get_time_ref_counter(kvm);
+ break;
+ case HV_X64_MSR_REFERENCE_TSC:
+ data = hv->hv_tsc_page;
+ break;
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ return kvm_hv_msr_get_crash_data(vcpu,
+ msr - HV_X64_MSR_CRASH_P0,
+ pdata);
+ case HV_X64_MSR_CRASH_CTL:
+ return kvm_hv_msr_get_crash_ctl(vcpu, pdata);
+ case HV_X64_MSR_RESET:
+ data = 0;
+ break;
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ data = hv->hv_reenlightenment_control;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ data = hv->hv_tsc_emulation_control;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ data = hv->hv_tsc_emulation_status;
+ break;
+ default:
+ vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ return 1;
+ }
+
+ *pdata = data;
+ return 0;
+}
+
+static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
+ bool host)
+{
+ u64 data = 0;
+ struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+
+ switch (msr) {
+ case HV_X64_MSR_VP_INDEX:
+ data = hv_vcpu->vp_index;
+ break;
+ case HV_X64_MSR_EOI:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
+ case HV_X64_MSR_ICR:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
+ case HV_X64_MSR_TPR:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
+ case HV_X64_MSR_VP_ASSIST_PAGE:
+ data = hv_vcpu->hv_vapic;
+ break;
+ case HV_X64_MSR_VP_RUNTIME:
+ data = current_task_runtime_100ns() + hv_vcpu->runtime_offset;
+ break;
+ case HV_X64_MSR_SCONTROL:
+ case HV_X64_MSR_SVERSION:
+ case HV_X64_MSR_SIEFP:
+ case HV_X64_MSR_SIMP:
+ case HV_X64_MSR_EOM:
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata, host);
+ case HV_X64_MSR_STIMER0_CONFIG:
+ case HV_X64_MSR_STIMER1_CONFIG:
+ case HV_X64_MSR_STIMER2_CONFIG:
+ case HV_X64_MSR_STIMER3_CONFIG: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
+
+ return stimer_get_config(vcpu_to_stimer(vcpu, timer_index),
+ pdata);
+ }
+ case HV_X64_MSR_STIMER0_COUNT:
+ case HV_X64_MSR_STIMER1_COUNT:
+ case HV_X64_MSR_STIMER2_COUNT:
+ case HV_X64_MSR_STIMER3_COUNT: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
+
+ return stimer_get_count(vcpu_to_stimer(vcpu, timer_index),
+ pdata);
+ }
+ case HV_X64_MSR_TSC_FREQUENCY:
+ data = (u64)vcpu->arch.virtual_tsc_khz * 1000;
+ break;
+ case HV_X64_MSR_APIC_FREQUENCY:
+ data = APIC_BUS_FREQUENCY;
+ break;
+ default:
+ vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ return 1;
+ }
+ *pdata = data;
+ return 0;
+}
+
+int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
+{
+ if (kvm_hv_msr_partition_wide(msr)) {
+ int r;
+
+ mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock);
+ r = kvm_hv_set_msr_pw(vcpu, msr, data, host);
+ mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock);
+ return r;
+ } else
+ return kvm_hv_set_msr(vcpu, msr, data, host);
+}
+
+int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
+{
+ if (kvm_hv_msr_partition_wide(msr)) {
+ int r;
+
+ mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock);
+ r = kvm_hv_get_msr_pw(vcpu, msr, pdata);
+ mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock);
+ return r;
+ } else
+ return kvm_hv_get_msr(vcpu, msr, pdata, host);
+}
+
+static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int bank_no)
+{
+ int i = 0, j;
+
+ if (!(valid_bank_mask & BIT_ULL(bank_no)))
+ return -1;
+
+ for (j = 0; j < bank_no; j++)
+ if (valid_bank_mask & BIT_ULL(j))
+ i++;
+
+ return i;
+}
+
+static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
+ u16 rep_cnt, bool ex)
+{
+ struct kvm *kvm = current_vcpu->kvm;
+ struct kvm_vcpu_hv *hv_current = &current_vcpu->arch.hyperv;
+ struct hv_tlb_flush_ex flush_ex;
+ struct hv_tlb_flush flush;
+ struct kvm_vcpu *vcpu;
+ unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)] = {0};
+ unsigned long valid_bank_mask = 0;
+ u64 sparse_banks[64];
+ int sparse_banks_len, i;
+ bool all_cpus;
+
+ if (!ex) {
+ if (unlikely(kvm_read_guest(kvm, ingpa, &flush, sizeof(flush))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ trace_kvm_hv_flush_tlb(flush.processor_mask,
+ flush.address_space, flush.flags);
+
+ sparse_banks[0] = flush.processor_mask;
+
+ /*
+ * Work around possible WS2012 bug: it sends hypercalls
+ * with processor_mask = 0x0 and HV_FLUSH_ALL_PROCESSORS clear,
+ * while also expecting us to flush something and crashing if
+ * we don't. Let's treat processor_mask == 0 same as
+ * HV_FLUSH_ALL_PROCESSORS.
+ */
+ all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) ||
+ flush.processor_mask == 0;
+ } else {
+ if (unlikely(kvm_read_guest(kvm, ingpa, &flush_ex,
+ sizeof(flush_ex))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask,
+ flush_ex.hv_vp_set.format,
+ flush_ex.address_space,
+ flush_ex.flags);
+
+ valid_bank_mask = flush_ex.hv_vp_set.valid_bank_mask;
+ all_cpus = flush_ex.hv_vp_set.format !=
+ HV_GENERIC_SET_SPARSE_4K;
+
+ sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) *
+ sizeof(sparse_banks[0]);
+
+ if (!sparse_banks_len && !all_cpus)
+ goto ret_success;
+
+ if (!all_cpus &&
+ kvm_read_guest(kvm,
+ ingpa + offsetof(struct hv_tlb_flush_ex,
+ hv_vp_set.bank_contents),
+ sparse_banks,
+ sparse_banks_len))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
+
+ cpumask_clear(&hv_current->tlb_lush);
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
+ int bank = hv->vp_index / 64, sbank = 0;
+
+ if (!all_cpus) {
+ /* Banks >64 can't be represented */
+ if (bank >= 64)
+ continue;
+
+ /* Non-ex hypercalls can only address first 64 vCPUs */
+ if (!ex && bank)
+ continue;
+
+ if (ex) {
+ /*
+ * Check is the bank of this vCPU is in sparse
+ * set and get the sparse bank number.
+ */
+ sbank = get_sparse_bank_no(valid_bank_mask,
+ bank);
+
+ if (sbank < 0)
+ continue;
+ }
+
+ if (!(sparse_banks[sbank] & BIT_ULL(hv->vp_index % 64)))
+ continue;
+ }
+
+ /*
+ * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we
+ * can't analyze it here, flush TLB regardless of the specified
+ * address space.
+ */
+ __set_bit(i, vcpu_bitmap);
+ }
+
+ kvm_make_vcpus_request_mask(kvm,
+ KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP,
+ vcpu_bitmap, &hv_current->tlb_lush);
+
+ret_success:
+ /* We always do full TLB flush, set rep_done = rep_cnt. */
+ return (u64)HV_STATUS_SUCCESS |
+ ((u64)rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
+}
+
+bool kvm_hv_hypercall_enabled(struct kvm *kvm)
+{
+ return READ_ONCE(kvm->arch.hyperv.hv_hypercall) & HV_X64_MSR_HYPERCALL_ENABLE;
+}
+
+static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
+{
+ bool longmode;
+
+ longmode = is_64_bit_mode(vcpu);
+ if (longmode)
+ kvm_register_write(vcpu, VCPU_REGS_RAX, result);
+ else {
+ kvm_register_write(vcpu, VCPU_REGS_RDX, result >> 32);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, result & 0xffffffff);
+ }
+}
+
+static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result)
+{
+ kvm_hv_hypercall_set_result(vcpu, result);
+ ++vcpu->stat.hypercalls;
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
+{
+ return kvm_hv_hypercall_complete(vcpu, vcpu->run->hyperv.u.hcall.result);
+}
+
+static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
+{
+ struct eventfd_ctx *eventfd;
+
+ if (unlikely(!fast)) {
+ int ret;
+ gpa_t gpa = param;
+
+ if ((gpa & (__alignof__(param) - 1)) ||
+ offset_in_page(gpa) + sizeof(param) > PAGE_SIZE)
+ return HV_STATUS_INVALID_ALIGNMENT;
+
+ ret = kvm_vcpu_read_guest(vcpu, gpa, &param, sizeof(param));
+ if (ret < 0)
+ return HV_STATUS_INVALID_ALIGNMENT;
+ }
+
+ /*
+ * Per spec, bits 32-47 contain the extra "flag number". However, we
+ * have no use for it, and in all known usecases it is zero, so just
+ * report lookup failure if it isn't.
+ */
+ if (param & 0xffff00000000ULL)
+ return HV_STATUS_INVALID_PORT_ID;
+ /* remaining bits are reserved-zero */
+ if (param & ~KVM_HYPERV_CONN_ID_MASK)
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ /* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */
+ rcu_read_lock();
+ eventfd = idr_find(&vcpu->kvm->arch.hyperv.conn_to_evt, param);
+ rcu_read_unlock();
+ if (!eventfd)
+ return HV_STATUS_INVALID_PORT_ID;
+
+ eventfd_signal(eventfd, 1);
+ return HV_STATUS_SUCCESS;
+}
+
+int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
+{
+ u64 param, ingpa, outgpa, ret = HV_STATUS_SUCCESS;
+ uint16_t code, rep_idx, rep_cnt;
+ bool fast, longmode, rep;
+
+ /*
+ * hypercall generates UD from non zero cpl and real mode
+ * per HYPER-V spec
+ */
+ if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ longmode = is_64_bit_mode(vcpu);
+
+ if (!longmode) {
+ param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
+ ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
+ outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
+ }
+#ifdef CONFIG_X86_64
+ else {
+ param = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
+ }
+#endif
+
+ code = param & 0xffff;
+ fast = !!(param & HV_HYPERCALL_FAST_BIT);
+ rep_cnt = (param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff;
+ rep_idx = (param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff;
+ rep = !!(rep_cnt || rep_idx);
+
+ trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
+
+ switch (code) {
+ case HVCALL_NOTIFY_LONG_SPIN_WAIT:
+ if (unlikely(rep)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ kvm_vcpu_on_spin(vcpu, true);
+ break;
+ case HVCALL_SIGNAL_EVENT:
+ if (unlikely(rep)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hvcall_signal_event(vcpu, fast, ingpa);
+ if (ret != HV_STATUS_INVALID_PORT_ID)
+ break;
+ /* maybe userspace knows this conn_id: fall through */
+ case HVCALL_POST_MESSAGE:
+ /* don't bother userspace if it has no way to handle it */
+ if (unlikely(rep || !vcpu_to_synic(vcpu)->active)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+ vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
+ vcpu->run->hyperv.u.hcall.input = param;
+ vcpu->run->hyperv.u.hcall.params[0] = ingpa;
+ vcpu->run->hyperv.u.hcall.params[1] = outgpa;
+ vcpu->arch.complete_userspace_io =
+ kvm_hv_hypercall_complete_userspace;
+ return 0;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
+ if (unlikely(fast || !rep_cnt || rep_idx)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, false);
+ break;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
+ if (unlikely(fast || rep)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, false);
+ break;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
+ if (unlikely(fast || !rep_cnt || rep_idx)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true);
+ break;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
+ if (unlikely(fast || rep)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true);
+ break;
+ default:
+ ret = HV_STATUS_INVALID_HYPERCALL_CODE;
+ break;
+ }
+
+ return kvm_hv_hypercall_complete(vcpu, ret);
+}
+
+void kvm_hv_init_vm(struct kvm *kvm)
+{
+ mutex_init(&kvm->arch.hyperv.hv_lock);
+ idr_init(&kvm->arch.hyperv.conn_to_evt);
+}
+
+void kvm_hv_destroy_vm(struct kvm *kvm)
+{
+ struct eventfd_ctx *eventfd;
+ int i;
+
+ idr_for_each_entry(&kvm->arch.hyperv.conn_to_evt, eventfd, i)
+ eventfd_ctx_put(eventfd);
+ idr_destroy(&kvm->arch.hyperv.conn_to_evt);
+}
+
+static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd)
+{
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct eventfd_ctx *eventfd;
+ int ret;
+
+ eventfd = eventfd_ctx_fdget(fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+
+ mutex_lock(&hv->hv_lock);
+ ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1,
+ GFP_KERNEL);
+ mutex_unlock(&hv->hv_lock);
+
+ if (ret >= 0)
+ return 0;
+
+ if (ret == -ENOSPC)
+ ret = -EEXIST;
+ eventfd_ctx_put(eventfd);
+ return ret;
+}
+
+static int kvm_hv_eventfd_deassign(struct kvm *kvm, u32 conn_id)
+{
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct eventfd_ctx *eventfd;
+
+ mutex_lock(&hv->hv_lock);
+ eventfd = idr_remove(&hv->conn_to_evt, conn_id);
+ mutex_unlock(&hv->hv_lock);
+
+ if (!eventfd)
+ return -ENOENT;
+
+ synchronize_srcu(&kvm->srcu);
+ eventfd_ctx_put(eventfd);
+ return 0;
+}
+
+int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args)
+{
+ if ((args->flags & ~KVM_HYPERV_EVENTFD_DEASSIGN) ||
+ (args->conn_id & ~KVM_HYPERV_CONN_ID_MASK))
+ return -EINVAL;
+
+ if (args->flags == KVM_HYPERV_EVENTFD_DEASSIGN)
+ return kvm_hv_eventfd_deassign(kvm, args->conn_id);
+ return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd);
+}
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
new file mode 100644
index 000000000..0e66c12ed
--- /dev/null
+++ b/arch/x86/kvm/hyperv.h
@@ -0,0 +1,99 @@
+/*
+ * KVM Microsoft Hyper-V emulation
+ *
+ * derived from arch/x86/kvm/x86.c
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com>
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Amit Shah <amit.shah@qumranet.com>
+ * Ben-Ami Yassour <benami@il.ibm.com>
+ * Andrey Smetanin <asmetanin@virtuozzo.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef __ARCH_X86_KVM_HYPERV_H__
+#define __ARCH_X86_KVM_HYPERV_H__
+
+static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu)
+{
+ return &vcpu->arch.hyperv;
+}
+
+static inline struct kvm_vcpu *hv_vcpu_to_vcpu(struct kvm_vcpu_hv *hv_vcpu)
+{
+ struct kvm_vcpu_arch *arch;
+
+ arch = container_of(hv_vcpu, struct kvm_vcpu_arch, hyperv);
+ return container_of(arch, struct kvm_vcpu, arch);
+}
+
+static inline struct kvm_vcpu_hv_synic *vcpu_to_synic(struct kvm_vcpu *vcpu)
+{
+ return &vcpu->arch.hyperv.synic;
+}
+
+static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic)
+{
+ return hv_vcpu_to_vcpu(container_of(synic, struct kvm_vcpu_hv, synic));
+}
+
+int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
+int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host);
+
+bool kvm_hv_hypercall_enabled(struct kvm *kvm);
+int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
+
+void kvm_hv_irq_routing_update(struct kvm *kvm);
+int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint);
+void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector);
+int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages);
+
+void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu);
+void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu);
+void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu);
+
+bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu);
+bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu,
+ struct hv_vp_assist_page *assist_page);
+
+static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu,
+ int timer_index)
+{
+ return &vcpu_to_hv_vcpu(vcpu)->stimer[timer_index];
+}
+
+static inline struct kvm_vcpu *stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu_hv *hv_vcpu;
+
+ hv_vcpu = container_of(stimer - stimer->index, struct kvm_vcpu_hv,
+ stimer[0]);
+ return hv_vcpu_to_vcpu(hv_vcpu);
+}
+
+static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
+{
+ return !bitmap_empty(vcpu->arch.hyperv.stimer_pending_bitmap,
+ HV_SYNIC_STIMER_COUNT);
+}
+
+void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
+
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+ struct pvclock_vcpu_time_info *hv_clock);
+
+void kvm_hv_init_vm(struct kvm *kvm);
+void kvm_hv_destroy_vm(struct kvm *kvm);
+int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
+
+#endif
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
new file mode 100644
index 000000000..af192895b
--- /dev/null
+++ b/arch/x86/kvm/i8254.c
@@ -0,0 +1,737 @@
+/*
+ * 8253/8254 interval timer emulation
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ * Copyright (c) 2006 Intel Corporation
+ * Copyright (c) 2007 Keir Fraser, XenSource Inc
+ * Copyright (c) 2008 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * Authors:
+ * Sheng Yang <sheng.yang@intel.com>
+ * Based on QEMU and Xen.
+ */
+
+#define pr_fmt(fmt) "pit: " fmt
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+
+#include "ioapic.h"
+#include "irq.h"
+#include "i8254.h"
+#include "x86.h"
+
+#ifndef CONFIG_X86_64
+#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
+#else
+#define mod_64(x, y) ((x) % (y))
+#endif
+
+#define RW_STATE_LSB 1
+#define RW_STATE_MSB 2
+#define RW_STATE_WORD0 3
+#define RW_STATE_WORD1 4
+
+static void pit_set_gate(struct kvm_pit *pit, int channel, u32 val)
+{
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
+
+ switch (c->mode) {
+ default:
+ case 0:
+ case 4:
+ /* XXX: just disable/enable counting */
+ break;
+ case 1:
+ case 2:
+ case 3:
+ case 5:
+ /* Restart counting on rising edge. */
+ if (c->gate < val)
+ c->count_load_time = ktime_get();
+ break;
+ }
+
+ c->gate = val;
+}
+
+static int pit_get_gate(struct kvm_pit *pit, int channel)
+{
+ return pit->pit_state.channels[channel].gate;
+}
+
+static s64 __kpit_elapsed(struct kvm_pit *pit)
+{
+ s64 elapsed;
+ ktime_t remaining;
+ struct kvm_kpit_state *ps = &pit->pit_state;
+
+ if (!ps->period)
+ return 0;
+
+ /*
+ * The Counter does not stop when it reaches zero. In
+ * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to
+ * the highest count, either FFFF hex for binary counting
+ * or 9999 for BCD counting, and continues counting.
+ * Modes 2 and 3 are periodic; the Counter reloads
+ * itself with the initial count and continues counting
+ * from there.
+ */
+ remaining = hrtimer_get_remaining(&ps->timer);
+ elapsed = ps->period - ktime_to_ns(remaining);
+
+ return elapsed;
+}
+
+static s64 kpit_elapsed(struct kvm_pit *pit, struct kvm_kpit_channel_state *c,
+ int channel)
+{
+ if (channel == 0)
+ return __kpit_elapsed(pit);
+
+ return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+}
+
+static int pit_get_count(struct kvm_pit *pit, int channel)
+{
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
+ s64 d, t;
+ int counter;
+
+ t = kpit_elapsed(pit, c, channel);
+ d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+
+ switch (c->mode) {
+ case 0:
+ case 1:
+ case 4:
+ case 5:
+ counter = (c->count - d) & 0xffff;
+ break;
+ case 3:
+ /* XXX: may be incorrect for odd counts */
+ counter = c->count - (mod_64((2 * d), c->count));
+ break;
+ default:
+ counter = c->count - mod_64(d, c->count);
+ break;
+ }
+ return counter;
+}
+
+static int pit_get_out(struct kvm_pit *pit, int channel)
+{
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
+ s64 d, t;
+ int out;
+
+ t = kpit_elapsed(pit, c, channel);
+ d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+
+ switch (c->mode) {
+ default:
+ case 0:
+ out = (d >= c->count);
+ break;
+ case 1:
+ out = (d < c->count);
+ break;
+ case 2:
+ out = ((mod_64(d, c->count) == 0) && (d != 0));
+ break;
+ case 3:
+ out = (mod_64(d, c->count) < ((c->count + 1) >> 1));
+ break;
+ case 4:
+ case 5:
+ out = (d == c->count);
+ break;
+ }
+
+ return out;
+}
+
+static void pit_latch_count(struct kvm_pit *pit, int channel)
+{
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
+
+ if (!c->count_latched) {
+ c->latched_count = pit_get_count(pit, channel);
+ c->count_latched = c->rw_mode;
+ }
+}
+
+static void pit_latch_status(struct kvm_pit *pit, int channel)
+{
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
+
+ if (!c->status_latched) {
+ /* TODO: Return NULL COUNT (bit 6). */
+ c->status = ((pit_get_out(pit, channel) << 7) |
+ (c->rw_mode << 4) |
+ (c->mode << 1) |
+ c->bcd);
+ c->status_latched = 1;
+ }
+}
+
+static inline struct kvm_pit *pit_state_to_pit(struct kvm_kpit_state *ps)
+{
+ return container_of(ps, struct kvm_pit, pit_state);
+}
+
+static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+ struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
+ irq_ack_notifier);
+ struct kvm_pit *pit = pit_state_to_pit(ps);
+
+ atomic_set(&ps->irq_ack, 1);
+ /* irq_ack should be set before pending is read. Order accesses with
+ * inc(pending) in pit_timer_fn and xchg(irq_ack, 0) in pit_do_work.
+ */
+ smp_mb();
+ if (atomic_dec_if_positive(&ps->pending) > 0)
+ kthread_queue_work(pit->worker, &pit->expired);
+}
+
+void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pit *pit = vcpu->kvm->arch.vpit;
+ struct hrtimer *timer;
+
+ if (!kvm_vcpu_is_bsp(vcpu) || !pit)
+ return;
+
+ timer = &pit->pit_state.timer;
+ mutex_lock(&pit->pit_state.lock);
+ if (hrtimer_cancel(timer))
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+ mutex_unlock(&pit->pit_state.lock);
+}
+
+static void destroy_pit_timer(struct kvm_pit *pit)
+{
+ hrtimer_cancel(&pit->pit_state.timer);
+ kthread_flush_work(&pit->expired);
+}
+
+static void pit_do_work(struct kthread_work *work)
+{
+ struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
+ struct kvm *kvm = pit->kvm;
+ struct kvm_vcpu *vcpu;
+ int i;
+ struct kvm_kpit_state *ps = &pit->pit_state;
+
+ if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0))
+ return;
+
+ kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false);
+ kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false);
+
+ /*
+ * Provides NMI watchdog support via Virtual Wire mode.
+ * The route is: PIT -> LVT0 in NMI mode.
+ *
+ * Note: Our Virtual Wire implementation does not follow
+ * the MP specification. We propagate a PIT interrupt to all
+ * VCPUs and only when LVT0 is in NMI mode. The interrupt can
+ * also be simultaneously delivered through PIC and IOAPIC.
+ */
+ if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_apic_nmi_wd_deliver(vcpu);
+}
+
+static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
+{
+ struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer);
+ struct kvm_pit *pt = pit_state_to_pit(ps);
+
+ if (atomic_read(&ps->reinject))
+ atomic_inc(&ps->pending);
+
+ kthread_queue_work(pt->worker, &pt->expired);
+
+ if (ps->is_periodic) {
+ hrtimer_add_expires_ns(&ps->timer, ps->period);
+ return HRTIMER_RESTART;
+ } else
+ return HRTIMER_NORESTART;
+}
+
+static inline void kvm_pit_reset_reinject(struct kvm_pit *pit)
+{
+ atomic_set(&pit->pit_state.pending, 0);
+ atomic_set(&pit->pit_state.irq_ack, 1);
+}
+
+void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
+{
+ struct kvm_kpit_state *ps = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+
+ if (atomic_read(&ps->reinject) == reinject)
+ return;
+
+ if (reinject) {
+ /* The initial state is preserved while ps->reinject == 0. */
+ kvm_pit_reset_reinject(pit);
+ kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
+ kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ } else {
+ kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
+ kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ }
+
+ atomic_set(&ps->reinject, reinject);
+}
+
+static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period)
+{
+ struct kvm_kpit_state *ps = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+ s64 interval;
+
+ if (!ioapic_in_kernel(kvm) ||
+ ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
+ return;
+
+ interval = mul_u64_u32_div(val, NSEC_PER_SEC, KVM_PIT_FREQ);
+
+ pr_debug("create pit timer, interval is %llu nsec\n", interval);
+
+ /* TODO The new value only affected after the retriggered */
+ hrtimer_cancel(&ps->timer);
+ kthread_flush_work(&pit->expired);
+ ps->period = interval;
+ ps->is_periodic = is_period;
+
+ kvm_pit_reset_reinject(pit);
+
+ /*
+ * Do not allow the guest to program periodic timers with small
+ * interval, since the hrtimers are not throttled by the host
+ * scheduler.
+ */
+ if (ps->is_periodic) {
+ s64 min_period = min_timer_period_us * 1000LL;
+
+ if (ps->period < min_period) {
+ pr_info_ratelimited(
+ "kvm: requested %lld ns "
+ "i8254 timer period limited to %lld ns\n",
+ ps->period, min_period);
+ ps->period = min_period;
+ }
+ }
+
+ hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval),
+ HRTIMER_MODE_ABS);
+}
+
+static void pit_load_count(struct kvm_pit *pit, int channel, u32 val)
+{
+ struct kvm_kpit_state *ps = &pit->pit_state;
+
+ pr_debug("load_count val is %d, channel is %d\n", val, channel);
+
+ /*
+ * The largest possible initial count is 0; this is equivalent
+ * to 216 for binary counting and 104 for BCD counting.
+ */
+ if (val == 0)
+ val = 0x10000;
+
+ ps->channels[channel].count = val;
+
+ if (channel != 0) {
+ ps->channels[channel].count_load_time = ktime_get();
+ return;
+ }
+
+ /* Two types of timer
+ * mode 1 is one shot, mode 2 is period, otherwise del timer */
+ switch (ps->channels[0].mode) {
+ case 0:
+ case 1:
+ /* FIXME: enhance mode 4 precision */
+ case 4:
+ create_pit_timer(pit, val, 0);
+ break;
+ case 2:
+ case 3:
+ create_pit_timer(pit, val, 1);
+ break;
+ default:
+ destroy_pit_timer(pit);
+ }
+}
+
+void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
+ int hpet_legacy_start)
+{
+ u8 saved_mode;
+
+ WARN_ON_ONCE(!mutex_is_locked(&pit->pit_state.lock));
+
+ if (hpet_legacy_start) {
+ /* save existing mode for later reenablement */
+ WARN_ON(channel != 0);
+ saved_mode = pit->pit_state.channels[0].mode;
+ pit->pit_state.channels[0].mode = 0xff; /* disable timer */
+ pit_load_count(pit, channel, val);
+ pit->pit_state.channels[0].mode = saved_mode;
+ } else {
+ pit_load_count(pit, channel, val);
+ }
+}
+
+static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_pit, dev);
+}
+
+static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_pit, speaker_dev);
+}
+
+static inline int pit_in_range(gpa_t addr)
+{
+ return ((addr >= KVM_PIT_BASE_ADDRESS) &&
+ (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
+}
+
+static int pit_ioport_write(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
+ gpa_t addr, int len, const void *data)
+{
+ struct kvm_pit *pit = dev_to_pit(this);
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ int channel, access;
+ struct kvm_kpit_channel_state *s;
+ u32 val = *(u32 *) data;
+ if (!pit_in_range(addr))
+ return -EOPNOTSUPP;
+
+ val &= 0xff;
+ addr &= KVM_PIT_CHANNEL_MASK;
+
+ mutex_lock(&pit_state->lock);
+
+ if (val != 0)
+ pr_debug("write addr is 0x%x, len is %d, val is 0x%x\n",
+ (unsigned int)addr, len, val);
+
+ if (addr == 3) {
+ channel = val >> 6;
+ if (channel == 3) {
+ /* Read-Back Command. */
+ for (channel = 0; channel < 3; channel++) {
+ s = &pit_state->channels[channel];
+ if (val & (2 << channel)) {
+ if (!(val & 0x20))
+ pit_latch_count(pit, channel);
+ if (!(val & 0x10))
+ pit_latch_status(pit, channel);
+ }
+ }
+ } else {
+ /* Select Counter <channel>. */
+ s = &pit_state->channels[channel];
+ access = (val >> 4) & KVM_PIT_CHANNEL_MASK;
+ if (access == 0) {
+ pit_latch_count(pit, channel);
+ } else {
+ s->rw_mode = access;
+ s->read_state = access;
+ s->write_state = access;
+ s->mode = (val >> 1) & 7;
+ if (s->mode > 5)
+ s->mode -= 4;
+ s->bcd = val & 1;
+ }
+ }
+ } else {
+ /* Write Count. */
+ s = &pit_state->channels[addr];
+ switch (s->write_state) {
+ default:
+ case RW_STATE_LSB:
+ pit_load_count(pit, addr, val);
+ break;
+ case RW_STATE_MSB:
+ pit_load_count(pit, addr, val << 8);
+ break;
+ case RW_STATE_WORD0:
+ s->write_latch = val;
+ s->write_state = RW_STATE_WORD1;
+ break;
+ case RW_STATE_WORD1:
+ pit_load_count(pit, addr, s->write_latch | (val << 8));
+ s->write_state = RW_STATE_WORD0;
+ break;
+ }
+ }
+
+ mutex_unlock(&pit_state->lock);
+ return 0;
+}
+
+static int pit_ioport_read(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
+ gpa_t addr, int len, void *data)
+{
+ struct kvm_pit *pit = dev_to_pit(this);
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ int ret, count;
+ struct kvm_kpit_channel_state *s;
+ if (!pit_in_range(addr))
+ return -EOPNOTSUPP;
+
+ addr &= KVM_PIT_CHANNEL_MASK;
+ if (addr == 3)
+ return 0;
+
+ s = &pit_state->channels[addr];
+
+ mutex_lock(&pit_state->lock);
+
+ if (s->status_latched) {
+ s->status_latched = 0;
+ ret = s->status;
+ } else if (s->count_latched) {
+ switch (s->count_latched) {
+ default:
+ case RW_STATE_LSB:
+ ret = s->latched_count & 0xff;
+ s->count_latched = 0;
+ break;
+ case RW_STATE_MSB:
+ ret = s->latched_count >> 8;
+ s->count_latched = 0;
+ break;
+ case RW_STATE_WORD0:
+ ret = s->latched_count & 0xff;
+ s->count_latched = RW_STATE_MSB;
+ break;
+ }
+ } else {
+ switch (s->read_state) {
+ default:
+ case RW_STATE_LSB:
+ count = pit_get_count(pit, addr);
+ ret = count & 0xff;
+ break;
+ case RW_STATE_MSB:
+ count = pit_get_count(pit, addr);
+ ret = (count >> 8) & 0xff;
+ break;
+ case RW_STATE_WORD0:
+ count = pit_get_count(pit, addr);
+ ret = count & 0xff;
+ s->read_state = RW_STATE_WORD1;
+ break;
+ case RW_STATE_WORD1:
+ count = pit_get_count(pit, addr);
+ ret = (count >> 8) & 0xff;
+ s->read_state = RW_STATE_WORD0;
+ break;
+ }
+ }
+
+ if (len > sizeof(ret))
+ len = sizeof(ret);
+ memcpy(data, (char *)&ret, len);
+
+ mutex_unlock(&pit_state->lock);
+ return 0;
+}
+
+static int speaker_ioport_write(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
+ gpa_t addr, int len, const void *data)
+{
+ struct kvm_pit *pit = speaker_to_pit(this);
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ u32 val = *(u32 *) data;
+ if (addr != KVM_SPEAKER_BASE_ADDRESS)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&pit_state->lock);
+ pit_state->speaker_data_on = (val >> 1) & 1;
+ pit_set_gate(pit, 2, val & 1);
+ mutex_unlock(&pit_state->lock);
+ return 0;
+}
+
+static int speaker_ioport_read(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
+ gpa_t addr, int len, void *data)
+{
+ struct kvm_pit *pit = speaker_to_pit(this);
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ unsigned int refresh_clock;
+ int ret;
+ if (addr != KVM_SPEAKER_BASE_ADDRESS)
+ return -EOPNOTSUPP;
+
+ /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */
+ refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1;
+
+ mutex_lock(&pit_state->lock);
+ ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(pit, 2) |
+ (pit_get_out(pit, 2) << 5) | (refresh_clock << 4));
+ if (len > sizeof(ret))
+ len = sizeof(ret);
+ memcpy(data, (char *)&ret, len);
+ mutex_unlock(&pit_state->lock);
+ return 0;
+}
+
+static void kvm_pit_reset(struct kvm_pit *pit)
+{
+ int i;
+ struct kvm_kpit_channel_state *c;
+
+ pit->pit_state.flags = 0;
+ for (i = 0; i < 3; i++) {
+ c = &pit->pit_state.channels[i];
+ c->mode = 0xff;
+ c->gate = (i != 2);
+ pit_load_count(pit, i, 0);
+ }
+
+ kvm_pit_reset_reinject(pit);
+}
+
+static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
+{
+ struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
+
+ if (!mask)
+ kvm_pit_reset_reinject(pit);
+}
+
+static const struct kvm_io_device_ops pit_dev_ops = {
+ .read = pit_ioport_read,
+ .write = pit_ioport_write,
+};
+
+static const struct kvm_io_device_ops speaker_dev_ops = {
+ .read = speaker_ioport_read,
+ .write = speaker_ioport_write,
+};
+
+struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
+{
+ struct kvm_pit *pit;
+ struct kvm_kpit_state *pit_state;
+ struct pid *pid;
+ pid_t pid_nr;
+ int ret;
+
+ pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
+ if (!pit)
+ return NULL;
+
+ pit->irq_source_id = kvm_request_irq_source_id(kvm);
+ if (pit->irq_source_id < 0)
+ goto fail_request;
+
+ mutex_init(&pit->pit_state.lock);
+
+ pid = get_pid(task_tgid(current));
+ pid_nr = pid_vnr(pid);
+ put_pid(pid);
+
+ pit->worker = kthread_create_worker(0, "kvm-pit/%d", pid_nr);
+ if (IS_ERR(pit->worker))
+ goto fail_kthread;
+
+ kthread_init_work(&pit->expired, pit_do_work);
+
+ pit->kvm = kvm;
+
+ pit_state = &pit->pit_state;
+ hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ pit_state->timer.function = pit_timer_fn;
+
+ pit_state->irq_ack_notifier.gsi = 0;
+ pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
+ pit->mask_notifier.func = pit_mask_notifer;
+
+ kvm_pit_reset(pit);
+
+ kvm_pit_set_reinject(pit, true);
+
+ mutex_lock(&kvm->slots_lock);
+ kvm_iodevice_init(&pit->dev, &pit_dev_ops);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS,
+ KVM_PIT_MEM_LENGTH, &pit->dev);
+ if (ret < 0)
+ goto fail_register_pit;
+
+ if (flags & KVM_PIT_SPEAKER_DUMMY) {
+ kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,
+ KVM_SPEAKER_BASE_ADDRESS, 4,
+ &pit->speaker_dev);
+ if (ret < 0)
+ goto fail_register_speaker;
+ }
+ mutex_unlock(&kvm->slots_lock);
+
+ return pit;
+
+fail_register_speaker:
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
+fail_register_pit:
+ mutex_unlock(&kvm->slots_lock);
+ kvm_pit_set_reinject(pit, false);
+ kthread_destroy_worker(pit->worker);
+fail_kthread:
+ kvm_free_irq_source_id(kvm, pit->irq_source_id);
+fail_request:
+ kfree(pit);
+ return NULL;
+}
+
+void kvm_free_pit(struct kvm *kvm)
+{
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ if (pit) {
+ mutex_lock(&kvm->slots_lock);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev);
+ mutex_unlock(&kvm->slots_lock);
+ kvm_pit_set_reinject(pit, false);
+ hrtimer_cancel(&pit->pit_state.timer);
+ kthread_destroy_worker(pit->worker);
+ kvm_free_irq_source_id(kvm, pit->irq_source_id);
+ kfree(pit);
+ }
+}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
new file mode 100644
index 000000000..394d9527d
--- /dev/null
+++ b/arch/x86/kvm/i8254.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __I8254_H
+#define __I8254_H
+
+#include <linux/kthread.h>
+
+#include <kvm/iodev.h>
+
+struct kvm_kpit_channel_state {
+ u32 count; /* can be 65536 */
+ u16 latched_count;
+ u8 count_latched;
+ u8 status_latched;
+ u8 status;
+ u8 read_state;
+ u8 write_state;
+ u8 write_latch;
+ u8 rw_mode;
+ u8 mode;
+ u8 bcd; /* not supported */
+ u8 gate; /* timer start */
+ ktime_t count_load_time;
+};
+
+struct kvm_kpit_state {
+ /* All members before "struct mutex lock" are protected by the lock. */
+ struct kvm_kpit_channel_state channels[3];
+ u32 flags;
+ bool is_periodic;
+ s64 period; /* unit: ns */
+ struct hrtimer timer;
+ u32 speaker_data_on;
+
+ struct mutex lock;
+ atomic_t reinject;
+ atomic_t pending; /* accumulated triggered timers */
+ atomic_t irq_ack;
+ struct kvm_irq_ack_notifier irq_ack_notifier;
+};
+
+struct kvm_pit {
+ struct kvm_io_device dev;
+ struct kvm_io_device speaker_dev;
+ struct kvm *kvm;
+ struct kvm_kpit_state pit_state;
+ int irq_source_id;
+ struct kvm_irq_mask_notifier mask_notifier;
+ struct kthread_worker *worker;
+ struct kthread_work expired;
+};
+
+#define KVM_PIT_BASE_ADDRESS 0x40
+#define KVM_SPEAKER_BASE_ADDRESS 0x61
+#define KVM_PIT_MEM_LENGTH 4
+#define KVM_PIT_FREQ 1193181
+#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
+#define KVM_PIT_CHANNEL_MASK 0x3
+
+struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
+void kvm_free_pit(struct kvm *kvm);
+
+void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
+ int hpet_legacy_start);
+void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject);
+
+#endif
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
new file mode 100644
index 000000000..38a36a1cc
--- /dev/null
+++ b/arch/x86/kvm/i8259.c
@@ -0,0 +1,655 @@
+/*
+ * 8259 interrupt controller emulation
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ * Copyright (c) 2007 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ * Authors:
+ * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ * Port from Qemu.
+ */
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include "irq.h"
+
+#include <linux/kvm_host.h>
+#include "trace.h"
+
+#define pr_pic_unimpl(fmt, ...) \
+ pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__)
+
+static void pic_irq_request(struct kvm *kvm, int level);
+
+static void pic_lock(struct kvm_pic *s)
+ __acquires(&s->lock)
+{
+ spin_lock(&s->lock);
+}
+
+static void pic_unlock(struct kvm_pic *s)
+ __releases(&s->lock)
+{
+ bool wakeup = s->wakeup_needed;
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ s->wakeup_needed = false;
+
+ spin_unlock(&s->lock);
+
+ if (wakeup) {
+ kvm_for_each_vcpu(i, vcpu, s->kvm) {
+ if (kvm_apic_accept_pic_intr(vcpu)) {
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ return;
+ }
+ }
+ }
+}
+
+static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
+{
+ s->isr &= ~(1 << irq);
+ if (s != &s->pics_state->pics[0])
+ irq += 8;
+ /*
+ * We are dropping lock while calling ack notifiers since ack
+ * notifier callbacks for assigned devices call into PIC recursively.
+ * Other interrupt may be delivered to PIC while lock is dropped but
+ * it should be safe since PIC state is already updated at this stage.
+ */
+ pic_unlock(s->pics_state);
+ kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
+ pic_lock(s->pics_state);
+}
+
+/*
+ * set irq level. If an edge is detected, then the IRR is set to 1
+ */
+static inline int pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
+{
+ int mask, ret = 1;
+ mask = 1 << irq;
+ if (s->elcr & mask) /* level triggered */
+ if (level) {
+ ret = !(s->irr & mask);
+ s->irr |= mask;
+ s->last_irr |= mask;
+ } else {
+ s->irr &= ~mask;
+ s->last_irr &= ~mask;
+ }
+ else /* edge triggered */
+ if (level) {
+ if ((s->last_irr & mask) == 0) {
+ ret = !(s->irr & mask);
+ s->irr |= mask;
+ }
+ s->last_irr |= mask;
+ } else
+ s->last_irr &= ~mask;
+
+ return (s->imr & mask) ? -1 : ret;
+}
+
+/*
+ * return the highest priority found in mask (highest = smallest
+ * number). Return 8 if no irq
+ */
+static inline int get_priority(struct kvm_kpic_state *s, int mask)
+{
+ int priority;
+ if (mask == 0)
+ return 8;
+ priority = 0;
+ while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0)
+ priority++;
+ return priority;
+}
+
+/*
+ * return the pic wanted interrupt. return -1 if none
+ */
+static int pic_get_irq(struct kvm_kpic_state *s)
+{
+ int mask, cur_priority, priority;
+
+ mask = s->irr & ~s->imr;
+ priority = get_priority(s, mask);
+ if (priority == 8)
+ return -1;
+ /*
+ * compute current priority. If special fully nested mode on the
+ * master, the IRQ coming from the slave is not taken into account
+ * for the priority computation.
+ */
+ mask = s->isr;
+ if (s->special_fully_nested_mode && s == &s->pics_state->pics[0])
+ mask &= ~(1 << 2);
+ cur_priority = get_priority(s, mask);
+ if (priority < cur_priority)
+ /*
+ * higher priority found: an irq should be generated
+ */
+ return (priority + s->priority_add) & 7;
+ else
+ return -1;
+}
+
+/*
+ * raise irq to CPU if necessary. must be called every time the active
+ * irq may change
+ */
+static void pic_update_irq(struct kvm_pic *s)
+{
+ int irq2, irq;
+
+ irq2 = pic_get_irq(&s->pics[1]);
+ if (irq2 >= 0) {
+ /*
+ * if irq request by slave pic, signal master PIC
+ */
+ pic_set_irq1(&s->pics[0], 2, 1);
+ pic_set_irq1(&s->pics[0], 2, 0);
+ }
+ irq = pic_get_irq(&s->pics[0]);
+ pic_irq_request(s->kvm, irq >= 0);
+}
+
+void kvm_pic_update_irq(struct kvm_pic *s)
+{
+ pic_lock(s);
+ pic_update_irq(s);
+ pic_unlock(s);
+}
+
+int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
+{
+ int ret, irq_level;
+
+ BUG_ON(irq < 0 || irq >= PIC_NUM_PINS);
+
+ pic_lock(s);
+ irq_level = __kvm_irq_line_state(&s->irq_states[irq],
+ irq_source_id, level);
+ ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level);
+ pic_update_irq(s);
+ trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
+ s->pics[irq >> 3].imr, ret == 0);
+ pic_unlock(s);
+
+ return ret;
+}
+
+void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id)
+{
+ int i;
+
+ pic_lock(s);
+ for (i = 0; i < PIC_NUM_PINS; i++)
+ __clear_bit(irq_source_id, &s->irq_states[i]);
+ pic_unlock(s);
+}
+
+/*
+ * acknowledge interrupt 'irq'
+ */
+static inline void pic_intack(struct kvm_kpic_state *s, int irq)
+{
+ s->isr |= 1 << irq;
+ /*
+ * We don't clear a level sensitive interrupt here
+ */
+ if (!(s->elcr & (1 << irq)))
+ s->irr &= ~(1 << irq);
+
+ if (s->auto_eoi) {
+ if (s->rotate_on_auto_eoi)
+ s->priority_add = (irq + 1) & 7;
+ pic_clear_isr(s, irq);
+ }
+
+}
+
+int kvm_pic_read_irq(struct kvm *kvm)
+{
+ int irq, irq2, intno;
+ struct kvm_pic *s = kvm->arch.vpic;
+
+ s->output = 0;
+
+ pic_lock(s);
+ irq = pic_get_irq(&s->pics[0]);
+ if (irq >= 0) {
+ pic_intack(&s->pics[0], irq);
+ if (irq == 2) {
+ irq2 = pic_get_irq(&s->pics[1]);
+ if (irq2 >= 0)
+ pic_intack(&s->pics[1], irq2);
+ else
+ /*
+ * spurious IRQ on slave controller
+ */
+ irq2 = 7;
+ intno = s->pics[1].irq_base + irq2;
+ irq = irq2 + 8;
+ } else
+ intno = s->pics[0].irq_base + irq;
+ } else {
+ /*
+ * spurious IRQ on host controller
+ */
+ irq = 7;
+ intno = s->pics[0].irq_base + irq;
+ }
+ pic_update_irq(s);
+ pic_unlock(s);
+
+ return intno;
+}
+
+static void kvm_pic_reset(struct kvm_kpic_state *s)
+{
+ int irq, i;
+ struct kvm_vcpu *vcpu;
+ u8 edge_irr = s->irr & ~s->elcr;
+ bool found = false;
+
+ s->last_irr = 0;
+ s->irr &= s->elcr;
+ s->imr = 0;
+ s->priority_add = 0;
+ s->special_mask = 0;
+ s->read_reg_select = 0;
+ if (!s->init4) {
+ s->special_fully_nested_mode = 0;
+ s->auto_eoi = 0;
+ }
+ s->init_state = 1;
+
+ kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm)
+ if (kvm_apic_accept_pic_intr(vcpu)) {
+ found = true;
+ break;
+ }
+
+
+ if (!found)
+ return;
+
+ for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
+ if (edge_irr & (1 << irq))
+ pic_clear_isr(s, irq);
+}
+
+static void pic_ioport_write(void *opaque, u32 addr, u32 val)
+{
+ struct kvm_kpic_state *s = opaque;
+ int priority, cmd, irq;
+
+ addr &= 1;
+ if (addr == 0) {
+ if (val & 0x10) {
+ s->init4 = val & 1;
+ if (val & 0x02)
+ pr_pic_unimpl("single mode not supported");
+ if (val & 0x08)
+ pr_pic_unimpl(
+ "level sensitive irq not supported");
+ kvm_pic_reset(s);
+ } else if (val & 0x08) {
+ if (val & 0x04)
+ s->poll = 1;
+ if (val & 0x02)
+ s->read_reg_select = val & 1;
+ if (val & 0x40)
+ s->special_mask = (val >> 5) & 1;
+ } else {
+ cmd = val >> 5;
+ switch (cmd) {
+ case 0:
+ case 4:
+ s->rotate_on_auto_eoi = cmd >> 2;
+ break;
+ case 1: /* end of interrupt */
+ case 5:
+ priority = get_priority(s, s->isr);
+ if (priority != 8) {
+ irq = (priority + s->priority_add) & 7;
+ if (cmd == 5)
+ s->priority_add = (irq + 1) & 7;
+ pic_clear_isr(s, irq);
+ pic_update_irq(s->pics_state);
+ }
+ break;
+ case 3:
+ irq = val & 7;
+ pic_clear_isr(s, irq);
+ pic_update_irq(s->pics_state);
+ break;
+ case 6:
+ s->priority_add = (val + 1) & 7;
+ pic_update_irq(s->pics_state);
+ break;
+ case 7:
+ irq = val & 7;
+ s->priority_add = (irq + 1) & 7;
+ pic_clear_isr(s, irq);
+ pic_update_irq(s->pics_state);
+ break;
+ default:
+ break; /* no operation */
+ }
+ }
+ } else
+ switch (s->init_state) {
+ case 0: { /* normal mode */
+ u8 imr_diff = s->imr ^ val,
+ off = (s == &s->pics_state->pics[0]) ? 0 : 8;
+ s->imr = val;
+ for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
+ if (imr_diff & (1 << irq))
+ kvm_fire_mask_notifiers(
+ s->pics_state->kvm,
+ SELECT_PIC(irq + off),
+ irq + off,
+ !!(s->imr & (1 << irq)));
+ pic_update_irq(s->pics_state);
+ break;
+ }
+ case 1:
+ s->irq_base = val & 0xf8;
+ s->init_state = 2;
+ break;
+ case 2:
+ if (s->init4)
+ s->init_state = 3;
+ else
+ s->init_state = 0;
+ break;
+ case 3:
+ s->special_fully_nested_mode = (val >> 4) & 1;
+ s->auto_eoi = (val >> 1) & 1;
+ s->init_state = 0;
+ break;
+ }
+}
+
+static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
+{
+ int ret;
+
+ ret = pic_get_irq(s);
+ if (ret >= 0) {
+ if (addr1 >> 7) {
+ s->pics_state->pics[0].isr &= ~(1 << 2);
+ s->pics_state->pics[0].irr &= ~(1 << 2);
+ }
+ s->irr &= ~(1 << ret);
+ pic_clear_isr(s, ret);
+ if (addr1 >> 7 || ret != 2)
+ pic_update_irq(s->pics_state);
+ } else {
+ ret = 0x07;
+ pic_update_irq(s->pics_state);
+ }
+
+ return ret;
+}
+
+static u32 pic_ioport_read(void *opaque, u32 addr)
+{
+ struct kvm_kpic_state *s = opaque;
+ int ret;
+
+ if (s->poll) {
+ ret = pic_poll_read(s, addr);
+ s->poll = 0;
+ } else
+ if ((addr & 1) == 0)
+ if (s->read_reg_select)
+ ret = s->isr;
+ else
+ ret = s->irr;
+ else
+ ret = s->imr;
+ return ret;
+}
+
+static void elcr_ioport_write(void *opaque, u32 addr, u32 val)
+{
+ struct kvm_kpic_state *s = opaque;
+ s->elcr = val & s->elcr_mask;
+}
+
+static u32 elcr_ioport_read(void *opaque, u32 addr1)
+{
+ struct kvm_kpic_state *s = opaque;
+ return s->elcr;
+}
+
+static int picdev_write(struct kvm_pic *s,
+ gpa_t addr, int len, const void *val)
+{
+ unsigned char data = *(unsigned char *)val;
+
+ if (len != 1) {
+ pr_pic_unimpl("non byte write\n");
+ return 0;
+ }
+ switch (addr) {
+ case 0x20:
+ case 0x21:
+ pic_lock(s);
+ pic_ioport_write(&s->pics[0], addr, data);
+ pic_unlock(s);
+ break;
+ case 0xa0:
+ case 0xa1:
+ pic_lock(s);
+ pic_ioport_write(&s->pics[1], addr, data);
+ pic_unlock(s);
+ break;
+ case 0x4d0:
+ case 0x4d1:
+ pic_lock(s);
+ elcr_ioport_write(&s->pics[addr & 1], addr, data);
+ pic_unlock(s);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ return 0;
+}
+
+static int picdev_read(struct kvm_pic *s,
+ gpa_t addr, int len, void *val)
+{
+ unsigned char *data = (unsigned char *)val;
+
+ if (len != 1) {
+ memset(val, 0, len);
+ pr_pic_unimpl("non byte read\n");
+ return 0;
+ }
+ switch (addr) {
+ case 0x20:
+ case 0x21:
+ case 0xa0:
+ case 0xa1:
+ pic_lock(s);
+ *data = pic_ioport_read(&s->pics[addr >> 7], addr);
+ pic_unlock(s);
+ break;
+ case 0x4d0:
+ case 0x4d1:
+ pic_lock(s);
+ *data = elcr_ioport_read(&s->pics[addr & 1], addr);
+ pic_unlock(s);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ return 0;
+}
+
+static int picdev_master_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, const void *val)
+{
+ return picdev_write(container_of(dev, struct kvm_pic, dev_master),
+ addr, len, val);
+}
+
+static int picdev_master_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, void *val)
+{
+ return picdev_read(container_of(dev, struct kvm_pic, dev_master),
+ addr, len, val);
+}
+
+static int picdev_slave_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, const void *val)
+{
+ return picdev_write(container_of(dev, struct kvm_pic, dev_slave),
+ addr, len, val);
+}
+
+static int picdev_slave_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, void *val)
+{
+ return picdev_read(container_of(dev, struct kvm_pic, dev_slave),
+ addr, len, val);
+}
+
+static int picdev_eclr_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, const void *val)
+{
+ return picdev_write(container_of(dev, struct kvm_pic, dev_eclr),
+ addr, len, val);
+}
+
+static int picdev_eclr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, void *val)
+{
+ return picdev_read(container_of(dev, struct kvm_pic, dev_eclr),
+ addr, len, val);
+}
+
+/*
+ * callback when PIC0 irq status changed
+ */
+static void pic_irq_request(struct kvm *kvm, int level)
+{
+ struct kvm_pic *s = kvm->arch.vpic;
+
+ if (!s->output)
+ s->wakeup_needed = true;
+ s->output = level;
+}
+
+static const struct kvm_io_device_ops picdev_master_ops = {
+ .read = picdev_master_read,
+ .write = picdev_master_write,
+};
+
+static const struct kvm_io_device_ops picdev_slave_ops = {
+ .read = picdev_slave_read,
+ .write = picdev_slave_write,
+};
+
+static const struct kvm_io_device_ops picdev_eclr_ops = {
+ .read = picdev_eclr_read,
+ .write = picdev_eclr_write,
+};
+
+int kvm_pic_init(struct kvm *kvm)
+{
+ struct kvm_pic *s;
+ int ret;
+
+ s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+ spin_lock_init(&s->lock);
+ s->kvm = kvm;
+ s->pics[0].elcr_mask = 0xf8;
+ s->pics[1].elcr_mask = 0xde;
+ s->pics[0].pics_state = s;
+ s->pics[1].pics_state = s;
+
+ /*
+ * Initialize PIO device
+ */
+ kvm_iodevice_init(&s->dev_master, &picdev_master_ops);
+ kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops);
+ kvm_iodevice_init(&s->dev_eclr, &picdev_eclr_ops);
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2,
+ &s->dev_master);
+ if (ret < 0)
+ goto fail_unlock;
+
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0xa0, 2, &s->dev_slave);
+ if (ret < 0)
+ goto fail_unreg_2;
+
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_eclr);
+ if (ret < 0)
+ goto fail_unreg_1;
+
+ mutex_unlock(&kvm->slots_lock);
+
+ kvm->arch.vpic = s;
+
+ return 0;
+
+fail_unreg_1:
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave);
+
+fail_unreg_2:
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_master);
+
+fail_unlock:
+ mutex_unlock(&kvm->slots_lock);
+
+ kfree(s);
+
+ return ret;
+}
+
+void kvm_pic_destroy(struct kvm *kvm)
+{
+ struct kvm_pic *vpic = kvm->arch.vpic;
+
+ if (!vpic)
+ return;
+
+ mutex_lock(&kvm->slots_lock);
+ kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master);
+ kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave);
+ kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr);
+ mutex_unlock(&kvm->slots_lock);
+
+ kvm->arch.vpic = NULL;
+ kfree(vpic);
+}
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
new file mode 100644
index 000000000..bac2ec9b4
--- /dev/null
+++ b/arch/x86/kvm/ioapic.c
@@ -0,0 +1,685 @@
+/*
+ * Copyright (C) 2001 MandrakeSoft S.A.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * MandrakeSoft S.A.
+ * 43, rue d'Aboukir
+ * 75002 Paris - France
+ * http://www.linux-mandrake.com/
+ * http://www.mandrakesoft.com/
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Yunhong Jiang <yunhong.jiang@intel.com>
+ * Yaozu (Eddie) Dong <eddie.dong@intel.com>
+ * Based on Xen 3.1 code.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+#include <linux/hrtimer.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/nospec.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <trace/events/kvm.h>
+
+#include "ioapic.h"
+#include "lapic.h"
+#include "irq.h"
+
+#if 0
+#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
+#else
+#define ioapic_debug(fmt, arg...)
+#endif
+static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
+ bool line_status);
+
+static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
+ unsigned long addr,
+ unsigned long length)
+{
+ unsigned long result = 0;
+
+ switch (ioapic->ioregsel) {
+ case IOAPIC_REG_VERSION:
+ result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16)
+ | (IOAPIC_VERSION_ID & 0xff));
+ break;
+
+ case IOAPIC_REG_APIC_ID:
+ case IOAPIC_REG_ARB_ID:
+ result = ((ioapic->id & 0xf) << 24);
+ break;
+
+ default:
+ {
+ u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
+ u64 redir_content = ~0ULL;
+
+ if (redir_index < IOAPIC_NUM_PINS) {
+ u32 index = array_index_nospec(
+ redir_index, IOAPIC_NUM_PINS);
+
+ redir_content = ioapic->redirtbl[index].bits;
+ }
+
+ result = (ioapic->ioregsel & 0x1) ?
+ (redir_content >> 32) & 0xffffffff :
+ redir_content & 0xffffffff;
+ break;
+ }
+ }
+
+ return result;
+}
+
+static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic)
+{
+ ioapic->rtc_status.pending_eoi = 0;
+ bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID);
+}
+
+static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic);
+
+static void rtc_status_pending_eoi_check_valid(struct kvm_ioapic *ioapic)
+{
+ if (WARN_ON(ioapic->rtc_status.pending_eoi < 0))
+ kvm_rtc_eoi_tracking_restore_all(ioapic);
+}
+
+static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
+{
+ bool new_val, old_val;
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+ struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
+ union kvm_ioapic_redirect_entry *e;
+
+ e = &ioapic->redirtbl[RTC_GSI];
+ if (!kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id,
+ e->fields.dest_mode))
+ return;
+
+ new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector);
+ old_val = test_bit(vcpu->vcpu_id, dest_map->map);
+
+ if (new_val == old_val)
+ return;
+
+ if (new_val) {
+ __set_bit(vcpu->vcpu_id, dest_map->map);
+ dest_map->vectors[vcpu->vcpu_id] = e->fields.vector;
+ ioapic->rtc_status.pending_eoi++;
+ } else {
+ __clear_bit(vcpu->vcpu_id, dest_map->map);
+ ioapic->rtc_status.pending_eoi--;
+ rtc_status_pending_eoi_check_valid(ioapic);
+ }
+}
+
+void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
+{
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ __rtc_irq_eoi_tracking_restore_one(vcpu);
+ spin_unlock(&ioapic->lock);
+}
+
+static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ if (RTC_GSI >= IOAPIC_NUM_PINS)
+ return;
+
+ rtc_irq_eoi_tracking_reset(ioapic);
+ kvm_for_each_vcpu(i, vcpu, ioapic->kvm)
+ __rtc_irq_eoi_tracking_restore_one(vcpu);
+}
+
+static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu)
+{
+ if (test_and_clear_bit(vcpu->vcpu_id,
+ ioapic->rtc_status.dest_map.map)) {
+ --ioapic->rtc_status.pending_eoi;
+ rtc_status_pending_eoi_check_valid(ioapic);
+ }
+}
+
+static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic)
+{
+ if (ioapic->rtc_status.pending_eoi > 0)
+ return true; /* coalesced */
+
+ return false;
+}
+
+static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
+ int irq_level, bool line_status)
+{
+ union kvm_ioapic_redirect_entry entry;
+ u32 mask = 1 << irq;
+ u32 old_irr;
+ int edge, ret;
+
+ entry = ioapic->redirtbl[irq];
+ edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
+
+ if (!irq_level) {
+ ioapic->irr &= ~mask;
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * Return 0 for coalesced interrupts; for edge-triggered interrupts,
+ * this only happens if a previous edge has not been delivered due
+ * do masking. For level interrupts, the remote_irr field tells
+ * us if the interrupt is waiting for an EOI.
+ *
+ * RTC is special: it is edge-triggered, but userspace likes to know
+ * if it has been already ack-ed via EOI because coalesced RTC
+ * interrupts lead to time drift in Windows guests. So we track
+ * EOI manually for the RTC interrupt.
+ */
+ if (irq == RTC_GSI && line_status &&
+ rtc_irq_check_coalesced(ioapic)) {
+ ret = 0;
+ goto out;
+ }
+
+ old_irr = ioapic->irr;
+ ioapic->irr |= mask;
+ if (edge) {
+ ioapic->irr_delivered &= ~mask;
+ if (old_irr == ioapic->irr) {
+ ret = 0;
+ goto out;
+ }
+ }
+
+ ret = ioapic_service(ioapic, irq, line_status);
+
+out:
+ trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
+ return ret;
+}
+
+static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
+{
+ u32 idx;
+
+ rtc_irq_eoi_tracking_reset(ioapic);
+ for_each_set_bit(idx, &irr, IOAPIC_NUM_PINS)
+ ioapic_set_irq(ioapic, idx, 1, true);
+
+ kvm_rtc_eoi_tracking_restore_all(ioapic);
+}
+
+
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors)
+{
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+ struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
+ union kvm_ioapic_redirect_entry *e;
+ int index;
+
+ spin_lock(&ioapic->lock);
+
+ /* Make sure we see any missing RTC EOI */
+ if (test_bit(vcpu->vcpu_id, dest_map->map))
+ __set_bit(dest_map->vectors[vcpu->vcpu_id],
+ ioapic_handled_vectors);
+
+ for (index = 0; index < IOAPIC_NUM_PINS; index++) {
+ e = &ioapic->redirtbl[index];
+ if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
+ kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) ||
+ index == RTC_GSI) {
+ if (kvm_apic_match_dest(vcpu, NULL, 0,
+ e->fields.dest_id, e->fields.dest_mode) ||
+ kvm_apic_pending_eoi(vcpu, e->fields.vector))
+ __set_bit(e->fields.vector,
+ ioapic_handled_vectors);
+ }
+ }
+ spin_unlock(&ioapic->lock);
+}
+
+void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm)
+{
+ if (!ioapic_in_kernel(kvm))
+ return;
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
+{
+ unsigned index;
+ bool mask_before, mask_after;
+ int old_remote_irr, old_delivery_status;
+ union kvm_ioapic_redirect_entry *e;
+
+ switch (ioapic->ioregsel) {
+ case IOAPIC_REG_VERSION:
+ /* Writes are ignored. */
+ break;
+
+ case IOAPIC_REG_APIC_ID:
+ ioapic->id = (val >> 24) & 0xf;
+ break;
+
+ case IOAPIC_REG_ARB_ID:
+ break;
+
+ default:
+ index = (ioapic->ioregsel - 0x10) >> 1;
+
+ ioapic_debug("change redir index %x val %x\n", index, val);
+ if (index >= IOAPIC_NUM_PINS)
+ return;
+ index = array_index_nospec(index, IOAPIC_NUM_PINS);
+ e = &ioapic->redirtbl[index];
+ mask_before = e->fields.mask;
+ /* Preserve read-only fields */
+ old_remote_irr = e->fields.remote_irr;
+ old_delivery_status = e->fields.delivery_status;
+ if (ioapic->ioregsel & 1) {
+ e->bits &= 0xffffffff;
+ e->bits |= (u64) val << 32;
+ } else {
+ e->bits &= ~0xffffffffULL;
+ e->bits |= (u32) val;
+ }
+ e->fields.remote_irr = old_remote_irr;
+ e->fields.delivery_status = old_delivery_status;
+
+ /*
+ * Some OSes (Linux, Xen) assume that Remote IRR bit will
+ * be cleared by IOAPIC hardware when the entry is configured
+ * as edge-triggered. This behavior is used to simulate an
+ * explicit EOI on IOAPICs that don't have the EOI register.
+ */
+ if (e->fields.trig_mode == IOAPIC_EDGE_TRIG)
+ e->fields.remote_irr = 0;
+
+ mask_after = e->fields.mask;
+ if (mask_before != mask_after)
+ kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
+ if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
+ && ioapic->irr & (1 << index))
+ ioapic_service(ioapic, index, false);
+ kvm_make_scan_ioapic_request(ioapic->kvm);
+ break;
+ }
+}
+
+static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
+{
+ union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
+ struct kvm_lapic_irq irqe;
+ int ret;
+
+ if (entry->fields.mask ||
+ (entry->fields.trig_mode == IOAPIC_LEVEL_TRIG &&
+ entry->fields.remote_irr))
+ return -1;
+
+ ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
+ "vector=%x trig_mode=%x\n",
+ entry->fields.dest_id, entry->fields.dest_mode,
+ entry->fields.delivery_mode, entry->fields.vector,
+ entry->fields.trig_mode);
+
+ irqe.dest_id = entry->fields.dest_id;
+ irqe.vector = entry->fields.vector;
+ irqe.dest_mode = entry->fields.dest_mode;
+ irqe.trig_mode = entry->fields.trig_mode;
+ irqe.delivery_mode = entry->fields.delivery_mode << 8;
+ irqe.level = 1;
+ irqe.shorthand = 0;
+ irqe.msi_redir_hint = false;
+
+ if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
+ ioapic->irr_delivered |= 1 << irq;
+
+ if (irq == RTC_GSI && line_status) {
+ /*
+ * pending_eoi cannot ever become negative (see
+ * rtc_status_pending_eoi_check_valid) and the caller
+ * ensures that it is only called if it is >= zero, namely
+ * if rtc_irq_check_coalesced returns false).
+ */
+ BUG_ON(ioapic->rtc_status.pending_eoi != 0);
+ ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe,
+ &ioapic->rtc_status.dest_map);
+ ioapic->rtc_status.pending_eoi = (ret < 0 ? 0 : ret);
+ } else
+ ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL);
+
+ if (ret && irqe.trig_mode == IOAPIC_LEVEL_TRIG)
+ entry->fields.remote_irr = 1;
+
+ return ret;
+}
+
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
+ int level, bool line_status)
+{
+ int ret, irq_level;
+
+ BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
+
+ spin_lock(&ioapic->lock);
+ irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
+ irq_source_id, level);
+ ret = ioapic_set_irq(ioapic, irq, irq_level, line_status);
+
+ spin_unlock(&ioapic->lock);
+
+ return ret;
+}
+
+void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
+{
+ int i;
+
+ spin_lock(&ioapic->lock);
+ for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
+ __clear_bit(irq_source_id, &ioapic->irq_states[i]);
+ spin_unlock(&ioapic->lock);
+}
+
+static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
+{
+ int i;
+ struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic,
+ eoi_inject.work);
+ spin_lock(&ioapic->lock);
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+ if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG)
+ continue;
+
+ if (ioapic->irr & (1 << i) && !ent->fields.remote_irr)
+ ioapic_service(ioapic, i, false);
+ }
+ spin_unlock(&ioapic->lock);
+}
+
+#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000
+
+static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
+ struct kvm_ioapic *ioapic, int vector, int trigger_mode)
+{
+ struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ int i;
+
+ /* RTC special handling */
+ if (test_bit(vcpu->vcpu_id, dest_map->map) &&
+ vector == dest_map->vectors[vcpu->vcpu_id])
+ rtc_irq_eoi(ioapic, vcpu);
+
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+ if (ent->fields.vector != vector)
+ continue;
+
+ /*
+ * We are dropping lock while calling ack notifiers because ack
+ * notifier callbacks for assigned devices call into IOAPIC
+ * recursively. Since remote_irr is cleared only after call
+ * to notifiers if the same vector will be delivered while lock
+ * is dropped it will be put into irr and will be delivered
+ * after ack notifier returns.
+ */
+ spin_unlock(&ioapic->lock);
+ kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i);
+ spin_lock(&ioapic->lock);
+
+ if (trigger_mode != IOAPIC_LEVEL_TRIG ||
+ kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)
+ continue;
+
+ ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
+ ent->fields.remote_irr = 0;
+ if (!ent->fields.mask && (ioapic->irr & (1 << i))) {
+ ++ioapic->irq_eoi[i];
+ if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {
+ /*
+ * Real hardware does not deliver the interrupt
+ * immediately during eoi broadcast, and this
+ * lets a buggy guest make slow progress
+ * even if it does not correctly handle a
+ * level-triggered interrupt. Emulate this
+ * behavior if we detect an interrupt storm.
+ */
+ schedule_delayed_work(&ioapic->eoi_inject, HZ / 100);
+ ioapic->irq_eoi[i] = 0;
+ trace_kvm_ioapic_delayed_eoi_inj(ent->bits);
+ } else {
+ ioapic_service(ioapic, i, false);
+ }
+ } else {
+ ioapic->irq_eoi[i] = 0;
+ }
+ }
+}
+
+void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode)
+{
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ __kvm_ioapic_update_eoi(vcpu, ioapic, vector, trigger_mode);
+ spin_unlock(&ioapic->lock);
+}
+
+static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_ioapic, dev);
+}
+
+static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr)
+{
+ return ((addr >= ioapic->base_address &&
+ (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
+}
+
+static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+ gpa_t addr, int len, void *val)
+{
+ struct kvm_ioapic *ioapic = to_ioapic(this);
+ u32 result;
+ if (!ioapic_in_range(ioapic, addr))
+ return -EOPNOTSUPP;
+
+ ioapic_debug("addr %lx\n", (unsigned long)addr);
+ ASSERT(!(addr & 0xf)); /* check alignment */
+
+ addr &= 0xff;
+ spin_lock(&ioapic->lock);
+ switch (addr) {
+ case IOAPIC_REG_SELECT:
+ result = ioapic->ioregsel;
+ break;
+
+ case IOAPIC_REG_WINDOW:
+ result = ioapic_read_indirect(ioapic, addr, len);
+ break;
+
+ default:
+ result = 0;
+ break;
+ }
+ spin_unlock(&ioapic->lock);
+
+ switch (len) {
+ case 8:
+ *(u64 *) val = result;
+ break;
+ case 1:
+ case 2:
+ case 4:
+ memcpy(val, (char *)&result, len);
+ break;
+ default:
+ printk(KERN_WARNING "ioapic: wrong length %d\n", len);
+ }
+ return 0;
+}
+
+static int ioapic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+ gpa_t addr, int len, const void *val)
+{
+ struct kvm_ioapic *ioapic = to_ioapic(this);
+ u32 data;
+ if (!ioapic_in_range(ioapic, addr))
+ return -EOPNOTSUPP;
+
+ ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n",
+ (void*)addr, len, val);
+ ASSERT(!(addr & 0xf)); /* check alignment */
+
+ switch (len) {
+ case 8:
+ case 4:
+ data = *(u32 *) val;
+ break;
+ case 2:
+ data = *(u16 *) val;
+ break;
+ case 1:
+ data = *(u8 *) val;
+ break;
+ default:
+ printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
+ return 0;
+ }
+
+ addr &= 0xff;
+ spin_lock(&ioapic->lock);
+ switch (addr) {
+ case IOAPIC_REG_SELECT:
+ ioapic->ioregsel = data & 0xFF; /* 8-bit register */
+ break;
+
+ case IOAPIC_REG_WINDOW:
+ ioapic_write_indirect(ioapic, data);
+ break;
+
+ default:
+ break;
+ }
+ spin_unlock(&ioapic->lock);
+ return 0;
+}
+
+static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
+{
+ int i;
+
+ cancel_delayed_work_sync(&ioapic->eoi_inject);
+ for (i = 0; i < IOAPIC_NUM_PINS; i++)
+ ioapic->redirtbl[i].fields.mask = 1;
+ ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
+ ioapic->ioregsel = 0;
+ ioapic->irr = 0;
+ ioapic->irr_delivered = 0;
+ ioapic->id = 0;
+ memset(ioapic->irq_eoi, 0x00, sizeof(ioapic->irq_eoi));
+ rtc_irq_eoi_tracking_reset(ioapic);
+}
+
+static const struct kvm_io_device_ops ioapic_mmio_ops = {
+ .read = ioapic_mmio_read,
+ .write = ioapic_mmio_write,
+};
+
+int kvm_ioapic_init(struct kvm *kvm)
+{
+ struct kvm_ioapic *ioapic;
+ int ret;
+
+ ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
+ if (!ioapic)
+ return -ENOMEM;
+ spin_lock_init(&ioapic->lock);
+ INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
+ kvm->arch.vioapic = ioapic;
+ kvm_ioapic_reset(ioapic);
+ kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
+ ioapic->kvm = kvm;
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, ioapic->base_address,
+ IOAPIC_MEM_LENGTH, &ioapic->dev);
+ mutex_unlock(&kvm->slots_lock);
+ if (ret < 0) {
+ kvm->arch.vioapic = NULL;
+ kfree(ioapic);
+ }
+
+ return ret;
+}
+
+void kvm_ioapic_destroy(struct kvm *kvm)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ if (!ioapic)
+ return;
+
+ cancel_delayed_work_sync(&ioapic->eoi_inject);
+ mutex_lock(&kvm->slots_lock);
+ kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+ mutex_unlock(&kvm->slots_lock);
+ kvm->arch.vioapic = NULL;
+ kfree(ioapic);
+}
+
+void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
+ state->irr &= ~ioapic->irr_delivered;
+ spin_unlock(&ioapic->lock);
+}
+
+void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
+ ioapic->irr = 0;
+ ioapic->irr_delivered = 0;
+ kvm_make_scan_ioapic_request(kvm);
+ kvm_ioapic_inject_all(ioapic, state->irr);
+ spin_unlock(&ioapic->lock);
+}
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
new file mode 100644
index 000000000..ea1a4e029
--- /dev/null
+++ b/arch/x86/kvm/ioapic.h
@@ -0,0 +1,138 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_IO_APIC_H
+#define __KVM_IO_APIC_H
+
+#include <linux/kvm_host.h>
+
+#include <kvm/iodev.h>
+
+struct kvm;
+struct kvm_vcpu;
+
+#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
+#define MAX_NR_RESERVED_IOAPIC_PINS KVM_MAX_IRQ_ROUTES
+#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
+#define IOAPIC_EDGE_TRIG 0
+#define IOAPIC_LEVEL_TRIG 1
+
+#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
+#define IOAPIC_MEM_LENGTH 0x100
+
+/* Direct registers. */
+#define IOAPIC_REG_SELECT 0x00
+#define IOAPIC_REG_WINDOW 0x10
+
+/* Indirect registers. */
+#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
+#define IOAPIC_REG_VERSION 0x01
+#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
+
+/*ioapic delivery mode*/
+#define IOAPIC_FIXED 0x0
+#define IOAPIC_LOWEST_PRIORITY 0x1
+#define IOAPIC_PMI 0x2
+#define IOAPIC_NMI 0x4
+#define IOAPIC_INIT 0x5
+#define IOAPIC_EXTINT 0x7
+
+#ifdef CONFIG_X86
+#define RTC_GSI 8
+#else
+#define RTC_GSI -1U
+#endif
+
+struct dest_map {
+ /* vcpu bitmap where IRQ has been sent */
+ DECLARE_BITMAP(map, KVM_MAX_VCPU_ID);
+
+ /*
+ * Vector sent to a given vcpu, only valid when
+ * the vcpu's bit in map is set
+ */
+ u8 vectors[KVM_MAX_VCPU_ID];
+};
+
+
+struct rtc_status {
+ int pending_eoi;
+ struct dest_map dest_map;
+};
+
+union kvm_ioapic_redirect_entry {
+ u64 bits;
+ struct {
+ u8 vector;
+ u8 delivery_mode:3;
+ u8 dest_mode:1;
+ u8 delivery_status:1;
+ u8 polarity:1;
+ u8 remote_irr:1;
+ u8 trig_mode:1;
+ u8 mask:1;
+ u8 reserve:7;
+ u8 reserved[4];
+ u8 dest_id;
+ } fields;
+};
+
+struct kvm_ioapic {
+ u64 base_address;
+ u32 ioregsel;
+ u32 id;
+ u32 irr;
+ u32 pad;
+ union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS];
+ unsigned long irq_states[IOAPIC_NUM_PINS];
+ struct kvm_io_device dev;
+ struct kvm *kvm;
+ void (*ack_notifier)(void *opaque, int irq);
+ spinlock_t lock;
+ struct rtc_status rtc_status;
+ struct delayed_work eoi_inject;
+ u32 irq_eoi[IOAPIC_NUM_PINS];
+ u32 irr_delivered;
+};
+
+#ifdef DEBUG
+#define ASSERT(x) \
+do { \
+ if (!(x)) { \
+ printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
+ __FILE__, __LINE__, #x); \
+ BUG(); \
+ } \
+} while (0)
+#else
+#define ASSERT(x) do { } while (0)
+#endif
+
+static inline int ioapic_in_kernel(struct kvm *kvm)
+{
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode == KVM_IRQCHIP_KERNEL;
+}
+
+void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
+bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+ int short_hand, unsigned int dest, int dest_mode);
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
+void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
+ int trigger_mode);
+int kvm_ioapic_init(struct kvm *kvm);
+void kvm_ioapic_destroy(struct kvm *kvm);
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
+ int level, bool line_status);
+void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq,
+ struct dest_map *dest_map);
+void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors);
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors);
+#endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
new file mode 100644
index 000000000..295ebadb8
--- /dev/null
+++ b/arch/x86/kvm/irq.c
@@ -0,0 +1,164 @@
+/*
+ * irq.c: API for in kernel interrupt controller
+ * Copyright (c) 2007, Intel Corporation.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ */
+
+#include <linux/export.h>
+#include <linux/kvm_host.h>
+
+#include "irq.h"
+#include "i8254.h"
+#include "x86.h"
+
+/*
+ * check if there are pending timer events
+ * to be processed.
+ */
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ if (lapic_in_kernel(vcpu))
+ return apic_has_pending_timer(vcpu);
+
+ return 0;
+}
+EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
+
+/*
+ * check if there is a pending userspace external interrupt
+ */
+static int pending_userspace_extint(struct kvm_vcpu *v)
+{
+ return v->arch.pending_external_vector != -1;
+}
+
+/*
+ * check if there is pending interrupt from
+ * non-APIC source without intack.
+ */
+int kvm_cpu_has_extint(struct kvm_vcpu *v)
+{
+ /*
+ * FIXME: interrupt.injected represents an interrupt whose
+ * side-effects have already been applied (e.g. bit from IRR
+ * already moved to ISR). Therefore, it is incorrect to rely
+ * on interrupt.injected to know if there is a pending
+ * interrupt in the user-mode LAPIC.
+ * This leads to nVMX/nSVM not be able to distinguish
+ * if it should exit from L2 to L1 on EXTERNAL_INTERRUPT on
+ * pending interrupt or should re-inject an injected
+ * interrupt.
+ */
+ if (!lapic_in_kernel(v))
+ return v->arch.interrupt.injected;
+
+ if (!kvm_apic_accept_pic_intr(v))
+ return 0;
+
+ if (irqchip_split(v->kvm))
+ return pending_userspace_extint(v);
+ else
+ return v->kvm->arch.vpic->output;
+}
+
+/*
+ * check if there is injectable interrupt:
+ * when virtual interrupt delivery enabled,
+ * interrupt from apic will handled by hardware,
+ * we don't need to check it here.
+ */
+int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
+{
+ if (kvm_cpu_has_extint(v))
+ return 1;
+
+ if (!is_guest_mode(v) && kvm_vcpu_apicv_active(v))
+ return 0;
+
+ return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
+}
+
+/*
+ * check if there is pending interrupt without
+ * intack.
+ */
+int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
+{
+ if (kvm_cpu_has_extint(v))
+ return 1;
+
+ return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
+}
+EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
+
+/*
+ * Read pending interrupt(from non-APIC source)
+ * vector and intack.
+ */
+static int kvm_cpu_get_extint(struct kvm_vcpu *v)
+{
+ if (!kvm_cpu_has_extint(v)) {
+ WARN_ON(!lapic_in_kernel(v));
+ return -1;
+ }
+
+ if (!lapic_in_kernel(v))
+ return v->arch.interrupt.nr;
+
+ if (irqchip_split(v->kvm)) {
+ int vector = v->arch.pending_external_vector;
+
+ v->arch.pending_external_vector = -1;
+ return vector;
+ } else
+ return kvm_pic_read_irq(v->kvm); /* PIC */
+}
+
+/*
+ * Read pending interrupt vector and intack.
+ */
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
+{
+ int vector = kvm_cpu_get_extint(v);
+ if (vector != -1)
+ return vector; /* PIC */
+
+ return kvm_get_apic_interrupt(v); /* APIC */
+}
+EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
+
+void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
+{
+ if (lapic_in_kernel(vcpu))
+ kvm_inject_apic_timer_irqs(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
+
+void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
+{
+ __kvm_migrate_apic_timer(vcpu);
+ __kvm_migrate_pit_timer(vcpu);
+}
+
+bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
+{
+ bool resample = args->flags & KVM_IRQFD_FLAG_RESAMPLE;
+
+ return resample ? irqchip_kernel(kvm) : irqchip_in_kernel(kvm);
+}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
new file mode 100644
index 000000000..fd210cdd4
--- /dev/null
+++ b/arch/x86/kvm/irq.h
@@ -0,0 +1,130 @@
+/*
+ * irq.h: in kernel interrupt controller related definitions
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ */
+
+#ifndef __IRQ_H
+#define __IRQ_H
+
+#include <linux/mm_types.h>
+#include <linux/hrtimer.h>
+#include <linux/kvm_host.h>
+#include <linux/spinlock.h>
+
+#include <kvm/iodev.h>
+#include "ioapic.h"
+#include "lapic.h"
+
+#define PIC_NUM_PINS 16
+#define SELECT_PIC(irq) \
+ ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE)
+
+struct kvm;
+struct kvm_vcpu;
+
+struct kvm_kpic_state {
+ u8 last_irr; /* edge detection */
+ u8 irr; /* interrupt request register */
+ u8 imr; /* interrupt mask register */
+ u8 isr; /* interrupt service register */
+ u8 priority_add; /* highest irq priority */
+ u8 irq_base;
+ u8 read_reg_select;
+ u8 poll;
+ u8 special_mask;
+ u8 init_state;
+ u8 auto_eoi;
+ u8 rotate_on_auto_eoi;
+ u8 special_fully_nested_mode;
+ u8 init4; /* true if 4 byte init */
+ u8 elcr; /* PIIX edge/trigger selection */
+ u8 elcr_mask;
+ u8 isr_ack; /* interrupt ack detection */
+ struct kvm_pic *pics_state;
+};
+
+struct kvm_pic {
+ spinlock_t lock;
+ bool wakeup_needed;
+ unsigned pending_acks;
+ struct kvm *kvm;
+ struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
+ int output; /* intr from master PIC */
+ struct kvm_io_device dev_master;
+ struct kvm_io_device dev_slave;
+ struct kvm_io_device dev_eclr;
+ void (*ack_notifier)(void *opaque, int irq);
+ unsigned long irq_states[PIC_NUM_PINS];
+};
+
+int kvm_pic_init(struct kvm *kvm);
+void kvm_pic_destroy(struct kvm *kvm);
+int kvm_pic_read_irq(struct kvm *kvm);
+void kvm_pic_update_irq(struct kvm_pic *s);
+
+static inline int pic_in_kernel(struct kvm *kvm)
+{
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode == KVM_IRQCHIP_KERNEL;
+}
+
+static inline int irqchip_split(struct kvm *kvm)
+{
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode == KVM_IRQCHIP_SPLIT;
+}
+
+static inline int irqchip_kernel(struct kvm *kvm)
+{
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode == KVM_IRQCHIP_KERNEL;
+}
+
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode != KVM_IRQCHIP_NONE;
+}
+
+bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args);
+void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
+void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
+void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
+void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
+
+int apic_has_pending_timer(struct kvm_vcpu *vcpu);
+
+int kvm_setup_default_irq_routing(struct kvm *kvm);
+int kvm_setup_empty_irq_routing(struct kvm *kvm);
+
+#endif
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
new file mode 100644
index 000000000..4d000aea0
--- /dev/null
+++ b/arch/x86/kvm/irq_comm.c
@@ -0,0 +1,441 @@
+/*
+ * irq_comm.c: Common API for in kernel interrupt controller
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/rculist.h>
+
+#include <trace/events/kvm.h>
+
+#include <asm/msidef.h>
+
+#include "irq.h"
+
+#include "ioapic.h"
+
+#include "lapic.h"
+
+#include "hyperv.h"
+#include "x86.h"
+
+static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
+}
+
+static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level,
+ line_status);
+}
+
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, struct dest_map *dest_map)
+{
+ int i, r = -1;
+ struct kvm_vcpu *vcpu, *lowest = NULL;
+ unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+ unsigned int dest_vcpus = 0;
+
+ if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
+ kvm_lowest_prio_delivery(irq)) {
+ printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
+ irq->delivery_mode = APIC_DM_FIXED;
+ }
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
+ return r;
+
+ memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (!kvm_lowest_prio_delivery(irq)) {
+ if (r < 0)
+ r = 0;
+ r += kvm_apic_set_irq(vcpu, irq, dest_map);
+ } else if (kvm_lapic_enabled(vcpu)) {
+ if (!kvm_vector_hashing_enabled()) {
+ if (!lowest)
+ lowest = vcpu;
+ else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
+ lowest = vcpu;
+ } else {
+ __set_bit(i, dest_vcpu_bitmap);
+ dest_vcpus++;
+ }
+ }
+ }
+
+ if (dest_vcpus != 0) {
+ int idx = kvm_vector_to_index(irq->vector, dest_vcpus,
+ dest_vcpu_bitmap, KVM_MAX_VCPUS);
+
+ lowest = kvm_get_vcpu(kvm, idx);
+ }
+
+ if (lowest)
+ r = kvm_apic_set_irq(lowest, irq, dest_map);
+
+ return r;
+}
+
+void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
+ struct kvm_lapic_irq *irq)
+{
+ trace_kvm_msi_set_irq(e->msi.address_lo | (kvm->arch.x2apic_format ?
+ (u64)e->msi.address_hi << 32 : 0),
+ e->msi.data);
+
+ irq->dest_id = (e->msi.address_lo &
+ MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
+ if (kvm->arch.x2apic_format)
+ irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi);
+ irq->vector = (e->msi.data &
+ MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
+ irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
+ irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
+ irq->delivery_mode = e->msi.data & 0x700;
+ irq->msi_redir_hint = ((e->msi.address_lo
+ & MSI_ADDR_REDIRECTION_LOWPRI) > 0);
+ irq->level = 1;
+ irq->shorthand = 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
+
+static inline bool kvm_msi_route_invalid(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e)
+{
+ return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level, bool line_status)
+{
+ struct kvm_lapic_irq irq;
+
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
+ if (!level)
+ return -1;
+
+ kvm_set_msi_irq(kvm, e, &irq);
+
+ return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
+}
+
+
+static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ if (!level)
+ return -1;
+
+ return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint);
+}
+
+int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ struct kvm_lapic_irq irq;
+ int r;
+
+ switch (e->type) {
+ case KVM_IRQ_ROUTING_HV_SINT:
+ return kvm_hv_set_sint(e, kvm, irq_source_id, level,
+ line_status);
+
+ case KVM_IRQ_ROUTING_MSI:
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
+ kvm_set_msi_irq(kvm, e, &irq);
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
+ return r;
+ break;
+
+ default:
+ break;
+ }
+
+ return -EWOULDBLOCK;
+}
+
+int kvm_request_irq_source_id(struct kvm *kvm)
+{
+ unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
+ int irq_source_id;
+
+ mutex_lock(&kvm->irq_lock);
+ irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
+
+ if (irq_source_id >= BITS_PER_LONG) {
+ printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n");
+ irq_source_id = -EFAULT;
+ goto unlock;
+ }
+
+ ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+ ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+ set_bit(irq_source_id, bitmap);
+unlock:
+ mutex_unlock(&kvm->irq_lock);
+
+ return irq_source_id;
+}
+
+void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
+{
+ ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+ ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+
+ mutex_lock(&kvm->irq_lock);
+ if (irq_source_id < 0 ||
+ irq_source_id >= BITS_PER_LONG) {
+ printk(KERN_ERR "kvm: IRQ source ID out of range!\n");
+ goto unlock;
+ }
+ clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
+ if (!irqchip_kernel(kvm))
+ goto unlock;
+
+ kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
+ kvm_pic_clear_all(kvm->arch.vpic, irq_source_id);
+unlock:
+ mutex_unlock(&kvm->irq_lock);
+}
+
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ mutex_lock(&kvm->irq_lock);
+ kimn->irq = irq;
+ hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list);
+ mutex_unlock(&kvm->irq_lock);
+}
+
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ mutex_lock(&kvm->irq_lock);
+ hlist_del_rcu(&kimn->link);
+ mutex_unlock(&kvm->irq_lock);
+ synchronize_srcu(&kvm->irq_srcu);
+}
+
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask)
+{
+ struct kvm_irq_mask_notifier *kimn;
+ int idx, gsi;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
+ if (gsi != -1)
+ hlist_for_each_entry_rcu(kimn, &kvm->arch.mask_notifier_list, link)
+ if (kimn->irq == gsi)
+ kimn->func(kimn, mask);
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm);
+}
+
+int kvm_set_routing_entry(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+{
+ /* We can't check irqchip_in_kernel() here as some callers are
+ * currently inititalizing the irqchip. Other callers should therefore
+ * check kvm_arch_can_set_irq_routing() before calling this function.
+ */
+ switch (ue->type) {
+ case KVM_IRQ_ROUTING_IRQCHIP:
+ if (irqchip_split(kvm))
+ return -EINVAL;
+ e->irqchip.pin = ue->u.irqchip.pin;
+ switch (ue->u.irqchip.irqchip) {
+ case KVM_IRQCHIP_PIC_SLAVE:
+ e->irqchip.pin += PIC_NUM_PINS / 2;
+ /* fall through */
+ case KVM_IRQCHIP_PIC_MASTER:
+ if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
+ return -EINVAL;
+ e->set = kvm_set_pic_irq;
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
+ return -EINVAL;
+ e->set = kvm_set_ioapic_irq;
+ break;
+ default:
+ return -EINVAL;
+ }
+ e->irqchip.irqchip = ue->u.irqchip.irqchip;
+ break;
+ case KVM_IRQ_ROUTING_MSI:
+ e->set = kvm_set_msi;
+ e->msi.address_lo = ue->u.msi.address_lo;
+ e->msi.address_hi = ue->u.msi.address_hi;
+ e->msi.data = ue->u.msi.data;
+
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+ break;
+ case KVM_IRQ_ROUTING_HV_SINT:
+ e->set = kvm_hv_set_sint;
+ e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
+ e->hv_sint.sint = ue->u.hv_sint.sint;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu)
+{
+ int i, r = 0;
+ struct kvm_vcpu *vcpu;
+
+ if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
+ return true;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (++r == 2)
+ return false;
+
+ *dest_vcpu = vcpu;
+ }
+
+ return r == 1;
+}
+EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
+
+#define IOAPIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
+#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
+
+#define PIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
+#define ROUTING_ENTRY2(irq) \
+ IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
+
+static const struct kvm_irq_routing_entry default_routing[] = {
+ ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
+ ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
+ ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
+ ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
+ ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
+ ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
+ ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
+ ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
+ ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
+ ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
+ ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
+ ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
+};
+
+int kvm_setup_default_irq_routing(struct kvm *kvm)
+{
+ return kvm_set_irq_routing(kvm, default_routing,
+ ARRAY_SIZE(default_routing), 0);
+}
+
+static const struct kvm_irq_routing_entry empty_routing[] = {};
+
+int kvm_setup_empty_irq_routing(struct kvm *kvm)
+{
+ return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
+}
+
+void kvm_arch_post_irq_routing_update(struct kvm *kvm)
+{
+ if (!irqchip_split(kvm))
+ return;
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_kernel_irq_routing_entry *entry;
+ struct kvm_irq_routing_table *table;
+ u32 i, nr_ioapic_pins;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
+ kvm->arch.nr_reserved_ioapic_pins);
+ for (i = 0; i < nr_ioapic_pins; ++i) {
+ hlist_for_each_entry(entry, &table->map[i], link) {
+ struct kvm_lapic_irq irq;
+
+ if (entry->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+
+ kvm_set_msi_irq(vcpu->kvm, entry, &irq);
+
+ if (irq.trig_mode && kvm_apic_match_dest(vcpu, NULL, 0,
+ irq.dest_id, irq.dest_mode))
+ __set_bit(irq.vector, ioapic_handled_vectors);
+ }
+ }
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+void kvm_arch_irq_routing_update(struct kvm *kvm)
+{
+ kvm_hv_irq_routing_update(kvm);
+}
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
new file mode 100644
index 000000000..7df600410
--- /dev/null
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_KVM_CACHE_REGS_H
+#define ASM_KVM_CACHE_REGS_H
+
+#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
+#define KVM_POSSIBLE_CR4_GUEST_BITS \
+ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE | X86_CR4_TSD)
+
+static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
+ kvm_x86_ops->cache_reg(vcpu, reg);
+
+ return vcpu->arch.regs[reg];
+}
+
+static inline void kvm_register_write(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg,
+ unsigned long val)
+{
+ vcpu->arch.regs[reg] = val;
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
+{
+ return kvm_register_read(vcpu, VCPU_REGS_RIP);
+}
+
+static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ kvm_register_write(vcpu, VCPU_REGS_RIP, val);
+}
+
+static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
+{
+ might_sleep(); /* on svm */
+
+ if (!test_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail))
+ kvm_x86_ops->cache_reg(vcpu, (enum kvm_reg)VCPU_EXREG_PDPTR);
+
+ return vcpu->arch.walk_mmu->pdptrs[index];
+}
+
+static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
+{
+ ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
+ if (tmask & vcpu->arch.cr0_guest_owned_bits)
+ kvm_x86_ops->decache_cr0_guest_bits(vcpu);
+ return vcpu->arch.cr0 & mask;
+}
+
+static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, ~0UL);
+}
+
+static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
+{
+ ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
+ if (tmask & vcpu->arch.cr4_guest_owned_bits)
+ kvm_x86_ops->decache_cr4_guest_bits(vcpu);
+ return vcpu->arch.cr4 & mask;
+}
+
+static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
+{
+ if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+ kvm_x86_ops->decache_cr3(vcpu);
+ return vcpu->arch.cr3;
+}
+
+static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, ~0UL);
+}
+
+static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
+{
+ return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u)
+ | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
+}
+
+static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.hflags |= HF_GUEST_MASK;
+}
+
+static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.hflags &= ~HF_GUEST_MASK;
+
+ if (vcpu->arch.load_eoi_exitmap_pending) {
+ vcpu->arch.load_eoi_exitmap_pending = false;
+ kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
+ }
+}
+
+static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hflags & HF_GUEST_MASK;
+}
+
+static inline bool is_smm(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hflags & HF_SMM_MASK;
+}
+
+#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
new file mode 100644
index 000000000..89d07312e
--- /dev/null
+++ b/arch/x86/kvm/lapic.c
@@ -0,0 +1,2710 @@
+
+/*
+ * Local APIC virtualization
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2007 Novell
+ * Copyright (C) 2007 Intel
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Dor Laor <dor.laor@qumranet.com>
+ * Gregory Haskins <ghaskins@novell.com>
+ * Yaozu (Eddie) Dong <eddie.dong@intel.com>
+ *
+ * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+#include <linux/hrtimer.h>
+#include <linux/io.h>
+#include <linux/export.h>
+#include <linux/math64.h>
+#include <linux/slab.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/apicdef.h>
+#include <asm/delay.h>
+#include <linux/atomic.h>
+#include <linux/jump_label.h>
+#include "kvm_cache_regs.h"
+#include "irq.h"
+#include "trace.h"
+#include "x86.h"
+#include "cpuid.h"
+#include "hyperv.h"
+
+#ifndef CONFIG_X86_64
+#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
+#else
+#define mod_64(x, y) ((x) % (y))
+#endif
+
+#define PRId64 "d"
+#define PRIx64 "llx"
+#define PRIu64 "u"
+#define PRIo64 "o"
+
+/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
+#define apic_debug(fmt, arg...) do {} while (0)
+
+/* 14 is the version for Xeon and Pentium 8.4.8*/
+#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16))
+#define LAPIC_MMIO_LENGTH (1 << 12)
+/* followed define is not in apicdef.h */
+#define APIC_SHORT_MASK 0xc0000
+#define APIC_DEST_NOSHORT 0x0
+#define APIC_DEST_MASK 0x800
+#define MAX_APIC_VECTOR 256
+#define APIC_VECTORS_PER_REG 32
+
+#define APIC_BROADCAST 0xFF
+#define X2APIC_BROADCAST 0xFFFFFFFFul
+
+static inline int apic_test_vector(int vec, void *bitmap)
+{
+ return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ return apic_test_vector(vector, apic->regs + APIC_ISR) ||
+ apic_test_vector(vector, apic->regs + APIC_IRR);
+}
+
+static inline void apic_clear_vector(int vec, void *bitmap)
+{
+ clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline int __apic_test_and_set_vector(int vec, void *bitmap)
+{
+ return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
+{
+ return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+struct static_key_deferred apic_hw_disabled __read_mostly;
+struct static_key_deferred apic_sw_disabled __read_mostly;
+
+static inline int apic_enabled(struct kvm_lapic *apic)
+{
+ return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic);
+}
+
+#define LVT_MASK \
+ (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
+
+#define LINT_MASK \
+ (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
+ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
+
+static inline u8 kvm_xapic_id(struct kvm_lapic *apic)
+{
+ return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
+}
+
+static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
+{
+ return apic->vcpu->vcpu_id;
+}
+
+static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
+ u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
+ switch (map->mode) {
+ case KVM_APIC_MODE_X2APIC: {
+ u32 offset = (dest_id >> 16) * 16;
+ u32 max_apic_id = map->max_apic_id;
+
+ if (offset <= max_apic_id) {
+ u8 cluster_size = min(max_apic_id - offset + 1, 16U);
+
+ offset = array_index_nospec(offset, map->max_apic_id + 1);
+ *cluster = &map->phys_map[offset];
+ *mask = dest_id & (0xffff >> (16 - cluster_size));
+ } else {
+ *mask = 0;
+ }
+
+ return true;
+ }
+ case KVM_APIC_MODE_XAPIC_FLAT:
+ *cluster = map->xapic_flat_map;
+ *mask = dest_id & 0xff;
+ return true;
+ case KVM_APIC_MODE_XAPIC_CLUSTER:
+ *cluster = map->xapic_cluster_map[(dest_id >> 4) & 0xf];
+ *mask = dest_id & 0xf;
+ return true;
+ default:
+ /* Not optimized. */
+ return false;
+ }
+}
+
+static void kvm_apic_map_free(struct rcu_head *rcu)
+{
+ struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu);
+
+ kvfree(map);
+}
+
+static void recalculate_apic_map(struct kvm *kvm)
+{
+ struct kvm_apic_map *new, *old = NULL;
+ struct kvm_vcpu *vcpu;
+ int i;
+ u32 max_id = 255; /* enough space for any xAPIC ID */
+
+ mutex_lock(&kvm->arch.apic_map_lock);
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ if (kvm_apic_present(vcpu))
+ max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
+
+ new = kvzalloc(sizeof(struct kvm_apic_map) +
+ sizeof(struct kvm_lapic *) * ((u64)max_id + 1), GFP_KERNEL);
+
+ if (!new)
+ goto out;
+
+ new->max_apic_id = max_id;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ struct kvm_lapic **cluster;
+ u16 mask;
+ u32 ldr;
+ u8 xapic_id;
+ u32 x2apic_id;
+
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ xapic_id = kvm_xapic_id(apic);
+ x2apic_id = kvm_x2apic_id(apic);
+
+ /* Hotplug hack: see kvm_apic_match_physical_addr(), ... */
+ if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) &&
+ x2apic_id <= new->max_apic_id)
+ new->phys_map[x2apic_id] = apic;
+ /*
+ * ... xAPIC ID of VCPUs with APIC ID > 0xff will wrap-around,
+ * prevent them from masking VCPUs with APIC ID <= 0xff.
+ */
+ if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
+ new->phys_map[xapic_id] = apic;
+
+ if (!kvm_apic_sw_enabled(apic))
+ continue;
+
+ ldr = kvm_lapic_get_reg(apic, APIC_LDR);
+
+ if (apic_x2apic_mode(apic)) {
+ new->mode |= KVM_APIC_MODE_X2APIC;
+ } else if (ldr) {
+ ldr = GET_APIC_LOGICAL_ID(ldr);
+ if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT)
+ new->mode |= KVM_APIC_MODE_XAPIC_FLAT;
+ else
+ new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER;
+ }
+
+ if (!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask))
+ continue;
+
+ if (mask)
+ cluster[ffs(mask) - 1] = apic;
+ }
+out:
+ old = rcu_dereference_protected(kvm->arch.apic_map,
+ lockdep_is_held(&kvm->arch.apic_map_lock));
+ rcu_assign_pointer(kvm->arch.apic_map, new);
+ mutex_unlock(&kvm->arch.apic_map_lock);
+
+ if (old)
+ call_rcu(&old->rcu, kvm_apic_map_free);
+
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
+{
+ bool enabled = val & APIC_SPIV_APIC_ENABLED;
+
+ kvm_lapic_set_reg(apic, APIC_SPIV, val);
+
+ if (enabled != apic->sw_enabled) {
+ apic->sw_enabled = enabled;
+ if (enabled) {
+ static_key_slow_dec_deferred(&apic_sw_disabled);
+ recalculate_apic_map(apic->vcpu->kvm);
+ } else
+ static_key_slow_inc(&apic_sw_disabled.key);
+
+ recalculate_apic_map(apic->vcpu->kvm);
+ }
+}
+
+static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
+{
+ kvm_lapic_set_reg(apic, APIC_ID, id << 24);
+ recalculate_apic_map(apic->vcpu->kvm);
+}
+
+static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
+{
+ kvm_lapic_set_reg(apic, APIC_LDR, id);
+ recalculate_apic_map(apic->vcpu->kvm);
+}
+
+static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
+{
+ return ((id >> 4) << 16) | (1 << (id & 0xf));
+}
+
+static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
+{
+ u32 ldr = kvm_apic_calc_x2apic_ldr(id);
+
+ WARN_ON_ONCE(id != apic->vcpu->vcpu_id);
+
+ kvm_lapic_set_reg(apic, APIC_ID, id);
+ kvm_lapic_set_reg(apic, APIC_LDR, ldr);
+ recalculate_apic_map(apic->vcpu->kvm);
+}
+
+static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
+{
+ return !(kvm_lapic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
+}
+
+static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
+{
+ return kvm_lapic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
+}
+
+static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
+{
+ return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_ONESHOT;
+}
+
+static inline int apic_lvtt_period(struct kvm_lapic *apic)
+{
+ return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_PERIODIC;
+}
+
+static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
+{
+ return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_TSCDEADLINE;
+}
+
+static inline int apic_lvt_nmi_mode(u32 lvt_val)
+{
+ return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
+}
+
+void kvm_apic_set_version(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ struct kvm_cpuid_entry2 *feat;
+ u32 v = APIC_VERSION;
+
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ /*
+ * KVM emulates 82093AA datasheet (with in-kernel IOAPIC implementation)
+ * which doesn't have EOI register; Some buggy OSes (e.g. Windows with
+ * Hyper-V role) disable EOI broadcast in lapic not checking for IOAPIC
+ * version first and level-triggered interrupts never get EOIed in
+ * IOAPIC.
+ */
+ feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
+ if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))) &&
+ !ioapic_in_kernel(vcpu->kvm))
+ v |= APIC_LVR_DIRECTED_EOI;
+ kvm_lapic_set_reg(apic, APIC_LVR, v);
+}
+
+static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = {
+ LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */
+ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
+ LVT_MASK | APIC_MODE_MASK, /* LVTPC */
+ LINT_MASK, LINT_MASK, /* LVT0-1 */
+ LVT_MASK /* LVTERR */
+};
+
+static int find_highest_vector(void *bitmap)
+{
+ int vec;
+ u32 *reg;
+
+ for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG;
+ vec >= 0; vec -= APIC_VECTORS_PER_REG) {
+ reg = bitmap + REG_POS(vec);
+ if (*reg)
+ return __fls(*reg) + vec;
+ }
+
+ return -1;
+}
+
+static u8 count_vectors(void *bitmap)
+{
+ int vec;
+ u32 *reg;
+ u8 count = 0;
+
+ for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) {
+ reg = bitmap + REG_POS(vec);
+ count += hweight32(*reg);
+ }
+
+ return count;
+}
+
+bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr)
+{
+ u32 i, vec;
+ u32 pir_val, irr_val, prev_irr_val;
+ int max_updated_irr;
+
+ max_updated_irr = -1;
+ *max_irr = -1;
+
+ for (i = vec = 0; i <= 7; i++, vec += 32) {
+ pir_val = READ_ONCE(pir[i]);
+ irr_val = *((u32 *)(regs + APIC_IRR + i * 0x10));
+ if (pir_val) {
+ prev_irr_val = irr_val;
+ irr_val |= xchg(&pir[i], 0);
+ *((u32 *)(regs + APIC_IRR + i * 0x10)) = irr_val;
+ if (prev_irr_val != irr_val) {
+ max_updated_irr =
+ __fls(irr_val ^ prev_irr_val) + vec;
+ }
+ }
+ if (irr_val)
+ *max_irr = __fls(irr_val) + vec;
+ }
+
+ return ((max_updated_irr != -1) &&
+ (max_updated_irr == *max_irr));
+}
+EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
+
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ return __kvm_apic_update_irr(pir, apic->regs, max_irr);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
+
+static inline int apic_search_irr(struct kvm_lapic *apic)
+{
+ return find_highest_vector(apic->regs + APIC_IRR);
+}
+
+static inline int apic_find_highest_irr(struct kvm_lapic *apic)
+{
+ int result;
+
+ /*
+ * Note that irr_pending is just a hint. It will be always
+ * true with virtual interrupt delivery enabled.
+ */
+ if (!apic->irr_pending)
+ return -1;
+
+ result = apic_search_irr(apic);
+ ASSERT(result == -1 || result >= 16);
+
+ return result;
+}
+
+static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
+{
+ struct kvm_vcpu *vcpu;
+
+ vcpu = apic->vcpu;
+
+ if (unlikely(vcpu->arch.apicv_active)) {
+ /* need to update RVI */
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
+ kvm_x86_ops->hwapic_irr_update(vcpu,
+ apic_find_highest_irr(apic));
+ } else {
+ apic->irr_pending = false;
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
+ if (apic_search_irr(apic) != -1)
+ apic->irr_pending = true;
+ }
+}
+
+static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
+{
+ struct kvm_vcpu *vcpu;
+
+ if (__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
+ return;
+
+ vcpu = apic->vcpu;
+
+ /*
+ * With APIC virtualization enabled, all caching is disabled
+ * because the processor can modify ISR under the hood. Instead
+ * just set SVI.
+ */
+ if (unlikely(vcpu->arch.apicv_active))
+ kvm_x86_ops->hwapic_isr_update(vcpu, vec);
+ else {
+ ++apic->isr_count;
+ BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
+ /*
+ * ISR (in service register) bit is set when injecting an interrupt.
+ * The highest vector is injected. Thus the latest bit set matches
+ * the highest bit in ISR.
+ */
+ apic->highest_isr_cache = vec;
+ }
+}
+
+static inline int apic_find_highest_isr(struct kvm_lapic *apic)
+{
+ int result;
+
+ /*
+ * Note that isr_count is always 1, and highest_isr_cache
+ * is always -1, with APIC virtualization enabled.
+ */
+ if (!apic->isr_count)
+ return -1;
+ if (likely(apic->highest_isr_cache != -1))
+ return apic->highest_isr_cache;
+
+ result = find_highest_vector(apic->regs + APIC_ISR);
+ ASSERT(result == -1 || result >= 16);
+
+ return result;
+}
+
+static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
+{
+ struct kvm_vcpu *vcpu;
+ if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
+ return;
+
+ vcpu = apic->vcpu;
+
+ /*
+ * We do get here for APIC virtualization enabled if the guest
+ * uses the Hyper-V APIC enlightenment. In this case we may need
+ * to trigger a new interrupt delivery by writing the SVI field;
+ * on the other hand isr_count and highest_isr_cache are unused
+ * and must be left alone.
+ */
+ if (unlikely(vcpu->arch.apicv_active))
+ kvm_x86_ops->hwapic_isr_update(vcpu,
+ apic_find_highest_isr(apic));
+ else {
+ --apic->isr_count;
+ BUG_ON(apic->isr_count < 0);
+ apic->highest_isr_cache = -1;
+ }
+}
+
+int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
+{
+ /* This may race with setting of irr in __apic_accept_irq() and
+ * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq
+ * will cause vmexit immediately and the value will be recalculated
+ * on the next vmentry.
+ */
+ return apic_find_highest_irr(vcpu->arch.apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
+
+static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
+ int vector, int level, int trig_mode,
+ struct dest_map *dest_map);
+
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
+ struct dest_map *dest_map)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
+ irq->level, irq->trig_mode, dest_map);
+}
+
+int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
+ unsigned long ipi_bitmap_high, u32 min,
+ unsigned long icr, int op_64_bit)
+{
+ int i;
+ struct kvm_apic_map *map;
+ struct kvm_vcpu *vcpu;
+ struct kvm_lapic_irq irq = {0};
+ int cluster_size = op_64_bit ? 64 : 32;
+ int count = 0;
+
+ irq.vector = icr & APIC_VECTOR_MASK;
+ irq.delivery_mode = icr & APIC_MODE_MASK;
+ irq.level = (icr & APIC_INT_ASSERT) != 0;
+ irq.trig_mode = icr & APIC_INT_LEVELTRIG;
+
+ if (icr & APIC_DEST_MASK)
+ return -KVM_EINVAL;
+ if (icr & APIC_SHORT_MASK)
+ return -KVM_EINVAL;
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ if (unlikely(!map)) {
+ count = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (min > map->max_apic_id)
+ goto out;
+ /* Bits above cluster_size are masked in the caller. */
+ for_each_set_bit(i, &ipi_bitmap_low,
+ min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) {
+ if (map->phys_map[min + i]) {
+ vcpu = map->phys_map[min + i]->vcpu;
+ count += kvm_apic_set_irq(vcpu, &irq, NULL);
+ }
+ }
+
+ min += cluster_size;
+
+ if (min > map->max_apic_id)
+ goto out;
+
+ for_each_set_bit(i, &ipi_bitmap_high,
+ min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) {
+ if (map->phys_map[min + i]) {
+ vcpu = map->phys_map[min + i]->vcpu;
+ count += kvm_apic_set_irq(vcpu, &irq, NULL);
+ }
+ }
+
+out:
+ rcu_read_unlock();
+ return count;
+}
+
+static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
+{
+
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val,
+ sizeof(val));
+}
+
+static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
+{
+
+ return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val,
+ sizeof(*val));
+}
+
+static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
+}
+
+static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
+{
+ u8 val;
+ if (pv_eoi_get_user(vcpu, &val) < 0) {
+ apic_debug("Can't read EOI MSR value: 0x%llx\n",
+ (unsigned long long)vcpu->arch.pv_eoi.msr_val);
+ return false;
+ }
+ return val & 0x1;
+}
+
+static void pv_eoi_set_pending(struct kvm_vcpu *vcpu)
+{
+ if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) {
+ apic_debug("Can't set EOI MSR value: 0x%llx\n",
+ (unsigned long long)vcpu->arch.pv_eoi.msr_val);
+ return;
+ }
+ __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
+}
+
+static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
+{
+ if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) {
+ apic_debug("Can't clear EOI MSR value: 0x%llx\n",
+ (unsigned long long)vcpu->arch.pv_eoi.msr_val);
+ return;
+ }
+ __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
+}
+
+static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
+{
+ int highest_irr;
+ if (apic->vcpu->arch.apicv_active)
+ highest_irr = kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
+ else
+ highest_irr = apic_find_highest_irr(apic);
+ if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr)
+ return -1;
+ return highest_irr;
+}
+
+static bool __apic_update_ppr(struct kvm_lapic *apic, u32 *new_ppr)
+{
+ u32 tpr, isrv, ppr, old_ppr;
+ int isr;
+
+ old_ppr = kvm_lapic_get_reg(apic, APIC_PROCPRI);
+ tpr = kvm_lapic_get_reg(apic, APIC_TASKPRI);
+ isr = apic_find_highest_isr(apic);
+ isrv = (isr != -1) ? isr : 0;
+
+ if ((tpr & 0xf0) >= (isrv & 0xf0))
+ ppr = tpr & 0xff;
+ else
+ ppr = isrv & 0xf0;
+
+ apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
+ apic, ppr, isr, isrv);
+
+ *new_ppr = ppr;
+ if (old_ppr != ppr)
+ kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr);
+
+ return ppr < old_ppr;
+}
+
+static void apic_update_ppr(struct kvm_lapic *apic)
+{
+ u32 ppr;
+
+ if (__apic_update_ppr(apic, &ppr) &&
+ apic_has_interrupt_for_ppr(apic, ppr) != -1)
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+}
+
+void kvm_apic_update_ppr(struct kvm_vcpu *vcpu)
+{
+ apic_update_ppr(vcpu->arch.apic);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_update_ppr);
+
+static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
+{
+ kvm_lapic_set_reg(apic, APIC_TASKPRI, tpr);
+ apic_update_ppr(apic);
+}
+
+static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda)
+{
+ return mda == (apic_x2apic_mode(apic) ?
+ X2APIC_BROADCAST : APIC_BROADCAST);
+}
+
+static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
+{
+ if (kvm_apic_broadcast(apic, mda))
+ return true;
+
+ if (apic_x2apic_mode(apic))
+ return mda == kvm_x2apic_id(apic);
+
+ /*
+ * Hotplug hack: Make LAPIC in xAPIC mode also accept interrupts as if
+ * it were in x2APIC mode. Hotplugged VCPUs start in xAPIC mode and
+ * this allows unique addressing of VCPUs with APIC ID over 0xff.
+ * The 0xff condition is needed because writeable xAPIC ID.
+ */
+ if (kvm_x2apic_id(apic) > 0xff && mda == kvm_x2apic_id(apic))
+ return true;
+
+ return mda == kvm_xapic_id(apic);
+}
+
+static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
+{
+ u32 logical_id;
+
+ if (kvm_apic_broadcast(apic, mda))
+ return true;
+
+ logical_id = kvm_lapic_get_reg(apic, APIC_LDR);
+
+ if (apic_x2apic_mode(apic))
+ return ((logical_id >> 16) == (mda >> 16))
+ && (logical_id & mda & 0xffff) != 0;
+
+ logical_id = GET_APIC_LOGICAL_ID(logical_id);
+
+ switch (kvm_lapic_get_reg(apic, APIC_DFR)) {
+ case APIC_DFR_FLAT:
+ return (logical_id & mda) != 0;
+ case APIC_DFR_CLUSTER:
+ return ((logical_id >> 4) == (mda >> 4))
+ && (logical_id & mda & 0xf) != 0;
+ default:
+ apic_debug("Bad DFR vcpu %d: %08x\n",
+ apic->vcpu->vcpu_id, kvm_lapic_get_reg(apic, APIC_DFR));
+ return false;
+ }
+}
+
+/* The KVM local APIC implementation has two quirks:
+ *
+ * - Real hardware delivers interrupts destined to x2APIC ID > 0xff to LAPICs
+ * in xAPIC mode if the "destination & 0xff" matches its xAPIC ID.
+ * KVM doesn't do that aliasing.
+ *
+ * - in-kernel IOAPIC messages have to be delivered directly to
+ * x2APIC, because the kernel does not support interrupt remapping.
+ * In order to support broadcast without interrupt remapping, x2APIC
+ * rewrites the destination of non-IPI messages from APIC_BROADCAST
+ * to X2APIC_BROADCAST.
+ *
+ * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API. This is
+ * important when userspace wants to use x2APIC-format MSIs, because
+ * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7".
+ */
+static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id,
+ struct kvm_lapic *source, struct kvm_lapic *target)
+{
+ bool ipi = source != NULL;
+
+ if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled &&
+ !ipi && dest_id == APIC_BROADCAST && apic_x2apic_mode(target))
+ return X2APIC_BROADCAST;
+
+ return dest_id;
+}
+
+bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+ int short_hand, unsigned int dest, int dest_mode)
+{
+ struct kvm_lapic *target = vcpu->arch.apic;
+ u32 mda = kvm_apic_mda(vcpu, dest, source, target);
+
+ apic_debug("target %p, source %p, dest 0x%x, "
+ "dest_mode 0x%x, short_hand 0x%x\n",
+ target, source, dest, dest_mode, short_hand);
+
+ ASSERT(target);
+ switch (short_hand) {
+ case APIC_DEST_NOSHORT:
+ if (dest_mode == APIC_DEST_PHYSICAL)
+ return kvm_apic_match_physical_addr(target, mda);
+ else
+ return kvm_apic_match_logical_addr(target, mda);
+ case APIC_DEST_SELF:
+ return target == source;
+ case APIC_DEST_ALLINC:
+ return true;
+ case APIC_DEST_ALLBUT:
+ return target != source;
+ default:
+ apic_debug("kvm: apic: Bad dest shorthand value %x\n",
+ short_hand);
+ return false;
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_apic_match_dest);
+
+int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
+ const unsigned long *bitmap, u32 bitmap_size)
+{
+ u32 mod;
+ int i, idx = -1;
+
+ mod = vector % dest_vcpus;
+
+ for (i = 0; i <= mod; i++) {
+ idx = find_next_bit(bitmap, bitmap_size, idx + 1);
+ BUG_ON(idx == bitmap_size);
+ }
+
+ return idx;
+}
+
+static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
+{
+ if (!kvm->arch.disabled_lapic_found) {
+ kvm->arch.disabled_lapic_found = true;
+ printk(KERN_INFO
+ "Disabled LAPIC found during irq injection\n");
+ }
+}
+
+static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src,
+ struct kvm_lapic_irq *irq, struct kvm_apic_map *map)
+{
+ if (kvm->arch.x2apic_broadcast_quirk_disabled) {
+ if ((irq->dest_id == APIC_BROADCAST &&
+ map->mode != KVM_APIC_MODE_X2APIC))
+ return true;
+ if (irq->dest_id == X2APIC_BROADCAST)
+ return true;
+ } else {
+ bool x2apic_ipi = src && *src && apic_x2apic_mode(*src);
+ if (irq->dest_id == (x2apic_ipi ?
+ X2APIC_BROADCAST : APIC_BROADCAST))
+ return true;
+ }
+
+ return false;
+}
+
+/* Return true if the interrupt can be handled by using *bitmap as index mask
+ * for valid destinations in *dst array.
+ * Return false if kvm_apic_map_get_dest_lapic did nothing useful.
+ * Note: we may have zero kvm_lapic destinations when we return true, which
+ * means that the interrupt should be dropped. In this case, *bitmap would be
+ * zero and *dst undefined.
+ */
+static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm,
+ struct kvm_lapic **src, struct kvm_lapic_irq *irq,
+ struct kvm_apic_map *map, struct kvm_lapic ***dst,
+ unsigned long *bitmap)
+{
+ int i, lowest;
+
+ if (irq->shorthand == APIC_DEST_SELF && src) {
+ *dst = src;
+ *bitmap = 1;
+ return true;
+ } else if (irq->shorthand)
+ return false;
+
+ if (!map || kvm_apic_is_broadcast_dest(kvm, src, irq, map))
+ return false;
+
+ if (irq->dest_mode == APIC_DEST_PHYSICAL) {
+ if (irq->dest_id > map->max_apic_id) {
+ *bitmap = 0;
+ } else {
+ u32 dest_id = array_index_nospec(irq->dest_id, map->max_apic_id + 1);
+ *dst = &map->phys_map[dest_id];
+ *bitmap = 1;
+ }
+ return true;
+ }
+
+ *bitmap = 0;
+ if (!kvm_apic_map_get_logical_dest(map, irq->dest_id, dst,
+ (u16 *)bitmap))
+ return false;
+
+ if (!kvm_lowest_prio_delivery(irq))
+ return true;
+
+ if (!kvm_vector_hashing_enabled()) {
+ lowest = -1;
+ for_each_set_bit(i, bitmap, 16) {
+ if (!(*dst)[i])
+ continue;
+ if (lowest < 0)
+ lowest = i;
+ else if (kvm_apic_compare_prio((*dst)[i]->vcpu,
+ (*dst)[lowest]->vcpu) < 0)
+ lowest = i;
+ }
+ } else {
+ if (!*bitmap)
+ return true;
+
+ lowest = kvm_vector_to_index(irq->vector, hweight16(*bitmap),
+ bitmap, 16);
+
+ if (!(*dst)[lowest]) {
+ kvm_apic_disabled_lapic_found(kvm);
+ *bitmap = 0;
+ return true;
+ }
+ }
+
+ *bitmap = (lowest >= 0) ? 1 << lowest : 0;
+
+ return true;
+}
+
+bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
+{
+ struct kvm_apic_map *map;
+ unsigned long bitmap;
+ struct kvm_lapic **dst = NULL;
+ int i;
+ bool ret;
+
+ *r = -1;
+
+ if (irq->shorthand == APIC_DEST_SELF) {
+ *r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
+ return true;
+ }
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap);
+ if (ret)
+ for_each_set_bit(i, &bitmap, 16) {
+ if (!dst[i])
+ continue;
+ if (*r < 0)
+ *r = 0;
+ *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
+ }
+
+ rcu_read_unlock();
+ return ret;
+}
+
+/*
+ * This routine tries to handler interrupts in posted mode, here is how
+ * it deals with different cases:
+ * - For single-destination interrupts, handle it in posted mode
+ * - Else if vector hashing is enabled and it is a lowest-priority
+ * interrupt, handle it in posted mode and use the following mechanism
+ * to find the destinaiton vCPU.
+ * 1. For lowest-priority interrupts, store all the possible
+ * destination vCPUs in an array.
+ * 2. Use "guest vector % max number of destination vCPUs" to find
+ * the right destination vCPU in the array for the lowest-priority
+ * interrupt.
+ * - Otherwise, use remapped mode to inject the interrupt.
+ */
+bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu)
+{
+ struct kvm_apic_map *map;
+ unsigned long bitmap;
+ struct kvm_lapic **dst = NULL;
+ bool ret = false;
+
+ if (irq->shorthand)
+ return false;
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) &&
+ hweight16(bitmap) == 1) {
+ unsigned long i = find_first_bit(&bitmap, 16);
+
+ if (dst[i]) {
+ *dest_vcpu = dst[i]->vcpu;
+ ret = true;
+ }
+ }
+
+ rcu_read_unlock();
+ return ret;
+}
+
+/*
+ * Add a pending IRQ into lapic.
+ * Return 1 if successfully added and 0 if discarded.
+ */
+static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
+ int vector, int level, int trig_mode,
+ struct dest_map *dest_map)
+{
+ int result = 0;
+ struct kvm_vcpu *vcpu = apic->vcpu;
+
+ trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
+ trig_mode, vector);
+ switch (delivery_mode) {
+ case APIC_DM_LOWEST:
+ vcpu->arch.apic_arb_prio++;
+ case APIC_DM_FIXED:
+ if (unlikely(trig_mode && !level))
+ break;
+
+ /* FIXME add logic for vcpu on reset */
+ if (unlikely(!apic_enabled(apic)))
+ break;
+
+ result = 1;
+
+ if (dest_map) {
+ __set_bit(vcpu->vcpu_id, dest_map->map);
+ dest_map->vectors[vcpu->vcpu_id] = vector;
+ }
+
+ if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
+ if (trig_mode)
+ kvm_lapic_set_vector(vector, apic->regs + APIC_TMR);
+ else
+ apic_clear_vector(vector, apic->regs + APIC_TMR);
+ }
+
+ if (kvm_x86_ops->deliver_posted_interrupt(vcpu, vector)) {
+ kvm_lapic_set_irr(vector, apic);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
+ break;
+
+ case APIC_DM_REMRD:
+ result = 1;
+ vcpu->arch.pv.pv_unhalted = 1;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ break;
+
+ case APIC_DM_SMI:
+ result = 1;
+ kvm_make_request(KVM_REQ_SMI, vcpu);
+ kvm_vcpu_kick(vcpu);
+ break;
+
+ case APIC_DM_NMI:
+ result = 1;
+ kvm_inject_nmi(vcpu);
+ kvm_vcpu_kick(vcpu);
+ break;
+
+ case APIC_DM_INIT:
+ if (!trig_mode || level) {
+ result = 1;
+ /* assumes that there are only KVM_APIC_INIT/SIPI */
+ apic->pending_events = (1UL << KVM_APIC_INIT);
+ /* make sure pending_events is visible before sending
+ * the request */
+ smp_wmb();
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ } else {
+ apic_debug("Ignoring de-assert INIT to vcpu %d\n",
+ vcpu->vcpu_id);
+ }
+ break;
+
+ case APIC_DM_STARTUP:
+ apic_debug("SIPI to vcpu %d vector 0x%02x\n",
+ vcpu->vcpu_id, vector);
+ result = 1;
+ apic->sipi_vector = vector;
+ /* make sure sipi_vector is visible for the receiver */
+ smp_wmb();
+ set_bit(KVM_APIC_SIPI, &apic->pending_events);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ break;
+
+ case APIC_DM_EXTINT:
+ /*
+ * Should only be called by kvm_apic_local_deliver() with LVT0,
+ * before NMI watchdog was enabled. Already handled by
+ * kvm_apic_accept_pic_intr().
+ */
+ break;
+
+ default:
+ printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
+ delivery_mode);
+ break;
+ }
+ return result;
+}
+
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
+{
+ return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
+}
+
+static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
+{
+ return test_bit(vector, apic->vcpu->arch.ioapic_handled_vectors);
+}
+
+static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
+{
+ int trigger_mode;
+
+ /* Eoi the ioapic only if the ioapic doesn't own the vector. */
+ if (!kvm_ioapic_handles_vector(apic, vector))
+ return;
+
+ /* Request a KVM exit to inform the userspace IOAPIC. */
+ if (irqchip_split(apic->vcpu->kvm)) {
+ apic->vcpu->arch.pending_ioapic_eoi = vector;
+ kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu);
+ return;
+ }
+
+ if (apic_test_vector(vector, apic->regs + APIC_TMR))
+ trigger_mode = IOAPIC_LEVEL_TRIG;
+ else
+ trigger_mode = IOAPIC_EDGE_TRIG;
+
+ kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
+}
+
+static int apic_set_eoi(struct kvm_lapic *apic)
+{
+ int vector = apic_find_highest_isr(apic);
+
+ trace_kvm_eoi(apic, vector);
+
+ /*
+ * Not every write EOI will has corresponding ISR,
+ * one example is when Kernel check timer on setup_IO_APIC
+ */
+ if (vector == -1)
+ return vector;
+
+ apic_clear_isr(vector, apic);
+ apic_update_ppr(apic);
+
+ if (test_bit(vector, vcpu_to_synic(apic->vcpu)->vec_bitmap))
+ kvm_hv_synic_send_eoi(apic->vcpu, vector);
+
+ kvm_ioapic_send_eoi(apic, vector);
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+ return vector;
+}
+
+/*
+ * this interface assumes a trap-like exit, which has already finished
+ * desired side effect including vISR and vPPR update.
+ */
+void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ trace_kvm_eoi(apic, vector);
+
+ kvm_ioapic_send_eoi(apic, vector);
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated);
+
+static void apic_send_ipi(struct kvm_lapic *apic)
+{
+ u32 icr_low = kvm_lapic_get_reg(apic, APIC_ICR);
+ u32 icr_high = kvm_lapic_get_reg(apic, APIC_ICR2);
+ struct kvm_lapic_irq irq;
+
+ irq.vector = icr_low & APIC_VECTOR_MASK;
+ irq.delivery_mode = icr_low & APIC_MODE_MASK;
+ irq.dest_mode = icr_low & APIC_DEST_MASK;
+ irq.level = (icr_low & APIC_INT_ASSERT) != 0;
+ irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
+ irq.shorthand = icr_low & APIC_SHORT_MASK;
+ irq.msi_redir_hint = false;
+ if (apic_x2apic_mode(apic))
+ irq.dest_id = icr_high;
+ else
+ irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
+
+ trace_kvm_apic_ipi(icr_low, irq.dest_id);
+
+ apic_debug("icr_high 0x%x, icr_low 0x%x, "
+ "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
+ "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x, "
+ "msi_redir_hint 0x%x\n",
+ icr_high, icr_low, irq.shorthand, irq.dest_id,
+ irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
+ irq.vector, irq.msi_redir_hint);
+
+ kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL);
+}
+
+static u32 apic_get_tmcct(struct kvm_lapic *apic)
+{
+ ktime_t remaining, now;
+ s64 ns;
+ u32 tmcct;
+
+ ASSERT(apic != NULL);
+
+ /* if initial count is 0, current count should also be 0 */
+ if (kvm_lapic_get_reg(apic, APIC_TMICT) == 0 ||
+ apic->lapic_timer.period == 0)
+ return 0;
+
+ now = ktime_get();
+ remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
+ if (ktime_to_ns(remaining) < 0)
+ remaining = 0;
+
+ ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
+ tmcct = div64_u64(ns,
+ (APIC_BUS_CYCLE_NS * apic->divide_count));
+
+ return tmcct;
+}
+
+static void __report_tpr_access(struct kvm_lapic *apic, bool write)
+{
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ struct kvm_run *run = vcpu->run;
+
+ kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);
+ run->tpr_access.rip = kvm_rip_read(vcpu);
+ run->tpr_access.is_write = write;
+}
+
+static inline void report_tpr_access(struct kvm_lapic *apic, bool write)
+{
+ if (apic->vcpu->arch.tpr_access_reporting)
+ __report_tpr_access(apic, write);
+}
+
+static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
+{
+ u32 val = 0;
+
+ if (offset >= LAPIC_MMIO_LENGTH)
+ return 0;
+
+ switch (offset) {
+ case APIC_ARBPRI:
+ apic_debug("Access APIC ARBPRI register which is for P6\n");
+ break;
+
+ case APIC_TMCCT: /* Timer CCR */
+ if (apic_lvtt_tscdeadline(apic))
+ return 0;
+
+ val = apic_get_tmcct(apic);
+ break;
+ case APIC_PROCPRI:
+ apic_update_ppr(apic);
+ val = kvm_lapic_get_reg(apic, offset);
+ break;
+ case APIC_TASKPRI:
+ report_tpr_access(apic, false);
+ /* fall thru */
+ default:
+ val = kvm_lapic_get_reg(apic, offset);
+ break;
+ }
+
+ return val;
+}
+
+static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_lapic, dev);
+}
+
+int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
+ void *data)
+{
+ unsigned char alignment = offset & 0xf;
+ u32 result;
+ /* this bitmask has a bit cleared for each reserved register */
+ static const u64 rmask = 0x43ff01ffffffe70cULL;
+
+ if ((alignment + len) > 4) {
+ apic_debug("KVM_APIC_READ: alignment error %x %d\n",
+ offset, len);
+ return 1;
+ }
+
+ if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) {
+ apic_debug("KVM_APIC_READ: read reserved register %x\n",
+ offset);
+ return 1;
+ }
+
+ result = __apic_read(apic, offset & ~0xf);
+
+ trace_kvm_apic_read(offset, result);
+
+ switch (len) {
+ case 1:
+ case 2:
+ case 4:
+ memcpy(data, (char *)&result + alignment, len);
+ break;
+ default:
+ printk(KERN_ERR "Local APIC read with len = %x, "
+ "should be 1,2, or 4 instead\n", len);
+ break;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_reg_read);
+
+static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
+{
+ return addr >= apic->base_address &&
+ addr < apic->base_address + LAPIC_MMIO_LENGTH;
+}
+
+static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+ gpa_t address, int len, void *data)
+{
+ struct kvm_lapic *apic = to_lapic(this);
+ u32 offset = address - apic->base_address;
+
+ if (!apic_mmio_in_range(apic, address))
+ return -EOPNOTSUPP;
+
+ if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) {
+ if (!kvm_check_has_quirk(vcpu->kvm,
+ KVM_X86_QUIRK_LAPIC_MMIO_HOLE))
+ return -EOPNOTSUPP;
+
+ memset(data, 0xff, len);
+ return 0;
+ }
+
+ kvm_lapic_reg_read(apic, offset, len, data);
+
+ return 0;
+}
+
+static void update_divide_count(struct kvm_lapic *apic)
+{
+ u32 tmp1, tmp2, tdcr;
+
+ tdcr = kvm_lapic_get_reg(apic, APIC_TDCR);
+ tmp1 = tdcr & 0xf;
+ tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
+ apic->divide_count = 0x1 << (tmp2 & 0x7);
+
+ apic_debug("timer divide count is 0x%x\n",
+ apic->divide_count);
+}
+
+static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
+{
+ /*
+ * Do not allow the guest to program periodic timers with small
+ * interval, since the hrtimers are not throttled by the host
+ * scheduler.
+ */
+ if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
+ s64 min_period = min_timer_period_us * 1000LL;
+
+ if (apic->lapic_timer.period < min_period) {
+ pr_info_ratelimited(
+ "kvm: vcpu %i: requested %lld ns "
+ "lapic timer period limited to %lld ns\n",
+ apic->vcpu->vcpu_id,
+ apic->lapic_timer.period, min_period);
+ apic->lapic_timer.period = min_period;
+ }
+ }
+}
+
+static void apic_update_lvtt(struct kvm_lapic *apic)
+{
+ u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) &
+ apic->lapic_timer.timer_mode_mask;
+
+ if (apic->lapic_timer.timer_mode != timer_mode) {
+ if (apic_lvtt_tscdeadline(apic) != (timer_mode ==
+ APIC_LVT_TIMER_TSCDEADLINE)) {
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ kvm_lapic_set_reg(apic, APIC_TMICT, 0);
+ apic->lapic_timer.period = 0;
+ apic->lapic_timer.tscdeadline = 0;
+ }
+ apic->lapic_timer.timer_mode = timer_mode;
+ limit_periodic_timer_frequency(apic);
+ }
+}
+
+static void apic_timer_expired(struct kvm_lapic *apic)
+{
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ struct swait_queue_head *q = &vcpu->wq;
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+
+ if (atomic_read(&apic->lapic_timer.pending))
+ return;
+
+ atomic_inc(&apic->lapic_timer.pending);
+ kvm_set_pending_timer(vcpu);
+
+ /*
+ * For x86, the atomic_inc() is serialized, thus
+ * using swait_active() is safe.
+ */
+ if (swait_active(q))
+ swake_up_one(q);
+
+ if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use)
+ ktimer->expired_tscdeadline = ktimer->tscdeadline;
+}
+
+/*
+ * On APICv, this test will cause a busy wait
+ * during a higher-priority task.
+ */
+
+static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = kvm_lapic_get_reg(apic, APIC_LVTT);
+
+ if (kvm_apic_hw_enabled(apic)) {
+ int vec = reg & APIC_VECTOR_MASK;
+ void *bitmap = apic->regs + APIC_ISR;
+
+ if (vcpu->arch.apicv_active)
+ bitmap = apic->regs + APIC_IRR;
+
+ if (apic_test_vector(vec, bitmap))
+ return true;
+ }
+ return false;
+}
+
+void wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u64 guest_tsc, tsc_deadline;
+
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ if (apic->lapic_timer.expired_tscdeadline == 0)
+ return;
+
+ if (!lapic_timer_int_injected(vcpu))
+ return;
+
+ tsc_deadline = apic->lapic_timer.expired_tscdeadline;
+ apic->lapic_timer.expired_tscdeadline = 0;
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
+
+ /* __delay is delay_tsc whenever the hardware has TSC, thus always. */
+ if (guest_tsc < tsc_deadline)
+ __delay(min(tsc_deadline - guest_tsc,
+ nsec_to_cycles(vcpu, lapic_timer_advance_ns)));
+}
+
+static void start_sw_tscdeadline(struct kvm_lapic *apic)
+{
+ u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
+ u64 ns = 0;
+ ktime_t expire;
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
+ unsigned long flags;
+ ktime_t now;
+
+ if (unlikely(!tscdeadline || !this_tsc_khz))
+ return;
+
+ local_irq_save(flags);
+
+ now = ktime_get();
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ if (likely(tscdeadline > guest_tsc)) {
+ ns = (tscdeadline - guest_tsc) * 1000000ULL;
+ do_div(ns, this_tsc_khz);
+ expire = ktime_add_ns(now, ns);
+ expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
+ hrtimer_start(&apic->lapic_timer.timer,
+ expire, HRTIMER_MODE_ABS_PINNED);
+ } else
+ apic_timer_expired(apic);
+
+ local_irq_restore(flags);
+}
+
+static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
+{
+ ktime_t now, remaining;
+ u64 ns_remaining_old, ns_remaining_new;
+
+ apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
+ * APIC_BUS_CYCLE_NS * apic->divide_count;
+ limit_periodic_timer_frequency(apic);
+
+ now = ktime_get();
+ remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
+ if (ktime_to_ns(remaining) < 0)
+ remaining = 0;
+
+ ns_remaining_old = ktime_to_ns(remaining);
+ ns_remaining_new = mul_u64_u32_div(ns_remaining_old,
+ apic->divide_count, old_divisor);
+
+ apic->lapic_timer.tscdeadline +=
+ nsec_to_cycles(apic->vcpu, ns_remaining_new) -
+ nsec_to_cycles(apic->vcpu, ns_remaining_old);
+ apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new);
+}
+
+static bool set_target_expiration(struct kvm_lapic *apic)
+{
+ ktime_t now;
+ u64 tscl = rdtsc();
+
+ now = ktime_get();
+ apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
+ * APIC_BUS_CYCLE_NS * apic->divide_count;
+
+ if (!apic->lapic_timer.period) {
+ apic->lapic_timer.tscdeadline = 0;
+ return false;
+ }
+
+ limit_periodic_timer_frequency(apic);
+
+ apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
+ PRIx64 ", "
+ "timer initial count 0x%x, period %lldns, "
+ "expire @ 0x%016" PRIx64 ".\n", __func__,
+ APIC_BUS_CYCLE_NS, ktime_to_ns(now),
+ kvm_lapic_get_reg(apic, APIC_TMICT),
+ apic->lapic_timer.period,
+ ktime_to_ns(ktime_add_ns(now,
+ apic->lapic_timer.period)));
+
+ apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
+ nsec_to_cycles(apic->vcpu, apic->lapic_timer.period);
+ apic->lapic_timer.target_expiration = ktime_add_ns(now, apic->lapic_timer.period);
+
+ return true;
+}
+
+static void advance_periodic_target_expiration(struct kvm_lapic *apic)
+{
+ ktime_t now = ktime_get();
+ u64 tscl = rdtsc();
+ ktime_t delta;
+
+ /*
+ * Synchronize both deadlines to the same time source or
+ * differences in the periods (caused by differences in the
+ * underlying clocks or numerical approximation errors) will
+ * cause the two to drift apart over time as the errors
+ * accumulate.
+ */
+ apic->lapic_timer.target_expiration =
+ ktime_add_ns(apic->lapic_timer.target_expiration,
+ apic->lapic_timer.period);
+ delta = ktime_sub(apic->lapic_timer.target_expiration, now);
+ apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
+ nsec_to_cycles(apic->vcpu, delta);
+}
+
+static void start_sw_period(struct kvm_lapic *apic)
+{
+ if (!apic->lapic_timer.period)
+ return;
+
+ if (ktime_after(ktime_get(),
+ apic->lapic_timer.target_expiration)) {
+ apic_timer_expired(apic);
+
+ if (apic_lvtt_oneshot(apic))
+ return;
+
+ advance_periodic_target_expiration(apic);
+ }
+
+ hrtimer_start(&apic->lapic_timer.timer,
+ apic->lapic_timer.target_expiration,
+ HRTIMER_MODE_ABS_PINNED);
+}
+
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
+{
+ if (!lapic_in_kernel(vcpu))
+ return false;
+
+ return vcpu->arch.apic->lapic_timer.hv_timer_in_use;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
+
+static void cancel_hv_timer(struct kvm_lapic *apic)
+{
+ WARN_ON(preemptible());
+ WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+ kvm_x86_ops->cancel_hv_timer(apic->vcpu);
+ apic->lapic_timer.hv_timer_in_use = false;
+}
+
+static bool start_hv_timer(struct kvm_lapic *apic)
+{
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+ int r;
+
+ WARN_ON(preemptible());
+ if (!kvm_x86_ops->set_hv_timer)
+ return false;
+
+ if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
+ return false;
+
+ if (!ktimer->tscdeadline)
+ return false;
+
+ r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline);
+ if (r < 0)
+ return false;
+
+ ktimer->hv_timer_in_use = true;
+ hrtimer_cancel(&ktimer->timer);
+
+ /*
+ * Also recheck ktimer->pending, in case the sw timer triggered in
+ * the window. For periodic timer, leave the hv timer running for
+ * simplicity, and the deadline will be recomputed on the next vmexit.
+ */
+ if (!apic_lvtt_period(apic) && (r || atomic_read(&ktimer->pending))) {
+ if (r)
+ apic_timer_expired(apic);
+ return false;
+ }
+
+ trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, true);
+ return true;
+}
+
+static void start_sw_timer(struct kvm_lapic *apic)
+{
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+
+ WARN_ON(preemptible());
+ if (apic->lapic_timer.hv_timer_in_use)
+ cancel_hv_timer(apic);
+ if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
+ return;
+
+ if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
+ start_sw_period(apic);
+ else if (apic_lvtt_tscdeadline(apic))
+ start_sw_tscdeadline(apic);
+ trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, false);
+}
+
+static void restart_apic_timer(struct kvm_lapic *apic)
+{
+ preempt_disable();
+ if (!start_hv_timer(apic))
+ start_sw_timer(apic);
+ preempt_enable();
+}
+
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ preempt_disable();
+ /* If the preempt notifier has already run, it also called apic_timer_expired */
+ if (!apic->lapic_timer.hv_timer_in_use)
+ goto out;
+ WARN_ON(swait_active(&vcpu->wq));
+ cancel_hv_timer(apic);
+ apic_timer_expired(apic);
+
+ if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
+ advance_periodic_target_expiration(apic);
+ restart_apic_timer(apic);
+ }
+out:
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
+
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
+{
+ restart_apic_timer(vcpu->arch.apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
+
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ preempt_disable();
+ /* Possibly the TSC deadline timer is not enabled yet */
+ if (apic->lapic_timer.hv_timer_in_use)
+ start_sw_timer(apic);
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
+
+void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+ restart_apic_timer(apic);
+}
+
+static void start_apic_timer(struct kvm_lapic *apic)
+{
+ atomic_set(&apic->lapic_timer.pending, 0);
+
+ if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
+ && !set_target_expiration(apic))
+ return;
+
+ restart_apic_timer(apic);
+}
+
+static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
+{
+ bool lvt0_in_nmi_mode = apic_lvt_nmi_mode(lvt0_val);
+
+ if (apic->lvt0_in_nmi_mode != lvt0_in_nmi_mode) {
+ apic->lvt0_in_nmi_mode = lvt0_in_nmi_mode;
+ if (lvt0_in_nmi_mode) {
+ apic_debug("Receive NMI setting on APIC_LVT0 "
+ "for cpu %d\n", apic->vcpu->vcpu_id);
+ atomic_inc(&apic->vcpu->kvm->arch.vapics_in_nmi_mode);
+ } else
+ atomic_dec(&apic->vcpu->kvm->arch.vapics_in_nmi_mode);
+ }
+}
+
+int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
+{
+ int ret = 0;
+
+ trace_kvm_apic_write(reg, val);
+
+ switch (reg) {
+ case APIC_ID: /* Local APIC ID */
+ if (!apic_x2apic_mode(apic))
+ kvm_apic_set_xapic_id(apic, val >> 24);
+ else
+ ret = 1;
+ break;
+
+ case APIC_TASKPRI:
+ report_tpr_access(apic, true);
+ apic_set_tpr(apic, val & 0xff);
+ break;
+
+ case APIC_EOI:
+ apic_set_eoi(apic);
+ break;
+
+ case APIC_LDR:
+ if (!apic_x2apic_mode(apic))
+ kvm_apic_set_ldr(apic, val & APIC_LDR_MASK);
+ else
+ ret = 1;
+ break;
+
+ case APIC_DFR:
+ if (!apic_x2apic_mode(apic)) {
+ kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
+ recalculate_apic_map(apic->vcpu->kvm);
+ } else
+ ret = 1;
+ break;
+
+ case APIC_SPIV: {
+ u32 mask = 0x3ff;
+ if (kvm_lapic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
+ mask |= APIC_SPIV_DIRECTED_EOI;
+ apic_set_spiv(apic, val & mask);
+ if (!(val & APIC_SPIV_APIC_ENABLED)) {
+ int i;
+ u32 lvt_val;
+
+ for (i = 0; i < KVM_APIC_LVT_NUM; i++) {
+ lvt_val = kvm_lapic_get_reg(apic,
+ APIC_LVTT + 0x10 * i);
+ kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i,
+ lvt_val | APIC_LVT_MASKED);
+ }
+ apic_update_lvtt(apic);
+ atomic_set(&apic->lapic_timer.pending, 0);
+
+ }
+ break;
+ }
+ case APIC_ICR:
+ /* No delay here, so we always clear the pending bit */
+ kvm_lapic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
+ apic_send_ipi(apic);
+ break;
+
+ case APIC_ICR2:
+ if (!apic_x2apic_mode(apic))
+ val &= 0xff000000;
+ kvm_lapic_set_reg(apic, APIC_ICR2, val);
+ break;
+
+ case APIC_LVT0:
+ apic_manage_nmi_watchdog(apic, val);
+ case APIC_LVTTHMR:
+ case APIC_LVTPC:
+ case APIC_LVT1:
+ case APIC_LVTERR: {
+ /* TODO: Check vector */
+ size_t size;
+ u32 index;
+
+ if (!kvm_apic_sw_enabled(apic))
+ val |= APIC_LVT_MASKED;
+ size = ARRAY_SIZE(apic_lvt_mask);
+ index = array_index_nospec(
+ (reg - APIC_LVTT) >> 4, size);
+ val &= apic_lvt_mask[index];
+ kvm_lapic_set_reg(apic, reg, val);
+ break;
+ }
+
+ case APIC_LVTT:
+ if (!kvm_apic_sw_enabled(apic))
+ val |= APIC_LVT_MASKED;
+ val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
+ kvm_lapic_set_reg(apic, APIC_LVTT, val);
+ apic_update_lvtt(apic);
+ break;
+
+ case APIC_TMICT:
+ if (apic_lvtt_tscdeadline(apic))
+ break;
+
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ kvm_lapic_set_reg(apic, APIC_TMICT, val);
+ start_apic_timer(apic);
+ break;
+
+ case APIC_TDCR: {
+ uint32_t old_divisor = apic->divide_count;
+
+ if (val & 4)
+ apic_debug("KVM_WRITE:TDCR %x\n", val);
+ kvm_lapic_set_reg(apic, APIC_TDCR, val);
+ update_divide_count(apic);
+ if (apic->divide_count != old_divisor &&
+ apic->lapic_timer.period) {
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ update_target_expiration(apic, old_divisor);
+ restart_apic_timer(apic);
+ }
+ break;
+ }
+ case APIC_ESR:
+ if (apic_x2apic_mode(apic) && val != 0) {
+ apic_debug("KVM_WRITE:ESR not zero %x\n", val);
+ ret = 1;
+ }
+ break;
+
+ case APIC_SELF_IPI:
+ if (apic_x2apic_mode(apic)) {
+ kvm_lapic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff));
+ } else
+ ret = 1;
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+ if (ret)
+ apic_debug("Local APIC Write to read-only register %x\n", reg);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_reg_write);
+
+static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+ gpa_t address, int len, const void *data)
+{
+ struct kvm_lapic *apic = to_lapic(this);
+ unsigned int offset = address - apic->base_address;
+ u32 val;
+
+ if (!apic_mmio_in_range(apic, address))
+ return -EOPNOTSUPP;
+
+ if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) {
+ if (!kvm_check_has_quirk(vcpu->kvm,
+ KVM_X86_QUIRK_LAPIC_MMIO_HOLE))
+ return -EOPNOTSUPP;
+
+ return 0;
+ }
+
+ /*
+ * APIC register must be aligned on 128-bits boundary.
+ * 32/64/128 bits registers must be accessed thru 32 bits.
+ * Refer SDM 8.4.1
+ */
+ if (len != 4 || (offset & 0xf)) {
+ /* Don't shout loud, $infamous_os would cause only noise. */
+ apic_debug("apic write: bad size=%d %lx\n", len, (long)address);
+ return 0;
+ }
+
+ val = *(u32*)data;
+
+ /* too common printing */
+ if (offset != APIC_EOI)
+ apic_debug("%s: offset 0x%x with length 0x%x, and value is "
+ "0x%x\n", __func__, offset, len, val);
+
+ kvm_lapic_reg_write(apic, offset & 0xff0, val);
+
+ return 0;
+}
+
+void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
+{
+ kvm_lapic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
+
+/* emulate APIC access in a trap manner */
+void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
+{
+ u32 val = 0;
+
+ /* hw has done the conditional check and inst decode */
+ offset &= 0xff0;
+
+ kvm_lapic_reg_read(vcpu->arch.apic, offset, 4, &val);
+
+ /* TODO: optimize to just emulate side effect w/o one more write */
+ kvm_lapic_reg_write(vcpu->arch.apic, offset, val);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode);
+
+void kvm_free_lapic(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!vcpu->arch.apic)
+ return;
+
+ hrtimer_cancel(&apic->lapic_timer.timer);
+
+ if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE))
+ static_key_slow_dec_deferred(&apic_hw_disabled);
+
+ if (!apic->sw_enabled)
+ static_key_slow_dec_deferred(&apic_sw_disabled);
+
+ if (apic->regs)
+ free_page((unsigned long)apic->regs);
+
+ kfree(apic);
+}
+
+/*
+ *----------------------------------------------------------------------
+ * LAPIC interface
+ *----------------------------------------------------------------------
+ */
+u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!lapic_in_kernel(vcpu) ||
+ !apic_lvtt_tscdeadline(apic))
+ return 0;
+
+ return apic->lapic_timer.tscdeadline;
+}
+
+void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!kvm_apic_present(vcpu) || apic_lvtt_oneshot(apic) ||
+ apic_lvtt_period(apic))
+ return;
+
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ apic->lapic_timer.tscdeadline = data;
+ start_apic_timer(apic);
+}
+
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+ apic_set_tpr(vcpu->arch.apic, (cr8 & 0x0f) << 4);
+}
+
+u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
+{
+ u64 tpr;
+
+ tpr = (u64) kvm_lapic_get_reg(vcpu->arch.apic, APIC_TASKPRI);
+
+ return (tpr & 0xf0) >> 4;
+}
+
+void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
+{
+ u64 old_value = vcpu->arch.apic_base;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!apic)
+ value |= MSR_IA32_APICBASE_BSP;
+
+ vcpu->arch.apic_base = value;
+
+ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE)
+ kvm_update_cpuid(vcpu);
+
+ if (!apic)
+ return;
+
+ /* update jump label if enable bit changes */
+ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
+ if (value & MSR_IA32_APICBASE_ENABLE) {
+ kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+ static_key_slow_dec_deferred(&apic_hw_disabled);
+ } else {
+ static_key_slow_inc(&apic_hw_disabled.key);
+ recalculate_apic_map(vcpu->kvm);
+ }
+ }
+
+ if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE))
+ kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
+
+ if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE))
+ kvm_x86_ops->set_virtual_apic_mode(vcpu);
+
+ apic->base_address = apic->vcpu->arch.apic_base &
+ MSR_IA32_APICBASE_BASE;
+
+ if ((value & MSR_IA32_APICBASE_ENABLE) &&
+ apic->base_address != APIC_DEFAULT_PHYS_BASE)
+ pr_warn_once("APIC base relocation is unsupported by KVM");
+
+ /* with FSB delivery interrupt, we can restart APIC functionality */
+ apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
+ "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
+
+}
+
+void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ int i;
+
+ if (!apic)
+ return;
+
+ apic_debug("%s\n", __func__);
+
+ /* Stop the timer in case it's a reset to an active apic */
+ hrtimer_cancel(&apic->lapic_timer.timer);
+
+ if (!init_event) {
+ kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE |
+ MSR_IA32_APICBASE_ENABLE);
+ kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+ }
+ kvm_apic_set_version(apic->vcpu);
+
+ for (i = 0; i < KVM_APIC_LVT_NUM; i++)
+ kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
+ apic_update_lvtt(apic);
+ if (kvm_vcpu_is_reset_bsp(vcpu) &&
+ kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
+ kvm_lapic_set_reg(apic, APIC_LVT0,
+ SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
+ apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
+
+ kvm_lapic_set_reg(apic, APIC_DFR, 0xffffffffU);
+ apic_set_spiv(apic, 0xff);
+ kvm_lapic_set_reg(apic, APIC_TASKPRI, 0);
+ if (!apic_x2apic_mode(apic))
+ kvm_apic_set_ldr(apic, 0);
+ kvm_lapic_set_reg(apic, APIC_ESR, 0);
+ kvm_lapic_set_reg(apic, APIC_ICR, 0);
+ kvm_lapic_set_reg(apic, APIC_ICR2, 0);
+ kvm_lapic_set_reg(apic, APIC_TDCR, 0);
+ kvm_lapic_set_reg(apic, APIC_TMICT, 0);
+ for (i = 0; i < 8; i++) {
+ kvm_lapic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
+ kvm_lapic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
+ kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
+ }
+ apic->irr_pending = vcpu->arch.apicv_active;
+ apic->isr_count = vcpu->arch.apicv_active ? 1 : 0;
+ apic->highest_isr_cache = -1;
+ update_divide_count(apic);
+ atomic_set(&apic->lapic_timer.pending, 0);
+ if (kvm_vcpu_is_bsp(vcpu))
+ kvm_lapic_set_base(vcpu,
+ vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP);
+ vcpu->arch.pv_eoi.msr_val = 0;
+ apic_update_ppr(apic);
+ if (vcpu->arch.apicv_active) {
+ kvm_x86_ops->apicv_post_state_restore(vcpu);
+ kvm_x86_ops->hwapic_irr_update(vcpu, -1);
+ kvm_x86_ops->hwapic_isr_update(vcpu, -1);
+ }
+
+ vcpu->arch.apic_arb_prio = 0;
+ vcpu->arch.apic_attention = 0;
+
+ apic_debug("%s: vcpu=%p, id=0x%x, base_msr="
+ "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
+ vcpu, kvm_lapic_get_reg(apic, APIC_ID),
+ vcpu->arch.apic_base, apic->base_address);
+}
+
+/*
+ *----------------------------------------------------------------------
+ * timer interface
+ *----------------------------------------------------------------------
+ */
+
+static bool lapic_is_periodic(struct kvm_lapic *apic)
+{
+ return apic_lvtt_period(apic);
+}
+
+int apic_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (apic_enabled(apic) && apic_lvt_enabled(apic, APIC_LVTT))
+ return atomic_read(&apic->lapic_timer.pending);
+
+ return 0;
+}
+
+int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
+{
+ u32 reg = kvm_lapic_get_reg(apic, lvt_type);
+ int vector, mode, trig_mode;
+
+ if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
+ vector = reg & APIC_VECTOR_MASK;
+ mode = reg & APIC_MODE_MASK;
+ trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
+ return __apic_accept_irq(apic, mode, vector, 1, trig_mode,
+ NULL);
+ }
+ return 0;
+}
+
+void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (apic)
+ kvm_apic_local_deliver(apic, APIC_LVT0);
+}
+
+static const struct kvm_io_device_ops apic_mmio_ops = {
+ .read = apic_mmio_read,
+ .write = apic_mmio_write,
+};
+
+static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
+{
+ struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+ struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer);
+
+ apic_timer_expired(apic);
+
+ if (lapic_is_periodic(apic)) {
+ advance_periodic_target_expiration(apic);
+ hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+ return HRTIMER_RESTART;
+ } else
+ return HRTIMER_NORESTART;
+}
+
+int kvm_create_lapic(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic;
+
+ ASSERT(vcpu != NULL);
+ apic_debug("apic_init %d\n", vcpu->vcpu_id);
+
+ apic = kzalloc(sizeof(*apic), GFP_KERNEL);
+ if (!apic)
+ goto nomem;
+
+ vcpu->arch.apic = apic;
+
+ apic->regs = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!apic->regs) {
+ printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
+ vcpu->vcpu_id);
+ goto nomem_free_apic;
+ }
+ apic->vcpu = vcpu;
+
+ hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED);
+ apic->lapic_timer.timer.function = apic_timer_fn;
+
+ /*
+ * APIC is created enabled. This will prevent kvm_lapic_set_base from
+ * thinking that APIC satet has changed.
+ */
+ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
+ static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
+ kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
+
+ return 0;
+nomem_free_apic:
+ kfree(apic);
+nomem:
+ return -ENOMEM;
+}
+
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 ppr;
+
+ if (!kvm_apic_present(vcpu))
+ return -1;
+
+ __apic_update_ppr(apic, &ppr);
+ return apic_has_interrupt_for_ppr(apic, ppr);
+}
+
+int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
+{
+ u32 lvt0 = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVT0);
+ int r = 0;
+
+ if (!kvm_apic_hw_enabled(vcpu->arch.apic))
+ r = 1;
+ if ((lvt0 & APIC_LVT_MASKED) == 0 &&
+ GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
+ r = 1;
+ return r;
+}
+
+void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (atomic_read(&apic->lapic_timer.pending) > 0) {
+ kvm_apic_local_deliver(apic, APIC_LVTT);
+ if (apic_lvtt_tscdeadline(apic))
+ apic->lapic_timer.tscdeadline = 0;
+ if (apic_lvtt_oneshot(apic)) {
+ apic->lapic_timer.tscdeadline = 0;
+ apic->lapic_timer.target_expiration = 0;
+ }
+ atomic_set(&apic->lapic_timer.pending, 0);
+ }
+}
+
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
+{
+ int vector = kvm_apic_has_interrupt(vcpu);
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 ppr;
+
+ if (vector == -1)
+ return -1;
+
+ /*
+ * We get here even with APIC virtualization enabled, if doing
+ * nested virtualization and L1 runs with the "acknowledge interrupt
+ * on exit" mode. Then we cannot inject the interrupt via RVI,
+ * because the process would deliver it through the IDT.
+ */
+
+ apic_clear_irr(vector, apic);
+ if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) {
+ /*
+ * For auto-EOI interrupts, there might be another pending
+ * interrupt above PPR, so check whether to raise another
+ * KVM_REQ_EVENT.
+ */
+ apic_update_ppr(apic);
+ } else {
+ /*
+ * For normal interrupts, PPR has been raised and there cannot
+ * be a higher-priority pending interrupt---except if there was
+ * a concurrent interrupt injection, but that would have
+ * triggered KVM_REQ_EVENT already.
+ */
+ apic_set_isr(vector, apic);
+ __apic_update_ppr(apic, &ppr);
+ }
+
+ return vector;
+}
+
+static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s, bool set)
+{
+ if (apic_x2apic_mode(vcpu->arch.apic)) {
+ u32 *id = (u32 *)(s->regs + APIC_ID);
+ u32 *ldr = (u32 *)(s->regs + APIC_LDR);
+
+ if (vcpu->kvm->arch.x2apic_format) {
+ if (*id != vcpu->vcpu_id)
+ return -EINVAL;
+ } else {
+ if (set)
+ *id >>= 24;
+ else
+ *id <<= 24;
+ }
+
+ /* In x2APIC mode, the LDR is fixed and based on the id */
+ if (set)
+ *ldr = kvm_apic_calc_x2apic_ldr(*id);
+ }
+
+ return 0;
+}
+
+int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
+{
+ memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s));
+ return kvm_apic_state_fixup(vcpu, s, false);
+}
+
+int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ int r;
+
+
+ kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
+ /* set SPIV separately to get count of SW disabled APICs right */
+ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
+
+ r = kvm_apic_state_fixup(vcpu, s, true);
+ if (r)
+ return r;
+ memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
+
+ recalculate_apic_map(vcpu->kvm);
+ kvm_apic_set_version(vcpu);
+
+ apic_update_ppr(apic);
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ apic_update_lvtt(apic);
+ apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
+ update_divide_count(apic);
+ start_apic_timer(apic);
+ apic->irr_pending = true;
+ apic->isr_count = vcpu->arch.apicv_active ?
+ 1 : count_vectors(apic->regs + APIC_ISR);
+ apic->highest_isr_cache = -1;
+ if (vcpu->arch.apicv_active) {
+ kvm_x86_ops->apicv_post_state_restore(vcpu);
+ kvm_x86_ops->hwapic_irr_update(vcpu,
+ apic_find_highest_irr(apic));
+ kvm_x86_ops->hwapic_isr_update(vcpu,
+ apic_find_highest_isr(apic));
+ }
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ if (ioapic_in_kernel(vcpu->kvm))
+ kvm_rtc_eoi_tracking_restore_one(vcpu);
+
+ vcpu->arch.apic_arb_prio = 0;
+
+ return 0;
+}
+
+void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
+{
+ struct hrtimer *timer;
+
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ timer = &vcpu->arch.apic->lapic_timer.timer;
+ if (hrtimer_cancel(timer))
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+}
+
+/*
+ * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt
+ *
+ * Detect whether guest triggered PV EOI since the
+ * last entry. If yes, set EOI on guests's behalf.
+ * Clear PV EOI in guest memory in any case.
+ */
+static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu,
+ struct kvm_lapic *apic)
+{
+ bool pending;
+ int vector;
+ /*
+ * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host
+ * and KVM_PV_EOI_ENABLED in guest memory as follows:
+ *
+ * KVM_APIC_PV_EOI_PENDING is unset:
+ * -> host disabled PV EOI.
+ * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set:
+ * -> host enabled PV EOI, guest did not execute EOI yet.
+ * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset:
+ * -> host enabled PV EOI, guest executed EOI.
+ */
+ BUG_ON(!pv_eoi_enabled(vcpu));
+ pending = pv_eoi_get_pending(vcpu);
+ /*
+ * Clear pending bit in any case: it will be set again on vmentry.
+ * While this might not be ideal from performance point of view,
+ * this makes sure pv eoi is only enabled when we know it's safe.
+ */
+ pv_eoi_clr_pending(vcpu);
+ if (pending)
+ return;
+ vector = apic_set_eoi(apic);
+ trace_kvm_pv_eoi(apic, vector);
+}
+
+void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
+{
+ u32 data;
+
+ if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention))
+ apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic);
+
+ if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
+ return;
+
+ if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
+ sizeof(u32)))
+ return;
+
+ apic_set_tpr(vcpu->arch.apic, data & 0xff);
+}
+
+/*
+ * apic_sync_pv_eoi_to_guest - called before vmentry
+ *
+ * Detect whether it's safe to enable PV EOI and
+ * if yes do so.
+ */
+static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
+ struct kvm_lapic *apic)
+{
+ if (!pv_eoi_enabled(vcpu) ||
+ /* IRR set or many bits in ISR: could be nested. */
+ apic->irr_pending ||
+ /* Cache not set: could be safe but we don't bother. */
+ apic->highest_isr_cache == -1 ||
+ /* Need EOI to update ioapic. */
+ kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) {
+ /*
+ * PV EOI was disabled by apic_sync_pv_eoi_from_guest
+ * so we need not do anything here.
+ */
+ return;
+ }
+
+ pv_eoi_set_pending(apic->vcpu);
+}
+
+void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
+{
+ u32 data, tpr;
+ int max_irr, max_isr;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ apic_sync_pv_eoi_to_guest(vcpu, apic);
+
+ if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
+ return;
+
+ tpr = kvm_lapic_get_reg(apic, APIC_TASKPRI) & 0xff;
+ max_irr = apic_find_highest_irr(apic);
+ if (max_irr < 0)
+ max_irr = 0;
+ max_isr = apic_find_highest_isr(apic);
+ if (max_isr < 0)
+ max_isr = 0;
+ data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
+
+ kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
+ sizeof(u32));
+}
+
+int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
+{
+ if (vapic_addr) {
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ &vcpu->arch.apic->vapic_cache,
+ vapic_addr, sizeof(u32)))
+ return -EINVAL;
+ __set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
+ } else {
+ __clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
+ }
+
+ vcpu->arch.apic->vapic_addr = vapic_addr;
+ return 0;
+}
+
+int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = (msr - APIC_BASE_MSR) << 4;
+
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
+ return 1;
+
+ if (reg == APIC_ICR2)
+ return 1;
+
+ /* if this is ICR write vector before command */
+ if (reg == APIC_ICR)
+ kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
+ return kvm_lapic_reg_write(apic, reg, (u32)data);
+}
+
+int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
+
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
+ return 1;
+
+ if (reg == APIC_DFR || reg == APIC_ICR2) {
+ apic_debug("KVM_APIC_READ: read x2apic reserved register %x\n",
+ reg);
+ return 1;
+ }
+
+ if (kvm_lapic_reg_read(apic, reg, 4, &low))
+ return 1;
+ if (reg == APIC_ICR)
+ kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high);
+
+ *data = (((u64)high) << 32) | low;
+
+ return 0;
+}
+
+int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!lapic_in_kernel(vcpu))
+ return 1;
+
+ /* if this is ICR write vector before command */
+ if (reg == APIC_ICR)
+ kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
+ return kvm_lapic_reg_write(apic, reg, (u32)data);
+}
+
+int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 low, high = 0;
+
+ if (!lapic_in_kernel(vcpu))
+ return 1;
+
+ if (kvm_lapic_reg_read(apic, reg, 4, &low))
+ return 1;
+ if (reg == APIC_ICR)
+ kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high);
+
+ *data = (((u64)high) << 32) | low;
+
+ return 0;
+}
+
+int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len)
+{
+ u64 addr = data & ~KVM_MSR_ENABLED;
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data;
+ unsigned long new_len;
+
+ if (!IS_ALIGNED(addr, 4))
+ return 1;
+
+ vcpu->arch.pv_eoi.msr_val = data;
+ if (!pv_eoi_enabled(vcpu))
+ return 0;
+
+ if (addr == ghc->gpa && len <= ghc->len)
+ new_len = ghc->len;
+ else
+ new_len = len;
+
+ return kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len);
+}
+
+void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u8 sipi_vector;
+ unsigned long pe;
+
+ if (!lapic_in_kernel(vcpu) || !apic->pending_events)
+ return;
+
+ /*
+ * INITs are latched while in SMM. Because an SMM CPU cannot
+ * be in KVM_MP_STATE_INIT_RECEIVED state, just eat SIPIs
+ * and delay processing of INIT until the next RSM.
+ */
+ if (is_smm(vcpu)) {
+ WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED);
+ if (test_bit(KVM_APIC_SIPI, &apic->pending_events))
+ clear_bit(KVM_APIC_SIPI, &apic->pending_events);
+ return;
+ }
+
+ pe = xchg(&apic->pending_events, 0);
+ if (test_bit(KVM_APIC_INIT, &pe)) {
+ kvm_vcpu_reset(vcpu, true);
+ if (kvm_vcpu_is_bsp(apic->vcpu))
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ else
+ vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+ }
+ if (test_bit(KVM_APIC_SIPI, &pe) &&
+ vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+ /* evaluate pending_events before reading the vector */
+ smp_rmb();
+ sipi_vector = apic->sipi_vector;
+ apic_debug("vcpu %d received sipi with vector # %x\n",
+ vcpu->vcpu_id, sipi_vector);
+ kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector);
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ }
+}
+
+void kvm_lapic_init(void)
+{
+ /* do not patch jump label more than once per second */
+ jump_label_rate_limit(&apic_hw_disabled, HZ);
+ jump_label_rate_limit(&apic_sw_disabled, HZ);
+}
+
+void kvm_lapic_exit(void)
+{
+ static_key_deferred_flush(&apic_hw_disabled);
+ static_key_deferred_flush(&apic_sw_disabled);
+}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
new file mode 100644
index 000000000..ff6ef9c3d
--- /dev/null
+++ b/arch/x86/kvm/lapic.h
@@ -0,0 +1,237 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_LAPIC_H
+#define __KVM_X86_LAPIC_H
+
+#include <kvm/iodev.h>
+
+#include <linux/kvm_host.h>
+
+#define KVM_APIC_INIT 0
+#define KVM_APIC_SIPI 1
+#define KVM_APIC_LVT_NUM 6
+
+#define KVM_APIC_SHORT_MASK 0xc0000
+#define KVM_APIC_DEST_MASK 0x800
+
+#define APIC_BUS_CYCLE_NS 1
+#define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS)
+
+enum lapic_mode {
+ LAPIC_MODE_DISABLED = 0,
+ LAPIC_MODE_INVALID = X2APIC_ENABLE,
+ LAPIC_MODE_XAPIC = MSR_IA32_APICBASE_ENABLE,
+ LAPIC_MODE_X2APIC = MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE,
+};
+
+struct kvm_timer {
+ struct hrtimer timer;
+ s64 period; /* unit: ns */
+ ktime_t target_expiration;
+ u32 timer_mode;
+ u32 timer_mode_mask;
+ u64 tscdeadline;
+ u64 expired_tscdeadline;
+ atomic_t pending; /* accumulated triggered timers */
+ bool hv_timer_in_use;
+};
+
+struct kvm_lapic {
+ unsigned long base_address;
+ struct kvm_io_device dev;
+ struct kvm_timer lapic_timer;
+ u32 divide_count;
+ struct kvm_vcpu *vcpu;
+ bool sw_enabled;
+ bool irr_pending;
+ bool lvt0_in_nmi_mode;
+ /* Number of bits set in ISR. */
+ s16 isr_count;
+ /* The highest vector set in ISR; if -1 - invalid, must scan ISR. */
+ int highest_isr_cache;
+ /**
+ * APIC register page. The layout matches the register layout seen by
+ * the guest 1:1, because it is accessed by the vmx microcode.
+ * Note: Only one register, the TPR, is used by the microcode.
+ */
+ void *regs;
+ gpa_t vapic_addr;
+ struct gfn_to_hva_cache vapic_cache;
+ unsigned long pending_events;
+ unsigned int sipi_vector;
+};
+
+struct dest_map;
+
+int kvm_create_lapic(struct kvm_vcpu *vcpu);
+void kvm_free_lapic(struct kvm_vcpu *vcpu);
+
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
+void kvm_apic_accept_events(struct kvm_vcpu *vcpu);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event);
+u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
+void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
+void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
+u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
+void kvm_apic_set_version(struct kvm_vcpu *vcpu);
+int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val);
+int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
+ void *data);
+bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+ int short_hand, unsigned int dest, int dest_mode);
+
+bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr);
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr);
+void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
+ struct dest_map *dest_map);
+int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
+
+bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map);
+
+u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
+int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu);
+int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
+
+u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
+void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
+
+void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset);
+void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector);
+
+int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
+void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
+void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
+
+int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+
+int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+
+static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
+}
+
+int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len);
+void kvm_lapic_init(void);
+void kvm_lapic_exit(void);
+
+#define VEC_POS(v) ((v) & (32 - 1))
+#define REG_POS(v) (((v) >> 5) << 4)
+
+static inline void kvm_lapic_set_vector(int vec, void *bitmap)
+{
+ set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic)
+{
+ kvm_lapic_set_vector(vec, apic->regs + APIC_IRR);
+ /*
+ * irr_pending must be true if any interrupt is pending; set it after
+ * APIC_IRR to avoid race with apic_clear_irr
+ */
+ apic->irr_pending = true;
+}
+
+static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off)
+{
+ return *((u32 *) (apic->regs + reg_off));
+}
+
+static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
+{
+ *((u32 *) (apic->regs + reg_off)) = val;
+}
+
+extern struct static_key kvm_no_apic_vcpu;
+
+static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu)
+{
+ if (static_key_false(&kvm_no_apic_vcpu))
+ return vcpu->arch.apic;
+ return true;
+}
+
+extern struct static_key_deferred apic_hw_disabled;
+
+static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic)
+{
+ if (static_key_false(&apic_hw_disabled.key))
+ return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
+ return MSR_IA32_APICBASE_ENABLE;
+}
+
+extern struct static_key_deferred apic_sw_disabled;
+
+static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic)
+{
+ if (static_key_false(&apic_sw_disabled.key))
+ return apic->sw_enabled;
+ return true;
+}
+
+static inline bool kvm_apic_present(struct kvm_vcpu *vcpu)
+{
+ return lapic_in_kernel(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic);
+}
+
+static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+{
+ return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
+}
+
+static inline int apic_x2apic_mode(struct kvm_lapic *apic)
+{
+ return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
+}
+
+static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.apic && vcpu->arch.apicv_active;
+}
+
+static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
+{
+ return lapic_in_kernel(vcpu) && vcpu->arch.apic->pending_events;
+}
+
+static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
+{
+ return (irq->delivery_mode == APIC_DM_LOWEST ||
+ irq->msi_redir_hint);
+}
+
+static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
+{
+ return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+}
+
+bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
+
+void wait_lapic_expire(struct kvm_vcpu *vcpu);
+
+bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu);
+int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
+ const unsigned long *bitmap, u32 bitmap_size);
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
+void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu);
+
+static inline enum lapic_mode kvm_apic_mode(u64 apic_base)
+{
+ return apic_base & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
+}
+
+#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
new file mode 100644
index 000000000..0cb82172c
--- /dev/null
+++ b/arch/x86/kvm/mmu.c
@@ -0,0 +1,6292 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "irq.h"
+#include "mmu.h"
+#include "x86.h"
+#include "kvm_cache_regs.h"
+#include "cpuid.h"
+
+#include <linux/kvm_host.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/moduleparam.h>
+#include <linux/export.h>
+#include <linux/swap.h>
+#include <linux/hugetlb.h>
+#include <linux/compiler.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/sched/signal.h>
+#include <linux/uaccess.h>
+#include <linux/hash.h>
+#include <linux/kern_levels.h>
+#include <linux/kthread.h>
+
+#include <asm/page.h>
+#include <asm/pat.h>
+#include <asm/cmpxchg.h>
+#include <asm/io.h>
+#include <asm/vmx.h>
+#include <asm/kvm_page_track.h>
+#include "trace.h"
+
+extern bool itlb_multihit_kvm_mitigation;
+
+static int __read_mostly nx_huge_pages = -1;
+static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
+
+static struct kernel_param_ops nx_huge_pages_ops = {
+ .set = set_nx_huge_pages,
+ .get = param_get_bool,
+};
+
+static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
+ .set = set_nx_huge_pages_recovery_ratio,
+ .get = param_get_uint,
+};
+
+module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages, "bool");
+module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
+ &nx_huge_pages_recovery_ratio, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
+
+/*
+ * When setting this variable to true it enables Two-Dimensional-Paging
+ * where the hardware walks 2 page tables:
+ * 1. the guest-virtual to guest-physical
+ * 2. while doing 1. it walks guest-physical to host-physical
+ * If the hardware supports that we don't need to do shadow paging.
+ */
+bool tdp_enabled = false;
+
+enum {
+ AUDIT_PRE_PAGE_FAULT,
+ AUDIT_POST_PAGE_FAULT,
+ AUDIT_PRE_PTE_WRITE,
+ AUDIT_POST_PTE_WRITE,
+ AUDIT_PRE_SYNC,
+ AUDIT_POST_SYNC
+};
+
+#undef MMU_DEBUG
+
+#ifdef MMU_DEBUG
+static bool dbg = 0;
+module_param(dbg, bool, 0644);
+
+#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
+#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
+#define MMU_WARN_ON(x) WARN_ON(x)
+#else
+#define pgprintk(x...) do { } while (0)
+#define rmap_printk(x...) do { } while (0)
+#define MMU_WARN_ON(x) do { } while (0)
+#endif
+
+#define PTE_PREFETCH_NUM 8
+
+#define PT_FIRST_AVAIL_BITS_SHIFT 10
+#define PT64_SECOND_AVAIL_BITS_SHIFT 52
+
+#define PT64_LEVEL_BITS 9
+
+#define PT64_LEVEL_SHIFT(level) \
+ (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
+
+#define PT64_INDEX(address, level)\
+ (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
+
+
+#define PT32_LEVEL_BITS 10
+
+#define PT32_LEVEL_SHIFT(level) \
+ (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
+
+#define PT32_LVL_OFFSET_MASK(level) \
+ (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
+ * PT32_LEVEL_BITS))) - 1))
+
+#define PT32_INDEX(address, level)\
+ (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
+
+
+#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
+#define PT64_DIR_BASE_ADDR_MASK \
+ (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
+#define PT64_LVL_ADDR_MASK(level) \
+ (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
+ * PT64_LEVEL_BITS))) - 1))
+#define PT64_LVL_OFFSET_MASK(level) \
+ (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
+ * PT64_LEVEL_BITS))) - 1))
+
+#define PT32_BASE_ADDR_MASK PAGE_MASK
+#define PT32_DIR_BASE_ADDR_MASK \
+ (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
+#define PT32_LVL_ADDR_MASK(level) \
+ (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
+ * PT32_LEVEL_BITS))) - 1))
+
+#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
+ | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
+
+#define ACC_EXEC_MASK 1
+#define ACC_WRITE_MASK PT_WRITABLE_MASK
+#define ACC_USER_MASK PT_USER_MASK
+#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+
+/* The mask for the R/X bits in EPT PTEs */
+#define PT64_EPT_READABLE_MASK 0x1ull
+#define PT64_EPT_EXECUTABLE_MASK 0x4ull
+
+#include <trace/events/kvm.h>
+
+#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
+
+#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+
+/* make pte_list_desc fit well in cache line */
+#define PTE_LIST_EXT 3
+
+/*
+ * Return values of handle_mmio_page_fault and mmu.page_fault:
+ * RET_PF_RETRY: let CPU fault again on the address.
+ * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
+ *
+ * For handle_mmio_page_fault only:
+ * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
+ */
+enum {
+ RET_PF_RETRY = 0,
+ RET_PF_EMULATE = 1,
+ RET_PF_INVALID = 2,
+};
+
+struct pte_list_desc {
+ u64 *sptes[PTE_LIST_EXT];
+ struct pte_list_desc *more;
+};
+
+struct kvm_shadow_walk_iterator {
+ u64 addr;
+ hpa_t shadow_addr;
+ u64 *sptep;
+ int level;
+ unsigned index;
+};
+
+static const union kvm_mmu_page_role mmu_base_role_mask = {
+ .cr0_wp = 1,
+ .cr4_pae = 1,
+ .nxe = 1,
+ .smep_andnot_wp = 1,
+ .smap_andnot_wp = 1,
+ .smm = 1,
+ .guest_mode = 1,
+ .ad_disabled = 1,
+};
+
+#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker) \
+ for (shadow_walk_init_using_root(&(_walker), (_vcpu), \
+ (_root), (_addr)); \
+ shadow_walk_okay(&(_walker)); \
+ shadow_walk_next(&(_walker)))
+
+#define for_each_shadow_entry(_vcpu, _addr, _walker) \
+ for (shadow_walk_init(&(_walker), _vcpu, _addr); \
+ shadow_walk_okay(&(_walker)); \
+ shadow_walk_next(&(_walker)))
+
+#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \
+ for (shadow_walk_init(&(_walker), _vcpu, _addr); \
+ shadow_walk_okay(&(_walker)) && \
+ ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \
+ __shadow_walk_next(&(_walker), spte))
+
+static struct kmem_cache *pte_list_desc_cache;
+static struct kmem_cache *mmu_page_header_cache;
+static struct percpu_counter kvm_total_used_mmu_pages;
+
+static u64 __read_mostly shadow_nx_mask;
+static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
+static u64 __read_mostly shadow_user_mask;
+static u64 __read_mostly shadow_accessed_mask;
+static u64 __read_mostly shadow_dirty_mask;
+static u64 __read_mostly shadow_mmio_mask;
+static u64 __read_mostly shadow_mmio_value;
+static u64 __read_mostly shadow_present_mask;
+static u64 __read_mostly shadow_me_mask;
+
+/*
+ * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
+ * Non-present SPTEs with shadow_acc_track_value set are in place for access
+ * tracking.
+ */
+static u64 __read_mostly shadow_acc_track_mask;
+static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
+
+/*
+ * The mask/shift to use for saving the original R/X bits when marking the PTE
+ * as not-present for access tracking purposes. We do not save the W bit as the
+ * PTEs being access tracked also need to be dirty tracked, so the W bit will be
+ * restored only when a write is attempted to the page.
+ */
+static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
+ PT64_EPT_EXECUTABLE_MASK;
+static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
+
+/*
+ * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
+ * to guard against L1TF attacks.
+ */
+static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
+
+/*
+ * The number of high-order 1 bits to use in the mask above.
+ */
+static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
+
+/*
+ * In some cases, we need to preserve the GFN of a non-present or reserved
+ * SPTE when we usurp the upper five bits of the physical address space to
+ * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll
+ * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
+ * left into the reserved bits, i.e. the GFN in the SPTE will be split into
+ * high and low parts. This mask covers the lower bits of the GFN.
+ */
+static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
+
+/*
+ * The number of non-reserved physical address bits irrespective of features
+ * that repurpose legal bits, e.g. MKTME.
+ */
+static u8 __read_mostly shadow_phys_bits;
+
+static void mmu_spte_set(u64 *sptep, u64 spte);
+static bool is_executable_pte(u64 spte);
+static union kvm_mmu_page_role
+kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
+
+#define CREATE_TRACE_POINTS
+#include "mmutrace.h"
+
+
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
+{
+ BUG_ON((mmio_mask & mmio_value) != mmio_value);
+ WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len));
+ WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
+ shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK;
+ shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+
+static bool is_mmio_spte(u64 spte)
+{
+ return (spte & shadow_mmio_mask) == shadow_mmio_value;
+}
+
+static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
+{
+ return sp->role.ad_disabled;
+}
+
+static inline bool spte_ad_enabled(u64 spte)
+{
+ MMU_WARN_ON(is_mmio_spte(spte));
+ return !(spte & shadow_acc_track_value);
+}
+
+static bool is_nx_huge_page_enabled(void)
+{
+ return READ_ONCE(nx_huge_pages);
+}
+
+static inline u64 spte_shadow_accessed_mask(u64 spte)
+{
+ MMU_WARN_ON(is_mmio_spte(spte));
+ return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
+}
+
+static inline u64 spte_shadow_dirty_mask(u64 spte)
+{
+ MMU_WARN_ON(is_mmio_spte(spte));
+ return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
+}
+
+static inline bool is_access_track_spte(u64 spte)
+{
+ return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
+}
+
+/*
+ * the low bit of the generation number is always presumed to be zero.
+ * This disables mmio caching during memslot updates. The concept is
+ * similar to a seqcount but instead of retrying the access we just punt
+ * and ignore the cache.
+ *
+ * spte bits 3-11 are used as bits 1-9 of the generation number,
+ * the bits 52-61 are used as bits 10-19 of the generation number.
+ */
+#define MMIO_SPTE_GEN_LOW_SHIFT 2
+#define MMIO_SPTE_GEN_HIGH_SHIFT 52
+
+#define MMIO_GEN_SHIFT 20
+#define MMIO_GEN_LOW_SHIFT 10
+#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2)
+#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1)
+
+static u64 generation_mmio_spte_mask(unsigned int gen)
+{
+ u64 mask;
+
+ WARN_ON(gen & ~MMIO_GEN_MASK);
+
+ mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
+ mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
+ return mask;
+}
+
+static unsigned int get_mmio_spte_generation(u64 spte)
+{
+ unsigned int gen;
+
+ spte &= ~shadow_mmio_mask;
+
+ gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
+ gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
+ return gen;
+}
+
+static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK;
+}
+
+static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
+ unsigned access)
+{
+ unsigned int gen = kvm_current_mmio_generation(vcpu);
+ u64 mask = generation_mmio_spte_mask(gen);
+ u64 gpa = gfn << PAGE_SHIFT;
+
+ access &= ACC_WRITE_MASK | ACC_USER_MASK;
+ mask |= shadow_mmio_value | access;
+ mask |= gpa | shadow_nonpresent_or_rsvd_mask;
+ mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
+ << shadow_nonpresent_or_rsvd_mask_len;
+
+ trace_mark_mmio_spte(sptep, gfn, access, gen);
+ mmu_spte_set(sptep, mask);
+}
+
+static gfn_t get_mmio_spte_gfn(u64 spte)
+{
+ u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
+
+ gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
+ & shadow_nonpresent_or_rsvd_mask;
+
+ return gpa >> PAGE_SHIFT;
+}
+
+static unsigned get_mmio_spte_access(u64 spte)
+{
+ u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask;
+ return (spte & ~mask) & ~PAGE_MASK;
+}
+
+static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
+ kvm_pfn_t pfn, unsigned access)
+{
+ if (unlikely(is_noslot_pfn(pfn))) {
+ mark_mmio_spte(vcpu, sptep, gfn, access);
+ return true;
+ }
+
+ return false;
+}
+
+static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
+{
+ unsigned int kvm_gen, spte_gen;
+
+ kvm_gen = kvm_current_mmio_generation(vcpu);
+ spte_gen = get_mmio_spte_generation(spte);
+
+ trace_check_mmio_spte(spte, kvm_gen, spte_gen);
+ return likely(kvm_gen == spte_gen);
+}
+
+/*
+ * Sets the shadow PTE masks used by the MMU.
+ *
+ * Assumptions:
+ * - Setting either @accessed_mask or @dirty_mask requires setting both
+ * - At least one of @accessed_mask or @acc_track_mask must be set
+ */
+void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
+ u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
+ u64 acc_track_mask, u64 me_mask)
+{
+ BUG_ON(!dirty_mask != !accessed_mask);
+ BUG_ON(!accessed_mask && !acc_track_mask);
+ BUG_ON(acc_track_mask & shadow_acc_track_value);
+
+ shadow_user_mask = user_mask;
+ shadow_accessed_mask = accessed_mask;
+ shadow_dirty_mask = dirty_mask;
+ shadow_nx_mask = nx_mask;
+ shadow_x_mask = x_mask;
+ shadow_present_mask = p_mask;
+ shadow_acc_track_mask = acc_track_mask;
+ shadow_me_mask = me_mask;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
+
+static u8 kvm_get_shadow_phys_bits(void)
+{
+ /*
+ * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected
+ * in CPU detection code, but MKTME treats those reduced bits as
+ * 'keyID' thus they are not reserved bits. Therefore for MKTME
+ * we should still return physical address bits reported by CPUID.
+ */
+ if (!boot_cpu_has(X86_FEATURE_TME) ||
+ WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008))
+ return boot_cpu_data.x86_phys_bits;
+
+ return cpuid_eax(0x80000008) & 0xff;
+}
+
+static void kvm_mmu_reset_all_pte_masks(void)
+{
+ u8 low_phys_bits;
+
+ shadow_user_mask = 0;
+ shadow_accessed_mask = 0;
+ shadow_dirty_mask = 0;
+ shadow_nx_mask = 0;
+ shadow_x_mask = 0;
+ shadow_mmio_mask = 0;
+ shadow_present_mask = 0;
+ shadow_acc_track_mask = 0;
+
+ shadow_phys_bits = kvm_get_shadow_phys_bits();
+
+ /*
+ * If the CPU has 46 or less physical address bits, then set an
+ * appropriate mask to guard against L1TF attacks. Otherwise, it is
+ * assumed that the CPU is not vulnerable to L1TF.
+ *
+ * Some Intel CPUs address the L1 cache using more PA bits than are
+ * reported by CPUID. Use the PA width of the L1 cache when possible
+ * to achieve more effective mitigation, e.g. if system RAM overlaps
+ * the most significant bits of legal physical address space.
+ */
+ shadow_nonpresent_or_rsvd_mask = 0;
+ low_phys_bits = boot_cpu_data.x86_phys_bits;
+ if (boot_cpu_has_bug(X86_BUG_L1TF) &&
+ !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
+ 52 - shadow_nonpresent_or_rsvd_mask_len)) {
+ low_phys_bits = boot_cpu_data.x86_cache_bits
+ - shadow_nonpresent_or_rsvd_mask_len;
+ shadow_nonpresent_or_rsvd_mask =
+ rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
+ }
+
+ shadow_nonpresent_or_rsvd_lower_gfn_mask =
+ GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
+}
+
+static int is_cpuid_PSE36(void)
+{
+ return 1;
+}
+
+static int is_nx(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.efer & EFER_NX;
+}
+
+static int is_shadow_present_pte(u64 pte)
+{
+ return (pte != 0) && !is_mmio_spte(pte);
+}
+
+static int is_large_pte(u64 pte)
+{
+ return pte & PT_PAGE_SIZE_MASK;
+}
+
+static int is_last_spte(u64 pte, int level)
+{
+ if (level == PT_PAGE_TABLE_LEVEL)
+ return 1;
+ if (is_large_pte(pte))
+ return 1;
+ return 0;
+}
+
+static bool is_executable_pte(u64 spte)
+{
+ return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
+}
+
+static kvm_pfn_t spte_to_pfn(u64 pte)
+{
+ return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+}
+
+static gfn_t pse36_gfn_delta(u32 gpte)
+{
+ int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
+
+ return (gpte & PT32_DIR_PSE36_MASK) << shift;
+}
+
+#ifdef CONFIG_X86_64
+static void __set_spte(u64 *sptep, u64 spte)
+{
+ WRITE_ONCE(*sptep, spte);
+}
+
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
+{
+ WRITE_ONCE(*sptep, spte);
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+ return xchg(sptep, spte);
+}
+
+static u64 __get_spte_lockless(u64 *sptep)
+{
+ return READ_ONCE(*sptep);
+}
+#else
+union split_spte {
+ struct {
+ u32 spte_low;
+ u32 spte_high;
+ };
+ u64 spte;
+};
+
+static void count_spte_clear(u64 *sptep, u64 spte)
+{
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+ if (is_shadow_present_pte(spte))
+ return;
+
+ /* Ensure the spte is completely set before we increase the count */
+ smp_wmb();
+ sp->clear_spte_count++;
+}
+
+static void __set_spte(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte;
+
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ ssptep->spte_high = sspte.spte_high;
+
+ /*
+ * If we map the spte from nonpresent to present, We should store
+ * the high bits firstly, then set present bit, so cpu can not
+ * fetch this spte while we are setting the spte.
+ */
+ smp_wmb();
+
+ WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
+}
+
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte;
+
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
+
+ /*
+ * If we map the spte from present to nonpresent, we should clear
+ * present bit firstly to avoid vcpu fetch the old high bits.
+ */
+ smp_wmb();
+
+ ssptep->spte_high = sspte.spte_high;
+ count_spte_clear(sptep, spte);
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte, orig;
+
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ /* xchg acts as a barrier before the setting of the high bits */
+ orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
+ orig.spte_high = ssptep->spte_high;
+ ssptep->spte_high = sspte.spte_high;
+ count_spte_clear(sptep, spte);
+
+ return orig.spte;
+}
+
+/*
+ * The idea using the light way get the spte on x86_32 guest is from
+ * gup_get_pte(arch/x86/mm/gup.c).
+ *
+ * An spte tlb flush may be pending, because kvm_set_pte_rmapp
+ * coalesces them and we are running out of the MMU lock. Therefore
+ * we need to protect against in-progress updates of the spte.
+ *
+ * Reading the spte while an update is in progress may get the old value
+ * for the high part of the spte. The race is fine for a present->non-present
+ * change (because the high part of the spte is ignored for non-present spte),
+ * but for a present->present change we must reread the spte.
+ *
+ * All such changes are done in two steps (present->non-present and
+ * non-present->present), hence it is enough to count the number of
+ * present->non-present updates: if it changed while reading the spte,
+ * we might have hit the race. This is done using clear_spte_count.
+ */
+static u64 __get_spte_lockless(u64 *sptep)
+{
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ union split_spte spte, *orig = (union split_spte *)sptep;
+ int count;
+
+retry:
+ count = sp->clear_spte_count;
+ smp_rmb();
+
+ spte.spte_low = orig->spte_low;
+ smp_rmb();
+
+ spte.spte_high = orig->spte_high;
+ smp_rmb();
+
+ if (unlikely(spte.spte_low != orig->spte_low ||
+ count != sp->clear_spte_count))
+ goto retry;
+
+ return spte.spte;
+}
+#endif
+
+static bool spte_can_locklessly_be_made_writable(u64 spte)
+{
+ return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
+ (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
+}
+
+static bool spte_has_volatile_bits(u64 spte)
+{
+ if (!is_shadow_present_pte(spte))
+ return false;
+
+ /*
+ * Always atomically update spte if it can be updated
+ * out of mmu-lock, it can ensure dirty bit is not lost,
+ * also, it can help us to get a stable is_writable_pte()
+ * to ensure tlb flush is not missed.
+ */
+ if (spte_can_locklessly_be_made_writable(spte) ||
+ is_access_track_spte(spte))
+ return true;
+
+ if (spte_ad_enabled(spte)) {
+ if ((spte & shadow_accessed_mask) == 0 ||
+ (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
+ return true;
+ }
+
+ return false;
+}
+
+static bool is_accessed_spte(u64 spte)
+{
+ u64 accessed_mask = spte_shadow_accessed_mask(spte);
+
+ return accessed_mask ? spte & accessed_mask
+ : !is_access_track_spte(spte);
+}
+
+static bool is_dirty_spte(u64 spte)
+{
+ u64 dirty_mask = spte_shadow_dirty_mask(spte);
+
+ return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
+}
+
+/* Rules for using mmu_spte_set:
+ * Set the sptep from nonpresent to present.
+ * Note: the sptep being assigned *must* be either not present
+ * or in a state where the hardware will not attempt to update
+ * the spte.
+ */
+static void mmu_spte_set(u64 *sptep, u64 new_spte)
+{
+ WARN_ON(is_shadow_present_pte(*sptep));
+ __set_spte(sptep, new_spte);
+}
+
+/*
+ * Update the SPTE (excluding the PFN), but do not track changes in its
+ * accessed/dirty status.
+ */
+static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
+{
+ u64 old_spte = *sptep;
+
+ WARN_ON(!is_shadow_present_pte(new_spte));
+
+ if (!is_shadow_present_pte(old_spte)) {
+ mmu_spte_set(sptep, new_spte);
+ return old_spte;
+ }
+
+ if (!spte_has_volatile_bits(old_spte))
+ __update_clear_spte_fast(sptep, new_spte);
+ else
+ old_spte = __update_clear_spte_slow(sptep, new_spte);
+
+ WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
+
+ return old_spte;
+}
+
+/* Rules for using mmu_spte_update:
+ * Update the state bits, it means the mapped pfn is not changed.
+ *
+ * Whenever we overwrite a writable spte with a read-only one we
+ * should flush remote TLBs. Otherwise rmap_write_protect
+ * will find a read-only spte, even though the writable spte
+ * might be cached on a CPU's TLB, the return value indicates this
+ * case.
+ *
+ * Returns true if the TLB needs to be flushed
+ */
+static bool mmu_spte_update(u64 *sptep, u64 new_spte)
+{
+ bool flush = false;
+ u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
+
+ if (!is_shadow_present_pte(old_spte))
+ return false;
+
+ /*
+ * For the spte updated out of mmu-lock is safe, since
+ * we always atomically update it, see the comments in
+ * spte_has_volatile_bits().
+ */
+ if (spte_can_locklessly_be_made_writable(old_spte) &&
+ !is_writable_pte(new_spte))
+ flush = true;
+
+ /*
+ * Flush TLB when accessed/dirty states are changed in the page tables,
+ * to guarantee consistency between TLB and page tables.
+ */
+
+ if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
+ flush = true;
+ kvm_set_pfn_accessed(spte_to_pfn(old_spte));
+ }
+
+ if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
+ flush = true;
+ kvm_set_pfn_dirty(spte_to_pfn(old_spte));
+ }
+
+ return flush;
+}
+
+/*
+ * Rules for using mmu_spte_clear_track_bits:
+ * It sets the sptep from present to nonpresent, and track the
+ * state bits, it is used to clear the last level sptep.
+ * Returns non-zero if the PTE was previously valid.
+ */
+static int mmu_spte_clear_track_bits(u64 *sptep)
+{
+ kvm_pfn_t pfn;
+ u64 old_spte = *sptep;
+
+ if (!spte_has_volatile_bits(old_spte))
+ __update_clear_spte_fast(sptep, 0ull);
+ else
+ old_spte = __update_clear_spte_slow(sptep, 0ull);
+
+ if (!is_shadow_present_pte(old_spte))
+ return 0;
+
+ pfn = spte_to_pfn(old_spte);
+
+ /*
+ * KVM does not hold the refcount of the page used by
+ * kvm mmu, before reclaiming the page, we should
+ * unmap it from mmu first.
+ */
+ WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
+
+ if (is_accessed_spte(old_spte))
+ kvm_set_pfn_accessed(pfn);
+
+ if (is_dirty_spte(old_spte))
+ kvm_set_pfn_dirty(pfn);
+
+ return 1;
+}
+
+/*
+ * Rules for using mmu_spte_clear_no_track:
+ * Directly clear spte without caring the state bits of sptep,
+ * it is used to set the upper level spte.
+ */
+static void mmu_spte_clear_no_track(u64 *sptep)
+{
+ __update_clear_spte_fast(sptep, 0ull);
+}
+
+static u64 mmu_spte_get_lockless(u64 *sptep)
+{
+ return __get_spte_lockless(sptep);
+}
+
+static u64 mark_spte_for_access_track(u64 spte)
+{
+ if (spte_ad_enabled(spte))
+ return spte & ~shadow_accessed_mask;
+
+ if (is_access_track_spte(spte))
+ return spte;
+
+ /*
+ * Making an Access Tracking PTE will result in removal of write access
+ * from the PTE. So, verify that we will be able to restore the write
+ * access in the fast page fault path later on.
+ */
+ WARN_ONCE((spte & PT_WRITABLE_MASK) &&
+ !spte_can_locklessly_be_made_writable(spte),
+ "kvm: Writable SPTE is not locklessly dirty-trackable\n");
+
+ WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
+ shadow_acc_track_saved_bits_shift),
+ "kvm: Access Tracking saved bit locations are not zero\n");
+
+ spte |= (spte & shadow_acc_track_saved_bits_mask) <<
+ shadow_acc_track_saved_bits_shift;
+ spte &= ~shadow_acc_track_mask;
+
+ return spte;
+}
+
+/* Restore an acc-track PTE back to a regular PTE */
+static u64 restore_acc_track_spte(u64 spte)
+{
+ u64 new_spte = spte;
+ u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
+ & shadow_acc_track_saved_bits_mask;
+
+ WARN_ON_ONCE(spte_ad_enabled(spte));
+ WARN_ON_ONCE(!is_access_track_spte(spte));
+
+ new_spte &= ~shadow_acc_track_mask;
+ new_spte &= ~(shadow_acc_track_saved_bits_mask <<
+ shadow_acc_track_saved_bits_shift);
+ new_spte |= saved_bits;
+
+ return new_spte;
+}
+
+/* Returns the Accessed status of the PTE and resets it at the same time. */
+static bool mmu_spte_age(u64 *sptep)
+{
+ u64 spte = mmu_spte_get_lockless(sptep);
+
+ if (!is_accessed_spte(spte))
+ return false;
+
+ if (spte_ad_enabled(spte)) {
+ clear_bit((ffs(shadow_accessed_mask) - 1),
+ (unsigned long *)sptep);
+ } else {
+ /*
+ * Capture the dirty status of the page, so that it doesn't get
+ * lost when the SPTE is marked for access tracking.
+ */
+ if (is_writable_pte(spte))
+ kvm_set_pfn_dirty(spte_to_pfn(spte));
+
+ spte = mark_spte_for_access_track(spte);
+ mmu_spte_update_no_track(sptep, spte);
+ }
+
+ return true;
+}
+
+static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Prevent page table teardown by making any free-er wait during
+ * kvm_flush_remote_tlbs() IPI to all active vcpus.
+ */
+ local_irq_disable();
+
+ /*
+ * Make sure a following spte read is not reordered ahead of the write
+ * to vcpu->mode.
+ */
+ smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
+}
+
+static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Make sure the write to vcpu->mode is not reordered in front of
+ * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us
+ * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
+ */
+ smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
+ local_irq_enable();
+}
+
+static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+ struct kmem_cache *base_cache, int min)
+{
+ void *obj;
+
+ if (cache->nobjs >= min)
+ return 0;
+ while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
+ obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+ cache->objects[cache->nobjs++] = obj;
+ }
+ return 0;
+}
+
+static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
+{
+ return cache->nobjs;
+}
+
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
+ struct kmem_cache *cache)
+{
+ while (mc->nobjs)
+ kmem_cache_free(cache, mc->objects[--mc->nobjs]);
+}
+
+static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
+ int min)
+{
+ void *page;
+
+ if (cache->nobjs >= min)
+ return 0;
+ while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
+ page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
+ if (!page)
+ return -ENOMEM;
+ cache->objects[cache->nobjs++] = page;
+ }
+ return 0;
+}
+
+static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
+{
+ while (mc->nobjs)
+ free_page((unsigned long)mc->objects[--mc->nobjs]);
+}
+
+static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+ pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
+ if (r)
+ goto out;
+ r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
+ if (r)
+ goto out;
+ r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
+ mmu_page_header_cache, 4);
+out:
+ return r;
+}
+
+static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+{
+ mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+ pte_list_desc_cache);
+ mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
+ mmu_page_header_cache);
+}
+
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
+{
+ void *p;
+
+ BUG_ON(!mc->nobjs);
+ p = mc->objects[--mc->nobjs];
+ return p;
+}
+
+static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
+{
+ return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
+}
+
+static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
+{
+ kmem_cache_free(pte_list_desc_cache, pte_list_desc);
+}
+
+static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
+{
+ if (!sp->role.direct)
+ return sp->gfns[index];
+
+ return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
+}
+
+static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
+{
+ if (!sp->role.direct) {
+ sp->gfns[index] = gfn;
+ return;
+ }
+
+ if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
+ pr_err_ratelimited("gfn mismatch under direct page %llx "
+ "(expected %llx, got %llx)\n",
+ sp->gfn,
+ kvm_mmu_page_get_gfn(sp, index), gfn);
+}
+
+/*
+ * Return the pointer to the large page information for a given gfn,
+ * handling slots that are not large page aligned.
+ */
+static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
+ struct kvm_memory_slot *slot,
+ int level)
+{
+ unsigned long idx;
+
+ idx = gfn_to_index(gfn, slot->base_gfn, level);
+ return &slot->arch.lpage_info[level - 2][idx];
+}
+
+static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
+ gfn_t gfn, int count)
+{
+ struct kvm_lpage_info *linfo;
+ int i;
+
+ for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
+ linfo = lpage_info_slot(gfn, slot, i);
+ linfo->disallow_lpage += count;
+ WARN_ON(linfo->disallow_lpage < 0);
+ }
+}
+
+void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ update_gfn_disallow_lpage_count(slot, gfn, 1);
+}
+
+void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ update_gfn_disallow_lpage_count(slot, gfn, -1);
+}
+
+static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ gfn_t gfn;
+
+ kvm->arch.indirect_shadow_pages++;
+ gfn = sp->gfn;
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, gfn);
+
+ /* the non-leaf shadow pages are keeping readonly. */
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return kvm_slot_page_track_add_page(kvm, slot, gfn,
+ KVM_PAGE_TRACK_WRITE);
+
+ kvm_mmu_gfn_disallow_lpage(slot, gfn);
+}
+
+static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ if (sp->lpage_disallowed)
+ return;
+
+ ++kvm->stat.nx_lpage_splits;
+ list_add_tail(&sp->lpage_disallowed_link,
+ &kvm->arch.lpage_disallowed_mmu_pages);
+ sp->lpage_disallowed = true;
+}
+
+static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ gfn_t gfn;
+
+ kvm->arch.indirect_shadow_pages--;
+ gfn = sp->gfn;
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, gfn);
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return kvm_slot_page_track_remove_page(kvm, slot, gfn,
+ KVM_PAGE_TRACK_WRITE);
+
+ kvm_mmu_gfn_allow_lpage(slot, gfn);
+}
+
+static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ --kvm->stat.nx_lpage_splits;
+ sp->lpage_disallowed = false;
+ list_del(&sp->lpage_disallowed_link);
+}
+
+static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
+ struct kvm_memory_slot *slot)
+{
+ struct kvm_lpage_info *linfo;
+
+ if (slot) {
+ linfo = lpage_info_slot(gfn, slot, level);
+ return !!linfo->disallow_lpage;
+ }
+
+ return true;
+}
+
+static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
+ int level)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
+}
+
+static int host_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ unsigned long page_size;
+ int i, ret = 0;
+
+ page_size = kvm_host_page_size(vcpu, gfn);
+
+ for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
+ if (page_size >= KVM_HPAGE_SIZE(i))
+ ret = i;
+ else
+ break;
+ }
+
+ return ret;
+}
+
+static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
+ bool no_dirty_log)
+{
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
+ return false;
+ if (no_dirty_log && slot->dirty_bitmap)
+ return false;
+
+ return true;
+}
+
+static struct kvm_memory_slot *
+gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
+ bool no_dirty_log)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ if (!memslot_valid_for_gpte(slot, no_dirty_log))
+ slot = NULL;
+
+ return slot;
+}
+
+static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
+ bool *force_pt_level)
+{
+ int host_level, level, max_level;
+ struct kvm_memory_slot *slot;
+
+ if (unlikely(*force_pt_level))
+ return PT_PAGE_TABLE_LEVEL;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
+ *force_pt_level = !memslot_valid_for_gpte(slot, true);
+ if (unlikely(*force_pt_level))
+ return PT_PAGE_TABLE_LEVEL;
+
+ host_level = host_mapping_level(vcpu, large_gfn);
+
+ if (host_level == PT_PAGE_TABLE_LEVEL)
+ return host_level;
+
+ max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
+
+ for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
+ if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
+ break;
+
+ return level - 1;
+}
+
+/*
+ * About rmap_head encoding:
+ *
+ * If the bit zero of rmap_head->val is clear, then it points to the only spte
+ * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
+ * pte_list_desc containing more mappings.
+ */
+
+/*
+ * Returns the number of pointers in the rmap chain, not counting the new one.
+ */
+static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
+ struct kvm_rmap_head *rmap_head)
+{
+ struct pte_list_desc *desc;
+ int i, count = 0;
+
+ if (!rmap_head->val) {
+ rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
+ rmap_head->val = (unsigned long)spte;
+ } else if (!(rmap_head->val & 1)) {
+ rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
+ desc = mmu_alloc_pte_list_desc(vcpu);
+ desc->sptes[0] = (u64 *)rmap_head->val;
+ desc->sptes[1] = spte;
+ rmap_head->val = (unsigned long)desc | 1;
+ ++count;
+ } else {
+ rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
+ desc = desc->more;
+ count += PTE_LIST_EXT;
+ }
+ if (desc->sptes[PTE_LIST_EXT-1]) {
+ desc->more = mmu_alloc_pte_list_desc(vcpu);
+ desc = desc->more;
+ }
+ for (i = 0; desc->sptes[i]; ++i)
+ ++count;
+ desc->sptes[i] = spte;
+ }
+ return count;
+}
+
+static void
+pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
+ struct pte_list_desc *desc, int i,
+ struct pte_list_desc *prev_desc)
+{
+ int j;
+
+ for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
+ ;
+ desc->sptes[i] = desc->sptes[j];
+ desc->sptes[j] = NULL;
+ if (j != 0)
+ return;
+ if (!prev_desc && !desc->more)
+ rmap_head->val = (unsigned long)desc->sptes[0];
+ else
+ if (prev_desc)
+ prev_desc->more = desc->more;
+ else
+ rmap_head->val = (unsigned long)desc->more | 1;
+ mmu_free_pte_list_desc(desc);
+}
+
+static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
+{
+ struct pte_list_desc *desc;
+ struct pte_list_desc *prev_desc;
+ int i;
+
+ if (!rmap_head->val) {
+ printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
+ BUG();
+ } else if (!(rmap_head->val & 1)) {
+ rmap_printk("pte_list_remove: %p 1->0\n", spte);
+ if ((u64 *)rmap_head->val != spte) {
+ printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte);
+ BUG();
+ }
+ rmap_head->val = 0;
+ } else {
+ rmap_printk("pte_list_remove: %p many->many\n", spte);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ prev_desc = NULL;
+ while (desc) {
+ for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
+ if (desc->sptes[i] == spte) {
+ pte_list_desc_remove_entry(rmap_head,
+ desc, i, prev_desc);
+ return;
+ }
+ }
+ prev_desc = desc;
+ desc = desc->more;
+ }
+ pr_err("pte_list_remove: %p many->many\n", spte);
+ BUG();
+ }
+}
+
+static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
+ struct kvm_memory_slot *slot)
+{
+ unsigned long idx;
+
+ idx = gfn_to_index(gfn, slot->base_gfn, level);
+ return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
+}
+
+static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
+ struct kvm_mmu_page *sp)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, gfn);
+ return __gfn_to_rmap(gfn, sp->role.level, slot);
+}
+
+static bool rmap_can_add(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_memory_cache *cache;
+
+ cache = &vcpu->arch.mmu_pte_list_desc_cache;
+ return mmu_memory_cache_free_objects(cache);
+}
+
+static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+{
+ struct kvm_mmu_page *sp;
+ struct kvm_rmap_head *rmap_head;
+
+ sp = page_header(__pa(spte));
+ kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
+ rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
+ return pte_list_add(vcpu, spte, rmap_head);
+}
+
+static void rmap_remove(struct kvm *kvm, u64 *spte)
+{
+ struct kvm_mmu_page *sp;
+ gfn_t gfn;
+ struct kvm_rmap_head *rmap_head;
+
+ sp = page_header(__pa(spte));
+ gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
+ rmap_head = gfn_to_rmap(kvm, gfn, sp);
+ pte_list_remove(spte, rmap_head);
+}
+
+/*
+ * Used by the following functions to iterate through the sptes linked by a
+ * rmap. All fields are private and not assumed to be used outside.
+ */
+struct rmap_iterator {
+ /* private fields */
+ struct pte_list_desc *desc; /* holds the sptep if not NULL */
+ int pos; /* index of the sptep */
+};
+
+/*
+ * Iteration must be started by this function. This should also be used after
+ * removing/dropping sptes from the rmap link because in such cases the
+ * information in the itererator may not be valid.
+ *
+ * Returns sptep if found, NULL otherwise.
+ */
+static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
+ struct rmap_iterator *iter)
+{
+ u64 *sptep;
+
+ if (!rmap_head->val)
+ return NULL;
+
+ if (!(rmap_head->val & 1)) {
+ iter->desc = NULL;
+ sptep = (u64 *)rmap_head->val;
+ goto out;
+ }
+
+ iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ iter->pos = 0;
+ sptep = iter->desc->sptes[iter->pos];
+out:
+ BUG_ON(!is_shadow_present_pte(*sptep));
+ return sptep;
+}
+
+/*
+ * Must be used with a valid iterator: e.g. after rmap_get_first().
+ *
+ * Returns sptep if found, NULL otherwise.
+ */
+static u64 *rmap_get_next(struct rmap_iterator *iter)
+{
+ u64 *sptep;
+
+ if (iter->desc) {
+ if (iter->pos < PTE_LIST_EXT - 1) {
+ ++iter->pos;
+ sptep = iter->desc->sptes[iter->pos];
+ if (sptep)
+ goto out;
+ }
+
+ iter->desc = iter->desc->more;
+
+ if (iter->desc) {
+ iter->pos = 0;
+ /* desc->sptes[0] cannot be NULL */
+ sptep = iter->desc->sptes[iter->pos];
+ goto out;
+ }
+ }
+
+ return NULL;
+out:
+ BUG_ON(!is_shadow_present_pte(*sptep));
+ return sptep;
+}
+
+#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \
+ for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \
+ _spte_; _spte_ = rmap_get_next(_iter_))
+
+static void drop_spte(struct kvm *kvm, u64 *sptep)
+{
+ if (mmu_spte_clear_track_bits(sptep))
+ rmap_remove(kvm, sptep);
+}
+
+
+static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
+{
+ if (is_large_pte(*sptep)) {
+ WARN_ON(page_header(__pa(sptep))->role.level ==
+ PT_PAGE_TABLE_LEVEL);
+ drop_spte(kvm, sptep);
+ --kvm->stat.lpages;
+ return true;
+ }
+
+ return false;
+}
+
+static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ if (__drop_large_spte(vcpu->kvm, sptep))
+ kvm_flush_remote_tlbs(vcpu->kvm);
+}
+
+/*
+ * Write-protect on the specified @sptep, @pt_protect indicates whether
+ * spte write-protection is caused by protecting shadow page table.
+ *
+ * Note: write protection is difference between dirty logging and spte
+ * protection:
+ * - for dirty logging, the spte can be set to writable at anytime if
+ * its dirty bitmap is properly set.
+ * - for spte protection, the spte can be writable only after unsync-ing
+ * shadow page.
+ *
+ * Return true if tlb need be flushed.
+ */
+static bool spte_write_protect(u64 *sptep, bool pt_protect)
+{
+ u64 spte = *sptep;
+
+ if (!is_writable_pte(spte) &&
+ !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
+ return false;
+
+ rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
+
+ if (pt_protect)
+ spte &= ~SPTE_MMU_WRITEABLE;
+ spte = spte & ~PT_WRITABLE_MASK;
+
+ return mmu_spte_update(sptep, spte);
+}
+
+static bool __rmap_write_protect(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ bool pt_protect)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ bool flush = false;
+
+ for_each_rmap_spte(rmap_head, &iter, sptep)
+ flush |= spte_write_protect(sptep, pt_protect);
+
+ return flush;
+}
+
+static bool spte_clear_dirty(u64 *sptep)
+{
+ u64 spte = *sptep;
+
+ rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
+
+ spte &= ~shadow_dirty_mask;
+
+ return mmu_spte_update(sptep, spte);
+}
+
+static bool wrprot_ad_disabled_spte(u64 *sptep)
+{
+ bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
+ (unsigned long *)sptep);
+ if (was_writable)
+ kvm_set_pfn_dirty(spte_to_pfn(*sptep));
+
+ return was_writable;
+}
+
+/*
+ * Gets the GFN ready for another round of dirty logging by clearing the
+ * - D bit on ad-enabled SPTEs, and
+ * - W bit on ad-disabled SPTEs.
+ * Returns true iff any D or W bits were cleared.
+ */
+static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ bool flush = false;
+
+ for_each_rmap_spte(rmap_head, &iter, sptep)
+ if (spte_ad_enabled(*sptep))
+ flush |= spte_clear_dirty(sptep);
+ else
+ flush |= wrprot_ad_disabled_spte(sptep);
+
+ return flush;
+}
+
+static bool spte_set_dirty(u64 *sptep)
+{
+ u64 spte = *sptep;
+
+ rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
+
+ spte |= shadow_dirty_mask;
+
+ return mmu_spte_update(sptep, spte);
+}
+
+static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ bool flush = false;
+
+ for_each_rmap_spte(rmap_head, &iter, sptep)
+ if (spte_ad_enabled(*sptep))
+ flush |= spte_set_dirty(sptep);
+
+ return flush;
+}
+
+/**
+ * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
+ * @kvm: kvm instance
+ * @slot: slot to protect
+ * @gfn_offset: start of the BITS_PER_LONG pages we care about
+ * @mask: indicates which pages we should protect
+ *
+ * Used when we do not need to care about huge page mappings: e.g. during dirty
+ * logging we do not have any such mappings.
+ */
+static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ struct kvm_rmap_head *rmap_head;
+
+ while (mask) {
+ rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PT_PAGE_TABLE_LEVEL, slot);
+ __rmap_write_protect(kvm, rmap_head, false);
+
+ /* clear the first set bit */
+ mask &= mask - 1;
+ }
+}
+
+/**
+ * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
+ * protect the page if the D-bit isn't supported.
+ * @kvm: kvm instance
+ * @slot: slot to clear D-bit
+ * @gfn_offset: start of the BITS_PER_LONG pages we care about
+ * @mask: indicates which pages we should clear D-bit
+ *
+ * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
+ */
+void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ struct kvm_rmap_head *rmap_head;
+
+ while (mask) {
+ rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PT_PAGE_TABLE_LEVEL, slot);
+ __rmap_clear_dirty(kvm, rmap_head);
+
+ /* clear the first set bit */
+ mask &= mask - 1;
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
+
+/**
+ * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
+ * PT level pages.
+ *
+ * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
+ * enable dirty logging for them.
+ *
+ * Used when we do not need to care about huge page mappings: e.g. during dirty
+ * logging we do not have any such mappings.
+ */
+void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ if (kvm_x86_ops->enable_log_dirty_pt_masked)
+ kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
+ mask);
+ else
+ kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
+}
+
+/**
+ * kvm_arch_write_log_dirty - emulate dirty page logging
+ * @vcpu: Guest mode vcpu
+ *
+ * Emulate arch specific page modification logging for the
+ * nested hypervisor
+ */
+int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa)
+{
+ if (kvm_x86_ops->write_log_dirty)
+ return kvm_x86_ops->write_log_dirty(vcpu, l2_gpa);
+
+ return 0;
+}
+
+bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+ struct kvm_memory_slot *slot, u64 gfn)
+{
+ struct kvm_rmap_head *rmap_head;
+ int i;
+ bool write_protected = false;
+
+ for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
+ rmap_head = __gfn_to_rmap(gfn, i, slot);
+ write_protected |= __rmap_write_protect(kvm, rmap_head, true);
+ }
+
+ return write_protected;
+}
+
+static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
+}
+
+static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ bool flush = false;
+
+ while ((sptep = rmap_get_first(rmap_head, &iter))) {
+ rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
+
+ drop_spte(kvm, sptep);
+ flush = true;
+ }
+
+ return flush;
+}
+
+static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ unsigned long data)
+{
+ return kvm_zap_rmapp(kvm, rmap_head);
+}
+
+static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ unsigned long data)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ int need_flush = 0;
+ u64 new_spte;
+ pte_t *ptep = (pte_t *)data;
+ kvm_pfn_t new_pfn;
+
+ WARN_ON(pte_huge(*ptep));
+ new_pfn = pte_pfn(*ptep);
+
+restart:
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
+ rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
+ sptep, *sptep, gfn, level);
+
+ need_flush = 1;
+
+ if (pte_write(*ptep)) {
+ drop_spte(kvm, sptep);
+ goto restart;
+ } else {
+ new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
+ new_spte |= (u64)new_pfn << PAGE_SHIFT;
+
+ new_spte &= ~PT_WRITABLE_MASK;
+ new_spte &= ~SPTE_HOST_WRITEABLE;
+
+ new_spte = mark_spte_for_access_track(new_spte);
+
+ mmu_spte_clear_track_bits(sptep);
+ mmu_spte_set(sptep, new_spte);
+ }
+ }
+
+ if (need_flush)
+ kvm_flush_remote_tlbs(kvm);
+
+ return 0;
+}
+
+struct slot_rmap_walk_iterator {
+ /* input fields. */
+ struct kvm_memory_slot *slot;
+ gfn_t start_gfn;
+ gfn_t end_gfn;
+ int start_level;
+ int end_level;
+
+ /* output fields. */
+ gfn_t gfn;
+ struct kvm_rmap_head *rmap;
+ int level;
+
+ /* private field. */
+ struct kvm_rmap_head *end_rmap;
+};
+
+static void
+rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
+{
+ iterator->level = level;
+ iterator->gfn = iterator->start_gfn;
+ iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
+ iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
+ iterator->slot);
+}
+
+static void
+slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
+ struct kvm_memory_slot *slot, int start_level,
+ int end_level, gfn_t start_gfn, gfn_t end_gfn)
+{
+ iterator->slot = slot;
+ iterator->start_level = start_level;
+ iterator->end_level = end_level;
+ iterator->start_gfn = start_gfn;
+ iterator->end_gfn = end_gfn;
+
+ rmap_walk_init_level(iterator, iterator->start_level);
+}
+
+static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
+{
+ return !!iterator->rmap;
+}
+
+static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
+{
+ if (++iterator->rmap <= iterator->end_rmap) {
+ iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
+ return;
+ }
+
+ if (++iterator->level > iterator->end_level) {
+ iterator->rmap = NULL;
+ return;
+ }
+
+ rmap_walk_init_level(iterator, iterator->level);
+}
+
+#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \
+ _start_gfn, _end_gfn, _iter_) \
+ for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \
+ _end_level_, _start_gfn, _end_gfn); \
+ slot_rmap_walk_okay(_iter_); \
+ slot_rmap_walk_next(_iter_))
+
+static int kvm_handle_hva_range(struct kvm *kvm,
+ unsigned long start,
+ unsigned long end,
+ unsigned long data,
+ int (*handler)(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn,
+ int level,
+ unsigned long data))
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+ struct slot_rmap_walk_iterator iterator;
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(memslot, slots) {
+ unsigned long hva_start, hva_end;
+ gfn_t gfn_start, gfn_end;
+
+ hva_start = max(start, memslot->userspace_addr);
+ hva_end = min(end, memslot->userspace_addr +
+ (memslot->npages << PAGE_SHIFT));
+ if (hva_start >= hva_end)
+ continue;
+ /*
+ * {gfn(page) | page intersects with [hva_start, hva_end)} =
+ * {gfn_start, gfn_start+1, ..., gfn_end-1}.
+ */
+ gfn_start = hva_to_gfn_memslot(hva_start, memslot);
+ gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
+
+ for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
+ PT_MAX_HUGEPAGE_LEVEL,
+ gfn_start, gfn_end - 1,
+ &iterator)
+ ret |= handler(kvm, iterator.rmap, memslot,
+ iterator.gfn, iterator.level, data);
+ }
+ }
+
+ return ret;
+}
+
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+ unsigned long data,
+ int (*handler)(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn, int level,
+ unsigned long data))
+{
+ return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
+}
+
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
+ bool blockable)
+{
+ return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+ kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ unsigned long data)
+{
+ u64 *sptep;
+ struct rmap_iterator uninitialized_var(iter);
+ int young = 0;
+
+ for_each_rmap_spte(rmap_head, &iter, sptep)
+ young |= mmu_spte_age(sptep);
+
+ trace_kvm_age_page(gfn, level, slot, young);
+ return young;
+}
+
+static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int level, unsigned long data)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+
+ for_each_rmap_spte(rmap_head, &iter, sptep)
+ if (is_accessed_spte(*sptep))
+ return 1;
+ return 0;
+}
+
+#define RMAP_RECYCLE_THRESHOLD 1000
+
+static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+{
+ struct kvm_rmap_head *rmap_head;
+ struct kvm_mmu_page *sp;
+
+ sp = page_header(__pa(spte));
+
+ rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
+
+ kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+ return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
+}
+
+#ifdef MMU_DEBUG
+static int is_empty_shadow_page(u64 *spt)
+{
+ u64 *pos;
+ u64 *end;
+
+ for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
+ if (is_shadow_present_pte(*pos)) {
+ printk(KERN_ERR "%s: %p %llx\n", __func__,
+ pos, *pos);
+ return 0;
+ }
+ return 1;
+}
+#endif
+
+/*
+ * This value is the sum of all of the kvm instances's
+ * kvm->arch.n_used_mmu_pages values. We need a global,
+ * aggregate version in order to make the slab shrinker
+ * faster
+ */
+static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
+{
+ kvm->arch.n_used_mmu_pages += nr;
+ percpu_counter_add(&kvm_total_used_mmu_pages, nr);
+}
+
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
+{
+ MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
+ hlist_del(&sp->hash_link);
+ list_del(&sp->link);
+ free_page((unsigned long)sp->spt);
+ if (!sp->role.direct)
+ free_page((unsigned long)sp->gfns);
+ kmem_cache_free(mmu_page_header_cache, sp);
+}
+
+static unsigned kvm_page_table_hashfn(gfn_t gfn)
+{
+ return hash_64(gfn, KVM_MMU_HASH_SHIFT);
+}
+
+static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *parent_pte)
+{
+ if (!parent_pte)
+ return;
+
+ pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
+}
+
+static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
+ u64 *parent_pte)
+{
+ pte_list_remove(parent_pte, &sp->parent_ptes);
+}
+
+static void drop_parent_pte(struct kvm_mmu_page *sp,
+ u64 *parent_pte)
+{
+ mmu_page_remove_parent_pte(sp, parent_pte);
+ mmu_spte_clear_no_track(parent_pte);
+}
+
+static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
+ sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
+ if (!direct)
+ sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
+ set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+ /*
+ * The active_mmu_pages list is the FIFO list, do not move the
+ * page until it is zapped. kvm_zap_obsolete_pages depends on
+ * this feature. See the comments in kvm_zap_obsolete_pages().
+ */
+ list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
+ kvm_mod_used_mmu_pages(vcpu->kvm, +1);
+ return sp;
+}
+
+static void mark_unsync(u64 *spte);
+static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+
+ for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
+ mark_unsync(sptep);
+ }
+}
+
+static void mark_unsync(u64 *spte)
+{
+ struct kvm_mmu_page *sp;
+ unsigned int index;
+
+ sp = page_header(__pa(spte));
+ index = spte - sp->spt;
+ if (__test_and_set_bit(index, sp->unsync_child_bitmap))
+ return;
+ if (sp->unsync_children++)
+ return;
+ kvm_mmu_mark_parents_unsync(sp);
+}
+
+static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ return 0;
+}
+
+static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root)
+{
+}
+
+static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ const void *pte)
+{
+ WARN_ON(1);
+}
+
+#define KVM_PAGE_ARRAY_NR 16
+
+struct kvm_mmu_pages {
+ struct mmu_page_and_offset {
+ struct kvm_mmu_page *sp;
+ unsigned int idx;
+ } page[KVM_PAGE_ARRAY_NR];
+ unsigned int nr;
+};
+
+static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
+ int idx)
+{
+ int i;
+
+ if (sp->unsync)
+ for (i=0; i < pvec->nr; i++)
+ if (pvec->page[i].sp == sp)
+ return 0;
+
+ pvec->page[pvec->nr].sp = sp;
+ pvec->page[pvec->nr].idx = idx;
+ pvec->nr++;
+ return (pvec->nr == KVM_PAGE_ARRAY_NR);
+}
+
+static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
+{
+ --sp->unsync_children;
+ WARN_ON((int)sp->unsync_children < 0);
+ __clear_bit(idx, sp->unsync_child_bitmap);
+}
+
+static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
+ struct kvm_mmu_pages *pvec)
+{
+ int i, ret, nr_unsync_leaf = 0;
+
+ for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
+ struct kvm_mmu_page *child;
+ u64 ent = sp->spt[i];
+
+ if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
+ clear_unsync_child_bit(sp, i);
+ continue;
+ }
+
+ child = page_header(ent & PT64_BASE_ADDR_MASK);
+
+ if (child->unsync_children) {
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+
+ ret = __mmu_unsync_walk(child, pvec);
+ if (!ret) {
+ clear_unsync_child_bit(sp, i);
+ continue;
+ } else if (ret > 0) {
+ nr_unsync_leaf += ret;
+ } else
+ return ret;
+ } else if (child->unsync) {
+ nr_unsync_leaf++;
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+ } else
+ clear_unsync_child_bit(sp, i);
+ }
+
+ return nr_unsync_leaf;
+}
+
+#define INVALID_INDEX (-1)
+
+static int mmu_unsync_walk(struct kvm_mmu_page *sp,
+ struct kvm_mmu_pages *pvec)
+{
+ pvec->nr = 0;
+ if (!sp->unsync_children)
+ return 0;
+
+ mmu_pages_add(pvec, sp, INVALID_INDEX);
+ return __mmu_unsync_walk(sp, pvec);
+}
+
+static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ WARN_ON(!sp->unsync);
+ trace_kvm_mmu_sync_page(sp);
+ sp->unsync = 0;
+ --kvm->stat.mmu_unsync;
+}
+
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list);
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+ struct list_head *invalid_list);
+
+/*
+ * NOTE: we should pay more attention on the zapped-obsolete page
+ * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk
+ * since it has been deleted from active_mmu_pages but still can be found
+ * at hast list.
+ *
+ * for_each_valid_sp() has skipped that kind of pages.
+ */
+#define for_each_valid_sp(_kvm, _sp, _gfn) \
+ hlist_for_each_entry(_sp, \
+ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
+ if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \
+ } else
+
+#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
+ for_each_valid_sp(_kvm, _sp, _gfn) \
+ if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
+
+/* @sp->gfn should be write-protected at the call site */
+static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+ if (sp->role.cr4_pae != !!is_pae(vcpu)
+ || vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
+ return false;
+ }
+
+ return true;
+}
+
+static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
+ struct list_head *invalid_list,
+ bool remote_flush, bool local_flush)
+{
+ if (!list_empty(invalid_list)) {
+ kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list);
+ return;
+ }
+
+ if (remote_flush)
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ else if (local_flush)
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+}
+
+#ifdef CONFIG_KVM_MMU_AUDIT
+#include "mmu_audit.c"
+#else
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
+static void mmu_audit_disable(void) { }
+#endif
+
+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
+static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+ kvm_unlink_unsync_page(vcpu->kvm, sp);
+ return __kvm_sync_page(vcpu, sp, invalid_list);
+}
+
+/* @gfn should be write-protected at the call site */
+static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
+ struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *s;
+ bool ret = false;
+
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
+ if (!s->unsync)
+ continue;
+
+ WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+ ret |= kvm_sync_page(vcpu, s, invalid_list);
+ }
+
+ return ret;
+}
+
+struct mmu_page_path {
+ struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
+ unsigned int idx[PT64_ROOT_MAX_LEVEL];
+};
+
+#define for_each_sp(pvec, sp, parents, i) \
+ for (i = mmu_pages_first(&pvec, &parents); \
+ i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
+ i = mmu_pages_next(&pvec, &parents, i))
+
+static int mmu_pages_next(struct kvm_mmu_pages *pvec,
+ struct mmu_page_path *parents,
+ int i)
+{
+ int n;
+
+ for (n = i+1; n < pvec->nr; n++) {
+ struct kvm_mmu_page *sp = pvec->page[n].sp;
+ unsigned idx = pvec->page[n].idx;
+ int level = sp->role.level;
+
+ parents->idx[level-1] = idx;
+ if (level == PT_PAGE_TABLE_LEVEL)
+ break;
+
+ parents->parent[level-2] = sp;
+ }
+
+ return n;
+}
+
+static int mmu_pages_first(struct kvm_mmu_pages *pvec,
+ struct mmu_page_path *parents)
+{
+ struct kvm_mmu_page *sp;
+ int level;
+
+ if (pvec->nr == 0)
+ return 0;
+
+ WARN_ON(pvec->page[0].idx != INVALID_INDEX);
+
+ sp = pvec->page[0].sp;
+ level = sp->role.level;
+ WARN_ON(level == PT_PAGE_TABLE_LEVEL);
+
+ parents->parent[level-2] = sp;
+
+ /* Also set up a sentinel. Further entries in pvec are all
+ * children of sp, so this element is never overwritten.
+ */
+ parents->parent[level-1] = NULL;
+ return mmu_pages_next(pvec, parents, 0);
+}
+
+static void mmu_pages_clear_parents(struct mmu_page_path *parents)
+{
+ struct kvm_mmu_page *sp;
+ unsigned int level = 0;
+
+ do {
+ unsigned int idx = parents->idx[level];
+ sp = parents->parent[level];
+ if (!sp)
+ return;
+
+ WARN_ON(idx == INVALID_INDEX);
+ clear_unsync_child_bit(sp, idx);
+ level++;
+ } while (!sp->unsync_children);
+}
+
+static void mmu_sync_children(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *parent)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+ struct mmu_page_path parents;
+ struct kvm_mmu_pages pages;
+ LIST_HEAD(invalid_list);
+ bool flush = false;
+
+ while (mmu_unsync_walk(parent, &pages)) {
+ bool protected = false;
+
+ for_each_sp(pages, sp, parents, i)
+ protected |= rmap_write_protect(vcpu, sp->gfn);
+
+ if (protected) {
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ flush = false;
+ }
+
+ for_each_sp(pages, sp, parents, i) {
+ flush |= kvm_sync_page(vcpu, sp, &invalid_list);
+ mmu_pages_clear_parents(&parents);
+ }
+ if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
+ kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+ cond_resched_lock(&vcpu->kvm->mmu_lock);
+ flush = false;
+ }
+ }
+
+ kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+}
+
+static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
+{
+ atomic_set(&sp->write_flooding_count, 0);
+}
+
+static void clear_sp_write_flooding_count(u64 *spte)
+{
+ struct kvm_mmu_page *sp = page_header(__pa(spte));
+
+ __clear_sp_write_flooding_count(sp);
+}
+
+static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
+ gfn_t gfn,
+ gva_t gaddr,
+ unsigned level,
+ int direct,
+ unsigned access)
+{
+ union kvm_mmu_page_role role;
+ unsigned quadrant;
+ struct kvm_mmu_page *sp;
+ bool need_sync = false;
+ bool flush = false;
+ int collisions = 0;
+ LIST_HEAD(invalid_list);
+
+ role = vcpu->arch.mmu.base_role;
+ role.level = level;
+ role.direct = direct;
+ if (role.direct)
+ role.cr4_pae = 0;
+ role.access = access;
+ if (!vcpu->arch.mmu.direct_map
+ && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+ quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
+ quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
+ role.quadrant = quadrant;
+ }
+ for_each_valid_sp(vcpu->kvm, sp, gfn) {
+ if (sp->gfn != gfn) {
+ collisions++;
+ continue;
+ }
+
+ if (!need_sync && sp->unsync)
+ need_sync = true;
+
+ if (sp->role.word != role.word)
+ continue;
+
+ if (sp->unsync) {
+ /* The page is good, but __kvm_sync_page might still end
+ * up zapping it. If so, break in order to rebuild it.
+ */
+ if (!__kvm_sync_page(vcpu, sp, &invalid_list))
+ break;
+
+ WARN_ON(!list_empty(&invalid_list));
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ }
+
+ if (sp->unsync_children)
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+
+ __clear_sp_write_flooding_count(sp);
+ trace_kvm_mmu_get_page(sp, false);
+ goto out;
+ }
+
+ ++vcpu->kvm->stat.mmu_cache_miss;
+
+ sp = kvm_mmu_alloc_page(vcpu, direct);
+
+ sp->gfn = gfn;
+ sp->role = role;
+ hlist_add_head(&sp->hash_link,
+ &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
+ if (!direct) {
+ /*
+ * we should do write protection before syncing pages
+ * otherwise the content of the synced shadow page may
+ * be inconsistent with guest page table.
+ */
+ account_shadowed(vcpu->kvm, sp);
+ if (level == PT_PAGE_TABLE_LEVEL &&
+ rmap_write_protect(vcpu, gfn))
+ kvm_flush_remote_tlbs(vcpu->kvm);
+
+ if (level > PT_PAGE_TABLE_LEVEL && need_sync)
+ flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
+ }
+ sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
+ clear_page(sp->spt);
+ trace_kvm_mmu_get_page(sp, true);
+
+ kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+out:
+ if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
+ vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
+ return sp;
+}
+
+static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
+ struct kvm_vcpu *vcpu, hpa_t root,
+ u64 addr)
+{
+ iterator->addr = addr;
+ iterator->shadow_addr = root;
+ iterator->level = vcpu->arch.mmu.shadow_root_level;
+
+ if (iterator->level == PT64_ROOT_4LEVEL &&
+ vcpu->arch.mmu.root_level < PT64_ROOT_4LEVEL &&
+ !vcpu->arch.mmu.direct_map)
+ --iterator->level;
+
+ if (iterator->level == PT32E_ROOT_LEVEL) {
+ /*
+ * prev_root is currently only used for 64-bit hosts. So only
+ * the active root_hpa is valid here.
+ */
+ BUG_ON(root != vcpu->arch.mmu.root_hpa);
+
+ iterator->shadow_addr
+ = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+ iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
+ --iterator->level;
+ if (!iterator->shadow_addr)
+ iterator->level = 0;
+ }
+}
+
+static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
+ struct kvm_vcpu *vcpu, u64 addr)
+{
+ shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu.root_hpa,
+ addr);
+}
+
+static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
+{
+ if (iterator->level < PT_PAGE_TABLE_LEVEL)
+ return false;
+
+ iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
+ iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
+ return true;
+}
+
+static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
+ u64 spte)
+{
+ if (is_last_spte(spte, iterator->level)) {
+ iterator->level = 0;
+ return;
+ }
+
+ iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
+ --iterator->level;
+}
+
+static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+{
+ __shadow_walk_next(iterator, *iterator->sptep);
+}
+
+static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
+ struct kvm_mmu_page *sp)
+{
+ u64 spte;
+
+ BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
+
+ spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
+ shadow_user_mask | shadow_x_mask | shadow_me_mask;
+
+ if (sp_ad_disabled(sp))
+ spte |= shadow_acc_track_value;
+ else
+ spte |= shadow_accessed_mask;
+
+ mmu_spte_set(sptep, spte);
+
+ mmu_page_add_parent_pte(vcpu, sp, sptep);
+
+ if (sp->unsync_children || sp->unsync)
+ mark_unsync(sptep);
+}
+
+static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+ unsigned direct_access)
+{
+ if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
+ struct kvm_mmu_page *child;
+
+ /*
+ * For the direct sp, if the guest pte's dirty bit
+ * changed form clean to dirty, it will corrupt the
+ * sp's access: allow writable in the read-only sp,
+ * so we should update the spte at this point to get
+ * a new sp with the correct access.
+ */
+ child = page_header(*sptep & PT64_BASE_ADDR_MASK);
+ if (child->role.access == direct_access)
+ return;
+
+ drop_parent_pte(child, sptep);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ }
+}
+
+static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+ u64 *spte)
+{
+ u64 pte;
+ struct kvm_mmu_page *child;
+
+ pte = *spte;
+ if (is_shadow_present_pte(pte)) {
+ if (is_last_spte(pte, sp->role.level)) {
+ drop_spte(kvm, spte);
+ if (is_large_pte(pte))
+ --kvm->stat.lpages;
+ } else {
+ child = page_header(pte & PT64_BASE_ADDR_MASK);
+ drop_parent_pte(child, spte);
+ }
+ return true;
+ }
+
+ if (is_mmio_spte(pte))
+ mmu_spte_clear_no_track(spte);
+
+ return false;
+}
+
+static void kvm_mmu_page_unlink_children(struct kvm *kvm,
+ struct kvm_mmu_page *sp)
+{
+ unsigned i;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+ mmu_page_zap_pte(kvm, sp, sp->spt + i);
+}
+
+static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+
+ while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
+ drop_parent_pte(sp, sptep);
+}
+
+static int mmu_zap_unsync_children(struct kvm *kvm,
+ struct kvm_mmu_page *parent,
+ struct list_head *invalid_list)
+{
+ int i, zapped = 0;
+ struct mmu_page_path parents;
+ struct kvm_mmu_pages pages;
+
+ if (parent->role.level == PT_PAGE_TABLE_LEVEL)
+ return 0;
+
+ while (mmu_unsync_walk(parent, &pages)) {
+ struct kvm_mmu_page *sp;
+
+ for_each_sp(pages, sp, parents, i) {
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+ mmu_pages_clear_parents(&parents);
+ zapped++;
+ }
+ }
+
+ return zapped;
+}
+
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+ int ret;
+
+ trace_kvm_mmu_prepare_zap_page(sp);
+ ++kvm->stat.mmu_shadow_zapped;
+ ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
+ kvm_mmu_page_unlink_children(kvm, sp);
+ kvm_mmu_unlink_parents(kvm, sp);
+
+ if (!sp->role.invalid && !sp->role.direct)
+ unaccount_shadowed(kvm, sp);
+
+ if (sp->unsync)
+ kvm_unlink_unsync_page(kvm, sp);
+ if (!sp->root_count) {
+ /* Count self */
+ ret++;
+ list_move(&sp->link, invalid_list);
+ kvm_mod_used_mmu_pages(kvm, -1);
+ } else {
+ list_move(&sp->link, &kvm->arch.active_mmu_pages);
+
+ /*
+ * The obsolete pages can not be used on any vcpus.
+ * See the comments in kvm_mmu_invalidate_zap_all_pages().
+ */
+ if (!sp->role.invalid && !is_obsolete_sp(kvm, sp))
+ kvm_reload_remote_mmus(kvm);
+ }
+
+ if (sp->lpage_disallowed)
+ unaccount_huge_nx_page(kvm, sp);
+
+ sp->role.invalid = 1;
+ return ret;
+}
+
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+ struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *sp, *nsp;
+
+ if (list_empty(invalid_list))
+ return;
+
+ /*
+ * We need to make sure everyone sees our modifications to
+ * the page tables and see changes to vcpu->mode here. The barrier
+ * in the kvm_flush_remote_tlbs() achieves this. This pairs
+ * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
+ *
+ * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
+ * guest mode and/or lockless shadow page table walks.
+ */
+ kvm_flush_remote_tlbs(kvm);
+
+ list_for_each_entry_safe(sp, nsp, invalid_list, link) {
+ WARN_ON(!sp->role.invalid || sp->root_count);
+ kvm_mmu_free_page(sp);
+ }
+}
+
+static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
+ struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *sp;
+
+ if (list_empty(&kvm->arch.active_mmu_pages))
+ return false;
+
+ sp = list_last_entry(&kvm->arch.active_mmu_pages,
+ struct kvm_mmu_page, link);
+ return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+}
+
+/*
+ * Changing the number of mmu pages allocated to the vm
+ * Note: if goal_nr_mmu_pages is too small, you will get dead lock
+ */
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
+{
+ LIST_HEAD(invalid_list);
+
+ spin_lock(&kvm->mmu_lock);
+
+ if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
+ /* Need to free some mmu pages to achieve the goal. */
+ while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
+ if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
+ break;
+
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
+ }
+
+ kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
+
+ spin_unlock(&kvm->mmu_lock);
+}
+
+int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_mmu_page *sp;
+ LIST_HEAD(invalid_list);
+ int r;
+
+ pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
+ r = 0;
+ spin_lock(&kvm->mmu_lock);
+ for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
+ pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
+ sp->role.word);
+ r = 1;
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ }
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ spin_unlock(&kvm->mmu_lock);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
+
+static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ trace_kvm_mmu_unsync_page(sp);
+ ++vcpu->kvm->stat.mmu_unsync;
+ sp->unsync = 1;
+
+ kvm_mmu_mark_parents_unsync(sp);
+}
+
+static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+ bool can_unsync)
+{
+ struct kvm_mmu_page *sp;
+
+ if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+ return true;
+
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
+ if (!can_unsync)
+ return true;
+
+ if (sp->unsync)
+ continue;
+
+ WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
+ kvm_unsync_page(vcpu, sp);
+ }
+
+ /*
+ * We need to ensure that the marking of unsync pages is visible
+ * before the SPTE is updated to allow writes because
+ * kvm_mmu_sync_roots() checks the unsync flags without holding
+ * the MMU lock and so can race with this. If the SPTE was updated
+ * before the page had been marked as unsync-ed, something like the
+ * following could happen:
+ *
+ * CPU 1 CPU 2
+ * ---------------------------------------------------------------------
+ * 1.2 Host updates SPTE
+ * to be writable
+ * 2.1 Guest writes a GPTE for GVA X.
+ * (GPTE being in the guest page table shadowed
+ * by the SP from CPU 1.)
+ * This reads SPTE during the page table walk.
+ * Since SPTE.W is read as 1, there is no
+ * fault.
+ *
+ * 2.2 Guest issues TLB flush.
+ * That causes a VM Exit.
+ *
+ * 2.3 kvm_mmu_sync_pages() reads sp->unsync.
+ * Since it is false, so it just returns.
+ *
+ * 2.4 Guest accesses GVA X.
+ * Since the mapping in the SP was not updated,
+ * so the old mapping for GVA X incorrectly
+ * gets used.
+ * 1.1 Host marks SP
+ * as unsync
+ * (sp->unsync = true)
+ *
+ * The write barrier below ensures that 1.1 happens before 1.2 and thus
+ * the situation in 2.4 does not arise. The implicit barrier in 2.2
+ * pairs with this write barrier.
+ */
+ smp_wmb();
+
+ return false;
+}
+
+static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
+{
+ if (pfn_valid(pfn))
+ return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
+ /*
+ * Some reserved pages, such as those from NVDIMM
+ * DAX devices, are not for MMIO, and can be mapped
+ * with cached memory type for better performance.
+ * However, the above check misconceives those pages
+ * as MMIO, and results in KVM mapping them with UC
+ * memory type, which would hurt the performance.
+ * Therefore, we check the host memory type in addition
+ * and only treat UC/UC-/WC pages as MMIO.
+ */
+ (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
+
+ return true;
+}
+
+/* Bits which may be returned by set_spte() */
+#define SET_SPTE_WRITE_PROTECTED_PT BIT(0)
+#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
+
+static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+ unsigned pte_access, int level,
+ gfn_t gfn, kvm_pfn_t pfn, bool speculative,
+ bool can_unsync, bool host_writable)
+{
+ u64 spte = 0;
+ int ret = 0;
+ struct kvm_mmu_page *sp;
+
+ if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
+ return 0;
+
+ sp = page_header(__pa(sptep));
+ if (sp_ad_disabled(sp))
+ spte |= shadow_acc_track_value;
+
+ /*
+ * For the EPT case, shadow_present_mask is 0 if hardware
+ * supports exec-only page table entries. In that case,
+ * ACC_USER_MASK and shadow_user_mask are used to represent
+ * read access. See FNAME(gpte_access) in paging_tmpl.h.
+ */
+ spte |= shadow_present_mask;
+ if (!speculative)
+ spte |= spte_shadow_accessed_mask(spte);
+
+ if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
+ is_nx_huge_page_enabled()) {
+ pte_access &= ~ACC_EXEC_MASK;
+ }
+
+ if (pte_access & ACC_EXEC_MASK)
+ spte |= shadow_x_mask;
+ else
+ spte |= shadow_nx_mask;
+
+ if (pte_access & ACC_USER_MASK)
+ spte |= shadow_user_mask;
+
+ if (level > PT_PAGE_TABLE_LEVEL)
+ spte |= PT_PAGE_SIZE_MASK;
+ if (tdp_enabled)
+ spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
+ kvm_is_mmio_pfn(pfn));
+
+ if (host_writable)
+ spte |= SPTE_HOST_WRITEABLE;
+ else
+ pte_access &= ~ACC_WRITE_MASK;
+
+ if (!kvm_is_mmio_pfn(pfn))
+ spte |= shadow_me_mask;
+
+ spte |= (u64)pfn << PAGE_SHIFT;
+
+ if (pte_access & ACC_WRITE_MASK) {
+
+ /*
+ * Other vcpu creates new sp in the window between
+ * mapping_level() and acquiring mmu-lock. We can
+ * allow guest to retry the access, the mapping can
+ * be fixed if guest refault.
+ */
+ if (level > PT_PAGE_TABLE_LEVEL &&
+ mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
+ goto done;
+
+ spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
+
+ /*
+ * Optimization: for pte sync, if spte was writable the hash
+ * lookup is unnecessary (and expensive). Write protection
+ * is responsibility of mmu_get_page / kvm_sync_page.
+ * Same reasoning can be applied to dirty page accounting.
+ */
+ if (!can_unsync && is_writable_pte(*sptep))
+ goto set_pte;
+
+ if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
+ pgprintk("%s: found shadow page for %llx, marking ro\n",
+ __func__, gfn);
+ ret |= SET_SPTE_WRITE_PROTECTED_PT;
+ pte_access &= ~ACC_WRITE_MASK;
+ spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
+ }
+ }
+
+ if (pte_access & ACC_WRITE_MASK) {
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ spte |= spte_shadow_dirty_mask(spte);
+ }
+
+ if (speculative)
+ spte = mark_spte_for_access_track(spte);
+
+set_pte:
+ if (mmu_spte_update(sptep, spte))
+ ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
+done:
+ return ret;
+}
+
+static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
+ int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
+ bool speculative, bool host_writable)
+{
+ int was_rmapped = 0;
+ int rmap_count;
+ int set_spte_ret;
+ int ret = RET_PF_RETRY;
+ bool flush = false;
+
+ pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
+ *sptep, write_fault, gfn);
+
+ if (is_shadow_present_pte(*sptep)) {
+ /*
+ * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+ * the parent of the now unreachable PTE.
+ */
+ if (level > PT_PAGE_TABLE_LEVEL &&
+ !is_large_pte(*sptep)) {
+ struct kvm_mmu_page *child;
+ u64 pte = *sptep;
+
+ child = page_header(pte & PT64_BASE_ADDR_MASK);
+ drop_parent_pte(child, sptep);
+ flush = true;
+ } else if (pfn != spte_to_pfn(*sptep)) {
+ pgprintk("hfn old %llx new %llx\n",
+ spte_to_pfn(*sptep), pfn);
+ drop_spte(vcpu->kvm, sptep);
+ flush = true;
+ } else
+ was_rmapped = 1;
+ }
+
+ set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
+ speculative, true, host_writable);
+ if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
+ if (write_fault)
+ ret = RET_PF_EMULATE;
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ }
+ if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
+ kvm_flush_remote_tlbs(vcpu->kvm);
+
+ if (unlikely(is_mmio_spte(*sptep)))
+ ret = RET_PF_EMULATE;
+
+ pgprintk("%s: setting spte %llx\n", __func__, *sptep);
+ trace_kvm_mmu_set_spte(level, gfn, sptep);
+ if (!was_rmapped && is_large_pte(*sptep))
+ ++vcpu->kvm->stat.lpages;
+
+ if (is_shadow_present_pte(*sptep)) {
+ if (!was_rmapped) {
+ rmap_count = rmap_add(vcpu, sptep, gfn);
+ if (rmap_count > RMAP_RECYCLE_THRESHOLD)
+ rmap_recycle(vcpu, sptep, gfn);
+ }
+ }
+
+ return ret;
+}
+
+static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
+ bool no_dirty_log)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
+ if (!slot)
+ return KVM_PFN_ERR_FAULT;
+
+ return gfn_to_pfn_memslot_atomic(slot, gfn);
+}
+
+static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp,
+ u64 *start, u64 *end)
+{
+ struct page *pages[PTE_PREFETCH_NUM];
+ struct kvm_memory_slot *slot;
+ unsigned access = sp->role.access;
+ int i, ret;
+ gfn_t gfn;
+
+ gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
+ slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
+ if (!slot)
+ return -1;
+
+ ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
+ if (ret <= 0)
+ return -1;
+
+ for (i = 0; i < ret; i++, gfn++, start++) {
+ mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
+ page_to_pfn(pages[i]), true, true);
+ put_page(pages[i]);
+ }
+
+ return 0;
+}
+
+static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *sptep)
+{
+ u64 *spte, *start = NULL;
+ int i;
+
+ WARN_ON(!sp->role.direct);
+
+ i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+ spte = sp->spt + i;
+
+ for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+ if (is_shadow_present_pte(*spte) || spte == sptep) {
+ if (!start)
+ continue;
+ if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
+ break;
+ start = NULL;
+ } else if (!start)
+ start = spte;
+ }
+}
+
+static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = page_header(__pa(sptep));
+
+ /*
+ * Without accessed bits, there's no way to distinguish between
+ * actually accessed translations and prefetched, so disable pte
+ * prefetch if accessed bits aren't available.
+ */
+ if (sp_ad_disabled(sp))
+ return;
+
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return;
+
+ __direct_pte_prefetch(vcpu, sp, sptep);
+}
+
+static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
+ gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
+{
+ int level = *levelp;
+ u64 spte = *it.sptep;
+
+ if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
+ is_nx_huge_page_enabled() &&
+ is_shadow_present_pte(spte) &&
+ !is_large_pte(spte)) {
+ /*
+ * A small SPTE exists for this pfn, but FNAME(fetch)
+ * and __direct_map would like to create a large PTE
+ * instead: just force them to go down another level,
+ * patching back for them into pfn the next 9 bits of
+ * the address.
+ */
+ u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
+ *pfnp |= gfn & page_mask;
+ (*levelp)--;
+ }
+}
+
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
+ int map_writable, int level, kvm_pfn_t pfn,
+ bool prefault, bool lpage_disallowed)
+{
+ struct kvm_shadow_walk_iterator it;
+ struct kvm_mmu_page *sp;
+ int ret;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ gfn_t base_gfn = gfn;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return RET_PF_RETRY;
+
+ trace_kvm_mmu_spte_requested(gpa, level, pfn);
+ for_each_shadow_entry(vcpu, gpa, it) {
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ disallowed_hugepage_adjust(it, gfn, &pfn, &level);
+
+ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ if (it.level == level)
+ break;
+
+ drop_large_spte(vcpu, it.sptep);
+ if (!is_shadow_present_pte(*it.sptep)) {
+ sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
+ it.level - 1, true, ACC_ALL);
+
+ link_shadow_page(vcpu, it.sptep, sp);
+ if (lpage_disallowed)
+ account_huge_nx_page(vcpu->kvm, sp);
+ }
+ }
+
+ ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
+ write, level, base_gfn, pfn, prefault,
+ map_writable);
+ direct_pte_prefetch(vcpu, it.sptep);
+ ++vcpu->stat.pf_fixed;
+ return ret;
+}
+
+static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
+{
+ siginfo_t info;
+
+ clear_siginfo(&info);
+ info.si_signo = SIGBUS;
+ info.si_errno = 0;
+ info.si_code = BUS_MCEERR_AR;
+ info.si_addr = (void __user *)address;
+ info.si_addr_lsb = PAGE_SHIFT;
+
+ send_sig_info(SIGBUS, &info, tsk);
+}
+
+static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
+{
+ /*
+ * Do not cache the mmio info caused by writing the readonly gfn
+ * into the spte otherwise read access on readonly gfn also can
+ * caused mmio page fault and treat it as mmio access.
+ */
+ if (pfn == KVM_PFN_ERR_RO_FAULT)
+ return RET_PF_EMULATE;
+
+ if (pfn == KVM_PFN_ERR_HWPOISON) {
+ kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
+ return RET_PF_RETRY;
+ }
+
+ return -EFAULT;
+}
+
+static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
+ gfn_t gfn, kvm_pfn_t *pfnp,
+ int *levelp)
+{
+ kvm_pfn_t pfn = *pfnp;
+ int level = *levelp;
+
+ /*
+ * Check if it's a transparent hugepage. If this would be an
+ * hugetlbfs page, level wouldn't be set to
+ * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
+ * here.
+ */
+ if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
+ !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
+ PageTransCompoundMap(pfn_to_page(pfn)) &&
+ !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
+ unsigned long mask;
+ /*
+ * mmu_notifier_retry was successful and we hold the
+ * mmu_lock here, so the pmd can't become splitting
+ * from under us, and in turn
+ * __split_huge_page_refcount() can't run from under
+ * us and we can safely transfer the refcount from
+ * PG_tail to PG_head as we switch the pfn to tail to
+ * head.
+ */
+ *levelp = level = PT_DIRECTORY_LEVEL;
+ mask = KVM_PAGES_PER_HPAGE(level) - 1;
+ VM_BUG_ON((gfn & mask) != (pfn & mask));
+ if (pfn & mask) {
+ kvm_release_pfn_clean(pfn);
+ pfn &= ~mask;
+ kvm_get_pfn(pfn);
+ *pfnp = pfn;
+ }
+ }
+}
+
+static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+ kvm_pfn_t pfn, unsigned access, int *ret_val)
+{
+ /* The pfn is invalid, report the error! */
+ if (unlikely(is_error_pfn(pfn))) {
+ *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
+ return true;
+ }
+
+ if (unlikely(is_noslot_pfn(pfn)))
+ vcpu_cache_mmio_info(vcpu, gva, gfn, access);
+
+ return false;
+}
+
+static bool page_fault_can_be_fast(u32 error_code)
+{
+ /*
+ * Do not fix the mmio spte with invalid generation number which
+ * need to be updated by slow page fault path.
+ */
+ if (unlikely(error_code & PFERR_RSVD_MASK))
+ return false;
+
+ /* See if the page fault is due to an NX violation */
+ if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
+ == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
+ return false;
+
+ /*
+ * #PF can be fast if:
+ * 1. The shadow page table entry is not present, which could mean that
+ * the fault is potentially caused by access tracking (if enabled).
+ * 2. The shadow page table entry is present and the fault
+ * is caused by write-protect, that means we just need change the W
+ * bit of the spte which can be done out of mmu-lock.
+ *
+ * However, if access tracking is disabled we know that a non-present
+ * page must be a genuine page fault where we have to create a new SPTE.
+ * So, if access tracking is disabled, we return true only for write
+ * accesses to a present page.
+ */
+
+ return shadow_acc_track_mask != 0 ||
+ ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
+ == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
+}
+
+/*
+ * Returns true if the SPTE was fixed successfully. Otherwise,
+ * someone else modified the SPTE from its original value.
+ */
+static bool
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ u64 *sptep, u64 old_spte, u64 new_spte)
+{
+ gfn_t gfn;
+
+ WARN_ON(!sp->role.direct);
+
+ /*
+ * Theoretically we could also set dirty bit (and flush TLB) here in
+ * order to eliminate unnecessary PML logging. See comments in
+ * set_spte. But fast_page_fault is very unlikely to happen with PML
+ * enabled, so we do not do this. This might result in the same GPA
+ * to be logged in PML buffer again when the write really happens, and
+ * eventually to be called by mark_page_dirty twice. But it's also no
+ * harm. This also avoids the TLB flush needed after setting dirty bit
+ * so non-PML cases won't be impacted.
+ *
+ * Compare with set_spte where instead shadow_dirty_mask is set.
+ */
+ if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
+ return false;
+
+ if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
+ /*
+ * The gfn of direct spte is stable since it is
+ * calculated by sp->gfn.
+ */
+ gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ }
+
+ return true;
+}
+
+static bool is_access_allowed(u32 fault_err_code, u64 spte)
+{
+ if (fault_err_code & PFERR_FETCH_MASK)
+ return is_executable_pte(spte);
+
+ if (fault_err_code & PFERR_WRITE_MASK)
+ return is_writable_pte(spte);
+
+ /* Fault was on Read access */
+ return spte & PT_PRESENT_MASK;
+}
+
+/*
+ * Return value:
+ * - true: let the vcpu to access on the same address again.
+ * - false: let the real page fault path to fix it.
+ */
+static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level,
+ u32 error_code)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ struct kvm_mmu_page *sp;
+ bool fault_handled = false;
+ u64 spte = 0ull;
+ uint retry_count = 0;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return false;
+
+ if (!page_fault_can_be_fast(error_code))
+ return false;
+
+ walk_shadow_page_lockless_begin(vcpu);
+
+ do {
+ u64 new_spte;
+
+ for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte)
+ if (!is_shadow_present_pte(spte) ||
+ iterator.level < level)
+ break;
+
+ sp = page_header(__pa(iterator.sptep));
+ if (!is_last_spte(spte, sp->role.level))
+ break;
+
+ /*
+ * Check whether the memory access that caused the fault would
+ * still cause it if it were to be performed right now. If not,
+ * then this is a spurious fault caused by TLB lazily flushed,
+ * or some other CPU has already fixed the PTE after the
+ * current CPU took the fault.
+ *
+ * Need not check the access of upper level table entries since
+ * they are always ACC_ALL.
+ */
+ if (is_access_allowed(error_code, spte)) {
+ fault_handled = true;
+ break;
+ }
+
+ new_spte = spte;
+
+ if (is_access_track_spte(spte))
+ new_spte = restore_acc_track_spte(new_spte);
+
+ /*
+ * Currently, to simplify the code, write-protection can
+ * be removed in the fast path only if the SPTE was
+ * write-protected for dirty-logging or access tracking.
+ */
+ if ((error_code & PFERR_WRITE_MASK) &&
+ spte_can_locklessly_be_made_writable(spte))
+ {
+ new_spte |= PT_WRITABLE_MASK;
+
+ /*
+ * Do not fix write-permission on the large spte. Since
+ * we only dirty the first page into the dirty-bitmap in
+ * fast_pf_fix_direct_spte(), other pages are missed
+ * if its slot has dirty logging enabled.
+ *
+ * Instead, we let the slow page fault path create a
+ * normal spte to fix the access.
+ *
+ * See the comments in kvm_arch_commit_memory_region().
+ */
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ break;
+ }
+
+ /* Verify that the fault can be handled in the fast path */
+ if (new_spte == spte ||
+ !is_access_allowed(error_code, new_spte))
+ break;
+
+ /*
+ * Currently, fast page fault only works for direct mapping
+ * since the gfn is not stable for indirect shadow page. See
+ * Documentation/virtual/kvm/locking.txt to get more detail.
+ */
+ fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
+ iterator.sptep, spte,
+ new_spte);
+ if (fault_handled)
+ break;
+
+ if (++retry_count > 4) {
+ printk_once(KERN_WARNING
+ "kvm: Fast #PF retrying more than 4 times.\n");
+ break;
+ }
+
+ } while (true);
+
+ trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
+ spte, fault_handled);
+ walk_shadow_page_lockless_end(vcpu);
+
+ return fault_handled;
+}
+
+static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
+ gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
+ bool *writable);
+static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
+
+static int nonpaging_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
+ gfn_t gfn, bool prefault)
+{
+ int r;
+ int level;
+ bool force_pt_level;
+ kvm_pfn_t pfn;
+ unsigned long mmu_seq;
+ bool map_writable, write = error_code & PFERR_WRITE_MASK;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();
+
+ force_pt_level = lpage_disallowed;
+ level = mapping_level(vcpu, gfn, &force_pt_level);
+ if (likely(!force_pt_level)) {
+ /*
+ * This path builds a PAE pagetable - so we can map
+ * 2mb pages at maximum. Therefore check if the level
+ * is larger than that.
+ */
+ if (level > PT_DIRECTORY_LEVEL)
+ level = PT_DIRECTORY_LEVEL;
+
+ gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ }
+
+ if (fast_page_fault(vcpu, gpa, level, error_code))
+ return RET_PF_RETRY;
+
+ mmu_seq = vcpu->kvm->mmu_notifier_seq;
+ smp_rmb();
+
+ if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+ return RET_PF_RETRY;
+
+ if (handle_abnormal_pfn(vcpu, gpa, gfn, pfn, ACC_ALL, &r))
+ return r;
+
+ r = RET_PF_RETRY;
+ spin_lock(&vcpu->kvm->mmu_lock);
+ if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+ goto out_unlock;
+ if (make_mmu_pages_available(vcpu) < 0)
+ goto out_unlock;
+ if (likely(!force_pt_level))
+ transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
+ r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+ prefault, false);
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ return r;
+}
+
+static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
+ struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(*root_hpa))
+ return;
+
+ sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK);
+ --sp->root_count;
+ if (!sp->root_count && sp->role.invalid)
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+
+ *root_hpa = INVALID_PAGE;
+}
+
+/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
+void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free)
+{
+ int i;
+ LIST_HEAD(invalid_list);
+ struct kvm_mmu *mmu = &vcpu->arch.mmu;
+ bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
+
+ BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
+
+ /* Before acquiring the MMU lock, see if we need to do any real work. */
+ if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
+ VALID_PAGE(mmu->prev_roots[i].hpa))
+ break;
+
+ if (i == KVM_MMU_NUM_PREV_ROOTS)
+ return;
+ }
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
+ mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
+ &invalid_list);
+
+ if (free_active_root) {
+ if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
+ (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
+ mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
+ &invalid_list);
+ } else {
+ for (i = 0; i < 4; ++i)
+ if (mmu->pae_root[i] != 0)
+ mmu_free_root_page(vcpu->kvm,
+ &mmu->pae_root[i],
+ &invalid_list);
+ mmu->root_hpa = INVALID_PAGE;
+ }
+ }
+
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
+
+static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
+{
+ int ret = 0;
+
+ if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *sp;
+ unsigned i;
+
+ if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) {
+ spin_lock(&vcpu->kvm->mmu_lock);
+ if(make_mmu_pages_available(vcpu) < 0) {
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return -ENOSPC;
+ }
+ sp = kvm_mmu_get_page(vcpu, 0, 0,
+ vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
+ ++sp->root_count;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.root_hpa = __pa(sp->spt);
+ } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ MMU_WARN_ON(VALID_PAGE(root));
+ spin_lock(&vcpu->kvm->mmu_lock);
+ if (make_mmu_pages_available(vcpu) < 0) {
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return -ENOSPC;
+ }
+ sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
+ i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
+ root = __pa(sp->spt);
+ ++sp->root_count;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+ }
+ vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+ } else
+ BUG();
+
+ return 0;
+}
+
+static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *sp;
+ u64 pdptr, pm_mask;
+ gfn_t root_gfn;
+ int i;
+
+ root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
+
+ if (mmu_check_root(vcpu, root_gfn))
+ return 1;
+
+ /*
+ * Do we shadow a long mode page table? If so we need to
+ * write-protect the guests page table root.
+ */
+ if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+
+ MMU_WARN_ON(VALID_PAGE(root));
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ if (make_mmu_pages_available(vcpu) < 0) {
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return -ENOSPC;
+ }
+ sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
+ vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL);
+ root = __pa(sp->spt);
+ ++sp->root_count;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.root_hpa = root;
+ return 0;
+ }
+
+ /*
+ * We shadow a 32 bit page table. This may be a legacy 2-level
+ * or a PAE 3-level page table. In either case we need to be aware that
+ * the shadow page table may be a PAE or a long mode page table.
+ */
+ pm_mask = PT_PRESENT_MASK;
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL)
+ pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
+
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ MMU_WARN_ON(VALID_PAGE(root));
+ if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
+ pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
+ if (!(pdptr & PT_PRESENT_MASK)) {
+ vcpu->arch.mmu.pae_root[i] = 0;
+ continue;
+ }
+ root_gfn = pdptr >> PAGE_SHIFT;
+ if (mmu_check_root(vcpu, root_gfn))
+ return 1;
+ }
+ spin_lock(&vcpu->kvm->mmu_lock);
+ if (make_mmu_pages_available(vcpu) < 0) {
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return -ENOSPC;
+ }
+ sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
+ 0, ACC_ALL);
+ root = __pa(sp->spt);
+ ++sp->root_count;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+
+ vcpu->arch.mmu.pae_root[i] = root | pm_mask;
+ }
+ vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+
+ /*
+ * If we shadow a 32 bit page table with a long mode page
+ * table we enter this path.
+ */
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
+ if (vcpu->arch.mmu.lm_root == NULL) {
+ /*
+ * The additional page necessary for this is only
+ * allocated on demand.
+ */
+
+ u64 *lm_root;
+
+ lm_root = (void*)get_zeroed_page(GFP_KERNEL);
+ if (lm_root == NULL)
+ return 1;
+
+ lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
+
+ vcpu->arch.mmu.lm_root = lm_root;
+ }
+
+ vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
+ }
+
+ return 0;
+}
+
+static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.mmu.direct_map)
+ return mmu_alloc_direct_roots(vcpu);
+ else
+ return mmu_alloc_shadow_roots(vcpu);
+}
+
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+
+ if (vcpu->arch.mmu.direct_map)
+ return;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+
+ if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+
+ sp = page_header(root);
+
+ /*
+ * Even if another CPU was marking the SP as unsync-ed
+ * simultaneously, any guest page table changes are not
+ * guaranteed to be visible anyway until this VCPU issues a TLB
+ * flush strictly after those changes are made. We only need to
+ * ensure that the other CPU sets these flags before any actual
+ * changes to the page tables are made. The comments in
+ * mmu_need_write_protect() describe what could go wrong if this
+ * requirement isn't satisfied.
+ */
+ if (!smp_load_acquire(&sp->unsync) &&
+ !smp_load_acquire(&sp->unsync_children))
+ return;
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+
+ mmu_sync_children(vcpu, sp);
+
+ kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return;
+ }
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ if (root && VALID_PAGE(root)) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ mmu_sync_children(vcpu, sp);
+ }
+ }
+
+ kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
+
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
+ u32 access, struct x86_exception *exception)
+{
+ if (exception)
+ exception->error_code = 0;
+ return vaddr;
+}
+
+static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr,
+ u32 access,
+ struct x86_exception *exception)
+{
+ if (exception)
+ exception->error_code = 0;
+ return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
+}
+
+static bool
+__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
+{
+ int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f;
+
+ return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) |
+ ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
+}
+
+static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+ return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level);
+}
+
+static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
+{
+ return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
+}
+
+static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+ /*
+ * A nested guest cannot use the MMIO cache if it is using nested
+ * page tables, because cr2 is a nGPA while the cache stores GPAs.
+ */
+ if (mmu_is_nested(vcpu))
+ return false;
+
+ if (direct)
+ return vcpu_match_mmio_gpa(vcpu, addr);
+
+ return vcpu_match_mmio_gva(vcpu, addr);
+}
+
+/* return true if reserved bit is detected on spte. */
+static bool
+walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
+ int root, leaf;
+ bool reserved = false;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ goto exit;
+
+ walk_shadow_page_lockless_begin(vcpu);
+
+ for (shadow_walk_init(&iterator, vcpu, addr),
+ leaf = root = iterator.level;
+ shadow_walk_okay(&iterator);
+ __shadow_walk_next(&iterator, spte)) {
+ spte = mmu_spte_get_lockless(iterator.sptep);
+
+ sptes[leaf - 1] = spte;
+ leaf--;
+
+ if (!is_shadow_present_pte(spte))
+ break;
+
+ reserved |= is_shadow_zero_bits_set(&vcpu->arch.mmu, spte,
+ iterator.level);
+ }
+
+ walk_shadow_page_lockless_end(vcpu);
+
+ if (reserved) {
+ pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
+ __func__, addr);
+ while (root > leaf) {
+ pr_err("------ spte 0x%llx level %d.\n",
+ sptes[root - 1], root);
+ root--;
+ }
+ }
+exit:
+ *sptep = spte;
+ return reserved;
+}
+
+static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+ u64 spte;
+ bool reserved;
+
+ if (mmio_info_in_cache(vcpu, addr, direct))
+ return RET_PF_EMULATE;
+
+ reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
+ if (WARN_ON(reserved))
+ return -EINVAL;
+
+ if (is_mmio_spte(spte)) {
+ gfn_t gfn = get_mmio_spte_gfn(spte);
+ unsigned access = get_mmio_spte_access(spte);
+
+ if (!check_mmio_spte(vcpu, spte))
+ return RET_PF_INVALID;
+
+ if (direct)
+ addr = 0;
+
+ trace_handle_mmio_page_fault(addr, gfn, access);
+ vcpu_cache_mmio_info(vcpu, addr, gfn, access);
+ return RET_PF_EMULATE;
+ }
+
+ /*
+ * If the page table is zapped by other cpus, let CPU fault again on
+ * the address.
+ */
+ return RET_PF_RETRY;
+}
+
+static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
+ u32 error_code, gfn_t gfn)
+{
+ if (unlikely(error_code & PFERR_RSVD_MASK))
+ return false;
+
+ if (!(error_code & PFERR_PRESENT_MASK) ||
+ !(error_code & PFERR_WRITE_MASK))
+ return false;
+
+ /*
+ * guest is writing the page which is write tracked which can
+ * not be fixed by page fault handler.
+ */
+ if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+ return true;
+
+ return false;
+}
+
+static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ u64 spte;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+
+ walk_shadow_page_lockless_begin(vcpu);
+ for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+ clear_sp_write_flooding_count(iterator.sptep);
+ if (!is_shadow_present_pte(spte))
+ break;
+ }
+ walk_shadow_page_lockless_end(vcpu);
+}
+
+static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
+ u32 error_code, bool prefault)
+{
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ int r;
+
+ /* Note, paging is disabled, ergo gva == gpa. */
+ pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code);
+
+ if (page_fault_handle_page_track(vcpu, error_code, gfn))
+ return RET_PF_EMULATE;
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
+
+ MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+
+ return nonpaging_map(vcpu, gpa & PAGE_MASK,
+ error_code, gfn, prefault);
+}
+
+static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ gfn_t gfn)
+{
+ struct kvm_arch_async_pf arch;
+
+ arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
+ arch.gfn = gfn;
+ arch.direct_map = vcpu->arch.mmu.direct_map;
+ arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
+
+ return kvm_setup_async_pf(vcpu, cr2_or_gpa,
+ kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
+}
+
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(!lapic_in_kernel(vcpu) ||
+ kvm_event_needs_reinjection(vcpu) ||
+ vcpu->arch.exception.pending))
+ return false;
+
+ if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
+ return false;
+
+ return kvm_x86_ops->interrupt_allowed(vcpu);
+}
+
+static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
+ gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
+ bool *writable)
+{
+ struct kvm_memory_slot *slot;
+ bool async;
+
+ /*
+ * Don't expose private memslots to L2.
+ */
+ if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) {
+ *pfn = KVM_PFN_NOSLOT;
+ return false;
+ }
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ async = false;
+ *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
+ if (!async)
+ return false; /* *pfn has correct page already */
+
+ if (!prefault && kvm_can_do_async_pf(vcpu)) {
+ trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
+ if (kvm_find_async_pf_gfn(vcpu, gfn)) {
+ trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
+ kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+ return true;
+ } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
+ return true;
+ }
+
+ *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
+ return false;
+}
+
+int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
+ u64 fault_address, char *insn, int insn_len)
+{
+ int r = 1;
+
+#ifndef CONFIG_X86_64
+ /* A 64-bit CR2 should be impossible on 32-bit KVM. */
+ if (WARN_ON_ONCE(fault_address >> 32))
+ return -EFAULT;
+#endif
+
+ vcpu->arch.l1tf_flush_l1d = true;
+ switch (vcpu->arch.apf.host_apf_reason) {
+ default:
+ trace_kvm_page_fault(fault_address, error_code);
+
+ if (kvm_event_needs_reinjection(vcpu))
+ kvm_mmu_unprotect_page_virt(vcpu, fault_address);
+ r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
+ insn_len);
+ break;
+ case KVM_PV_REASON_PAGE_NOT_PRESENT:
+ vcpu->arch.apf.host_apf_reason = 0;
+ local_irq_disable();
+ kvm_async_pf_task_wait(fault_address, 0);
+ local_irq_enable();
+ break;
+ case KVM_PV_REASON_PAGE_READY:
+ vcpu->arch.apf.host_apf_reason = 0;
+ local_irq_disable();
+ kvm_async_pf_task_wake(fault_address);
+ local_irq_enable();
+ break;
+ }
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
+
+static bool
+check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
+{
+ int page_num = KVM_PAGES_PER_HPAGE(level);
+
+ gfn &= ~(page_num - 1);
+
+ return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
+}
+
+static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
+ bool prefault)
+{
+ kvm_pfn_t pfn;
+ int r;
+ int level;
+ bool force_pt_level;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ unsigned long mmu_seq;
+ int write = error_code & PFERR_WRITE_MASK;
+ bool map_writable;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();
+
+ MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ if (page_fault_handle_page_track(vcpu, error_code, gfn))
+ return RET_PF_EMULATE;
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
+
+ force_pt_level =
+ lpage_disallowed ||
+ !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
+ level = mapping_level(vcpu, gfn, &force_pt_level);
+ if (likely(!force_pt_level)) {
+ if (level > PT_DIRECTORY_LEVEL &&
+ !check_hugepage_cache_consistency(vcpu, gfn, level))
+ level = PT_DIRECTORY_LEVEL;
+ gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ }
+
+ if (fast_page_fault(vcpu, gpa, level, error_code))
+ return RET_PF_RETRY;
+
+ mmu_seq = vcpu->kvm->mmu_notifier_seq;
+ smp_rmb();
+
+ if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+ return RET_PF_RETRY;
+
+ if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
+ return r;
+
+ r = RET_PF_RETRY;
+ spin_lock(&vcpu->kvm->mmu_lock);
+ if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+ goto out_unlock;
+ if (make_mmu_pages_available(vcpu) < 0)
+ goto out_unlock;
+ if (likely(!force_pt_level))
+ transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
+ r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+ prefault, lpage_disallowed);
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ return r;
+}
+
+static void nonpaging_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+ context->page_fault = nonpaging_page_fault;
+ context->gva_to_gpa = nonpaging_gva_to_gpa;
+ context->sync_page = nonpaging_sync_page;
+ context->invlpg = nonpaging_invlpg;
+ context->update_pte = nonpaging_update_pte;
+ context->root_level = 0;
+ context->shadow_root_level = PT32E_ROOT_LEVEL;
+ context->direct_map = true;
+ context->nx = false;
+}
+
+/*
+ * Find out if a previously cached root matching the new CR3/role is available.
+ * The current root is also inserted into the cache.
+ * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
+ * returned.
+ * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
+ * false is returned. This root should now be freed by the caller.
+ */
+static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
+ union kvm_mmu_page_role new_role)
+{
+ uint i;
+ struct kvm_mmu_root_info root;
+ struct kvm_mmu *mmu = &vcpu->arch.mmu;
+
+ root.cr3 = mmu->get_cr3(vcpu);
+ root.hpa = mmu->root_hpa;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ swap(root, mmu->prev_roots[i]);
+
+ if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) &&
+ page_header(root.hpa) != NULL &&
+ new_role.word == page_header(root.hpa)->role.word)
+ break;
+ }
+
+ mmu->root_hpa = root.hpa;
+
+ return i < KVM_MMU_NUM_PREV_ROOTS;
+}
+
+static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
+ union kvm_mmu_page_role new_role,
+ bool skip_tlb_flush)
+{
+ struct kvm_mmu *mmu = &vcpu->arch.mmu;
+
+ /*
+ * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
+ * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
+ * later if necessary.
+ */
+ if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
+ mmu->root_level >= PT64_ROOT_4LEVEL) {
+ if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
+ return false;
+
+ if (cached_root_available(vcpu, new_cr3, new_role)) {
+ /*
+ * It is possible that the cached previous root page is
+ * obsolete because of a change in the MMU
+ * generation number. However, that is accompanied by
+ * KVM_REQ_MMU_RELOAD, which will free the root that we
+ * have set here and allocate a new one.
+ */
+
+ kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
+ if (!skip_tlb_flush) {
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_x86_ops->tlb_flush(vcpu, true);
+ }
+
+ /*
+ * The last MMIO access's GVA and GPA are cached in the
+ * VCPU. When switching to a new CR3, that GVA->GPA
+ * mapping may no longer be valid. So clear any cached
+ * MMIO info even when we don't need to sync the shadow
+ * page tables.
+ */
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+
+ __clear_sp_write_flooding_count(
+ page_header(mmu->root_hpa));
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
+ union kvm_mmu_page_role new_role,
+ bool skip_tlb_flush)
+{
+ if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
+ kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT);
+}
+
+void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
+{
+ __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu),
+ skip_tlb_flush);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
+
+static unsigned long get_cr3(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr3(vcpu);
+}
+
+static void inject_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ vcpu->arch.mmu.inject_page_fault(vcpu, fault);
+}
+
+static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
+ unsigned access, int *nr_present)
+{
+ if (unlikely(is_mmio_spte(*sptep))) {
+ if (gfn != get_mmio_spte_gfn(*sptep)) {
+ mmu_spte_clear_no_track(sptep);
+ return true;
+ }
+
+ (*nr_present)++;
+ mark_mmio_spte(vcpu, sptep, gfn, access);
+ return true;
+ }
+
+ return false;
+}
+
+static inline bool is_last_gpte(struct kvm_mmu *mmu,
+ unsigned level, unsigned gpte)
+{
+ /*
+ * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
+ * If it is clear, there are no large pages at this level, so clear
+ * PT_PAGE_SIZE_MASK in gpte if that is the case.
+ */
+ gpte &= level - mmu->last_nonleaf_level;
+
+ /*
+ * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set
+ * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
+ * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
+ */
+ gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
+
+ return gpte & PT_PAGE_SIZE_MASK;
+}
+
+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+#define PTTYPE 64
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+#define PTTYPE 32
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+static void
+__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
+ struct rsvd_bits_validate *rsvd_check,
+ int maxphyaddr, int level, bool nx, bool gbpages,
+ bool pse, bool amd)
+{
+ u64 exb_bit_rsvd = 0;
+ u64 gbpages_bit_rsvd = 0;
+ u64 nonleaf_bit8_rsvd = 0;
+
+ rsvd_check->bad_mt_xwr = 0;
+
+ if (!nx)
+ exb_bit_rsvd = rsvd_bits(63, 63);
+ if (!gbpages)
+ gbpages_bit_rsvd = rsvd_bits(7, 7);
+
+ /*
+ * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
+ * leaf entries) on AMD CPUs only.
+ */
+ if (amd)
+ nonleaf_bit8_rsvd = rsvd_bits(8, 8);
+
+ switch (level) {
+ case PT32_ROOT_LEVEL:
+ /* no rsvd bits for 2 level 4K page table entries */
+ rsvd_check->rsvd_bits_mask[0][1] = 0;
+ rsvd_check->rsvd_bits_mask[0][0] = 0;
+ rsvd_check->rsvd_bits_mask[1][0] =
+ rsvd_check->rsvd_bits_mask[0][0];
+
+ if (!pse) {
+ rsvd_check->rsvd_bits_mask[1][1] = 0;
+ break;
+ }
+
+ if (is_cpuid_PSE36())
+ /* 36bits PSE 4MB page */
+ rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
+ else
+ /* 32 bits PSE 4MB page */
+ rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
+ break;
+ case PT32E_ROOT_LEVEL:
+ rsvd_check->rsvd_bits_mask[0][2] =
+ rsvd_bits(maxphyaddr, 63) |
+ rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */
+ rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 62); /* PDE */
+ rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 62); /* PTE */
+ rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 62) |
+ rsvd_bits(13, 20); /* large page */
+ rsvd_check->rsvd_bits_mask[1][0] =
+ rsvd_check->rsvd_bits_mask[0][0];
+ break;
+ case PT64_ROOT_5LEVEL:
+ rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
+ nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
+ rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[1][4] =
+ rsvd_check->rsvd_bits_mask[0][4];
+ case PT64_ROOT_4LEVEL:
+ rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
+ nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
+ rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
+ gbpages_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[1][3] =
+ rsvd_check->rsvd_bits_mask[0][3];
+ rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
+ gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
+ rsvd_bits(13, 29);
+ rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51) |
+ rsvd_bits(13, 20); /* large page */
+ rsvd_check->rsvd_bits_mask[1][0] =
+ rsvd_check->rsvd_bits_mask[0][0];
+ break;
+ }
+}
+
+static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+ __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
+ cpuid_maxphyaddr(vcpu), context->root_level,
+ context->nx,
+ guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
+ is_pse(vcpu), guest_cpuid_is_amd(vcpu));
+}
+
+static void
+__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
+ int maxphyaddr, bool execonly)
+{
+ u64 bad_mt_xwr;
+
+ rsvd_check->rsvd_bits_mask[0][4] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
+ rsvd_check->rsvd_bits_mask[0][3] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
+ rsvd_check->rsvd_bits_mask[0][2] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
+ rsvd_check->rsvd_bits_mask[0][1] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
+ rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
+
+ /* large page */
+ rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
+ rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
+ rsvd_check->rsvd_bits_mask[1][2] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
+ rsvd_check->rsvd_bits_mask[1][1] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
+ rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
+
+ bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
+ bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */
+ bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */
+ if (!execonly) {
+ /* bits 0..2 must not be 100 unless VMX capabilities allow it */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
+ }
+ rsvd_check->bad_mt_xwr = bad_mt_xwr;
+}
+
+static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context, bool execonly)
+{
+ __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
+ cpuid_maxphyaddr(vcpu), execonly);
+}
+
+/*
+ * the page table on host is the shadow page table for the page
+ * table in guest or amd nested guest, its mmu features completely
+ * follow the features in guest.
+ */
+void
+reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+{
+ /*
+ * KVM uses NX when TDP is disabled to handle a variety of scenarios,
+ * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
+ * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
+ * The iTLB multi-hit workaround can be toggled at any time, so assume
+ * NX can be used by any non-nested shadow MMU to avoid having to reset
+ * MMU contexts. Note, KVM forces EFER.NX=1 when TDP is disabled.
+ */
+ bool uses_nx = context->nx || !tdp_enabled ||
+ context->base_role.smep_andnot_wp;
+ struct rsvd_bits_validate *shadow_zero_check;
+ int i;
+
+ /*
+ * Passing "true" to the last argument is okay; it adds a check
+ * on bit 8 of the SPTEs which KVM doesn't use anyway.
+ */
+ shadow_zero_check = &context->shadow_zero_check;
+ __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
+ shadow_phys_bits,
+ context->shadow_root_level, uses_nx,
+ guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
+ is_pse(vcpu), true);
+
+ if (!shadow_me_mask)
+ return;
+
+ for (i = context->shadow_root_level; --i >= 0;) {
+ shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
+ shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
+ }
+
+}
+EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
+
+static inline bool boot_cpu_is_amd(void)
+{
+ WARN_ON_ONCE(!tdp_enabled);
+ return shadow_x_mask == 0;
+}
+
+/*
+ * the direct page table on host, use as much mmu features as
+ * possible, however, kvm currently does not do execution-protection.
+ */
+static void
+reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+ struct rsvd_bits_validate *shadow_zero_check;
+ int i;
+
+ shadow_zero_check = &context->shadow_zero_check;
+
+ if (boot_cpu_is_amd())
+ __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
+ shadow_phys_bits,
+ context->shadow_root_level, false,
+ boot_cpu_has(X86_FEATURE_GBPAGES),
+ true, true);
+ else
+ __reset_rsvds_bits_mask_ept(shadow_zero_check,
+ shadow_phys_bits,
+ false);
+
+ if (!shadow_me_mask)
+ return;
+
+ for (i = context->shadow_root_level; --i >= 0;) {
+ shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
+ shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
+ }
+}
+
+/*
+ * as the comments in reset_shadow_zero_bits_mask() except it
+ * is the shadow page table for intel nested guest.
+ */
+static void
+reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context, bool execonly)
+{
+ __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
+ shadow_phys_bits, execonly);
+}
+
+#define BYTE_MASK(access) \
+ ((1 & (access) ? 2 : 0) | \
+ (2 & (access) ? 4 : 0) | \
+ (3 & (access) ? 8 : 0) | \
+ (4 & (access) ? 16 : 0) | \
+ (5 & (access) ? 32 : 0) | \
+ (6 & (access) ? 64 : 0) | \
+ (7 & (access) ? 128 : 0))
+
+
+static void update_permission_bitmask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu, bool ept)
+{
+ unsigned byte;
+
+ const u8 x = BYTE_MASK(ACC_EXEC_MASK);
+ const u8 w = BYTE_MASK(ACC_WRITE_MASK);
+ const u8 u = BYTE_MASK(ACC_USER_MASK);
+
+ bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
+ bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
+ bool cr0_wp = is_write_protection(vcpu);
+
+ for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
+ unsigned pfec = byte << 1;
+
+ /*
+ * Each "*f" variable has a 1 bit for each UWX value
+ * that causes a fault with the given PFEC.
+ */
+
+ /* Faults from writes to non-writable pages */
+ u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
+ /* Faults from user mode accesses to supervisor pages */
+ u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
+ /* Faults from fetches of non-executable pages*/
+ u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
+ /* Faults from kernel mode fetches of user pages */
+ u8 smepf = 0;
+ /* Faults from kernel mode accesses of user pages */
+ u8 smapf = 0;
+
+ if (!ept) {
+ /* Faults from kernel mode accesses to user pages */
+ u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
+
+ /* Not really needed: !nx will cause pte.nx to fault */
+ if (!mmu->nx)
+ ff = 0;
+
+ /* Allow supervisor writes if !cr0.wp */
+ if (!cr0_wp)
+ wf = (pfec & PFERR_USER_MASK) ? wf : 0;
+
+ /* Disallow supervisor fetches of user code if cr4.smep */
+ if (cr4_smep)
+ smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
+
+ /*
+ * SMAP:kernel-mode data accesses from user-mode
+ * mappings should fault. A fault is considered
+ * as a SMAP violation if all of the following
+ * conditions are ture:
+ * - X86_CR4_SMAP is set in CR4
+ * - A user page is accessed
+ * - The access is not a fetch
+ * - Page fault in kernel mode
+ * - if CPL = 3 or X86_EFLAGS_AC is clear
+ *
+ * Here, we cover the first three conditions.
+ * The fourth is computed dynamically in permission_fault();
+ * PFERR_RSVD_MASK bit will be set in PFEC if the access is
+ * *not* subject to SMAP restrictions.
+ */
+ if (cr4_smap)
+ smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
+ }
+
+ mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
+ }
+}
+
+/*
+* PKU is an additional mechanism by which the paging controls access to
+* user-mode addresses based on the value in the PKRU register. Protection
+* key violations are reported through a bit in the page fault error code.
+* Unlike other bits of the error code, the PK bit is not known at the
+* call site of e.g. gva_to_gpa; it must be computed directly in
+* permission_fault based on two bits of PKRU, on some machine state (CR4,
+* CR0, EFER, CPL), and on other bits of the error code and the page tables.
+*
+* In particular the following conditions come from the error code, the
+* page tables and the machine state:
+* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
+* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
+* - PK is always zero if U=0 in the page tables
+* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
+*
+* The PKRU bitmask caches the result of these four conditions. The error
+* code (minus the P bit) and the page table's U bit form an index into the
+* PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed
+* with the two bits of the PKRU register corresponding to the protection key.
+* For the first three conditions above the bits will be 00, thus masking
+* away both AD and WD. For all reads or if the last condition holds, WD
+* only will be masked away.
+*/
+static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ bool ept)
+{
+ unsigned bit;
+ bool wp;
+
+ if (ept) {
+ mmu->pkru_mask = 0;
+ return;
+ }
+
+ /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
+ if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
+ mmu->pkru_mask = 0;
+ return;
+ }
+
+ wp = is_write_protection(vcpu);
+
+ for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
+ unsigned pfec, pkey_bits;
+ bool check_pkey, check_write, ff, uf, wf, pte_user;
+
+ pfec = bit << 1;
+ ff = pfec & PFERR_FETCH_MASK;
+ uf = pfec & PFERR_USER_MASK;
+ wf = pfec & PFERR_WRITE_MASK;
+
+ /* PFEC.RSVD is replaced by ACC_USER_MASK. */
+ pte_user = pfec & PFERR_RSVD_MASK;
+
+ /*
+ * Only need to check the access which is not an
+ * instruction fetch and is to a user page.
+ */
+ check_pkey = (!ff && pte_user);
+ /*
+ * write access is controlled by PKRU if it is a
+ * user access or CR0.WP = 1.
+ */
+ check_write = check_pkey && wf && (uf || wp);
+
+ /* PKRU.AD stops both read and write access. */
+ pkey_bits = !!check_pkey;
+ /* PKRU.WD stops write access. */
+ pkey_bits |= (!!check_write) << 1;
+
+ mmu->pkru_mask |= (pkey_bits & 3) << pfec;
+ }
+}
+
+static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+{
+ unsigned root_level = mmu->root_level;
+
+ mmu->last_nonleaf_level = root_level;
+ if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
+ mmu->last_nonleaf_level++;
+}
+
+static void paging64_init_context_common(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context,
+ int level)
+{
+ context->nx = is_nx(vcpu);
+ context->root_level = level;
+
+ reset_rsvds_bits_mask(vcpu, context);
+ update_permission_bitmask(vcpu, context, false);
+ update_pkru_bitmask(vcpu, context, false);
+ update_last_nonleaf_level(vcpu, context);
+
+ MMU_WARN_ON(!is_pae(vcpu));
+ context->page_fault = paging64_page_fault;
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ context->sync_page = paging64_sync_page;
+ context->invlpg = paging64_invlpg;
+ context->update_pte = paging64_update_pte;
+ context->shadow_root_level = level;
+ context->direct_map = false;
+}
+
+static void paging64_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+ int root_level = is_la57_mode(vcpu) ?
+ PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
+
+ paging64_init_context_common(vcpu, context, root_level);
+}
+
+static void paging32_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+ context->nx = false;
+ context->root_level = PT32_ROOT_LEVEL;
+
+ reset_rsvds_bits_mask(vcpu, context);
+ update_permission_bitmask(vcpu, context, false);
+ update_pkru_bitmask(vcpu, context, false);
+ update_last_nonleaf_level(vcpu, context);
+
+ context->page_fault = paging32_page_fault;
+ context->gva_to_gpa = paging32_gva_to_gpa;
+ context->sync_page = paging32_sync_page;
+ context->invlpg = paging32_invlpg;
+ context->update_pte = paging32_update_pte;
+ context->shadow_root_level = PT32E_ROOT_LEVEL;
+ context->direct_map = false;
+}
+
+static void paging32E_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+ paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
+}
+
+static union kvm_mmu_page_role
+kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu)
+{
+ union kvm_mmu_page_role role = {0};
+
+ role.guest_mode = is_guest_mode(vcpu);
+ role.smm = is_smm(vcpu);
+ role.ad_disabled = (shadow_accessed_mask == 0);
+ role.level = kvm_x86_ops->get_tdp_level(vcpu);
+ role.direct = true;
+ role.access = ACC_ALL;
+
+ return role;
+}
+
+static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ context->base_role.word = mmu_base_role_mask.word &
+ kvm_calc_tdp_mmu_root_page_role(vcpu).word;
+ context->page_fault = tdp_page_fault;
+ context->sync_page = nonpaging_sync_page;
+ context->invlpg = nonpaging_invlpg;
+ context->update_pte = nonpaging_update_pte;
+ context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
+ context->direct_map = true;
+ context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
+ context->get_cr3 = get_cr3;
+ context->get_pdptr = kvm_pdptr_read;
+ context->inject_page_fault = kvm_inject_page_fault;
+
+ if (!is_paging(vcpu)) {
+ context->nx = false;
+ context->gva_to_gpa = nonpaging_gva_to_gpa;
+ context->root_level = 0;
+ } else if (is_long_mode(vcpu)) {
+ context->nx = is_nx(vcpu);
+ context->root_level = is_la57_mode(vcpu) ?
+ PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
+ reset_rsvds_bits_mask(vcpu, context);
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ } else if (is_pae(vcpu)) {
+ context->nx = is_nx(vcpu);
+ context->root_level = PT32E_ROOT_LEVEL;
+ reset_rsvds_bits_mask(vcpu, context);
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ } else {
+ context->nx = false;
+ context->root_level = PT32_ROOT_LEVEL;
+ reset_rsvds_bits_mask(vcpu, context);
+ context->gva_to_gpa = paging32_gva_to_gpa;
+ }
+
+ update_permission_bitmask(vcpu, context, false);
+ update_pkru_bitmask(vcpu, context, false);
+ update_last_nonleaf_level(vcpu, context);
+ reset_tdp_shadow_zero_bits_mask(vcpu, context);
+}
+
+static union kvm_mmu_page_role
+kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu)
+{
+ union kvm_mmu_page_role role = {0};
+ bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
+ bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
+
+ role.nxe = is_nx(vcpu);
+ role.cr4_pae = !!is_pae(vcpu);
+ role.cr0_wp = is_write_protection(vcpu);
+ role.smep_andnot_wp = smep && !is_write_protection(vcpu);
+ role.smap_andnot_wp = smap && !is_write_protection(vcpu);
+ role.guest_mode = is_guest_mode(vcpu);
+ role.smm = is_smm(vcpu);
+ role.direct = !is_paging(vcpu);
+ role.access = ACC_ALL;
+
+ if (!is_long_mode(vcpu))
+ role.level = PT32E_ROOT_LEVEL;
+ else if (is_la57_mode(vcpu))
+ role.level = PT64_ROOT_5LEVEL;
+ else
+ role.level = PT64_ROOT_4LEVEL;
+
+ return role;
+}
+
+void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ if (!is_paging(vcpu))
+ nonpaging_init_context(vcpu, context);
+ else if (is_long_mode(vcpu))
+ paging64_init_context(vcpu, context);
+ else if (is_pae(vcpu))
+ paging32E_init_context(vcpu, context);
+ else
+ paging32_init_context(vcpu, context);
+
+ context->base_role.word = mmu_base_role_mask.word &
+ kvm_calc_shadow_mmu_root_page_role(vcpu).word;
+ reset_shadow_zero_bits_mask(vcpu, context);
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
+
+static union kvm_mmu_page_role
+kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty)
+{
+ union kvm_mmu_page_role role = vcpu->arch.mmu.base_role;
+
+ role.level = PT64_ROOT_4LEVEL;
+ role.direct = false;
+ role.ad_disabled = !accessed_dirty;
+ role.guest_mode = true;
+ role.access = ACC_ALL;
+
+ return role;
+}
+
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
+ bool accessed_dirty, gpa_t new_eptp)
+{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+ union kvm_mmu_page_role root_page_role =
+ kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty);
+
+ __kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role, false);
+ context->shadow_root_level = PT64_ROOT_4LEVEL;
+
+ context->nx = true;
+ context->ept_ad = accessed_dirty;
+ context->page_fault = ept_page_fault;
+ context->gva_to_gpa = ept_gva_to_gpa;
+ context->sync_page = ept_sync_page;
+ context->invlpg = ept_invlpg;
+ context->update_pte = ept_update_pte;
+ context->root_level = PT64_ROOT_4LEVEL;
+ context->direct_map = false;
+ context->base_role.word = root_page_role.word & mmu_base_role_mask.word;
+ update_permission_bitmask(vcpu, context, true);
+ update_pkru_bitmask(vcpu, context, true);
+ update_last_nonleaf_level(vcpu, context);
+ reset_rsvds_bits_mask_ept(vcpu, context, execonly);
+ reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
+
+static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ kvm_init_shadow_mmu(vcpu);
+ context->set_cr3 = kvm_x86_ops->set_cr3;
+ context->get_cr3 = get_cr3;
+ context->get_pdptr = kvm_pdptr_read;
+ context->inject_page_fault = kvm_inject_page_fault;
+}
+
+static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
+
+ g_context->get_cr3 = get_cr3;
+ g_context->get_pdptr = kvm_pdptr_read;
+ g_context->inject_page_fault = kvm_inject_page_fault;
+
+ /*
+ * Note that arch.mmu.gva_to_gpa translates l2_gpa to l1_gpa using
+ * L1's nested page tables (e.g. EPT12). The nested translation
+ * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
+ * L2's page tables as the first level of translation and L1's
+ * nested page tables as the second level of translation. Basically
+ * the gva_to_gpa functions between mmu and nested_mmu are swapped.
+ */
+ if (!is_paging(vcpu)) {
+ g_context->nx = false;
+ g_context->root_level = 0;
+ g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
+ } else if (is_long_mode(vcpu)) {
+ g_context->nx = is_nx(vcpu);
+ g_context->root_level = is_la57_mode(vcpu) ?
+ PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
+ reset_rsvds_bits_mask(vcpu, g_context);
+ g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
+ } else if (is_pae(vcpu)) {
+ g_context->nx = is_nx(vcpu);
+ g_context->root_level = PT32E_ROOT_LEVEL;
+ reset_rsvds_bits_mask(vcpu, g_context);
+ g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
+ } else {
+ g_context->nx = false;
+ g_context->root_level = PT32_ROOT_LEVEL;
+ reset_rsvds_bits_mask(vcpu, g_context);
+ g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
+ }
+
+ update_permission_bitmask(vcpu, g_context, false);
+ update_pkru_bitmask(vcpu, g_context, false);
+ update_last_nonleaf_level(vcpu, g_context);
+}
+
+void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
+{
+ if (reset_roots) {
+ uint i;
+
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+ }
+
+ if (mmu_is_nested(vcpu))
+ init_kvm_nested_mmu(vcpu);
+ else if (tdp_enabled)
+ init_kvm_tdp_mmu(vcpu);
+ else
+ init_kvm_softmmu(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_init_mmu);
+
+static union kvm_mmu_page_role
+kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
+{
+ if (tdp_enabled)
+ return kvm_calc_tdp_mmu_root_page_role(vcpu);
+ else
+ return kvm_calc_shadow_mmu_root_page_role(vcpu);
+}
+
+void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
+{
+ kvm_mmu_unload(vcpu);
+ kvm_init_mmu(vcpu, true);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
+
+int kvm_mmu_load(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ goto out;
+ r = mmu_alloc_roots(vcpu);
+ kvm_mmu_sync_roots(vcpu);
+ if (r)
+ goto out;
+ kvm_mmu_load_cr3(vcpu);
+ kvm_x86_ops->tlb_flush(vcpu, true);
+out:
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_load);
+
+void kvm_mmu_unload(struct kvm_vcpu *vcpu)
+{
+ kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL);
+ WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_unload);
+
+static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ const void *new)
+{
+ if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+ ++vcpu->kvm->stat.mmu_pde_zapped;
+ return;
+ }
+
+ ++vcpu->kvm->stat.mmu_pte_updated;
+ vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
+}
+
+static bool need_remote_flush(u64 old, u64 new)
+{
+ if (!is_shadow_present_pte(old))
+ return false;
+ if (!is_shadow_present_pte(new))
+ return true;
+ if ((old ^ new) & PT64_BASE_ADDR_MASK)
+ return true;
+ old ^= shadow_nx_mask;
+ new ^= shadow_nx_mask;
+ return (old & ~new & PT64_PERM_MASK) != 0;
+}
+
+static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
+ int *bytes)
+{
+ u64 gentry = 0;
+ int r;
+
+ /*
+ * Assume that the pte write on a page table of the same type
+ * as the current vcpu paging mode since we update the sptes only
+ * when they have the same mode.
+ */
+ if (is_pae(vcpu) && *bytes == 4) {
+ /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
+ *gpa &= ~(gpa_t)7;
+ *bytes = 8;
+ }
+
+ if (*bytes == 4 || *bytes == 8) {
+ r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
+ if (r)
+ gentry = 0;
+ }
+
+ return gentry;
+}
+
+/*
+ * If we're seeing too many writes to a page, it may no longer be a page table,
+ * or we may be forking, in which case it is better to unmap the page.
+ */
+static bool detect_write_flooding(struct kvm_mmu_page *sp)
+{
+ /*
+ * Skip write-flooding detected for the sp whose level is 1, because
+ * it can become unsync, then the guest page is not write-protected.
+ */
+ if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+ return false;
+
+ atomic_inc(&sp->write_flooding_count);
+ return atomic_read(&sp->write_flooding_count) >= 3;
+}
+
+/*
+ * Misaligned accesses are too much trouble to fix up; also, they usually
+ * indicate a page is not used as a page table.
+ */
+static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
+ int bytes)
+{
+ unsigned offset, pte_size, misaligned;
+
+ pgprintk("misaligned: gpa %llx bytes %d role %x\n",
+ gpa, bytes, sp->role.word);
+
+ offset = offset_in_page(gpa);
+ pte_size = sp->role.cr4_pae ? 8 : 4;
+
+ /*
+ * Sometimes, the OS only writes the last one bytes to update status
+ * bits, for example, in linux, andb instruction is used in clear_bit().
+ */
+ if (!(offset & (pte_size - 1)) && bytes == 1)
+ return false;
+
+ misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
+ misaligned |= bytes < 4;
+
+ return misaligned;
+}
+
+static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
+{
+ unsigned page_offset, quadrant;
+ u64 *spte;
+ int level;
+
+ page_offset = offset_in_page(gpa);
+ level = sp->role.level;
+ *nspte = 1;
+ if (!sp->role.cr4_pae) {
+ page_offset <<= 1; /* 32->64 */
+ /*
+ * A 32-bit pde maps 4MB while the shadow pdes map
+ * only 2MB. So we need to double the offset again
+ * and zap two pdes instead of one.
+ */
+ if (level == PT32_ROOT_LEVEL) {
+ page_offset &= ~7; /* kill rounding error */
+ page_offset <<= 1;
+ *nspte = 2;
+ }
+ quadrant = page_offset >> PAGE_SHIFT;
+ page_offset &= ~PAGE_MASK;
+ if (quadrant != sp->role.quadrant)
+ return NULL;
+ }
+
+ spte = &sp->spt[page_offset / sizeof(*spte)];
+ return spte;
+}
+
+static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const u8 *new, int bytes,
+ struct kvm_page_track_notifier_node *node)
+{
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ struct kvm_mmu_page *sp;
+ LIST_HEAD(invalid_list);
+ u64 entry, gentry, *spte;
+ int npte;
+ bool remote_flush, local_flush;
+
+ /*
+ * If we don't have indirect shadow pages, it means no page is
+ * write-protected, so we can exit simply.
+ */
+ if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
+ return;
+
+ remote_flush = local_flush = false;
+
+ pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
+
+ /*
+ * No need to care whether allocation memory is successful
+ * or not since pte prefetch is skiped if it does not have
+ * enough objects in the cache.
+ */
+ mmu_topup_memory_caches(vcpu);
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+
+ gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
+
+ ++vcpu->kvm->stat.mmu_pte_write;
+ kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
+
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
+ if (detect_write_misaligned(sp, gpa, bytes) ||
+ detect_write_flooding(sp)) {
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
+ ++vcpu->kvm->stat.mmu_flooded;
+ continue;
+ }
+
+ spte = get_written_sptes(sp, gpa, &npte);
+ if (!spte)
+ continue;
+
+ local_flush = true;
+ while (npte--) {
+ entry = *spte;
+ mmu_page_zap_pte(vcpu->kvm, sp, spte);
+ if (gentry &&
+ !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
+ & mmu_base_role_mask.word) && rmap_can_add(vcpu))
+ mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
+ if (need_remote_flush(entry, *spte))
+ remote_flush = true;
+ ++spte;
+ }
+ }
+ kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
+ kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+}
+
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ gpa_t gpa;
+ int r;
+
+ if (vcpu->arch.mmu.direct_map)
+ return 0;
+
+ gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
+
+ r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
+
+static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
+{
+ LIST_HEAD(invalid_list);
+
+ if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
+ return 0;
+
+ while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
+ if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
+ break;
+
+ ++vcpu->kvm->stat.mmu_recycled;
+ }
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+
+ if (!kvm_mmu_available_pages(vcpu->kvm))
+ return -ENOSPC;
+ return 0;
+}
+
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
+ void *insn, int insn_len)
+{
+ int r, emulation_type = 0;
+ enum emulation_result er;
+ bool direct = vcpu->arch.mmu.direct_map;
+
+ /* With shadow page tables, fault_address contains a GVA or nGPA. */
+ if (vcpu->arch.mmu.direct_map) {
+ vcpu->arch.gpa_available = true;
+ vcpu->arch.gpa_val = cr2_or_gpa;
+ }
+
+ r = RET_PF_INVALID;
+ if (unlikely(error_code & PFERR_RSVD_MASK)) {
+ r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
+ if (r == RET_PF_EMULATE)
+ goto emulate;
+ }
+
+ if (r == RET_PF_INVALID) {
+ r = vcpu->arch.mmu.page_fault(vcpu, cr2_or_gpa,
+ lower_32_bits(error_code),
+ false);
+ WARN_ON(r == RET_PF_INVALID);
+ }
+
+ if (r == RET_PF_RETRY)
+ return 1;
+ if (r < 0)
+ return r;
+
+ /*
+ * Before emulating the instruction, check if the error code
+ * was due to a RO violation while translating the guest page.
+ * This can occur when using nested virtualization with nested
+ * paging in both guests. If true, we simply unprotect the page
+ * and resume the guest.
+ */
+ if (vcpu->arch.mmu.direct_map &&
+ (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
+ return 1;
+ }
+
+ /*
+ * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
+ * optimistically try to just unprotect the page and let the processor
+ * re-execute the instruction that caused the page fault. Do not allow
+ * retrying MMIO emulation, as it's not only pointless but could also
+ * cause us to enter an infinite loop because the processor will keep
+ * faulting on the non-existent MMIO address. Retrying an instruction
+ * from a nested guest is also pointless and dangerous as we are only
+ * explicitly shadowing L1's page tables, i.e. unprotecting something
+ * for L1 isn't going to magically fix whatever issue cause L2 to fail.
+ */
+ if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
+ emulation_type = EMULTYPE_ALLOW_RETRY;
+emulate:
+ /*
+ * On AMD platforms, under certain conditions insn_len may be zero on #NPF.
+ * This can happen if a guest gets a page-fault on data access but the HW
+ * table walker is not able to read the instruction page (e.g instruction
+ * page is not present in memory). In those cases we simply restart the
+ * guest.
+ */
+ if (unlikely(insn && !insn_len))
+ return 1;
+
+ er = x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len);
+
+ switch (er) {
+ case EMULATE_DONE:
+ return 1;
+ case EMULATE_USER_EXIT:
+ ++vcpu->stat.mmio_exits;
+ /* fall through */
+ case EMULATE_FAIL:
+ return 0;
+ default:
+ BUG();
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
+
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ struct kvm_mmu *mmu = &vcpu->arch.mmu;
+ int i;
+
+ /* INVLPG on a * non-canonical address is a NOP according to the SDM. */
+ if (is_noncanonical_address(gva, vcpu))
+ return;
+
+ mmu->invlpg(vcpu, gva, mmu->root_hpa);
+
+ /*
+ * INVLPG is required to invalidate any global mappings for the VA,
+ * irrespective of PCID. Since it would take us roughly similar amount
+ * of work to determine whether any of the prev_root mappings of the VA
+ * is marked global, or to just sync it blindly, so we might as well
+ * just always sync it.
+ *
+ * Mappings not reachable via the current cr3 or the prev_roots will be
+ * synced when switching to that cr3, so nothing needs to be done here
+ * for them.
+ */
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (VALID_PAGE(mmu->prev_roots[i].hpa))
+ mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
+
+ kvm_x86_ops->tlb_flush_gva(vcpu, gva);
+ ++vcpu->stat.invlpg;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
+
+void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
+{
+ struct kvm_mmu *mmu = &vcpu->arch.mmu;
+ bool tlb_flush = false;
+ uint i;
+
+ if (pcid == kvm_get_active_pcid(vcpu)) {
+ mmu->invlpg(vcpu, gva, mmu->root_hpa);
+ tlb_flush = true;
+ }
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
+ pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) {
+ mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
+ tlb_flush = true;
+ }
+ }
+
+ if (tlb_flush)
+ kvm_x86_ops->tlb_flush_gva(vcpu, gva);
+
+ ++vcpu->stat.invlpg;
+
+ /*
+ * Mappings not reachable via the current cr3 or the prev_roots will be
+ * synced when switching to that cr3, so nothing needs to be done here
+ * for them.
+ */
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
+
+void kvm_enable_tdp(void)
+{
+ tdp_enabled = true;
+}
+EXPORT_SYMBOL_GPL(kvm_enable_tdp);
+
+void kvm_disable_tdp(void)
+{
+ tdp_enabled = false;
+}
+EXPORT_SYMBOL_GPL(kvm_disable_tdp);
+
+static void free_mmu_pages(struct kvm_vcpu *vcpu)
+{
+ free_page((unsigned long)vcpu->arch.mmu.pae_root);
+ free_page((unsigned long)vcpu->arch.mmu.lm_root);
+}
+
+static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
+{
+ struct page *page;
+ int i;
+
+ /*
+ * When using PAE paging, the four PDPTEs are treated as 'root' pages,
+ * while the PDP table is a per-vCPU construct that's allocated at MMU
+ * creation. When emulating 32-bit mode, cr3 is only 32 bits even on
+ * x86_64. Therefore we need to allocate the PDP table in the first
+ * 4GB of memory, which happens to fit the DMA32 zone. Except for
+ * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can
+ * skip allocating the PDP table.
+ */
+ if (tdp_enabled && kvm_x86_ops->get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
+ return 0;
+
+ /*
+ * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
+ * Therefore we need to allocate shadow page tables in the first
+ * 4GB of memory, which happens to fit the DMA32 zone.
+ */
+ page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+ if (!page)
+ return -ENOMEM;
+
+ vcpu->arch.mmu.pae_root = page_address(page);
+ for (i = 0; i < 4; ++i)
+ vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+
+ return 0;
+}
+
+int kvm_mmu_create(struct kvm_vcpu *vcpu)
+{
+ uint i;
+
+ vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ vcpu->arch.mmu.translate_gpa = translate_gpa;
+ vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+
+ return alloc_mmu_pages(vcpu);
+}
+
+void kvm_mmu_setup(struct kvm_vcpu *vcpu)
+{
+ MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ /*
+ * kvm_mmu_setup() is called only on vCPU initialization.
+ * Therefore, no need to reset mmu roots as they are not yet
+ * initialized.
+ */
+ kvm_init_mmu(vcpu, false);
+}
+
+static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ struct kvm_page_track_notifier_node *node)
+{
+ kvm_mmu_invalidate_zap_all_pages(kvm);
+}
+
+void kvm_mmu_init_vm(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+
+ node->track_write = kvm_mmu_pte_write;
+ node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
+ kvm_page_track_register_notifier(kvm, node);
+}
+
+void kvm_mmu_uninit_vm(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+
+ kvm_page_track_unregister_notifier(kvm, node);
+}
+
+/* The return value indicates if tlb flush on all vcpus is needed. */
+typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
+
+/* The caller should hold mmu-lock before calling this function. */
+static __always_inline bool
+slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, int start_level, int end_level,
+ gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
+{
+ struct slot_rmap_walk_iterator iterator;
+ bool flush = false;
+
+ for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
+ end_gfn, &iterator) {
+ if (iterator.rmap)
+ flush |= fn(kvm, iterator.rmap);
+
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ if (flush && lock_flush_tlb) {
+ kvm_flush_remote_tlbs(kvm);
+ flush = false;
+ }
+ cond_resched_lock(&kvm->mmu_lock);
+ }
+ }
+
+ if (flush && lock_flush_tlb) {
+ kvm_flush_remote_tlbs(kvm);
+ flush = false;
+ }
+
+ return flush;
+}
+
+static __always_inline bool
+slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, int start_level, int end_level,
+ bool lock_flush_tlb)
+{
+ return slot_handle_level_range(kvm, memslot, fn, start_level,
+ end_level, memslot->base_gfn,
+ memslot->base_gfn + memslot->npages - 1,
+ lock_flush_tlb);
+}
+
+static __always_inline bool
+slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool lock_flush_tlb)
+{
+ return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+ PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+}
+
+static __always_inline bool
+slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool lock_flush_tlb)
+{
+ return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
+ PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+}
+
+static __always_inline bool
+slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool lock_flush_tlb)
+{
+ return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+ PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
+}
+
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+ int i;
+
+ spin_lock(&kvm->mmu_lock);
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(memslot, slots) {
+ gfn_t start, end;
+
+ start = max(gfn_start, memslot->base_gfn);
+ end = min(gfn_end, memslot->base_gfn + memslot->npages);
+ if (start >= end)
+ continue;
+
+ slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+ PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
+ start, end - 1, true);
+ }
+ }
+
+ spin_unlock(&kvm->mmu_lock);
+}
+
+static bool slot_rmap_write_protect(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head)
+{
+ return __rmap_write_protect(kvm, rmap_head, false);
+}
+
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
+{
+ bool flush;
+
+ spin_lock(&kvm->mmu_lock);
+ flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
+ false);
+ spin_unlock(&kvm->mmu_lock);
+
+ /*
+ * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
+ * which do tlb flush out of mmu-lock should be serialized by
+ * kvm->slots_lock otherwise tlb flush would be missed.
+ */
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /*
+ * We can flush all the TLBs out of the mmu lock without TLB
+ * corruption since we just change the spte from writable to
+ * readonly so that we only need to care the case of changing
+ * spte from present to present (changing the spte from present
+ * to nonpresent will flush all the TLBs immediately), in other
+ * words, the only case we care is mmu_spte_update() where we
+ * haved checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
+ * instead of PT_WRITABLE_MASK, that means it does not depend
+ * on PT_WRITABLE_MASK anymore.
+ */
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+}
+
+static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ int need_tlb_flush = 0;
+ kvm_pfn_t pfn;
+ struct kvm_mmu_page *sp;
+
+restart:
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
+ sp = page_header(__pa(sptep));
+ pfn = spte_to_pfn(*sptep);
+
+ /*
+ * We cannot do huge page mapping for indirect shadow pages,
+ * which are found on the last rmap (level = 1) when not using
+ * tdp; such shadow pages are synced with the page table in
+ * the guest, and the guest page table is using 4K page size
+ * mapping if the indirect sp has level = 1.
+ */
+ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
+ !kvm_is_zone_device_pfn(pfn) &&
+ PageTransCompoundMap(pfn_to_page(pfn))) {
+ drop_spte(kvm, sptep);
+ need_tlb_flush = 1;
+ goto restart;
+ }
+ }
+
+ return need_tlb_flush;
+}
+
+void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot)
+{
+ /* FIXME: const-ify all uses of struct kvm_memory_slot. */
+ spin_lock(&kvm->mmu_lock);
+ slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
+ kvm_mmu_zap_collapsible_spte, true);
+ spin_unlock(&kvm->mmu_lock);
+}
+
+void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
+{
+ bool flush;
+
+ spin_lock(&kvm->mmu_lock);
+ flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
+ spin_unlock(&kvm->mmu_lock);
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /*
+ * It's also safe to flush TLBs out of mmu lock here as currently this
+ * function is only used for dirty logging, in which case flushing TLB
+ * out of mmu lock also guarantees no dirty pages will be lost in
+ * dirty_bitmap.
+ */
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
+
+void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
+{
+ bool flush;
+
+ spin_lock(&kvm->mmu_lock);
+ flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
+ false);
+ spin_unlock(&kvm->mmu_lock);
+
+ /* see kvm_mmu_slot_remove_write_access */
+ lockdep_assert_held(&kvm->slots_lock);
+
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
+
+void kvm_mmu_slot_set_dirty(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
+{
+ bool flush;
+
+ spin_lock(&kvm->mmu_lock);
+ flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
+ spin_unlock(&kvm->mmu_lock);
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /* see kvm_mmu_slot_leaf_clear_dirty */
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
+
+#define BATCH_ZAP_PAGES 10
+static void kvm_zap_obsolete_pages(struct kvm *kvm)
+{
+ struct kvm_mmu_page *sp, *node;
+ int batch = 0;
+
+restart:
+ list_for_each_entry_safe_reverse(sp, node,
+ &kvm->arch.active_mmu_pages, link) {
+ int ret;
+
+ /*
+ * No obsolete page exists before new created page since
+ * active_mmu_pages is the FIFO list.
+ */
+ if (!is_obsolete_sp(kvm, sp))
+ break;
+
+ /*
+ * Since we are reversely walking the list and the invalid
+ * list will be moved to the head, skip the invalid page
+ * can help us to avoid the infinity list walking.
+ */
+ if (sp->role.invalid)
+ continue;
+
+ /*
+ * Need not flush tlb since we only zap the sp with invalid
+ * generation number.
+ */
+ if (batch >= BATCH_ZAP_PAGES &&
+ cond_resched_lock(&kvm->mmu_lock)) {
+ batch = 0;
+ goto restart;
+ }
+
+ ret = kvm_mmu_prepare_zap_page(kvm, sp,
+ &kvm->arch.zapped_obsolete_pages);
+ batch += ret;
+
+ if (ret)
+ goto restart;
+ }
+
+ /*
+ * Should flush tlb before free page tables since lockless-walking
+ * may use the pages.
+ */
+ kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
+}
+
+/*
+ * Fast invalidate all shadow pages and use lock-break technique
+ * to zap obsolete pages.
+ *
+ * It's required when memslot is being deleted or VM is being
+ * destroyed, in these cases, we should ensure that KVM MMU does
+ * not use any resource of the being-deleted slot or all slots
+ * after calling the function.
+ */
+void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
+{
+ spin_lock(&kvm->mmu_lock);
+ trace_kvm_mmu_invalidate_zap_all_pages(kvm);
+ kvm->arch.mmu_valid_gen++;
+
+ /*
+ * Notify all vcpus to reload its shadow page table
+ * and flush TLB. Then all vcpus will switch to new
+ * shadow page table with the new mmu_valid_gen.
+ *
+ * Note: we should do this under the protection of
+ * mmu-lock, otherwise, vcpu would purge shadow page
+ * but miss tlb flush.
+ */
+ kvm_reload_remote_mmus(kvm);
+
+ kvm_zap_obsolete_pages(kvm);
+ spin_unlock(&kvm->mmu_lock);
+}
+
+static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
+{
+ return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
+}
+
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
+{
+ gen &= MMIO_GEN_MASK;
+
+ /*
+ * Shift to eliminate the "update in-progress" flag, which isn't
+ * included in the spte's generation number.
+ */
+ gen >>= 1;
+
+ /*
+ * Generation numbers are incremented in multiples of the number of
+ * address spaces in order to provide unique generations across all
+ * address spaces. Strip what is effectively the address space
+ * modifier prior to checking for a wrap of the MMIO generation so
+ * that a wrap in any address space is detected.
+ */
+ gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
+
+ /*
+ * The very rare case: if the MMIO generation number has wrapped,
+ * zap all shadow pages.
+ */
+ if (unlikely(gen == 0)) {
+ kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
+ kvm_mmu_invalidate_zap_all_pages(kvm);
+ }
+}
+
+static unsigned long
+mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ struct kvm *kvm;
+ int nr_to_scan = sc->nr_to_scan;
+ unsigned long freed = 0;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ int idx;
+ LIST_HEAD(invalid_list);
+
+ /*
+ * Never scan more than sc->nr_to_scan VM instances.
+ * Will not hit this condition practically since we do not try
+ * to shrink more than one VM and it is very unlikely to see
+ * !n_used_mmu_pages so many times.
+ */
+ if (!nr_to_scan--)
+ break;
+ /*
+ * n_used_mmu_pages is accessed without holding kvm->mmu_lock
+ * here. We may skip a VM instance errorneosly, but we do not
+ * want to shrink a VM that only started to populate its MMU
+ * anyway.
+ */
+ if (!kvm->arch.n_used_mmu_pages &&
+ !kvm_has_zapped_obsolete_pages(kvm))
+ continue;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ spin_lock(&kvm->mmu_lock);
+
+ if (kvm_has_zapped_obsolete_pages(kvm)) {
+ kvm_mmu_commit_zap_page(kvm,
+ &kvm->arch.zapped_obsolete_pages);
+ goto unlock;
+ }
+
+ if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
+ freed++;
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
+unlock:
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ /*
+ * unfair on small ones
+ * per-vm shrinkers cry out
+ * sadness comes quickly
+ */
+ list_move_tail(&kvm->vm_list, &vm_list);
+ break;
+ }
+
+ mutex_unlock(&kvm_lock);
+ return freed;
+}
+
+static unsigned long
+mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
+}
+
+static struct shrinker mmu_shrinker = {
+ .count_objects = mmu_shrink_count,
+ .scan_objects = mmu_shrink_scan,
+ .seeks = DEFAULT_SEEKS * 10,
+};
+
+static void mmu_destroy_caches(void)
+{
+ kmem_cache_destroy(pte_list_desc_cache);
+ kmem_cache_destroy(mmu_page_header_cache);
+}
+
+static bool get_nx_auto_mode(void)
+{
+ /* Return true when CPU has the bug, and mitigations are ON */
+ return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
+}
+
+static void __set_nx_huge_pages(bool val)
+{
+ nx_huge_pages = itlb_multihit_kvm_mitigation = val;
+}
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
+{
+ bool old_val = nx_huge_pages;
+ bool new_val;
+
+ /* In "auto" mode deploy workaround only if CPU has the bug. */
+ if (sysfs_streq(val, "off"))
+ new_val = 0;
+ else if (sysfs_streq(val, "force"))
+ new_val = 1;
+ else if (sysfs_streq(val, "auto"))
+ new_val = get_nx_auto_mode();
+ else if (strtobool(val, &new_val) < 0)
+ return -EINVAL;
+
+ __set_nx_huge_pages(new_val);
+
+ if (new_val != old_val) {
+ struct kvm *kvm;
+ int idx;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ idx = srcu_read_lock(&kvm->srcu);
+ kvm_mmu_invalidate_zap_all_pages(kvm);
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+ }
+ mutex_unlock(&kvm_lock);
+ }
+
+ return 0;
+}
+
+static void kvm_set_mmio_spte_mask(void)
+{
+ u64 mask;
+
+ /*
+ * Set a reserved PA bit in MMIO SPTEs to generate page faults with
+ * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT
+ * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
+ * 52-bit physical addresses then there are no reserved PA bits in the
+ * PTEs and so the reserved PA approach must be disabled.
+ */
+ if (shadow_phys_bits < 52)
+ mask = BIT_ULL(51) | PT_PRESENT_MASK;
+ else
+ mask = 0;
+
+ kvm_mmu_set_mmio_spte_mask(mask, mask);
+}
+
+int kvm_mmu_module_init(void)
+{
+ int ret = -ENOMEM;
+
+ if (nx_huge_pages == -1)
+ __set_nx_huge_pages(get_nx_auto_mode());
+
+ kvm_mmu_reset_all_pte_masks();
+
+ kvm_set_mmio_spte_mask();
+
+ pte_list_desc_cache = kmem_cache_create("pte_list_desc",
+ sizeof(struct pte_list_desc),
+ 0, SLAB_ACCOUNT, NULL);
+ if (!pte_list_desc_cache)
+ goto out;
+
+ mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
+ sizeof(struct kvm_mmu_page),
+ 0, SLAB_ACCOUNT, NULL);
+ if (!mmu_page_header_cache)
+ goto out;
+
+ if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
+ goto out;
+
+ ret = register_shrinker(&mmu_shrinker);
+ if (ret)
+ goto out;
+
+ return 0;
+
+out:
+ mmu_destroy_caches();
+ return ret;
+}
+
+/*
+ * Caculate mmu pages needed for kvm.
+ */
+unsigned long kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
+{
+ unsigned long nr_mmu_pages;
+ unsigned long nr_pages = 0;
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+ int i;
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+
+ kvm_for_each_memslot(memslot, slots)
+ nr_pages += memslot->npages;
+ }
+
+ nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
+ nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
+
+ return nr_mmu_pages;
+}
+
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+ kvm_mmu_unload(vcpu);
+ free_mmu_pages(vcpu);
+ mmu_free_memory_caches(vcpu);
+}
+
+void kvm_mmu_module_exit(void)
+{
+ mmu_destroy_caches();
+ percpu_counter_destroy(&kvm_total_used_mmu_pages);
+ unregister_shrinker(&mmu_shrinker);
+ mmu_audit_disable();
+}
+
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
+{
+ unsigned int old_val;
+ int err;
+
+ old_val = nx_huge_pages_recovery_ratio;
+ err = param_set_uint(val, kp);
+ if (err)
+ return err;
+
+ if (READ_ONCE(nx_huge_pages) &&
+ !old_val && nx_huge_pages_recovery_ratio) {
+ struct kvm *kvm;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+
+ mutex_unlock(&kvm_lock);
+ }
+
+ return err;
+}
+
+static void kvm_recover_nx_lpages(struct kvm *kvm)
+{
+ int rcu_idx;
+ struct kvm_mmu_page *sp;
+ unsigned int ratio;
+ LIST_HEAD(invalid_list);
+ ulong to_zap;
+
+ rcu_idx = srcu_read_lock(&kvm->srcu);
+ spin_lock(&kvm->mmu_lock);
+
+ ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+ to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
+ while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
+ /*
+ * We use a separate list instead of just using active_mmu_pages
+ * because the number of lpage_disallowed pages is expected to
+ * be relatively small compared to the total.
+ */
+ sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
+ struct kvm_mmu_page,
+ lpage_disallowed_link);
+ WARN_ON_ONCE(!sp->lpage_disallowed);
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ WARN_ON_ONCE(sp->lpage_disallowed);
+
+ if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ if (to_zap)
+ cond_resched_lock(&kvm->mmu_lock);
+ }
+ }
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, rcu_idx);
+}
+
+static long get_nx_lpage_recovery_timeout(u64 start_time)
+{
+ return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
+ ? start_time + 60 * HZ - get_jiffies_64()
+ : MAX_SCHEDULE_TIMEOUT;
+}
+
+static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
+{
+ u64 start_time;
+ long remaining_time;
+
+ while (true) {
+ start_time = get_jiffies_64();
+ remaining_time = get_nx_lpage_recovery_timeout(start_time);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!kthread_should_stop() && remaining_time > 0) {
+ schedule_timeout(remaining_time);
+ remaining_time = get_nx_lpage_recovery_timeout(start_time);
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+
+ set_current_state(TASK_RUNNING);
+
+ if (kthread_should_stop())
+ return 0;
+
+ kvm_recover_nx_lpages(kvm);
+ }
+}
+
+int kvm_mmu_post_init_vm(struct kvm *kvm)
+{
+ int err;
+
+ err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
+ "kvm-nx-lpage-recovery",
+ &kvm->arch.nx_lpage_recovery_thread);
+ if (!err)
+ kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
+
+ return err;
+}
+
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
+{
+ if (kvm->arch.nx_lpage_recovery_thread)
+ kthread_stop(kvm->arch.nx_lpage_recovery_thread);
+}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
new file mode 100644
index 000000000..05a02b8ac
--- /dev/null
+++ b/arch/x86/kvm/mmu.h
@@ -0,0 +1,223 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_MMU_H
+#define __KVM_X86_MMU_H
+
+#include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
+
+#define PT64_PT_BITS 9
+#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
+#define PT32_PT_BITS 10
+#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
+
+#define PT_WRITABLE_SHIFT 1
+#define PT_USER_SHIFT 2
+
+#define PT_PRESENT_MASK (1ULL << 0)
+#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
+#define PT_USER_MASK (1ULL << PT_USER_SHIFT)
+#define PT_PWT_MASK (1ULL << 3)
+#define PT_PCD_MASK (1ULL << 4)
+#define PT_ACCESSED_SHIFT 5
+#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
+#define PT_DIRTY_SHIFT 6
+#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT)
+#define PT_PAGE_SIZE_SHIFT 7
+#define PT_PAGE_SIZE_MASK (1ULL << PT_PAGE_SIZE_SHIFT)
+#define PT_PAT_MASK (1ULL << 7)
+#define PT_GLOBAL_MASK (1ULL << 8)
+#define PT64_NX_SHIFT 63
+#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
+
+#define PT_PAT_SHIFT 7
+#define PT_DIR_PAT_SHIFT 12
+#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
+
+#define PT32_DIR_PSE36_SIZE 4
+#define PT32_DIR_PSE36_SHIFT 13
+#define PT32_DIR_PSE36_MASK \
+ (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
+
+#define PT64_ROOT_5LEVEL 5
+#define PT64_ROOT_4LEVEL 4
+#define PT32_ROOT_LEVEL 2
+#define PT32E_ROOT_LEVEL 3
+
+#define PT_PDPE_LEVEL 3
+#define PT_DIRECTORY_LEVEL 2
+#define PT_PAGE_TABLE_LEVEL 1
+#define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1)
+
+static inline u64 rsvd_bits(int s, int e)
+{
+ if (e < s)
+ return 0;
+
+ return ((2ULL << (e - s)) - 1) << s;
+}
+
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value);
+
+void
+reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+
+void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots);
+void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
+ bool accessed_dirty, gpa_t new_eptp);
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
+int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
+ u64 fault_address, char *insn, int insn_len);
+
+static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
+{
+ if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
+ return kvm->arch.n_max_mmu_pages -
+ kvm->arch.n_used_mmu_pages;
+
+ return 0;
+}
+
+static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
+{
+ if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
+ return 0;
+
+ return kvm_mmu_load(vcpu);
+}
+
+static inline unsigned long kvm_get_pcid(struct kvm_vcpu *vcpu, gpa_t cr3)
+{
+ BUILD_BUG_ON((X86_CR3_PCID_MASK & PAGE_MASK) != 0);
+
+ return kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)
+ ? cr3 & X86_CR3_PCID_MASK
+ : 0;
+}
+
+static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu)
+{
+ return kvm_get_pcid(vcpu, kvm_read_cr3(vcpu));
+}
+
+static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu)
+{
+ if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa |
+ kvm_get_active_pcid(vcpu));
+}
+
+/*
+ * Currently, we have two sorts of write-protection, a) the first one
+ * write-protects guest page to sync the guest modification, b) another one is
+ * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences
+ * between these two sorts are:
+ * 1) the first case clears SPTE_MMU_WRITEABLE bit.
+ * 2) the first case requires flushing tlb immediately avoiding corrupting
+ * shadow page table between all vcpus so it should be in the protection of
+ * mmu-lock. And the another case does not need to flush tlb until returning
+ * the dirty bitmap to userspace since it only write-protects the page
+ * logged in the bitmap, that means the page in the dirty bitmap is not
+ * missed, so it can flush tlb out of mmu-lock.
+ *
+ * So, there is the problem: the first case can meet the corrupted tlb caused
+ * by another case which write-protects pages but without flush tlb
+ * immediately. In order to making the first case be aware this problem we let
+ * it flush tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE bit
+ * is set, it works since another case never touches SPTE_MMU_WRITEABLE bit.
+ *
+ * Anyway, whenever a spte is updated (only permission and status bits are
+ * changed) we need to check whether the spte with SPTE_MMU_WRITEABLE becomes
+ * readonly, if that happens, we need to flush tlb. Fortunately,
+ * mmu_spte_update() has already handled it perfectly.
+ *
+ * The rules to use SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK:
+ * - if we want to see if it has writable tlb entry or if the spte can be
+ * writable on the mmu mapping, check SPTE_MMU_WRITEABLE, this is the most
+ * case, otherwise
+ * - if we fix page fault on the spte or do write-protection by dirty logging,
+ * check PT_WRITABLE_MASK.
+ *
+ * TODO: introduce APIs to split these two cases.
+ */
+static inline int is_writable_pte(unsigned long pte)
+{
+ return pte & PT_WRITABLE_MASK;
+}
+
+static inline bool is_write_protection(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
+}
+
+/*
+ * Check if a given access (described through the I/D, W/R and U/S bits of a
+ * page fault error code pfec) causes a permission fault with the given PTE
+ * access rights (in ACC_* format).
+ *
+ * Return zero if the access does not fault; return the page fault error code
+ * if the access faults.
+ */
+static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ unsigned pte_access, unsigned pte_pkey,
+ unsigned pfec)
+{
+ int cpl = kvm_x86_ops->get_cpl(vcpu);
+ unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
+
+ /*
+ * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1.
+ *
+ * If CPL = 3, SMAP applies to all supervisor-mode data accesses
+ * (these are implicit supervisor accesses) regardless of the value
+ * of EFLAGS.AC.
+ *
+ * This computes (cpl < 3) && (rflags & X86_EFLAGS_AC), leaving
+ * the result in X86_EFLAGS_AC. We then insert it in place of
+ * the PFERR_RSVD_MASK bit; this bit will always be zero in pfec,
+ * but it will be one in index if SMAP checks are being overridden.
+ * It is important to keep this branchless.
+ */
+ unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC);
+ int index = (pfec >> 1) +
+ (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1));
+ bool fault = (mmu->permissions[index] >> pte_access) & 1;
+ u32 errcode = PFERR_PRESENT_MASK;
+
+ WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK));
+ if (unlikely(mmu->pkru_mask)) {
+ u32 pkru_bits, offset;
+
+ /*
+ * PKRU defines 32 bits, there are 16 domains and 2
+ * attribute bits per domain in pkru. pte_pkey is the
+ * index of the protection domain, so pte_pkey * 2 is
+ * is the index of the first bit for the domain.
+ */
+ pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3;
+
+ /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
+ offset = (pfec & ~1) +
+ ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));
+
+ pkru_bits &= mmu->pkru_mask >> offset;
+ errcode |= -pkru_bits & PFERR_PK_MASK;
+ fault |= (pkru_bits != 0);
+ }
+
+ return -(u32)fault & errcode;
+}
+
+void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
+
+void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
+void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
+bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+ struct kvm_memory_slot *slot, u64 gfn);
+int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
+
+int kvm_mmu_post_init_vm(struct kvm *kvm);
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
+
+#endif
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
new file mode 100644
index 000000000..1272861e7
--- /dev/null
+++ b/arch/x86/kvm/mmu_audit.c
@@ -0,0 +1,306 @@
+/*
+ * mmu_audit.c:
+ *
+ * Audit code for KVM MMU
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ * Marcelo Tosatti <mtosatti@redhat.com>
+ * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/ratelimit.h>
+
+static char const *audit_point_name[] = {
+ "pre page fault",
+ "post page fault",
+ "pre pte write",
+ "post pte write",
+ "pre sync",
+ "post sync"
+};
+
+#define audit_printk(kvm, fmt, args...) \
+ printk(KERN_ERR "audit: (%s) error: " \
+ fmt, audit_point_name[kvm->arch.audit_point], ##args)
+
+typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
+
+static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ inspect_spte_fn fn, int level)
+{
+ int i;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ u64 *ent = sp->spt;
+
+ fn(vcpu, ent + i, level);
+
+ if (is_shadow_present_pte(ent[i]) &&
+ !is_last_spte(ent[i], level)) {
+ struct kvm_mmu_page *child;
+
+ child = page_header(ent[i] & PT64_BASE_ADDR_MASK);
+ __mmu_spte_walk(vcpu, child, fn, level - 1);
+ }
+ }
+}
+
+static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+
+ if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+
+ sp = page_header(root);
+ __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu.root_level);
+ return;
+ }
+
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ if (root && VALID_PAGE(root)) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ __mmu_spte_walk(vcpu, sp, fn, 2);
+ }
+ }
+
+ return;
+}
+
+typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
+{
+ struct kvm_mmu_page *sp;
+
+ list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
+ fn(kvm, sp);
+}
+
+static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+ struct kvm_mmu_page *sp;
+ gfn_t gfn;
+ kvm_pfn_t pfn;
+ hpa_t hpa;
+
+ sp = page_header(__pa(sptep));
+
+ if (sp->unsync) {
+ if (level != PT_PAGE_TABLE_LEVEL) {
+ audit_printk(vcpu->kvm, "unsync sp: %p "
+ "level = %d\n", sp, level);
+ return;
+ }
+ }
+
+ if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
+ return;
+
+ gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+ pfn = kvm_vcpu_gfn_to_pfn_atomic(vcpu, gfn);
+
+ if (is_error_pfn(pfn))
+ return;
+
+ hpa = pfn << PAGE_SHIFT;
+ if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
+ audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
+ "ent %llxn", vcpu->arch.mmu.root_level, pfn,
+ hpa, *sptep);
+}
+
+static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
+{
+ static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
+ struct kvm_rmap_head *rmap_head;
+ struct kvm_mmu_page *rev_sp;
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ gfn_t gfn;
+
+ rev_sp = page_header(__pa(sptep));
+ gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
+
+ slots = kvm_memslots_for_spte_role(kvm, rev_sp->role);
+ slot = __gfn_to_memslot(slots, gfn);
+ if (!slot) {
+ if (!__ratelimit(&ratelimit_state))
+ return;
+ audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
+ audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
+ (long int)(sptep - rev_sp->spt), rev_sp->gfn);
+ dump_stack();
+ return;
+ }
+
+ rmap_head = __gfn_to_rmap(gfn, rev_sp->role.level, slot);
+ if (!rmap_head->val) {
+ if (!__ratelimit(&ratelimit_state))
+ return;
+ audit_printk(kvm, "no rmap for writable spte %llx\n",
+ *sptep);
+ dump_stack();
+ }
+}
+
+static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+ if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level))
+ inspect_spte_has_rmap(vcpu->kvm, sptep);
+}
+
+static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+ if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
+ audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
+ "root.\n", sp);
+}
+
+static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ int i;
+
+ if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+ return;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ if (!is_shadow_present_pte(sp->spt[i]))
+ continue;
+
+ inspect_spte_has_rmap(kvm, sp->spt + i);
+ }
+}
+
+static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ struct kvm_rmap_head *rmap_head;
+ u64 *sptep;
+ struct rmap_iterator iter;
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+
+ if (sp->role.direct || sp->unsync || sp->role.invalid)
+ return;
+
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, sp->gfn);
+ rmap_head = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot);
+
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
+ if (is_writable_pte(*sptep))
+ audit_printk(kvm, "shadow page has writable "
+ "mappings: gfn %llx role %x\n",
+ sp->gfn, sp->role.word);
+ }
+}
+
+static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ check_mappings_rmap(kvm, sp);
+ audit_write_protection(kvm, sp);
+}
+
+static void audit_all_active_sps(struct kvm *kvm)
+{
+ walk_all_active_sps(kvm, audit_sp);
+}
+
+static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+ audit_sptes_have_rmaps(vcpu, sptep, level);
+ audit_mappings(vcpu, sptep, level);
+ audit_spte_after_sync(vcpu, sptep, level);
+}
+
+static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
+{
+ mmu_spte_walk(vcpu, audit_spte);
+}
+
+static bool mmu_audit;
+static struct static_key mmu_audit_key;
+
+static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
+{
+ static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
+
+ if (!__ratelimit(&ratelimit_state))
+ return;
+
+ vcpu->kvm->arch.audit_point = point;
+ audit_all_active_sps(vcpu->kvm);
+ audit_vcpu_spte(vcpu);
+}
+
+static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
+{
+ if (static_key_false((&mmu_audit_key)))
+ __kvm_mmu_audit(vcpu, point);
+}
+
+static void mmu_audit_enable(void)
+{
+ if (mmu_audit)
+ return;
+
+ static_key_slow_inc(&mmu_audit_key);
+ mmu_audit = true;
+}
+
+static void mmu_audit_disable(void)
+{
+ if (!mmu_audit)
+ return;
+
+ static_key_slow_dec(&mmu_audit_key);
+ mmu_audit = false;
+}
+
+static int mmu_audit_set(const char *val, const struct kernel_param *kp)
+{
+ int ret;
+ unsigned long enable;
+
+ ret = kstrtoul(val, 10, &enable);
+ if (ret < 0)
+ return -EINVAL;
+
+ switch (enable) {
+ case 0:
+ mmu_audit_disable();
+ break;
+ case 1:
+ mmu_audit_enable();
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct kernel_param_ops audit_param_ops = {
+ .set = mmu_audit_set,
+ .get = param_get_bool,
+};
+
+arch_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
new file mode 100644
index 000000000..7e0dc8c7d
--- /dev/null
+++ b/arch/x86/kvm/mmutrace.h
@@ -0,0 +1,395 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVMMMU_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_events.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvmmmu
+
+#define KVM_MMU_PAGE_FIELDS \
+ __field(unsigned long, mmu_valid_gen) \
+ __field(__u64, gfn) \
+ __field(__u32, role) \
+ __field(__u32, root_count) \
+ __field(bool, unsync)
+
+#define KVM_MMU_PAGE_ASSIGN(sp) \
+ __entry->mmu_valid_gen = sp->mmu_valid_gen; \
+ __entry->gfn = sp->gfn; \
+ __entry->role = sp->role.word; \
+ __entry->root_count = sp->root_count; \
+ __entry->unsync = sp->unsync;
+
+#define KVM_MMU_PAGE_PRINTK() ({ \
+ const char *saved_ptr = trace_seq_buffer_ptr(p); \
+ static const char *access_str[] = { \
+ "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \
+ }; \
+ union kvm_mmu_page_role role; \
+ \
+ role.word = __entry->role; \
+ \
+ trace_seq_printf(p, "sp gen %lx gfn %llx l%u%s q%u%s %s%s" \
+ " %snxe %sad root %u %s%c", \
+ __entry->mmu_valid_gen, \
+ __entry->gfn, role.level, \
+ role.cr4_pae ? " pae" : "", \
+ role.quadrant, \
+ role.direct ? " direct" : "", \
+ access_str[role.access], \
+ role.invalid ? " invalid" : "", \
+ role.nxe ? "" : "!", \
+ role.ad_disabled ? "!" : "", \
+ __entry->root_count, \
+ __entry->unsync ? "unsync" : "sync", 0); \
+ saved_ptr; \
+ })
+
+#define kvm_mmu_trace_pferr_flags \
+ { PFERR_PRESENT_MASK, "P" }, \
+ { PFERR_WRITE_MASK, "W" }, \
+ { PFERR_USER_MASK, "U" }, \
+ { PFERR_RSVD_MASK, "RSVD" }, \
+ { PFERR_FETCH_MASK, "F" }
+
+/*
+ * A pagetable walk has started
+ */
+TRACE_EVENT(
+ kvm_mmu_pagetable_walk,
+ TP_PROTO(u64 addr, u32 pferr),
+ TP_ARGS(addr, pferr),
+
+ TP_STRUCT__entry(
+ __field(__u64, addr)
+ __field(__u32, pferr)
+ ),
+
+ TP_fast_assign(
+ __entry->addr = addr;
+ __entry->pferr = pferr;
+ ),
+
+ TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
+ __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
+);
+
+
+/* We just walked a paging element */
+TRACE_EVENT(
+ kvm_mmu_paging_element,
+ TP_PROTO(u64 pte, int level),
+ TP_ARGS(pte, level),
+
+ TP_STRUCT__entry(
+ __field(__u64, pte)
+ __field(__u32, level)
+ ),
+
+ TP_fast_assign(
+ __entry->pte = pte;
+ __entry->level = level;
+ ),
+
+ TP_printk("pte %llx level %u", __entry->pte, __entry->level)
+);
+
+DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class,
+
+ TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
+ TP_ARGS(table_gfn, index, size),
+
+ TP_STRUCT__entry(
+ __field(__u64, gpa)
+ ),
+
+ TP_fast_assign(
+ __entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
+ + index * size;
+ ),
+
+ TP_printk("gpa %llx", __entry->gpa)
+);
+
+/* We set a pte accessed bit */
+DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit,
+
+ TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
+ TP_ARGS(table_gfn, index, size)
+);
+
+/* We set a pte dirty bit */
+DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit,
+
+ TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
+ TP_ARGS(table_gfn, index, size)
+);
+
+TRACE_EVENT(
+ kvm_mmu_walker_error,
+ TP_PROTO(u32 pferr),
+ TP_ARGS(pferr),
+
+ TP_STRUCT__entry(
+ __field(__u32, pferr)
+ ),
+
+ TP_fast_assign(
+ __entry->pferr = pferr;
+ ),
+
+ TP_printk("pferr %x %s", __entry->pferr,
+ __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
+);
+
+TRACE_EVENT(
+ kvm_mmu_get_page,
+ TP_PROTO(struct kvm_mmu_page *sp, bool created),
+ TP_ARGS(sp, created),
+
+ TP_STRUCT__entry(
+ KVM_MMU_PAGE_FIELDS
+ __field(bool, created)
+ ),
+
+ TP_fast_assign(
+ KVM_MMU_PAGE_ASSIGN(sp)
+ __entry->created = created;
+ ),
+
+ TP_printk("%s %s", KVM_MMU_PAGE_PRINTK(),
+ __entry->created ? "new" : "existing")
+);
+
+DECLARE_EVENT_CLASS(kvm_mmu_page_class,
+
+ TP_PROTO(struct kvm_mmu_page *sp),
+ TP_ARGS(sp),
+
+ TP_STRUCT__entry(
+ KVM_MMU_PAGE_FIELDS
+ ),
+
+ TP_fast_assign(
+ KVM_MMU_PAGE_ASSIGN(sp)
+ ),
+
+ TP_printk("%s", KVM_MMU_PAGE_PRINTK())
+);
+
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+
+ TP_ARGS(sp)
+);
+
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+
+ TP_ARGS(sp)
+);
+
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+
+ TP_ARGS(sp)
+);
+
+TRACE_EVENT(
+ mark_mmio_spte,
+ TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access, unsigned int gen),
+ TP_ARGS(sptep, gfn, access, gen),
+
+ TP_STRUCT__entry(
+ __field(void *, sptep)
+ __field(gfn_t, gfn)
+ __field(unsigned, access)
+ __field(unsigned int, gen)
+ ),
+
+ TP_fast_assign(
+ __entry->sptep = sptep;
+ __entry->gfn = gfn;
+ __entry->access = access;
+ __entry->gen = gen;
+ ),
+
+ TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep,
+ __entry->gfn, __entry->access, __entry->gen)
+);
+
+TRACE_EVENT(
+ handle_mmio_page_fault,
+ TP_PROTO(u64 addr, gfn_t gfn, unsigned access),
+ TP_ARGS(addr, gfn, access),
+
+ TP_STRUCT__entry(
+ __field(u64, addr)
+ __field(gfn_t, gfn)
+ __field(unsigned, access)
+ ),
+
+ TP_fast_assign(
+ __entry->addr = addr;
+ __entry->gfn = gfn;
+ __entry->access = access;
+ ),
+
+ TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
+ __entry->access)
+);
+
+#define __spte_satisfied(__spte) \
+ (__entry->retry && is_writable_pte(__entry->__spte))
+
+TRACE_EVENT(
+ fast_page_fault,
+ TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code,
+ u64 *sptep, u64 old_spte, bool retry),
+ TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, retry),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(gpa_t, cr2_or_gpa)
+ __field(u32, error_code)
+ __field(u64 *, sptep)
+ __field(u64, old_spte)
+ __field(u64, new_spte)
+ __field(bool, retry)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu->vcpu_id;
+ __entry->cr2_or_gpa = cr2_or_gpa;
+ __entry->error_code = error_code;
+ __entry->sptep = sptep;
+ __entry->old_spte = old_spte;
+ __entry->new_spte = *sptep;
+ __entry->retry = retry;
+ ),
+
+ TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx"
+ " new %llx spurious %d fixed %d", __entry->vcpu_id,
+ __entry->cr2_or_gpa, __print_flags(__entry->error_code, "|",
+ kvm_mmu_trace_pferr_flags), __entry->sptep,
+ __entry->old_spte, __entry->new_spte,
+ __spte_satisfied(old_spte), __spte_satisfied(new_spte)
+ )
+);
+
+TRACE_EVENT(
+ kvm_mmu_invalidate_zap_all_pages,
+ TP_PROTO(struct kvm *kvm),
+ TP_ARGS(kvm),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, mmu_valid_gen)
+ __field(unsigned int, mmu_used_pages)
+ ),
+
+ TP_fast_assign(
+ __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen;
+ __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages;
+ ),
+
+ TP_printk("kvm-mmu-valid-gen %lx used_pages %x",
+ __entry->mmu_valid_gen, __entry->mmu_used_pages
+ )
+);
+
+
+TRACE_EVENT(
+ check_mmio_spte,
+ TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen),
+ TP_ARGS(spte, kvm_gen, spte_gen),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, kvm_gen)
+ __field(unsigned int, spte_gen)
+ __field(u64, spte)
+ ),
+
+ TP_fast_assign(
+ __entry->kvm_gen = kvm_gen;
+ __entry->spte_gen = spte_gen;
+ __entry->spte = spte;
+ ),
+
+ TP_printk("spte %llx kvm_gen %x spte-gen %x valid %d", __entry->spte,
+ __entry->kvm_gen, __entry->spte_gen,
+ __entry->kvm_gen == __entry->spte_gen
+ )
+);
+
+TRACE_EVENT(
+ kvm_mmu_set_spte,
+ TP_PROTO(int level, gfn_t gfn, u64 *sptep),
+ TP_ARGS(level, gfn, sptep),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, spte)
+ __field(u64, sptep)
+ __field(u8, level)
+ /* These depend on page entry type, so compute them now. */
+ __field(bool, r)
+ __field(bool, x)
+ __field(signed char, u)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = gfn;
+ __entry->spte = *sptep;
+ __entry->sptep = virt_to_phys(sptep);
+ __entry->level = level;
+ __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
+ __entry->x = is_executable_pte(__entry->spte);
+ __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
+ ),
+
+ TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
+ __entry->gfn, __entry->spte,
+ __entry->r ? "r" : "-",
+ __entry->spte & PT_WRITABLE_MASK ? "w" : "-",
+ __entry->x ? "x" : "-",
+ __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
+ __entry->level, __entry->sptep
+ )
+);
+
+TRACE_EVENT(
+ kvm_mmu_spte_requested,
+ TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn),
+ TP_ARGS(addr, level, pfn),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, pfn)
+ __field(u8, level)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = addr >> PAGE_SHIFT;
+ __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
+ __entry->level = level;
+ ),
+
+ TP_printk("gfn %llx pfn %llx level %d",
+ __entry->gfn, __entry->pfn, __entry->level
+ )
+);
+
+#endif /* _TRACE_KVMMMU_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE mmutrace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
new file mode 100644
index 000000000..fabce8769
--- /dev/null
+++ b/arch/x86/kvm/mtrr.c
@@ -0,0 +1,727 @@
+/*
+ * vMTRR implementation
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ * Marcelo Tosatti <mtosatti@redhat.com>
+ * Paolo Bonzini <pbonzini@redhat.com>
+ * Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/mtrr.h>
+
+#include "cpuid.h"
+#include "mmu.h"
+
+#define IA32_MTRR_DEF_TYPE_E (1ULL << 11)
+#define IA32_MTRR_DEF_TYPE_FE (1ULL << 10)
+#define IA32_MTRR_DEF_TYPE_TYPE_MASK (0xff)
+
+static bool msr_mtrr_valid(unsigned msr)
+{
+ switch (msr) {
+ case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
+ case MSR_MTRRfix64K_00000:
+ case MSR_MTRRfix16K_80000:
+ case MSR_MTRRfix16K_A0000:
+ case MSR_MTRRfix4K_C0000:
+ case MSR_MTRRfix4K_C8000:
+ case MSR_MTRRfix4K_D0000:
+ case MSR_MTRRfix4K_D8000:
+ case MSR_MTRRfix4K_E0000:
+ case MSR_MTRRfix4K_E8000:
+ case MSR_MTRRfix4K_F0000:
+ case MSR_MTRRfix4K_F8000:
+ case MSR_MTRRdefType:
+ case MSR_IA32_CR_PAT:
+ return true;
+ }
+ return false;
+}
+
+static bool valid_mtrr_type(unsigned t)
+{
+ return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
+}
+
+bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ int i;
+ u64 mask;
+
+ if (!msr_mtrr_valid(msr))
+ return false;
+
+ if (msr == MSR_IA32_CR_PAT) {
+ return kvm_pat_valid(data);
+ } else if (msr == MSR_MTRRdefType) {
+ if (data & ~0xcff)
+ return false;
+ return valid_mtrr_type(data & 0xff);
+ } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
+ for (i = 0; i < 8 ; i++)
+ if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
+ return false;
+ return true;
+ }
+
+ /* variable MTRRs */
+ WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR));
+
+ mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
+ if ((msr & 1) == 0) {
+ /* MTRR base */
+ if (!valid_mtrr_type(data & 0xff))
+ return false;
+ mask |= 0xf00;
+ } else
+ /* MTRR mask */
+ mask |= 0x7ff;
+ if (data & mask) {
+ kvm_inject_gp(vcpu, 0);
+ return false;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(kvm_mtrr_valid);
+
+static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
+{
+ return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_E);
+}
+
+static bool fixed_mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
+{
+ return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_FE);
+}
+
+static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state)
+{
+ return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK;
+}
+
+static u8 mtrr_disabled_type(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Intel SDM 11.11.2.2: all MTRRs are disabled when
+ * IA32_MTRR_DEF_TYPE.E bit is cleared, and the UC
+ * memory type is applied to all of physical memory.
+ *
+ * However, virtual machines can be run with CPUID such that
+ * there are no MTRRs. In that case, the firmware will never
+ * enable MTRRs and it is obviously undesirable to run the
+ * guest entirely with UC memory and we use WB.
+ */
+ if (guest_cpuid_has(vcpu, X86_FEATURE_MTRR))
+ return MTRR_TYPE_UNCACHABLE;
+ else
+ return MTRR_TYPE_WRBACK;
+}
+
+/*
+* Three terms are used in the following code:
+* - segment, it indicates the address segments covered by fixed MTRRs.
+* - unit, it corresponds to the MSR entry in the segment.
+* - range, a range is covered in one memory cache type.
+*/
+struct fixed_mtrr_segment {
+ u64 start;
+ u64 end;
+
+ int range_shift;
+
+ /* the start position in kvm_mtrr.fixed_ranges[]. */
+ int range_start;
+};
+
+static struct fixed_mtrr_segment fixed_seg_table[] = {
+ /* MSR_MTRRfix64K_00000, 1 unit. 64K fixed mtrr. */
+ {
+ .start = 0x0,
+ .end = 0x80000,
+ .range_shift = 16, /* 64K */
+ .range_start = 0,
+ },
+
+ /*
+ * MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000, 2 units,
+ * 16K fixed mtrr.
+ */
+ {
+ .start = 0x80000,
+ .end = 0xc0000,
+ .range_shift = 14, /* 16K */
+ .range_start = 8,
+ },
+
+ /*
+ * MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000, 8 units,
+ * 4K fixed mtrr.
+ */
+ {
+ .start = 0xc0000,
+ .end = 0x100000,
+ .range_shift = 12, /* 12K */
+ .range_start = 24,
+ }
+};
+
+/*
+ * The size of unit is covered in one MSR, one MSR entry contains
+ * 8 ranges so that unit size is always 8 * 2^range_shift.
+ */
+static u64 fixed_mtrr_seg_unit_size(int seg)
+{
+ return 8 << fixed_seg_table[seg].range_shift;
+}
+
+static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit)
+{
+ switch (msr) {
+ case MSR_MTRRfix64K_00000:
+ *seg = 0;
+ *unit = 0;
+ break;
+ case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000:
+ *seg = 1;
+ *unit = array_index_nospec(
+ msr - MSR_MTRRfix16K_80000,
+ MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1);
+ break;
+ case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
+ *seg = 2;
+ *unit = array_index_nospec(
+ msr - MSR_MTRRfix4K_C0000,
+ MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1);
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+static void fixed_mtrr_seg_unit_range(int seg, int unit, u64 *start, u64 *end)
+{
+ struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+ u64 unit_size = fixed_mtrr_seg_unit_size(seg);
+
+ *start = mtrr_seg->start + unit * unit_size;
+ *end = *start + unit_size;
+ WARN_ON(*end > mtrr_seg->end);
+}
+
+static int fixed_mtrr_seg_unit_range_index(int seg, int unit)
+{
+ struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+
+ WARN_ON(mtrr_seg->start + unit * fixed_mtrr_seg_unit_size(seg)
+ > mtrr_seg->end);
+
+ /* each unit has 8 ranges. */
+ return mtrr_seg->range_start + 8 * unit;
+}
+
+static int fixed_mtrr_seg_end_range_index(int seg)
+{
+ struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+ int n;
+
+ n = (mtrr_seg->end - mtrr_seg->start) >> mtrr_seg->range_shift;
+ return mtrr_seg->range_start + n - 1;
+}
+
+static bool fixed_msr_to_range(u32 msr, u64 *start, u64 *end)
+{
+ int seg, unit;
+
+ if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
+ return false;
+
+ fixed_mtrr_seg_unit_range(seg, unit, start, end);
+ return true;
+}
+
+static int fixed_msr_to_range_index(u32 msr)
+{
+ int seg, unit;
+
+ if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
+ return -1;
+
+ return fixed_mtrr_seg_unit_range_index(seg, unit);
+}
+
+static int fixed_mtrr_addr_to_seg(u64 addr)
+{
+ struct fixed_mtrr_segment *mtrr_seg;
+ int seg, seg_num = ARRAY_SIZE(fixed_seg_table);
+
+ for (seg = 0; seg < seg_num; seg++) {
+ mtrr_seg = &fixed_seg_table[seg];
+ if (mtrr_seg->start <= addr && addr < mtrr_seg->end)
+ return seg;
+ }
+
+ return -1;
+}
+
+static int fixed_mtrr_addr_seg_to_range_index(u64 addr, int seg)
+{
+ struct fixed_mtrr_segment *mtrr_seg;
+ int index;
+
+ mtrr_seg = &fixed_seg_table[seg];
+ index = mtrr_seg->range_start;
+ index += (addr - mtrr_seg->start) >> mtrr_seg->range_shift;
+ return index;
+}
+
+static u64 fixed_mtrr_range_end_addr(int seg, int index)
+{
+ struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+ int pos = index - mtrr_seg->range_start;
+
+ return mtrr_seg->start + ((pos + 1) << mtrr_seg->range_shift);
+}
+
+static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end)
+{
+ u64 mask;
+
+ *start = range->base & PAGE_MASK;
+
+ mask = range->mask & PAGE_MASK;
+
+ /* This cannot overflow because writing to the reserved bits of
+ * variable MTRRs causes a #GP.
+ */
+ *end = (*start | ~mask) + 1;
+}
+
+static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+ gfn_t start, end;
+ int index;
+
+ if (msr == MSR_IA32_CR_PAT || !tdp_enabled ||
+ !kvm_arch_has_noncoherent_dma(vcpu->kvm))
+ return;
+
+ if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType)
+ return;
+
+ /* fixed MTRRs. */
+ if (fixed_msr_to_range(msr, &start, &end)) {
+ if (!fixed_mtrr_is_enabled(mtrr_state))
+ return;
+ } else if (msr == MSR_MTRRdefType) {
+ start = 0x0;
+ end = ~0ULL;
+ } else {
+ /* variable range MTRRs. */
+ index = (msr - 0x200) / 2;
+ var_mtrr_range(&mtrr_state->var_ranges[index], &start, &end);
+ }
+
+ kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end));
+}
+
+static bool var_mtrr_range_is_valid(struct kvm_mtrr_range *range)
+{
+ return (range->mask & (1 << 11)) != 0;
+}
+
+static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+ struct kvm_mtrr_range *tmp, *cur;
+ int index, is_mtrr_mask;
+
+ index = (msr - 0x200) / 2;
+ is_mtrr_mask = msr - 0x200 - 2 * index;
+ cur = &mtrr_state->var_ranges[index];
+
+ /* remove the entry if it's in the list. */
+ if (var_mtrr_range_is_valid(cur))
+ list_del(&mtrr_state->var_ranges[index].node);
+
+ /* Extend the mask with all 1 bits to the left, since those
+ * bits must implicitly be 0. The bits are then cleared
+ * when reading them.
+ */
+ if (!is_mtrr_mask)
+ cur->base = data;
+ else
+ cur->mask = data | (-1LL << cpuid_maxphyaddr(vcpu));
+
+ /* add it to the list if it's enabled. */
+ if (var_mtrr_range_is_valid(cur)) {
+ list_for_each_entry(tmp, &mtrr_state->head, node)
+ if (cur->base >= tmp->base)
+ break;
+ list_add_tail(&cur->node, &tmp->node);
+ }
+}
+
+int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ int index;
+
+ if (!kvm_mtrr_valid(vcpu, msr, data))
+ return 1;
+
+ index = fixed_msr_to_range_index(msr);
+ if (index >= 0)
+ *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index] = data;
+ else if (msr == MSR_MTRRdefType)
+ vcpu->arch.mtrr_state.deftype = data;
+ else if (msr == MSR_IA32_CR_PAT)
+ vcpu->arch.pat = data;
+ else
+ set_var_mtrr_msr(vcpu, msr, data);
+
+ update_mtrr(vcpu, msr);
+ return 0;
+}
+
+int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ int index;
+
+ /* MSR_MTRRcap is a readonly MSR. */
+ if (msr == MSR_MTRRcap) {
+ /*
+ * SMRR = 0
+ * WC = 1
+ * FIX = 1
+ * VCNT = KVM_NR_VAR_MTRR
+ */
+ *pdata = 0x500 | KVM_NR_VAR_MTRR;
+ return 0;
+ }
+
+ if (!msr_mtrr_valid(msr))
+ return 1;
+
+ index = fixed_msr_to_range_index(msr);
+ if (index >= 0)
+ *pdata = *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index];
+ else if (msr == MSR_MTRRdefType)
+ *pdata = vcpu->arch.mtrr_state.deftype;
+ else if (msr == MSR_IA32_CR_PAT)
+ *pdata = vcpu->arch.pat;
+ else { /* Variable MTRRs */
+ int is_mtrr_mask;
+
+ index = (msr - 0x200) / 2;
+ is_mtrr_mask = msr - 0x200 - 2 * index;
+ if (!is_mtrr_mask)
+ *pdata = vcpu->arch.mtrr_state.var_ranges[index].base;
+ else
+ *pdata = vcpu->arch.mtrr_state.var_ranges[index].mask;
+
+ *pdata &= (1ULL << cpuid_maxphyaddr(vcpu)) - 1;
+ }
+
+ return 0;
+}
+
+void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu)
+{
+ INIT_LIST_HEAD(&vcpu->arch.mtrr_state.head);
+}
+
+struct mtrr_iter {
+ /* input fields. */
+ struct kvm_mtrr *mtrr_state;
+ u64 start;
+ u64 end;
+
+ /* output fields. */
+ int mem_type;
+ /* mtrr is completely disabled? */
+ bool mtrr_disabled;
+ /* [start, end) is not fully covered in MTRRs? */
+ bool partial_map;
+
+ /* private fields. */
+ union {
+ /* used for fixed MTRRs. */
+ struct {
+ int index;
+ int seg;
+ };
+
+ /* used for var MTRRs. */
+ struct {
+ struct kvm_mtrr_range *range;
+ /* max address has been covered in var MTRRs. */
+ u64 start_max;
+ };
+ };
+
+ bool fixed;
+};
+
+static bool mtrr_lookup_fixed_start(struct mtrr_iter *iter)
+{
+ int seg, index;
+
+ if (!fixed_mtrr_is_enabled(iter->mtrr_state))
+ return false;
+
+ seg = fixed_mtrr_addr_to_seg(iter->start);
+ if (seg < 0)
+ return false;
+
+ iter->fixed = true;
+ index = fixed_mtrr_addr_seg_to_range_index(iter->start, seg);
+ iter->index = index;
+ iter->seg = seg;
+ return true;
+}
+
+static bool match_var_range(struct mtrr_iter *iter,
+ struct kvm_mtrr_range *range)
+{
+ u64 start, end;
+
+ var_mtrr_range(range, &start, &end);
+ if (!(start >= iter->end || end <= iter->start)) {
+ iter->range = range;
+
+ /*
+ * the function is called when we do kvm_mtrr.head walking.
+ * Range has the minimum base address which interleaves
+ * [looker->start_max, looker->end).
+ */
+ iter->partial_map |= iter->start_max < start;
+
+ /* update the max address has been covered. */
+ iter->start_max = max(iter->start_max, end);
+ return true;
+ }
+
+ return false;
+}
+
+static void __mtrr_lookup_var_next(struct mtrr_iter *iter)
+{
+ struct kvm_mtrr *mtrr_state = iter->mtrr_state;
+
+ list_for_each_entry_continue(iter->range, &mtrr_state->head, node)
+ if (match_var_range(iter, iter->range))
+ return;
+
+ iter->range = NULL;
+ iter->partial_map |= iter->start_max < iter->end;
+}
+
+static void mtrr_lookup_var_start(struct mtrr_iter *iter)
+{
+ struct kvm_mtrr *mtrr_state = iter->mtrr_state;
+
+ iter->fixed = false;
+ iter->start_max = iter->start;
+ iter->range = NULL;
+ iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node);
+
+ __mtrr_lookup_var_next(iter);
+}
+
+static void mtrr_lookup_fixed_next(struct mtrr_iter *iter)
+{
+ /* terminate the lookup. */
+ if (fixed_mtrr_range_end_addr(iter->seg, iter->index) >= iter->end) {
+ iter->fixed = false;
+ iter->range = NULL;
+ return;
+ }
+
+ iter->index++;
+
+ /* have looked up for all fixed MTRRs. */
+ if (iter->index >= ARRAY_SIZE(iter->mtrr_state->fixed_ranges))
+ return mtrr_lookup_var_start(iter);
+
+ /* switch to next segment. */
+ if (iter->index > fixed_mtrr_seg_end_range_index(iter->seg))
+ iter->seg++;
+}
+
+static void mtrr_lookup_var_next(struct mtrr_iter *iter)
+{
+ __mtrr_lookup_var_next(iter);
+}
+
+static void mtrr_lookup_start(struct mtrr_iter *iter)
+{
+ if (!mtrr_is_enabled(iter->mtrr_state)) {
+ iter->mtrr_disabled = true;
+ return;
+ }
+
+ if (!mtrr_lookup_fixed_start(iter))
+ mtrr_lookup_var_start(iter);
+}
+
+static void mtrr_lookup_init(struct mtrr_iter *iter,
+ struct kvm_mtrr *mtrr_state, u64 start, u64 end)
+{
+ iter->mtrr_state = mtrr_state;
+ iter->start = start;
+ iter->end = end;
+ iter->mtrr_disabled = false;
+ iter->partial_map = false;
+ iter->fixed = false;
+ iter->range = NULL;
+
+ mtrr_lookup_start(iter);
+}
+
+static bool mtrr_lookup_okay(struct mtrr_iter *iter)
+{
+ if (iter->fixed) {
+ iter->mem_type = iter->mtrr_state->fixed_ranges[iter->index];
+ return true;
+ }
+
+ if (iter->range) {
+ iter->mem_type = iter->range->base & 0xff;
+ return true;
+ }
+
+ return false;
+}
+
+static void mtrr_lookup_next(struct mtrr_iter *iter)
+{
+ if (iter->fixed)
+ mtrr_lookup_fixed_next(iter);
+ else
+ mtrr_lookup_var_next(iter);
+}
+
+#define mtrr_for_each_mem_type(_iter_, _mtrr_, _gpa_start_, _gpa_end_) \
+ for (mtrr_lookup_init(_iter_, _mtrr_, _gpa_start_, _gpa_end_); \
+ mtrr_lookup_okay(_iter_); mtrr_lookup_next(_iter_))
+
+u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+ struct mtrr_iter iter;
+ u64 start, end;
+ int type = -1;
+ const int wt_wb_mask = (1 << MTRR_TYPE_WRBACK)
+ | (1 << MTRR_TYPE_WRTHROUGH);
+
+ start = gfn_to_gpa(gfn);
+ end = start + PAGE_SIZE;
+
+ mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
+ int curr_type = iter.mem_type;
+
+ /*
+ * Please refer to Intel SDM Volume 3: 11.11.4.1 MTRR
+ * Precedences.
+ */
+
+ if (type == -1) {
+ type = curr_type;
+ continue;
+ }
+
+ /*
+ * If two or more variable memory ranges match and the
+ * memory types are identical, then that memory type is
+ * used.
+ */
+ if (type == curr_type)
+ continue;
+
+ /*
+ * If two or more variable memory ranges match and one of
+ * the memory types is UC, the UC memory type used.
+ */
+ if (curr_type == MTRR_TYPE_UNCACHABLE)
+ return MTRR_TYPE_UNCACHABLE;
+
+ /*
+ * If two or more variable memory ranges match and the
+ * memory types are WT and WB, the WT memory type is used.
+ */
+ if (((1 << type) & wt_wb_mask) &&
+ ((1 << curr_type) & wt_wb_mask)) {
+ type = MTRR_TYPE_WRTHROUGH;
+ continue;
+ }
+
+ /*
+ * For overlaps not defined by the above rules, processor
+ * behavior is undefined.
+ */
+
+ /* We use WB for this undefined behavior. :( */
+ return MTRR_TYPE_WRBACK;
+ }
+
+ if (iter.mtrr_disabled)
+ return mtrr_disabled_type(vcpu);
+
+ /* not contained in any MTRRs. */
+ if (type == -1)
+ return mtrr_default_type(mtrr_state);
+
+ /*
+ * We just check one page, partially covered by MTRRs is
+ * impossible.
+ */
+ WARN_ON(iter.partial_map);
+
+ return type;
+}
+EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type);
+
+bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
+ int page_num)
+{
+ struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+ struct mtrr_iter iter;
+ u64 start, end;
+ int type = -1;
+
+ start = gfn_to_gpa(gfn);
+ end = gfn_to_gpa(gfn + page_num);
+ mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
+ if (type == -1) {
+ type = iter.mem_type;
+ continue;
+ }
+
+ if (type != iter.mem_type)
+ return false;
+ }
+
+ if (iter.mtrr_disabled)
+ return true;
+
+ if (!iter.partial_map)
+ return true;
+
+ if (type == -1)
+ return true;
+
+ return type == mtrr_default_type(mtrr_state);
+}
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
new file mode 100644
index 000000000..3052a59a3
--- /dev/null
+++ b/arch/x86/kvm/page_track.c
@@ -0,0 +1,267 @@
+/*
+ * Support KVM gust page tracking
+ *
+ * This feature allows us to track page access in guest. Currently, only
+ * write access is tracked.
+ *
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Author:
+ * Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/rculist.h>
+
+#include <asm/kvm_host.h>
+#include <asm/kvm_page_track.h>
+
+#include "mmu.h"
+
+void kvm_page_track_free_memslot(struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont)
+{
+ int i;
+
+ for (i = 0; i < KVM_PAGE_TRACK_MAX; i++)
+ if (!dont || free->arch.gfn_track[i] !=
+ dont->arch.gfn_track[i]) {
+ kvfree(free->arch.gfn_track[i]);
+ free->arch.gfn_track[i] = NULL;
+ }
+}
+
+int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
+ unsigned long npages)
+{
+ int i;
+
+ for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
+ slot->arch.gfn_track[i] =
+ kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]),
+ GFP_KERNEL);
+ if (!slot->arch.gfn_track[i])
+ goto track_free;
+ }
+
+ return 0;
+
+track_free:
+ kvm_page_track_free_memslot(slot, NULL);
+ return -ENOMEM;
+}
+
+static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode)
+{
+ if (mode < 0 || mode >= KVM_PAGE_TRACK_MAX)
+ return false;
+
+ return true;
+}
+
+static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn,
+ enum kvm_page_track_mode mode, short count)
+{
+ int index, val;
+
+ index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
+
+ val = slot->arch.gfn_track[mode][index];
+
+ if (WARN_ON(val + count < 0 || val + count > USHRT_MAX))
+ return;
+
+ slot->arch.gfn_track[mode][index] += count;
+}
+
+/*
+ * add guest page to the tracking pool so that corresponding access on that
+ * page will be intercepted.
+ *
+ * It should be called under the protection both of mmu-lock and kvm->srcu
+ * or kvm->slots_lock.
+ *
+ * @kvm: the guest instance we are interested in.
+ * @slot: the @gfn belongs to.
+ * @gfn: the guest page.
+ * @mode: tracking mode, currently only write track is supported.
+ */
+void kvm_slot_page_track_add_page(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ enum kvm_page_track_mode mode)
+{
+
+ if (WARN_ON(!page_track_mode_is_valid(mode)))
+ return;
+
+ update_gfn_track(slot, gfn, mode, 1);
+
+ /*
+ * new track stops large page mapping for the
+ * tracked page.
+ */
+ kvm_mmu_gfn_disallow_lpage(slot, gfn);
+
+ if (mode == KVM_PAGE_TRACK_WRITE)
+ if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn))
+ kvm_flush_remote_tlbs(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page);
+
+/*
+ * remove the guest page from the tracking pool which stops the interception
+ * of corresponding access on that page. It is the opposed operation of
+ * kvm_slot_page_track_add_page().
+ *
+ * It should be called under the protection both of mmu-lock and kvm->srcu
+ * or kvm->slots_lock.
+ *
+ * @kvm: the guest instance we are interested in.
+ * @slot: the @gfn belongs to.
+ * @gfn: the guest page.
+ * @mode: tracking mode, currently only write track is supported.
+ */
+void kvm_slot_page_track_remove_page(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ enum kvm_page_track_mode mode)
+{
+ if (WARN_ON(!page_track_mode_is_valid(mode)))
+ return;
+
+ update_gfn_track(slot, gfn, mode, -1);
+
+ /*
+ * allow large page mapping for the tracked page
+ * after the tracker is gone.
+ */
+ kvm_mmu_gfn_allow_lpage(slot, gfn);
+}
+EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page);
+
+/*
+ * check if the corresponding access on the specified guest page is tracked.
+ */
+bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
+ enum kvm_page_track_mode mode)
+{
+ struct kvm_memory_slot *slot;
+ int index;
+
+ if (WARN_ON(!page_track_mode_is_valid(mode)))
+ return false;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ if (!slot)
+ return false;
+
+ index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
+ return !!READ_ONCE(slot->arch.gfn_track[mode][index]);
+}
+
+void kvm_page_track_cleanup(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+ cleanup_srcu_struct(&head->track_srcu);
+}
+
+void kvm_page_track_init(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+ init_srcu_struct(&head->track_srcu);
+ INIT_HLIST_HEAD(&head->track_notifier_list);
+}
+
+/*
+ * register the notifier so that event interception for the tracked guest
+ * pages can be received.
+ */
+void
+kvm_page_track_register_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+
+ spin_lock(&kvm->mmu_lock);
+ hlist_add_head_rcu(&n->node, &head->track_notifier_list);
+ spin_unlock(&kvm->mmu_lock);
+}
+EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier);
+
+/*
+ * stop receiving the event interception. It is the opposed operation of
+ * kvm_page_track_register_notifier().
+ */
+void
+kvm_page_track_unregister_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+
+ spin_lock(&kvm->mmu_lock);
+ hlist_del_rcu(&n->node);
+ spin_unlock(&kvm->mmu_lock);
+ synchronize_srcu(&head->track_srcu);
+}
+EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier);
+
+/*
+ * Notify the node that write access is intercepted and write emulation is
+ * finished at this time.
+ *
+ * The node should figure out if the written page is the one that node is
+ * interested in by itself.
+ */
+void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+ int bytes)
+{
+ struct kvm_page_track_notifier_head *head;
+ struct kvm_page_track_notifier_node *n;
+ int idx;
+
+ head = &vcpu->kvm->arch.track_notifier_head;
+
+ if (hlist_empty(&head->track_notifier_list))
+ return;
+
+ idx = srcu_read_lock(&head->track_srcu);
+ hlist_for_each_entry_rcu(n, &head->track_notifier_list, node)
+ if (n->track_write)
+ n->track_write(vcpu, gpa, new, bytes, n);
+ srcu_read_unlock(&head->track_srcu, idx);
+}
+
+/*
+ * Notify the node that memory slot is being removed or moved so that it can
+ * drop write-protection for the pages in the memory slot.
+ *
+ * The node should figure out it has any write-protected pages in this slot
+ * by itself.
+ */
+void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+ struct kvm_page_track_notifier_head *head;
+ struct kvm_page_track_notifier_node *n;
+ int idx;
+
+ head = &kvm->arch.track_notifier_head;
+
+ if (hlist_empty(&head->track_notifier_list))
+ return;
+
+ idx = srcu_read_lock(&head->track_srcu);
+ hlist_for_each_entry_rcu(n, &head->track_notifier_list, node)
+ if (n->track_flush_slot)
+ n->track_flush_slot(kvm, slot, n);
+ srcu_read_unlock(&head->track_srcu, idx);
+}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
new file mode 100644
index 000000000..9e15818de
--- /dev/null
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -0,0 +1,1083 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+/*
+ * We need the mmu code to access both 32-bit and 64-bit guest ptes,
+ * so the code in this file is compiled twice, once per pte size.
+ */
+
+#if PTTYPE == 64
+ #define pt_element_t u64
+ #define guest_walker guest_walker64
+ #define FNAME(name) paging##64_##name
+ #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+ #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+ #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+ #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+ #define PT_LEVEL_BITS PT64_LEVEL_BITS
+ #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+ #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
+ #define PT_HAVE_ACCESSED_DIRTY(mmu) true
+ #ifdef CONFIG_X86_64
+ #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
+ #define CMPXCHG cmpxchg
+ #else
+ #define CMPXCHG cmpxchg64
+ #define PT_MAX_FULL_LEVELS 2
+ #endif
+#elif PTTYPE == 32
+ #define pt_element_t u32
+ #define guest_walker guest_walker32
+ #define FNAME(name) paging##32_##name
+ #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
+ #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
+ #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
+ #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
+ #define PT_LEVEL_BITS PT32_LEVEL_BITS
+ #define PT_MAX_FULL_LEVELS 2
+ #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+ #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
+ #define PT_HAVE_ACCESSED_DIRTY(mmu) true
+ #define CMPXCHG cmpxchg
+#elif PTTYPE == PTTYPE_EPT
+ #define pt_element_t u64
+ #define guest_walker guest_walkerEPT
+ #define FNAME(name) ept_##name
+ #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+ #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+ #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+ #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+ #define PT_LEVEL_BITS PT64_LEVEL_BITS
+ #define PT_GUEST_DIRTY_SHIFT 9
+ #define PT_GUEST_ACCESSED_SHIFT 8
+ #define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad)
+ #define CMPXCHG cmpxchg64
+ #define PT_MAX_FULL_LEVELS 4
+#else
+ #error Invalid PTTYPE value
+#endif
+
+#define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT)
+#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT)
+
+#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
+#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)
+
+/*
+ * The guest_walker structure emulates the behavior of the hardware page
+ * table walker.
+ */
+struct guest_walker {
+ int level;
+ unsigned max_level;
+ gfn_t table_gfn[PT_MAX_FULL_LEVELS];
+ pt_element_t ptes[PT_MAX_FULL_LEVELS];
+ pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
+ gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
+ pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
+ bool pte_writable[PT_MAX_FULL_LEVELS];
+ unsigned int pt_access[PT_MAX_FULL_LEVELS];
+ unsigned int pte_access;
+ gfn_t gfn;
+ struct x86_exception fault;
+};
+
+static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
+{
+ return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
+}
+
+static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access,
+ unsigned gpte)
+{
+ unsigned mask;
+
+ /* dirty bit is not supported, so no need to track it */
+ if (!PT_HAVE_ACCESSED_DIRTY(mmu))
+ return;
+
+ BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
+
+ mask = (unsigned)~ACC_WRITE_MASK;
+ /* Allow write access to dirty gptes */
+ mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
+ PT_WRITABLE_MASK;
+ *access &= mask;
+}
+
+static inline int FNAME(is_present_gpte)(unsigned long pte)
+{
+#if PTTYPE != PTTYPE_EPT
+ return pte & PT_PRESENT_MASK;
+#else
+ return pte & 7;
+#endif
+}
+
+static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ pt_element_t __user *ptep_user, unsigned index,
+ pt_element_t orig_pte, pt_element_t new_pte)
+{
+ int npages;
+ pt_element_t ret;
+ pt_element_t *table;
+ struct page *page;
+
+ npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
+ /* Check if the user is doing something meaningless. */
+ if (unlikely(npages != 1))
+ return -EFAULT;
+
+ table = kmap_atomic(page);
+ ret = CMPXCHG(&table[index], orig_pte, new_pte);
+ kunmap_atomic(table);
+
+ kvm_release_page_dirty(page);
+
+ return (ret != orig_pte);
+}
+
+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ u64 gpte)
+{
+ if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+ goto no_present;
+
+ if (!FNAME(is_present_gpte)(gpte))
+ goto no_present;
+
+ /* if accessed bit is not supported prefetch non accessed gpte */
+ if (PT_HAVE_ACCESSED_DIRTY(&vcpu->arch.mmu) && !(gpte & PT_GUEST_ACCESSED_MASK))
+ goto no_present;
+
+ return false;
+
+no_present:
+ drop_spte(vcpu->kvm, spte);
+ return true;
+}
+
+/*
+ * For PTTYPE_EPT, a page table can be executable but not readable
+ * on supported processors. Therefore, set_spte does not automatically
+ * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK
+ * to signify readability since it isn't used in the EPT case
+ */
+static inline unsigned FNAME(gpte_access)(u64 gpte)
+{
+ unsigned access;
+#if PTTYPE == PTTYPE_EPT
+ access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
+ ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
+ ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
+#else
+ BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
+ BUILD_BUG_ON(ACC_EXEC_MASK != 1);
+ access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK);
+ /* Combine NX with P (which is set here) to get ACC_EXEC_MASK. */
+ access ^= (gpte >> PT64_NX_SHIFT);
+#endif
+
+ return access;
+}
+
+static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu,
+ struct guest_walker *walker,
+ gpa_t addr, int write_fault)
+{
+ unsigned level, index;
+ pt_element_t pte, orig_pte;
+ pt_element_t __user *ptep_user;
+ gfn_t table_gfn;
+ int ret;
+
+ /* dirty/accessed bits are not supported, so no need to update them */
+ if (!PT_HAVE_ACCESSED_DIRTY(mmu))
+ return 0;
+
+ for (level = walker->max_level; level >= walker->level; --level) {
+ pte = orig_pte = walker->ptes[level - 1];
+ table_gfn = walker->table_gfn[level - 1];
+ ptep_user = walker->ptep_user[level - 1];
+ index = offset_in_page(ptep_user) / sizeof(pt_element_t);
+ if (!(pte & PT_GUEST_ACCESSED_MASK)) {
+ trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
+ pte |= PT_GUEST_ACCESSED_MASK;
+ }
+ if (level == walker->level && write_fault &&
+ !(pte & PT_GUEST_DIRTY_MASK)) {
+ trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
+#if PTTYPE == PTTYPE_EPT
+ if (kvm_arch_write_log_dirty(vcpu, addr))
+ return -EINVAL;
+#endif
+ pte |= PT_GUEST_DIRTY_MASK;
+ }
+ if (pte == orig_pte)
+ continue;
+
+ /*
+ * If the slot is read-only, simply do not process the accessed
+ * and dirty bits. This is the correct thing to do if the slot
+ * is ROM, and page tables in read-as-ROM/write-as-MMIO slots
+ * are only supported if the accessed and dirty bits are already
+ * set in the ROM (so that MMIO writes are never needed).
+ *
+ * Note that NPT does not allow this at all and faults, since
+ * it always wants nested page table entries for the guest
+ * page tables to be writable. And EPT works but will simply
+ * overwrite the read-only memory to set the accessed and dirty
+ * bits.
+ */
+ if (unlikely(!walker->pte_writable[level - 1]))
+ continue;
+
+ ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
+ if (ret)
+ return ret;
+
+ kvm_vcpu_mark_page_dirty(vcpu, table_gfn);
+ walker->ptes[level - 1] = pte;
+ }
+ return 0;
+}
+
+static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+ unsigned pkeys = 0;
+#if PTTYPE == 64
+ pte_t pte = {.pte = gpte};
+
+ pkeys = pte_flags_pkey(pte_flags(pte));
+#endif
+ return pkeys;
+}
+
+/*
+ * Fetch a guest pte for a guest virtual address, or for an L2's GPA.
+ */
+static int FNAME(walk_addr_generic)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gpa_t addr, u32 access)
+{
+ int ret;
+ pt_element_t pte;
+ pt_element_t __user *uninitialized_var(ptep_user);
+ gfn_t table_gfn;
+ u64 pt_access, pte_access;
+ unsigned index, accessed_dirty, pte_pkey;
+ unsigned nested_access;
+ gpa_t pte_gpa;
+ bool have_ad;
+ int offset;
+ u64 walk_nx_mask = 0;
+ const int write_fault = access & PFERR_WRITE_MASK;
+ const int user_fault = access & PFERR_USER_MASK;
+ const int fetch_fault = access & PFERR_FETCH_MASK;
+ u16 errcode = 0;
+ gpa_t real_gpa;
+ gfn_t gfn;
+
+ trace_kvm_mmu_pagetable_walk(addr, access);
+retry_walk:
+ walker->level = mmu->root_level;
+ pte = mmu->get_cr3(vcpu);
+ have_ad = PT_HAVE_ACCESSED_DIRTY(mmu);
+
+#if PTTYPE == 64
+ walk_nx_mask = 1ULL << PT64_NX_SHIFT;
+ if (walker->level == PT32E_ROOT_LEVEL) {
+ pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
+ trace_kvm_mmu_paging_element(pte, walker->level);
+ if (!FNAME(is_present_gpte)(pte))
+ goto error;
+ --walker->level;
+ }
+#endif
+ walker->max_level = walker->level;
+ ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu)));
+
+ /*
+ * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
+ * by the MOV to CR instruction are treated as reads and do not cause the
+ * processor to set the dirty flag in any EPT paging-structure entry.
+ */
+ nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;
+
+ pte_access = ~0;
+ ++walker->level;
+
+ do {
+ gfn_t real_gfn;
+ unsigned long host_addr;
+
+ pt_access = pte_access;
+ --walker->level;
+
+ index = PT_INDEX(addr, walker->level);
+ table_gfn = gpte_to_gfn(pte);
+ offset = index * sizeof(pt_element_t);
+ pte_gpa = gfn_to_gpa(table_gfn) + offset;
+
+ BUG_ON(walker->level < 1);
+ walker->table_gfn[walker->level - 1] = table_gfn;
+ walker->pte_gpa[walker->level - 1] = pte_gpa;
+
+ real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
+ nested_access,
+ &walker->fault);
+
+ /*
+ * FIXME: This can happen if emulation (for of an INS/OUTS
+ * instruction) triggers a nested page fault. The exit
+ * qualification / exit info field will incorrectly have
+ * "guest page access" as the nested page fault's cause,
+ * instead of "guest page structure access". To fix this,
+ * the x86_exception struct should be augmented with enough
+ * information to fix the exit_qualification or exit_info_1
+ * fields.
+ */
+ if (unlikely(real_gfn == UNMAPPED_GVA))
+ return 0;
+
+ real_gfn = gpa_to_gfn(real_gfn);
+
+ host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, real_gfn,
+ &walker->pte_writable[walker->level - 1]);
+ if (unlikely(kvm_is_error_hva(host_addr)))
+ goto error;
+
+ ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
+ if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
+ goto error;
+ walker->ptep_user[walker->level - 1] = ptep_user;
+
+ trace_kvm_mmu_paging_element(pte, walker->level);
+
+ /*
+ * Inverting the NX it lets us AND it like other
+ * permission bits.
+ */
+ pte_access = pt_access & (pte ^ walk_nx_mask);
+
+ if (unlikely(!FNAME(is_present_gpte)(pte)))
+ goto error;
+
+ if (unlikely(is_rsvd_bits_set(mmu, pte, walker->level))) {
+ errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
+ goto error;
+ }
+
+ walker->ptes[walker->level - 1] = pte;
+
+ /* Convert to ACC_*_MASK flags for struct guest_walker. */
+ walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
+ } while (!is_last_gpte(mmu, walker->level, pte));
+
+ pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
+ accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
+
+ /* Convert to ACC_*_MASK flags for struct guest_walker. */
+ walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
+ errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
+ if (unlikely(errcode))
+ goto error;
+
+ gfn = gpte_to_gfn_lvl(pte, walker->level);
+ gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
+
+ if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
+ gfn += pse36_gfn_delta(pte);
+
+ real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access, &walker->fault);
+ if (real_gpa == UNMAPPED_GVA)
+ return 0;
+
+ walker->gfn = real_gpa >> PAGE_SHIFT;
+
+ if (!write_fault)
+ FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte);
+ else
+ /*
+ * On a write fault, fold the dirty bit into accessed_dirty.
+ * For modes without A/D bits support accessed_dirty will be
+ * always clear.
+ */
+ accessed_dirty &= pte >>
+ (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
+
+ if (unlikely(!accessed_dirty)) {
+ ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker,
+ addr, write_fault);
+ if (unlikely(ret < 0))
+ goto error;
+ else if (ret)
+ goto retry_walk;
+ }
+
+ pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
+ __func__, (u64)pte, walker->pte_access,
+ walker->pt_access[walker->level - 1]);
+ return 1;
+
+error:
+ errcode |= write_fault | user_fault;
+ if (fetch_fault && (mmu->nx ||
+ kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
+ errcode |= PFERR_FETCH_MASK;
+
+ walker->fault.vector = PF_VECTOR;
+ walker->fault.error_code_valid = true;
+ walker->fault.error_code = errcode;
+
+#if PTTYPE == PTTYPE_EPT
+ /*
+ * Use PFERR_RSVD_MASK in error_code to to tell if EPT
+ * misconfiguration requires to be injected. The detection is
+ * done by is_rsvd_bits_set() above.
+ *
+ * We set up the value of exit_qualification to inject:
+ * [2:0] - Derive from the access bits. The exit_qualification might be
+ * out of date if it is serving an EPT misconfiguration.
+ * [5:3] - Calculated by the page walk of the guest EPT page tables
+ * [7:8] - Derived from [7:8] of real exit_qualification
+ *
+ * The other bits are set to 0.
+ */
+ if (!(errcode & PFERR_RSVD_MASK)) {
+ vcpu->arch.exit_qualification &= 0x180;
+ if (write_fault)
+ vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
+ if (user_fault)
+ vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ;
+ if (fetch_fault)
+ vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
+ vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3;
+ }
+#endif
+ walker->fault.address = addr;
+ walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
+
+ trace_kvm_mmu_walker_error(walker->fault.error_code);
+ return 0;
+}
+
+static int FNAME(walk_addr)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, gpa_t addr, u32 access)
+{
+ return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
+ access);
+}
+
+#if PTTYPE != PTTYPE_EPT
+static int FNAME(walk_addr_nested)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, gva_t addr,
+ u32 access)
+{
+ return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
+ addr, access);
+}
+#endif
+
+static bool
+FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ u64 *spte, pt_element_t gpte, bool no_dirty_log)
+{
+ unsigned pte_access;
+ gfn_t gfn;
+ kvm_pfn_t pfn;
+
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
+ return false;
+
+ pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
+
+ gfn = gpte_to_gfn(gpte);
+ pte_access = sp->role.access & FNAME(gpte_access)(gpte);
+ FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
+ pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
+ no_dirty_log && (pte_access & ACC_WRITE_MASK));
+ if (is_error_pfn(pfn))
+ return false;
+
+ /*
+ * we call mmu_set_spte() with host_writable = true because
+ * pte_prefetch_gfn_to_pfn always gets a writable pfn.
+ */
+ mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn,
+ true, true);
+
+ kvm_release_pfn_clean(pfn);
+ return true;
+}
+
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ u64 *spte, const void *pte)
+{
+ pt_element_t gpte = *(const pt_element_t *)pte;
+
+ FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false);
+}
+
+static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
+ struct guest_walker *gw, int level)
+{
+ pt_element_t curr_pte;
+ gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
+ u64 mask;
+ int r, index;
+
+ if (level == PT_PAGE_TABLE_LEVEL) {
+ mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
+ base_gpa = pte_gpa & ~mask;
+ index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
+
+ r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa,
+ gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
+ curr_pte = gw->prefetch_ptes[index];
+ } else
+ r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa,
+ &curr_pte, sizeof(curr_pte));
+
+ return r || curr_pte != gw->ptes[level - 1];
+}
+
+static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
+ u64 *sptep)
+{
+ struct kvm_mmu_page *sp;
+ pt_element_t *gptep = gw->prefetch_ptes;
+ u64 *spte;
+ int i;
+
+ sp = page_header(__pa(sptep));
+
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return;
+
+ if (sp->role.direct)
+ return __direct_pte_prefetch(vcpu, sp, sptep);
+
+ i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+ spte = sp->spt + i;
+
+ for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+ if (spte == sptep)
+ continue;
+
+ if (is_shadow_present_pte(*spte))
+ continue;
+
+ if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
+ break;
+ }
+}
+
+/*
+ * Fetch a shadow pte for a specific level in the paging hierarchy.
+ * If the guest tries to write a write-protected page, we need to
+ * emulate this operation, return 1 to indicate this case.
+ */
+static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
+ struct guest_walker *gw,
+ int write_fault, int hlevel,
+ kvm_pfn_t pfn, bool map_writable, bool prefault,
+ bool lpage_disallowed)
+{
+ struct kvm_mmu_page *sp = NULL;
+ struct kvm_shadow_walk_iterator it;
+ unsigned int direct_access, access;
+ int top_level, ret;
+ gfn_t gfn, base_gfn;
+
+ direct_access = gw->pte_access;
+
+ top_level = vcpu->arch.mmu.root_level;
+ if (top_level == PT32E_ROOT_LEVEL)
+ top_level = PT32_ROOT_LEVEL;
+ /*
+ * Verify that the top-level gpte is still there. Since the page
+ * is a root page, it is either write protected (and cannot be
+ * changed from now on) or it is invalid (in which case, we don't
+ * really care if it changes underneath us after this point).
+ */
+ if (FNAME(gpte_changed)(vcpu, gw, top_level))
+ goto out_gpte_changed;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ goto out_gpte_changed;
+
+ for (shadow_walk_init(&it, vcpu, addr);
+ shadow_walk_okay(&it) && it.level > gw->level;
+ shadow_walk_next(&it)) {
+ gfn_t table_gfn;
+
+ clear_sp_write_flooding_count(it.sptep);
+ drop_large_spte(vcpu, it.sptep);
+
+ sp = NULL;
+ if (!is_shadow_present_pte(*it.sptep)) {
+ table_gfn = gw->table_gfn[it.level - 2];
+ access = gw->pt_access[it.level - 2];
+ sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
+ false, access);
+ }
+
+ /*
+ * Verify that the gpte in the page we've just write
+ * protected is still there.
+ */
+ if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
+ goto out_gpte_changed;
+
+ if (sp)
+ link_shadow_page(vcpu, it.sptep, sp);
+ }
+
+ /*
+ * FNAME(page_fault) might have clobbered the bottom bits of
+ * gw->gfn, restore them from the virtual address.
+ */
+ gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT);
+ base_gfn = gfn;
+
+ trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
+
+ for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
+ clear_sp_write_flooding_count(it.sptep);
+
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel);
+
+ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ if (it.level == hlevel)
+ break;
+
+ validate_direct_spte(vcpu, it.sptep, direct_access);
+
+ drop_large_spte(vcpu, it.sptep);
+
+ if (!is_shadow_present_pte(*it.sptep)) {
+ sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
+ it.level - 1, true, direct_access);
+ link_shadow_page(vcpu, it.sptep, sp);
+ if (lpage_disallowed)
+ account_huge_nx_page(vcpu->kvm, sp);
+ }
+ }
+
+ ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
+ it.level, base_gfn, pfn, prefault, map_writable);
+ FNAME(pte_prefetch)(vcpu, gw, it.sptep);
+ ++vcpu->stat.pf_fixed;
+ return ret;
+
+out_gpte_changed:
+ return RET_PF_RETRY;
+}
+
+ /*
+ * To see whether the mapped gfn can write its page table in the current
+ * mapping.
+ *
+ * It is the helper function of FNAME(page_fault). When guest uses large page
+ * size to map the writable gfn which is used as current page table, we should
+ * force kvm to use small page size to map it because new shadow page will be
+ * created when kvm establishes shadow page table that stop kvm using large
+ * page size. Do it early can avoid unnecessary #PF and emulation.
+ *
+ * @write_fault_to_shadow_pgtable will return true if the fault gfn is
+ * currently used as its page table.
+ *
+ * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
+ * since the PDPT is always shadowed, that means, we can not use large page
+ * size to map the gfn which is used as PDPT.
+ */
+static bool
+FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
+ struct guest_walker *walker, int user_fault,
+ bool *write_fault_to_shadow_pgtable)
+{
+ int level;
+ gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
+ bool self_changed = false;
+
+ if (!(walker->pte_access & ACC_WRITE_MASK ||
+ (!is_write_protection(vcpu) && !user_fault)))
+ return false;
+
+ for (level = walker->level; level <= walker->max_level; level++) {
+ gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
+
+ self_changed |= !(gfn & mask);
+ *write_fault_to_shadow_pgtable |= !gfn;
+ }
+
+ return self_changed;
+}
+
+/*
+ * Page fault handler. There are several causes for a page fault:
+ * - there is no shadow pte for the guest pte
+ * - write access through a shadow pte marked read only so that we can set
+ * the dirty bit
+ * - write access to a shadow pte marked read only so we can update the page
+ * dirty bitmap, when userspace requests it
+ * - mmio access; in this case we will never install a present shadow pte
+ * - normal guest page fault due to the guest pte marked not present, not
+ * writable, or not executable
+ *
+ * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
+ * a negative value on error.
+ */
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
+ bool prefault)
+{
+ int write_fault = error_code & PFERR_WRITE_MASK;
+ int user_fault = error_code & PFERR_USER_MASK;
+ struct guest_walker walker;
+ int r;
+ kvm_pfn_t pfn;
+ int level = PT_PAGE_TABLE_LEVEL;
+ unsigned long mmu_seq;
+ bool map_writable, is_self_change_mapping;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();
+ bool force_pt_level = lpage_disallowed;
+
+ pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
+
+ /*
+ * If PFEC.RSVD is set, this is a shadow page fault.
+ * The bit needs to be cleared before walking guest page tables.
+ */
+ error_code &= ~PFERR_RSVD_MASK;
+
+ /*
+ * Look up the guest pte for the faulting address.
+ */
+ r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);
+
+ /*
+ * The page is not mapped by the guest. Let the guest handle it.
+ */
+ if (!r) {
+ pgprintk("%s: guest page fault\n", __func__);
+ if (!prefault)
+ inject_page_fault(vcpu, &walker.fault);
+
+ return RET_PF_RETRY;
+ }
+
+ if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
+ shadow_page_table_clear_flood(vcpu, addr);
+ return RET_PF_EMULATE;
+ }
+
+ vcpu->arch.write_fault_to_shadow_pgtable = false;
+
+ is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
+ &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
+
+ if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) {
+ level = mapping_level(vcpu, walker.gfn, &force_pt_level);
+ if (likely(!force_pt_level)) {
+ level = min(walker.level, level);
+ walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ }
+ } else
+ force_pt_level = true;
+
+ mmu_seq = vcpu->kvm->mmu_notifier_seq;
+ smp_rmb();
+
+ if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
+ &map_writable))
+ return RET_PF_RETRY;
+
+ if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
+ return r;
+
+ /*
+ * Do not change pte_access if the pfn is a mmio page, otherwise
+ * we will cache the incorrect access into mmio spte.
+ */
+ if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
+ !is_write_protection(vcpu) && !user_fault &&
+ !is_noslot_pfn(pfn)) {
+ walker.pte_access |= ACC_WRITE_MASK;
+ walker.pte_access &= ~ACC_USER_MASK;
+
+ /*
+ * If we converted a user page to a kernel page,
+ * so that the kernel can write to it when cr0.wp=0,
+ * then we should prevent the kernel from executing it
+ * if SMEP is enabled.
+ */
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+ walker.pte_access &= ~ACC_EXEC_MASK;
+ }
+
+ r = RET_PF_RETRY;
+ spin_lock(&vcpu->kvm->mmu_lock);
+ if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+ goto out_unlock;
+
+ kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
+ if (make_mmu_pages_available(vcpu) < 0)
+ goto out_unlock;
+ if (!force_pt_level)
+ transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);
+ r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
+ level, pfn, map_writable, prefault, lpage_disallowed);
+ kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
+
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ return r;
+}
+
+static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
+{
+ int offset = 0;
+
+ WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
+
+ if (PTTYPE == 32)
+ offset = sp->role.quadrant << PT64_LEVEL_BITS;
+
+ return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
+}
+
+static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ struct kvm_mmu_page *sp;
+ int level;
+ u64 *sptep;
+
+ vcpu_clear_mmio_info(vcpu, gva);
+
+ /*
+ * No need to check return value here, rmap_can_add() can
+ * help us to skip pte prefetch later.
+ */
+ mmu_topup_memory_caches(vcpu);
+
+ if (!VALID_PAGE(root_hpa)) {
+ WARN_ON(1);
+ return;
+ }
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
+ level = iterator.level;
+ sptep = iterator.sptep;
+
+ sp = page_header(__pa(sptep));
+ if (is_last_spte(*sptep, level)) {
+ pt_element_t gpte;
+ gpa_t pte_gpa;
+
+ if (!sp->unsync)
+ break;
+
+ pte_gpa = FNAME(get_level1_sp_gpa)(sp);
+ pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
+
+ if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
+ kvm_flush_remote_tlbs(vcpu->kvm);
+
+ if (!rmap_can_add(vcpu))
+ break;
+
+ if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
+ sizeof(pt_element_t)))
+ break;
+
+ FNAME(update_pte)(vcpu, sp, sptep, &gpte);
+ }
+
+ if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
+ break;
+ }
+ spin_unlock(&vcpu->kvm->mmu_lock);
+}
+
+/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t addr, u32 access,
+ struct x86_exception *exception)
+{
+ struct guest_walker walker;
+ gpa_t gpa = UNMAPPED_GVA;
+ int r;
+
+ r = FNAME(walk_addr)(&walker, vcpu, addr, access);
+
+ if (r) {
+ gpa = gfn_to_gpa(walker.gfn);
+ gpa |= addr & ~PAGE_MASK;
+ } else if (exception)
+ *exception = walker.fault;
+
+ return gpa;
+}
+
+#if PTTYPE != PTTYPE_EPT
+/* Note, gva_to_gpa_nested() is only used to translate L2 GVAs. */
+static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr,
+ u32 access,
+ struct x86_exception *exception)
+{
+ struct guest_walker walker;
+ gpa_t gpa = UNMAPPED_GVA;
+ int r;
+
+#ifndef CONFIG_X86_64
+ /* A 64-bit GVA should be impossible on 32-bit KVM. */
+ WARN_ON_ONCE(vaddr >> 32);
+#endif
+
+ r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
+
+ if (r) {
+ gpa = gfn_to_gpa(walker.gfn);
+ gpa |= vaddr & ~PAGE_MASK;
+ } else if (exception)
+ *exception = walker.fault;
+
+ return gpa;
+}
+#endif
+
+/*
+ * Using the cached information from sp->gfns is safe because:
+ * - The spte has a reference to the struct page, so the pfn for a given gfn
+ * can't change unless all sptes pointing to it are nuked first.
+ *
+ * Note:
+ * We should flush all tlbs if spte is dropped even though guest is
+ * responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page
+ * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
+ * used by guest then tlbs are not flushed, so guest is allowed to access the
+ * freed pages.
+ * And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
+ */
+static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ int i, nr_present = 0;
+ bool host_writable;
+ gpa_t first_pte_gpa;
+ int set_spte_ret = 0;
+
+ /* direct kvm_mmu_page can not be unsync. */
+ BUG_ON(sp->role.direct);
+
+ first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
+ unsigned pte_access;
+ pt_element_t gpte;
+ gpa_t pte_gpa;
+ gfn_t gfn;
+
+ if (!sp->spt[i])
+ continue;
+
+ pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
+
+ if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
+ sizeof(pt_element_t)))
+ return 0;
+
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
+ /*
+ * Update spte before increasing tlbs_dirty to make
+ * sure no tlb flush is lost after spte is zapped; see
+ * the comments in kvm_flush_remote_tlbs().
+ */
+ smp_wmb();
+ vcpu->kvm->tlbs_dirty++;
+ continue;
+ }
+
+ gfn = gpte_to_gfn(gpte);
+ pte_access = sp->role.access;
+ pte_access &= FNAME(gpte_access)(gpte);
+ FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
+
+ if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access,
+ &nr_present))
+ continue;
+
+ if (gfn != sp->gfns[i]) {
+ drop_spte(vcpu->kvm, &sp->spt[i]);
+ /*
+ * The same as above where we are doing
+ * prefetch_invalid_gpte().
+ */
+ smp_wmb();
+ vcpu->kvm->tlbs_dirty++;
+ continue;
+ }
+
+ nr_present++;
+
+ host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
+
+ set_spte_ret |= set_spte(vcpu, &sp->spt[i],
+ pte_access, PT_PAGE_TABLE_LEVEL,
+ gfn, spte_to_pfn(sp->spt[i]),
+ true, false, host_writable);
+ }
+
+ if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH)
+ kvm_flush_remote_tlbs(vcpu->kvm);
+
+ return nr_present;
+}
+
+#undef pt_element_t
+#undef guest_walker
+#undef FNAME
+#undef PT_BASE_ADDR_MASK
+#undef PT_INDEX
+#undef PT_LVL_ADDR_MASK
+#undef PT_LVL_OFFSET_MASK
+#undef PT_LEVEL_BITS
+#undef PT_MAX_FULL_LEVELS
+#undef gpte_to_gfn
+#undef gpte_to_gfn_lvl
+#undef CMPXCHG
+#undef PT_GUEST_ACCESSED_MASK
+#undef PT_GUEST_DIRTY_MASK
+#undef PT_GUEST_DIRTY_SHIFT
+#undef PT_GUEST_ACCESSED_SHIFT
+#undef PT_HAVE_ACCESSED_DIRTY
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
new file mode 100644
index 000000000..ad3d39c00
--- /dev/null
+++ b/arch/x86/kvm/pmu.c
@@ -0,0 +1,349 @@
+/*
+ * Kernel-based Virtual Machine -- Performance Monitoring Unit support
+ *
+ * Copyright 2015 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Avi Kivity <avi@redhat.com>
+ * Gleb Natapov <gleb@redhat.com>
+ * Wei Huang <wei@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include <asm/perf_event.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "pmu.h"
+
+/* NOTE:
+ * - Each perf counter is defined as "struct kvm_pmc";
+ * - There are two types of perf counters: general purpose (gp) and fixed.
+ * gp counters are stored in gp_counters[] and fixed counters are stored
+ * in fixed_counters[] respectively. Both of them are part of "struct
+ * kvm_pmu";
+ * - pmu.c understands the difference between gp counters and fixed counters.
+ * However AMD doesn't support fixed-counters;
+ * - There are three types of index to access perf counters (PMC):
+ * 1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
+ * has MSR_K7_PERFCTRn.
+ * 2. MSR Index (named idx): This normally is used by RDPMC instruction.
+ * For instance AMD RDPMC instruction uses 0000_0003h in ECX to access
+ * C001_0007h (MSR_K7_PERCTR3). Intel has a similar mechanism, except
+ * that it also supports fixed counters. idx can be used to as index to
+ * gp and fixed counters.
+ * 3. Global PMC Index (named pmc): pmc is an index specific to PMU
+ * code. Each pmc, stored in kvm_pmc.idx field, is unique across
+ * all perf counters (both gp and fixed). The mapping relationship
+ * between pmc and perf counters is as the following:
+ * * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters
+ * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
+ * * AMD: [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters
+ */
+
+static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
+{
+ struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
+ struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
+
+ kvm_pmu_deliver_pmi(vcpu);
+}
+
+static void kvm_perf_overflow(struct perf_event *perf_event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct kvm_pmc *pmc = perf_event->overflow_handler_context;
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ if (!test_and_set_bit(pmc->idx,
+ (unsigned long *)&pmu->reprogram_pmi)) {
+ __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+ kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+ }
+}
+
+static void kvm_perf_overflow_intr(struct perf_event *perf_event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct kvm_pmc *pmc = perf_event->overflow_handler_context;
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ if (!test_and_set_bit(pmc->idx,
+ (unsigned long *)&pmu->reprogram_pmi)) {
+ __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+ kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+
+ /*
+ * Inject PMI. If vcpu was in a guest mode during NMI PMI
+ * can be ejected on a guest mode re-entry. Otherwise we can't
+ * be sure that vcpu wasn't executing hlt instruction at the
+ * time of vmexit and is not going to re-enter guest mode until
+ * woken up. So we should wake it, but this is impossible from
+ * NMI context. Do it from irq work instead.
+ */
+ if (!kvm_is_in_guest())
+ irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
+ else
+ kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
+ }
+}
+
+static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
+ unsigned config, bool exclude_user,
+ bool exclude_kernel, bool intr,
+ bool in_tx, bool in_tx_cp)
+{
+ struct perf_event *event;
+ struct perf_event_attr attr = {
+ .type = type,
+ .size = sizeof(attr),
+ .pinned = true,
+ .exclude_idle = true,
+ .exclude_host = 1,
+ .exclude_user = exclude_user,
+ .exclude_kernel = exclude_kernel,
+ .config = config,
+ };
+
+ attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
+
+ if (in_tx)
+ attr.config |= HSW_IN_TX;
+ if (in_tx_cp) {
+ /*
+ * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
+ * period. Just clear the sample period so at least
+ * allocating the counter doesn't fail.
+ */
+ attr.sample_period = 0;
+ attr.config |= HSW_IN_TX_CHECKPOINTED;
+ }
+
+ event = perf_event_create_kernel_counter(&attr, -1, current,
+ intr ? kvm_perf_overflow_intr :
+ kvm_perf_overflow, pmc);
+ if (IS_ERR(event)) {
+ pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
+ PTR_ERR(event), pmc->idx);
+ return;
+ }
+
+ pmc->perf_event = event;
+ clear_bit(pmc->idx, (unsigned long*)&pmc_to_pmu(pmc)->reprogram_pmi);
+}
+
+void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
+{
+ unsigned config, type = PERF_TYPE_RAW;
+ u8 event_select, unit_mask;
+
+ if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
+ printk_once("kvm pmu: pin control bit is ignored\n");
+
+ pmc->eventsel = eventsel;
+
+ pmc_stop_counter(pmc);
+
+ if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
+ return;
+
+ event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
+ unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
+
+ if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
+ ARCH_PERFMON_EVENTSEL_INV |
+ ARCH_PERFMON_EVENTSEL_CMASK |
+ HSW_IN_TX |
+ HSW_IN_TX_CHECKPOINTED))) {
+ config = kvm_x86_ops->pmu_ops->find_arch_event(pmc_to_pmu(pmc),
+ event_select,
+ unit_mask);
+ if (config != PERF_COUNT_HW_MAX)
+ type = PERF_TYPE_HARDWARE;
+ }
+
+ if (type == PERF_TYPE_RAW)
+ config = eventsel & AMD64_RAW_EVENT_MASK;
+
+ pmc_reprogram_counter(pmc, type, config,
+ !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
+ !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
+ eventsel & ARCH_PERFMON_EVENTSEL_INT,
+ (eventsel & HSW_IN_TX),
+ (eventsel & HSW_IN_TX_CHECKPOINTED));
+}
+EXPORT_SYMBOL_GPL(reprogram_gp_counter);
+
+void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
+{
+ unsigned en_field = ctrl & 0x3;
+ bool pmi = ctrl & 0x8;
+
+ pmc_stop_counter(pmc);
+
+ if (!en_field || !pmc_is_enabled(pmc))
+ return;
+
+ pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
+ kvm_x86_ops->pmu_ops->find_fixed_event(idx),
+ !(en_field & 0x2), /* exclude user */
+ !(en_field & 0x1), /* exclude kernel */
+ pmi, false, false);
+}
+EXPORT_SYMBOL_GPL(reprogram_fixed_counter);
+
+void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
+{
+ struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx);
+
+ if (!pmc)
+ return;
+
+ if (pmc_is_gp(pmc))
+ reprogram_gp_counter(pmc, pmc->eventsel);
+ else {
+ int idx = pmc_idx - INTEL_PMC_IDX_FIXED;
+ u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx);
+
+ reprogram_fixed_counter(pmc, ctrl, idx);
+ }
+}
+EXPORT_SYMBOL_GPL(reprogram_counter);
+
+void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ u64 bitmask;
+ int bit;
+
+ bitmask = pmu->reprogram_pmi;
+
+ for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
+ struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, bit);
+
+ if (unlikely(!pmc || !pmc->perf_event)) {
+ clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
+ continue;
+ }
+
+ reprogram_counter(pmu, bit);
+ }
+}
+
+/* check if idx is a valid index to access PMU */
+int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
+{
+ return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx);
+}
+
+bool is_vmware_backdoor_pmc(u32 pmc_idx)
+{
+ switch (pmc_idx) {
+ case VMWARE_BACKDOOR_PMC_HOST_TSC:
+ case VMWARE_BACKDOOR_PMC_REAL_TIME:
+ case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
+ return true;
+ }
+ return false;
+}
+
+static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
+{
+ u64 ctr_val;
+
+ switch (idx) {
+ case VMWARE_BACKDOOR_PMC_HOST_TSC:
+ ctr_val = rdtsc();
+ break;
+ case VMWARE_BACKDOOR_PMC_REAL_TIME:
+ ctr_val = ktime_get_boot_ns();
+ break;
+ case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
+ ctr_val = ktime_get_boot_ns() +
+ vcpu->kvm->arch.kvmclock_offset;
+ break;
+ default:
+ return 1;
+ }
+
+ *data = ctr_val;
+ return 0;
+}
+
+int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
+{
+ bool fast_mode = idx & (1u << 31);
+ struct kvm_pmc *pmc;
+ u64 mask = fast_mode ? ~0u : ~0ull;
+
+ if (is_vmware_backdoor_pmc(idx))
+ return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
+
+ pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx, &mask);
+ if (!pmc)
+ return 1;
+
+ *data = pmc_read_counter(pmc) & mask;
+ return 0;
+}
+
+void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
+{
+ if (lapic_in_kernel(vcpu))
+ kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
+}
+
+bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ return kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr);
+}
+
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+ return kvm_x86_ops->pmu_ops->get_msr(vcpu, msr, data);
+}
+
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ return kvm_x86_ops->pmu_ops->set_msr(vcpu, msr_info);
+}
+
+/* refresh PMU settings. This function generally is called when underlying
+ * settings are changed (such as changes of PMU CPUID by guest VMs), which
+ * should rarely happen.
+ */
+void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops->pmu_ops->refresh(vcpu);
+}
+
+void kvm_pmu_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ irq_work_sync(&pmu->irq_work);
+ kvm_x86_ops->pmu_ops->reset(vcpu);
+}
+
+void kvm_pmu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ memset(pmu, 0, sizeof(*pmu));
+ kvm_x86_ops->pmu_ops->init(vcpu);
+ init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
+ kvm_pmu_refresh(vcpu);
+}
+
+void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
+{
+ kvm_pmu_reset(vcpu);
+}
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
new file mode 100644
index 000000000..7b4828e50
--- /dev/null
+++ b/arch/x86/kvm/pmu.h
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_PMU_H
+#define __KVM_X86_PMU_H
+
+#include <linux/nospec.h>
+
+#define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu)
+#define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu))
+#define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu)
+
+/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
+#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf)
+
+#define VMWARE_BACKDOOR_PMC_HOST_TSC 0x10000
+#define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001
+#define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002
+
+struct kvm_event_hw_type_mapping {
+ u8 eventsel;
+ u8 unit_mask;
+ unsigned event_type;
+};
+
+struct kvm_pmu_ops {
+ unsigned (*find_arch_event)(struct kvm_pmu *pmu, u8 event_select,
+ u8 unit_mask);
+ unsigned (*find_fixed_event)(int idx);
+ bool (*pmc_is_enabled)(struct kvm_pmc *pmc);
+ struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx);
+ struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, unsigned idx,
+ u64 *mask);
+ int (*is_valid_msr_idx)(struct kvm_vcpu *vcpu, unsigned idx);
+ bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr);
+ int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+ int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+ void (*refresh)(struct kvm_vcpu *vcpu);
+ void (*init)(struct kvm_vcpu *vcpu);
+ void (*reset)(struct kvm_vcpu *vcpu);
+};
+
+static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ return pmu->counter_bitmask[pmc->type];
+}
+
+static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
+{
+ u64 counter, enabled, running;
+
+ counter = pmc->counter;
+ if (pmc->perf_event)
+ counter += perf_event_read_value(pmc->perf_event,
+ &enabled, &running);
+ /* FIXME: Scaling needed? */
+ return counter & pmc_bitmask(pmc);
+}
+
+static inline void pmc_stop_counter(struct kvm_pmc *pmc)
+{
+ if (pmc->perf_event) {
+ pmc->counter = pmc_read_counter(pmc);
+ perf_event_release_kernel(pmc->perf_event);
+ pmc->perf_event = NULL;
+ }
+}
+
+static inline bool pmc_is_gp(struct kvm_pmc *pmc)
+{
+ return pmc->type == KVM_PMC_GP;
+}
+
+static inline bool pmc_is_fixed(struct kvm_pmc *pmc)
+{
+ return pmc->type == KVM_PMC_FIXED;
+}
+
+static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
+{
+ return kvm_x86_ops->pmu_ops->pmc_is_enabled(pmc);
+}
+
+/* returns general purpose PMC with the specified MSR. Note that it can be
+ * used for both PERFCTRn and EVNTSELn; that is why it accepts base as a
+ * paramenter to tell them apart.
+ */
+static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
+ u32 base)
+{
+ if (msr >= base && msr < base + pmu->nr_arch_gp_counters) {
+ u32 index = array_index_nospec(msr - base,
+ pmu->nr_arch_gp_counters);
+
+ return &pmu->gp_counters[index];
+ }
+
+ return NULL;
+}
+
+/* returns fixed PMC with the specified MSR */
+static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
+{
+ int base = MSR_CORE_PERF_FIXED_CTR0;
+
+ if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) {
+ u32 index = array_index_nospec(msr - base,
+ pmu->nr_arch_fixed_counters);
+
+ return &pmu->fixed_counters[index];
+ }
+
+ return NULL;
+}
+
+void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel);
+void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx);
+void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx);
+
+void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
+void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
+int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
+int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx);
+bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr);
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+void kvm_pmu_refresh(struct kvm_vcpu *vcpu);
+void kvm_pmu_reset(struct kvm_vcpu *vcpu);
+void kvm_pmu_init(struct kvm_vcpu *vcpu);
+void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
+
+bool is_vmware_backdoor_pmc(u32 pmc_idx);
+
+extern struct kvm_pmu_ops intel_pmu_ops;
+extern struct kvm_pmu_ops amd_pmu_ops;
+#endif /* __KVM_X86_PMU_H */
diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c
new file mode 100644
index 000000000..93a135f21
--- /dev/null
+++ b/arch/x86/kvm/pmu_amd.c
@@ -0,0 +1,317 @@
+/*
+ * KVM PMU support for AMD
+ *
+ * Copyright 2015, Red Hat, Inc. and/or its affiliates.
+ *
+ * Author:
+ * Wei Huang <wei@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Implementation is based on pmu_intel.c file
+ */
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "pmu.h"
+
+enum pmu_type {
+ PMU_TYPE_COUNTER = 0,
+ PMU_TYPE_EVNTSEL,
+};
+
+enum index {
+ INDEX_ZERO = 0,
+ INDEX_ONE,
+ INDEX_TWO,
+ INDEX_THREE,
+ INDEX_FOUR,
+ INDEX_FIVE,
+ INDEX_ERROR,
+};
+
+/* duplicated from amd_perfmon_event_map, K7 and above should work. */
+static struct kvm_event_hw_type_mapping amd_event_mapping[] = {
+ [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES },
+ [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
+ [2] = { 0x7d, 0x07, PERF_COUNT_HW_CACHE_REFERENCES },
+ [3] = { 0x7e, 0x07, PERF_COUNT_HW_CACHE_MISSES },
+ [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
+ [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+ [6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
+ [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
+};
+
+static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type)
+{
+ struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
+ if (type == PMU_TYPE_COUNTER)
+ return MSR_F15H_PERF_CTR;
+ else
+ return MSR_F15H_PERF_CTL;
+ } else {
+ if (type == PMU_TYPE_COUNTER)
+ return MSR_K7_PERFCTR0;
+ else
+ return MSR_K7_EVNTSEL0;
+ }
+}
+
+static enum index msr_to_index(u32 msr)
+{
+ switch (msr) {
+ case MSR_F15H_PERF_CTL0:
+ case MSR_F15H_PERF_CTR0:
+ case MSR_K7_EVNTSEL0:
+ case MSR_K7_PERFCTR0:
+ return INDEX_ZERO;
+ case MSR_F15H_PERF_CTL1:
+ case MSR_F15H_PERF_CTR1:
+ case MSR_K7_EVNTSEL1:
+ case MSR_K7_PERFCTR1:
+ return INDEX_ONE;
+ case MSR_F15H_PERF_CTL2:
+ case MSR_F15H_PERF_CTR2:
+ case MSR_K7_EVNTSEL2:
+ case MSR_K7_PERFCTR2:
+ return INDEX_TWO;
+ case MSR_F15H_PERF_CTL3:
+ case MSR_F15H_PERF_CTR3:
+ case MSR_K7_EVNTSEL3:
+ case MSR_K7_PERFCTR3:
+ return INDEX_THREE;
+ case MSR_F15H_PERF_CTL4:
+ case MSR_F15H_PERF_CTR4:
+ return INDEX_FOUR;
+ case MSR_F15H_PERF_CTL5:
+ case MSR_F15H_PERF_CTR5:
+ return INDEX_FIVE;
+ default:
+ return INDEX_ERROR;
+ }
+}
+
+static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
+ enum pmu_type type)
+{
+ switch (msr) {
+ case MSR_F15H_PERF_CTL0:
+ case MSR_F15H_PERF_CTL1:
+ case MSR_F15H_PERF_CTL2:
+ case MSR_F15H_PERF_CTL3:
+ case MSR_F15H_PERF_CTL4:
+ case MSR_F15H_PERF_CTL5:
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+ if (type != PMU_TYPE_EVNTSEL)
+ return NULL;
+ break;
+ case MSR_F15H_PERF_CTR0:
+ case MSR_F15H_PERF_CTR1:
+ case MSR_F15H_PERF_CTR2:
+ case MSR_F15H_PERF_CTR3:
+ case MSR_F15H_PERF_CTR4:
+ case MSR_F15H_PERF_CTR5:
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+ if (type != PMU_TYPE_COUNTER)
+ return NULL;
+ break;
+ default:
+ return NULL;
+ }
+
+ return &pmu->gp_counters[msr_to_index(msr)];
+}
+
+static unsigned amd_find_arch_event(struct kvm_pmu *pmu,
+ u8 event_select,
+ u8 unit_mask)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++)
+ if (amd_event_mapping[i].eventsel == event_select
+ && amd_event_mapping[i].unit_mask == unit_mask)
+ break;
+
+ if (i == ARRAY_SIZE(amd_event_mapping))
+ return PERF_COUNT_HW_MAX;
+
+ return amd_event_mapping[i].event_type;
+}
+
+/* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */
+static unsigned amd_find_fixed_event(int idx)
+{
+ return PERF_COUNT_HW_MAX;
+}
+
+/* check if a PMC is enabled by comparing it against global_ctrl bits. Because
+ * AMD CPU doesn't have global_ctrl MSR, all PMCs are enabled (return TRUE).
+ */
+static bool amd_pmc_is_enabled(struct kvm_pmc *pmc)
+{
+ return true;
+}
+
+static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
+{
+ unsigned int base = get_msr_base(pmu, PMU_TYPE_COUNTER);
+ struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
+ /*
+ * The idx is contiguous. The MSRs are not. The counter MSRs
+ * are interleaved with the event select MSRs.
+ */
+ pmc_idx *= 2;
+ }
+
+ return get_gp_pmc_amd(pmu, base + pmc_idx, PMU_TYPE_COUNTER);
+}
+
+/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
+static int amd_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ idx &= ~(3u << 30);
+
+ return (idx >= pmu->nr_arch_gp_counters);
+}
+
+/* idx is the ECX register of RDPMC instruction */
+static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *mask)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *counters;
+
+ idx &= ~(3u << 30);
+ if (idx >= pmu->nr_arch_gp_counters)
+ return NULL;
+ counters = pmu->gp_counters;
+
+ return &counters[idx];
+}
+
+static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int ret = false;
+
+ ret = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER) ||
+ get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+
+ return ret;
+}
+
+static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+
+ /* MSR_PERFCTRn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
+ if (pmc) {
+ *data = pmc_read_counter(pmc);
+ return 0;
+ }
+ /* MSR_EVNTSELn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+ if (pmc) {
+ *data = pmc->eventsel;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+
+ /* MSR_PERFCTRn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
+ if (pmc) {
+ pmc->counter += data - pmc_read_counter(pmc);
+ return 0;
+ }
+ /* MSR_EVNTSELn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+ if (pmc) {
+ data &= ~pmu->reserved_bits;
+ if (data != pmc->eventsel)
+ reprogram_gp_counter(pmc, data);
+ return 0;
+ }
+
+ return 1;
+}
+
+static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE))
+ pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;
+ else
+ pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
+
+ pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
+ pmu->reserved_bits = 0xffffffff00200000ull;
+ /* not applicable to AMD; but clean them to prevent any fall out */
+ pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+ pmu->nr_arch_fixed_counters = 0;
+ pmu->version = 0;
+ pmu->global_status = 0;
+}
+
+static void amd_pmu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int i;
+
+ BUILD_BUG_ON(AMD64_NUM_COUNTERS_CORE > INTEL_PMC_MAX_GENERIC);
+
+ for (i = 0; i < AMD64_NUM_COUNTERS_CORE ; i++) {
+ pmu->gp_counters[i].type = KVM_PMC_GP;
+ pmu->gp_counters[i].vcpu = vcpu;
+ pmu->gp_counters[i].idx = i;
+ }
+}
+
+static void amd_pmu_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int i;
+
+ for (i = 0; i < AMD64_NUM_COUNTERS_CORE; i++) {
+ struct kvm_pmc *pmc = &pmu->gp_counters[i];
+
+ pmc_stop_counter(pmc);
+ pmc->counter = pmc->eventsel = 0;
+ }
+}
+
+struct kvm_pmu_ops amd_pmu_ops = {
+ .find_arch_event = amd_find_arch_event,
+ .find_fixed_event = amd_find_fixed_event,
+ .pmc_is_enabled = amd_pmc_is_enabled,
+ .pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
+ .msr_idx_to_pmc = amd_msr_idx_to_pmc,
+ .is_valid_msr_idx = amd_is_valid_msr_idx,
+ .is_valid_msr = amd_is_valid_msr,
+ .get_msr = amd_pmu_get_msr,
+ .set_msr = amd_pmu_set_msr,
+ .refresh = amd_pmu_refresh,
+ .init = amd_pmu_init,
+ .reset = amd_pmu_reset,
+};
diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c
new file mode 100644
index 000000000..611f9e60f
--- /dev/null
+++ b/arch/x86/kvm/pmu_intel.c
@@ -0,0 +1,374 @@
+/*
+ * KVM PMU support for Intel CPUs
+ *
+ * Copyright 2011 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Avi Kivity <avi@redhat.com>
+ * Gleb Natapov <gleb@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include <asm/perf_event.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "pmu.h"
+
+static struct kvm_event_hw_type_mapping intel_arch_events[] = {
+ /* Index must match CPUID 0x0A.EBX bit vector */
+ [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
+ [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
+ [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
+ [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
+ [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
+ [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
+ [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+ [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES },
+};
+
+/* mapping between fixed pmc index and intel_arch_events array */
+static int fixed_pmc_events[] = {1, 0, 7};
+
+static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
+{
+ int i;
+
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+ u8 new_ctrl = fixed_ctrl_field(data, i);
+ u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i);
+ struct kvm_pmc *pmc;
+
+ pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
+
+ if (old_ctrl == new_ctrl)
+ continue;
+
+ reprogram_fixed_counter(pmc, new_ctrl, i);
+ }
+
+ pmu->fixed_ctr_ctrl = data;
+}
+
+/* function is called when global control register has been updated. */
+static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
+{
+ int bit;
+ u64 diff = pmu->global_ctrl ^ data;
+
+ pmu->global_ctrl = data;
+
+ for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
+ reprogram_counter(pmu, bit);
+}
+
+static unsigned intel_find_arch_event(struct kvm_pmu *pmu,
+ u8 event_select,
+ u8 unit_mask)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++)
+ if (intel_arch_events[i].eventsel == event_select
+ && intel_arch_events[i].unit_mask == unit_mask
+ && (pmu->available_event_types & (1 << i)))
+ break;
+
+ if (i == ARRAY_SIZE(intel_arch_events))
+ return PERF_COUNT_HW_MAX;
+
+ return intel_arch_events[i].event_type;
+}
+
+static unsigned intel_find_fixed_event(int idx)
+{
+ u32 event;
+ size_t size = ARRAY_SIZE(fixed_pmc_events);
+
+ if (idx >= size)
+ return PERF_COUNT_HW_MAX;
+
+ event = fixed_pmc_events[array_index_nospec(idx, size)];
+ return intel_arch_events[event].event_type;
+}
+
+/* check if a PMC is enabled by comparing it with globl_ctrl bits. */
+static bool intel_pmc_is_enabled(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
+}
+
+static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
+{
+ if (pmc_idx < INTEL_PMC_IDX_FIXED)
+ return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx,
+ MSR_P6_EVNTSEL0);
+ else {
+ u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED;
+
+ return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0);
+ }
+}
+
+/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
+static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ bool fixed = idx & (1u << 30);
+
+ idx &= ~(3u << 30);
+
+ return (!fixed && idx >= pmu->nr_arch_gp_counters) ||
+ (fixed && idx >= pmu->nr_arch_fixed_counters);
+}
+
+static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu,
+ unsigned idx, u64 *mask)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ bool fixed = idx & (1u << 30);
+ struct kvm_pmc *counters;
+ unsigned int num_counters;
+
+ idx &= ~(3u << 30);
+ if (fixed) {
+ counters = pmu->fixed_counters;
+ num_counters = pmu->nr_arch_fixed_counters;
+ } else {
+ counters = pmu->gp_counters;
+ num_counters = pmu->nr_arch_gp_counters;
+ }
+ if (idx >= num_counters)
+ return NULL;
+ *mask &= pmu->counter_bitmask[fixed ? KVM_PMC_FIXED : KVM_PMC_GP];
+ return &counters[array_index_nospec(idx, num_counters)];
+}
+
+static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int ret;
+
+ switch (msr) {
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ ret = pmu->version > 1;
+ break;
+ default:
+ ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
+ get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
+ get_fixed_pmc(pmu, msr);
+ break;
+ }
+
+ return ret;
+}
+
+static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+
+ switch (msr) {
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ *data = pmu->fixed_ctr_ctrl;
+ return 0;
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ *data = pmu->global_status;
+ return 0;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ *data = pmu->global_ctrl;
+ return 0;
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ *data = pmu->global_ovf_ctrl;
+ return 0;
+ default:
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
+ u64 val = pmc_read_counter(pmc);
+ *data = val & pmu->counter_bitmask[KVM_PMC_GP];
+ return 0;
+ } else if ((pmc = get_fixed_pmc(pmu, msr))) {
+ u64 val = pmc_read_counter(pmc);
+ *data = val & pmu->counter_bitmask[KVM_PMC_FIXED];
+ return 0;
+ } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
+ *data = pmc->eventsel;
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+
+ switch (msr) {
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ if (pmu->fixed_ctr_ctrl == data)
+ return 0;
+ if (!(data & 0xfffffffffffff444ull)) {
+ reprogram_fixed_counters(pmu, data);
+ return 0;
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ if (msr_info->host_initiated) {
+ pmu->global_status = data;
+ return 0;
+ }
+ break; /* RO MSR */
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ if (pmu->global_ctrl == data)
+ return 0;
+ if (!(data & pmu->global_ctrl_mask)) {
+ global_ctrl_changed(pmu, data);
+ return 0;
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
+ if (!msr_info->host_initiated)
+ pmu->global_status &= ~data;
+ pmu->global_ovf_ctrl = data;
+ return 0;
+ }
+ break;
+ default:
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
+ if (msr_info->host_initiated)
+ pmc->counter = data;
+ else
+ pmc->counter = (s32)data;
+ return 0;
+ } else if ((pmc = get_fixed_pmc(pmu, msr))) {
+ pmc->counter = data;
+ return 0;
+ } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
+ if (data == pmc->eventsel)
+ return 0;
+ if (!(data & pmu->reserved_bits)) {
+ reprogram_gp_counter(pmc, data);
+ return 0;
+ }
+ }
+ }
+
+ return 1;
+}
+
+static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_cpuid_entry2 *entry;
+ union cpuid10_eax eax;
+ union cpuid10_edx edx;
+
+ pmu->nr_arch_gp_counters = 0;
+ pmu->nr_arch_fixed_counters = 0;
+ pmu->counter_bitmask[KVM_PMC_GP] = 0;
+ pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+ pmu->version = 0;
+ pmu->reserved_bits = 0xffffffff00200000ull;
+
+ entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
+ if (!entry)
+ return;
+ eax.full = entry->eax;
+ edx.full = entry->edx;
+
+ pmu->version = eax.split.version_id;
+ if (!pmu->version)
+ return;
+
+ pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
+ INTEL_PMC_MAX_GENERIC);
+ pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1;
+ pmu->available_event_types = ~entry->ebx &
+ ((1ull << eax.split.mask_length) - 1);
+
+ if (pmu->version == 1) {
+ pmu->nr_arch_fixed_counters = 0;
+ } else {
+ pmu->nr_arch_fixed_counters =
+ min_t(int, edx.split.num_counters_fixed,
+ INTEL_PMC_MAX_FIXED);
+ pmu->counter_bitmask[KVM_PMC_FIXED] =
+ ((u64)1 << edx.split.bit_width_fixed) - 1;
+ }
+
+ pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) |
+ (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
+ pmu->global_ctrl_mask = ~pmu->global_ctrl;
+
+ entry = kvm_find_cpuid_entry(vcpu, 7, 0);
+ if (entry &&
+ (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
+ (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
+ pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
+}
+
+static void intel_pmu_init(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
+ pmu->gp_counters[i].type = KVM_PMC_GP;
+ pmu->gp_counters[i].vcpu = vcpu;
+ pmu->gp_counters[i].idx = i;
+ }
+
+ for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
+ pmu->fixed_counters[i].type = KVM_PMC_FIXED;
+ pmu->fixed_counters[i].vcpu = vcpu;
+ pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
+ }
+}
+
+static void intel_pmu_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int i;
+
+ for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
+ struct kvm_pmc *pmc = &pmu->gp_counters[i];
+
+ pmc_stop_counter(pmc);
+ pmc->counter = pmc->eventsel = 0;
+ }
+
+ for (i = 0; i < INTEL_PMC_MAX_FIXED; i++)
+ pmc_stop_counter(&pmu->fixed_counters[i]);
+
+ pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
+ pmu->global_ovf_ctrl = 0;
+}
+
+struct kvm_pmu_ops intel_pmu_ops = {
+ .find_arch_event = intel_find_arch_event,
+ .find_fixed_event = intel_find_fixed_event,
+ .pmc_is_enabled = intel_pmc_is_enabled,
+ .pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
+ .msr_idx_to_pmc = intel_msr_idx_to_pmc,
+ .is_valid_msr_idx = intel_is_valid_msr_idx,
+ .is_valid_msr = intel_is_valid_msr,
+ .get_msr = intel_pmu_get_msr,
+ .set_msr = intel_pmu_set_msr,
+ .refresh = intel_pmu_refresh,
+ .init = intel_pmu_init,
+ .reset = intel_pmu_reset,
+};
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
new file mode 100644
index 000000000..851814574
--- /dev/null
+++ b/arch/x86/kvm/svm.c
@@ -0,0 +1,7323 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#define pr_fmt(fmt) "SVM: " fmt
+
+#include <linux/kvm_host.h>
+
+#include "irq.h"
+#include "mmu.h"
+#include "kvm_cache_regs.h"
+#include "x86.h"
+#include "cpuid.h"
+#include "pmu.h"
+
+#include <linux/module.h>
+#include <linux/mod_devicetable.h>
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/sched.h>
+#include <linux/trace_events.h>
+#include <linux/slab.h>
+#include <linux/amd-iommu.h>
+#include <linux/hashtable.h>
+#include <linux/frame.h>
+#include <linux/psp-sev.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+
+#include <asm/apic.h>
+#include <asm/perf_event.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+#include <asm/debugreg.h>
+#include <asm/kvm_para.h>
+#include <asm/irq_remapping.h>
+#include <asm/spec-ctrl.h>
+
+#include <asm/virtext.h>
+#include "trace.h"
+
+#define __ex(x) __kvm_handle_fault_on_reboot(x)
+
+MODULE_AUTHOR("Qumranet");
+MODULE_LICENSE("GPL");
+
+static const struct x86_cpu_id svm_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_SVM),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
+
+#define IOPM_ALLOC_ORDER 2
+#define MSRPM_ALLOC_ORDER 1
+
+#define SEG_TYPE_LDT 2
+#define SEG_TYPE_BUSY_TSS16 3
+
+#define SVM_FEATURE_NPT (1 << 0)
+#define SVM_FEATURE_LBRV (1 << 1)
+#define SVM_FEATURE_SVML (1 << 2)
+#define SVM_FEATURE_NRIP (1 << 3)
+#define SVM_FEATURE_TSC_RATE (1 << 4)
+#define SVM_FEATURE_VMCB_CLEAN (1 << 5)
+#define SVM_FEATURE_FLUSH_ASID (1 << 6)
+#define SVM_FEATURE_DECODE_ASSIST (1 << 7)
+#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
+
+#define SVM_AVIC_DOORBELL 0xc001011b
+
+#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
+#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
+#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */
+
+#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
+
+#define TSC_RATIO_RSVD 0xffffff0000000000ULL
+#define TSC_RATIO_MIN 0x0000000000000001ULL
+#define TSC_RATIO_MAX 0x000000ffffffffffULL
+
+#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
+
+/*
+ * 0xff is broadcast, so the max index allowed for physical APIC ID
+ * table is 0xfe. APIC IDs above 0xff are reserved.
+ */
+#define AVIC_MAX_PHYSICAL_ID_COUNT 255
+
+#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1
+#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0
+#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF
+
+/* AVIC GATAG is encoded using VM and VCPU IDs */
+#define AVIC_VCPU_ID_BITS 8
+#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1)
+
+#define AVIC_VM_ID_BITS 24
+#define AVIC_VM_ID_NR (1 << AVIC_VM_ID_BITS)
+#define AVIC_VM_ID_MASK ((1 << AVIC_VM_ID_BITS) - 1)
+
+#define AVIC_GATAG(x, y) (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
+ (y & AVIC_VCPU_ID_MASK))
+#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
+#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK)
+
+static bool erratum_383_found __read_mostly;
+
+static const u32 host_save_user_msrs[] = {
+#ifdef CONFIG_X86_64
+ MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
+ MSR_FS_BASE,
+#endif
+ MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+ MSR_TSC_AUX,
+};
+
+#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
+
+struct kvm_sev_info {
+ bool active; /* SEV enabled guest */
+ unsigned int asid; /* ASID used for this guest */
+ unsigned int handle; /* SEV firmware handle */
+ int fd; /* SEV device fd */
+ unsigned long pages_locked; /* Number of pages locked */
+ struct list_head regions_list; /* List of registered regions */
+};
+
+struct kvm_svm {
+ struct kvm kvm;
+
+ /* Struct members for AVIC */
+ u32 avic_vm_id;
+ u32 ldr_mode;
+ struct page *avic_logical_id_table_page;
+ struct page *avic_physical_id_table_page;
+ struct hlist_node hnode;
+
+ struct kvm_sev_info sev_info;
+};
+
+struct kvm_vcpu;
+
+struct nested_state {
+ struct vmcb *hsave;
+ u64 hsave_msr;
+ u64 vm_cr_msr;
+ u64 vmcb;
+
+ /* These are the merged vectors */
+ u32 *msrpm;
+
+ /* gpa pointers to the real vectors */
+ u64 vmcb_msrpm;
+ u64 vmcb_iopm;
+
+ /* A VMEXIT is required but not yet emulated */
+ bool exit_required;
+
+ /* cache for intercepts of the guest */
+ u32 intercept_cr;
+ u32 intercept_dr;
+ u32 intercept_exceptions;
+ u64 intercept;
+
+ /* Nested Paging related state */
+ u64 nested_cr3;
+};
+
+#define MSRPM_OFFSETS 16
+static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
+
+/*
+ * Set osvw_len to higher value when updated Revision Guides
+ * are published and we know what the new status bits are
+ */
+static uint64_t osvw_len = 4, osvw_status;
+
+struct vcpu_svm {
+ struct kvm_vcpu vcpu;
+ struct vmcb *vmcb;
+ unsigned long vmcb_pa;
+ struct svm_cpu_data *svm_data;
+ uint64_t asid_generation;
+ uint64_t sysenter_esp;
+ uint64_t sysenter_eip;
+ uint64_t tsc_aux;
+
+ u64 msr_decfg;
+
+ u64 next_rip;
+
+ u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
+ struct {
+ u16 fs;
+ u16 gs;
+ u16 ldt;
+ u64 gs_base;
+ } host;
+
+ u64 spec_ctrl;
+ /*
+ * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be
+ * translated into the appropriate L2_CFG bits on the host to
+ * perform speculative control.
+ */
+ u64 virt_spec_ctrl;
+
+ u32 *msrpm;
+
+ ulong nmi_iret_rip;
+
+ struct nested_state nested;
+
+ bool nmi_singlestep;
+ u64 nmi_singlestep_guest_rflags;
+
+ unsigned int3_injected;
+ unsigned long int3_rip;
+
+ /* cached guest cpuid flags for faster access */
+ bool nrips_enabled : 1;
+
+ u32 ldr_reg;
+ struct page *avic_backing_page;
+ u64 *avic_physical_id_cache;
+ bool avic_is_running;
+
+ /*
+ * Per-vcpu list of struct amd_svm_iommu_ir:
+ * This is used mainly to store interrupt remapping information used
+ * when update the vcpu affinity. This avoids the need to scan for
+ * IRTE and try to match ga_tag in the IOMMU driver.
+ */
+ struct list_head ir_list;
+ spinlock_t ir_list_lock;
+
+ /* which host CPU was used for running this vcpu */
+ unsigned int last_cpu;
+};
+
+/*
+ * This is a wrapper of struct amd_iommu_ir_data.
+ */
+struct amd_svm_iommu_ir {
+ struct list_head node; /* Used by SVM for per-vcpu ir_list */
+ void *data; /* Storing pointer to struct amd_ir_data */
+};
+
+#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
+#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
+
+#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL)
+#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
+#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
+#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
+
+static DEFINE_PER_CPU(u64, current_tsc_ratio);
+#define TSC_RATIO_DEFAULT 0x0100000000ULL
+
+#define MSR_INVALID 0xffffffffU
+
+static const struct svm_direct_access_msrs {
+ u32 index; /* Index of the MSR */
+ bool always; /* True if intercept is always on */
+} direct_access_msrs[] = {
+ { .index = MSR_STAR, .always = true },
+ { .index = MSR_IA32_SYSENTER_CS, .always = true },
+#ifdef CONFIG_X86_64
+ { .index = MSR_GS_BASE, .always = true },
+ { .index = MSR_FS_BASE, .always = true },
+ { .index = MSR_KERNEL_GS_BASE, .always = true },
+ { .index = MSR_LSTAR, .always = true },
+ { .index = MSR_CSTAR, .always = true },
+ { .index = MSR_SYSCALL_MASK, .always = true },
+#endif
+ { .index = MSR_IA32_SPEC_CTRL, .always = false },
+ { .index = MSR_IA32_PRED_CMD, .always = false },
+ { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
+ { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
+ { .index = MSR_IA32_LASTINTFROMIP, .always = false },
+ { .index = MSR_IA32_LASTINTTOIP, .always = false },
+ { .index = MSR_INVALID, .always = false },
+};
+
+/* enable NPT for AMD64 and X86 with PAE */
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+static bool npt_enabled = true;
+#else
+static bool npt_enabled;
+#endif
+
+/*
+ * These 2 parameters are used to config the controls for Pause-Loop Exiting:
+ * pause_filter_count: On processors that support Pause filtering(indicated
+ * by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
+ * count value. On VMRUN this value is loaded into an internal counter.
+ * Each time a pause instruction is executed, this counter is decremented
+ * until it reaches zero at which time a #VMEXIT is generated if pause
+ * intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
+ * Intercept Filtering for more details.
+ * This also indicate if ple logic enabled.
+ *
+ * pause_filter_thresh: In addition, some processor families support advanced
+ * pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound on
+ * the amount of time a guest is allowed to execute in a pause loop.
+ * In this mode, a 16-bit pause filter threshold field is added in the
+ * VMCB. The threshold value is a cycle count that is used to reset the
+ * pause counter. As with simple pause filtering, VMRUN loads the pause
+ * count value from VMCB into an internal counter. Then, on each pause
+ * instruction the hardware checks the elapsed number of cycles since
+ * the most recent pause instruction against the pause filter threshold.
+ * If the elapsed cycle count is greater than the pause filter threshold,
+ * then the internal pause count is reloaded from the VMCB and execution
+ * continues. If the elapsed cycle count is less than the pause filter
+ * threshold, then the internal pause count is decremented. If the count
+ * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
+ * triggered. If advanced pause filtering is supported and pause filter
+ * threshold field is set to zero, the filter will operate in the simpler,
+ * count only mode.
+ */
+
+static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
+module_param(pause_filter_thresh, ushort, 0444);
+
+static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
+module_param(pause_filter_count, ushort, 0444);
+
+/* Default doubles per-vcpu window every exit. */
+static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
+module_param(pause_filter_count_grow, ushort, 0444);
+
+/* Default resets per-vcpu window every exit to pause_filter_count. */
+static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
+module_param(pause_filter_count_shrink, ushort, 0444);
+
+/* Default is to compute the maximum so we can never overflow. */
+static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
+module_param(pause_filter_count_max, ushort, 0444);
+
+/* allow nested paging (virtualized MMU) for all guests */
+static int npt = true;
+module_param(npt, int, S_IRUGO);
+
+/* allow nested virtualization in KVM/SVM */
+static int nested = true;
+module_param(nested, int, S_IRUGO);
+
+/* enable / disable AVIC */
+static int avic;
+#ifdef CONFIG_X86_LOCAL_APIC
+module_param(avic, int, S_IRUGO);
+#endif
+
+/* enable/disable Virtual VMLOAD VMSAVE */
+static int vls = true;
+module_param(vls, int, 0444);
+
+/* enable/disable Virtual GIF */
+static int vgif = true;
+module_param(vgif, int, 0444);
+
+/* enable/disable SEV support */
+static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
+module_param(sev, int, 0444);
+
+static u8 rsm_ins_bytes[] = "\x0f\xaa";
+
+static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
+static void svm_complete_interrupts(struct vcpu_svm *svm);
+
+static int nested_svm_exit_handled(struct vcpu_svm *svm);
+static int nested_svm_intercept(struct vcpu_svm *svm);
+static int nested_svm_vmexit(struct vcpu_svm *svm);
+static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
+ bool has_error_code, u32 error_code);
+
+enum {
+ VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
+ pause filter count */
+ VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */
+ VMCB_ASID, /* ASID */
+ VMCB_INTR, /* int_ctl, int_vector */
+ VMCB_NPT, /* npt_en, nCR3, gPAT */
+ VMCB_CR, /* CR0, CR3, CR4, EFER */
+ VMCB_DR, /* DR6, DR7 */
+ VMCB_DT, /* GDT, IDT */
+ VMCB_SEG, /* CS, DS, SS, ES, CPL */
+ VMCB_CR2, /* CR2 only */
+ VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
+ VMCB_AVIC, /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE,
+ * AVIC PHYSICAL_TABLE pointer,
+ * AVIC LOGICAL_TABLE pointer
+ */
+ VMCB_DIRTY_MAX,
+};
+
+/* TPR and CR2 are always written before VMRUN */
+#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
+
+#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
+
+static unsigned int max_sev_asid;
+static unsigned int min_sev_asid;
+static unsigned long *sev_asid_bitmap;
+#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
+
+struct enc_region {
+ struct list_head list;
+ unsigned long npages;
+ struct page **pages;
+ unsigned long uaddr;
+ unsigned long size;
+};
+
+
+static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
+{
+ return container_of(kvm, struct kvm_svm, kvm);
+}
+
+static inline bool svm_sev_enabled(void)
+{
+ return IS_ENABLED(CONFIG_KVM_AMD_SEV) ? max_sev_asid : 0;
+}
+
+static inline bool sev_guest(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_AMD_SEV
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return sev->active;
+#else
+ return false;
+#endif
+}
+
+static inline int sev_get_asid(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return sev->asid;
+}
+
+static inline void mark_all_dirty(struct vmcb *vmcb)
+{
+ vmcb->control.clean = 0;
+}
+
+static inline void mark_all_clean(struct vmcb *vmcb)
+{
+ vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
+ & ~VMCB_ALWAYS_DIRTY_MASK;
+}
+
+static inline void mark_dirty(struct vmcb *vmcb, int bit)
+{
+ vmcb->control.clean &= ~(1 << bit);
+}
+
+static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
+{
+ return container_of(vcpu, struct vcpu_svm, vcpu);
+}
+
+static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
+{
+ svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
+ mark_dirty(svm->vmcb, VMCB_AVIC);
+}
+
+static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 *entry = svm->avic_physical_id_cache;
+
+ if (!entry)
+ return false;
+
+ return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+}
+
+static void recalc_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *c, *h;
+ struct nested_state *g;
+
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ if (!is_guest_mode(&svm->vcpu))
+ return;
+
+ c = &svm->vmcb->control;
+ h = &svm->nested.hsave->control;
+ g = &svm->nested;
+
+ c->intercept_cr = h->intercept_cr | g->intercept_cr;
+ c->intercept_dr = h->intercept_dr | g->intercept_dr;
+ c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
+ c->intercept = h->intercept | g->intercept;
+
+ c->intercept |= (1ULL << INTERCEPT_VMLOAD);
+ c->intercept |= (1ULL << INTERCEPT_VMSAVE);
+}
+
+static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
+{
+ if (is_guest_mode(&svm->vcpu))
+ return svm->nested.hsave;
+ else
+ return svm->vmcb;
+}
+
+static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_cr |= (1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_cr &= ~(1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ return vmcb->control.intercept_cr & (1U << bit);
+}
+
+static inline void set_dr_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
+ | (1 << INTERCEPT_DR1_READ)
+ | (1 << INTERCEPT_DR2_READ)
+ | (1 << INTERCEPT_DR3_READ)
+ | (1 << INTERCEPT_DR4_READ)
+ | (1 << INTERCEPT_DR5_READ)
+ | (1 << INTERCEPT_DR6_READ)
+ | (1 << INTERCEPT_DR7_READ)
+ | (1 << INTERCEPT_DR0_WRITE)
+ | (1 << INTERCEPT_DR1_WRITE)
+ | (1 << INTERCEPT_DR2_WRITE)
+ | (1 << INTERCEPT_DR3_WRITE)
+ | (1 << INTERCEPT_DR4_WRITE)
+ | (1 << INTERCEPT_DR5_WRITE)
+ | (1 << INTERCEPT_DR6_WRITE)
+ | (1 << INTERCEPT_DR7_WRITE);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_dr_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_dr = 0;
+
+ recalc_intercepts(svm);
+}
+
+static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_exceptions |= (1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_exceptions &= ~(1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void set_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept |= (1ULL << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept &= ~(1ULL << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline bool vgif_enabled(struct vcpu_svm *svm)
+{
+ return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
+}
+
+static inline void enable_gif(struct vcpu_svm *svm)
+{
+ if (vgif_enabled(svm))
+ svm->vmcb->control.int_ctl |= V_GIF_MASK;
+ else
+ svm->vcpu.arch.hflags |= HF_GIF_MASK;
+}
+
+static inline void disable_gif(struct vcpu_svm *svm)
+{
+ if (vgif_enabled(svm))
+ svm->vmcb->control.int_ctl &= ~V_GIF_MASK;
+ else
+ svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
+}
+
+static inline bool gif_set(struct vcpu_svm *svm)
+{
+ if (vgif_enabled(svm))
+ return !!(svm->vmcb->control.int_ctl & V_GIF_MASK);
+ else
+ return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
+}
+
+static unsigned long iopm_base;
+
+struct kvm_ldttss_desc {
+ u16 limit0;
+ u16 base0;
+ unsigned base1:8, type:5, dpl:2, p:1;
+ unsigned limit1:4, zero0:3, g:1, base2:8;
+ u32 base3;
+ u32 zero1;
+} __attribute__((packed));
+
+struct svm_cpu_data {
+ int cpu;
+
+ u64 asid_generation;
+ u32 max_asid;
+ u32 next_asid;
+ u32 min_asid;
+ struct kvm_ldttss_desc *tss_desc;
+
+ struct page *save_area;
+ struct vmcb *current_vmcb;
+
+ /* index = sev_asid, value = vmcb pointer */
+ struct vmcb **sev_vmcbs;
+};
+
+static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
+
+struct svm_init_data {
+ int cpu;
+ int r;
+};
+
+static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
+
+#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
+#define MSRS_RANGE_SIZE 2048
+#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
+
+static u32 svm_msrpm_offset(u32 msr)
+{
+ u32 offset;
+ int i;
+
+ for (i = 0; i < NUM_MSR_MAPS; i++) {
+ if (msr < msrpm_ranges[i] ||
+ msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
+ continue;
+
+ offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
+ offset += (i * MSRS_RANGE_SIZE); /* add range offset */
+
+ /* Now we have the u8 offset - but need the u32 offset */
+ return offset / 4;
+ }
+
+ /* MSR not in any range */
+ return MSR_INVALID;
+}
+
+#define MAX_INST_SIZE 15
+
+static inline void clgi(void)
+{
+ asm volatile (__ex(SVM_CLGI));
+}
+
+static inline void stgi(void)
+{
+ asm volatile (__ex(SVM_STGI));
+}
+
+static inline void invlpga(unsigned long addr, u32 asid)
+{
+ asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
+}
+
+static int get_npt_level(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ return PT64_ROOT_4LEVEL;
+#else
+ return PT32E_ROOT_LEVEL;
+#endif
+}
+
+static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ vcpu->arch.efer = efer;
+
+ if (!npt_enabled) {
+ /* Shadow paging assumes NX to be available. */
+ efer |= EFER_NX;
+
+ if (!(efer & EFER_LMA))
+ efer &= ~EFER_LME;
+ }
+
+ to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
+ mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
+}
+
+static int is_external_interrupt(u32 info)
+{
+ info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
+ return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
+}
+
+static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 ret = 0;
+
+ if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
+ ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
+ return ret;
+}
+
+static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (mask == 0)
+ svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
+ else
+ svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
+
+}
+
+static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (svm->vmcb->control.next_rip != 0) {
+ WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
+ svm->next_rip = svm->vmcb->control.next_rip;
+ }
+
+ if (!svm->next_rip) {
+ if (kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) !=
+ EMULATE_DONE)
+ printk(KERN_DEBUG "%s: NOP\n", __func__);
+ return;
+ }
+ if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
+ printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
+ __func__, kvm_rip_read(vcpu), svm->next_rip);
+
+ kvm_rip_write(vcpu, svm->next_rip);
+ svm_set_interrupt_shadow(vcpu, 0);
+}
+
+static void svm_queue_exception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned nr = vcpu->arch.exception.nr;
+ bool has_error_code = vcpu->arch.exception.has_error_code;
+ bool reinject = vcpu->arch.exception.injected;
+ u32 error_code = vcpu->arch.exception.error_code;
+
+ /*
+ * If we are within a nested VM we'd better #VMEXIT and let the guest
+ * handle the exception
+ */
+ if (!reinject &&
+ nested_svm_check_exception(svm, nr, has_error_code, error_code))
+ return;
+
+ if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
+ unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
+
+ /*
+ * For guest debugging where we have to reinject #BP if some
+ * INT3 is guest-owned:
+ * Emulate nRIP by moving RIP forward. Will fail if injection
+ * raises a fault that is not intercepted. Still better than
+ * failing in all cases.
+ */
+ skip_emulated_instruction(&svm->vcpu);
+ rip = kvm_rip_read(&svm->vcpu);
+ svm->int3_rip = rip + svm->vmcb->save.cs.base;
+ svm->int3_injected = rip - old_rip;
+ }
+
+ svm->vmcb->control.event_inj = nr
+ | SVM_EVTINJ_VALID
+ | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
+ | SVM_EVTINJ_TYPE_EXEPT;
+ svm->vmcb->control.event_inj_err = error_code;
+}
+
+static void svm_init_erratum_383(void)
+{
+ u32 low, high;
+ int err;
+ u64 val;
+
+ if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
+ return;
+
+ /* Use _safe variants to not break nested virtualization */
+ val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
+ if (err)
+ return;
+
+ val |= (1ULL << 47);
+
+ low = lower_32_bits(val);
+ high = upper_32_bits(val);
+
+ native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
+
+ erratum_383_found = true;
+}
+
+static void svm_init_osvw(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Guests should see errata 400 and 415 as fixed (assuming that
+ * HLT and IO instructions are intercepted).
+ */
+ vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
+ vcpu->arch.osvw.status = osvw_status & ~(6ULL);
+
+ /*
+ * By increasing VCPU's osvw.length to 3 we are telling the guest that
+ * all osvw.status bits inside that length, including bit 0 (which is
+ * reserved for erratum 298), are valid. However, if host processor's
+ * osvw_len is 0 then osvw_status[0] carries no information. We need to
+ * be conservative here and therefore we tell the guest that erratum 298
+ * is present (because we really don't know).
+ */
+ if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
+ vcpu->arch.osvw.status |= 1;
+}
+
+static int has_svm(void)
+{
+ const char *msg;
+
+ if (!cpu_has_svm(&msg)) {
+ printk(KERN_INFO "has_svm: %s\n", msg);
+ return 0;
+ }
+
+ if (sev_active()) {
+ pr_info("KVM is unsupported when running as an SEV guest\n");
+ return 0;
+ }
+
+ return 1;
+}
+
+static void svm_hardware_disable(void)
+{
+ /* Make sure we clean up behind us */
+ if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
+ wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
+
+ cpu_svm_disable();
+
+ amd_pmu_disable_virt();
+}
+
+static int svm_hardware_enable(void)
+{
+
+ struct svm_cpu_data *sd;
+ uint64_t efer;
+ struct desc_struct *gdt;
+ int me = raw_smp_processor_id();
+
+ rdmsrl(MSR_EFER, efer);
+ if (efer & EFER_SVME)
+ return -EBUSY;
+
+ if (!has_svm()) {
+ pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
+ return -EINVAL;
+ }
+ sd = per_cpu(svm_data, me);
+ if (!sd) {
+ pr_err("%s: svm_data is NULL on %d\n", __func__, me);
+ return -EINVAL;
+ }
+
+ sd->asid_generation = 1;
+ sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
+ sd->next_asid = sd->max_asid + 1;
+ sd->min_asid = max_sev_asid + 1;
+
+ gdt = get_current_gdt_rw();
+ sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
+
+ wrmsrl(MSR_EFER, efer | EFER_SVME);
+
+ wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
+
+ if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
+ __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
+ }
+
+
+ /*
+ * Get OSVW bits.
+ *
+ * Note that it is possible to have a system with mixed processor
+ * revisions and therefore different OSVW bits. If bits are not the same
+ * on different processors then choose the worst case (i.e. if erratum
+ * is present on one processor and not on another then assume that the
+ * erratum is present everywhere).
+ */
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
+ uint64_t len, status = 0;
+ int err;
+
+ len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
+ if (!err)
+ status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
+ &err);
+
+ if (err)
+ osvw_status = osvw_len = 0;
+ else {
+ if (len < osvw_len)
+ osvw_len = len;
+ osvw_status |= status;
+ osvw_status &= (1ULL << osvw_len) - 1;
+ }
+ } else
+ osvw_status = osvw_len = 0;
+
+ svm_init_erratum_383();
+
+ amd_pmu_enable_virt();
+
+ return 0;
+}
+
+static void svm_cpu_uninit(int cpu)
+{
+ struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
+
+ if (!sd)
+ return;
+
+ per_cpu(svm_data, raw_smp_processor_id()) = NULL;
+ kfree(sd->sev_vmcbs);
+ __free_page(sd->save_area);
+ kfree(sd);
+}
+
+static int svm_cpu_init(int cpu)
+{
+ struct svm_cpu_data *sd;
+
+ sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
+ if (!sd)
+ return -ENOMEM;
+ sd->cpu = cpu;
+ sd->save_area = alloc_page(GFP_KERNEL);
+ if (!sd->save_area)
+ goto free_cpu_data;
+
+ if (svm_sev_enabled()) {
+ sd->sev_vmcbs = kmalloc_array(max_sev_asid + 1,
+ sizeof(void *),
+ GFP_KERNEL);
+ if (!sd->sev_vmcbs)
+ goto free_save_area;
+ }
+
+ per_cpu(svm_data, cpu) = sd;
+
+ return 0;
+
+free_save_area:
+ __free_page(sd->save_area);
+free_cpu_data:
+ kfree(sd);
+ return -ENOMEM;
+
+}
+
+static bool valid_msr_intercept(u32 index)
+{
+ int i;
+
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
+ if (direct_access_msrs[i].index == index)
+ return true;
+
+ return false;
+}
+
+static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
+{
+ u8 bit_write;
+ unsigned long tmp;
+ u32 offset;
+ u32 *msrpm;
+
+ msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
+ to_svm(vcpu)->msrpm;
+
+ offset = svm_msrpm_offset(msr);
+ bit_write = 2 * (msr & 0x0f) + 1;
+ tmp = msrpm[offset];
+
+ BUG_ON(offset == MSR_INVALID);
+
+ return !!test_bit(bit_write, &tmp);
+}
+
+static void set_msr_interception(u32 *msrpm, unsigned msr,
+ int read, int write)
+{
+ u8 bit_read, bit_write;
+ unsigned long tmp;
+ u32 offset;
+
+ /*
+ * If this warning triggers extend the direct_access_msrs list at the
+ * beginning of the file
+ */
+ WARN_ON(!valid_msr_intercept(msr));
+
+ offset = svm_msrpm_offset(msr);
+ bit_read = 2 * (msr & 0x0f);
+ bit_write = 2 * (msr & 0x0f) + 1;
+ tmp = msrpm[offset];
+
+ BUG_ON(offset == MSR_INVALID);
+
+ read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
+ write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
+
+ msrpm[offset] = tmp;
+}
+
+static void svm_vcpu_init_msrpm(u32 *msrpm)
+{
+ int i;
+
+ memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+ if (!direct_access_msrs[i].always)
+ continue;
+
+ set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
+ }
+}
+
+static void add_msr_offset(u32 offset)
+{
+ int i;
+
+ for (i = 0; i < MSRPM_OFFSETS; ++i) {
+
+ /* Offset already in list? */
+ if (msrpm_offsets[i] == offset)
+ return;
+
+ /* Slot used by another offset? */
+ if (msrpm_offsets[i] != MSR_INVALID)
+ continue;
+
+ /* Add offset to list */
+ msrpm_offsets[i] = offset;
+
+ return;
+ }
+
+ /*
+ * If this BUG triggers the msrpm_offsets table has an overflow. Just
+ * increase MSRPM_OFFSETS in this case.
+ */
+ BUG();
+}
+
+static void init_msrpm_offsets(void)
+{
+ int i;
+
+ memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
+
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+ u32 offset;
+
+ offset = svm_msrpm_offset(direct_access_msrs[i].index);
+ BUG_ON(offset == MSR_INVALID);
+
+ add_msr_offset(offset);
+ }
+}
+
+static void svm_enable_lbrv(struct vcpu_svm *svm)
+{
+ u32 *msrpm = svm->msrpm;
+
+ svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
+ set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
+ set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
+ set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
+ set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+}
+
+static void svm_disable_lbrv(struct vcpu_svm *svm)
+{
+ u32 *msrpm = svm->msrpm;
+
+ svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
+ set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
+ set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
+ set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
+ set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
+}
+
+static void disable_nmi_singlestep(struct vcpu_svm *svm)
+{
+ svm->nmi_singlestep = false;
+
+ if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
+ /* Clear our flags if they were not set by the guest */
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
+ svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
+ svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
+ }
+}
+
+/* Note:
+ * This hash table is used to map VM_ID to a struct kvm_svm,
+ * when handling AMD IOMMU GALOG notification to schedule in
+ * a particular vCPU.
+ */
+#define SVM_VM_DATA_HASH_BITS 8
+static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
+static u32 next_vm_id = 0;
+static bool next_vm_id_wrapped = 0;
+static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
+
+/* Note:
+ * This function is called from IOMMU driver to notify
+ * SVM to schedule in a particular vCPU of a particular VM.
+ */
+static int avic_ga_log_notifier(u32 ga_tag)
+{
+ unsigned long flags;
+ struct kvm_svm *kvm_svm;
+ struct kvm_vcpu *vcpu = NULL;
+ u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
+ u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
+
+ pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
+ if (kvm_svm->avic_vm_id != vm_id)
+ continue;
+ vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
+ break;
+ }
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+ /* Note:
+ * At this point, the IOMMU should have already set the pending
+ * bit in the vAPIC backing page. So, we just need to schedule
+ * in the vcpu.
+ */
+ if (vcpu)
+ kvm_vcpu_wake_up(vcpu);
+
+ return 0;
+}
+
+static __init int sev_hardware_setup(void)
+{
+ struct sev_user_data_status *status;
+ int rc;
+
+ /* Maximum number of encrypted guests supported simultaneously */
+ max_sev_asid = cpuid_ecx(0x8000001F);
+
+ if (!max_sev_asid)
+ return 1;
+
+ /* Minimum ASID value that should be used for SEV guest */
+ min_sev_asid = cpuid_edx(0x8000001F);
+
+ /* Initialize SEV ASID bitmap */
+ sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
+ if (!sev_asid_bitmap)
+ return 1;
+
+ status = kmalloc(sizeof(*status), GFP_KERNEL);
+ if (!status)
+ return 1;
+
+ /*
+ * Check SEV platform status.
+ *
+ * PLATFORM_STATUS can be called in any state, if we failed to query
+ * the PLATFORM status then either PSP firmware does not support SEV
+ * feature or SEV firmware is dead.
+ */
+ rc = sev_platform_status(status, NULL);
+ if (rc)
+ goto err;
+
+ pr_info("SEV supported\n");
+
+err:
+ kfree(status);
+ return rc;
+}
+
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ int old = control->pause_filter_count;
+
+ control->pause_filter_count = __grow_ple_window(old,
+ pause_filter_count,
+ pause_filter_count_grow,
+ pause_filter_count_max);
+
+ if (control->pause_filter_count != old)
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ trace_kvm_ple_window_grow(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ int old = control->pause_filter_count;
+
+ control->pause_filter_count =
+ __shrink_ple_window(old,
+ pause_filter_count,
+ pause_filter_count_shrink,
+ pause_filter_count);
+ if (control->pause_filter_count != old)
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ trace_kvm_ple_window_shrink(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+}
+
+/*
+ * The default MMIO mask is a single bit (excluding the present bit),
+ * which could conflict with the memory encryption bit. Check for
+ * memory encryption support and override the default MMIO mask if
+ * memory encryption is enabled.
+ */
+static __init void svm_adjust_mmio_mask(void)
+{
+ unsigned int enc_bit, mask_bit;
+ u64 msr, mask;
+
+ /* If there is no memory encryption support, use existing mask */
+ if (cpuid_eax(0x80000000) < 0x8000001f)
+ return;
+
+ /* If memory encryption is not enabled, use existing mask */
+ rdmsrl(MSR_K8_SYSCFG, msr);
+ if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+ return;
+
+ enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
+ mask_bit = boot_cpu_data.x86_phys_bits;
+
+ /* Increment the mask bit if it is the same as the encryption bit */
+ if (enc_bit == mask_bit)
+ mask_bit++;
+
+ /*
+ * If the mask bit location is below 52, then some bits above the
+ * physical addressing limit will always be reserved, so use the
+ * rsvd_bits() function to generate the mask. This mask, along with
+ * the present bit, will be used to generate a page fault with
+ * PFER.RSV = 1.
+ *
+ * If the mask bit location is 52 (or above), then clear the mask.
+ */
+ mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
+
+ kvm_mmu_set_mmio_spte_mask(mask, mask);
+}
+
+static __init int svm_hardware_setup(void)
+{
+ int cpu;
+ struct page *iopm_pages;
+ void *iopm_va;
+ int r;
+
+ iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
+
+ if (!iopm_pages)
+ return -ENOMEM;
+
+ iopm_va = page_address(iopm_pages);
+ memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
+ iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
+
+ init_msrpm_offsets();
+
+ if (boot_cpu_has(X86_FEATURE_NX))
+ kvm_enable_efer_bits(EFER_NX);
+
+ if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
+ kvm_enable_efer_bits(EFER_FFXSR);
+
+ if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ kvm_has_tsc_control = true;
+ kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
+ kvm_tsc_scaling_ratio_frac_bits = 32;
+ }
+
+ /* Check for pause filtering support */
+ if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
+ pause_filter_count = 0;
+ pause_filter_thresh = 0;
+ } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
+ pause_filter_thresh = 0;
+ }
+
+ if (nested) {
+ printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
+ kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
+ }
+
+ if (sev) {
+ if (boot_cpu_has(X86_FEATURE_SEV) &&
+ IS_ENABLED(CONFIG_KVM_AMD_SEV)) {
+ r = sev_hardware_setup();
+ if (r)
+ sev = false;
+ } else {
+ sev = false;
+ }
+ }
+
+ svm_adjust_mmio_mask();
+
+ for_each_possible_cpu(cpu) {
+ r = svm_cpu_init(cpu);
+ if (r)
+ goto err;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_NPT))
+ npt_enabled = false;
+
+ if (npt_enabled && !npt) {
+ printk(KERN_INFO "kvm: Nested Paging disabled\n");
+ npt_enabled = false;
+ }
+
+ if (npt_enabled) {
+ printk(KERN_INFO "kvm: Nested Paging enabled\n");
+ kvm_enable_tdp();
+ } else
+ kvm_disable_tdp();
+
+ if (avic) {
+ if (!npt_enabled ||
+ !boot_cpu_has(X86_FEATURE_AVIC) ||
+ !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
+ avic = false;
+ } else {
+ pr_info("AVIC enabled\n");
+
+ amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+ }
+ }
+
+ if (vls) {
+ if (!npt_enabled ||
+ !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
+ !IS_ENABLED(CONFIG_X86_64)) {
+ vls = false;
+ } else {
+ pr_info("Virtual VMLOAD VMSAVE supported\n");
+ }
+ }
+
+ vgif = false; /* Disabled for CVE-2021-3653 */
+
+ return 0;
+
+err:
+ __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
+ iopm_base = 0;
+ return r;
+}
+
+static __exit void svm_hardware_unsetup(void)
+{
+ int cpu;
+
+ if (svm_sev_enabled())
+ bitmap_free(sev_asid_bitmap);
+
+ for_each_possible_cpu(cpu)
+ svm_cpu_uninit(cpu);
+
+ __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
+ iopm_base = 0;
+}
+
+static void init_seg(struct vmcb_seg *seg)
+{
+ seg->selector = 0;
+ seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
+ SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
+ seg->limit = 0xffff;
+ seg->base = 0;
+}
+
+static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
+{
+ seg->selector = 0;
+ seg->attrib = SVM_SELECTOR_P_MASK | type;
+ seg->limit = 0xffff;
+ seg->base = 0;
+}
+
+static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (is_guest_mode(vcpu))
+ return svm->nested.hsave->control.tsc_offset;
+
+ return vcpu->arch.tsc_offset;
+}
+
+static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 g_tsc_offset = 0;
+
+ if (is_guest_mode(vcpu)) {
+ /* Write L1's TSC offset. */
+ g_tsc_offset = svm->vmcb->control.tsc_offset -
+ svm->nested.hsave->control.tsc_offset;
+ svm->nested.hsave->control.tsc_offset = offset;
+ } else
+ trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+ svm->vmcb->control.tsc_offset,
+ offset);
+
+ svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
+
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ return svm->vmcb->control.tsc_offset;
+}
+
+static void avic_init_vmcb(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb;
+ struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
+ phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
+ phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
+ phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
+
+ vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
+ vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
+ vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
+ vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
+ vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+}
+
+static void init_vmcb(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct vmcb_save_area *save = &svm->vmcb->save;
+
+ svm->vcpu.arch.hflags = 0;
+
+ set_cr_intercept(svm, INTERCEPT_CR0_READ);
+ set_cr_intercept(svm, INTERCEPT_CR3_READ);
+ set_cr_intercept(svm, INTERCEPT_CR4_READ);
+ set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
+ set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
+ set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
+ if (!kvm_vcpu_apicv_active(&svm->vcpu))
+ set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+
+ set_dr_intercepts(svm);
+
+ set_exception_intercept(svm, PF_VECTOR);
+ set_exception_intercept(svm, UD_VECTOR);
+ set_exception_intercept(svm, MC_VECTOR);
+ set_exception_intercept(svm, AC_VECTOR);
+ set_exception_intercept(svm, DB_VECTOR);
+ /*
+ * Guest access to VMware backdoor ports could legitimately
+ * trigger #GP because of TSS I/O permission bitmap.
+ * We intercept those #GP and allow access to them anyway
+ * as VMware does.
+ */
+ if (enable_vmware_backdoor)
+ set_exception_intercept(svm, GP_VECTOR);
+
+ set_intercept(svm, INTERCEPT_INTR);
+ set_intercept(svm, INTERCEPT_NMI);
+ set_intercept(svm, INTERCEPT_SMI);
+ set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+ set_intercept(svm, INTERCEPT_RDPMC);
+ set_intercept(svm, INTERCEPT_CPUID);
+ set_intercept(svm, INTERCEPT_INVD);
+ set_intercept(svm, INTERCEPT_INVLPG);
+ set_intercept(svm, INTERCEPT_INVLPGA);
+ set_intercept(svm, INTERCEPT_IOIO_PROT);
+ set_intercept(svm, INTERCEPT_MSR_PROT);
+ set_intercept(svm, INTERCEPT_TASK_SWITCH);
+ set_intercept(svm, INTERCEPT_SHUTDOWN);
+ set_intercept(svm, INTERCEPT_VMRUN);
+ set_intercept(svm, INTERCEPT_VMMCALL);
+ set_intercept(svm, INTERCEPT_VMLOAD);
+ set_intercept(svm, INTERCEPT_VMSAVE);
+ set_intercept(svm, INTERCEPT_STGI);
+ set_intercept(svm, INTERCEPT_CLGI);
+ set_intercept(svm, INTERCEPT_SKINIT);
+ set_intercept(svm, INTERCEPT_WBINVD);
+ set_intercept(svm, INTERCEPT_XSETBV);
+ set_intercept(svm, INTERCEPT_RSM);
+
+ if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
+ set_intercept(svm, INTERCEPT_MONITOR);
+ set_intercept(svm, INTERCEPT_MWAIT);
+ }
+
+ if (!kvm_hlt_in_guest(svm->vcpu.kvm))
+ set_intercept(svm, INTERCEPT_HLT);
+
+ control->iopm_base_pa = __sme_set(iopm_base);
+ control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
+ control->int_ctl = V_INTR_MASKING_MASK;
+
+ init_seg(&save->es);
+ init_seg(&save->ss);
+ init_seg(&save->ds);
+ init_seg(&save->fs);
+ init_seg(&save->gs);
+
+ save->cs.selector = 0xf000;
+ save->cs.base = 0xffff0000;
+ /* Executable/Readable Code Segment */
+ save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
+ SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
+ save->cs.limit = 0xffff;
+
+ save->gdtr.limit = 0xffff;
+ save->idtr.limit = 0xffff;
+
+ init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
+ init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
+
+ svm_set_efer(&svm->vcpu, 0);
+ save->dr6 = 0xffff0ff0;
+ kvm_set_rflags(&svm->vcpu, 2);
+ save->rip = 0x0000fff0;
+ svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
+
+ /*
+ * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
+ * It also updates the guest-visible cr0 value.
+ */
+ svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
+ kvm_mmu_reset_context(&svm->vcpu);
+
+ save->cr4 = X86_CR4_PAE;
+ /* rdx = ?? */
+
+ if (npt_enabled) {
+ /* Setup VMCB for Nested Paging */
+ control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
+ clr_intercept(svm, INTERCEPT_INVLPG);
+ clr_exception_intercept(svm, PF_VECTOR);
+ clr_cr_intercept(svm, INTERCEPT_CR3_READ);
+ clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
+ save->g_pat = svm->vcpu.arch.pat;
+ save->cr3 = 0;
+ save->cr4 = 0;
+ }
+ svm->asid_generation = 0;
+
+ svm->nested.vmcb = 0;
+ svm->vcpu.arch.hflags = 0;
+
+ if (pause_filter_count) {
+ control->pause_filter_count = pause_filter_count;
+ if (pause_filter_thresh)
+ control->pause_filter_thresh = pause_filter_thresh;
+ set_intercept(svm, INTERCEPT_PAUSE);
+ } else {
+ clr_intercept(svm, INTERCEPT_PAUSE);
+ }
+
+ if (kvm_vcpu_apicv_active(&svm->vcpu))
+ avic_init_vmcb(svm);
+
+ /*
+ * If hardware supports Virtual VMLOAD VMSAVE then enable it
+ * in VMCB and clear intercepts to avoid #VMEXIT.
+ */
+ if (vls) {
+ clr_intercept(svm, INTERCEPT_VMLOAD);
+ clr_intercept(svm, INTERCEPT_VMSAVE);
+ svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+ }
+
+ if (vgif) {
+ clr_intercept(svm, INTERCEPT_STGI);
+ clr_intercept(svm, INTERCEPT_CLGI);
+ svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
+ }
+
+ if (sev_guest(svm->vcpu.kvm)) {
+ svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
+ clr_exception_intercept(svm, UD_VECTOR);
+ }
+
+ mark_all_dirty(svm->vmcb);
+
+ enable_gif(svm);
+
+}
+
+static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
+ unsigned int index)
+{
+ u64 *avic_physical_id_table;
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+
+ if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
+ return NULL;
+
+ avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
+
+ return &avic_physical_id_table[index];
+}
+
+/**
+ * Note:
+ * AVIC hardware walks the nested page table to check permissions,
+ * but does not use the SPA address specified in the leaf page
+ * table entry since it uses address in the AVIC_BACKING_PAGE pointer
+ * field of the VMCB. Therefore, we set up the
+ * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
+ */
+static int avic_init_access_page(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ int ret = 0;
+
+ mutex_lock(&kvm->slots_lock);
+ if (kvm->arch.apic_access_page_done)
+ goto out;
+
+ ret = __x86_set_memory_region(kvm,
+ APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ APIC_DEFAULT_PHYS_BASE,
+ PAGE_SIZE);
+ if (ret)
+ goto out;
+
+ kvm->arch.apic_access_page_done = true;
+out:
+ mutex_unlock(&kvm->slots_lock);
+ return ret;
+}
+
+static int avic_init_backing_page(struct kvm_vcpu *vcpu)
+{
+ int ret;
+ u64 *entry, new_entry;
+ int id = vcpu->vcpu_id;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ ret = avic_init_access_page(vcpu);
+ if (ret)
+ return ret;
+
+ if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
+ return -EINVAL;
+
+ if (!svm->vcpu.arch.apic->regs)
+ return -EINVAL;
+
+ svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
+
+ /* Setting AVIC backing page address in the phy APIC ID table */
+ entry = avic_get_physical_id_entry(vcpu, id);
+ if (!entry)
+ return -EINVAL;
+
+ new_entry = READ_ONCE(*entry);
+ new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
+ AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
+ AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
+ WRITE_ONCE(*entry, new_entry);
+
+ svm->avic_physical_id_cache = entry;
+
+ return 0;
+}
+
+static void __sev_asid_free(int asid)
+{
+ struct svm_cpu_data *sd;
+ int cpu, pos;
+
+ pos = asid - 1;
+ clear_bit(pos, sev_asid_bitmap);
+
+ for_each_possible_cpu(cpu) {
+ sd = per_cpu(svm_data, cpu);
+ sd->sev_vmcbs[asid] = NULL;
+ }
+}
+
+static void sev_asid_free(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ __sev_asid_free(sev->asid);
+}
+
+static void sev_decommission(unsigned int handle)
+{
+ struct sev_data_decommission *decommission;
+
+ if (!handle)
+ return;
+
+ decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
+ if (!decommission)
+ return;
+
+ decommission->handle = handle;
+ sev_guest_decommission(decommission, NULL);
+
+ kfree(decommission);
+}
+
+static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
+{
+ struct sev_data_deactivate *data;
+
+ if (!handle)
+ return;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return;
+
+ /* deactivate handle */
+ data->handle = handle;
+ sev_guest_deactivate(data, NULL);
+
+ wbinvd_on_all_cpus();
+ sev_guest_df_flush(NULL);
+ kfree(data);
+
+ sev_decommission(handle);
+}
+
+static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
+ unsigned long ulen, unsigned long *n,
+ int write)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ unsigned long npages, npinned, size;
+ unsigned long locked, lock_limit;
+ struct page **pages;
+ unsigned long first, last;
+
+ lockdep_assert_held(&kvm->lock);
+
+ if (ulen == 0 || uaddr + ulen < uaddr)
+ return NULL;
+
+ /* Calculate number of pages. */
+ first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
+ last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
+ npages = (last - first + 1);
+
+ locked = sev->pages_locked + npages;
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+ pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
+ return NULL;
+ }
+
+ /* Avoid using vmalloc for smaller buffers. */
+ size = npages * sizeof(struct page *);
+ if (size > PAGE_SIZE)
+ pages = vmalloc(size);
+ else
+ pages = kmalloc(size, GFP_KERNEL);
+
+ if (!pages)
+ return NULL;
+
+ /* Pin the user virtual address. */
+ npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
+ if (npinned != npages) {
+ pr_err("SEV: Failure locking %lu pages.\n", npages);
+ goto err;
+ }
+
+ *n = npages;
+ sev->pages_locked = locked;
+
+ return pages;
+
+err:
+ if (npinned > 0)
+ release_pages(pages, npinned);
+
+ kvfree(pages);
+ return NULL;
+}
+
+static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
+ unsigned long npages)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ release_pages(pages, npages);
+ kvfree(pages);
+ sev->pages_locked -= npages;
+}
+
+static void sev_clflush_pages(struct page *pages[], unsigned long npages)
+{
+ uint8_t *page_virtual;
+ unsigned long i;
+
+ if (npages == 0 || pages == NULL)
+ return;
+
+ for (i = 0; i < npages; i++) {
+ page_virtual = kmap_atomic(pages[i]);
+ clflush_cache_range(page_virtual, PAGE_SIZE);
+ kunmap_atomic(page_virtual);
+ }
+}
+
+static void __unregister_enc_region_locked(struct kvm *kvm,
+ struct enc_region *region)
+{
+ /*
+ * The guest may change the memory encryption attribute from C=0 -> C=1
+ * or vice versa for this memory range. Lets make sure caches are
+ * flushed to ensure that guest data gets written into memory with
+ * correct C-bit.
+ */
+ sev_clflush_pages(region->pages, region->npages);
+
+ sev_unpin_memory(kvm, region->pages, region->npages);
+ list_del(&region->list);
+ kfree(region);
+}
+
+static struct kvm *svm_vm_alloc(void)
+{
+ struct kvm_svm *kvm_svm = vzalloc(sizeof(struct kvm_svm));
+
+ if (!kvm_svm)
+ return NULL;
+
+ return &kvm_svm->kvm;
+}
+
+static void svm_vm_free(struct kvm *kvm)
+{
+ vfree(to_kvm_svm(kvm));
+}
+
+static void sev_vm_destroy(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct list_head *head = &sev->regions_list;
+ struct list_head *pos, *q;
+
+ if (!sev_guest(kvm))
+ return;
+
+ mutex_lock(&kvm->lock);
+
+ /*
+ * if userspace was terminated before unregistering the memory regions
+ * then lets unpin all the registered memory.
+ */
+ if (!list_empty(head)) {
+ list_for_each_safe(pos, q, head) {
+ __unregister_enc_region_locked(kvm,
+ list_entry(pos, struct enc_region, list));
+ cond_resched();
+ }
+ }
+
+ mutex_unlock(&kvm->lock);
+
+ sev_unbind_asid(kvm, sev->handle);
+ sev_asid_free(kvm);
+}
+
+static void avic_vm_destroy(struct kvm *kvm)
+{
+ unsigned long flags;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+
+ if (!avic)
+ return;
+
+ if (kvm_svm->avic_logical_id_table_page)
+ __free_page(kvm_svm->avic_logical_id_table_page);
+ if (kvm_svm->avic_physical_id_table_page)
+ __free_page(kvm_svm->avic_physical_id_table_page);
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ hash_del(&kvm_svm->hnode);
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+}
+
+static void svm_vm_destroy(struct kvm *kvm)
+{
+ avic_vm_destroy(kvm);
+ sev_vm_destroy(kvm);
+}
+
+static int avic_vm_init(struct kvm *kvm)
+{
+ unsigned long flags;
+ int err = -ENOMEM;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+ struct kvm_svm *k2;
+ struct page *p_page;
+ struct page *l_page;
+ u32 vm_id;
+
+ if (!avic)
+ return 0;
+
+ /* Allocating physical APIC ID table (4KB) */
+ p_page = alloc_page(GFP_KERNEL);
+ if (!p_page)
+ goto free_avic;
+
+ kvm_svm->avic_physical_id_table_page = p_page;
+ clear_page(page_address(p_page));
+
+ /* Allocating logical APIC ID table (4KB) */
+ l_page = alloc_page(GFP_KERNEL);
+ if (!l_page)
+ goto free_avic;
+
+ kvm_svm->avic_logical_id_table_page = l_page;
+ clear_page(page_address(l_page));
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ again:
+ vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
+ if (vm_id == 0) { /* id is 1-based, zero is not okay */
+ next_vm_id_wrapped = 1;
+ goto again;
+ }
+ /* Is it still in use? Only possible if wrapped at least once */
+ if (next_vm_id_wrapped) {
+ hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
+ if (k2->avic_vm_id == vm_id)
+ goto again;
+ }
+ }
+ kvm_svm->avic_vm_id = vm_id;
+ hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+ return 0;
+
+free_avic:
+ avic_vm_destroy(kvm);
+ return err;
+}
+
+static inline int
+avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
+{
+ int ret = 0;
+ unsigned long flags;
+ struct amd_svm_iommu_ir *ir;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm))
+ return 0;
+
+ /*
+ * Here, we go through the per-vcpu ir_list to update all existing
+ * interrupt remapping table entry targeting this vcpu.
+ */
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+
+ if (list_empty(&svm->ir_list))
+ goto out;
+
+ list_for_each_entry(ir, &svm->ir_list, node) {
+ ret = amd_iommu_update_ga(cpu, r, ir->data);
+ if (ret)
+ break;
+ }
+out:
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+ return ret;
+}
+
+static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ u64 entry;
+ /* ID = 0xff (broadcast), ID > 0xff (reserved) */
+ int h_physical_id = kvm_cpu_get_apicid(cpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ /*
+ * Since the host physical APIC id is 8 bits,
+ * we can support host APIC ID upto 255.
+ */
+ if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
+ return;
+
+ entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+ entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
+
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ if (svm->avic_is_running)
+ entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+ WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+ avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
+ svm->avic_is_running);
+}
+
+static void avic_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ u64 entry;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
+ avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+}
+
+/**
+ * This function is called during VCPU halt/unhalt.
+ */
+static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->avic_is_running = is_run;
+ if (is_run)
+ avic_vcpu_load(vcpu, vcpu->cpu);
+ else
+ avic_vcpu_put(vcpu);
+}
+
+static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 dummy;
+ u32 eax = 1;
+
+ vcpu->arch.microcode_version = 0x01000065;
+ svm->spec_ctrl = 0;
+ svm->virt_spec_ctrl = 0;
+
+ if (!init_event) {
+ svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
+ MSR_IA32_APICBASE_ENABLE;
+ if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
+ svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ }
+ init_vmcb(svm);
+
+ kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
+
+ if (kvm_vcpu_apicv_active(vcpu) && !init_event)
+ avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
+}
+
+static int avic_init_vcpu(struct vcpu_svm *svm)
+{
+ int ret;
+
+ if (!kvm_vcpu_apicv_active(&svm->vcpu))
+ return 0;
+
+ ret = avic_init_backing_page(&svm->vcpu);
+ if (ret)
+ return ret;
+
+ INIT_LIST_HEAD(&svm->ir_list);
+ spin_lock_init(&svm->ir_list_lock);
+
+ return ret;
+}
+
+static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
+{
+ struct vcpu_svm *svm;
+ struct page *page;
+ struct page *msrpm_pages;
+ struct page *hsave_page;
+ struct page *nested_msrpm_pages;
+ int err;
+
+ svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+ if (!svm) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = kvm_vcpu_init(&svm->vcpu, kvm, id);
+ if (err)
+ goto free_svm;
+
+ err = -ENOMEM;
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ goto uninit;
+
+ msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+ if (!msrpm_pages)
+ goto free_page1;
+
+ nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+ if (!nested_msrpm_pages)
+ goto free_page2;
+
+ hsave_page = alloc_page(GFP_KERNEL);
+ if (!hsave_page)
+ goto free_page3;
+
+ err = avic_init_vcpu(svm);
+ if (err)
+ goto free_page4;
+
+ /* We initialize this flag to true to make sure that the is_running
+ * bit would be set the first time the vcpu is loaded.
+ */
+ svm->avic_is_running = true;
+
+ svm->nested.hsave = page_address(hsave_page);
+
+ svm->msrpm = page_address(msrpm_pages);
+ svm_vcpu_init_msrpm(svm->msrpm);
+
+ svm->nested.msrpm = page_address(nested_msrpm_pages);
+ svm_vcpu_init_msrpm(svm->nested.msrpm);
+
+ svm->vmcb = page_address(page);
+ clear_page(svm->vmcb);
+ svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
+ svm->asid_generation = 0;
+ init_vmcb(svm);
+
+ svm_init_osvw(&svm->vcpu);
+
+ return &svm->vcpu;
+
+free_page4:
+ __free_page(hsave_page);
+free_page3:
+ __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
+free_page2:
+ __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
+free_page1:
+ __free_page(page);
+uninit:
+ kvm_vcpu_uninit(&svm->vcpu);
+free_svm:
+ kmem_cache_free(kvm_vcpu_cache, svm);
+out:
+ return ERR_PTR(err);
+}
+
+static void svm_clear_current_vmcb(struct vmcb *vmcb)
+{
+ int i;
+
+ for_each_online_cpu(i)
+ cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
+}
+
+static void svm_free_vcpu(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * The vmcb page can be recycled, causing a false negative in
+ * svm_vcpu_load(). So, ensure that no logical CPU has this
+ * vmcb page recorded as its current vmcb.
+ */
+ svm_clear_current_vmcb(svm->vmcb);
+
+ __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
+ __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
+ __free_page(virt_to_page(svm->nested.hsave));
+ __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
+ kvm_vcpu_uninit(vcpu);
+ kmem_cache_free(kvm_vcpu_cache, svm);
+}
+
+static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+ int i;
+
+ if (unlikely(cpu != vcpu->cpu)) {
+ svm->asid_generation = 0;
+ mark_all_dirty(svm->vmcb);
+ }
+
+#ifdef CONFIG_X86_64
+ rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
+#endif
+ savesegment(fs, svm->host.fs);
+ savesegment(gs, svm->host.gs);
+ svm->host.ldt = kvm_read_ldt();
+
+ for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+ rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+
+ if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
+ if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
+ __this_cpu_write(current_tsc_ratio, tsc_ratio);
+ wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
+ }
+ }
+ /* This assumes that the kernel never uses MSR_TSC_AUX */
+ if (static_cpu_has(X86_FEATURE_RDTSCP))
+ wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
+
+ if (sd->current_vmcb != svm->vmcb) {
+ sd->current_vmcb = svm->vmcb;
+ indirect_branch_prediction_barrier();
+ }
+ avic_vcpu_load(vcpu, cpu);
+}
+
+static void svm_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int i;
+
+ avic_vcpu_put(vcpu);
+
+ ++vcpu->stat.host_state_reload;
+ kvm_load_ldt(svm->host.ldt);
+#ifdef CONFIG_X86_64
+ loadsegment(fs, svm->host.fs);
+ wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
+ load_gs_index(svm->host.gs);
+#else
+#ifdef CONFIG_X86_32_LAZY_GS
+ loadsegment(gs, svm->host.gs);
+#endif
+#endif
+ for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+ wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+}
+
+static void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
+{
+ avic_set_running(vcpu, false);
+}
+
+static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
+{
+ avic_set_running(vcpu, true);
+}
+
+static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long rflags = svm->vmcb->save.rflags;
+
+ if (svm->nmi_singlestep) {
+ /* Hide our flags if they were not set by the guest */
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
+ rflags &= ~X86_EFLAGS_TF;
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
+ rflags &= ~X86_EFLAGS_RF;
+ }
+ return rflags;
+}
+
+static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ if (to_svm(vcpu)->nmi_singlestep)
+ rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+
+ /*
+ * Any change of EFLAGS.VM is accompanied by a reload of SS
+ * (caused by either a task switch or an inter-privilege IRET),
+ * so we do not need to update the CPL here.
+ */
+ to_svm(vcpu)->vmcb->save.rflags = rflags;
+}
+
+static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+{
+ switch (reg) {
+ case VCPU_EXREG_PDPTR:
+ BUG_ON(!npt_enabled);
+ load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
+ break;
+ default:
+ BUG();
+ }
+}
+
+static void svm_set_vintr(struct vcpu_svm *svm)
+{
+ set_intercept(svm, INTERCEPT_VINTR);
+}
+
+static void svm_clear_vintr(struct vcpu_svm *svm)
+{
+ clr_intercept(svm, INTERCEPT_VINTR);
+}
+
+static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
+{
+ struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+
+ switch (seg) {
+ case VCPU_SREG_CS: return &save->cs;
+ case VCPU_SREG_DS: return &save->ds;
+ case VCPU_SREG_ES: return &save->es;
+ case VCPU_SREG_FS: return &save->fs;
+ case VCPU_SREG_GS: return &save->gs;
+ case VCPU_SREG_SS: return &save->ss;
+ case VCPU_SREG_TR: return &save->tr;
+ case VCPU_SREG_LDTR: return &save->ldtr;
+ }
+ BUG();
+ return NULL;
+}
+
+static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+ return s->base;
+}
+
+static void svm_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+ var->base = s->base;
+ var->limit = s->limit;
+ var->selector = s->selector;
+ var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
+ var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
+ var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
+ var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
+ var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
+ var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
+ var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
+
+ /*
+ * AMD CPUs circa 2014 track the G bit for all segments except CS.
+ * However, the SVM spec states that the G bit is not observed by the
+ * CPU, and some VMware virtual CPUs drop the G bit for all segments.
+ * So let's synthesize a legal G bit for all segments, this helps
+ * running KVM nested. It also helps cross-vendor migration, because
+ * Intel's vmentry has a check on the 'G' bit.
+ */
+ var->g = s->limit > 0xfffff;
+
+ /*
+ * AMD's VMCB does not have an explicit unusable field, so emulate it
+ * for cross vendor migration purposes by "not present"
+ */
+ var->unusable = !var->present;
+
+ switch (seg) {
+ case VCPU_SREG_TR:
+ /*
+ * Work around a bug where the busy flag in the tr selector
+ * isn't exposed
+ */
+ var->type |= 0x2;
+ break;
+ case VCPU_SREG_DS:
+ case VCPU_SREG_ES:
+ case VCPU_SREG_FS:
+ case VCPU_SREG_GS:
+ /*
+ * The accessed bit must always be set in the segment
+ * descriptor cache, although it can be cleared in the
+ * descriptor, the cached bit always remains at 1. Since
+ * Intel has a check on this, set it here to support
+ * cross-vendor migration.
+ */
+ if (!var->unusable)
+ var->type |= 0x1;
+ break;
+ case VCPU_SREG_SS:
+ /*
+ * On AMD CPUs sometimes the DB bit in the segment
+ * descriptor is left as 1, although the whole segment has
+ * been made unusable. Clear it here to pass an Intel VMX
+ * entry check when cross vendor migrating.
+ */
+ if (var->unusable)
+ var->db = 0;
+ /* This is symmetric with svm_set_segment() */
+ var->dpl = to_svm(vcpu)->vmcb->save.cpl;
+ break;
+ }
+}
+
+static int svm_get_cpl(struct kvm_vcpu *vcpu)
+{
+ struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+
+ return save->cpl;
+}
+
+static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ dt->size = svm->vmcb->save.idtr.limit;
+ dt->address = svm->vmcb->save.idtr.base;
+}
+
+static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.idtr.limit = dt->size;
+ svm->vmcb->save.idtr.base = dt->address ;
+ mark_dirty(svm->vmcb, VMCB_DT);
+}
+
+static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ dt->size = svm->vmcb->save.gdtr.limit;
+ dt->address = svm->vmcb->save.gdtr.base;
+}
+
+static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.gdtr.limit = dt->size;
+ svm->vmcb->save.gdtr.base = dt->address ;
+ mark_dirty(svm->vmcb, VMCB_DT);
+}
+
+static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
+{
+}
+
+static void svm_decache_cr3(struct kvm_vcpu *vcpu)
+{
+}
+
+static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
+{
+}
+
+static void update_cr0_intercept(struct vcpu_svm *svm)
+{
+ ulong gcr0 = svm->vcpu.arch.cr0;
+ u64 *hcr0 = &svm->vmcb->save.cr0;
+
+ *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
+ | (gcr0 & SVM_CR0_SELECTIVE_MASK);
+
+ mark_dirty(svm->vmcb, VMCB_CR);
+
+ if (gcr0 == *hcr0) {
+ clr_cr_intercept(svm, INTERCEPT_CR0_READ);
+ clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
+ } else {
+ set_cr_intercept(svm, INTERCEPT_CR0_READ);
+ set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
+ }
+}
+
+static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+#ifdef CONFIG_X86_64
+ if (vcpu->arch.efer & EFER_LME) {
+ if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
+ vcpu->arch.efer |= EFER_LMA;
+ svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
+ }
+
+ if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
+ vcpu->arch.efer &= ~EFER_LMA;
+ svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
+ }
+ }
+#endif
+ vcpu->arch.cr0 = cr0;
+
+ if (!npt_enabled)
+ cr0 |= X86_CR0_PG | X86_CR0_WP;
+
+ /*
+ * re-enable caching here because the QEMU bios
+ * does not do it - this results in some delay at
+ * reboot
+ */
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
+ cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+ svm->vmcb->save.cr0 = cr0;
+ mark_dirty(svm->vmcb, VMCB_CR);
+ update_cr0_intercept(svm);
+}
+
+static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
+ unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
+
+ if (cr4 & X86_CR4_VMXE)
+ return 1;
+
+ if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
+ svm_flush_tlb(vcpu, true);
+
+ vcpu->arch.cr4 = cr4;
+ if (!npt_enabled)
+ cr4 |= X86_CR4_PAE;
+ cr4 |= host_cr4_mce;
+ to_svm(vcpu)->vmcb->save.cr4 = cr4;
+ mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
+ return 0;
+}
+
+static void svm_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+ s->base = var->base;
+ s->limit = var->limit;
+ s->selector = var->selector;
+ s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
+ s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
+ s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
+ s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
+ s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
+ s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
+ s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
+ s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
+
+ /*
+ * This is always accurate, except if SYSRET returned to a segment
+ * with SS.DPL != 3. Intel does not have this quirk, and always
+ * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
+ * would entail passing the CPL to userspace and back.
+ */
+ if (seg == VCPU_SREG_SS)
+ /* This is symmetric with svm_get_segment() */
+ svm->vmcb->save.cpl = (var->dpl & 3);
+
+ mark_dirty(svm->vmcb, VMCB_SEG);
+}
+
+static void update_bp_intercept(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ clr_exception_intercept(svm, BP_VECTOR);
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+ set_exception_intercept(svm, BP_VECTOR);
+ } else
+ vcpu->guest_debug = 0;
+}
+
+static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
+{
+ if (sd->next_asid > sd->max_asid) {
+ ++sd->asid_generation;
+ sd->next_asid = sd->min_asid;
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
+ }
+
+ svm->asid_generation = sd->asid_generation;
+ svm->vmcb->control.asid = sd->next_asid++;
+
+ mark_dirty(svm->vmcb, VMCB_ASID);
+}
+
+static u64 svm_get_dr6(struct kvm_vcpu *vcpu)
+{
+ return to_svm(vcpu)->vmcb->save.dr6;
+}
+
+static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.dr6 = value;
+ mark_dirty(svm->vmcb, VMCB_DR);
+}
+
+static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ get_debugreg(vcpu->arch.db[0], 0);
+ get_debugreg(vcpu->arch.db[1], 1);
+ get_debugreg(vcpu->arch.db[2], 2);
+ get_debugreg(vcpu->arch.db[3], 3);
+ vcpu->arch.dr6 = svm_get_dr6(vcpu);
+ vcpu->arch.dr7 = svm->vmcb->save.dr7;
+
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
+ set_dr_intercepts(svm);
+}
+
+static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.dr7 = value;
+ mark_dirty(svm->vmcb, VMCB_DR);
+}
+
+static int pf_interception(struct vcpu_svm *svm)
+{
+ u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
+ u64 error_code = svm->vmcb->control.exit_info_1;
+
+ return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
+ static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
+ svm->vmcb->control.insn_bytes : NULL,
+ svm->vmcb->control.insn_len);
+}
+
+static int npf_interception(struct vcpu_svm *svm)
+{
+ u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
+ u64 error_code = svm->vmcb->control.exit_info_1;
+
+ trace_kvm_page_fault(fault_address, error_code);
+ return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
+ static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
+ svm->vmcb->control.insn_bytes : NULL,
+ svm->vmcb->control.insn_len);
+}
+
+static int db_interception(struct vcpu_svm *svm)
+{
+ struct kvm_run *kvm_run = svm->vcpu.run;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ if (!(svm->vcpu.guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
+ !svm->nmi_singlestep) {
+ kvm_queue_exception(&svm->vcpu, DB_VECTOR);
+ return 1;
+ }
+
+ if (svm->nmi_singlestep) {
+ disable_nmi_singlestep(svm);
+ /* Make sure we check for pending NMIs upon entry */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
+
+ if (svm->vcpu.guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ kvm_run->debug.arch.pc =
+ svm->vmcb->save.cs.base + svm->vmcb->save.rip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int bp_interception(struct vcpu_svm *svm)
+{
+ struct kvm_run *kvm_run = svm->vcpu.run;
+
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
+ kvm_run->debug.arch.exception = BP_VECTOR;
+ return 0;
+}
+
+static int ud_interception(struct vcpu_svm *svm)
+{
+ return handle_ud(&svm->vcpu);
+}
+
+static int ac_interception(struct vcpu_svm *svm)
+{
+ kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
+ return 1;
+}
+
+static int gp_interception(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u32 error_code = svm->vmcb->control.exit_info_1;
+ int er;
+
+ WARN_ON_ONCE(!enable_vmware_backdoor);
+
+ er = kvm_emulate_instruction(vcpu,
+ EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
+ if (er == EMULATE_USER_EXIT)
+ return 0;
+ else if (er != EMULATE_DONE)
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+ return 1;
+}
+
+static bool is_erratum_383(void)
+{
+ int err, i;
+ u64 value;
+
+ if (!erratum_383_found)
+ return false;
+
+ value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
+ if (err)
+ return false;
+
+ /* Bit 62 may or may not be set for this mce */
+ value &= ~(1ULL << 62);
+
+ if (value != 0xb600000000010015ULL)
+ return false;
+
+ /* Clear MCi_STATUS registers */
+ for (i = 0; i < 6; ++i)
+ native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
+
+ value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
+ if (!err) {
+ u32 low, high;
+
+ value &= ~(1ULL << 2);
+ low = lower_32_bits(value);
+ high = upper_32_bits(value);
+
+ native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
+ }
+
+ /* Flush tlb to evict multi-match entries */
+ __flush_tlb_all();
+
+ return true;
+}
+
+static void svm_handle_mce(struct vcpu_svm *svm)
+{
+ if (is_erratum_383()) {
+ /*
+ * Erratum 383 triggered. Guest state is corrupt so kill the
+ * guest.
+ */
+ pr_err("KVM: Guest triggered AMD Erratum 383\n");
+
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
+
+ return;
+ }
+
+ /*
+ * On an #MC intercept the MCE handler is not called automatically in
+ * the host. So do it by hand here.
+ */
+ asm volatile (
+ "int $0x12\n");
+ /* not sure if we ever come back to this point */
+
+ return;
+}
+
+static int mc_interception(struct vcpu_svm *svm)
+{
+ return 1;
+}
+
+static int shutdown_interception(struct vcpu_svm *svm)
+{
+ struct kvm_run *kvm_run = svm->vcpu.run;
+
+ /*
+ * VMCB is undefined after a SHUTDOWN intercept
+ * so reinitialize it.
+ */
+ clear_page(svm->vmcb);
+ init_vmcb(svm);
+
+ kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+ return 0;
+}
+
+static int io_interception(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
+ int size, in, string;
+ unsigned port;
+
+ ++svm->vcpu.stat.io_exits;
+ string = (io_info & SVM_IOIO_STR_MASK) != 0;
+ in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
+ if (string)
+ return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+
+ port = io_info >> 16;
+ size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
+ svm->next_rip = svm->vmcb->control.exit_info_2;
+
+ return kvm_fast_pio(&svm->vcpu, size, port, in);
+}
+
+static int nmi_interception(struct vcpu_svm *svm)
+{
+ return 1;
+}
+
+static int intr_interception(struct vcpu_svm *svm)
+{
+ ++svm->vcpu.stat.irq_exits;
+ return 1;
+}
+
+static int nop_on_interception(struct vcpu_svm *svm)
+{
+ return 1;
+}
+
+static int halt_interception(struct vcpu_svm *svm)
+{
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
+ return kvm_emulate_halt(&svm->vcpu);
+}
+
+static int vmmcall_interception(struct vcpu_svm *svm)
+{
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ return kvm_emulate_hypercall(&svm->vcpu);
+}
+
+static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return svm->nested.nested_cr3;
+}
+
+static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 cr3 = svm->nested.nested_cr3;
+ u64 pdpte;
+ int ret;
+
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte,
+ offset_in_page(cr3) + index * 8, 8);
+ if (ret)
+ return 0;
+ return pdpte;
+}
+
+static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
+ unsigned long root)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.nested_cr3 = __sme_set(root);
+ mark_dirty(svm->vmcb, VMCB_NPT);
+}
+
+static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
+ /*
+ * TODO: track the cause of the nested page fault, and
+ * correctly fill in the high bits of exit_info_1.
+ */
+ svm->vmcb->control.exit_code = SVM_EXIT_NPF;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = (1ULL << 32);
+ svm->vmcb->control.exit_info_2 = fault->address;
+ }
+
+ svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
+ svm->vmcb->control.exit_info_1 |= fault->error_code;
+
+ /*
+ * The present bit is always zero for page structure faults on real
+ * hardware.
+ */
+ if (svm->vmcb->control.exit_info_1 & (2ULL << 32))
+ svm->vmcb->control.exit_info_1 &= ~1;
+
+ nested_svm_vmexit(svm);
+}
+
+static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+ WARN_ON(mmu_is_nested(vcpu));
+ kvm_init_shadow_mmu(vcpu);
+ vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
+ vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
+ vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr;
+ vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
+ vcpu->arch.mmu.shadow_root_level = get_npt_level(vcpu);
+ reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu);
+ vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
+}
+
+static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+}
+
+static int nested_svm_check_permissions(struct vcpu_svm *svm)
+{
+ if (!(svm->vcpu.arch.efer & EFER_SVME) ||
+ !is_paging(&svm->vcpu)) {
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (svm->vmcb->save.cpl) {
+ kvm_inject_gp(&svm->vcpu, 0);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
+ bool has_error_code, u32 error_code)
+{
+ int vmexit;
+
+ if (!is_guest_mode(&svm->vcpu))
+ return 0;
+
+ vmexit = nested_svm_intercept(svm);
+ if (vmexit != NESTED_EXIT_DONE)
+ return 0;
+
+ svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = error_code;
+
+ /*
+ * FIXME: we should not write CR2 when L1 intercepts an L2 #PF exception.
+ * The fix is to add the ancillary datum (CR2 or DR6) to structs
+ * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 can be
+ * written only when inject_pending_event runs (DR6 would written here
+ * too). This should be conditional on a new capability---if the
+ * capability is disabled, kvm_multiple_exception would write the
+ * ancillary information to CR2 or DR6, for backwards ABI-compatibility.
+ */
+ if (svm->vcpu.arch.exception.nested_apf)
+ svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
+ else
+ svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
+
+ svm->nested.exit_required = true;
+ return vmexit;
+}
+
+/* This function returns true if it is save to enable the irq window */
+static inline bool nested_svm_intr(struct vcpu_svm *svm)
+{
+ if (!is_guest_mode(&svm->vcpu))
+ return true;
+
+ if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
+ return true;
+
+ if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
+ return false;
+
+ /*
+ * if vmexit was already requested (by intercepted exception
+ * for instance) do not overwrite it with "external interrupt"
+ * vmexit.
+ */
+ if (svm->nested.exit_required)
+ return false;
+
+ svm->vmcb->control.exit_code = SVM_EXIT_INTR;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+
+ if (svm->nested.intercept & 1ULL) {
+ /*
+ * The #vmexit can't be emulated here directly because this
+ * code path runs with irqs and preemption disabled. A
+ * #vmexit emulation might sleep. Only signal request for
+ * the #vmexit here.
+ */
+ svm->nested.exit_required = true;
+ trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
+ return false;
+ }
+
+ return true;
+}
+
+/* This function returns true if it is save to enable the nmi window */
+static inline bool nested_svm_nmi(struct vcpu_svm *svm)
+{
+ if (!is_guest_mode(&svm->vcpu))
+ return true;
+
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
+ return true;
+
+ svm->vmcb->control.exit_code = SVM_EXIT_NMI;
+ svm->nested.exit_required = true;
+
+ return false;
+}
+
+static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
+{
+ struct page *page;
+
+ might_sleep();
+
+ page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT);
+ if (is_error_page(page))
+ goto error;
+
+ *_page = page;
+
+ return kmap(page);
+
+error:
+ kvm_inject_gp(&svm->vcpu, 0);
+
+ return NULL;
+}
+
+static void nested_svm_unmap(struct page *page)
+{
+ kunmap(page);
+ kvm_release_page_dirty(page);
+}
+
+static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
+{
+ unsigned port, size, iopm_len;
+ u16 val, mask;
+ u8 start_bit;
+ u64 gpa;
+
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
+ return NESTED_EXIT_HOST;
+
+ port = svm->vmcb->control.exit_info_1 >> 16;
+ size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
+ SVM_IOIO_SIZE_SHIFT;
+ gpa = svm->nested.vmcb_iopm + (port / 8);
+ start_bit = port % 8;
+ iopm_len = (start_bit + size > 8) ? 2 : 1;
+ mask = (0xf >> (4 - size)) << start_bit;
+ val = 0;
+
+ if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
+ return NESTED_EXIT_DONE;
+
+ return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+}
+
+static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
+{
+ u32 offset, msr, value;
+ int write, mask;
+
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+ return NESTED_EXIT_HOST;
+
+ msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ offset = svm_msrpm_offset(msr);
+ write = svm->vmcb->control.exit_info_1 & 1;
+ mask = 1 << ((2 * (msr & 0xf)) + write);
+
+ if (offset == MSR_INVALID)
+ return NESTED_EXIT_DONE;
+
+ /* Offset is in 32 bit units but need in 8 bit units */
+ offset *= 4;
+
+ if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4))
+ return NESTED_EXIT_DONE;
+
+ return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+}
+
+/* DB exceptions for our internal use must not cause vmexit */
+static int nested_svm_intercept_db(struct vcpu_svm *svm)
+{
+ unsigned long dr6;
+
+ /* if we're not singlestepping, it's not ours */
+ if (!svm->nmi_singlestep)
+ return NESTED_EXIT_DONE;
+
+ /* if it's not a singlestep exception, it's not ours */
+ if (kvm_get_dr(&svm->vcpu, 6, &dr6))
+ return NESTED_EXIT_DONE;
+ if (!(dr6 & DR6_BS))
+ return NESTED_EXIT_DONE;
+
+ /* if the guest is singlestepping, it should get the vmexit */
+ if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) {
+ disable_nmi_singlestep(svm);
+ return NESTED_EXIT_DONE;
+ }
+
+ /* it's ours, the nested hypervisor must not see this one */
+ return NESTED_EXIT_HOST;
+}
+
+static int nested_svm_exit_special(struct vcpu_svm *svm)
+{
+ u32 exit_code = svm->vmcb->control.exit_code;
+
+ switch (exit_code) {
+ case SVM_EXIT_INTR:
+ case SVM_EXIT_NMI:
+ case SVM_EXIT_EXCP_BASE + MC_VECTOR:
+ return NESTED_EXIT_HOST;
+ case SVM_EXIT_NPF:
+ /* For now we are always handling NPFs when using them */
+ if (npt_enabled)
+ return NESTED_EXIT_HOST;
+ break;
+ case SVM_EXIT_EXCP_BASE + PF_VECTOR:
+ /* Trap async PF even if not shadowing */
+ if (!npt_enabled || svm->vcpu.arch.apf.host_apf_reason)
+ return NESTED_EXIT_HOST;
+ break;
+ default:
+ break;
+ }
+
+ return NESTED_EXIT_CONTINUE;
+}
+
+/*
+ * If this function returns true, this #vmexit was already handled
+ */
+static int nested_svm_intercept(struct vcpu_svm *svm)
+{
+ u32 exit_code = svm->vmcb->control.exit_code;
+ int vmexit = NESTED_EXIT_HOST;
+
+ switch (exit_code) {
+ case SVM_EXIT_MSR:
+ vmexit = nested_svm_exit_handled_msr(svm);
+ break;
+ case SVM_EXIT_IOIO:
+ vmexit = nested_svm_intercept_ioio(svm);
+ break;
+ case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
+ u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
+ if (svm->nested.intercept_cr & bit)
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
+ u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
+ if (svm->nested.intercept_dr & bit)
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
+ u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
+ if (svm->nested.intercept_exceptions & excp_bits) {
+ if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR)
+ vmexit = nested_svm_intercept_db(svm);
+ else
+ vmexit = NESTED_EXIT_DONE;
+ }
+ /* async page fault always cause vmexit */
+ else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
+ svm->vcpu.arch.exception.nested_apf != 0)
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ case SVM_EXIT_ERR: {
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ default: {
+ u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
+ if (svm->nested.intercept & exit_bits)
+ vmexit = NESTED_EXIT_DONE;
+ }
+ }
+
+ return vmexit;
+}
+
+static int nested_svm_exit_handled(struct vcpu_svm *svm)
+{
+ int vmexit;
+
+ vmexit = nested_svm_intercept(svm);
+
+ if (vmexit == NESTED_EXIT_DONE)
+ nested_svm_vmexit(svm);
+
+ return vmexit;
+}
+
+static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
+{
+ struct vmcb_control_area *dst = &dst_vmcb->control;
+ struct vmcb_control_area *from = &from_vmcb->control;
+
+ dst->intercept_cr = from->intercept_cr;
+ dst->intercept_dr = from->intercept_dr;
+ dst->intercept_exceptions = from->intercept_exceptions;
+ dst->intercept = from->intercept;
+ dst->iopm_base_pa = from->iopm_base_pa;
+ dst->msrpm_base_pa = from->msrpm_base_pa;
+ dst->tsc_offset = from->tsc_offset;
+ /* asid not copied, it is handled manually for svm->vmcb. */
+ dst->tlb_ctl = from->tlb_ctl;
+ dst->int_ctl = from->int_ctl;
+ dst->int_vector = from->int_vector;
+ dst->int_state = from->int_state;
+ dst->exit_code = from->exit_code;
+ dst->exit_code_hi = from->exit_code_hi;
+ dst->exit_info_1 = from->exit_info_1;
+ dst->exit_info_2 = from->exit_info_2;
+ dst->exit_int_info = from->exit_int_info;
+ dst->exit_int_info_err = from->exit_int_info_err;
+ dst->nested_ctl = from->nested_ctl;
+ dst->event_inj = from->event_inj;
+ dst->event_inj_err = from->event_inj_err;
+ dst->nested_cr3 = from->nested_cr3;
+ dst->virt_ext = from->virt_ext;
+}
+
+static int nested_svm_vmexit(struct vcpu_svm *svm)
+{
+ struct vmcb *nested_vmcb;
+ struct vmcb *hsave = svm->nested.hsave;
+ struct vmcb *vmcb = svm->vmcb;
+ struct page *page;
+
+ trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
+ vmcb->control.exit_info_1,
+ vmcb->control.exit_info_2,
+ vmcb->control.exit_int_info,
+ vmcb->control.exit_int_info_err,
+ KVM_ISA_SVM);
+
+ nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
+ if (!nested_vmcb)
+ return 1;
+
+ /* Exit Guest-Mode */
+ leave_guest_mode(&svm->vcpu);
+ svm->nested.vmcb = 0;
+
+ /* Give the current vmcb to the guest */
+ disable_gif(svm);
+
+ nested_vmcb->save.es = vmcb->save.es;
+ nested_vmcb->save.cs = vmcb->save.cs;
+ nested_vmcb->save.ss = vmcb->save.ss;
+ nested_vmcb->save.ds = vmcb->save.ds;
+ nested_vmcb->save.gdtr = vmcb->save.gdtr;
+ nested_vmcb->save.idtr = vmcb->save.idtr;
+ nested_vmcb->save.efer = svm->vcpu.arch.efer;
+ nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
+ nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
+ nested_vmcb->save.cr2 = vmcb->save.cr2;
+ nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
+ nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
+ nested_vmcb->save.rip = vmcb->save.rip;
+ nested_vmcb->save.rsp = vmcb->save.rsp;
+ nested_vmcb->save.rax = vmcb->save.rax;
+ nested_vmcb->save.dr7 = vmcb->save.dr7;
+ nested_vmcb->save.dr6 = vmcb->save.dr6;
+ nested_vmcb->save.cpl = vmcb->save.cpl;
+
+ nested_vmcb->control.int_ctl = vmcb->control.int_ctl;
+ nested_vmcb->control.int_vector = vmcb->control.int_vector;
+ nested_vmcb->control.int_state = vmcb->control.int_state;
+ nested_vmcb->control.exit_code = vmcb->control.exit_code;
+ nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi;
+ nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1;
+ nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
+ nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
+ nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
+
+ if (svm->nrips_enabled)
+ nested_vmcb->control.next_rip = vmcb->control.next_rip;
+
+ /*
+ * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
+ * to make sure that we do not lose injected events. So check event_inj
+ * here and copy it to exit_int_info if it is valid.
+ * Exit_int_info and event_inj can't be both valid because the case
+ * below only happens on a VMRUN instruction intercept which has
+ * no valid exit_int_info set.
+ */
+ if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
+ struct vmcb_control_area *nc = &nested_vmcb->control;
+
+ nc->exit_int_info = vmcb->control.event_inj;
+ nc->exit_int_info_err = vmcb->control.event_inj_err;
+ }
+
+ nested_vmcb->control.tlb_ctl = 0;
+ nested_vmcb->control.event_inj = 0;
+ nested_vmcb->control.event_inj_err = 0;
+
+ /* We always set V_INTR_MASKING and remember the old value in hflags */
+ if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
+ nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
+
+ /* Restore the original control entries */
+ copy_vmcb_control_area(vmcb, hsave);
+
+ svm->vcpu.arch.tsc_offset = svm->vmcb->control.tsc_offset;
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
+
+ svm->nested.nested_cr3 = 0;
+
+ /* Restore selected save entries */
+ svm->vmcb->save.es = hsave->save.es;
+ svm->vmcb->save.cs = hsave->save.cs;
+ svm->vmcb->save.ss = hsave->save.ss;
+ svm->vmcb->save.ds = hsave->save.ds;
+ svm->vmcb->save.gdtr = hsave->save.gdtr;
+ svm->vmcb->save.idtr = hsave->save.idtr;
+ kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
+ svm_set_efer(&svm->vcpu, hsave->save.efer);
+ svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
+ svm_set_cr4(&svm->vcpu, hsave->save.cr4);
+ if (npt_enabled) {
+ svm->vmcb->save.cr3 = hsave->save.cr3;
+ svm->vcpu.arch.cr3 = hsave->save.cr3;
+ } else {
+ (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
+ }
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
+ svm->vmcb->save.dr7 = 0;
+ svm->vmcb->save.cpl = 0;
+ svm->vmcb->control.exit_int_info = 0;
+
+ mark_all_dirty(svm->vmcb);
+
+ nested_svm_unmap(page);
+
+ nested_svm_uninit_mmu_context(&svm->vcpu);
+ kvm_mmu_reset_context(&svm->vcpu);
+ kvm_mmu_load(&svm->vcpu);
+
+ /*
+ * Drop what we picked up for L2 via svm_complete_interrupts() so it
+ * doesn't end up in L1.
+ */
+ svm->vcpu.arch.nmi_injected = false;
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
+
+ return 0;
+}
+
+static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
+{
+ /*
+ * This function merges the msr permission bitmaps of kvm and the
+ * nested vmcb. It is optimized in that it only merges the parts where
+ * the kvm msr permission bitmap may contain zero bits
+ */
+ int i;
+
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+ return true;
+
+ for (i = 0; i < MSRPM_OFFSETS; i++) {
+ u32 value, p;
+ u64 offset;
+
+ if (msrpm_offsets[i] == 0xffffffff)
+ break;
+
+ p = msrpm_offsets[i];
+ offset = svm->nested.vmcb_msrpm + (p * 4);
+
+ if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
+ return false;
+
+ svm->nested.msrpm[p] = svm->msrpm[p] | value;
+ }
+
+ svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
+
+ return true;
+}
+
+static bool nested_vmcb_checks(struct vmcb *vmcb)
+{
+ if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
+ return false;
+
+ if (vmcb->control.asid == 0)
+ return false;
+
+ if ((vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
+ !npt_enabled)
+ return false;
+
+ return true;
+}
+
+static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
+ struct vmcb *nested_vmcb, struct page *page)
+{
+ if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
+ svm->vcpu.arch.hflags |= HF_HIF_MASK;
+ else
+ svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
+
+ if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) {
+ kvm_mmu_unload(&svm->vcpu);
+ svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
+ nested_svm_init_mmu_context(&svm->vcpu);
+ }
+
+ /* Load the nested guest state */
+ svm->vmcb->save.es = nested_vmcb->save.es;
+ svm->vmcb->save.cs = nested_vmcb->save.cs;
+ svm->vmcb->save.ss = nested_vmcb->save.ss;
+ svm->vmcb->save.ds = nested_vmcb->save.ds;
+ svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
+ svm->vmcb->save.idtr = nested_vmcb->save.idtr;
+ kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
+ svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
+ svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
+ svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
+ if (npt_enabled) {
+ svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
+ svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
+ } else
+ (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
+
+ /* Guest paging mode is active - reset mmu */
+ kvm_mmu_reset_context(&svm->vcpu);
+
+ svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
+
+ /* In case we don't even reach vcpu_run, the fields are not updated */
+ svm->vmcb->save.rax = nested_vmcb->save.rax;
+ svm->vmcb->save.rsp = nested_vmcb->save.rsp;
+ svm->vmcb->save.rip = nested_vmcb->save.rip;
+ svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
+ svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
+ svm->vmcb->save.cpl = nested_vmcb->save.cpl;
+
+ svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
+ svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
+
+ /* cache intercepts */
+ svm->nested.intercept_cr = nested_vmcb->control.intercept_cr;
+ svm->nested.intercept_dr = nested_vmcb->control.intercept_dr;
+ svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
+ svm->nested.intercept = nested_vmcb->control.intercept;
+
+ svm_flush_tlb(&svm->vcpu, true);
+
+ svm->vmcb->control.int_ctl &=
+ V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK;
+
+ svm->vmcb->control.int_ctl |= nested_vmcb->control.int_ctl &
+ (V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK);
+
+ if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
+ svm->vcpu.arch.hflags |= HF_VINTR_MASK;
+ else
+ svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
+
+ if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
+ /* We only want the cr8 intercept bits of the guest */
+ clr_cr_intercept(svm, INTERCEPT_CR8_READ);
+ clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+ }
+
+ /* We don't want to see VMMCALLs from a nested guest */
+ clr_intercept(svm, INTERCEPT_VMMCALL);
+
+ svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset;
+ svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+
+ svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext;
+ svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
+ svm->vmcb->control.int_state = nested_vmcb->control.int_state;
+ svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
+ svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
+
+ nested_svm_unmap(page);
+
+ /* Enter Guest-Mode */
+ enter_guest_mode(&svm->vcpu);
+
+ /*
+ * Merge guest and host intercepts - must be called with vcpu in
+ * guest-mode to take affect here
+ */
+ recalc_intercepts(svm);
+
+ svm->nested.vmcb = vmcb_gpa;
+
+ enable_gif(svm);
+
+ mark_all_dirty(svm->vmcb);
+}
+
+static bool nested_svm_vmrun(struct vcpu_svm *svm)
+{
+ struct vmcb *nested_vmcb;
+ struct vmcb *hsave = svm->nested.hsave;
+ struct vmcb *vmcb = svm->vmcb;
+ struct page *page;
+ u64 vmcb_gpa;
+
+ vmcb_gpa = svm->vmcb->save.rax;
+
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
+ if (!nested_vmcb)
+ return false;
+
+ if (!nested_vmcb_checks(nested_vmcb)) {
+ nested_vmcb->control.exit_code = SVM_EXIT_ERR;
+ nested_vmcb->control.exit_code_hi = 0;
+ nested_vmcb->control.exit_info_1 = 0;
+ nested_vmcb->control.exit_info_2 = 0;
+
+ nested_svm_unmap(page);
+
+ return false;
+ }
+
+ trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
+ nested_vmcb->save.rip,
+ nested_vmcb->control.int_ctl,
+ nested_vmcb->control.event_inj,
+ nested_vmcb->control.nested_ctl);
+
+ trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
+ nested_vmcb->control.intercept_cr >> 16,
+ nested_vmcb->control.intercept_exceptions,
+ nested_vmcb->control.intercept);
+
+ /* Clear internal status */
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
+
+ /*
+ * Save the old vmcb, so we don't need to pick what we save, but can
+ * restore everything when a VMEXIT occurs
+ */
+ hsave->save.es = vmcb->save.es;
+ hsave->save.cs = vmcb->save.cs;
+ hsave->save.ss = vmcb->save.ss;
+ hsave->save.ds = vmcb->save.ds;
+ hsave->save.gdtr = vmcb->save.gdtr;
+ hsave->save.idtr = vmcb->save.idtr;
+ hsave->save.efer = svm->vcpu.arch.efer;
+ hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
+ hsave->save.cr4 = svm->vcpu.arch.cr4;
+ hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
+ hsave->save.rip = kvm_rip_read(&svm->vcpu);
+ hsave->save.rsp = vmcb->save.rsp;
+ hsave->save.rax = vmcb->save.rax;
+ if (npt_enabled)
+ hsave->save.cr3 = vmcb->save.cr3;
+ else
+ hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
+
+ copy_vmcb_control_area(hsave, vmcb);
+
+ enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page);
+
+ return true;
+}
+
+static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+{
+ to_vmcb->save.fs = from_vmcb->save.fs;
+ to_vmcb->save.gs = from_vmcb->save.gs;
+ to_vmcb->save.tr = from_vmcb->save.tr;
+ to_vmcb->save.ldtr = from_vmcb->save.ldtr;
+ to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
+ to_vmcb->save.star = from_vmcb->save.star;
+ to_vmcb->save.lstar = from_vmcb->save.lstar;
+ to_vmcb->save.cstar = from_vmcb->save.cstar;
+ to_vmcb->save.sfmask = from_vmcb->save.sfmask;
+ to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
+ to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
+ to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
+}
+
+static int vmload_interception(struct vcpu_svm *svm)
+{
+ struct vmcb *nested_vmcb;
+ struct page *page;
+ int ret;
+
+ if (nested_svm_check_permissions(svm))
+ return 1;
+
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
+ if (!nested_vmcb)
+ return 1;
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ ret = kvm_skip_emulated_instruction(&svm->vcpu);
+
+ nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
+ nested_svm_unmap(page);
+
+ return ret;
+}
+
+static int vmsave_interception(struct vcpu_svm *svm)
+{
+ struct vmcb *nested_vmcb;
+ struct page *page;
+ int ret;
+
+ if (nested_svm_check_permissions(svm))
+ return 1;
+
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
+ if (!nested_vmcb)
+ return 1;
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ ret = kvm_skip_emulated_instruction(&svm->vcpu);
+
+ nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
+ nested_svm_unmap(page);
+
+ return ret;
+}
+
+static int vmrun_interception(struct vcpu_svm *svm)
+{
+ if (nested_svm_check_permissions(svm))
+ return 1;
+
+ /* Save rip after vmrun instruction */
+ kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
+
+ if (!nested_svm_vmrun(svm))
+ return 1;
+
+ if (!nested_svm_vmrun_msrpm(svm))
+ goto failed;
+
+ return 1;
+
+failed:
+
+ svm->vmcb->control.exit_code = SVM_EXIT_ERR;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+
+ nested_svm_vmexit(svm);
+
+ return 1;
+}
+
+static int stgi_interception(struct vcpu_svm *svm)
+{
+ int ret;
+
+ if (nested_svm_check_permissions(svm))
+ return 1;
+
+ /*
+ * If VGIF is enabled, the STGI intercept is only added to
+ * detect the opening of the SMI/NMI window; remove it now.
+ */
+ if (vgif_enabled(svm))
+ clr_intercept(svm, INTERCEPT_STGI);
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ ret = kvm_skip_emulated_instruction(&svm->vcpu);
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+
+ enable_gif(svm);
+
+ return ret;
+}
+
+static int clgi_interception(struct vcpu_svm *svm)
+{
+ int ret;
+
+ if (nested_svm_check_permissions(svm))
+ return 1;
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ ret = kvm_skip_emulated_instruction(&svm->vcpu);
+
+ disable_gif(svm);
+
+ /* After a CLGI no interrupts should come */
+ if (!kvm_vcpu_apicv_active(&svm->vcpu)) {
+ svm_clear_vintr(svm);
+ svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+ mark_dirty(svm->vmcb, VMCB_INTR);
+ }
+
+ return ret;
+}
+
+static int invlpga_interception(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX),
+ kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
+
+ /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
+ kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ return kvm_skip_emulated_instruction(&svm->vcpu);
+}
+
+static int skinit_interception(struct vcpu_svm *svm)
+{
+ trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
+
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+}
+
+static int wbinvd_interception(struct vcpu_svm *svm)
+{
+ return kvm_emulate_wbinvd(&svm->vcpu);
+}
+
+static int xsetbv_interception(struct vcpu_svm *svm)
+{
+ u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
+ u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
+
+ if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ return kvm_skip_emulated_instruction(&svm->vcpu);
+ }
+
+ return 1;
+}
+
+static int task_switch_interception(struct vcpu_svm *svm)
+{
+ u16 tss_selector;
+ int reason;
+ int int_type = svm->vmcb->control.exit_int_info &
+ SVM_EXITINTINFO_TYPE_MASK;
+ int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
+ uint32_t type =
+ svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
+ uint32_t idt_v =
+ svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
+ bool has_error_code = false;
+ u32 error_code = 0;
+
+ tss_selector = (u16)svm->vmcb->control.exit_info_1;
+
+ if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
+ reason = TASK_SWITCH_IRET;
+ else if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
+ reason = TASK_SWITCH_JMP;
+ else if (idt_v)
+ reason = TASK_SWITCH_GATE;
+ else
+ reason = TASK_SWITCH_CALL;
+
+ if (reason == TASK_SWITCH_GATE) {
+ switch (type) {
+ case SVM_EXITINTINFO_TYPE_NMI:
+ svm->vcpu.arch.nmi_injected = false;
+ break;
+ case SVM_EXITINTINFO_TYPE_EXEPT:
+ if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
+ has_error_code = true;
+ error_code =
+ (u32)svm->vmcb->control.exit_info_2;
+ }
+ kvm_clear_exception_queue(&svm->vcpu);
+ break;
+ case SVM_EXITINTINFO_TYPE_INTR:
+ kvm_clear_interrupt_queue(&svm->vcpu);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (reason != TASK_SWITCH_GATE ||
+ int_type == SVM_EXITINTINFO_TYPE_SOFT ||
+ (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
+ (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
+ skip_emulated_instruction(&svm->vcpu);
+
+ if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
+ int_vec = -1;
+
+ if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
+ has_error_code, error_code) == EMULATE_FAIL) {
+ svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ svm->vcpu.run->internal.ndata = 0;
+ return 0;
+ }
+ return 1;
+}
+
+static int cpuid_interception(struct vcpu_svm *svm)
+{
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
+ return kvm_emulate_cpuid(&svm->vcpu);
+}
+
+static int iret_interception(struct vcpu_svm *svm)
+{
+ ++svm->vcpu.stat.nmi_window_exits;
+ clr_intercept(svm, INTERCEPT_IRET);
+ svm->vcpu.arch.hflags |= HF_IRET_MASK;
+ svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ return 1;
+}
+
+static int invd_interception(struct vcpu_svm *svm)
+{
+ /* Treat an INVD instruction as a NOP and just skip it. */
+ return kvm_skip_emulated_instruction(&svm->vcpu);
+}
+
+static int invlpg_interception(struct vcpu_svm *svm)
+{
+ if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+
+ kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
+ return kvm_skip_emulated_instruction(&svm->vcpu);
+}
+
+static int emulate_on_interception(struct vcpu_svm *svm)
+{
+ return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+}
+
+static int rsm_interception(struct vcpu_svm *svm)
+{
+ return kvm_emulate_instruction_from_buffer(&svm->vcpu,
+ rsm_ins_bytes, 2) == EMULATE_DONE;
+}
+
+static int rdpmc_interception(struct vcpu_svm *svm)
+{
+ int err;
+
+ if (!static_cpu_has(X86_FEATURE_NRIPS))
+ return emulate_on_interception(svm);
+
+ err = kvm_rdpmc(&svm->vcpu);
+ return kvm_complete_insn_gp(&svm->vcpu, err);
+}
+
+static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
+ unsigned long val)
+{
+ unsigned long cr0 = svm->vcpu.arch.cr0;
+ bool ret = false;
+ u64 intercept;
+
+ intercept = svm->nested.intercept;
+
+ if (!is_guest_mode(&svm->vcpu) ||
+ (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
+ return false;
+
+ cr0 &= ~SVM_CR0_SELECTIVE_MASK;
+ val &= ~SVM_CR0_SELECTIVE_MASK;
+
+ if (cr0 ^ val) {
+ svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+ ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
+ }
+
+ return ret;
+}
+
+#define CR_VALID (1ULL << 63)
+
+static int cr_interception(struct vcpu_svm *svm)
+{
+ int reg, cr;
+ unsigned long val;
+ int err;
+
+ if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_on_interception(svm);
+
+ if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
+ return emulate_on_interception(svm);
+
+ reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+ if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
+ cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
+ else
+ cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
+
+ err = 0;
+ if (cr >= 16) { /* mov to cr */
+ cr -= 16;
+ val = kvm_register_readl(&svm->vcpu, reg);
+ switch (cr) {
+ case 0:
+ if (!check_selective_cr0_intercepted(svm, val))
+ err = kvm_set_cr0(&svm->vcpu, val);
+ else
+ return 1;
+
+ break;
+ case 3:
+ err = kvm_set_cr3(&svm->vcpu, val);
+ break;
+ case 4:
+ err = kvm_set_cr4(&svm->vcpu, val);
+ break;
+ case 8:
+ err = kvm_set_cr8(&svm->vcpu, val);
+ break;
+ default:
+ WARN(1, "unhandled write to CR%d", cr);
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+ }
+ } else { /* mov from cr */
+ switch (cr) {
+ case 0:
+ val = kvm_read_cr0(&svm->vcpu);
+ break;
+ case 2:
+ val = svm->vcpu.arch.cr2;
+ break;
+ case 3:
+ val = kvm_read_cr3(&svm->vcpu);
+ break;
+ case 4:
+ val = kvm_read_cr4(&svm->vcpu);
+ break;
+ case 8:
+ val = kvm_get_cr8(&svm->vcpu);
+ break;
+ default:
+ WARN(1, "unhandled read from CR%d", cr);
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+ }
+ kvm_register_writel(&svm->vcpu, reg, val);
+ }
+ return kvm_complete_insn_gp(&svm->vcpu, err);
+}
+
+static int dr_interception(struct vcpu_svm *svm)
+{
+ int reg, dr;
+ unsigned long val;
+
+ if (svm->vcpu.guest_debug == 0) {
+ /*
+ * No more DR vmexits; force a reload of the debug registers
+ * and reenter on this instruction. The next vmexit will
+ * retrieve the full state of the debug registers.
+ */
+ clr_dr_intercepts(svm);
+ svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+ return 1;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_on_interception(svm);
+
+ reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+ dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
+
+ if (dr >= 16) { /* mov to DRn */
+ if (!kvm_require_dr(&svm->vcpu, dr - 16))
+ return 1;
+ val = kvm_register_readl(&svm->vcpu, reg);
+ kvm_set_dr(&svm->vcpu, dr - 16, val);
+ } else {
+ if (!kvm_require_dr(&svm->vcpu, dr))
+ return 1;
+ kvm_get_dr(&svm->vcpu, dr, &val);
+ kvm_register_writel(&svm->vcpu, reg, val);
+ }
+
+ return kvm_skip_emulated_instruction(&svm->vcpu);
+}
+
+static int cr8_write_interception(struct vcpu_svm *svm)
+{
+ struct kvm_run *kvm_run = svm->vcpu.run;
+ int r;
+
+ u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
+ /* instruction emulation calls kvm_set_cr8() */
+ r = cr_interception(svm);
+ if (lapic_in_kernel(&svm->vcpu))
+ return r;
+ if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
+ return r;
+ kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+ return 0;
+}
+
+static int svm_get_msr_feature(struct kvm_msr_entry *msr)
+{
+ msr->data = 0;
+
+ switch (msr->index) {
+ case MSR_F10H_DECFG:
+ if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
+ msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
+ break;
+ default:
+ return 1;
+ }
+
+ return 0;
+}
+
+static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ switch (msr_info->index) {
+ case MSR_STAR:
+ msr_info->data = svm->vmcb->save.star;
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_LSTAR:
+ msr_info->data = svm->vmcb->save.lstar;
+ break;
+ case MSR_CSTAR:
+ msr_info->data = svm->vmcb->save.cstar;
+ break;
+ case MSR_KERNEL_GS_BASE:
+ msr_info->data = svm->vmcb->save.kernel_gs_base;
+ break;
+ case MSR_SYSCALL_MASK:
+ msr_info->data = svm->vmcb->save.sfmask;
+ break;
+#endif
+ case MSR_IA32_SYSENTER_CS:
+ msr_info->data = svm->vmcb->save.sysenter_cs;
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ msr_info->data = svm->sysenter_eip;
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ msr_info->data = svm->sysenter_esp;
+ break;
+ case MSR_TSC_AUX:
+ if (!boot_cpu_has(X86_FEATURE_RDTSCP))
+ return 1;
+ msr_info->data = svm->tsc_aux;
+ break;
+ /*
+ * Nobody will change the following 5 values in the VMCB so we can
+ * safely return them on rdmsr. They will always be 0 until LBRV is
+ * implemented.
+ */
+ case MSR_IA32_DEBUGCTLMSR:
+ msr_info->data = svm->vmcb->save.dbgctl;
+ break;
+ case MSR_IA32_LASTBRANCHFROMIP:
+ msr_info->data = svm->vmcb->save.br_from;
+ break;
+ case MSR_IA32_LASTBRANCHTOIP:
+ msr_info->data = svm->vmcb->save.br_to;
+ break;
+ case MSR_IA32_LASTINTFROMIP:
+ msr_info->data = svm->vmcb->save.last_excp_from;
+ break;
+ case MSR_IA32_LASTINTTOIP:
+ msr_info->data = svm->vmcb->save.last_excp_to;
+ break;
+ case MSR_VM_HSAVE_PA:
+ msr_info->data = svm->nested.hsave_msr;
+ break;
+ case MSR_VM_CR:
+ msr_info->data = svm->nested.vm_cr_msr;
+ break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_has_spec_ctrl_msr(vcpu))
+ return 1;
+
+ msr_info->data = svm->spec_ctrl;
+ break;
+ case MSR_AMD64_VIRT_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
+ return 1;
+
+ msr_info->data = svm->virt_spec_ctrl;
+ break;
+ case MSR_F15H_IC_CFG: {
+
+ int family, model;
+
+ family = guest_cpuid_family(vcpu);
+ model = guest_cpuid_model(vcpu);
+
+ if (family < 0 || model < 0)
+ return kvm_get_msr_common(vcpu, msr_info);
+
+ msr_info->data = 0;
+
+ if (family == 0x15 &&
+ (model >= 0x2 && model < 0x20))
+ msr_info->data = 0x1E;
+ }
+ break;
+ case MSR_F10H_DECFG:
+ msr_info->data = svm->msr_decfg;
+ break;
+ default:
+ return kvm_get_msr_common(vcpu, msr_info);
+ }
+ return 0;
+}
+
+static int rdmsr_interception(struct vcpu_svm *svm)
+{
+ u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
+ struct msr_data msr_info;
+
+ msr_info.index = ecx;
+ msr_info.host_initiated = false;
+ if (svm_get_msr(&svm->vcpu, &msr_info)) {
+ trace_kvm_msr_read_ex(ecx);
+ kvm_inject_gp(&svm->vcpu, 0);
+ return 1;
+ } else {
+ trace_kvm_msr_read(ecx, msr_info.data);
+
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RAX,
+ msr_info.data & 0xffffffff);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RDX,
+ msr_info.data >> 32);
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
+ return kvm_skip_emulated_instruction(&svm->vcpu);
+ }
+}
+
+static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int svm_dis, chg_mask;
+
+ if (data & ~SVM_VM_CR_VALID_MASK)
+ return 1;
+
+ chg_mask = SVM_VM_CR_VALID_MASK;
+
+ if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
+ chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
+
+ svm->nested.vm_cr_msr &= ~chg_mask;
+ svm->nested.vm_cr_msr |= (data & chg_mask);
+
+ svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
+
+ /* check for svm_disable while efer.svme is set */
+ if (svm_dis && (vcpu->arch.efer & EFER_SVME))
+ return 1;
+
+ return 0;
+}
+
+static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ u32 ecx = msr->index;
+ u64 data = msr->data;
+ switch (ecx) {
+ case MSR_IA32_CR_PAT:
+ if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
+ return 1;
+ vcpu->arch.pat = data;
+ svm->vmcb->save.g_pat = data;
+ mark_dirty(svm->vmcb, VMCB_NPT);
+ break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr->host_initiated &&
+ !guest_has_spec_ctrl_msr(vcpu))
+ return 1;
+
+ /* The STIBP bit doesn't fault even if it's not advertised */
+ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
+ return 1;
+
+ svm->spec_ctrl = data;
+
+ if (!data)
+ break;
+
+ /*
+ * For non-nested:
+ * When it's written (to non-zero) for the first time, pass
+ * it through.
+ *
+ * For nested:
+ * The handling of the MSR bitmap for L2 guests is done in
+ * nested_svm_vmrun_msrpm.
+ * We update the L1 MSR bit as well since it will end up
+ * touching the MSR anyway now.
+ */
+ set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+ break;
+ case MSR_IA32_PRED_CMD:
+ if (!msr->host_initiated &&
+ !guest_has_pred_cmd_msr(vcpu))
+ return 1;
+
+ if (data & ~PRED_CMD_IBPB)
+ return 1;
+ if (!data)
+ break;
+
+ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+ if (is_guest_mode(vcpu))
+ break;
+ set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
+ break;
+ case MSR_AMD64_VIRT_SPEC_CTRL:
+ if (!msr->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
+ return 1;
+
+ if (data & ~SPEC_CTRL_SSBD)
+ return 1;
+
+ svm->virt_spec_ctrl = data;
+ break;
+ case MSR_STAR:
+ svm->vmcb->save.star = data;
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_LSTAR:
+ svm->vmcb->save.lstar = data;
+ break;
+ case MSR_CSTAR:
+ svm->vmcb->save.cstar = data;
+ break;
+ case MSR_KERNEL_GS_BASE:
+ svm->vmcb->save.kernel_gs_base = data;
+ break;
+ case MSR_SYSCALL_MASK:
+ svm->vmcb->save.sfmask = data;
+ break;
+#endif
+ case MSR_IA32_SYSENTER_CS:
+ svm->vmcb->save.sysenter_cs = data;
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ svm->sysenter_eip = data;
+ svm->vmcb->save.sysenter_eip = data;
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ svm->sysenter_esp = data;
+ svm->vmcb->save.sysenter_esp = data;
+ break;
+ case MSR_TSC_AUX:
+ if (!boot_cpu_has(X86_FEATURE_RDTSCP))
+ return 1;
+
+ /*
+ * This is rare, so we update the MSR here instead of using
+ * direct_access_msrs. Doing that would require a rdmsr in
+ * svm_vcpu_put.
+ */
+ svm->tsc_aux = data;
+ wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
+ break;
+ case MSR_IA32_DEBUGCTLMSR:
+ if (!boot_cpu_has(X86_FEATURE_LBRV)) {
+ vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
+ __func__, data);
+ break;
+ }
+ if (data & DEBUGCTL_RESERVED_BITS)
+ return 1;
+
+ svm->vmcb->save.dbgctl = data;
+ mark_dirty(svm->vmcb, VMCB_LBR);
+ if (data & (1ULL<<0))
+ svm_enable_lbrv(svm);
+ else
+ svm_disable_lbrv(svm);
+ break;
+ case MSR_VM_HSAVE_PA:
+ svm->nested.hsave_msr = data;
+ break;
+ case MSR_VM_CR:
+ return svm_set_vm_cr(vcpu, data);
+ case MSR_VM_IGNNE:
+ vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
+ break;
+ case MSR_F10H_DECFG: {
+ struct kvm_msr_entry msr_entry;
+
+ msr_entry.index = msr->index;
+ if (svm_get_msr_feature(&msr_entry))
+ return 1;
+
+ /* Check the supported bits */
+ if (data & ~msr_entry.data)
+ return 1;
+
+ /* Don't allow the guest to change a bit, #GP */
+ if (!msr->host_initiated && (data ^ msr_entry.data))
+ return 1;
+
+ svm->msr_decfg = data;
+ break;
+ }
+ case MSR_IA32_APICBASE:
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_update_vapic_bar(to_svm(vcpu), data);
+ /* Follow through */
+ default:
+ return kvm_set_msr_common(vcpu, msr);
+ }
+ return 0;
+}
+
+static int wrmsr_interception(struct vcpu_svm *svm)
+{
+ struct msr_data msr;
+ u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
+ u64 data = kvm_read_edx_eax(&svm->vcpu);
+
+ msr.data = data;
+ msr.index = ecx;
+ msr.host_initiated = false;
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
+ if (kvm_set_msr(&svm->vcpu, &msr)) {
+ trace_kvm_msr_write_ex(ecx, data);
+ kvm_inject_gp(&svm->vcpu, 0);
+ return 1;
+ } else {
+ trace_kvm_msr_write(ecx, data);
+ return kvm_skip_emulated_instruction(&svm->vcpu);
+ }
+}
+
+static int msr_interception(struct vcpu_svm *svm)
+{
+ if (svm->vmcb->control.exit_info_1)
+ return wrmsr_interception(svm);
+ else
+ return rdmsr_interception(svm);
+}
+
+static int interrupt_window_interception(struct vcpu_svm *svm)
+{
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ svm_clear_vintr(svm);
+ svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+ mark_dirty(svm->vmcb, VMCB_INTR);
+ ++svm->vcpu.stat.irq_window_exits;
+ return 1;
+}
+
+static int pause_interception(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ bool in_kernel = (svm_get_cpl(vcpu) == 0);
+
+ if (pause_filter_thresh)
+ grow_ple_window(vcpu);
+
+ kvm_vcpu_on_spin(vcpu, in_kernel);
+ return 1;
+}
+
+static int nop_interception(struct vcpu_svm *svm)
+{
+ return kvm_skip_emulated_instruction(&(svm->vcpu));
+}
+
+static int monitor_interception(struct vcpu_svm *svm)
+{
+ printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
+ return nop_interception(svm);
+}
+
+static int mwait_interception(struct vcpu_svm *svm)
+{
+ printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
+ return nop_interception(svm);
+}
+
+enum avic_ipi_failure_cause {
+ AVIC_IPI_FAILURE_INVALID_INT_TYPE,
+ AVIC_IPI_FAILURE_TARGET_NOT_RUNNING,
+ AVIC_IPI_FAILURE_INVALID_TARGET,
+ AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
+};
+
+static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
+{
+ u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
+ u32 icrl = svm->vmcb->control.exit_info_1;
+ u32 id = svm->vmcb->control.exit_info_2 >> 32;
+ u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
+ struct kvm_lapic *apic = svm->vcpu.arch.apic;
+
+ trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
+
+ switch (id) {
+ case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
+ /*
+ * AVIC hardware handles the generation of
+ * IPIs when the specified Message Type is Fixed
+ * (also known as fixed delivery mode) and
+ * the Trigger Mode is edge-triggered. The hardware
+ * also supports self and broadcast delivery modes
+ * specified via the Destination Shorthand(DSH)
+ * field of the ICRL. Logical and physical APIC ID
+ * formats are supported. All other IPI types cause
+ * a #VMEXIT, which needs to emulated.
+ */
+ kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
+ kvm_lapic_reg_write(apic, APIC_ICR, icrl);
+ break;
+ case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
+ int i;
+ struct kvm_vcpu *vcpu;
+ struct kvm *kvm = svm->vcpu.kvm;
+ struct kvm_lapic *apic = svm->vcpu.arch.apic;
+
+ /*
+ * At this point, we expect that the AVIC HW has already
+ * set the appropriate IRR bits on the valid target
+ * vcpus. So, we just need to kick the appropriate vcpu.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ bool m = kvm_apic_match_dest(vcpu, apic,
+ icrl & KVM_APIC_SHORT_MASK,
+ GET_APIC_DEST_FIELD(icrh),
+ icrl & KVM_APIC_DEST_MASK);
+
+ if (m && !avic_vcpu_is_running(vcpu))
+ kvm_vcpu_wake_up(vcpu);
+ }
+ break;
+ }
+ case AVIC_IPI_FAILURE_INVALID_TARGET:
+ break;
+ case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
+ WARN_ONCE(1, "Invalid backing page\n");
+ break;
+ default:
+ pr_err("Unknown IPI interception\n");
+ }
+
+ return 1;
+}
+
+static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+ int index;
+ u32 *logical_apic_id_table;
+ int dlid = GET_APIC_LOGICAL_ID(ldr);
+
+ if (!dlid)
+ return NULL;
+
+ if (flat) { /* flat */
+ index = ffs(dlid) - 1;
+ if (index > 7)
+ return NULL;
+ } else { /* cluster */
+ int cluster = (dlid & 0xf0) >> 4;
+ int apic = ffs(dlid & 0x0f) - 1;
+
+ if ((apic < 0) || (apic > 7) ||
+ (cluster >= 0xf))
+ return NULL;
+ index = (cluster << 2) + apic;
+ }
+
+ logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
+
+ return &logical_apic_id_table[index];
+}
+
+static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr,
+ bool valid)
+{
+ bool flat;
+ u32 *entry, new_entry;
+
+ flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
+ entry = avic_get_logical_id_entry(vcpu, ldr, flat);
+ if (!entry)
+ return -EINVAL;
+
+ new_entry = READ_ONCE(*entry);
+ new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+ new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
+ if (valid)
+ new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
+ else
+ new_entry &= ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
+ WRITE_ONCE(*entry, new_entry);
+
+ return 0;
+}
+
+static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
+{
+ int ret;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
+
+ if (!ldr)
+ return 1;
+
+ ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr, true);
+ if (ret && svm->ldr_reg) {
+ avic_ldr_write(vcpu, 0, svm->ldr_reg, false);
+ svm->ldr_reg = 0;
+ } else {
+ svm->ldr_reg = ldr;
+ }
+ return ret;
+}
+
+static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
+{
+ u64 *old, *new;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID);
+ u32 id = (apic_id_reg >> 24) & 0xff;
+
+ if (vcpu->vcpu_id == id)
+ return 0;
+
+ old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id);
+ new = avic_get_physical_id_entry(vcpu, id);
+ if (!new || !old)
+ return 1;
+
+ /* We need to move physical_id_entry to new offset */
+ *new = *old;
+ *old = 0ULL;
+ to_svm(vcpu)->avic_physical_id_cache = new;
+
+ /*
+ * Also update the guest physical APIC ID in the logical
+ * APIC ID table entry if already setup the LDR.
+ */
+ if (svm->ldr_reg)
+ avic_handle_ldr_update(vcpu);
+
+ return 0;
+}
+
+static int avic_handle_dfr_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+ u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
+ u32 mod = (dfr >> 28) & 0xf;
+
+ /*
+ * We assume that all local APICs are using the same type.
+ * If this changes, we need to flush the AVIC logical
+ * APID id table.
+ */
+ if (kvm_svm->ldr_mode == mod)
+ return 0;
+
+ clear_page(page_address(kvm_svm->avic_logical_id_table_page));
+ kvm_svm->ldr_mode = mod;
+
+ if (svm->ldr_reg)
+ avic_handle_ldr_update(vcpu);
+ return 0;
+}
+
+static int avic_unaccel_trap_write(struct vcpu_svm *svm)
+{
+ struct kvm_lapic *apic = svm->vcpu.arch.apic;
+ u32 offset = svm->vmcb->control.exit_info_1 &
+ AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+
+ switch (offset) {
+ case APIC_ID:
+ if (avic_handle_apic_id_update(&svm->vcpu))
+ return 0;
+ break;
+ case APIC_LDR:
+ if (avic_handle_ldr_update(&svm->vcpu))
+ return 0;
+ break;
+ case APIC_DFR:
+ avic_handle_dfr_update(&svm->vcpu);
+ break;
+ default:
+ break;
+ }
+
+ kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
+
+ return 1;
+}
+
+static bool is_avic_unaccelerated_access_trap(u32 offset)
+{
+ bool ret = false;
+
+ switch (offset) {
+ case APIC_ID:
+ case APIC_EOI:
+ case APIC_RRR:
+ case APIC_LDR:
+ case APIC_DFR:
+ case APIC_SPIV:
+ case APIC_ESR:
+ case APIC_ICR:
+ case APIC_LVTT:
+ case APIC_LVTTHMR:
+ case APIC_LVTPC:
+ case APIC_LVT0:
+ case APIC_LVT1:
+ case APIC_LVTERR:
+ case APIC_TMICT:
+ case APIC_TDCR:
+ ret = true;
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+static int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
+{
+ int ret = 0;
+ u32 offset = svm->vmcb->control.exit_info_1 &
+ AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+ u32 vector = svm->vmcb->control.exit_info_2 &
+ AVIC_UNACCEL_ACCESS_VECTOR_MASK;
+ bool write = (svm->vmcb->control.exit_info_1 >> 32) &
+ AVIC_UNACCEL_ACCESS_WRITE_MASK;
+ bool trap = is_avic_unaccelerated_access_trap(offset);
+
+ trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
+ trap, write, vector);
+ if (trap) {
+ /* Handling Trap */
+ WARN_ONCE(!write, "svm: Handling trap read.\n");
+ ret = avic_unaccel_trap_write(svm);
+ } else {
+ /* Handling Fault */
+ ret = (kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE);
+ }
+
+ return ret;
+}
+
+static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
+ [SVM_EXIT_READ_CR0] = cr_interception,
+ [SVM_EXIT_READ_CR3] = cr_interception,
+ [SVM_EXIT_READ_CR4] = cr_interception,
+ [SVM_EXIT_READ_CR8] = cr_interception,
+ [SVM_EXIT_CR0_SEL_WRITE] = cr_interception,
+ [SVM_EXIT_WRITE_CR0] = cr_interception,
+ [SVM_EXIT_WRITE_CR3] = cr_interception,
+ [SVM_EXIT_WRITE_CR4] = cr_interception,
+ [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
+ [SVM_EXIT_READ_DR0] = dr_interception,
+ [SVM_EXIT_READ_DR1] = dr_interception,
+ [SVM_EXIT_READ_DR2] = dr_interception,
+ [SVM_EXIT_READ_DR3] = dr_interception,
+ [SVM_EXIT_READ_DR4] = dr_interception,
+ [SVM_EXIT_READ_DR5] = dr_interception,
+ [SVM_EXIT_READ_DR6] = dr_interception,
+ [SVM_EXIT_READ_DR7] = dr_interception,
+ [SVM_EXIT_WRITE_DR0] = dr_interception,
+ [SVM_EXIT_WRITE_DR1] = dr_interception,
+ [SVM_EXIT_WRITE_DR2] = dr_interception,
+ [SVM_EXIT_WRITE_DR3] = dr_interception,
+ [SVM_EXIT_WRITE_DR4] = dr_interception,
+ [SVM_EXIT_WRITE_DR5] = dr_interception,
+ [SVM_EXIT_WRITE_DR6] = dr_interception,
+ [SVM_EXIT_WRITE_DR7] = dr_interception,
+ [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
+ [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
+ [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
+ [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
+ [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
+ [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
+ [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception,
+ [SVM_EXIT_INTR] = intr_interception,
+ [SVM_EXIT_NMI] = nmi_interception,
+ [SVM_EXIT_SMI] = nop_on_interception,
+ [SVM_EXIT_INIT] = nop_on_interception,
+ [SVM_EXIT_VINTR] = interrupt_window_interception,
+ [SVM_EXIT_RDPMC] = rdpmc_interception,
+ [SVM_EXIT_CPUID] = cpuid_interception,
+ [SVM_EXIT_IRET] = iret_interception,
+ [SVM_EXIT_INVD] = invd_interception,
+ [SVM_EXIT_PAUSE] = pause_interception,
+ [SVM_EXIT_HLT] = halt_interception,
+ [SVM_EXIT_INVLPG] = invlpg_interception,
+ [SVM_EXIT_INVLPGA] = invlpga_interception,
+ [SVM_EXIT_IOIO] = io_interception,
+ [SVM_EXIT_MSR] = msr_interception,
+ [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
+ [SVM_EXIT_SHUTDOWN] = shutdown_interception,
+ [SVM_EXIT_VMRUN] = vmrun_interception,
+ [SVM_EXIT_VMMCALL] = vmmcall_interception,
+ [SVM_EXIT_VMLOAD] = vmload_interception,
+ [SVM_EXIT_VMSAVE] = vmsave_interception,
+ [SVM_EXIT_STGI] = stgi_interception,
+ [SVM_EXIT_CLGI] = clgi_interception,
+ [SVM_EXIT_SKINIT] = skinit_interception,
+ [SVM_EXIT_WBINVD] = wbinvd_interception,
+ [SVM_EXIT_MONITOR] = monitor_interception,
+ [SVM_EXIT_MWAIT] = mwait_interception,
+ [SVM_EXIT_XSETBV] = xsetbv_interception,
+ [SVM_EXIT_NPF] = npf_interception,
+ [SVM_EXIT_RSM] = rsm_interception,
+ [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
+ [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
+};
+
+static void dump_vmcb(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct vmcb_save_area *save = &svm->vmcb->save;
+
+ pr_err("VMCB Control Area:\n");
+ pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
+ pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
+ pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
+ pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
+ pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
+ pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
+ pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
+ pr_err("%-20s%d\n", "pause filter threshold:",
+ control->pause_filter_thresh);
+ pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
+ pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
+ pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
+ pr_err("%-20s%d\n", "asid:", control->asid);
+ pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
+ pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
+ pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
+ pr_err("%-20s%08x\n", "int_state:", control->int_state);
+ pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
+ pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
+ pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
+ pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
+ pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
+ pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
+ pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
+ pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
+ pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
+ pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
+ pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
+ pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
+ pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
+ pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
+ pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
+ pr_err("VMCB State Save Area:\n");
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "es:",
+ save->es.selector, save->es.attrib,
+ save->es.limit, save->es.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "cs:",
+ save->cs.selector, save->cs.attrib,
+ save->cs.limit, save->cs.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "ss:",
+ save->ss.selector, save->ss.attrib,
+ save->ss.limit, save->ss.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "ds:",
+ save->ds.selector, save->ds.attrib,
+ save->ds.limit, save->ds.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "fs:",
+ save->fs.selector, save->fs.attrib,
+ save->fs.limit, save->fs.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "gs:",
+ save->gs.selector, save->gs.attrib,
+ save->gs.limit, save->gs.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "gdtr:",
+ save->gdtr.selector, save->gdtr.attrib,
+ save->gdtr.limit, save->gdtr.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "ldtr:",
+ save->ldtr.selector, save->ldtr.attrib,
+ save->ldtr.limit, save->ldtr.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "idtr:",
+ save->idtr.selector, save->idtr.attrib,
+ save->idtr.limit, save->idtr.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "tr:",
+ save->tr.selector, save->tr.attrib,
+ save->tr.limit, save->tr.base);
+ pr_err("cpl: %d efer: %016llx\n",
+ save->cpl, save->efer);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "cr0:", save->cr0, "cr2:", save->cr2);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "cr3:", save->cr3, "cr4:", save->cr4);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "dr6:", save->dr6, "dr7:", save->dr7);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rip:", save->rip, "rflags:", save->rflags);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rsp:", save->rsp, "rax:", save->rax);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "star:", save->star, "lstar:", save->lstar);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "cstar:", save->cstar, "sfmask:", save->sfmask);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "kernel_gs_base:", save->kernel_gs_base,
+ "sysenter_cs:", save->sysenter_cs);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "sysenter_esp:", save->sysenter_esp,
+ "sysenter_eip:", save->sysenter_eip);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "br_from:", save->br_from, "br_to:", save->br_to);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "excp_from:", save->last_excp_from,
+ "excp_to:", save->last_excp_to);
+}
+
+static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+{
+ struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+
+ *info1 = control->exit_info_1;
+ *info2 = control->exit_info_2;
+}
+
+static int handle_exit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_run *kvm_run = vcpu->run;
+ u32 exit_code = svm->vmcb->control.exit_code;
+
+ trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
+
+ if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
+ vcpu->arch.cr0 = svm->vmcb->save.cr0;
+ if (npt_enabled)
+ vcpu->arch.cr3 = svm->vmcb->save.cr3;
+
+ if (unlikely(svm->nested.exit_required)) {
+ nested_svm_vmexit(svm);
+ svm->nested.exit_required = false;
+
+ return 1;
+ }
+
+ if (is_guest_mode(vcpu)) {
+ int vmexit;
+
+ trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
+ svm->vmcb->control.exit_info_1,
+ svm->vmcb->control.exit_info_2,
+ svm->vmcb->control.exit_int_info,
+ svm->vmcb->control.exit_int_info_err,
+ KVM_ISA_SVM);
+
+ vmexit = nested_svm_exit_special(svm);
+
+ if (vmexit == NESTED_EXIT_CONTINUE)
+ vmexit = nested_svm_exit_handled(svm);
+
+ if (vmexit == NESTED_EXIT_DONE)
+ return 1;
+ }
+
+ svm_complete_interrupts(svm);
+
+ if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
+ kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ kvm_run->fail_entry.hardware_entry_failure_reason
+ = svm->vmcb->control.exit_code;
+ pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
+ dump_vmcb(vcpu);
+ return 0;
+ }
+
+ if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
+ exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
+ exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
+ exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
+ printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
+ "exit_code 0x%x\n",
+ __func__, svm->vmcb->control.exit_int_info,
+ exit_code);
+
+ if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
+ || !svm_exit_handlers[exit_code]) {
+ WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ return svm_exit_handlers[exit_code](svm);
+}
+
+static void reload_tss(struct kvm_vcpu *vcpu)
+{
+ int cpu = raw_smp_processor_id();
+
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+ sd->tss_desc->type = 9; /* available 32/64-bit TSS */
+ load_TR_desc();
+}
+
+static void pre_sev_run(struct vcpu_svm *svm, int cpu)
+{
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+ int asid = sev_get_asid(svm->vcpu.kvm);
+
+ /* Assign the asid allocated with this SEV guest */
+ svm->vmcb->control.asid = asid;
+
+ /*
+ * Flush guest TLB:
+ *
+ * 1) when different VMCB for the same ASID is to be run on the same host CPU.
+ * 2) or this VMCB was executed on different host CPU in previous VMRUNs.
+ */
+ if (sd->sev_vmcbs[asid] == svm->vmcb &&
+ svm->last_cpu == cpu)
+ return;
+
+ svm->last_cpu = cpu;
+ sd->sev_vmcbs[asid] = svm->vmcb;
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+ mark_dirty(svm->vmcb, VMCB_ASID);
+}
+
+static void pre_svm_run(struct vcpu_svm *svm)
+{
+ int cpu = raw_smp_processor_id();
+
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+
+ if (sev_guest(svm->vcpu.kvm))
+ return pre_sev_run(svm, cpu);
+
+ /* FIXME: handle wraparound of asid_generation */
+ if (svm->asid_generation != sd->asid_generation)
+ new_asid(svm, sd);
+}
+
+static void svm_inject_nmi(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
+ vcpu->arch.hflags |= HF_NMI_MASK;
+ set_intercept(svm, INTERCEPT_IRET);
+ ++vcpu->stat.nmi_injections;
+}
+
+static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
+{
+ struct vmcb_control_area *control;
+
+ /* The following fields are ignored when AVIC is enabled */
+ control = &svm->vmcb->control;
+ control->int_vector = irq;
+ control->int_ctl &= ~V_INTR_PRIO_MASK;
+ control->int_ctl |= V_IRQ_MASK |
+ ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
+ mark_dirty(svm->vmcb, VMCB_INTR);
+}
+
+static void svm_set_irq(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ BUG_ON(!(gif_set(svm)));
+
+ trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
+ ++vcpu->stat.irq_injections;
+
+ svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
+ SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
+}
+
+static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu)
+{
+ return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK);
+}
+
+static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (svm_nested_virtualize_tpr(vcpu) ||
+ kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+
+ if (irr == -1)
+ return;
+
+ if (tpr >= irr)
+ set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+}
+
+static void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
+{
+ return;
+}
+
+static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu)
+{
+ return avic && irqchip_split(vcpu->kvm);
+}
+
+static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+{
+}
+
+static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+{
+}
+
+/* Note: Currently only used by Hyper-V. */
+static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (!kvm_vcpu_apicv_active(&svm->vcpu))
+ return;
+
+ vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
+ mark_dirty(vmcb, VMCB_INTR);
+}
+
+static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+ return;
+}
+
+static int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
+{
+ if (!vcpu->arch.apicv_active)
+ return -1;
+
+ kvm_lapic_set_irr(vec, vcpu->arch.apic);
+ smp_mb__after_atomic();
+
+ if (avic_vcpu_is_running(vcpu))
+ wrmsrl(SVM_AVIC_DOORBELL,
+ kvm_cpu_get_apicid(vcpu->cpu));
+ else
+ kvm_vcpu_wake_up(vcpu);
+
+ return 0;
+}
+
+static bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+
+static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+ unsigned long flags;
+ struct amd_svm_iommu_ir *cur;
+
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+ list_for_each_entry(cur, &svm->ir_list, node) {
+ if (cur->data != pi->ir_data)
+ continue;
+ list_del(&cur->node);
+ kfree(cur);
+ break;
+ }
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+ int ret = 0;
+ unsigned long flags;
+ struct amd_svm_iommu_ir *ir;
+
+ /**
+ * In some cases, the existing irte is updaed and re-set,
+ * so we need to check here if it's already been * added
+ * to the ir_list.
+ */
+ if (pi->ir_data && (pi->prev_ga_tag != 0)) {
+ struct kvm *kvm = svm->vcpu.kvm;
+ u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
+ struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+ struct vcpu_svm *prev_svm;
+
+ if (!prev_vcpu) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ prev_svm = to_svm(prev_vcpu);
+ svm_ir_list_del(prev_svm, pi);
+ }
+
+ /**
+ * Allocating new amd_iommu_pi_data, which will get
+ * add to the per-vcpu ir_list.
+ */
+ ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL);
+ if (!ir) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ir->data = pi->ir_data;
+
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+ list_add(&ir->node, &svm->ir_list);
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+out:
+ return ret;
+}
+
+/**
+ * Note:
+ * The HW cannot support posting multicast/broadcast
+ * interrupts to a vCPU. So, we still use legacy interrupt
+ * remapping for these kind of interrupts.
+ *
+ * For lowest-priority interrupts, we only support
+ * those with single CPU as the destination, e.g. user
+ * configures the interrupts via /proc/irq or uses
+ * irqbalance to make the interrupts single-CPU.
+ */
+static int
+get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
+ struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
+{
+ struct kvm_lapic_irq irq;
+ struct kvm_vcpu *vcpu = NULL;
+
+ kvm_set_msi_irq(kvm, e, &irq);
+
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+ pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
+ __func__, irq.vector);
+ return -1;
+ }
+
+ pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
+ irq.vector);
+ *svm = to_svm(vcpu);
+ vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
+ vcpu_info->vector = irq.vector;
+
+ return 0;
+}
+
+/*
+ * svm_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ struct kvm_kernel_irq_routing_entry *e;
+ struct kvm_irq_routing_table *irq_rt;
+ int idx, ret = -EINVAL;
+
+ if (!kvm_arch_has_assigned_device(kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return 0;
+
+ pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
+ __func__, host_irq, guest_irq, set);
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
+
+ hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+ struct vcpu_data vcpu_info;
+ struct vcpu_svm *svm = NULL;
+
+ if (e->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+
+ /**
+ * Here, we setup with legacy mode in the following cases:
+ * 1. When cannot target interrupt to a specific vcpu.
+ * 2. Unsetting posted interrupt.
+ * 3. APIC virtialization is disabled for the vcpu.
+ */
+ if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
+ kvm_vcpu_apicv_active(&svm->vcpu)) {
+ struct amd_iommu_pi_data pi;
+
+ /* Try to enable guest_mode in IRTE */
+ pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
+ AVIC_HPA_MASK);
+ pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
+ svm->vcpu.vcpu_id);
+ pi.is_guest_mode = true;
+ pi.vcpu_data = &vcpu_info;
+ ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+ /**
+ * Here, we successfully setting up vcpu affinity in
+ * IOMMU guest mode. Now, we need to store the posted
+ * interrupt information in a per-vcpu ir_list so that
+ * we can reference to them directly when we update vcpu
+ * scheduling information in IOMMU irte.
+ */
+ if (!ret && pi.is_guest_mode)
+ svm_ir_list_add(svm, &pi);
+ } else {
+ /* Use legacy mode in IRTE */
+ struct amd_iommu_pi_data pi;
+
+ /**
+ * Here, pi is used to:
+ * - Tell IOMMU to use legacy mode for this interrupt.
+ * - Retrieve ga_tag of prior interrupt remapping data.
+ */
+ pi.prev_ga_tag = 0;
+ pi.is_guest_mode = false;
+ ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+ /**
+ * Check if the posted interrupt was previously
+ * setup with the guest_mode by checking if the ga_tag
+ * was cached. If so, we need to clean up the per-vcpu
+ * ir_list.
+ */
+ if (!ret && pi.prev_ga_tag) {
+ int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
+ struct kvm_vcpu *vcpu;
+
+ vcpu = kvm_get_vcpu_by_id(kvm, id);
+ if (vcpu)
+ svm_ir_list_del(to_svm(vcpu), &pi);
+ }
+ }
+
+ if (!ret && svm) {
+ trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
+ e->gsi, vcpu_info.vector,
+ vcpu_info.pi_desc_addr, set);
+ }
+
+ if (ret < 0) {
+ pr_err("%s: failed to update PI IRTE\n", __func__);
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+ return ret;
+}
+
+static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+ int ret;
+ ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
+ !(svm->vcpu.arch.hflags & HF_NMI_MASK);
+ ret = ret && gif_set(svm) && nested_svm_nmi(svm);
+
+ return ret;
+}
+
+static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
+}
+
+static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (masked) {
+ svm->vcpu.arch.hflags |= HF_NMI_MASK;
+ set_intercept(svm, INTERCEPT_IRET);
+ } else {
+ svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
+ clr_intercept(svm, INTERCEPT_IRET);
+ }
+}
+
+static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+ int ret;
+
+ if (!gif_set(svm) ||
+ (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
+ return 0;
+
+ ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
+
+ if (is_guest_mode(vcpu))
+ return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
+
+ return ret;
+}
+
+static void enable_irq_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ /*
+ * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
+ * 1, because that's a separate STGI/VMRUN intercept. The next time we
+ * get that intercept, this function will be called again though and
+ * we'll get the vintr intercept. However, if the vGIF feature is
+ * enabled, the STGI interception will not occur. Enable the irq
+ * window under the assumption that the hardware will set the GIF.
+ */
+ if ((vgif_enabled(svm) || gif_set(svm)) && nested_svm_intr(svm)) {
+ svm_set_vintr(svm);
+ svm_inject_irq(svm, 0x0);
+ }
+}
+
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
+ == HF_NMI_MASK)
+ return; /* IRET will cause a vm exit */
+
+ if (!gif_set(svm)) {
+ if (vgif_enabled(svm))
+ set_intercept(svm, INTERCEPT_STGI);
+ return; /* STGI will cause a vm exit */
+ }
+
+ if (svm->nested.exit_required)
+ return; /* we're not going to run the guest yet */
+
+ /*
+ * Something prevents NMI from been injected. Single step over possible
+ * problem (IRET or exception injection or interrupt shadow)
+ */
+ svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
+ svm->nmi_singlestep = true;
+ svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+}
+
+static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
+{
+ return 0;
+}
+
+static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
+{
+ return 0;
+}
+
+static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+ else
+ svm->asid_generation--;
+}
+
+static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ invlpga(gva, svm->vmcb->control.asid);
+}
+
+static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (svm_nested_virtualize_tpr(vcpu))
+ return;
+
+ if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
+ int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
+ kvm_set_cr8(vcpu, cr8);
+ }
+}
+
+static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 cr8;
+
+ if (svm_nested_virtualize_tpr(vcpu) ||
+ kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ cr8 = kvm_get_cr8(vcpu);
+ svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
+ svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
+}
+
+static void svm_complete_interrupts(struct vcpu_svm *svm)
+{
+ u8 vector;
+ int type;
+ u32 exitintinfo = svm->vmcb->control.exit_int_info;
+ unsigned int3_injected = svm->int3_injected;
+
+ svm->int3_injected = 0;
+
+ /*
+ * If we've made progress since setting HF_IRET_MASK, we've
+ * executed an IRET and can allow NMI injection.
+ */
+ if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
+ && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
+ svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ }
+
+ svm->vcpu.arch.nmi_injected = false;
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
+
+ if (!(exitintinfo & SVM_EXITINTINFO_VALID))
+ return;
+
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+
+ vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
+ type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
+
+ switch (type) {
+ case SVM_EXITINTINFO_TYPE_NMI:
+ svm->vcpu.arch.nmi_injected = true;
+ break;
+ case SVM_EXITINTINFO_TYPE_EXEPT:
+ /*
+ * In case of software exceptions, do not reinject the vector,
+ * but re-execute the instruction instead. Rewind RIP first
+ * if we emulated INT3 before.
+ */
+ if (kvm_exception_is_soft(vector)) {
+ if (vector == BP_VECTOR && int3_injected &&
+ kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
+ kvm_rip_write(&svm->vcpu,
+ kvm_rip_read(&svm->vcpu) -
+ int3_injected);
+ break;
+ }
+ if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
+ u32 err = svm->vmcb->control.exit_int_info_err;
+ kvm_requeue_exception_e(&svm->vcpu, vector, err);
+
+ } else
+ kvm_requeue_exception(&svm->vcpu, vector);
+ break;
+ case SVM_EXITINTINFO_TYPE_INTR:
+ kvm_queue_interrupt(&svm->vcpu, vector, false);
+ break;
+ default:
+ break;
+ }
+}
+
+static void svm_cancel_injection(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+
+ control->exit_int_info = control->event_inj;
+ control->exit_int_info_err = control->event_inj_err;
+ control->event_inj = 0;
+ svm_complete_interrupts(svm);
+}
+
+static void svm_vcpu_run(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
+ /*
+ * A vmexit emulation is required before the vcpu can be executed
+ * again.
+ */
+ if (unlikely(svm->nested.exit_required))
+ return;
+
+ /*
+ * Disable singlestep if we're injecting an interrupt/exception.
+ * We don't want our modified rflags to be pushed on the stack where
+ * we might not be able to easily reset them if we disabled NMI
+ * singlestep later.
+ */
+ if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
+ /*
+ * Event injection happens before external interrupts cause a
+ * vmexit and interrupts are disabled here, so smp_send_reschedule
+ * is enough to force an immediate vmexit.
+ */
+ disable_nmi_singlestep(svm);
+ smp_send_reschedule(vcpu->cpu);
+ }
+
+ pre_svm_run(svm);
+
+ sync_lapic_to_cr8(vcpu);
+
+ svm->vmcb->save.cr2 = vcpu->arch.cr2;
+
+ clgi();
+ kvm_load_guest_xcr0(vcpu);
+
+ /*
+ * If this vCPU has touched SPEC_CTRL, restore the guest's value if
+ * it's non-zero. Since vmentry is serialising on affected CPUs, there
+ * is no need to worry about the conditional branch over the wrmsr
+ * being speculatively taken.
+ */
+ x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
+
+ local_irq_enable();
+
+ asm volatile (
+ "push %%" _ASM_BP "; \n\t"
+ "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
+ "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t"
+ "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t"
+ "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t"
+ "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t"
+ "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t"
+#ifdef CONFIG_X86_64
+ "mov %c[r8](%[svm]), %%r8 \n\t"
+ "mov %c[r9](%[svm]), %%r9 \n\t"
+ "mov %c[r10](%[svm]), %%r10 \n\t"
+ "mov %c[r11](%[svm]), %%r11 \n\t"
+ "mov %c[r12](%[svm]), %%r12 \n\t"
+ "mov %c[r13](%[svm]), %%r13 \n\t"
+ "mov %c[r14](%[svm]), %%r14 \n\t"
+ "mov %c[r15](%[svm]), %%r15 \n\t"
+#endif
+
+ /* Enter guest mode */
+ "push %%" _ASM_AX " \n\t"
+ "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"
+ __ex(SVM_VMLOAD) "\n\t"
+ __ex(SVM_VMRUN) "\n\t"
+ __ex(SVM_VMSAVE) "\n\t"
+ "pop %%" _ASM_AX " \n\t"
+
+ /* Save guest registers, load host registers */
+ "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t"
+ "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t"
+ "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t"
+ "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t"
+ "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t"
+ "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t"
+#ifdef CONFIG_X86_64
+ "mov %%r8, %c[r8](%[svm]) \n\t"
+ "mov %%r9, %c[r9](%[svm]) \n\t"
+ "mov %%r10, %c[r10](%[svm]) \n\t"
+ "mov %%r11, %c[r11](%[svm]) \n\t"
+ "mov %%r12, %c[r12](%[svm]) \n\t"
+ "mov %%r13, %c[r13](%[svm]) \n\t"
+ "mov %%r14, %c[r14](%[svm]) \n\t"
+ "mov %%r15, %c[r15](%[svm]) \n\t"
+#endif
+ /*
+ * Clear host registers marked as clobbered to prevent
+ * speculative use.
+ */
+ "xor %%" _ASM_BX ", %%" _ASM_BX " \n\t"
+ "xor %%" _ASM_CX ", %%" _ASM_CX " \n\t"
+ "xor %%" _ASM_DX ", %%" _ASM_DX " \n\t"
+ "xor %%" _ASM_SI ", %%" _ASM_SI " \n\t"
+ "xor %%" _ASM_DI ", %%" _ASM_DI " \n\t"
+#ifdef CONFIG_X86_64
+ "xor %%r8, %%r8 \n\t"
+ "xor %%r9, %%r9 \n\t"
+ "xor %%r10, %%r10 \n\t"
+ "xor %%r11, %%r11 \n\t"
+ "xor %%r12, %%r12 \n\t"
+ "xor %%r13, %%r13 \n\t"
+ "xor %%r14, %%r14 \n\t"
+ "xor %%r15, %%r15 \n\t"
+#endif
+ "pop %%" _ASM_BP
+ :
+ : [svm]"a"(svm),
+ [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
+ [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
+ [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
+ [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
+ [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
+ [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
+ [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
+#ifdef CONFIG_X86_64
+ , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
+ [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
+ [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
+ [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
+ [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
+ [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
+ [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
+ [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
+#endif
+ : "cc", "memory"
+#ifdef CONFIG_X86_64
+ , "rbx", "rcx", "rdx", "rsi", "rdi"
+ , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
+#else
+ , "ebx", "ecx", "edx", "esi", "edi"
+#endif
+ );
+
+ /* Eliminate branch target predictions from guest mode */
+ vmexit_fill_RSB();
+
+#ifdef CONFIG_X86_64
+ wrmsrl(MSR_GS_BASE, svm->host.gs_base);
+#else
+ loadsegment(fs, svm->host.fs);
+#ifndef CONFIG_X86_32_LAZY_GS
+ loadsegment(gs, svm->host.gs);
+#endif
+#endif
+
+ /*
+ * We do not use IBRS in the kernel. If this vCPU has used the
+ * SPEC_CTRL MSR it may have left it on; save the value and
+ * turn it off. This is much more efficient than blindly adding
+ * it to the atomic save/restore list. Especially as the former
+ * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
+ *
+ * For non-nested case:
+ * If the L01 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ *
+ * For nested case:
+ * If the L02 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ */
+ if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
+ svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
+
+ reload_tss(vcpu);
+
+ local_irq_disable();
+
+ x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
+
+ vcpu->arch.cr2 = svm->vmcb->save.cr2;
+ vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+ vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+ vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
+
+ if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
+ kvm_before_interrupt(&svm->vcpu);
+
+ kvm_put_guest_xcr0(vcpu);
+ stgi();
+
+ /* Any pending NMI will happen here */
+
+ if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
+ kvm_after_interrupt(&svm->vcpu);
+
+ sync_cr8_to_lapic(vcpu);
+
+ svm->next_rip = 0;
+
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
+ /* if exit due to PF check for async PF */
+ if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
+ svm->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
+
+ if (npt_enabled) {
+ vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
+ vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
+ }
+
+ /*
+ * We need to handle MC intercepts here before the vcpu has a chance to
+ * change the physical cpu
+ */
+ if (unlikely(svm->vmcb->control.exit_code ==
+ SVM_EXIT_EXCP_BASE + MC_VECTOR))
+ svm_handle_mce(svm);
+
+ mark_all_clean(svm->vmcb);
+}
+STACK_FRAME_NON_STANDARD(svm_vcpu_run);
+
+static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.cr3 = __sme_set(root);
+ mark_dirty(svm->vmcb, VMCB_CR);
+}
+
+static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.nested_cr3 = __sme_set(root);
+ mark_dirty(svm->vmcb, VMCB_NPT);
+
+ /* Also sync guest cr3 here in case we live migrate */
+ svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
+ mark_dirty(svm->vmcb, VMCB_CR);
+}
+
+static int is_disabled(void)
+{
+ u64 vm_cr;
+
+ rdmsrl(MSR_VM_CR, vm_cr);
+ if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
+ return 1;
+
+ return 0;
+}
+
+static void
+svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+{
+ /*
+ * Patch in the VMMCALL instruction:
+ */
+ hypercall[0] = 0x0f;
+ hypercall[1] = 0x01;
+ hypercall[2] = 0xd9;
+}
+
+static void svm_check_processor_compat(void *rtn)
+{
+ *(int *)rtn = 0;
+}
+
+static bool svm_cpu_has_accelerated_tpr(void)
+{
+ return false;
+}
+
+static bool svm_has_emulated_msr(int index)
+{
+ switch (index) {
+ case MSR_IA32_MCG_EXT_CTL:
+ return false;
+ default:
+ break;
+ }
+
+ return true;
+}
+
+static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+{
+ return 0;
+}
+
+static void svm_cpuid_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /* Update nrips enabled cache */
+ svm->nrips_enabled = !!guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
+
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC);
+}
+
+static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
+{
+ switch (func) {
+ case 0x1:
+ if (avic)
+ entry->ecx &= ~bit(X86_FEATURE_X2APIC);
+ break;
+ case 0x80000001:
+ if (nested)
+ entry->ecx |= (1 << 2); /* Set SVM bit */
+ break;
+ case 0x8000000A:
+ entry->eax = 1; /* SVM revision 1 */
+ entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
+ ASID emulation to nested SVM */
+ entry->ecx = 0; /* Reserved */
+ entry->edx = 0; /* Per default do not support any
+ additional features */
+
+ /* Support next_rip if host supports it */
+ if (boot_cpu_has(X86_FEATURE_NRIPS))
+ entry->edx |= SVM_FEATURE_NRIP;
+
+ /* Support NPT for the guest if enabled */
+ if (npt_enabled)
+ entry->edx |= SVM_FEATURE_NPT;
+
+ break;
+ case 0x8000001F:
+ /* Support memory encryption cpuid if host supports it */
+ if (boot_cpu_has(X86_FEATURE_SEV))
+ cpuid(0x8000001f, &entry->eax, &entry->ebx,
+ &entry->ecx, &entry->edx);
+
+ }
+}
+
+static int svm_get_lpage_level(void)
+{
+ return PT_PDPE_LEVEL;
+}
+
+static bool svm_rdtscp_supported(void)
+{
+ return boot_cpu_has(X86_FEATURE_RDTSCP);
+}
+
+static bool svm_invpcid_supported(void)
+{
+ return false;
+}
+
+static bool svm_mpx_supported(void)
+{
+ return false;
+}
+
+static bool svm_xsaves_supported(void)
+{
+ return false;
+}
+
+static bool svm_umip_emulated(void)
+{
+ return false;
+}
+
+static bool svm_has_wbinvd_exit(void)
+{
+ return true;
+}
+
+#define PRE_EX(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_PRE_EXCEPT, }
+#define POST_EX(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_POST_EXCEPT, }
+#define POST_MEM(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_POST_MEMACCESS, }
+
+static const struct __x86_intercept {
+ u32 exit_code;
+ enum x86_intercept_stage stage;
+} x86_intercept_map[] = {
+ [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0),
+ [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0),
+ [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0),
+ [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0),
+ [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ),
+ [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ),
+ [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE),
+ [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE),
+ [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ),
+ [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ),
+ [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE),
+ [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE),
+ [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN),
+ [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL),
+ [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD),
+ [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE),
+ [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI),
+ [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI),
+ [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT),
+ [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA),
+ [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP),
+ [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR),
+ [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT),
+ [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG),
+ [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD),
+ [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD),
+ [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR),
+ [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC),
+ [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR),
+ [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC),
+ [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID),
+ [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM),
+ [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE),
+ [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF),
+ [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF),
+ [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT),
+ [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET),
+ [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP),
+ [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT),
+ [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO),
+};
+
+#undef PRE_EX
+#undef POST_EX
+#undef POST_MEM
+
+static int svm_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int vmexit, ret = X86EMUL_CONTINUE;
+ struct __x86_intercept icpt_info;
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
+ goto out;
+
+ icpt_info = x86_intercept_map[info->intercept];
+
+ if (stage != icpt_info.stage)
+ goto out;
+
+ switch (icpt_info.exit_code) {
+ case SVM_EXIT_READ_CR0:
+ if (info->intercept == x86_intercept_cr_read)
+ icpt_info.exit_code += info->modrm_reg;
+ break;
+ case SVM_EXIT_WRITE_CR0: {
+ unsigned long cr0, val;
+ u64 intercept;
+
+ if (info->intercept == x86_intercept_cr_write)
+ icpt_info.exit_code += info->modrm_reg;
+
+ if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
+ info->intercept == x86_intercept_clts)
+ break;
+
+ intercept = svm->nested.intercept;
+
+ if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
+ break;
+
+ cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
+ val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
+
+ if (info->intercept == x86_intercept_lmsw) {
+ cr0 &= 0xfUL;
+ val &= 0xfUL;
+ /* lmsw can't clear PE - catch this here */
+ if (cr0 & X86_CR0_PE)
+ val |= X86_CR0_PE;
+ }
+
+ if (cr0 ^ val)
+ icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+
+ break;
+ }
+ case SVM_EXIT_READ_DR0:
+ case SVM_EXIT_WRITE_DR0:
+ icpt_info.exit_code += info->modrm_reg;
+ break;
+ case SVM_EXIT_MSR:
+ if (info->intercept == x86_intercept_wrmsr)
+ vmcb->control.exit_info_1 = 1;
+ else
+ vmcb->control.exit_info_1 = 0;
+ break;
+ case SVM_EXIT_PAUSE:
+ /*
+ * We get this for NOP only, but pause
+ * is rep not, check this here
+ */
+ if (info->rep_prefix != REPE_PREFIX)
+ goto out;
+ break;
+ case SVM_EXIT_IOIO: {
+ u64 exit_info;
+ u32 bytes;
+
+ if (info->intercept == x86_intercept_in ||
+ info->intercept == x86_intercept_ins) {
+ exit_info = ((info->src_val & 0xffff) << 16) |
+ SVM_IOIO_TYPE_MASK;
+ bytes = info->dst_bytes;
+ } else {
+ exit_info = (info->dst_val & 0xffff) << 16;
+ bytes = info->src_bytes;
+ }
+
+ if (info->intercept == x86_intercept_outs ||
+ info->intercept == x86_intercept_ins)
+ exit_info |= SVM_IOIO_STR_MASK;
+
+ if (info->rep_prefix)
+ exit_info |= SVM_IOIO_REP_MASK;
+
+ bytes = min(bytes, 4u);
+
+ exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
+
+ exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
+
+ vmcb->control.exit_info_1 = exit_info;
+ vmcb->control.exit_info_2 = info->next_rip;
+
+ break;
+ }
+ default:
+ break;
+ }
+
+ /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
+ if (static_cpu_has(X86_FEATURE_NRIPS))
+ vmcb->control.next_rip = info->next_rip;
+ vmcb->control.exit_code = icpt_info.exit_code;
+ vmexit = nested_svm_exit_handled(svm);
+
+ ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
+ : X86EMUL_CONTINUE;
+
+out:
+ return ret;
+}
+
+static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
+{
+ local_irq_enable();
+ /*
+ * We must have an instruction with interrupts enabled, so
+ * the timer interrupt isn't delayed by the interrupt shadow.
+ */
+ asm("nop");
+ local_irq_disable();
+}
+
+static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+ if (pause_filter_thresh)
+ shrink_ple_window(vcpu);
+}
+
+static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
+{
+ if (avic_handle_apic_id_update(vcpu) != 0)
+ return;
+ if (avic_handle_dfr_update(vcpu) != 0)
+ return;
+ avic_handle_ldr_update(vcpu);
+}
+
+static void svm_setup_mce(struct kvm_vcpu *vcpu)
+{
+ /* [63:9] are reserved. */
+ vcpu->arch.mcg_cap &= 0x1ff;
+}
+
+static int svm_smi_allowed(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /* Per APM Vol.2 15.22.2 "Response to SMI" */
+ if (!gif_set(svm))
+ return 0;
+
+ if (is_guest_mode(&svm->vcpu) &&
+ svm->nested.intercept & (1ULL << INTERCEPT_SMI)) {
+ /* TODO: Might need to set exit_info_1 and exit_info_2 here */
+ svm->vmcb->control.exit_code = SVM_EXIT_SMI;
+ svm->nested.exit_required = true;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ if (is_guest_mode(vcpu)) {
+ /* FED8h - SVM Guest */
+ put_smstate(u64, smstate, 0x7ed8, 1);
+ /* FEE0h - SVM Guest VMCB Physical Address */
+ put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb);
+
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
+ ret = nested_svm_vmexit(svm);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *nested_vmcb;
+ struct page *page;
+ struct {
+ u64 guest;
+ u64 vmcb;
+ } svm_state_save;
+ int ret;
+
+ ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfed8, &svm_state_save,
+ sizeof(svm_state_save));
+ if (ret)
+ return ret;
+
+ if (svm_state_save.guest) {
+ vcpu->arch.hflags &= ~HF_SMM_MASK;
+ nested_vmcb = nested_svm_map(svm, svm_state_save.vmcb, &page);
+ if (nested_vmcb)
+ enter_svm_guest_mode(svm, svm_state_save.vmcb, nested_vmcb, page);
+ else
+ ret = 1;
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ }
+ return ret;
+}
+
+static int enable_smi_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!gif_set(svm)) {
+ if (vgif_enabled(svm))
+ set_intercept(svm, INTERCEPT_STGI);
+ /* STGI will cause a vm exit */
+ return 1;
+ }
+ return 0;
+}
+
+static int sev_asid_new(void)
+{
+ int pos;
+
+ /*
+ * SEV-enabled guest must use asid from min_sev_asid to max_sev_asid.
+ */
+ pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1);
+ if (pos >= max_sev_asid)
+ return -EBUSY;
+
+ set_bit(pos, sev_asid_bitmap);
+ return pos + 1;
+}
+
+static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ int asid, ret;
+
+ ret = -EBUSY;
+ if (unlikely(sev->active))
+ return ret;
+
+ asid = sev_asid_new();
+ if (asid < 0)
+ return ret;
+
+ ret = sev_platform_init(&argp->error);
+ if (ret)
+ goto e_free;
+
+ sev->active = true;
+ sev->asid = asid;
+ INIT_LIST_HEAD(&sev->regions_list);
+
+ return 0;
+
+e_free:
+ __sev_asid_free(asid);
+ return ret;
+}
+
+static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
+{
+ struct sev_data_activate *data;
+ int asid = sev_get_asid(kvm);
+ int ret;
+
+ wbinvd_on_all_cpus();
+
+ ret = sev_guest_df_flush(error);
+ if (ret)
+ return ret;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ /* activate ASID on the given handle */
+ data->handle = handle;
+ data->asid = asid;
+ ret = sev_guest_activate(data, error);
+ kfree(data);
+
+ return ret;
+}
+
+static int __sev_issue_cmd(int fd, int id, void *data, int *error)
+{
+ struct fd f;
+ int ret;
+
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ ret = sev_issue_cmd_external_user(f.file, id, data, error);
+
+ fdput(f);
+ return ret;
+}
+
+static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return __sev_issue_cmd(sev->fd, id, data, error);
+}
+
+static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_launch_start *start;
+ struct kvm_sev_launch_start params;
+ void *dh_blob, *session_blob;
+ int *error = &argp->error;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ start = kzalloc(sizeof(*start), GFP_KERNEL);
+ if (!start)
+ return -ENOMEM;
+
+ dh_blob = NULL;
+ if (params.dh_uaddr) {
+ dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
+ if (IS_ERR(dh_blob)) {
+ ret = PTR_ERR(dh_blob);
+ goto e_free;
+ }
+
+ start->dh_cert_address = __sme_set(__pa(dh_blob));
+ start->dh_cert_len = params.dh_len;
+ }
+
+ session_blob = NULL;
+ if (params.session_uaddr) {
+ session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len);
+ if (IS_ERR(session_blob)) {
+ ret = PTR_ERR(session_blob);
+ goto e_free_dh;
+ }
+
+ start->session_address = __sme_set(__pa(session_blob));
+ start->session_len = params.session_len;
+ }
+
+ start->handle = params.handle;
+ start->policy = params.policy;
+
+ /* create memory encryption context */
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error);
+ if (ret)
+ goto e_free_session;
+
+ /* Bind ASID to this guest */
+ ret = sev_bind_asid(kvm, start->handle, error);
+ if (ret) {
+ sev_decommission(start->handle);
+ goto e_free_session;
+ }
+
+ /* return handle to userspace */
+ params.handle = start->handle;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
+ sev_unbind_asid(kvm, start->handle);
+ ret = -EFAULT;
+ goto e_free_session;
+ }
+
+ sev->handle = start->handle;
+ sev->fd = argp->sev_fd;
+
+e_free_session:
+ kfree(session_blob);
+e_free_dh:
+ kfree(dh_blob);
+e_free:
+ kfree(start);
+ return ret;
+}
+
+static unsigned long get_num_contig_pages(unsigned long idx,
+ struct page **inpages, unsigned long npages)
+{
+ unsigned long paddr, next_paddr;
+ unsigned long i = idx + 1, pages = 1;
+
+ /* find the number of contiguous pages starting from idx */
+ paddr = __sme_page_pa(inpages[idx]);
+ while (i < npages) {
+ next_paddr = __sme_page_pa(inpages[i++]);
+ if ((paddr + PAGE_SIZE) == next_paddr) {
+ pages++;
+ paddr = next_paddr;
+ continue;
+ }
+ break;
+ }
+
+ return pages;
+}
+
+static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_launch_update_data params;
+ struct sev_data_launch_update_data *data;
+ struct page **inpages;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ vaddr = params.uaddr;
+ size = params.len;
+ vaddr_end = vaddr + size;
+
+ /* Lock the user memory. */
+ inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
+ if (!inpages) {
+ ret = -ENOMEM;
+ goto e_free;
+ }
+
+ /*
+ * The LAUNCH_UPDATE command will perform in-place encryption of the
+ * memory content (i.e it will write the same memory region with C=1).
+ * It's possible that the cache may contain the data with C=0, i.e.,
+ * unencrypted so invalidate it first.
+ */
+ sev_clflush_pages(inpages, npages);
+
+ for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
+ int offset, len;
+
+ /*
+ * If the user buffer is not page-aligned, calculate the offset
+ * within the page.
+ */
+ offset = vaddr & (PAGE_SIZE - 1);
+
+ /* Calculate the number of pages that can be encrypted in one go. */
+ pages = get_num_contig_pages(i, inpages, npages);
+
+ len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
+
+ data->handle = sev->handle;
+ data->len = len;
+ data->address = __sme_page_pa(inpages[i]) + offset;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error);
+ if (ret)
+ goto e_unpin;
+
+ size -= len;
+ next_vaddr = vaddr + len;
+ }
+
+e_unpin:
+ /* content of memory is updated, mark pages dirty */
+ for (i = 0; i < npages; i++) {
+ set_page_dirty_lock(inpages[i]);
+ mark_page_accessed(inpages[i]);
+ }
+ /* unlock the user pages */
+ sev_unpin_memory(kvm, inpages, npages);
+e_free:
+ kfree(data);
+ return ret;
+}
+
+static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ void __user *measure = (void __user *)(uintptr_t)argp->data;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_launch_measure *data;
+ struct kvm_sev_launch_measure params;
+ void __user *p = NULL;
+ void *blob = NULL;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, measure, sizeof(params)))
+ return -EFAULT;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ /* User wants to query the blob length */
+ if (!params.len)
+ goto cmd;
+
+ p = (void __user *)(uintptr_t)params.uaddr;
+ if (p) {
+ if (params.len > SEV_FW_BLOB_MAX_SIZE) {
+ ret = -EINVAL;
+ goto e_free;
+ }
+
+ ret = -ENOMEM;
+ blob = kmalloc(params.len, GFP_KERNEL);
+ if (!blob)
+ goto e_free;
+
+ data->address = __psp_pa(blob);
+ data->len = params.len;
+ }
+
+cmd:
+ data->handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error);
+
+ /*
+ * If we query the session length, FW responded with expected data.
+ */
+ if (!params.len)
+ goto done;
+
+ if (ret)
+ goto e_free_blob;
+
+ if (blob) {
+ if (copy_to_user(p, blob, params.len))
+ ret = -EFAULT;
+ }
+
+done:
+ params.len = data->len;
+ if (copy_to_user(measure, &params, sizeof(params)))
+ ret = -EFAULT;
+e_free_blob:
+ kfree(blob);
+e_free:
+ kfree(data);
+ return ret;
+}
+
+static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_launch_finish *data;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ data->handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error);
+
+ kfree(data);
+ return ret;
+}
+
+static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_guest_status params;
+ struct sev_data_guest_status *data;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ data->handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error);
+ if (ret)
+ goto e_free;
+
+ params.policy = data->policy;
+ params.state = data->state;
+ params.handle = data->handle;
+
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
+ ret = -EFAULT;
+e_free:
+ kfree(data);
+ return ret;
+}
+
+static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
+ unsigned long dst, int size,
+ int *error, bool enc)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_dbg *data;
+ int ret;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ data->handle = sev->handle;
+ data->dst_addr = dst;
+ data->src_addr = src;
+ data->len = size;
+
+ ret = sev_issue_cmd(kvm,
+ enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
+ data, error);
+ kfree(data);
+ return ret;
+}
+
+static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
+ unsigned long dst_paddr, int sz, int *err)
+{
+ int offset;
+
+ /*
+ * Its safe to read more than we are asked, caller should ensure that
+ * destination has enough space.
+ */
+ src_paddr = round_down(src_paddr, 16);
+ offset = src_paddr & 15;
+ sz = round_up(sz + offset, 16);
+
+ return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
+}
+
+static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
+ unsigned long __user dst_uaddr,
+ unsigned long dst_paddr,
+ int size, int *err)
+{
+ struct page *tpage = NULL;
+ int ret, offset;
+
+ /* if inputs are not 16-byte then use intermediate buffer */
+ if (!IS_ALIGNED(dst_paddr, 16) ||
+ !IS_ALIGNED(paddr, 16) ||
+ !IS_ALIGNED(size, 16)) {
+ tpage = (void *)alloc_page(GFP_KERNEL);
+ if (!tpage)
+ return -ENOMEM;
+
+ dst_paddr = __sme_page_pa(tpage);
+ }
+
+ ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err);
+ if (ret)
+ goto e_free;
+
+ if (tpage) {
+ offset = paddr & 15;
+ if (copy_to_user((void __user *)(uintptr_t)dst_uaddr,
+ page_address(tpage) + offset, size))
+ ret = -EFAULT;
+ }
+
+e_free:
+ if (tpage)
+ __free_page(tpage);
+
+ return ret;
+}
+
+static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
+ unsigned long __user vaddr,
+ unsigned long dst_paddr,
+ unsigned long __user dst_vaddr,
+ int size, int *error)
+{
+ struct page *src_tpage = NULL;
+ struct page *dst_tpage = NULL;
+ int ret, len = size;
+
+ /* If source buffer is not aligned then use an intermediate buffer */
+ if (!IS_ALIGNED(vaddr, 16)) {
+ src_tpage = alloc_page(GFP_KERNEL);
+ if (!src_tpage)
+ return -ENOMEM;
+
+ if (copy_from_user(page_address(src_tpage),
+ (void __user *)(uintptr_t)vaddr, size)) {
+ __free_page(src_tpage);
+ return -EFAULT;
+ }
+
+ paddr = __sme_page_pa(src_tpage);
+ }
+
+ /*
+ * If destination buffer or length is not aligned then do read-modify-write:
+ * - decrypt destination in an intermediate buffer
+ * - copy the source buffer in an intermediate buffer
+ * - use the intermediate buffer as source buffer
+ */
+ if (!IS_ALIGNED(dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
+ int dst_offset;
+
+ dst_tpage = alloc_page(GFP_KERNEL);
+ if (!dst_tpage) {
+ ret = -ENOMEM;
+ goto e_free;
+ }
+
+ ret = __sev_dbg_decrypt(kvm, dst_paddr,
+ __sme_page_pa(dst_tpage), size, error);
+ if (ret)
+ goto e_free;
+
+ /*
+ * If source is kernel buffer then use memcpy() otherwise
+ * copy_from_user().
+ */
+ dst_offset = dst_paddr & 15;
+
+ if (src_tpage)
+ memcpy(page_address(dst_tpage) + dst_offset,
+ page_address(src_tpage), size);
+ else {
+ if (copy_from_user(page_address(dst_tpage) + dst_offset,
+ (void __user *)(uintptr_t)vaddr, size)) {
+ ret = -EFAULT;
+ goto e_free;
+ }
+ }
+
+ paddr = __sme_page_pa(dst_tpage);
+ dst_paddr = round_down(dst_paddr, 16);
+ len = round_up(size, 16);
+ }
+
+ ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true);
+
+e_free:
+ if (src_tpage)
+ __free_page(src_tpage);
+ if (dst_tpage)
+ __free_page(dst_tpage);
+ return ret;
+}
+
+static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
+{
+ unsigned long vaddr, vaddr_end, next_vaddr;
+ unsigned long dst_vaddr;
+ struct page **src_p, **dst_p;
+ struct kvm_sev_dbg debug;
+ unsigned long n;
+ unsigned int size;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug)))
+ return -EFAULT;
+
+ if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
+ return -EINVAL;
+ if (!debug.dst_uaddr)
+ return -EINVAL;
+
+ vaddr = debug.src_uaddr;
+ size = debug.len;
+ vaddr_end = vaddr + size;
+ dst_vaddr = debug.dst_uaddr;
+
+ for (; vaddr < vaddr_end; vaddr = next_vaddr) {
+ int len, s_off, d_off;
+
+ /* lock userspace source and destination page */
+ src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0);
+ if (!src_p)
+ return -EFAULT;
+
+ dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1);
+ if (!dst_p) {
+ sev_unpin_memory(kvm, src_p, n);
+ return -EFAULT;
+ }
+
+ /*
+ * The DBG_{DE,EN}CRYPT commands will perform {dec,en}cryption of the
+ * memory content (i.e it will write the same memory region with C=1).
+ * It's possible that the cache may contain the data with C=0, i.e.,
+ * unencrypted so invalidate it first.
+ */
+ sev_clflush_pages(src_p, 1);
+ sev_clflush_pages(dst_p, 1);
+
+ /*
+ * Since user buffer may not be page aligned, calculate the
+ * offset within the page.
+ */
+ s_off = vaddr & ~PAGE_MASK;
+ d_off = dst_vaddr & ~PAGE_MASK;
+ len = min_t(size_t, (PAGE_SIZE - s_off), size);
+
+ if (dec)
+ ret = __sev_dbg_decrypt_user(kvm,
+ __sme_page_pa(src_p[0]) + s_off,
+ dst_vaddr,
+ __sme_page_pa(dst_p[0]) + d_off,
+ len, &argp->error);
+ else
+ ret = __sev_dbg_encrypt_user(kvm,
+ __sme_page_pa(src_p[0]) + s_off,
+ vaddr,
+ __sme_page_pa(dst_p[0]) + d_off,
+ dst_vaddr,
+ len, &argp->error);
+
+ sev_unpin_memory(kvm, src_p, n);
+ sev_unpin_memory(kvm, dst_p, n);
+
+ if (ret)
+ goto err;
+
+ next_vaddr = vaddr + len;
+ dst_vaddr = dst_vaddr + len;
+ size -= len;
+ }
+err:
+ return ret;
+}
+
+static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_launch_secret *data;
+ struct kvm_sev_launch_secret params;
+ struct page **pages;
+ void *blob, *hdr;
+ unsigned long n;
+ int ret, offset;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
+ if (!pages)
+ return -ENOMEM;
+
+ /*
+ * The secret must be copied into contiguous memory region, lets verify
+ * that userspace memory pages are contiguous before we issue command.
+ */
+ if (get_num_contig_pages(0, pages, n) != n) {
+ ret = -EINVAL;
+ goto e_unpin_memory;
+ }
+
+ ret = -ENOMEM;
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ goto e_unpin_memory;
+
+ offset = params.guest_uaddr & (PAGE_SIZE - 1);
+ data->guest_address = __sme_page_pa(pages[0]) + offset;
+ data->guest_len = params.guest_len;
+
+ blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
+ if (IS_ERR(blob)) {
+ ret = PTR_ERR(blob);
+ goto e_free;
+ }
+
+ data->trans_address = __psp_pa(blob);
+ data->trans_len = params.trans_len;
+
+ hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
+ if (IS_ERR(hdr)) {
+ ret = PTR_ERR(hdr);
+ goto e_free_blob;
+ }
+ data->hdr_address = __psp_pa(hdr);
+ data->hdr_len = params.hdr_len;
+
+ data->handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error);
+
+ kfree(hdr);
+
+e_free_blob:
+ kfree(blob);
+e_free:
+ kfree(data);
+e_unpin_memory:
+ sev_unpin_memory(kvm, pages, n);
+ return ret;
+}
+
+static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_sev_cmd sev_cmd;
+ int r;
+
+ if (!svm_sev_enabled())
+ return -ENOTTY;
+
+ if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd)))
+ return -EFAULT;
+
+ mutex_lock(&kvm->lock);
+
+ switch (sev_cmd.id) {
+ case KVM_SEV_INIT:
+ r = sev_guest_init(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_START:
+ r = sev_launch_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_UPDATE_DATA:
+ r = sev_launch_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_MEASURE:
+ r = sev_launch_measure(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_FINISH:
+ r = sev_launch_finish(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_GUEST_STATUS:
+ r = sev_guest_status(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_DBG_DECRYPT:
+ r = sev_dbg_crypt(kvm, &sev_cmd, true);
+ break;
+ case KVM_SEV_DBG_ENCRYPT:
+ r = sev_dbg_crypt(kvm, &sev_cmd, false);
+ break;
+ case KVM_SEV_LAUNCH_SECRET:
+ r = sev_launch_secret(kvm, &sev_cmd);
+ break;
+ default:
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd)))
+ r = -EFAULT;
+
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+static int svm_register_enc_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct enc_region *region;
+ int ret = 0;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
+ return -EINVAL;
+
+ region = kzalloc(sizeof(*region), GFP_KERNEL);
+ if (!region)
+ return -ENOMEM;
+
+ mutex_lock(&kvm->lock);
+ region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
+ if (!region->pages) {
+ ret = -ENOMEM;
+ mutex_unlock(&kvm->lock);
+ goto e_free;
+ }
+
+ region->uaddr = range->addr;
+ region->size = range->size;
+
+ list_add_tail(&region->list, &sev->regions_list);
+ mutex_unlock(&kvm->lock);
+
+ /*
+ * The guest may change the memory encryption attribute from C=0 -> C=1
+ * or vice versa for this memory range. Lets make sure caches are
+ * flushed to ensure that guest data gets written into memory with
+ * correct C-bit.
+ */
+ sev_clflush_pages(region->pages, region->npages);
+
+ return ret;
+
+e_free:
+ kfree(region);
+ return ret;
+}
+
+static struct enc_region *
+find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct list_head *head = &sev->regions_list;
+ struct enc_region *i;
+
+ list_for_each_entry(i, head, list) {
+ if (i->uaddr == range->addr &&
+ i->size == range->size)
+ return i;
+ }
+
+ return NULL;
+}
+
+
+static int svm_unregister_enc_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
+{
+ struct enc_region *region;
+ int ret;
+
+ mutex_lock(&kvm->lock);
+
+ if (!sev_guest(kvm)) {
+ ret = -ENOTTY;
+ goto failed;
+ }
+
+ region = find_enc_region(kvm, range);
+ if (!region) {
+ ret = -EINVAL;
+ goto failed;
+ }
+
+ __unregister_enc_region_locked(kvm, region);
+
+ mutex_unlock(&kvm->lock);
+ return 0;
+
+failed:
+ mutex_unlock(&kvm->lock);
+ return ret;
+}
+
+static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
+ .cpu_has_kvm_support = has_svm,
+ .disabled_by_bios = is_disabled,
+ .hardware_setup = svm_hardware_setup,
+ .hardware_unsetup = svm_hardware_unsetup,
+ .check_processor_compatibility = svm_check_processor_compat,
+ .hardware_enable = svm_hardware_enable,
+ .hardware_disable = svm_hardware_disable,
+ .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
+ .has_emulated_msr = svm_has_emulated_msr,
+
+ .vcpu_create = svm_create_vcpu,
+ .vcpu_free = svm_free_vcpu,
+ .vcpu_reset = svm_vcpu_reset,
+
+ .vm_alloc = svm_vm_alloc,
+ .vm_free = svm_vm_free,
+ .vm_init = avic_vm_init,
+ .vm_destroy = svm_vm_destroy,
+
+ .prepare_guest_switch = svm_prepare_guest_switch,
+ .vcpu_load = svm_vcpu_load,
+ .vcpu_put = svm_vcpu_put,
+ .vcpu_blocking = svm_vcpu_blocking,
+ .vcpu_unblocking = svm_vcpu_unblocking,
+
+ .update_bp_intercept = update_bp_intercept,
+ .get_msr_feature = svm_get_msr_feature,
+ .get_msr = svm_get_msr,
+ .set_msr = svm_set_msr,
+ .get_segment_base = svm_get_segment_base,
+ .get_segment = svm_get_segment,
+ .set_segment = svm_set_segment,
+ .get_cpl = svm_get_cpl,
+ .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
+ .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
+ .decache_cr3 = svm_decache_cr3,
+ .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
+ .set_cr0 = svm_set_cr0,
+ .set_cr3 = svm_set_cr3,
+ .set_cr4 = svm_set_cr4,
+ .set_efer = svm_set_efer,
+ .get_idt = svm_get_idt,
+ .set_idt = svm_set_idt,
+ .get_gdt = svm_get_gdt,
+ .set_gdt = svm_set_gdt,
+ .get_dr6 = svm_get_dr6,
+ .set_dr6 = svm_set_dr6,
+ .set_dr7 = svm_set_dr7,
+ .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
+ .cache_reg = svm_cache_reg,
+ .get_rflags = svm_get_rflags,
+ .set_rflags = svm_set_rflags,
+
+ .tlb_flush = svm_flush_tlb,
+ .tlb_flush_gva = svm_flush_tlb_gva,
+
+ .run = svm_vcpu_run,
+ .handle_exit = handle_exit,
+ .skip_emulated_instruction = skip_emulated_instruction,
+ .set_interrupt_shadow = svm_set_interrupt_shadow,
+ .get_interrupt_shadow = svm_get_interrupt_shadow,
+ .patch_hypercall = svm_patch_hypercall,
+ .set_irq = svm_set_irq,
+ .set_nmi = svm_inject_nmi,
+ .queue_exception = svm_queue_exception,
+ .cancel_injection = svm_cancel_injection,
+ .interrupt_allowed = svm_interrupt_allowed,
+ .nmi_allowed = svm_nmi_allowed,
+ .get_nmi_mask = svm_get_nmi_mask,
+ .set_nmi_mask = svm_set_nmi_mask,
+ .enable_nmi_window = enable_nmi_window,
+ .enable_irq_window = enable_irq_window,
+ .update_cr8_intercept = update_cr8_intercept,
+ .set_virtual_apic_mode = svm_set_virtual_apic_mode,
+ .get_enable_apicv = svm_get_enable_apicv,
+ .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
+ .load_eoi_exitmap = svm_load_eoi_exitmap,
+ .hwapic_irr_update = svm_hwapic_irr_update,
+ .hwapic_isr_update = svm_hwapic_isr_update,
+ .sync_pir_to_irr = kvm_lapic_find_highest_irr,
+ .apicv_post_state_restore = avic_post_state_restore,
+
+ .set_tss_addr = svm_set_tss_addr,
+ .set_identity_map_addr = svm_set_identity_map_addr,
+ .get_tdp_level = get_npt_level,
+ .get_mt_mask = svm_get_mt_mask,
+
+ .get_exit_info = svm_get_exit_info,
+
+ .get_lpage_level = svm_get_lpage_level,
+
+ .cpuid_update = svm_cpuid_update,
+
+ .rdtscp_supported = svm_rdtscp_supported,
+ .invpcid_supported = svm_invpcid_supported,
+ .mpx_supported = svm_mpx_supported,
+ .xsaves_supported = svm_xsaves_supported,
+ .umip_emulated = svm_umip_emulated,
+
+ .set_supported_cpuid = svm_set_supported_cpuid,
+
+ .has_wbinvd_exit = svm_has_wbinvd_exit,
+
+ .read_l1_tsc_offset = svm_read_l1_tsc_offset,
+ .write_l1_tsc_offset = svm_write_l1_tsc_offset,
+
+ .set_tdp_cr3 = set_tdp_cr3,
+
+ .check_intercept = svm_check_intercept,
+ .handle_external_intr = svm_handle_external_intr,
+
+ .request_immediate_exit = __kvm_request_immediate_exit,
+
+ .sched_in = svm_sched_in,
+
+ .pmu_ops = &amd_pmu_ops,
+ .deliver_posted_interrupt = svm_deliver_avic_intr,
+ .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt,
+ .update_pi_irte = svm_update_pi_irte,
+ .setup_mce = svm_setup_mce,
+
+ .smi_allowed = svm_smi_allowed,
+ .pre_enter_smm = svm_pre_enter_smm,
+ .pre_leave_smm = svm_pre_leave_smm,
+ .enable_smi_window = enable_smi_window,
+
+ .mem_enc_op = svm_mem_enc_op,
+ .mem_enc_reg_region = svm_register_enc_region,
+ .mem_enc_unreg_region = svm_unregister_enc_region,
+};
+
+static int __init svm_init(void)
+{
+ return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
+ __alignof__(struct vcpu_svm), THIS_MODULE);
+}
+
+static void __exit svm_exit(void)
+{
+ kvm_exit();
+}
+
+module_init(svm_init)
+module_exit(svm_exit)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
new file mode 100644
index 000000000..b3f219b7c
--- /dev/null
+++ b/arch/x86/kvm/trace.h
@@ -0,0 +1,1429 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_H
+
+#include <linux/tracepoint.h>
+#include <asm/vmx.h>
+#include <asm/svm.h>
+#include <asm/clocksource.h>
+#include <asm/pvclock-abi.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+
+/*
+ * Tracepoint for guest mode entry.
+ */
+TRACE_EVENT(kvm_entry,
+ TP_PROTO(unsigned int vcpu_id),
+ TP_ARGS(vcpu_id),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ ),
+
+ TP_printk("vcpu %u", __entry->vcpu_id)
+);
+
+/*
+ * Tracepoint for hypercall.
+ */
+TRACE_EVENT(kvm_hypercall,
+ TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3),
+ TP_ARGS(nr, a0, a1, a2, a3),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, nr )
+ __field( unsigned long, a0 )
+ __field( unsigned long, a1 )
+ __field( unsigned long, a2 )
+ __field( unsigned long, a3 )
+ ),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->a0 = a0;
+ __entry->a1 = a1;
+ __entry->a2 = a2;
+ __entry->a3 = a3;
+ ),
+
+ TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx",
+ __entry->nr, __entry->a0, __entry->a1, __entry->a2,
+ __entry->a3)
+);
+
+/*
+ * Tracepoint for hypercall.
+ */
+TRACE_EVENT(kvm_hv_hypercall,
+ TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx,
+ __u64 ingpa, __u64 outgpa),
+ TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
+
+ TP_STRUCT__entry(
+ __field( __u16, rep_cnt )
+ __field( __u16, rep_idx )
+ __field( __u64, ingpa )
+ __field( __u64, outgpa )
+ __field( __u16, code )
+ __field( bool, fast )
+ ),
+
+ TP_fast_assign(
+ __entry->rep_cnt = rep_cnt;
+ __entry->rep_idx = rep_idx;
+ __entry->ingpa = ingpa;
+ __entry->outgpa = outgpa;
+ __entry->code = code;
+ __entry->fast = fast;
+ ),
+
+ TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
+ __entry->code, __entry->fast ? "fast" : "slow",
+ __entry->rep_cnt, __entry->rep_idx, __entry->ingpa,
+ __entry->outgpa)
+);
+
+/*
+ * Tracepoint for PIO.
+ */
+
+#define KVM_PIO_IN 0
+#define KVM_PIO_OUT 1
+
+TRACE_EVENT(kvm_pio,
+ TP_PROTO(unsigned int rw, unsigned int port, unsigned int size,
+ unsigned int count, void *data),
+ TP_ARGS(rw, port, size, count, data),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, port )
+ __field( unsigned int, size )
+ __field( unsigned int, count )
+ __field( unsigned int, val )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->port = port;
+ __entry->size = size;
+ __entry->count = count;
+ if (size == 1)
+ __entry->val = *(unsigned char *)data;
+ else if (size == 2)
+ __entry->val = *(unsigned short *)data;
+ else
+ __entry->val = *(unsigned int *)data;
+ ),
+
+ TP_printk("pio_%s at 0x%x size %d count %d val 0x%x %s",
+ __entry->rw ? "write" : "read",
+ __entry->port, __entry->size, __entry->count, __entry->val,
+ __entry->count > 1 ? "(...)" : "")
+);
+
+/*
+ * Tracepoint for fast mmio.
+ */
+TRACE_EVENT(kvm_fast_mmio,
+ TP_PROTO(u64 gpa),
+ TP_ARGS(gpa),
+
+ TP_STRUCT__entry(
+ __field(u64, gpa)
+ ),
+
+ TP_fast_assign(
+ __entry->gpa = gpa;
+ ),
+
+ TP_printk("fast mmio at gpa 0x%llx", __entry->gpa)
+);
+
+/*
+ * Tracepoint for cpuid.
+ */
+TRACE_EVENT(kvm_cpuid,
+ TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx,
+ unsigned long rcx, unsigned long rdx, bool found),
+ TP_ARGS(function, rax, rbx, rcx, rdx, found),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, function )
+ __field( unsigned long, rax )
+ __field( unsigned long, rbx )
+ __field( unsigned long, rcx )
+ __field( unsigned long, rdx )
+ __field( bool, found )
+ ),
+
+ TP_fast_assign(
+ __entry->function = function;
+ __entry->rax = rax;
+ __entry->rbx = rbx;
+ __entry->rcx = rcx;
+ __entry->rdx = rdx;
+ __entry->found = found;
+ ),
+
+ TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx, cpuid entry %s",
+ __entry->function, __entry->rax,
+ __entry->rbx, __entry->rcx, __entry->rdx,
+ __entry->found ? "found" : "not found")
+);
+
+#define AREG(x) { APIC_##x, "APIC_" #x }
+
+#define kvm_trace_symbol_apic \
+ AREG(ID), AREG(LVR), AREG(TASKPRI), AREG(ARBPRI), AREG(PROCPRI), \
+ AREG(EOI), AREG(RRR), AREG(LDR), AREG(DFR), AREG(SPIV), AREG(ISR), \
+ AREG(TMR), AREG(IRR), AREG(ESR), AREG(ICR), AREG(ICR2), AREG(LVTT), \
+ AREG(LVTTHMR), AREG(LVTPC), AREG(LVT0), AREG(LVT1), AREG(LVTERR), \
+ AREG(TMICT), AREG(TMCCT), AREG(TDCR), AREG(SELF_IPI), AREG(EFEAT), \
+ AREG(ECTRL)
+/*
+ * Tracepoint for apic access.
+ */
+TRACE_EVENT(kvm_apic,
+ TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val),
+ TP_ARGS(rw, reg, val),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, reg )
+ __field( unsigned int, val )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->reg = reg;
+ __entry->val = val;
+ ),
+
+ TP_printk("apic_%s %s = 0x%x",
+ __entry->rw ? "write" : "read",
+ __print_symbolic(__entry->reg, kvm_trace_symbol_apic),
+ __entry->val)
+);
+
+#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val)
+#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val)
+
+#define KVM_ISA_VMX 1
+#define KVM_ISA_SVM 2
+
+/*
+ * Tracepoint for kvm guest exit:
+ */
+TRACE_EVENT(kvm_exit,
+ TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa),
+ TP_ARGS(exit_reason, vcpu, isa),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, exit_reason )
+ __field( unsigned long, guest_rip )
+ __field( u32, isa )
+ __field( u64, info1 )
+ __field( u64, info2 )
+ ),
+
+ TP_fast_assign(
+ __entry->exit_reason = exit_reason;
+ __entry->guest_rip = kvm_rip_read(vcpu);
+ __entry->isa = isa;
+ kvm_x86_ops->get_exit_info(vcpu, &__entry->info1,
+ &__entry->info2);
+ ),
+
+ TP_printk("reason %s rip 0x%lx info %llx %llx",
+ (__entry->isa == KVM_ISA_VMX) ?
+ __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) :
+ __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS),
+ __entry->guest_rip, __entry->info1, __entry->info2)
+);
+
+/*
+ * Tracepoint for kvm interrupt injection:
+ */
+TRACE_EVENT(kvm_inj_virq,
+ TP_PROTO(unsigned int irq),
+ TP_ARGS(irq),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ ),
+
+ TP_printk("irq %u", __entry->irq)
+);
+
+#define EXS(x) { x##_VECTOR, "#" #x }
+
+#define kvm_trace_sym_exc \
+ EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \
+ EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \
+ EXS(MF), EXS(AC), EXS(MC)
+
+/*
+ * Tracepoint for kvm interrupt injection:
+ */
+TRACE_EVENT(kvm_inj_exception,
+ TP_PROTO(unsigned exception, bool has_error, unsigned error_code),
+ TP_ARGS(exception, has_error, error_code),
+
+ TP_STRUCT__entry(
+ __field( u8, exception )
+ __field( u8, has_error )
+ __field( u32, error_code )
+ ),
+
+ TP_fast_assign(
+ __entry->exception = exception;
+ __entry->has_error = has_error;
+ __entry->error_code = error_code;
+ ),
+
+ TP_printk("%s (0x%x)",
+ __print_symbolic(__entry->exception, kvm_trace_sym_exc),
+ /* FIXME: don't print error_code if not present */
+ __entry->has_error ? __entry->error_code : 0)
+);
+
+/*
+ * Tracepoint for page fault.
+ */
+TRACE_EVENT(kvm_page_fault,
+ TP_PROTO(unsigned long fault_address, unsigned int error_code),
+ TP_ARGS(fault_address, error_code),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, fault_address )
+ __field( unsigned int, error_code )
+ ),
+
+ TP_fast_assign(
+ __entry->fault_address = fault_address;
+ __entry->error_code = error_code;
+ ),
+
+ TP_printk("address %lx error_code %x",
+ __entry->fault_address, __entry->error_code)
+);
+
+/*
+ * Tracepoint for guest MSR access.
+ */
+TRACE_EVENT(kvm_msr,
+ TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception),
+ TP_ARGS(write, ecx, data, exception),
+
+ TP_STRUCT__entry(
+ __field( unsigned, write )
+ __field( u32, ecx )
+ __field( u64, data )
+ __field( u8, exception )
+ ),
+
+ TP_fast_assign(
+ __entry->write = write;
+ __entry->ecx = ecx;
+ __entry->data = data;
+ __entry->exception = exception;
+ ),
+
+ TP_printk("msr_%s %x = 0x%llx%s",
+ __entry->write ? "write" : "read",
+ __entry->ecx, __entry->data,
+ __entry->exception ? " (#GP)" : "")
+);
+
+#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false)
+#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false)
+#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true)
+#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true)
+
+/*
+ * Tracepoint for guest CR access.
+ */
+TRACE_EVENT(kvm_cr,
+ TP_PROTO(unsigned int rw, unsigned int cr, unsigned long val),
+ TP_ARGS(rw, cr, val),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, cr )
+ __field( unsigned long, val )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->cr = cr;
+ __entry->val = val;
+ ),
+
+ TP_printk("cr_%s %x = 0x%lx",
+ __entry->rw ? "write" : "read",
+ __entry->cr, __entry->val)
+);
+
+#define trace_kvm_cr_read(cr, val) trace_kvm_cr(0, cr, val)
+#define trace_kvm_cr_write(cr, val) trace_kvm_cr(1, cr, val)
+
+TRACE_EVENT(kvm_pic_set_irq,
+ TP_PROTO(__u8 chip, __u8 pin, __u8 elcr, __u8 imr, bool coalesced),
+ TP_ARGS(chip, pin, elcr, imr, coalesced),
+
+ TP_STRUCT__entry(
+ __field( __u8, chip )
+ __field( __u8, pin )
+ __field( __u8, elcr )
+ __field( __u8, imr )
+ __field( bool, coalesced )
+ ),
+
+ TP_fast_assign(
+ __entry->chip = chip;
+ __entry->pin = pin;
+ __entry->elcr = elcr;
+ __entry->imr = imr;
+ __entry->coalesced = coalesced;
+ ),
+
+ TP_printk("chip %u pin %u (%s%s)%s",
+ __entry->chip, __entry->pin,
+ (__entry->elcr & (1 << __entry->pin)) ? "level":"edge",
+ (__entry->imr & (1 << __entry->pin)) ? "|masked":"",
+ __entry->coalesced ? " (coalesced)" : "")
+);
+
+#define kvm_apic_dst_shorthand \
+ {0x0, "dst"}, \
+ {0x1, "self"}, \
+ {0x2, "all"}, \
+ {0x3, "all-but-self"}
+
+TRACE_EVENT(kvm_apic_ipi,
+ TP_PROTO(__u32 icr_low, __u32 dest_id),
+ TP_ARGS(icr_low, dest_id),
+
+ TP_STRUCT__entry(
+ __field( __u32, icr_low )
+ __field( __u32, dest_id )
+ ),
+
+ TP_fast_assign(
+ __entry->icr_low = icr_low;
+ __entry->dest_id = dest_id;
+ ),
+
+ TP_printk("dst %x vec %u (%s|%s|%s|%s|%s)",
+ __entry->dest_id, (u8)__entry->icr_low,
+ __print_symbolic((__entry->icr_low >> 8 & 0x7),
+ kvm_deliver_mode),
+ (__entry->icr_low & (1<<11)) ? "logical" : "physical",
+ (__entry->icr_low & (1<<14)) ? "assert" : "de-assert",
+ (__entry->icr_low & (1<<15)) ? "level" : "edge",
+ __print_symbolic((__entry->icr_low >> 18 & 0x3),
+ kvm_apic_dst_shorthand))
+);
+
+TRACE_EVENT(kvm_apic_accept_irq,
+ TP_PROTO(__u32 apicid, __u16 dm, __u16 tm, __u8 vec),
+ TP_ARGS(apicid, dm, tm, vec),
+
+ TP_STRUCT__entry(
+ __field( __u32, apicid )
+ __field( __u16, dm )
+ __field( __u16, tm )
+ __field( __u8, vec )
+ ),
+
+ TP_fast_assign(
+ __entry->apicid = apicid;
+ __entry->dm = dm;
+ __entry->tm = tm;
+ __entry->vec = vec;
+ ),
+
+ TP_printk("apicid %x vec %u (%s|%s)",
+ __entry->apicid, __entry->vec,
+ __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode),
+ __entry->tm ? "level" : "edge")
+);
+
+TRACE_EVENT(kvm_eoi,
+ TP_PROTO(struct kvm_lapic *apic, int vector),
+ TP_ARGS(apic, vector),
+
+ TP_STRUCT__entry(
+ __field( __u32, apicid )
+ __field( int, vector )
+ ),
+
+ TP_fast_assign(
+ __entry->apicid = apic->vcpu->vcpu_id;
+ __entry->vector = vector;
+ ),
+
+ TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
+);
+
+TRACE_EVENT(kvm_pv_eoi,
+ TP_PROTO(struct kvm_lapic *apic, int vector),
+ TP_ARGS(apic, vector),
+
+ TP_STRUCT__entry(
+ __field( __u32, apicid )
+ __field( int, vector )
+ ),
+
+ TP_fast_assign(
+ __entry->apicid = apic->vcpu->vcpu_id;
+ __entry->vector = vector;
+ ),
+
+ TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
+);
+
+/*
+ * Tracepoint for nested VMRUN
+ */
+TRACE_EVENT(kvm_nested_vmrun,
+ TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl,
+ __u32 event_inj, bool npt),
+ TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, npt),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( __u64, vmcb )
+ __field( __u64, nested_rip )
+ __field( __u32, int_ctl )
+ __field( __u32, event_inj )
+ __field( bool, npt )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ __entry->vmcb = vmcb;
+ __entry->nested_rip = nested_rip;
+ __entry->int_ctl = int_ctl;
+ __entry->event_inj = event_inj;
+ __entry->npt = npt;
+ ),
+
+ TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
+ "event_inj: 0x%08x npt: %s",
+ __entry->rip, __entry->vmcb, __entry->nested_rip,
+ __entry->int_ctl, __entry->event_inj,
+ __entry->npt ? "on" : "off")
+);
+
+TRACE_EVENT(kvm_nested_intercepts,
+ TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept),
+ TP_ARGS(cr_read, cr_write, exceptions, intercept),
+
+ TP_STRUCT__entry(
+ __field( __u16, cr_read )
+ __field( __u16, cr_write )
+ __field( __u32, exceptions )
+ __field( __u64, intercept )
+ ),
+
+ TP_fast_assign(
+ __entry->cr_read = cr_read;
+ __entry->cr_write = cr_write;
+ __entry->exceptions = exceptions;
+ __entry->intercept = intercept;
+ ),
+
+ TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx",
+ __entry->cr_read, __entry->cr_write, __entry->exceptions,
+ __entry->intercept)
+);
+/*
+ * Tracepoint for #VMEXIT while nested
+ */
+TRACE_EVENT(kvm_nested_vmexit,
+ TP_PROTO(__u64 rip, __u32 exit_code,
+ __u64 exit_info1, __u64 exit_info2,
+ __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
+ TP_ARGS(rip, exit_code, exit_info1, exit_info2,
+ exit_int_info, exit_int_info_err, isa),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( __u32, exit_code )
+ __field( __u64, exit_info1 )
+ __field( __u64, exit_info2 )
+ __field( __u32, exit_int_info )
+ __field( __u32, exit_int_info_err )
+ __field( __u32, isa )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ __entry->exit_code = exit_code;
+ __entry->exit_info1 = exit_info1;
+ __entry->exit_info2 = exit_info2;
+ __entry->exit_int_info = exit_int_info;
+ __entry->exit_int_info_err = exit_int_info_err;
+ __entry->isa = isa;
+ ),
+ TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
+ "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
+ __entry->rip,
+ (__entry->isa == KVM_ISA_VMX) ?
+ __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) :
+ __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS),
+ __entry->exit_info1, __entry->exit_info2,
+ __entry->exit_int_info, __entry->exit_int_info_err)
+);
+
+/*
+ * Tracepoint for #VMEXIT reinjected to the guest
+ */
+TRACE_EVENT(kvm_nested_vmexit_inject,
+ TP_PROTO(__u32 exit_code,
+ __u64 exit_info1, __u64 exit_info2,
+ __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
+ TP_ARGS(exit_code, exit_info1, exit_info2,
+ exit_int_info, exit_int_info_err, isa),
+
+ TP_STRUCT__entry(
+ __field( __u32, exit_code )
+ __field( __u64, exit_info1 )
+ __field( __u64, exit_info2 )
+ __field( __u32, exit_int_info )
+ __field( __u32, exit_int_info_err )
+ __field( __u32, isa )
+ ),
+
+ TP_fast_assign(
+ __entry->exit_code = exit_code;
+ __entry->exit_info1 = exit_info1;
+ __entry->exit_info2 = exit_info2;
+ __entry->exit_int_info = exit_int_info;
+ __entry->exit_int_info_err = exit_int_info_err;
+ __entry->isa = isa;
+ ),
+
+ TP_printk("reason: %s ext_inf1: 0x%016llx "
+ "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
+ (__entry->isa == KVM_ISA_VMX) ?
+ __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) :
+ __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS),
+ __entry->exit_info1, __entry->exit_info2,
+ __entry->exit_int_info, __entry->exit_int_info_err)
+);
+
+/*
+ * Tracepoint for nested #vmexit because of interrupt pending
+ */
+TRACE_EVENT(kvm_nested_intr_vmexit,
+ TP_PROTO(__u64 rip),
+ TP_ARGS(rip),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip
+ ),
+
+ TP_printk("rip: 0x%016llx", __entry->rip)
+);
+
+/*
+ * Tracepoint for nested #vmexit because of interrupt pending
+ */
+TRACE_EVENT(kvm_invlpga,
+ TP_PROTO(__u64 rip, int asid, u64 address),
+ TP_ARGS(rip, asid, address),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( int, asid )
+ __field( __u64, address )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ __entry->asid = asid;
+ __entry->address = address;
+ ),
+
+ TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx",
+ __entry->rip, __entry->asid, __entry->address)
+);
+
+/*
+ * Tracepoint for nested #vmexit because of interrupt pending
+ */
+TRACE_EVENT(kvm_skinit,
+ TP_PROTO(__u64 rip, __u32 slb),
+ TP_ARGS(rip, slb),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( __u32, slb )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ __entry->slb = slb;
+ ),
+
+ TP_printk("rip: 0x%016llx slb: 0x%08x",
+ __entry->rip, __entry->slb)
+);
+
+#define KVM_EMUL_INSN_F_CR0_PE (1 << 0)
+#define KVM_EMUL_INSN_F_EFL_VM (1 << 1)
+#define KVM_EMUL_INSN_F_CS_D (1 << 2)
+#define KVM_EMUL_INSN_F_CS_L (1 << 3)
+
+#define kvm_trace_symbol_emul_flags \
+ { 0, "real" }, \
+ { KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_EFL_VM, "vm16" }, \
+ { KVM_EMUL_INSN_F_CR0_PE, "prot16" }, \
+ { KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_D, "prot32" }, \
+ { KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_L, "prot64" }
+
+#define kei_decode_mode(mode) ({ \
+ u8 flags = 0xff; \
+ switch (mode) { \
+ case X86EMUL_MODE_REAL: \
+ flags = 0; \
+ break; \
+ case X86EMUL_MODE_VM86: \
+ flags = KVM_EMUL_INSN_F_EFL_VM; \
+ break; \
+ case X86EMUL_MODE_PROT16: \
+ flags = KVM_EMUL_INSN_F_CR0_PE; \
+ break; \
+ case X86EMUL_MODE_PROT32: \
+ flags = KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_D; \
+ break; \
+ case X86EMUL_MODE_PROT64: \
+ flags = KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_L; \
+ break; \
+ } \
+ flags; \
+ })
+
+TRACE_EVENT(kvm_emulate_insn,
+ TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed),
+ TP_ARGS(vcpu, failed),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( __u32, csbase )
+ __field( __u8, len )
+ __array( __u8, insn, 15 )
+ __field( __u8, flags )
+ __field( __u8, failed )
+ ),
+
+ TP_fast_assign(
+ __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
+ __entry->len = vcpu->arch.emulate_ctxt.fetch.ptr
+ - vcpu->arch.emulate_ctxt.fetch.data;
+ __entry->rip = vcpu->arch.emulate_ctxt._eip - __entry->len;
+ memcpy(__entry->insn,
+ vcpu->arch.emulate_ctxt.fetch.data,
+ 15);
+ __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode);
+ __entry->failed = failed;
+ ),
+
+ TP_printk("%x:%llx:%s (%s)%s",
+ __entry->csbase, __entry->rip,
+ __print_hex(__entry->insn, __entry->len),
+ __print_symbolic(__entry->flags,
+ kvm_trace_symbol_emul_flags),
+ __entry->failed ? " failed" : ""
+ )
+ );
+
+#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
+#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
+
+TRACE_EVENT(
+ vcpu_match_mmio,
+ TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match),
+ TP_ARGS(gva, gpa, write, gpa_match),
+
+ TP_STRUCT__entry(
+ __field(gva_t, gva)
+ __field(gpa_t, gpa)
+ __field(bool, write)
+ __field(bool, gpa_match)
+ ),
+
+ TP_fast_assign(
+ __entry->gva = gva;
+ __entry->gpa = gpa;
+ __entry->write = write;
+ __entry->gpa_match = gpa_match
+ ),
+
+ TP_printk("gva %#lx gpa %#llx %s %s", __entry->gva, __entry->gpa,
+ __entry->write ? "Write" : "Read",
+ __entry->gpa_match ? "GPA" : "GVA")
+);
+
+TRACE_EVENT(kvm_write_tsc_offset,
+ TP_PROTO(unsigned int vcpu_id, __u64 previous_tsc_offset,
+ __u64 next_tsc_offset),
+ TP_ARGS(vcpu_id, previous_tsc_offset, next_tsc_offset),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( __u64, previous_tsc_offset )
+ __field( __u64, next_tsc_offset )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->previous_tsc_offset = previous_tsc_offset;
+ __entry->next_tsc_offset = next_tsc_offset;
+ ),
+
+ TP_printk("vcpu=%u prev=%llu next=%llu", __entry->vcpu_id,
+ __entry->previous_tsc_offset, __entry->next_tsc_offset)
+);
+
+#ifdef CONFIG_X86_64
+
+#define host_clocks \
+ {VCLOCK_NONE, "none"}, \
+ {VCLOCK_TSC, "tsc"} \
+
+TRACE_EVENT(kvm_update_master_clock,
+ TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched),
+ TP_ARGS(use_master_clock, host_clock, offset_matched),
+
+ TP_STRUCT__entry(
+ __field( bool, use_master_clock )
+ __field( unsigned int, host_clock )
+ __field( bool, offset_matched )
+ ),
+
+ TP_fast_assign(
+ __entry->use_master_clock = use_master_clock;
+ __entry->host_clock = host_clock;
+ __entry->offset_matched = offset_matched;
+ ),
+
+ TP_printk("masterclock %d hostclock %s offsetmatched %u",
+ __entry->use_master_clock,
+ __print_symbolic(__entry->host_clock, host_clocks),
+ __entry->offset_matched)
+);
+
+TRACE_EVENT(kvm_track_tsc,
+ TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched,
+ unsigned int online_vcpus, bool use_master_clock,
+ unsigned int host_clock),
+ TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock,
+ host_clock),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( unsigned int, nr_vcpus_matched_tsc )
+ __field( unsigned int, online_vcpus )
+ __field( bool, use_master_clock )
+ __field( unsigned int, host_clock )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->nr_vcpus_matched_tsc = nr_matched;
+ __entry->online_vcpus = online_vcpus;
+ __entry->use_master_clock = use_master_clock;
+ __entry->host_clock = host_clock;
+ ),
+
+ TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u"
+ " hostclock %s",
+ __entry->vcpu_id, __entry->use_master_clock,
+ __entry->nr_vcpus_matched_tsc, __entry->online_vcpus,
+ __print_symbolic(__entry->host_clock, host_clocks))
+);
+
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Tracepoint for PML full VMEXIT.
+ */
+TRACE_EVENT(kvm_pml_full,
+ TP_PROTO(unsigned int vcpu_id),
+ TP_ARGS(vcpu_id),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ ),
+
+ TP_printk("vcpu %d: PML full", __entry->vcpu_id)
+);
+
+TRACE_EVENT(kvm_ple_window,
+ TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old),
+ TP_ARGS(grow, vcpu_id, new, old),
+
+ TP_STRUCT__entry(
+ __field( bool, grow )
+ __field( unsigned int, vcpu_id )
+ __field( int, new )
+ __field( int, old )
+ ),
+
+ TP_fast_assign(
+ __entry->grow = grow;
+ __entry->vcpu_id = vcpu_id;
+ __entry->new = new;
+ __entry->old = old;
+ ),
+
+ TP_printk("vcpu %u: ple_window %d (%s %d)",
+ __entry->vcpu_id,
+ __entry->new,
+ __entry->grow ? "grow" : "shrink",
+ __entry->old)
+);
+
+#define trace_kvm_ple_window_grow(vcpu_id, new, old) \
+ trace_kvm_ple_window(true, vcpu_id, new, old)
+#define trace_kvm_ple_window_shrink(vcpu_id, new, old) \
+ trace_kvm_ple_window(false, vcpu_id, new, old)
+
+TRACE_EVENT(kvm_pvclock_update,
+ TP_PROTO(unsigned int vcpu_id, struct pvclock_vcpu_time_info *pvclock),
+ TP_ARGS(vcpu_id, pvclock),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( __u32, version )
+ __field( __u64, tsc_timestamp )
+ __field( __u64, system_time )
+ __field( __u32, tsc_to_system_mul )
+ __field( __s8, tsc_shift )
+ __field( __u8, flags )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->version = pvclock->version;
+ __entry->tsc_timestamp = pvclock->tsc_timestamp;
+ __entry->system_time = pvclock->system_time;
+ __entry->tsc_to_system_mul = pvclock->tsc_to_system_mul;
+ __entry->tsc_shift = pvclock->tsc_shift;
+ __entry->flags = pvclock->flags;
+ ),
+
+ TP_printk("vcpu_id %u, pvclock { version %u, tsc_timestamp 0x%llx, "
+ "system_time 0x%llx, tsc_to_system_mul 0x%x, tsc_shift %d, "
+ "flags 0x%x }",
+ __entry->vcpu_id,
+ __entry->version,
+ __entry->tsc_timestamp,
+ __entry->system_time,
+ __entry->tsc_to_system_mul,
+ __entry->tsc_shift,
+ __entry->flags)
+);
+
+TRACE_EVENT(kvm_wait_lapic_expire,
+ TP_PROTO(unsigned int vcpu_id, s64 delta),
+ TP_ARGS(vcpu_id, delta),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( s64, delta )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->delta = delta;
+ ),
+
+ TP_printk("vcpu %u: delta %lld (%s)",
+ __entry->vcpu_id,
+ __entry->delta,
+ __entry->delta < 0 ? "early" : "late")
+);
+
+TRACE_EVENT(kvm_enter_smm,
+ TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering),
+ TP_ARGS(vcpu_id, smbase, entering),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( u64, smbase )
+ __field( bool, entering )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->smbase = smbase;
+ __entry->entering = entering;
+ ),
+
+ TP_printk("vcpu %u: %s SMM, smbase 0x%llx",
+ __entry->vcpu_id,
+ __entry->entering ? "entering" : "leaving",
+ __entry->smbase)
+);
+
+/*
+ * Tracepoint for VT-d posted-interrupts.
+ */
+TRACE_EVENT(kvm_pi_irte_update,
+ TP_PROTO(unsigned int host_irq, unsigned int vcpu_id,
+ unsigned int gsi, unsigned int gvec,
+ u64 pi_desc_addr, bool set),
+ TP_ARGS(host_irq, vcpu_id, gsi, gvec, pi_desc_addr, set),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, host_irq )
+ __field( unsigned int, vcpu_id )
+ __field( unsigned int, gsi )
+ __field( unsigned int, gvec )
+ __field( u64, pi_desc_addr )
+ __field( bool, set )
+ ),
+
+ TP_fast_assign(
+ __entry->host_irq = host_irq;
+ __entry->vcpu_id = vcpu_id;
+ __entry->gsi = gsi;
+ __entry->gvec = gvec;
+ __entry->pi_desc_addr = pi_desc_addr;
+ __entry->set = set;
+ ),
+
+ TP_printk("VT-d PI is %s for irq %u, vcpu %u, gsi: 0x%x, "
+ "gvec: 0x%x, pi_desc_addr: 0x%llx",
+ __entry->set ? "enabled and being updated" : "disabled",
+ __entry->host_irq,
+ __entry->vcpu_id,
+ __entry->gsi,
+ __entry->gvec,
+ __entry->pi_desc_addr)
+);
+
+/*
+ * Tracepoint for kvm_hv_notify_acked_sint.
+ */
+TRACE_EVENT(kvm_hv_notify_acked_sint,
+ TP_PROTO(int vcpu_id, u32 sint),
+ TP_ARGS(vcpu_id, sint),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, sint)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->sint = sint;
+ ),
+
+ TP_printk("vcpu_id %d sint %u", __entry->vcpu_id, __entry->sint)
+);
+
+/*
+ * Tracepoint for synic_set_irq.
+ */
+TRACE_EVENT(kvm_hv_synic_set_irq,
+ TP_PROTO(int vcpu_id, u32 sint, int vector, int ret),
+ TP_ARGS(vcpu_id, sint, vector, ret),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, sint)
+ __field(int, vector)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->sint = sint;
+ __entry->vector = vector;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("vcpu_id %d sint %u vector %d ret %d",
+ __entry->vcpu_id, __entry->sint, __entry->vector,
+ __entry->ret)
+);
+
+/*
+ * Tracepoint for kvm_hv_synic_send_eoi.
+ */
+TRACE_EVENT(kvm_hv_synic_send_eoi,
+ TP_PROTO(int vcpu_id, int vector),
+ TP_ARGS(vcpu_id, vector),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, sint)
+ __field(int, vector)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->vector = vector;
+ ),
+
+ TP_printk("vcpu_id %d vector %d", __entry->vcpu_id, __entry->vector)
+);
+
+/*
+ * Tracepoint for synic_set_msr.
+ */
+TRACE_EVENT(kvm_hv_synic_set_msr,
+ TP_PROTO(int vcpu_id, u32 msr, u64 data, bool host),
+ TP_ARGS(vcpu_id, msr, data, host),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, msr)
+ __field(u64, data)
+ __field(bool, host)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->msr = msr;
+ __entry->data = data;
+ __entry->host = host
+ ),
+
+ TP_printk("vcpu_id %d msr 0x%x data 0x%llx host %d",
+ __entry->vcpu_id, __entry->msr, __entry->data, __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_set_config.
+ */
+TRACE_EVENT(kvm_hv_stimer_set_config,
+ TP_PROTO(int vcpu_id, int timer_index, u64 config, bool host),
+ TP_ARGS(vcpu_id, timer_index, config, host),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, config)
+ __field(bool, host)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->config = config;
+ __entry->host = host;
+ ),
+
+ TP_printk("vcpu_id %d timer %d config 0x%llx host %d",
+ __entry->vcpu_id, __entry->timer_index, __entry->config,
+ __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_set_count.
+ */
+TRACE_EVENT(kvm_hv_stimer_set_count,
+ TP_PROTO(int vcpu_id, int timer_index, u64 count, bool host),
+ TP_ARGS(vcpu_id, timer_index, count, host),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, count)
+ __field(bool, host)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->count = count;
+ __entry->host = host;
+ ),
+
+ TP_printk("vcpu_id %d timer %d count %llu host %d",
+ __entry->vcpu_id, __entry->timer_index, __entry->count,
+ __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_start(periodic timer case).
+ */
+TRACE_EVENT(kvm_hv_stimer_start_periodic,
+ TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 exp_time),
+ TP_ARGS(vcpu_id, timer_index, time_now, exp_time),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, time_now)
+ __field(u64, exp_time)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->time_now = time_now;
+ __entry->exp_time = exp_time;
+ ),
+
+ TP_printk("vcpu_id %d timer %d time_now %llu exp_time %llu",
+ __entry->vcpu_id, __entry->timer_index, __entry->time_now,
+ __entry->exp_time)
+);
+
+/*
+ * Tracepoint for stimer_start(one-shot timer case).
+ */
+TRACE_EVENT(kvm_hv_stimer_start_one_shot,
+ TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 count),
+ TP_ARGS(vcpu_id, timer_index, time_now, count),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, time_now)
+ __field(u64, count)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->time_now = time_now;
+ __entry->count = count;
+ ),
+
+ TP_printk("vcpu_id %d timer %d time_now %llu count %llu",
+ __entry->vcpu_id, __entry->timer_index, __entry->time_now,
+ __entry->count)
+);
+
+/*
+ * Tracepoint for stimer_timer_callback.
+ */
+TRACE_EVENT(kvm_hv_stimer_callback,
+ TP_PROTO(int vcpu_id, int timer_index),
+ TP_ARGS(vcpu_id, timer_index),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ ),
+
+ TP_printk("vcpu_id %d timer %d",
+ __entry->vcpu_id, __entry->timer_index)
+);
+
+/*
+ * Tracepoint for stimer_expiration.
+ */
+TRACE_EVENT(kvm_hv_stimer_expiration,
+ TP_PROTO(int vcpu_id, int timer_index, int msg_send_result),
+ TP_ARGS(vcpu_id, timer_index, msg_send_result),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(int, msg_send_result)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->msg_send_result = msg_send_result;
+ ),
+
+ TP_printk("vcpu_id %d timer %d msg send result %d",
+ __entry->vcpu_id, __entry->timer_index,
+ __entry->msg_send_result)
+);
+
+/*
+ * Tracepoint for stimer_cleanup.
+ */
+TRACE_EVENT(kvm_hv_stimer_cleanup,
+ TP_PROTO(int vcpu_id, int timer_index),
+ TP_ARGS(vcpu_id, timer_index),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ ),
+
+ TP_printk("vcpu_id %d timer %d",
+ __entry->vcpu_id, __entry->timer_index)
+);
+
+/*
+ * Tracepoint for AMD AVIC
+ */
+TRACE_EVENT(kvm_avic_incomplete_ipi,
+ TP_PROTO(u32 vcpu, u32 icrh, u32 icrl, u32 id, u32 index),
+ TP_ARGS(vcpu, icrh, icrl, id, index),
+
+ TP_STRUCT__entry(
+ __field(u32, vcpu)
+ __field(u32, icrh)
+ __field(u32, icrl)
+ __field(u32, id)
+ __field(u32, index)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu = vcpu;
+ __entry->icrh = icrh;
+ __entry->icrl = icrl;
+ __entry->id = id;
+ __entry->index = index;
+ ),
+
+ TP_printk("vcpu=%u, icrh:icrl=%#010x:%08x, id=%u, index=%u\n",
+ __entry->vcpu, __entry->icrh, __entry->icrl,
+ __entry->id, __entry->index)
+);
+
+TRACE_EVENT(kvm_avic_unaccelerated_access,
+ TP_PROTO(u32 vcpu, u32 offset, bool ft, bool rw, u32 vec),
+ TP_ARGS(vcpu, offset, ft, rw, vec),
+
+ TP_STRUCT__entry(
+ __field(u32, vcpu)
+ __field(u32, offset)
+ __field(bool, ft)
+ __field(bool, rw)
+ __field(u32, vec)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu = vcpu;
+ __entry->offset = offset;
+ __entry->ft = ft;
+ __entry->rw = rw;
+ __entry->vec = vec;
+ ),
+
+ TP_printk("vcpu=%u, offset=%#x(%s), %s, %s, vec=%#x\n",
+ __entry->vcpu,
+ __entry->offset,
+ __print_symbolic(__entry->offset, kvm_trace_symbol_apic),
+ __entry->ft ? "trap" : "fault",
+ __entry->rw ? "write" : "read",
+ __entry->vec)
+);
+
+TRACE_EVENT(kvm_hv_timer_state,
+ TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use),
+ TP_ARGS(vcpu_id, hv_timer_in_use),
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(unsigned int, hv_timer_in_use)
+ ),
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->hv_timer_in_use = hv_timer_in_use;
+ ),
+ TP_printk("vcpu_id %x hv_timer %x\n",
+ __entry->vcpu_id,
+ __entry->hv_timer_in_use)
+);
+
+/*
+ * Tracepoint for kvm_hv_flush_tlb.
+ */
+TRACE_EVENT(kvm_hv_flush_tlb,
+ TP_PROTO(u64 processor_mask, u64 address_space, u64 flags),
+ TP_ARGS(processor_mask, address_space, flags),
+
+ TP_STRUCT__entry(
+ __field(u64, processor_mask)
+ __field(u64, address_space)
+ __field(u64, flags)
+ ),
+
+ TP_fast_assign(
+ __entry->processor_mask = processor_mask;
+ __entry->address_space = address_space;
+ __entry->flags = flags;
+ ),
+
+ TP_printk("processor_mask 0x%llx address_space 0x%llx flags 0x%llx",
+ __entry->processor_mask, __entry->address_space,
+ __entry->flags)
+);
+
+/*
+ * Tracepoint for kvm_hv_flush_tlb_ex.
+ */
+TRACE_EVENT(kvm_hv_flush_tlb_ex,
+ TP_PROTO(u64 valid_bank_mask, u64 format, u64 address_space, u64 flags),
+ TP_ARGS(valid_bank_mask, format, address_space, flags),
+
+ TP_STRUCT__entry(
+ __field(u64, valid_bank_mask)
+ __field(u64, format)
+ __field(u64, address_space)
+ __field(u64, flags)
+ ),
+
+ TP_fast_assign(
+ __entry->valid_bank_mask = valid_bank_mask;
+ __entry->format = format;
+ __entry->address_space = address_space;
+ __entry->flags = flags;
+ ),
+
+ TP_printk("valid_bank_mask 0x%llx format 0x%llx "
+ "address_space 0x%llx flags 0x%llx",
+ __entry->valid_bank_mask, __entry->format,
+ __entry->address_space, __entry->flags)
+);
+#endif /* _TRACE_KVM_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH arch/x86/kvm
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/tss.h b/arch/x86/kvm/tss.h
new file mode 100644
index 000000000..3f9150125
--- /dev/null
+++ b/arch/x86/kvm/tss.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TSS_SEGMENT_H
+#define __TSS_SEGMENT_H
+
+struct tss_segment_32 {
+ u32 prev_task_link;
+ u32 esp0;
+ u32 ss0;
+ u32 esp1;
+ u32 ss1;
+ u32 esp2;
+ u32 ss2;
+ u32 cr3;
+ u32 eip;
+ u32 eflags;
+ u32 eax;
+ u32 ecx;
+ u32 edx;
+ u32 ebx;
+ u32 esp;
+ u32 ebp;
+ u32 esi;
+ u32 edi;
+ u32 es;
+ u32 cs;
+ u32 ss;
+ u32 ds;
+ u32 fs;
+ u32 gs;
+ u32 ldt_selector;
+ u16 t;
+ u16 io_map;
+};
+
+struct tss_segment_16 {
+ u16 prev_task_link;
+ u16 sp0;
+ u16 ss0;
+ u16 sp1;
+ u16 ss1;
+ u16 sp2;
+ u16 ss2;
+ u16 ip;
+ u16 flag;
+ u16 ax;
+ u16 cx;
+ u16 dx;
+ u16 bx;
+ u16 sp;
+ u16 bp;
+ u16 si;
+ u16 di;
+ u16 es;
+ u16 cs;
+ u16 ss;
+ u16 ds;
+ u16 ldt;
+};
+
+#endif
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
new file mode 100644
index 000000000..44cce3e8e
--- /dev/null
+++ b/arch/x86/kvm/vmx.c
@@ -0,0 +1,14715 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "irq.h"
+#include "mmu.h"
+#include "cpuid.h"
+#include "lapic.h"
+
+#include <linux/kvm_host.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/sched.h>
+#include <linux/sched/smt.h>
+#include <linux/moduleparam.h>
+#include <linux/mod_devicetable.h>
+#include <linux/trace_events.h>
+#include <linux/slab.h>
+#include <linux/tboot.h>
+#include <linux/hrtimer.h>
+#include <linux/frame.h>
+#include <linux/nospec.h>
+#include "kvm_cache_regs.h"
+#include "x86.h"
+
+#include <asm/asm.h>
+#include <asm/cpu.h>
+#include <asm/io.h>
+#include <asm/desc.h>
+#include <asm/vmx.h>
+#include <asm/virtext.h>
+#include <asm/mce.h>
+#include <asm/fpu/internal.h>
+#include <asm/perf_event.h>
+#include <asm/debugreg.h>
+#include <asm/kexec.h>
+#include <asm/apic.h>
+#include <asm/irq_remapping.h>
+#include <asm/mmu_context.h>
+#include <asm/spec-ctrl.h>
+#include <asm/mshyperv.h>
+
+#include "trace.h"
+#include "pmu.h"
+#include "vmx_evmcs.h"
+
+#define __ex(x) __kvm_handle_fault_on_reboot(x)
+#define __ex_clear(x, reg) \
+ ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
+
+MODULE_AUTHOR("Qumranet");
+MODULE_LICENSE("GPL");
+
+static const struct x86_cpu_id vmx_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_VMX),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
+
+static bool __read_mostly enable_vpid = 1;
+module_param_named(vpid, enable_vpid, bool, 0444);
+
+static bool __read_mostly enable_vnmi = 1;
+module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
+
+static bool __read_mostly flexpriority_enabled = 1;
+module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
+
+static bool __read_mostly enable_ept = 1;
+module_param_named(ept, enable_ept, bool, S_IRUGO);
+
+static bool __read_mostly enable_unrestricted_guest = 1;
+module_param_named(unrestricted_guest,
+ enable_unrestricted_guest, bool, S_IRUGO);
+
+static bool __read_mostly enable_ept_ad_bits = 1;
+module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
+
+static bool __read_mostly emulate_invalid_guest_state = true;
+module_param(emulate_invalid_guest_state, bool, S_IRUGO);
+
+static bool __read_mostly fasteoi = 1;
+module_param(fasteoi, bool, S_IRUGO);
+
+static bool __read_mostly enable_apicv = 1;
+module_param(enable_apicv, bool, S_IRUGO);
+
+static bool __read_mostly enable_shadow_vmcs = 1;
+module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
+/*
+ * If nested=1, nested virtualization is supported, i.e., guests may use
+ * VMX and be a hypervisor for its own guests. If nested=0, guests may not
+ * use VMX instructions.
+ */
+static bool __read_mostly nested = 0;
+module_param(nested, bool, S_IRUGO);
+
+static u64 __read_mostly host_xss;
+
+static bool __read_mostly enable_pml = 1;
+module_param_named(pml, enable_pml, bool, S_IRUGO);
+
+#define MSR_TYPE_R 1
+#define MSR_TYPE_W 2
+#define MSR_TYPE_RW 3
+
+#define MSR_BITMAP_MODE_X2APIC 1
+#define MSR_BITMAP_MODE_X2APIC_APICV 2
+
+#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
+
+/* Guest_tsc -> host_tsc conversion requires 64-bit division. */
+static int __read_mostly cpu_preemption_timer_multi;
+static bool __read_mostly enable_preemption_timer = 1;
+#ifdef CONFIG_X86_64
+module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
+#endif
+
+#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
+#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
+#define KVM_VM_CR0_ALWAYS_ON \
+ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \
+ X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
+#define KVM_CR4_GUEST_OWNED_BITS \
+ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
+
+#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
+#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
+#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
+
+#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
+
+#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
+
+/*
+ * Hyper-V requires all of these, so mark them as supported even though
+ * they are just treated the same as all-context.
+ */
+#define VMX_VPID_EXTENT_SUPPORTED_MASK \
+ (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \
+ VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \
+ VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \
+ VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
+
+/*
+ * These 2 parameters are used to config the controls for Pause-Loop Exiting:
+ * ple_gap: upper bound on the amount of time between two successive
+ * executions of PAUSE in a loop. Also indicate if ple enabled.
+ * According to test, this time is usually smaller than 128 cycles.
+ * ple_window: upper bound on the amount of time a guest is allowed to execute
+ * in a PAUSE loop. Tests indicate that most spinlocks are held for
+ * less than 2^12 cycles
+ * Time is measured based on a counter that runs at the same rate as the TSC,
+ * refer SDM volume 3b section 21.6.13 & 22.1.3.
+ */
+static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
+module_param(ple_gap, uint, 0444);
+
+static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
+module_param(ple_window, uint, 0444);
+
+/* Default doubles per-vcpu window every exit. */
+static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
+module_param(ple_window_grow, uint, 0444);
+
+/* Default resets per-vcpu window every exit to ple_window. */
+static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
+module_param(ple_window_shrink, uint, 0444);
+
+/* Default is to compute the maximum so we can never overflow. */
+static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
+module_param(ple_window_max, uint, 0444);
+
+extern const ulong vmx_return;
+
+static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
+static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
+static DEFINE_MUTEX(vmx_l1d_flush_mutex);
+
+/* Storage for pre module init parameter parsing */
+static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
+
+static const struct {
+ const char *option;
+ bool for_parse;
+} vmentry_l1d_param[] = {
+ [VMENTER_L1D_FLUSH_AUTO] = {"auto", true},
+ [VMENTER_L1D_FLUSH_NEVER] = {"never", true},
+ [VMENTER_L1D_FLUSH_COND] = {"cond", true},
+ [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true},
+ [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
+ [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
+};
+
+#define L1D_CACHE_ORDER 4
+static void *vmx_l1d_flush_pages;
+
+/* Control for disabling CPU Fill buffer clear */
+static bool __read_mostly vmx_fb_clear_ctrl_available;
+
+static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
+{
+ struct page *page;
+ unsigned int i;
+
+ if (!enable_ept) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
+ return 0;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
+ u64 msr;
+
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
+ if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
+ return 0;
+ }
+ }
+
+ /* If set to auto use the default l1tf mitigation method */
+ if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
+ switch (l1tf_mitigation) {
+ case L1TF_MITIGATION_OFF:
+ l1tf = VMENTER_L1D_FLUSH_NEVER;
+ break;
+ case L1TF_MITIGATION_FLUSH_NOWARN:
+ case L1TF_MITIGATION_FLUSH:
+ case L1TF_MITIGATION_FLUSH_NOSMT:
+ l1tf = VMENTER_L1D_FLUSH_COND;
+ break;
+ case L1TF_MITIGATION_FULL:
+ case L1TF_MITIGATION_FULL_FORCE:
+ l1tf = VMENTER_L1D_FLUSH_ALWAYS;
+ break;
+ }
+ } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
+ l1tf = VMENTER_L1D_FLUSH_ALWAYS;
+ }
+
+ if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
+ !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
+ if (!page)
+ return -ENOMEM;
+ vmx_l1d_flush_pages = page_address(page);
+
+ /*
+ * Initialize each page with a different pattern in
+ * order to protect against KSM in the nested
+ * virtualization case.
+ */
+ for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
+ memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
+ PAGE_SIZE);
+ }
+ }
+
+ l1tf_vmx_mitigation = l1tf;
+
+ if (l1tf != VMENTER_L1D_FLUSH_NEVER)
+ static_branch_enable(&vmx_l1d_should_flush);
+ else
+ static_branch_disable(&vmx_l1d_should_flush);
+
+ if (l1tf == VMENTER_L1D_FLUSH_COND)
+ static_branch_enable(&vmx_l1d_flush_cond);
+ else
+ static_branch_disable(&vmx_l1d_flush_cond);
+ return 0;
+}
+
+static int vmentry_l1d_flush_parse(const char *s)
+{
+ unsigned int i;
+
+ if (s) {
+ for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
+ if (vmentry_l1d_param[i].for_parse &&
+ sysfs_streq(s, vmentry_l1d_param[i].option))
+ return i;
+ }
+ }
+ return -EINVAL;
+}
+
+static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
+{
+ int l1tf, ret;
+
+ l1tf = vmentry_l1d_flush_parse(s);
+ if (l1tf < 0)
+ return l1tf;
+
+ if (!boot_cpu_has(X86_BUG_L1TF))
+ return 0;
+
+ /*
+ * Has vmx_init() run already? If not then this is the pre init
+ * parameter parsing. In that case just store the value and let
+ * vmx_init() do the proper setup after enable_ept has been
+ * established.
+ */
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
+ vmentry_l1d_flush_param = l1tf;
+ return 0;
+ }
+
+ mutex_lock(&vmx_l1d_flush_mutex);
+ ret = vmx_setup_l1d_flush(l1tf);
+ mutex_unlock(&vmx_l1d_flush_mutex);
+ return ret;
+}
+
+static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
+{
+ if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
+ return sprintf(s, "???\n");
+
+ return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
+}
+
+static const struct kernel_param_ops vmentry_l1d_flush_ops = {
+ .set = vmentry_l1d_flush_set,
+ .get = vmentry_l1d_flush_get,
+};
+module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
+
+enum ept_pointers_status {
+ EPT_POINTERS_CHECK = 0,
+ EPT_POINTERS_MATCH = 1,
+ EPT_POINTERS_MISMATCH = 2
+};
+
+struct kvm_vmx {
+ struct kvm kvm;
+
+ unsigned int tss_addr;
+ bool ept_identity_pagetable_done;
+ gpa_t ept_identity_map_addr;
+
+ enum ept_pointers_status ept_pointers_match;
+ spinlock_t ept_pointer_lock;
+};
+
+#define NR_AUTOLOAD_MSRS 8
+
+struct vmcs_hdr {
+ u32 revision_id:31;
+ u32 shadow_vmcs:1;
+};
+
+struct vmcs {
+ struct vmcs_hdr hdr;
+ u32 abort;
+ char data[0];
+};
+
+/*
+ * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT
+ * and whose values change infrequently, but are not constant. I.e. this is
+ * used as a write-through cache of the corresponding VMCS fields.
+ */
+struct vmcs_host_state {
+ unsigned long cr3; /* May not match real cr3 */
+ unsigned long cr4; /* May not match real cr4 */
+ unsigned long gs_base;
+ unsigned long fs_base;
+
+ u16 fs_sel, gs_sel, ldt_sel;
+#ifdef CONFIG_X86_64
+ u16 ds_sel, es_sel;
+#endif
+};
+
+/*
+ * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
+ * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
+ * loaded on this CPU (so we can clear them if the CPU goes down).
+ */
+struct loaded_vmcs {
+ struct vmcs *vmcs;
+ struct vmcs *shadow_vmcs;
+ int cpu;
+ bool launched;
+ bool nmi_known_unmasked;
+ bool hv_timer_armed;
+ /* Support for vnmi-less CPUs */
+ int soft_vnmi_blocked;
+ ktime_t entry_time;
+ s64 vnmi_blocked_time;
+ unsigned long *msr_bitmap;
+ struct list_head loaded_vmcss_on_cpu_link;
+ struct vmcs_host_state host_state;
+};
+
+struct shared_msr_entry {
+ unsigned index;
+ u64 data;
+ u64 mask;
+};
+
+/*
+ * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
+ * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
+ * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
+ * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
+ * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
+ * More than one of these structures may exist, if L1 runs multiple L2 guests.
+ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
+ * underlying hardware which will be used to run L2.
+ * This structure is packed to ensure that its layout is identical across
+ * machines (necessary for live migration).
+ *
+ * IMPORTANT: Changing the layout of existing fields in this structure
+ * will break save/restore compatibility with older kvm releases. When
+ * adding new fields, either use space in the reserved padding* arrays
+ * or add the new fields to the end of the structure.
+ */
+typedef u64 natural_width;
+struct __packed vmcs12 {
+ /* According to the Intel spec, a VMCS region must start with the
+ * following two fields. Then follow implementation-specific data.
+ */
+ struct vmcs_hdr hdr;
+ u32 abort;
+
+ u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
+ u32 padding[7]; /* room for future expansion */
+
+ u64 io_bitmap_a;
+ u64 io_bitmap_b;
+ u64 msr_bitmap;
+ u64 vm_exit_msr_store_addr;
+ u64 vm_exit_msr_load_addr;
+ u64 vm_entry_msr_load_addr;
+ u64 tsc_offset;
+ u64 virtual_apic_page_addr;
+ u64 apic_access_addr;
+ u64 posted_intr_desc_addr;
+ u64 ept_pointer;
+ u64 eoi_exit_bitmap0;
+ u64 eoi_exit_bitmap1;
+ u64 eoi_exit_bitmap2;
+ u64 eoi_exit_bitmap3;
+ u64 xss_exit_bitmap;
+ u64 guest_physical_address;
+ u64 vmcs_link_pointer;
+ u64 guest_ia32_debugctl;
+ u64 guest_ia32_pat;
+ u64 guest_ia32_efer;
+ u64 guest_ia32_perf_global_ctrl;
+ u64 guest_pdptr0;
+ u64 guest_pdptr1;
+ u64 guest_pdptr2;
+ u64 guest_pdptr3;
+ u64 guest_bndcfgs;
+ u64 host_ia32_pat;
+ u64 host_ia32_efer;
+ u64 host_ia32_perf_global_ctrl;
+ u64 vmread_bitmap;
+ u64 vmwrite_bitmap;
+ u64 vm_function_control;
+ u64 eptp_list_address;
+ u64 pml_address;
+ u64 padding64[3]; /* room for future expansion */
+ /*
+ * To allow migration of L1 (complete with its L2 guests) between
+ * machines of different natural widths (32 or 64 bit), we cannot have
+ * unsigned long fields with no explict size. We use u64 (aliased
+ * natural_width) instead. Luckily, x86 is little-endian.
+ */
+ natural_width cr0_guest_host_mask;
+ natural_width cr4_guest_host_mask;
+ natural_width cr0_read_shadow;
+ natural_width cr4_read_shadow;
+ natural_width cr3_target_value0;
+ natural_width cr3_target_value1;
+ natural_width cr3_target_value2;
+ natural_width cr3_target_value3;
+ natural_width exit_qualification;
+ natural_width guest_linear_address;
+ natural_width guest_cr0;
+ natural_width guest_cr3;
+ natural_width guest_cr4;
+ natural_width guest_es_base;
+ natural_width guest_cs_base;
+ natural_width guest_ss_base;
+ natural_width guest_ds_base;
+ natural_width guest_fs_base;
+ natural_width guest_gs_base;
+ natural_width guest_ldtr_base;
+ natural_width guest_tr_base;
+ natural_width guest_gdtr_base;
+ natural_width guest_idtr_base;
+ natural_width guest_dr7;
+ natural_width guest_rsp;
+ natural_width guest_rip;
+ natural_width guest_rflags;
+ natural_width guest_pending_dbg_exceptions;
+ natural_width guest_sysenter_esp;
+ natural_width guest_sysenter_eip;
+ natural_width host_cr0;
+ natural_width host_cr3;
+ natural_width host_cr4;
+ natural_width host_fs_base;
+ natural_width host_gs_base;
+ natural_width host_tr_base;
+ natural_width host_gdtr_base;
+ natural_width host_idtr_base;
+ natural_width host_ia32_sysenter_esp;
+ natural_width host_ia32_sysenter_eip;
+ natural_width host_rsp;
+ natural_width host_rip;
+ natural_width paddingl[8]; /* room for future expansion */
+ u32 pin_based_vm_exec_control;
+ u32 cpu_based_vm_exec_control;
+ u32 exception_bitmap;
+ u32 page_fault_error_code_mask;
+ u32 page_fault_error_code_match;
+ u32 cr3_target_count;
+ u32 vm_exit_controls;
+ u32 vm_exit_msr_store_count;
+ u32 vm_exit_msr_load_count;
+ u32 vm_entry_controls;
+ u32 vm_entry_msr_load_count;
+ u32 vm_entry_intr_info_field;
+ u32 vm_entry_exception_error_code;
+ u32 vm_entry_instruction_len;
+ u32 tpr_threshold;
+ u32 secondary_vm_exec_control;
+ u32 vm_instruction_error;
+ u32 vm_exit_reason;
+ u32 vm_exit_intr_info;
+ u32 vm_exit_intr_error_code;
+ u32 idt_vectoring_info_field;
+ u32 idt_vectoring_error_code;
+ u32 vm_exit_instruction_len;
+ u32 vmx_instruction_info;
+ u32 guest_es_limit;
+ u32 guest_cs_limit;
+ u32 guest_ss_limit;
+ u32 guest_ds_limit;
+ u32 guest_fs_limit;
+ u32 guest_gs_limit;
+ u32 guest_ldtr_limit;
+ u32 guest_tr_limit;
+ u32 guest_gdtr_limit;
+ u32 guest_idtr_limit;
+ u32 guest_es_ar_bytes;
+ u32 guest_cs_ar_bytes;
+ u32 guest_ss_ar_bytes;
+ u32 guest_ds_ar_bytes;
+ u32 guest_fs_ar_bytes;
+ u32 guest_gs_ar_bytes;
+ u32 guest_ldtr_ar_bytes;
+ u32 guest_tr_ar_bytes;
+ u32 guest_interruptibility_info;
+ u32 guest_activity_state;
+ u32 guest_sysenter_cs;
+ u32 host_ia32_sysenter_cs;
+ u32 vmx_preemption_timer_value;
+ u32 padding32[7]; /* room for future expansion */
+ u16 virtual_processor_id;
+ u16 posted_intr_nv;
+ u16 guest_es_selector;
+ u16 guest_cs_selector;
+ u16 guest_ss_selector;
+ u16 guest_ds_selector;
+ u16 guest_fs_selector;
+ u16 guest_gs_selector;
+ u16 guest_ldtr_selector;
+ u16 guest_tr_selector;
+ u16 guest_intr_status;
+ u16 host_es_selector;
+ u16 host_cs_selector;
+ u16 host_ss_selector;
+ u16 host_ds_selector;
+ u16 host_fs_selector;
+ u16 host_gs_selector;
+ u16 host_tr_selector;
+ u16 guest_pml_index;
+};
+
+/*
+ * For save/restore compatibility, the vmcs12 field offsets must not change.
+ */
+#define CHECK_OFFSET(field, loc) \
+ BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \
+ "Offset of " #field " in struct vmcs12 has changed.")
+
+static inline void vmx_check_vmcs12_offsets(void) {
+ CHECK_OFFSET(hdr, 0);
+ CHECK_OFFSET(abort, 4);
+ CHECK_OFFSET(launch_state, 8);
+ CHECK_OFFSET(io_bitmap_a, 40);
+ CHECK_OFFSET(io_bitmap_b, 48);
+ CHECK_OFFSET(msr_bitmap, 56);
+ CHECK_OFFSET(vm_exit_msr_store_addr, 64);
+ CHECK_OFFSET(vm_exit_msr_load_addr, 72);
+ CHECK_OFFSET(vm_entry_msr_load_addr, 80);
+ CHECK_OFFSET(tsc_offset, 88);
+ CHECK_OFFSET(virtual_apic_page_addr, 96);
+ CHECK_OFFSET(apic_access_addr, 104);
+ CHECK_OFFSET(posted_intr_desc_addr, 112);
+ CHECK_OFFSET(ept_pointer, 120);
+ CHECK_OFFSET(eoi_exit_bitmap0, 128);
+ CHECK_OFFSET(eoi_exit_bitmap1, 136);
+ CHECK_OFFSET(eoi_exit_bitmap2, 144);
+ CHECK_OFFSET(eoi_exit_bitmap3, 152);
+ CHECK_OFFSET(xss_exit_bitmap, 160);
+ CHECK_OFFSET(guest_physical_address, 168);
+ CHECK_OFFSET(vmcs_link_pointer, 176);
+ CHECK_OFFSET(guest_ia32_debugctl, 184);
+ CHECK_OFFSET(guest_ia32_pat, 192);
+ CHECK_OFFSET(guest_ia32_efer, 200);
+ CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208);
+ CHECK_OFFSET(guest_pdptr0, 216);
+ CHECK_OFFSET(guest_pdptr1, 224);
+ CHECK_OFFSET(guest_pdptr2, 232);
+ CHECK_OFFSET(guest_pdptr3, 240);
+ CHECK_OFFSET(guest_bndcfgs, 248);
+ CHECK_OFFSET(host_ia32_pat, 256);
+ CHECK_OFFSET(host_ia32_efer, 264);
+ CHECK_OFFSET(host_ia32_perf_global_ctrl, 272);
+ CHECK_OFFSET(vmread_bitmap, 280);
+ CHECK_OFFSET(vmwrite_bitmap, 288);
+ CHECK_OFFSET(vm_function_control, 296);
+ CHECK_OFFSET(eptp_list_address, 304);
+ CHECK_OFFSET(pml_address, 312);
+ CHECK_OFFSET(cr0_guest_host_mask, 344);
+ CHECK_OFFSET(cr4_guest_host_mask, 352);
+ CHECK_OFFSET(cr0_read_shadow, 360);
+ CHECK_OFFSET(cr4_read_shadow, 368);
+ CHECK_OFFSET(cr3_target_value0, 376);
+ CHECK_OFFSET(cr3_target_value1, 384);
+ CHECK_OFFSET(cr3_target_value2, 392);
+ CHECK_OFFSET(cr3_target_value3, 400);
+ CHECK_OFFSET(exit_qualification, 408);
+ CHECK_OFFSET(guest_linear_address, 416);
+ CHECK_OFFSET(guest_cr0, 424);
+ CHECK_OFFSET(guest_cr3, 432);
+ CHECK_OFFSET(guest_cr4, 440);
+ CHECK_OFFSET(guest_es_base, 448);
+ CHECK_OFFSET(guest_cs_base, 456);
+ CHECK_OFFSET(guest_ss_base, 464);
+ CHECK_OFFSET(guest_ds_base, 472);
+ CHECK_OFFSET(guest_fs_base, 480);
+ CHECK_OFFSET(guest_gs_base, 488);
+ CHECK_OFFSET(guest_ldtr_base, 496);
+ CHECK_OFFSET(guest_tr_base, 504);
+ CHECK_OFFSET(guest_gdtr_base, 512);
+ CHECK_OFFSET(guest_idtr_base, 520);
+ CHECK_OFFSET(guest_dr7, 528);
+ CHECK_OFFSET(guest_rsp, 536);
+ CHECK_OFFSET(guest_rip, 544);
+ CHECK_OFFSET(guest_rflags, 552);
+ CHECK_OFFSET(guest_pending_dbg_exceptions, 560);
+ CHECK_OFFSET(guest_sysenter_esp, 568);
+ CHECK_OFFSET(guest_sysenter_eip, 576);
+ CHECK_OFFSET(host_cr0, 584);
+ CHECK_OFFSET(host_cr3, 592);
+ CHECK_OFFSET(host_cr4, 600);
+ CHECK_OFFSET(host_fs_base, 608);
+ CHECK_OFFSET(host_gs_base, 616);
+ CHECK_OFFSET(host_tr_base, 624);
+ CHECK_OFFSET(host_gdtr_base, 632);
+ CHECK_OFFSET(host_idtr_base, 640);
+ CHECK_OFFSET(host_ia32_sysenter_esp, 648);
+ CHECK_OFFSET(host_ia32_sysenter_eip, 656);
+ CHECK_OFFSET(host_rsp, 664);
+ CHECK_OFFSET(host_rip, 672);
+ CHECK_OFFSET(pin_based_vm_exec_control, 744);
+ CHECK_OFFSET(cpu_based_vm_exec_control, 748);
+ CHECK_OFFSET(exception_bitmap, 752);
+ CHECK_OFFSET(page_fault_error_code_mask, 756);
+ CHECK_OFFSET(page_fault_error_code_match, 760);
+ CHECK_OFFSET(cr3_target_count, 764);
+ CHECK_OFFSET(vm_exit_controls, 768);
+ CHECK_OFFSET(vm_exit_msr_store_count, 772);
+ CHECK_OFFSET(vm_exit_msr_load_count, 776);
+ CHECK_OFFSET(vm_entry_controls, 780);
+ CHECK_OFFSET(vm_entry_msr_load_count, 784);
+ CHECK_OFFSET(vm_entry_intr_info_field, 788);
+ CHECK_OFFSET(vm_entry_exception_error_code, 792);
+ CHECK_OFFSET(vm_entry_instruction_len, 796);
+ CHECK_OFFSET(tpr_threshold, 800);
+ CHECK_OFFSET(secondary_vm_exec_control, 804);
+ CHECK_OFFSET(vm_instruction_error, 808);
+ CHECK_OFFSET(vm_exit_reason, 812);
+ CHECK_OFFSET(vm_exit_intr_info, 816);
+ CHECK_OFFSET(vm_exit_intr_error_code, 820);
+ CHECK_OFFSET(idt_vectoring_info_field, 824);
+ CHECK_OFFSET(idt_vectoring_error_code, 828);
+ CHECK_OFFSET(vm_exit_instruction_len, 832);
+ CHECK_OFFSET(vmx_instruction_info, 836);
+ CHECK_OFFSET(guest_es_limit, 840);
+ CHECK_OFFSET(guest_cs_limit, 844);
+ CHECK_OFFSET(guest_ss_limit, 848);
+ CHECK_OFFSET(guest_ds_limit, 852);
+ CHECK_OFFSET(guest_fs_limit, 856);
+ CHECK_OFFSET(guest_gs_limit, 860);
+ CHECK_OFFSET(guest_ldtr_limit, 864);
+ CHECK_OFFSET(guest_tr_limit, 868);
+ CHECK_OFFSET(guest_gdtr_limit, 872);
+ CHECK_OFFSET(guest_idtr_limit, 876);
+ CHECK_OFFSET(guest_es_ar_bytes, 880);
+ CHECK_OFFSET(guest_cs_ar_bytes, 884);
+ CHECK_OFFSET(guest_ss_ar_bytes, 888);
+ CHECK_OFFSET(guest_ds_ar_bytes, 892);
+ CHECK_OFFSET(guest_fs_ar_bytes, 896);
+ CHECK_OFFSET(guest_gs_ar_bytes, 900);
+ CHECK_OFFSET(guest_ldtr_ar_bytes, 904);
+ CHECK_OFFSET(guest_tr_ar_bytes, 908);
+ CHECK_OFFSET(guest_interruptibility_info, 912);
+ CHECK_OFFSET(guest_activity_state, 916);
+ CHECK_OFFSET(guest_sysenter_cs, 920);
+ CHECK_OFFSET(host_ia32_sysenter_cs, 924);
+ CHECK_OFFSET(vmx_preemption_timer_value, 928);
+ CHECK_OFFSET(virtual_processor_id, 960);
+ CHECK_OFFSET(posted_intr_nv, 962);
+ CHECK_OFFSET(guest_es_selector, 964);
+ CHECK_OFFSET(guest_cs_selector, 966);
+ CHECK_OFFSET(guest_ss_selector, 968);
+ CHECK_OFFSET(guest_ds_selector, 970);
+ CHECK_OFFSET(guest_fs_selector, 972);
+ CHECK_OFFSET(guest_gs_selector, 974);
+ CHECK_OFFSET(guest_ldtr_selector, 976);
+ CHECK_OFFSET(guest_tr_selector, 978);
+ CHECK_OFFSET(guest_intr_status, 980);
+ CHECK_OFFSET(host_es_selector, 982);
+ CHECK_OFFSET(host_cs_selector, 984);
+ CHECK_OFFSET(host_ss_selector, 986);
+ CHECK_OFFSET(host_ds_selector, 988);
+ CHECK_OFFSET(host_fs_selector, 990);
+ CHECK_OFFSET(host_gs_selector, 992);
+ CHECK_OFFSET(host_tr_selector, 994);
+ CHECK_OFFSET(guest_pml_index, 996);
+}
+
+/*
+ * VMCS12_REVISION is an arbitrary id that should be changed if the content or
+ * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
+ * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
+ *
+ * IMPORTANT: Changing this value will break save/restore compatibility with
+ * older kvm releases.
+ */
+#define VMCS12_REVISION 0x11e57ed0
+
+/*
+ * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
+ * and any VMCS region. Although only sizeof(struct vmcs12) are used by the
+ * current implementation, 4K are reserved to avoid future complications.
+ */
+#define VMCS12_SIZE 0x1000
+
+/*
+ * VMCS12_MAX_FIELD_INDEX is the highest index value used in any
+ * supported VMCS12 field encoding.
+ */
+#define VMCS12_MAX_FIELD_INDEX 0x17
+
+struct nested_vmx_msrs {
+ /*
+ * We only store the "true" versions of the VMX capability MSRs. We
+ * generate the "non-true" versions by setting the must-be-1 bits
+ * according to the SDM.
+ */
+ u32 procbased_ctls_low;
+ u32 procbased_ctls_high;
+ u32 secondary_ctls_low;
+ u32 secondary_ctls_high;
+ u32 pinbased_ctls_low;
+ u32 pinbased_ctls_high;
+ u32 exit_ctls_low;
+ u32 exit_ctls_high;
+ u32 entry_ctls_low;
+ u32 entry_ctls_high;
+ u32 misc_low;
+ u32 misc_high;
+ u32 ept_caps;
+ u32 vpid_caps;
+ u64 basic;
+ u64 cr0_fixed0;
+ u64 cr0_fixed1;
+ u64 cr4_fixed0;
+ u64 cr4_fixed1;
+ u64 vmcs_enum;
+ u64 vmfunc_controls;
+};
+
+/*
+ * The nested_vmx structure is part of vcpu_vmx, and holds information we need
+ * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
+ */
+struct nested_vmx {
+ /* Has the level1 guest done vmxon? */
+ bool vmxon;
+ gpa_t vmxon_ptr;
+ bool pml_full;
+
+ /* The guest-physical address of the current VMCS L1 keeps for L2 */
+ gpa_t current_vmptr;
+ /*
+ * Cache of the guest's VMCS, existing outside of guest memory.
+ * Loaded from guest memory during VMPTRLD. Flushed to guest
+ * memory during VMCLEAR and VMPTRLD.
+ */
+ struct vmcs12 *cached_vmcs12;
+ /*
+ * Cache of the guest's shadow VMCS, existing outside of guest
+ * memory. Loaded from guest memory during VM entry. Flushed
+ * to guest memory during VM exit.
+ */
+ struct vmcs12 *cached_shadow_vmcs12;
+ /*
+ * Indicates if the shadow vmcs must be updated with the
+ * data hold by vmcs12
+ */
+ bool sync_shadow_vmcs;
+ bool dirty_vmcs12;
+
+ bool change_vmcs01_virtual_apic_mode;
+
+ /* L2 must run next, and mustn't decide to exit to L1. */
+ bool nested_run_pending;
+
+ struct loaded_vmcs vmcs02;
+
+ /*
+ * Guest pages referred to in the vmcs02 with host-physical
+ * pointers, so we must keep them pinned while L2 runs.
+ */
+ struct page *apic_access_page;
+ struct page *virtual_apic_page;
+ struct page *pi_desc_page;
+ struct pi_desc *pi_desc;
+ bool pi_pending;
+ u16 posted_intr_nv;
+
+ struct hrtimer preemption_timer;
+ bool preemption_timer_expired;
+
+ /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
+ u64 vmcs01_debugctl;
+ u64 vmcs01_guest_bndcfgs;
+
+ u16 vpid02;
+ u16 last_vpid;
+
+ struct nested_vmx_msrs msrs;
+
+ /* SMM related state */
+ struct {
+ /* in VMX operation on SMM entry? */
+ bool vmxon;
+ /* in guest mode on SMM entry? */
+ bool guest_mode;
+ } smm;
+};
+
+#define POSTED_INTR_ON 0
+#define POSTED_INTR_SN 1
+
+/* Posted-Interrupt Descriptor */
+struct pi_desc {
+ u32 pir[8]; /* Posted interrupt requested */
+ union {
+ struct {
+ /* bit 256 - Outstanding Notification */
+ u16 on : 1,
+ /* bit 257 - Suppress Notification */
+ sn : 1,
+ /* bit 271:258 - Reserved */
+ rsvd_1 : 14;
+ /* bit 279:272 - Notification Vector */
+ u8 nv;
+ /* bit 287:280 - Reserved */
+ u8 rsvd_2;
+ /* bit 319:288 - Notification Destination */
+ u32 ndst;
+ };
+ u64 control;
+ };
+ u32 rsvd[6];
+} __aligned(64);
+
+static bool pi_test_and_set_on(struct pi_desc *pi_desc)
+{
+ return test_and_set_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
+{
+ return test_and_clear_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
+{
+ return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
+}
+
+static inline void pi_clear_sn(struct pi_desc *pi_desc)
+{
+ return clear_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_set_sn(struct pi_desc *pi_desc)
+{
+ return set_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_clear_on(struct pi_desc *pi_desc)
+{
+ clear_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_on(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_sn(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+struct vmx_msrs {
+ unsigned int nr;
+ struct vmx_msr_entry val[NR_AUTOLOAD_MSRS];
+};
+
+struct vcpu_vmx {
+ struct kvm_vcpu vcpu;
+ unsigned long host_rsp;
+ u8 fail;
+ u8 msr_bitmap_mode;
+ u32 exit_intr_info;
+ u32 idt_vectoring_info;
+ ulong rflags;
+ struct shared_msr_entry *guest_msrs;
+ int nmsrs;
+ int save_nmsrs;
+ bool guest_msrs_dirty;
+ unsigned long host_idt_base;
+#ifdef CONFIG_X86_64
+ u64 msr_host_kernel_gs_base;
+ u64 msr_guest_kernel_gs_base;
+#endif
+
+ u64 spec_ctrl;
+
+ u32 vm_entry_controls_shadow;
+ u32 vm_exit_controls_shadow;
+ u32 secondary_exec_control;
+
+ /*
+ * loaded_vmcs points to the VMCS currently used in this vcpu. For a
+ * non-nested (L1) guest, it always points to vmcs01. For a nested
+ * guest (L2), it points to a different VMCS. loaded_cpu_state points
+ * to the VMCS whose state is loaded into the CPU registers that only
+ * need to be switched when transitioning to/from the kernel; a NULL
+ * value indicates that host state is loaded.
+ */
+ struct loaded_vmcs vmcs01;
+ struct loaded_vmcs *loaded_vmcs;
+ struct loaded_vmcs *loaded_cpu_state;
+ bool __launched; /* temporary, used in vmx_vcpu_run */
+ struct msr_autoload {
+ struct vmx_msrs guest;
+ struct vmx_msrs host;
+ } msr_autoload;
+
+ struct {
+ int vm86_active;
+ ulong save_rflags;
+ struct kvm_segment segs[8];
+ } rmode;
+ struct {
+ u32 bitmask; /* 4 bits per segment (1 bit per field) */
+ struct kvm_save_segment {
+ u16 selector;
+ unsigned long base;
+ u32 limit;
+ u32 ar;
+ } seg[8];
+ } segment_cache;
+ int vpid;
+ bool emulation_required;
+
+ u32 exit_reason;
+
+ /* Posted interrupt descriptor */
+ struct pi_desc pi_desc;
+
+ /* Support for a guest hypervisor (nested VMX) */
+ struct nested_vmx nested;
+
+ /* Dynamic PLE window. */
+ int ple_window;
+ bool ple_window_dirty;
+
+ bool req_immediate_exit;
+
+ /* Support for PML */
+#define PML_ENTITY_NUM 512
+ struct page *pml_pg;
+
+ /* apic deadline value in host tsc */
+ u64 hv_deadline_tsc;
+
+ u64 current_tsc_ratio;
+
+ u32 host_pkru;
+
+ unsigned long host_debugctlmsr;
+
+ /*
+ * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
+ * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
+ * in msr_ia32_feature_control_valid_bits.
+ */
+ u64 msr_ia32_feature_control;
+ u64 msr_ia32_feature_control_valid_bits;
+ u64 ept_pointer;
+ u64 msr_ia32_mcu_opt_ctrl;
+ bool disable_fb_clear;
+};
+
+enum segment_cache_field {
+ SEG_FIELD_SEL = 0,
+ SEG_FIELD_BASE = 1,
+ SEG_FIELD_LIMIT = 2,
+ SEG_FIELD_AR = 3,
+
+ SEG_FIELD_NR = 4
+};
+
+static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
+{
+ return container_of(kvm, struct kvm_vmx, kvm);
+}
+
+static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
+{
+ return container_of(vcpu, struct vcpu_vmx, vcpu);
+}
+
+static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
+{
+ return &(to_vmx(vcpu)->pi_desc);
+}
+
+#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
+#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
+#define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name)
+#define FIELD64(number, name) \
+ FIELD(number, name), \
+ [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
+
+
+static u16 shadow_read_only_fields[] = {
+#define SHADOW_FIELD_RO(x) x,
+#include "vmx_shadow_fields.h"
+};
+static int max_shadow_read_only_fields =
+ ARRAY_SIZE(shadow_read_only_fields);
+
+static u16 shadow_read_write_fields[] = {
+#define SHADOW_FIELD_RW(x) x,
+#include "vmx_shadow_fields.h"
+};
+static int max_shadow_read_write_fields =
+ ARRAY_SIZE(shadow_read_write_fields);
+
+static const unsigned short vmcs_field_to_offset_table[] = {
+ FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
+ FIELD(POSTED_INTR_NV, posted_intr_nv),
+ FIELD(GUEST_ES_SELECTOR, guest_es_selector),
+ FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
+ FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
+ FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
+ FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
+ FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
+ FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
+ FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
+ FIELD(GUEST_INTR_STATUS, guest_intr_status),
+ FIELD(GUEST_PML_INDEX, guest_pml_index),
+ FIELD(HOST_ES_SELECTOR, host_es_selector),
+ FIELD(HOST_CS_SELECTOR, host_cs_selector),
+ FIELD(HOST_SS_SELECTOR, host_ss_selector),
+ FIELD(HOST_DS_SELECTOR, host_ds_selector),
+ FIELD(HOST_FS_SELECTOR, host_fs_selector),
+ FIELD(HOST_GS_SELECTOR, host_gs_selector),
+ FIELD(HOST_TR_SELECTOR, host_tr_selector),
+ FIELD64(IO_BITMAP_A, io_bitmap_a),
+ FIELD64(IO_BITMAP_B, io_bitmap_b),
+ FIELD64(MSR_BITMAP, msr_bitmap),
+ FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
+ FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
+ FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
+ FIELD64(PML_ADDRESS, pml_address),
+ FIELD64(TSC_OFFSET, tsc_offset),
+ FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
+ FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
+ FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
+ FIELD64(VM_FUNCTION_CONTROL, vm_function_control),
+ FIELD64(EPT_POINTER, ept_pointer),
+ FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
+ FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
+ FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
+ FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
+ FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
+ FIELD64(VMREAD_BITMAP, vmread_bitmap),
+ FIELD64(VMWRITE_BITMAP, vmwrite_bitmap),
+ FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
+ FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
+ FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
+ FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
+ FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
+ FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
+ FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
+ FIELD64(GUEST_PDPTR0, guest_pdptr0),
+ FIELD64(GUEST_PDPTR1, guest_pdptr1),
+ FIELD64(GUEST_PDPTR2, guest_pdptr2),
+ FIELD64(GUEST_PDPTR3, guest_pdptr3),
+ FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
+ FIELD64(HOST_IA32_PAT, host_ia32_pat),
+ FIELD64(HOST_IA32_EFER, host_ia32_efer),
+ FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
+ FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
+ FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
+ FIELD(EXCEPTION_BITMAP, exception_bitmap),
+ FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
+ FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
+ FIELD(CR3_TARGET_COUNT, cr3_target_count),
+ FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
+ FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
+ FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
+ FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
+ FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
+ FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
+ FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
+ FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
+ FIELD(TPR_THRESHOLD, tpr_threshold),
+ FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
+ FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
+ FIELD(VM_EXIT_REASON, vm_exit_reason),
+ FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
+ FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
+ FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
+ FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
+ FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
+ FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
+ FIELD(GUEST_ES_LIMIT, guest_es_limit),
+ FIELD(GUEST_CS_LIMIT, guest_cs_limit),
+ FIELD(GUEST_SS_LIMIT, guest_ss_limit),
+ FIELD(GUEST_DS_LIMIT, guest_ds_limit),
+ FIELD(GUEST_FS_LIMIT, guest_fs_limit),
+ FIELD(GUEST_GS_LIMIT, guest_gs_limit),
+ FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
+ FIELD(GUEST_TR_LIMIT, guest_tr_limit),
+ FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
+ FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
+ FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
+ FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
+ FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
+ FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
+ FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
+ FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
+ FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
+ FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
+ FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
+ FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
+ FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
+ FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
+ FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
+ FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
+ FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
+ FIELD(CR0_READ_SHADOW, cr0_read_shadow),
+ FIELD(CR4_READ_SHADOW, cr4_read_shadow),
+ FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
+ FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
+ FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
+ FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
+ FIELD(EXIT_QUALIFICATION, exit_qualification),
+ FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
+ FIELD(GUEST_CR0, guest_cr0),
+ FIELD(GUEST_CR3, guest_cr3),
+ FIELD(GUEST_CR4, guest_cr4),
+ FIELD(GUEST_ES_BASE, guest_es_base),
+ FIELD(GUEST_CS_BASE, guest_cs_base),
+ FIELD(GUEST_SS_BASE, guest_ss_base),
+ FIELD(GUEST_DS_BASE, guest_ds_base),
+ FIELD(GUEST_FS_BASE, guest_fs_base),
+ FIELD(GUEST_GS_BASE, guest_gs_base),
+ FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
+ FIELD(GUEST_TR_BASE, guest_tr_base),
+ FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
+ FIELD(GUEST_IDTR_BASE, guest_idtr_base),
+ FIELD(GUEST_DR7, guest_dr7),
+ FIELD(GUEST_RSP, guest_rsp),
+ FIELD(GUEST_RIP, guest_rip),
+ FIELD(GUEST_RFLAGS, guest_rflags),
+ FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
+ FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
+ FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
+ FIELD(HOST_CR0, host_cr0),
+ FIELD(HOST_CR3, host_cr3),
+ FIELD(HOST_CR4, host_cr4),
+ FIELD(HOST_FS_BASE, host_fs_base),
+ FIELD(HOST_GS_BASE, host_gs_base),
+ FIELD(HOST_TR_BASE, host_tr_base),
+ FIELD(HOST_GDTR_BASE, host_gdtr_base),
+ FIELD(HOST_IDTR_BASE, host_idtr_base),
+ FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
+ FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
+ FIELD(HOST_RSP, host_rsp),
+ FIELD(HOST_RIP, host_rip),
+};
+
+static inline short vmcs_field_to_offset(unsigned long field)
+{
+ const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table);
+ unsigned short offset;
+ unsigned index;
+
+ if (field >> 15)
+ return -ENOENT;
+
+ index = ROL16(field, 6);
+ if (index >= size)
+ return -ENOENT;
+
+ index = array_index_nospec(index, size);
+ offset = vmcs_field_to_offset_table[index];
+ if (offset == 0)
+ return -ENOENT;
+ return offset;
+}
+
+static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.cached_vmcs12;
+}
+
+static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
+}
+
+static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
+static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
+static bool vmx_xsaves_supported(void);
+static void vmx_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg);
+static void vmx_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg);
+static bool guest_state_valid(struct kvm_vcpu *vcpu);
+static u32 vmx_segment_access_rights(struct kvm_segment *var);
+static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
+static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
+static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
+static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
+ u16 error_code);
+static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
+static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+ u32 msr, int type);
+
+static DEFINE_PER_CPU(struct vmcs *, vmxarea);
+static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
+/*
+ * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
+ * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
+ */
+static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
+
+/*
+ * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
+ * can find which vCPU should be waken up.
+ */
+static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
+static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
+
+enum {
+ VMX_VMREAD_BITMAP,
+ VMX_VMWRITE_BITMAP,
+ VMX_BITMAP_NR
+};
+
+static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
+
+#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
+#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
+
+static bool cpu_has_load_ia32_efer;
+static bool cpu_has_load_perf_global_ctrl;
+
+static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
+static DEFINE_SPINLOCK(vmx_vpid_lock);
+
+static struct vmcs_config {
+ int size;
+ int order;
+ u32 basic_cap;
+ u32 revision_id;
+ u32 pin_based_exec_ctrl;
+ u32 cpu_based_exec_ctrl;
+ u32 cpu_based_2nd_exec_ctrl;
+ u32 vmexit_ctrl;
+ u32 vmentry_ctrl;
+ struct nested_vmx_msrs nested;
+} vmcs_config;
+
+static struct vmx_capability {
+ u32 ept;
+ u32 vpid;
+} vmx_capability;
+
+#define VMX_SEGMENT_FIELD(seg) \
+ [VCPU_SREG_##seg] = { \
+ .selector = GUEST_##seg##_SELECTOR, \
+ .base = GUEST_##seg##_BASE, \
+ .limit = GUEST_##seg##_LIMIT, \
+ .ar_bytes = GUEST_##seg##_AR_BYTES, \
+ }
+
+static const struct kvm_vmx_segment_field {
+ unsigned selector;
+ unsigned base;
+ unsigned limit;
+ unsigned ar_bytes;
+} kvm_vmx_segment_fields[] = {
+ VMX_SEGMENT_FIELD(CS),
+ VMX_SEGMENT_FIELD(DS),
+ VMX_SEGMENT_FIELD(ES),
+ VMX_SEGMENT_FIELD(FS),
+ VMX_SEGMENT_FIELD(GS),
+ VMX_SEGMENT_FIELD(SS),
+ VMX_SEGMENT_FIELD(TR),
+ VMX_SEGMENT_FIELD(LDTR),
+};
+
+static u64 host_efer;
+
+static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
+
+/*
+ * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
+ * away by decrementing the array size.
+ */
+static const u32 vmx_msr_index[] = {
+#ifdef CONFIG_X86_64
+ MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
+#endif
+ MSR_EFER, MSR_TSC_AUX, MSR_STAR,
+};
+
+DEFINE_STATIC_KEY_FALSE(enable_evmcs);
+
+#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs))
+
+#define KVM_EVMCS_VERSION 1
+
+#if IS_ENABLED(CONFIG_HYPERV)
+static bool __read_mostly enlightened_vmcs = true;
+module_param(enlightened_vmcs, bool, 0444);
+
+static inline void evmcs_write64(unsigned long field, u64 value)
+{
+ u16 clean_field;
+ int offset = get_evmcs_offset(field, &clean_field);
+
+ if (offset < 0)
+ return;
+
+ *(u64 *)((char *)current_evmcs + offset) = value;
+
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static inline void evmcs_write32(unsigned long field, u32 value)
+{
+ u16 clean_field;
+ int offset = get_evmcs_offset(field, &clean_field);
+
+ if (offset < 0)
+ return;
+
+ *(u32 *)((char *)current_evmcs + offset) = value;
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static inline void evmcs_write16(unsigned long field, u16 value)
+{
+ u16 clean_field;
+ int offset = get_evmcs_offset(field, &clean_field);
+
+ if (offset < 0)
+ return;
+
+ *(u16 *)((char *)current_evmcs + offset) = value;
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static inline u64 evmcs_read64(unsigned long field)
+{
+ int offset = get_evmcs_offset(field, NULL);
+
+ if (offset < 0)
+ return 0;
+
+ return *(u64 *)((char *)current_evmcs + offset);
+}
+
+static inline u32 evmcs_read32(unsigned long field)
+{
+ int offset = get_evmcs_offset(field, NULL);
+
+ if (offset < 0)
+ return 0;
+
+ return *(u32 *)((char *)current_evmcs + offset);
+}
+
+static inline u16 evmcs_read16(unsigned long field)
+{
+ int offset = get_evmcs_offset(field, NULL);
+
+ if (offset < 0)
+ return 0;
+
+ return *(u16 *)((char *)current_evmcs + offset);
+}
+
+static inline void evmcs_touch_msr_bitmap(void)
+{
+ if (unlikely(!current_evmcs))
+ return;
+
+ if (current_evmcs->hv_enlightenments_control.msr_bitmap)
+ current_evmcs->hv_clean_fields &=
+ ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
+}
+
+static void evmcs_load(u64 phys_addr)
+{
+ struct hv_vp_assist_page *vp_ap =
+ hv_get_vp_assist_page(smp_processor_id());
+
+ vp_ap->current_nested_vmcs = phys_addr;
+ vp_ap->enlighten_vmentry = 1;
+}
+
+static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
+{
+ /*
+ * Enlightened VMCSv1 doesn't support these:
+ *
+ * POSTED_INTR_NV = 0x00000002,
+ * GUEST_INTR_STATUS = 0x00000810,
+ * APIC_ACCESS_ADDR = 0x00002014,
+ * POSTED_INTR_DESC_ADDR = 0x00002016,
+ * EOI_EXIT_BITMAP0 = 0x0000201c,
+ * EOI_EXIT_BITMAP1 = 0x0000201e,
+ * EOI_EXIT_BITMAP2 = 0x00002020,
+ * EOI_EXIT_BITMAP3 = 0x00002022,
+ */
+ vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+ vmcs_conf->cpu_based_2nd_exec_ctrl &=
+ ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
+ vmcs_conf->cpu_based_2nd_exec_ctrl &=
+ ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ vmcs_conf->cpu_based_2nd_exec_ctrl &=
+ ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
+
+ /*
+ * GUEST_PML_INDEX = 0x00000812,
+ * PML_ADDRESS = 0x0000200e,
+ */
+ vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_PML;
+
+ /* VM_FUNCTION_CONTROL = 0x00002018, */
+ vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
+
+ /*
+ * EPTP_LIST_ADDRESS = 0x00002024,
+ * VMREAD_BITMAP = 0x00002026,
+ * VMWRITE_BITMAP = 0x00002028,
+ */
+ vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_SHADOW_VMCS;
+
+ /*
+ * TSC_MULTIPLIER = 0x00002032,
+ */
+ vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_TSC_SCALING;
+
+ /*
+ * PLE_GAP = 0x00004020,
+ * PLE_WINDOW = 0x00004022,
+ */
+ vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+
+ /*
+ * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
+ */
+ vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+
+ /*
+ * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
+ * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
+ */
+ vmcs_conf->vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ vmcs_conf->vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+
+ /*
+ * Currently unsupported in KVM:
+ * GUEST_IA32_RTIT_CTL = 0x00002814,
+ */
+}
+
+/* check_ept_pointer() should be under protection of ept_pointer_lock. */
+static void check_ept_pointer_match(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ u64 tmp_eptp = INVALID_PAGE;
+ int i;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!VALID_PAGE(tmp_eptp)) {
+ tmp_eptp = to_vmx(vcpu)->ept_pointer;
+ } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
+ to_kvm_vmx(kvm)->ept_pointers_match
+ = EPT_POINTERS_MISMATCH;
+ return;
+ }
+ }
+
+ to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
+}
+
+static int vmx_hv_remote_flush_tlb(struct kvm *kvm)
+{
+ int ret;
+
+ spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+
+ if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
+ check_ept_pointer_match(kvm);
+
+ if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
+ ret = -ENOTSUPP;
+ goto out;
+ }
+
+ /*
+ * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address of the
+ * base of EPT PML4 table, strip off EPT configuration information.
+ */
+ ret = hyperv_flush_guest_mapping(
+ to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK);
+
+out:
+ spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+ return ret;
+}
+#else /* !IS_ENABLED(CONFIG_HYPERV) */
+static inline void evmcs_write64(unsigned long field, u64 value) {}
+static inline void evmcs_write32(unsigned long field, u32 value) {}
+static inline void evmcs_write16(unsigned long field, u16 value) {}
+static inline u64 evmcs_read64(unsigned long field) { return 0; }
+static inline u32 evmcs_read32(unsigned long field) { return 0; }
+static inline u16 evmcs_read16(unsigned long field) { return 0; }
+static inline void evmcs_load(u64 phys_addr) {}
+static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
+static inline void evmcs_touch_msr_bitmap(void) {}
+#endif /* IS_ENABLED(CONFIG_HYPERV) */
+
+static inline bool is_exception_n(u32 intr_info, u8 vector)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+ INTR_INFO_VALID_MASK)) ==
+ (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
+}
+
+static inline bool is_debug(u32 intr_info)
+{
+ return is_exception_n(intr_info, DB_VECTOR);
+}
+
+static inline bool is_breakpoint(u32 intr_info)
+{
+ return is_exception_n(intr_info, BP_VECTOR);
+}
+
+static inline bool is_page_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, PF_VECTOR);
+}
+
+static inline bool is_no_device(u32 intr_info)
+{
+ return is_exception_n(intr_info, NM_VECTOR);
+}
+
+static inline bool is_invalid_opcode(u32 intr_info)
+{
+ return is_exception_n(intr_info, UD_VECTOR);
+}
+
+static inline bool is_gp_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, GP_VECTOR);
+}
+
+static inline bool is_external_interrupt(u32 intr_info)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+ == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
+}
+
+static inline bool is_machine_check(u32 intr_info)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+ INTR_INFO_VALID_MASK)) ==
+ (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
+}
+
+/* Undocumented: icebp/int1 */
+static inline bool is_icebp(u32 intr_info)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+ == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK);
+}
+
+static inline bool cpu_has_vmx_msr_bitmap(void)
+{
+ return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
+}
+
+static inline bool cpu_has_vmx_tpr_shadow(void)
+{
+ return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
+}
+
+static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
+{
+ return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
+}
+
+static inline bool cpu_has_secondary_exec_ctrls(void)
+{
+ return vmcs_config.cpu_based_exec_ctrl &
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+}
+
+static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+}
+
+static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+}
+
+static inline bool cpu_has_vmx_apic_register_virt(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_APIC_REGISTER_VIRT;
+}
+
+static inline bool cpu_has_vmx_virtual_intr_delivery(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
+}
+
+static inline bool cpu_has_vmx_encls_vmexit(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENCLS_EXITING;
+}
+
+/*
+ * Comment's format: document - errata name - stepping - processor name.
+ * Refer from
+ * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
+ */
+static u32 vmx_preemption_cpu_tfms[] = {
+/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
+0x000206E6,
+/* 323056.pdf - AAX65 - C2 - Xeon L3406 */
+/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
+/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020652,
+/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020655,
+/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
+/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
+/*
+ * 320767.pdf - AAP86 - B1 -
+ * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
+ */
+0x000106E5,
+/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
+0x000106A0,
+/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
+0x000106A1,
+/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
+0x000106A4,
+ /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
+ /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
+ /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
+0x000106A5,
+};
+
+static inline bool cpu_has_broken_vmx_preemption_timer(void)
+{
+ u32 eax = cpuid_eax(0x00000001), i;
+
+ /* Clear the reserved bits */
+ eax &= ~(0x3U << 14 | 0xfU << 28);
+ for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
+ if (eax == vmx_preemption_cpu_tfms[i])
+ return true;
+
+ return false;
+}
+
+static inline bool cpu_has_vmx_preemption_timer(void)
+{
+ return vmcs_config.pin_based_exec_ctrl &
+ PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
+static inline bool cpu_has_vmx_posted_intr(void)
+{
+ return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
+ vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
+}
+
+static inline bool cpu_has_vmx_apicv(void)
+{
+ return cpu_has_vmx_apic_register_virt() &&
+ cpu_has_vmx_virtual_intr_delivery() &&
+ cpu_has_vmx_posted_intr();
+}
+
+static inline bool cpu_has_vmx_flexpriority(void)
+{
+ return cpu_has_vmx_tpr_shadow() &&
+ cpu_has_vmx_virtualize_apic_accesses();
+}
+
+static inline bool cpu_has_vmx_ept_execute_only(void)
+{
+ return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_2m_page(void)
+{
+ return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_1g_page(void)
+{
+ return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_4levels(void)
+{
+ return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_mt_wb(void)
+{
+ return vmx_capability.ept & VMX_EPTP_WB_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_5levels(void)
+{
+ return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_ad_bits(void)
+{
+ return vmx_capability.ept & VMX_EPT_AD_BIT;
+}
+
+static inline bool cpu_has_vmx_invept_context(void)
+{
+ return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
+}
+
+static inline bool cpu_has_vmx_invept_global(void)
+{
+ return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
+}
+
+static inline bool cpu_has_vmx_invvpid_individual_addr(void)
+{
+ return vmx_capability.vpid & VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT;
+}
+
+static inline bool cpu_has_vmx_invvpid_single(void)
+{
+ return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
+}
+
+static inline bool cpu_has_vmx_invvpid_global(void)
+{
+ return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
+}
+
+static inline bool cpu_has_vmx_invvpid(void)
+{
+ return vmx_capability.vpid & VMX_VPID_INVVPID_BIT;
+}
+
+static inline bool cpu_has_vmx_ept(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_EPT;
+}
+
+static inline bool cpu_has_vmx_unrestricted_guest(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_UNRESTRICTED_GUEST;
+}
+
+static inline bool cpu_has_vmx_ple(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+}
+
+static inline bool cpu_has_vmx_basic_inout(void)
+{
+ return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
+}
+
+static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
+{
+ return flexpriority_enabled && lapic_in_kernel(vcpu);
+}
+
+static inline bool cpu_has_vmx_vpid(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_VPID;
+}
+
+static inline bool cpu_has_vmx_rdtscp(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_RDTSCP;
+}
+
+static inline bool cpu_has_vmx_invpcid(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_INVPCID;
+}
+
+static inline bool cpu_has_virtual_nmis(void)
+{
+ return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
+}
+
+static inline bool cpu_has_vmx_wbinvd_exit(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_WBINVD_EXITING;
+}
+
+static inline bool cpu_has_vmx_shadow_vmcs(void)
+{
+ u64 vmx_msr;
+ rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
+ /* check if the cpu supports writing r/o exit information fields */
+ if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
+ return false;
+
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_SHADOW_VMCS;
+}
+
+static inline bool cpu_has_vmx_pml(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
+}
+
+static inline bool cpu_has_vmx_tsc_scaling(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_TSC_SCALING;
+}
+
+static inline bool cpu_has_vmx_vmfunc(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_VMFUNC;
+}
+
+static bool vmx_umip_emulated(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_DESC;
+}
+
+static inline bool report_flexpriority(void)
+{
+ return flexpriority_enabled;
+}
+
+static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
+{
+ return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low);
+}
+
+/*
+ * Do the virtual VMX capability MSRs specify that L1 can use VMWRITE
+ * to modify any valid field of the VMCS, or are the VM-exit
+ * information fields read-only?
+ */
+static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.msrs.misc_low &
+ MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
+}
+
+static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.msrs.misc_low & VMX_MISC_ZERO_LEN_INS;
+}
+
+static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.msrs.procbased_ctls_high &
+ CPU_BASED_MONITOR_TRAP_FLAG;
+}
+
+static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
+ SECONDARY_EXEC_SHADOW_VMCS;
+}
+
+static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
+{
+ return vmcs12->cpu_based_vm_exec_control & bit;
+}
+
+static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
+{
+ return (vmcs12->cpu_based_vm_exec_control &
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
+ (vmcs12->secondary_vm_exec_control & bit);
+}
+
+static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control &
+ PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
+static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING;
+}
+
+static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
+}
+
+static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
+}
+
+static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
+}
+
+static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML);
+}
+
+static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
+}
+
+static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
+}
+
+static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
+}
+
+static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+}
+
+static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
+}
+
+static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
+}
+
+static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has_vmfunc(vmcs12) &&
+ (vmcs12->vm_function_control &
+ VMX_VMFUNC_EPTP_SWITCHING);
+}
+
+static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS);
+}
+
+static inline bool is_nmi(u32 intr_info)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+ == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
+}
+
+static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
+ u32 exit_intr_info,
+ unsigned long exit_qualification);
+static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12,
+ u32 reason, unsigned long qualification);
+
+static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
+{
+ int i;
+
+ for (i = 0; i < vmx->nmsrs; ++i)
+ if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
+ return i;
+ return -1;
+}
+
+static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
+{
+ struct {
+ u64 vpid : 16;
+ u64 rsvd : 48;
+ u64 gva;
+ } operand = { vpid, 0, gva };
+ bool error;
+
+ asm volatile (__ex(ASM_VMX_INVVPID) CC_SET(na)
+ : CC_OUT(na) (error) : "a"(&operand), "c"(ext)
+ : "memory");
+ BUG_ON(error);
+}
+
+static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
+{
+ struct {
+ u64 eptp, gpa;
+ } operand = {eptp, gpa};
+ bool error;
+
+ asm volatile (__ex(ASM_VMX_INVEPT) CC_SET(na)
+ : CC_OUT(na) (error) : "a" (&operand), "c" (ext)
+ : "memory");
+ BUG_ON(error);
+}
+
+static void vmx_setup_fb_clear_ctrl(void)
+{
+ u64 msr;
+
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) &&
+ !boot_cpu_has_bug(X86_BUG_MDS) &&
+ !boot_cpu_has_bug(X86_BUG_TAA)) {
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
+ if (msr & ARCH_CAP_FB_CLEAR_CTRL)
+ vmx_fb_clear_ctrl_available = true;
+ }
+}
+
+static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
+{
+ u64 msr;
+
+ if (!vmx->disable_fb_clear)
+ return;
+
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
+ msr |= FB_CLEAR_DIS;
+ wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
+ /* Cache the MSR value to avoid reading it later */
+ vmx->msr_ia32_mcu_opt_ctrl = msr;
+}
+
+static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
+{
+ if (!vmx->disable_fb_clear)
+ return;
+
+ vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
+ wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
+}
+
+static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
+{
+ vmx->disable_fb_clear = vmx_fb_clear_ctrl_available;
+
+ /*
+ * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
+ * at VMEntry. Skip the MSR read/write when a guest has no use case to
+ * execute VERW.
+ */
+ if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
+ ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
+ vmx->disable_fb_clear = false;
+}
+
+static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
+{
+ int i;
+
+ i = __find_msr_index(vmx, msr);
+ if (i >= 0)
+ return &vmx->guest_msrs[i];
+ return NULL;
+}
+
+static void vmcs_clear(struct vmcs *vmcs)
+{
+ u64 phys_addr = __pa(vmcs);
+ bool error;
+
+ asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) CC_SET(na)
+ : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
+ : "memory");
+ if (unlikely(error))
+ printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
+ vmcs, phys_addr);
+}
+
+static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
+{
+ vmcs_clear(loaded_vmcs->vmcs);
+ if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
+ vmcs_clear(loaded_vmcs->shadow_vmcs);
+ loaded_vmcs->cpu = -1;
+ loaded_vmcs->launched = 0;
+}
+
+static void vmcs_load(struct vmcs *vmcs)
+{
+ u64 phys_addr = __pa(vmcs);
+ bool error;
+
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_load(phys_addr);
+
+ asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) CC_SET(na)
+ : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
+ : "memory");
+ if (unlikely(error))
+ printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
+ vmcs, phys_addr);
+}
+
+#ifdef CONFIG_KEXEC_CORE
+static void crash_vmclear_local_loaded_vmcss(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct loaded_vmcs *v;
+
+ list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
+ loaded_vmcss_on_cpu_link)
+ vmcs_clear(v->vmcs);
+}
+#endif /* CONFIG_KEXEC_CORE */
+
+static void __loaded_vmcs_clear(void *arg)
+{
+ struct loaded_vmcs *loaded_vmcs = arg;
+ int cpu = raw_smp_processor_id();
+
+ if (loaded_vmcs->cpu != cpu)
+ return; /* vcpu migration can race with cpu offline */
+ if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
+ per_cpu(current_vmcs, cpu) = NULL;
+
+ vmcs_clear(loaded_vmcs->vmcs);
+ if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
+ vmcs_clear(loaded_vmcs->shadow_vmcs);
+
+ list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
+
+ /*
+ * Ensure all writes to loaded_vmcs, including deleting it from its
+ * current percpu list, complete before setting loaded_vmcs->vcpu to
+ * -1, otherwise a different cpu can see vcpu == -1 first and add
+ * loaded_vmcs to its percpu list before it's deleted from this cpu's
+ * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
+ */
+ smp_wmb();
+
+ loaded_vmcs->cpu = -1;
+ loaded_vmcs->launched = 0;
+}
+
+static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
+{
+ int cpu = loaded_vmcs->cpu;
+
+ if (cpu != -1)
+ smp_call_function_single(cpu,
+ __loaded_vmcs_clear, loaded_vmcs, 1);
+}
+
+static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr)
+{
+ if (vpid == 0)
+ return true;
+
+ if (cpu_has_vmx_invvpid_individual_addr()) {
+ __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
+ return true;
+ }
+
+ return false;
+}
+
+static inline void vpid_sync_vcpu_single(int vpid)
+{
+ if (vpid == 0)
+ return;
+
+ if (cpu_has_vmx_invvpid_single())
+ __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
+}
+
+static inline void vpid_sync_vcpu_global(void)
+{
+ if (cpu_has_vmx_invvpid_global())
+ __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
+}
+
+static inline void vpid_sync_context(int vpid)
+{
+ if (cpu_has_vmx_invvpid_single())
+ vpid_sync_vcpu_single(vpid);
+ else
+ vpid_sync_vcpu_global();
+}
+
+static inline void ept_sync_global(void)
+{
+ __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
+}
+
+static inline void ept_sync_context(u64 eptp)
+{
+ if (cpu_has_vmx_invept_context())
+ __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
+ else
+ ept_sync_global();
+}
+
+static __always_inline void vmcs_check16(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+ "16-bit accessor invalid for 64-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "16-bit accessor invalid for 64-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+ "16-bit accessor invalid for 32-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+ "16-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_check32(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+ "32-bit accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+ "32-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_check64(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+ "64-bit accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "64-bit accessor invalid for 64-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+ "64-bit accessor invalid for 32-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+ "64-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_checkl(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+ "Natural width accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+ "Natural width accessor invalid for 64-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "Natural width accessor invalid for 64-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+ "Natural width accessor invalid for 32-bit field");
+}
+
+static __always_inline unsigned long __vmcs_readl(unsigned long field)
+{
+ unsigned long value;
+
+ asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
+ : "=a"(value) : "d"(field) : "cc");
+ return value;
+}
+
+static __always_inline u16 vmcs_read16(unsigned long field)
+{
+ vmcs_check16(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read16(field);
+ return __vmcs_readl(field);
+}
+
+static __always_inline u32 vmcs_read32(unsigned long field)
+{
+ vmcs_check32(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read32(field);
+ return __vmcs_readl(field);
+}
+
+static __always_inline u64 vmcs_read64(unsigned long field)
+{
+ vmcs_check64(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read64(field);
+#ifdef CONFIG_X86_64
+ return __vmcs_readl(field);
+#else
+ return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32);
+#endif
+}
+
+static __always_inline unsigned long vmcs_readl(unsigned long field)
+{
+ vmcs_checkl(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read64(field);
+ return __vmcs_readl(field);
+}
+
+static noinline void vmwrite_error(unsigned long field, unsigned long value)
+{
+ printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
+ field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
+ dump_stack();
+}
+
+static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
+{
+ bool error;
+
+ asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) CC_SET(na)
+ : CC_OUT(na) (error) : "a"(value), "d"(field));
+ if (unlikely(error))
+ vmwrite_error(field, value);
+}
+
+static __always_inline void vmcs_write16(unsigned long field, u16 value)
+{
+ vmcs_check16(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write16(field, value);
+
+ __vmcs_writel(field, value);
+}
+
+static __always_inline void vmcs_write32(unsigned long field, u32 value)
+{
+ vmcs_check32(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write32(field, value);
+
+ __vmcs_writel(field, value);
+}
+
+static __always_inline void vmcs_write64(unsigned long field, u64 value)
+{
+ vmcs_check64(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write64(field, value);
+
+ __vmcs_writel(field, value);
+#ifndef CONFIG_X86_64
+ asm volatile ("");
+ __vmcs_writel(field+1, value >> 32);
+#endif
+}
+
+static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
+{
+ vmcs_checkl(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write64(field, value);
+
+ __vmcs_writel(field, value);
+}
+
+static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
+ "vmcs_clear_bits does not support 64-bit fields");
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write32(field, evmcs_read32(field) & ~mask);
+
+ __vmcs_writel(field, __vmcs_readl(field) & ~mask);
+}
+
+static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
+ "vmcs_set_bits does not support 64-bit fields");
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write32(field, evmcs_read32(field) | mask);
+
+ __vmcs_writel(field, __vmcs_readl(field) | mask);
+}
+
+static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
+{
+ vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS);
+}
+
+static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
+{
+ vmcs_write32(VM_ENTRY_CONTROLS, val);
+ vmx->vm_entry_controls_shadow = val;
+}
+
+static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
+{
+ if (vmx->vm_entry_controls_shadow != val)
+ vm_entry_controls_init(vmx, val);
+}
+
+static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
+{
+ return vmx->vm_entry_controls_shadow;
+}
+
+
+static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
+{
+ vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
+}
+
+static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
+{
+ vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
+}
+
+static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
+{
+ vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS);
+}
+
+static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
+{
+ vmcs_write32(VM_EXIT_CONTROLS, val);
+ vmx->vm_exit_controls_shadow = val;
+}
+
+static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
+{
+ if (vmx->vm_exit_controls_shadow != val)
+ vm_exit_controls_init(vmx, val);
+}
+
+static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
+{
+ return vmx->vm_exit_controls_shadow;
+}
+
+
+static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
+{
+ vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
+}
+
+static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
+{
+ vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
+}
+
+static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
+{
+ vmx->segment_cache.bitmask = 0;
+}
+
+static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
+ unsigned field)
+{
+ bool ret;
+ u32 mask = 1 << (seg * SEG_FIELD_NR + field);
+
+ if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
+ vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
+ vmx->segment_cache.bitmask = 0;
+ }
+ ret = vmx->segment_cache.bitmask & mask;
+ vmx->segment_cache.bitmask |= mask;
+ return ret;
+}
+
+static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
+{
+ u16 *p = &vmx->segment_cache.seg[seg].selector;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
+ *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
+ return *p;
+}
+
+static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
+{
+ ulong *p = &vmx->segment_cache.seg[seg].base;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
+ *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
+ return *p;
+}
+
+static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
+{
+ u32 *p = &vmx->segment_cache.seg[seg].limit;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
+ *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
+ return *p;
+}
+
+static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
+{
+ u32 *p = &vmx->segment_cache.seg[seg].ar;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
+ *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
+ return *p;
+}
+
+static void update_exception_bitmap(struct kvm_vcpu *vcpu)
+{
+ u32 eb;
+
+ eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
+ (1u << DB_VECTOR) | (1u << AC_VECTOR);
+ /*
+ * Guest access to VMware backdoor ports could legitimately
+ * trigger #GP because of TSS I/O permission bitmap.
+ * We intercept those #GP and allow access to them anyway
+ * as VMware does.
+ */
+ if (enable_vmware_backdoor)
+ eb |= (1u << GP_VECTOR);
+ if ((vcpu->guest_debug &
+ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
+ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
+ eb |= 1u << BP_VECTOR;
+ if (to_vmx(vcpu)->rmode.vm86_active)
+ eb = ~0;
+ if (enable_ept)
+ eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
+
+ /* When we are running a nested L2 guest and L1 specified for it a
+ * certain exception bitmap, we must trap the same exceptions and pass
+ * them to L1. When running L2, we will only handle the exceptions
+ * specified above if L1 did not want them.
+ */
+ if (is_guest_mode(vcpu))
+ eb |= get_vmcs12(vcpu)->exception_bitmap;
+
+ vmcs_write32(EXCEPTION_BITMAP, eb);
+}
+
+/*
+ * Check if MSR is intercepted for currently loaded MSR bitmap.
+ */
+static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
+{
+ unsigned long *msr_bitmap;
+ int f = sizeof(unsigned long);
+
+ if (!cpu_has_vmx_msr_bitmap())
+ return true;
+
+ msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
+
+ if (msr <= 0x1fff) {
+ return !!test_bit(msr, msr_bitmap + 0x800 / f);
+ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+ msr &= 0x1fff;
+ return !!test_bit(msr, msr_bitmap + 0xc00 / f);
+ }
+
+ return true;
+}
+
+/*
+ * Check if MSR is intercepted for L01 MSR bitmap.
+ */
+static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
+{
+ unsigned long *msr_bitmap;
+ int f = sizeof(unsigned long);
+
+ if (!cpu_has_vmx_msr_bitmap())
+ return true;
+
+ msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
+
+ if (msr <= 0x1fff) {
+ return !!test_bit(msr, msr_bitmap + 0x800 / f);
+ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+ msr &= 0x1fff;
+ return !!test_bit(msr, msr_bitmap + 0xc00 / f);
+ }
+
+ return true;
+}
+
+static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
+ unsigned long entry, unsigned long exit)
+{
+ vm_entry_controls_clearbit(vmx, entry);
+ vm_exit_controls_clearbit(vmx, exit);
+}
+
+static int find_msr(struct vmx_msrs *m, unsigned int msr)
+{
+ unsigned int i;
+
+ for (i = 0; i < m->nr; ++i) {
+ if (m->val[i].index == msr)
+ return i;
+ }
+ return -ENOENT;
+}
+
+static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
+{
+ int i;
+ struct msr_autoload *m = &vmx->msr_autoload;
+
+ switch (msr) {
+ case MSR_EFER:
+ if (cpu_has_load_ia32_efer) {
+ clear_atomic_switch_msr_special(vmx,
+ VM_ENTRY_LOAD_IA32_EFER,
+ VM_EXIT_LOAD_IA32_EFER);
+ return;
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ if (cpu_has_load_perf_global_ctrl) {
+ clear_atomic_switch_msr_special(vmx,
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
+ return;
+ }
+ break;
+ }
+ i = find_msr(&m->guest, msr);
+ if (i < 0)
+ goto skip_guest;
+ --m->guest.nr;
+ m->guest.val[i] = m->guest.val[m->guest.nr];
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
+
+skip_guest:
+ i = find_msr(&m->host, msr);
+ if (i < 0)
+ return;
+
+ --m->host.nr;
+ m->host.val[i] = m->host.val[m->host.nr];
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
+}
+
+static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
+ unsigned long entry, unsigned long exit,
+ unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
+ u64 guest_val, u64 host_val)
+{
+ vmcs_write64(guest_val_vmcs, guest_val);
+ vmcs_write64(host_val_vmcs, host_val);
+ vm_entry_controls_setbit(vmx, entry);
+ vm_exit_controls_setbit(vmx, exit);
+}
+
+static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
+ u64 guest_val, u64 host_val, bool entry_only)
+{
+ int i, j = 0;
+ struct msr_autoload *m = &vmx->msr_autoload;
+
+ switch (msr) {
+ case MSR_EFER:
+ if (cpu_has_load_ia32_efer) {
+ add_atomic_switch_msr_special(vmx,
+ VM_ENTRY_LOAD_IA32_EFER,
+ VM_EXIT_LOAD_IA32_EFER,
+ GUEST_IA32_EFER,
+ HOST_IA32_EFER,
+ guest_val, host_val);
+ return;
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ if (cpu_has_load_perf_global_ctrl) {
+ add_atomic_switch_msr_special(vmx,
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
+ GUEST_IA32_PERF_GLOBAL_CTRL,
+ HOST_IA32_PERF_GLOBAL_CTRL,
+ guest_val, host_val);
+ return;
+ }
+ break;
+ case MSR_IA32_PEBS_ENABLE:
+ /* PEBS needs a quiescent period after being disabled (to write
+ * a record). Disabling PEBS through VMX MSR swapping doesn't
+ * provide that period, so a CPU could write host's record into
+ * guest's memory.
+ */
+ wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+ }
+
+ i = find_msr(&m->guest, msr);
+ if (!entry_only)
+ j = find_msr(&m->host, msr);
+
+ if ((i < 0 && m->guest.nr == NR_AUTOLOAD_MSRS) ||
+ (j < 0 && m->host.nr == NR_AUTOLOAD_MSRS)) {
+ printk_once(KERN_WARNING "Not enough msr switch entries. "
+ "Can't add msr %x\n", msr);
+ return;
+ }
+ if (i < 0) {
+ i = m->guest.nr++;
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
+ }
+ m->guest.val[i].index = msr;
+ m->guest.val[i].value = guest_val;
+
+ if (entry_only)
+ return;
+
+ if (j < 0) {
+ j = m->host.nr++;
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
+ }
+ m->host.val[j].index = msr;
+ m->host.val[j].value = host_val;
+}
+
+static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
+{
+ u64 guest_efer = vmx->vcpu.arch.efer;
+ u64 ignore_bits = 0;
+
+ /* Shadow paging assumes NX to be available. */
+ if (!enable_ept)
+ guest_efer |= EFER_NX;
+
+ /*
+ * LMA and LME handled by hardware; SCE meaningless outside long mode.
+ */
+ ignore_bits |= EFER_SCE;
+#ifdef CONFIG_X86_64
+ ignore_bits |= EFER_LMA | EFER_LME;
+ /* SCE is meaningful only in long mode on Intel */
+ if (guest_efer & EFER_LMA)
+ ignore_bits &= ~(u64)EFER_SCE;
+#endif
+
+ clear_atomic_switch_msr(vmx, MSR_EFER);
+
+ /*
+ * On EPT, we can't emulate NX, so we must switch EFER atomically.
+ * On CPUs that support "load IA32_EFER", always switch EFER
+ * atomically, since it's faster than switching it manually.
+ */
+ if (cpu_has_load_ia32_efer ||
+ (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
+ if (!(guest_efer & EFER_LMA))
+ guest_efer &= ~EFER_LME;
+ if (guest_efer != host_efer)
+ add_atomic_switch_msr(vmx, MSR_EFER,
+ guest_efer, host_efer, false);
+ return false;
+ } else {
+ guest_efer &= ~ignore_bits;
+ guest_efer |= host_efer & ignore_bits;
+
+ vmx->guest_msrs[efer_offset].data = guest_efer;
+ vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
+
+ return true;
+ }
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * On 32-bit kernels, VM exits still load the FS and GS bases from the
+ * VMCS rather than the segment table. KVM uses this helper to figure
+ * out the current bases to poke them into the VMCS before entry.
+ */
+static unsigned long segment_base(u16 selector)
+{
+ struct desc_struct *table;
+ unsigned long v;
+
+ if (!(selector & ~SEGMENT_RPL_MASK))
+ return 0;
+
+ table = get_current_gdt_ro();
+
+ if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+ u16 ldt_selector = kvm_read_ldt();
+
+ if (!(ldt_selector & ~SEGMENT_RPL_MASK))
+ return 0;
+
+ table = (struct desc_struct *)segment_base(ldt_selector);
+ }
+ v = get_desc_base(&table[selector >> 3]);
+ return v;
+}
+#endif
+
+static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs_host_state *host_state;
+#ifdef CONFIG_X86_64
+ int cpu = raw_smp_processor_id();
+#endif
+ unsigned long fs_base, gs_base;
+ u16 fs_sel, gs_sel;
+ int i;
+
+ vmx->req_immediate_exit = false;
+
+ /*
+ * Note that guest MSRs to be saved/restored can also be changed
+ * when guest state is loaded. This happens when guest transitions
+ * to/from long-mode by setting MSR_EFER.LMA.
+ */
+ if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) {
+ vmx->guest_msrs_dirty = false;
+ for (i = 0; i < vmx->save_nmsrs; ++i)
+ kvm_set_shared_msr(vmx->guest_msrs[i].index,
+ vmx->guest_msrs[i].data,
+ vmx->guest_msrs[i].mask);
+
+ }
+
+ if (vmx->loaded_cpu_state)
+ return;
+
+ vmx->loaded_cpu_state = vmx->loaded_vmcs;
+ host_state = &vmx->loaded_cpu_state->host_state;
+
+ /*
+ * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
+ * allow segment selectors with cpl > 0 or ti == 1.
+ */
+ host_state->ldt_sel = kvm_read_ldt();
+
+#ifdef CONFIG_X86_64
+ savesegment(ds, host_state->ds_sel);
+ savesegment(es, host_state->es_sel);
+
+ gs_base = cpu_kernelmode_gs_base(cpu);
+ if (likely(is_64bit_mm(current->mm))) {
+ save_fsgs_for_kvm();
+ fs_sel = current->thread.fsindex;
+ gs_sel = current->thread.gsindex;
+ fs_base = current->thread.fsbase;
+ vmx->msr_host_kernel_gs_base = current->thread.gsbase;
+ } else {
+ savesegment(fs, fs_sel);
+ savesegment(gs, gs_sel);
+ fs_base = read_msr(MSR_FS_BASE);
+ vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
+ }
+
+ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+#else
+ savesegment(fs, fs_sel);
+ savesegment(gs, gs_sel);
+ fs_base = segment_base(fs_sel);
+ gs_base = segment_base(gs_sel);
+#endif
+
+ if (unlikely(fs_sel != host_state->fs_sel)) {
+ if (!(fs_sel & 7))
+ vmcs_write16(HOST_FS_SELECTOR, fs_sel);
+ else
+ vmcs_write16(HOST_FS_SELECTOR, 0);
+ host_state->fs_sel = fs_sel;
+ }
+ if (unlikely(gs_sel != host_state->gs_sel)) {
+ if (!(gs_sel & 7))
+ vmcs_write16(HOST_GS_SELECTOR, gs_sel);
+ else
+ vmcs_write16(HOST_GS_SELECTOR, 0);
+ host_state->gs_sel = gs_sel;
+ }
+ if (unlikely(fs_base != host_state->fs_base)) {
+ vmcs_writel(HOST_FS_BASE, fs_base);
+ host_state->fs_base = fs_base;
+ }
+ if (unlikely(gs_base != host_state->gs_base)) {
+ vmcs_writel(HOST_GS_BASE, gs_base);
+ host_state->gs_base = gs_base;
+ }
+}
+
+static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
+{
+ struct vmcs_host_state *host_state;
+
+ if (!vmx->loaded_cpu_state)
+ return;
+
+ WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs);
+ host_state = &vmx->loaded_cpu_state->host_state;
+
+ ++vmx->vcpu.stat.host_state_reload;
+ vmx->loaded_cpu_state = NULL;
+
+#ifdef CONFIG_X86_64
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+#endif
+ if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
+ kvm_load_ldt(host_state->ldt_sel);
+#ifdef CONFIG_X86_64
+ load_gs_index(host_state->gs_sel);
+#else
+ loadsegment(gs, host_state->gs_sel);
+#endif
+ }
+ if (host_state->fs_sel & 7)
+ loadsegment(fs, host_state->fs_sel);
+#ifdef CONFIG_X86_64
+ if (unlikely(host_state->ds_sel | host_state->es_sel)) {
+ loadsegment(ds, host_state->ds_sel);
+ loadsegment(es, host_state->es_sel);
+ }
+#endif
+ invalidate_tss_limit();
+#ifdef CONFIG_X86_64
+ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+#endif
+ load_fixmap_gdt(raw_smp_processor_id());
+}
+
+#ifdef CONFIG_X86_64
+static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
+{
+ preempt_disable();
+ if (vmx->loaded_cpu_state)
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ preempt_enable();
+ return vmx->msr_guest_kernel_gs_base;
+}
+
+static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
+{
+ preempt_disable();
+ if (vmx->loaded_cpu_state)
+ wrmsrl(MSR_KERNEL_GS_BASE, data);
+ preempt_enable();
+ vmx->msr_guest_kernel_gs_base = data;
+}
+#endif
+
+static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old, new;
+ unsigned int dest;
+
+ /*
+ * In case of hot-plug or hot-unplug, we may have to undo
+ * vmx_vcpu_pi_put even if there is no assigned device. And we
+ * always keep PI.NDST up to date for simplicity: it makes the
+ * code easier, and CPU migration is not a fast path.
+ */
+ if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
+ return;
+
+ /*
+ * First handle the simple case where no cmpxchg is necessary; just
+ * allow posting non-urgent interrupts.
+ *
+ * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
+ * PI.NDST: pi_post_block will do it for us and the wakeup_handler
+ * expects the VCPU to be on the blocked_vcpu_list that matches
+ * PI.NDST.
+ */
+ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
+ vcpu->cpu == cpu) {
+ pi_clear_sn(pi_desc);
+ return;
+ }
+
+ /* The full case. */
+ do {
+ old.control = new.control = pi_desc->control;
+
+ dest = cpu_physical_id(cpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ new.sn = 0;
+ } while (cmpxchg64(&pi_desc->control, old.control,
+ new.control) != old.control);
+}
+
+static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
+{
+ vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
+ vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
+}
+
+/*
+ * Switches to specified vcpu, until a matching vcpu_put(), but assumes
+ * vcpu mutex is already taken.
+ */
+static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
+
+ if (!already_loaded) {
+ loaded_vmcs_clear(vmx->loaded_vmcs);
+ local_irq_disable();
+
+ /*
+ * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
+ * this cpu's percpu list, otherwise it may not yet be deleted
+ * from its previous cpu's percpu list. Pairs with the
+ * smb_wmb() in __loaded_vmcs_clear().
+ */
+ smp_rmb();
+
+ list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
+ &per_cpu(loaded_vmcss_on_cpu, cpu));
+ local_irq_enable();
+ }
+
+ if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+ per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+ vmcs_load(vmx->loaded_vmcs->vmcs);
+ indirect_branch_prediction_barrier();
+ }
+
+ if (!already_loaded) {
+ void *gdt = get_current_gdt_ro();
+ unsigned long sysenter_esp;
+
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+
+ /*
+ * Linux uses per-cpu TSS and GDT, so set these when switching
+ * processors. See 22.2.4.
+ */
+ vmcs_writel(HOST_TR_BASE,
+ (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
+ vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
+
+ /*
+ * VM exits change the host TR limit to 0x67 after a VM
+ * exit. This is okay, since 0x67 covers everything except
+ * the IO bitmap and have have code to handle the IO bitmap
+ * being lost after a VM exit.
+ */
+ BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
+
+ rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
+ vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+
+ vmx->loaded_vmcs->cpu = cpu;
+ }
+
+ /* Setup TSC multiplier */
+ if (kvm_has_tsc_control &&
+ vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
+ decache_tsc_multiplier(vmx);
+
+ vmx_vcpu_pi_load(vcpu, cpu);
+ vmx->host_pkru = read_pkru();
+ vmx->host_debugctlmsr = get_debugctlmsr();
+}
+
+static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP) ||
+ !kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ /* Set SN when the vCPU is preempted */
+ if (vcpu->preempted)
+ pi_set_sn(pi_desc);
+}
+
+static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ vmx_vcpu_pi_put(vcpu);
+
+ vmx_prepare_switch_to_host(to_vmx(vcpu));
+}
+
+static bool emulation_required(struct kvm_vcpu *vcpu)
+{
+ return emulate_invalid_guest_state && !guest_state_valid(vcpu);
+}
+
+static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
+
+/*
+ * Return the cr0 value that a nested guest would read. This is a combination
+ * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
+ * its hypervisor (cr0_read_shadow).
+ */
+static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
+{
+ return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
+ (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
+}
+static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
+{
+ return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
+ (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
+}
+
+static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
+{
+ unsigned long rflags, save_rflags;
+
+ if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
+ __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
+ rflags = vmcs_readl(GUEST_RFLAGS);
+ if (to_vmx(vcpu)->rmode.vm86_active) {
+ rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
+ save_rflags = to_vmx(vcpu)->rmode.save_rflags;
+ rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
+ }
+ to_vmx(vcpu)->rflags = rflags;
+ }
+ return to_vmx(vcpu)->rflags;
+}
+
+static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ unsigned long old_rflags = vmx_get_rflags(vcpu);
+
+ __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
+ to_vmx(vcpu)->rflags = rflags;
+ if (to_vmx(vcpu)->rmode.vm86_active) {
+ to_vmx(vcpu)->rmode.save_rflags = rflags;
+ rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
+ }
+ vmcs_writel(GUEST_RFLAGS, rflags);
+
+ if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
+ to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
+}
+
+static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
+{
+ u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+ int ret = 0;
+
+ if (interruptibility & GUEST_INTR_STATE_STI)
+ ret |= KVM_X86_SHADOW_INT_STI;
+ if (interruptibility & GUEST_INTR_STATE_MOV_SS)
+ ret |= KVM_X86_SHADOW_INT_MOV_SS;
+
+ return ret;
+}
+
+static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+ u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+ u32 interruptibility = interruptibility_old;
+
+ interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
+
+ if (mask & KVM_X86_SHADOW_INT_MOV_SS)
+ interruptibility |= GUEST_INTR_STATE_MOV_SS;
+ else if (mask & KVM_X86_SHADOW_INT_STI)
+ interruptibility |= GUEST_INTR_STATE_STI;
+
+ if ((interruptibility != interruptibility_old))
+ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
+}
+
+static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ unsigned long rip;
+
+ rip = kvm_rip_read(vcpu);
+ rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ kvm_rip_write(vcpu, rip);
+
+ /* skipping an emulated instruction also counts */
+ vmx_set_interrupt_shadow(vcpu, 0);
+}
+
+static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
+ unsigned long exit_qual)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned int nr = vcpu->arch.exception.nr;
+ u32 intr_info = nr | INTR_INFO_VALID_MASK;
+
+ if (vcpu->arch.exception.has_error_code) {
+ vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
+ intr_info |= INTR_INFO_DELIVER_CODE_MASK;
+ }
+
+ if (kvm_exception_is_soft(nr))
+ intr_info |= INTR_TYPE_SOFT_EXCEPTION;
+ else
+ intr_info |= INTR_TYPE_HARD_EXCEPTION;
+
+ if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
+ vmx_get_nmi_mask(vcpu))
+ intr_info |= INTR_INFO_UNBLOCK_NMI;
+
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
+}
+
+/*
+ * KVM wants to inject page-faults which it got to the guest. This function
+ * checks whether in a nested guest, we need to inject them to L1 or L2.
+ */
+static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned int nr = vcpu->arch.exception.nr;
+
+ if (nr == PF_VECTOR) {
+ if (vcpu->arch.exception.nested_apf) {
+ *exit_qual = vcpu->arch.apf.nested_apf_token;
+ return 1;
+ }
+ /*
+ * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
+ * The fix is to add the ancillary datum (CR2 or DR6) to structs
+ * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
+ * can be written only when inject_pending_event runs. This should be
+ * conditional on a new capability---if the capability is disabled,
+ * kvm_multiple_exception would write the ancillary information to
+ * CR2 or DR6, for backwards ABI-compatibility.
+ */
+ if (nested_vmx_is_page_fault_vmexit(vmcs12,
+ vcpu->arch.exception.error_code)) {
+ *exit_qual = vcpu->arch.cr2;
+ return 1;
+ }
+ } else {
+ if (vmcs12->exception_bitmap & (1u << nr)) {
+ if (nr == DB_VECTOR) {
+ *exit_qual = vcpu->arch.dr6;
+ *exit_qual &= ~(DR6_FIXED_1 | DR6_BT);
+ *exit_qual ^= DR6_RTM;
+ } else {
+ *exit_qual = 0;
+ }
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Ensure that we clear the HLT state in the VMCS. We don't need to
+ * explicitly skip the instruction because if the HLT state is set,
+ * then the instruction is already executing and RIP has already been
+ * advanced.
+ */
+ if (kvm_hlt_in_guest(vcpu->kvm) &&
+ vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
+ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+}
+
+static void vmx_queue_exception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned nr = vcpu->arch.exception.nr;
+ bool has_error_code = vcpu->arch.exception.has_error_code;
+ u32 error_code = vcpu->arch.exception.error_code;
+ u32 intr_info = nr | INTR_INFO_VALID_MASK;
+
+ if (has_error_code) {
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+ intr_info |= INTR_INFO_DELIVER_CODE_MASK;
+ }
+
+ if (vmx->rmode.vm86_active) {
+ int inc_eip = 0;
+ if (kvm_exception_is_soft(nr))
+ inc_eip = vcpu->arch.event_exit_inst_len;
+ if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return;
+ }
+
+ WARN_ON_ONCE(vmx->emulation_required);
+
+ if (kvm_exception_is_soft(nr)) {
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmx->vcpu.arch.event_exit_inst_len);
+ intr_info |= INTR_TYPE_SOFT_EXCEPTION;
+ } else
+ intr_info |= INTR_TYPE_HARD_EXCEPTION;
+
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
+
+ vmx_clear_hlt(vcpu);
+}
+
+static bool vmx_rdtscp_supported(void)
+{
+ return cpu_has_vmx_rdtscp();
+}
+
+static bool vmx_invpcid_supported(void)
+{
+ return cpu_has_vmx_invpcid();
+}
+
+/*
+ * Swap MSR entry in host/guest MSR entry array.
+ */
+static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
+{
+ struct shared_msr_entry tmp;
+
+ tmp = vmx->guest_msrs[to];
+ vmx->guest_msrs[to] = vmx->guest_msrs[from];
+ vmx->guest_msrs[from] = tmp;
+}
+
+/*
+ * Set up the vmcs to automatically save and restore system
+ * msrs. Don't touch the 64-bit msrs if the guest is in legacy
+ * mode, as fiddling with msrs is very expensive.
+ */
+static void setup_msrs(struct vcpu_vmx *vmx)
+{
+ int save_nmsrs, index;
+
+ save_nmsrs = 0;
+#ifdef CONFIG_X86_64
+ if (is_long_mode(&vmx->vcpu)) {
+ index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
+ if (index >= 0)
+ move_msr_up(vmx, index, save_nmsrs++);
+ index = __find_msr_index(vmx, MSR_LSTAR);
+ if (index >= 0)
+ move_msr_up(vmx, index, save_nmsrs++);
+ index = __find_msr_index(vmx, MSR_CSTAR);
+ if (index >= 0)
+ move_msr_up(vmx, index, save_nmsrs++);
+ /*
+ * MSR_STAR is only needed on long mode guests, and only
+ * if efer.sce is enabled.
+ */
+ index = __find_msr_index(vmx, MSR_STAR);
+ if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
+ move_msr_up(vmx, index, save_nmsrs++);
+ }
+#endif
+ index = __find_msr_index(vmx, MSR_EFER);
+ if (index >= 0 && update_transition_efer(vmx, index))
+ move_msr_up(vmx, index, save_nmsrs++);
+ index = __find_msr_index(vmx, MSR_TSC_AUX);
+ if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
+ move_msr_up(vmx, index, save_nmsrs++);
+
+ vmx->save_nmsrs = save_nmsrs;
+ vmx->guest_msrs_dirty = true;
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmx_update_msr_bitmap(&vmx->vcpu);
+}
+
+static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ if (is_guest_mode(vcpu) &&
+ (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
+ return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
+
+ return vcpu->arch.tsc_offset;
+}
+
+static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+ u64 active_offset = offset;
+ if (is_guest_mode(vcpu)) {
+ /*
+ * We're here if L1 chose not to trap WRMSR to TSC. According
+ * to the spec, this should set L1's TSC; The offset that L1
+ * set for L2 remains unchanged, and still needs to be added
+ * to the newly set TSC to get L2's TSC.
+ */
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING))
+ active_offset += vmcs12->tsc_offset;
+ } else {
+ trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+ vmcs_read64(TSC_OFFSET), offset);
+ }
+
+ vmcs_write64(TSC_OFFSET, active_offset);
+ return active_offset;
+}
+
+/*
+ * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
+ * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
+ * all guests if the "nested" module option is off, and can also be disabled
+ * for a single guest by disabling its VMX cpuid bit.
+ */
+static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
+{
+ return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
+}
+
+/*
+ * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
+ * returned for the various VMX controls MSRs when nested VMX is enabled.
+ * The same values should also be used to verify that vmcs12 control fields are
+ * valid during nested entry from L1 to L2.
+ * Each of these control msrs has a low and high 32-bit half: A low bit is on
+ * if the corresponding bit in the (32-bit) control field *must* be on, and a
+ * bit in the high half is on if the corresponding bit in the control field
+ * may be on. See also vmx_control_verify().
+ */
+static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
+{
+ if (!nested) {
+ memset(msrs, 0, sizeof(*msrs));
+ return;
+ }
+
+ /*
+ * Note that as a general rule, the high half of the MSRs (bits in
+ * the control fields which may be 1) should be initialized by the
+ * intersection of the underlying hardware's MSR (i.e., features which
+ * can be supported) and the list of features we want to expose -
+ * because they are known to be properly supported in our code.
+ * Also, usually, the low half of the MSRs (bits which must be 1) can
+ * be set to 0, meaning that L1 may turn off any of these bits. The
+ * reason is that if one of these bits is necessary, it will appear
+ * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
+ * fields of vmcs01 and vmcs02, will turn these bits off - and
+ * nested_vmx_exit_reflected() will not pass related exits to L1.
+ * These rules have exceptions below.
+ */
+
+ /* pin-based controls */
+ rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
+ msrs->pinbased_ctls_low,
+ msrs->pinbased_ctls_high);
+ msrs->pinbased_ctls_low |=
+ PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+ msrs->pinbased_ctls_high &=
+ PIN_BASED_EXT_INTR_MASK |
+ PIN_BASED_NMI_EXITING |
+ PIN_BASED_VIRTUAL_NMIS |
+ (apicv ? PIN_BASED_POSTED_INTR : 0);
+ msrs->pinbased_ctls_high |=
+ PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+ PIN_BASED_VMX_PREEMPTION_TIMER;
+
+ /* exit controls */
+ rdmsr(MSR_IA32_VMX_EXIT_CTLS,
+ msrs->exit_ctls_low,
+ msrs->exit_ctls_high);
+ msrs->exit_ctls_low =
+ VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+
+ msrs->exit_ctls_high &=
+#ifdef CONFIG_X86_64
+ VM_EXIT_HOST_ADDR_SPACE_SIZE |
+#endif
+ VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+ msrs->exit_ctls_high |=
+ VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+ VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
+
+ /* We support free control of debug control saving. */
+ msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
+
+ /* entry controls */
+ rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
+ msrs->entry_ctls_low,
+ msrs->entry_ctls_high);
+ msrs->entry_ctls_low =
+ VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
+ msrs->entry_ctls_high &=
+#ifdef CONFIG_X86_64
+ VM_ENTRY_IA32E_MODE |
+#endif
+ VM_ENTRY_LOAD_IA32_PAT;
+ msrs->entry_ctls_high |=
+ (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
+
+ /* We support free control of debug control loading. */
+ msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
+
+ /* cpu-based controls */
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
+ msrs->procbased_ctls_low,
+ msrs->procbased_ctls_high);
+ msrs->procbased_ctls_low =
+ CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+ msrs->procbased_ctls_high &=
+ CPU_BASED_VIRTUAL_INTR_PENDING |
+ CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
+ CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
+ CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING |
+#ifdef CONFIG_X86_64
+ CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
+#endif
+ CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
+ CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
+ CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
+ CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
+ CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+ /*
+ * We can allow some features even when not supported by the
+ * hardware. For example, L1 can specify an MSR bitmap - and we
+ * can use it to avoid exits to L1 - even when L0 runs L2
+ * without MSR bitmaps.
+ */
+ msrs->procbased_ctls_high |=
+ CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+ CPU_BASED_USE_MSR_BITMAPS;
+
+ /* We support free control of CR3 access interception. */
+ msrs->procbased_ctls_low &=
+ ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
+
+ /*
+ * secondary cpu-based controls. Do not include those that
+ * depend on CPUID bits, they are added later by vmx_cpuid_update.
+ */
+ if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
+ msrs->secondary_ctls_low,
+ msrs->secondary_ctls_high);
+
+ msrs->secondary_ctls_low = 0;
+ msrs->secondary_ctls_high &=
+ SECONDARY_EXEC_DESC |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
+ SECONDARY_EXEC_WBINVD_EXITING;
+
+ /*
+ * We can emulate "VMCS shadowing," even if the hardware
+ * doesn't support it.
+ */
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_SHADOW_VMCS;
+
+ if (enable_ept) {
+ /* nested EPT: emulate EPT also to L1 */
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_EPT;
+ msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
+ VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
+ if (cpu_has_vmx_ept_execute_only())
+ msrs->ept_caps |=
+ VMX_EPT_EXECUTE_ONLY_BIT;
+ msrs->ept_caps &= vmx_capability.ept;
+ msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
+ VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
+ VMX_EPT_1GB_PAGE_BIT;
+ if (enable_ept_ad_bits) {
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_PML;
+ msrs->ept_caps |= VMX_EPT_AD_BIT;
+ }
+ }
+
+ if (cpu_has_vmx_vmfunc()) {
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_VMFUNC;
+ /*
+ * Advertise EPTP switching unconditionally
+ * since we emulate it
+ */
+ if (enable_ept)
+ msrs->vmfunc_controls =
+ VMX_VMFUNC_EPTP_SWITCHING;
+ }
+
+ /*
+ * Old versions of KVM use the single-context version without
+ * checking for support, so declare that it is supported even
+ * though it is treated as global context. The alternative is
+ * not failing the single-context invvpid, and it is worse.
+ */
+ if (enable_vpid) {
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_VPID;
+ msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
+ VMX_VPID_EXTENT_SUPPORTED_MASK;
+ }
+
+ if (enable_unrestricted_guest)
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_UNRESTRICTED_GUEST;
+
+ if (flexpriority_enabled)
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+
+ /* miscellaneous data */
+ rdmsr(MSR_IA32_VMX_MISC,
+ msrs->misc_low,
+ msrs->misc_high);
+ msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
+ msrs->misc_low |=
+ MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
+ VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
+ VMX_MISC_ACTIVITY_HLT;
+ msrs->misc_high = 0;
+
+ /*
+ * This MSR reports some information about VMX support. We
+ * should return information about the VMX we emulate for the
+ * guest, and the VMCS structure we give it - not about the
+ * VMX support of the underlying hardware.
+ */
+ msrs->basic =
+ VMCS12_REVISION |
+ VMX_BASIC_TRUE_CTLS |
+ ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
+ (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+
+ if (cpu_has_vmx_basic_inout())
+ msrs->basic |= VMX_BASIC_INOUT;
+
+ /*
+ * These MSRs specify bits which the guest must keep fixed on
+ * while L1 is in VMXON mode (in L1's root mode, or running an L2).
+ * We picked the standard core2 setting.
+ */
+#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
+#define VMXON_CR4_ALWAYSON X86_CR4_VMXE
+ msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
+ msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
+
+ /* These MSRs specify bits which the guest must keep fixed off. */
+ rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
+ rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
+
+ /* highest index: VMX_PREEMPTION_TIMER_VALUE */
+ msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
+}
+
+/*
+ * if fixed0[i] == 1: val[i] must be 1
+ * if fixed1[i] == 0: val[i] must be 0
+ */
+static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1)
+{
+ return ((val & fixed1) | fixed0) == val;
+}
+
+static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
+{
+ return fixed_bits_valid(control, low, high);
+}
+
+static inline u64 vmx_control_msr(u32 low, u32 high)
+{
+ return low | ((u64)high << 32);
+}
+
+static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
+{
+ superset &= mask;
+ subset &= mask;
+
+ return (superset | subset) == superset;
+}
+
+static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
+{
+ const u64 feature_and_reserved =
+ /* feature (except bit 48; see below) */
+ BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
+ /* reserved */
+ BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
+ u64 vmx_basic = vmx->nested.msrs.basic;
+
+ if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
+ return -EINVAL;
+
+ /*
+ * KVM does not emulate a version of VMX that constrains physical
+ * addresses of VMX structures (e.g. VMCS) to 32-bits.
+ */
+ if (data & BIT_ULL(48))
+ return -EINVAL;
+
+ if (vmx_basic_vmcs_revision_id(vmx_basic) !=
+ vmx_basic_vmcs_revision_id(data))
+ return -EINVAL;
+
+ if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
+ return -EINVAL;
+
+ vmx->nested.msrs.basic = data;
+ return 0;
+}
+
+static int
+vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+{
+ u64 supported;
+ u32 *lowp, *highp;
+
+ switch (msr_index) {
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+ lowp = &vmx->nested.msrs.pinbased_ctls_low;
+ highp = &vmx->nested.msrs.pinbased_ctls_high;
+ break;
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ lowp = &vmx->nested.msrs.procbased_ctls_low;
+ highp = &vmx->nested.msrs.procbased_ctls_high;
+ break;
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ lowp = &vmx->nested.msrs.exit_ctls_low;
+ highp = &vmx->nested.msrs.exit_ctls_high;
+ break;
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ lowp = &vmx->nested.msrs.entry_ctls_low;
+ highp = &vmx->nested.msrs.entry_ctls_high;
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ lowp = &vmx->nested.msrs.secondary_ctls_low;
+ highp = &vmx->nested.msrs.secondary_ctls_high;
+ break;
+ default:
+ BUG();
+ }
+
+ supported = vmx_control_msr(*lowp, *highp);
+
+ /* Check must-be-1 bits are still 1. */
+ if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
+ return -EINVAL;
+
+ /* Check must-be-0 bits are still 0. */
+ if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
+ return -EINVAL;
+
+ *lowp = data;
+ *highp = data >> 32;
+ return 0;
+}
+
+static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
+{
+ const u64 feature_and_reserved_bits =
+ /* feature */
+ BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
+ BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
+ /* reserved */
+ GENMASK_ULL(13, 9) | BIT_ULL(31);
+ u64 vmx_misc;
+
+ vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
+ vmx->nested.msrs.misc_high);
+
+ if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
+ return -EINVAL;
+
+ if ((vmx->nested.msrs.pinbased_ctls_high &
+ PIN_BASED_VMX_PREEMPTION_TIMER) &&
+ vmx_misc_preemption_timer_rate(data) !=
+ vmx_misc_preemption_timer_rate(vmx_misc))
+ return -EINVAL;
+
+ if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
+ return -EINVAL;
+
+ if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
+ return -EINVAL;
+
+ if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
+ return -EINVAL;
+
+ vmx->nested.msrs.misc_low = data;
+ vmx->nested.msrs.misc_high = data >> 32;
+
+ /*
+ * If L1 has read-only VM-exit information fields, use the
+ * less permissive vmx_vmwrite_bitmap to specify write
+ * permissions for the shadow VMCS.
+ */
+ if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
+ vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
+
+ return 0;
+}
+
+static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
+{
+ u64 vmx_ept_vpid_cap;
+
+ vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
+ vmx->nested.msrs.vpid_caps);
+
+ /* Every bit is either reserved or a feature bit. */
+ if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
+ return -EINVAL;
+
+ vmx->nested.msrs.ept_caps = data;
+ vmx->nested.msrs.vpid_caps = data >> 32;
+ return 0;
+}
+
+static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+{
+ u64 *msr;
+
+ switch (msr_index) {
+ case MSR_IA32_VMX_CR0_FIXED0:
+ msr = &vmx->nested.msrs.cr0_fixed0;
+ break;
+ case MSR_IA32_VMX_CR4_FIXED0:
+ msr = &vmx->nested.msrs.cr4_fixed0;
+ break;
+ default:
+ BUG();
+ }
+
+ /*
+ * 1 bits (which indicates bits which "must-be-1" during VMX operation)
+ * must be 1 in the restored value.
+ */
+ if (!is_bitwise_subset(data, *msr, -1ULL))
+ return -EINVAL;
+
+ *msr = data;
+ return 0;
+}
+
+/*
+ * Called when userspace is restoring VMX MSRs.
+ *
+ * Returns 0 on success, non-0 otherwise.
+ */
+static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * Don't allow changes to the VMX capability MSRs while the vCPU
+ * is in VMX operation.
+ */
+ if (vmx->nested.vmxon)
+ return -EBUSY;
+
+ switch (msr_index) {
+ case MSR_IA32_VMX_BASIC:
+ return vmx_restore_vmx_basic(vmx, data);
+ case MSR_IA32_VMX_PINBASED_CTLS:
+ case MSR_IA32_VMX_PROCBASED_CTLS:
+ case MSR_IA32_VMX_EXIT_CTLS:
+ case MSR_IA32_VMX_ENTRY_CTLS:
+ /*
+ * The "non-true" VMX capability MSRs are generated from the
+ * "true" MSRs, so we do not support restoring them directly.
+ *
+ * If userspace wants to emulate VMX_BASIC[55]=0, userspace
+ * should restore the "true" MSRs with the must-be-1 bits
+ * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
+ * DEFAULT SETTINGS".
+ */
+ return -EINVAL;
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ return vmx_restore_control_msr(vmx, msr_index, data);
+ case MSR_IA32_VMX_MISC:
+ return vmx_restore_vmx_misc(vmx, data);
+ case MSR_IA32_VMX_CR0_FIXED0:
+ case MSR_IA32_VMX_CR4_FIXED0:
+ return vmx_restore_fixed0_msr(vmx, msr_index, data);
+ case MSR_IA32_VMX_CR0_FIXED1:
+ case MSR_IA32_VMX_CR4_FIXED1:
+ /*
+ * These MSRs are generated based on the vCPU's CPUID, so we
+ * do not support restoring them directly.
+ */
+ return -EINVAL;
+ case MSR_IA32_VMX_EPT_VPID_CAP:
+ return vmx_restore_vmx_ept_vpid_cap(vmx, data);
+ case MSR_IA32_VMX_VMCS_ENUM:
+ vmx->nested.msrs.vmcs_enum = data;
+ return 0;
+ default:
+ /*
+ * The rest of the VMX capability MSRs do not support restore.
+ */
+ return -EINVAL;
+ }
+}
+
+/* Returns 0 on success, non-0 otherwise. */
+static int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
+{
+ switch (msr_index) {
+ case MSR_IA32_VMX_BASIC:
+ *pdata = msrs->basic;
+ break;
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+ case MSR_IA32_VMX_PINBASED_CTLS:
+ *pdata = vmx_control_msr(
+ msrs->pinbased_ctls_low,
+ msrs->pinbased_ctls_high);
+ if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
+ *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+ break;
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ case MSR_IA32_VMX_PROCBASED_CTLS:
+ *pdata = vmx_control_msr(
+ msrs->procbased_ctls_low,
+ msrs->procbased_ctls_high);
+ if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
+ *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+ break;
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ case MSR_IA32_VMX_EXIT_CTLS:
+ *pdata = vmx_control_msr(
+ msrs->exit_ctls_low,
+ msrs->exit_ctls_high);
+ if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
+ *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+ break;
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ case MSR_IA32_VMX_ENTRY_CTLS:
+ *pdata = vmx_control_msr(
+ msrs->entry_ctls_low,
+ msrs->entry_ctls_high);
+ if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
+ *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
+ break;
+ case MSR_IA32_VMX_MISC:
+ *pdata = vmx_control_msr(
+ msrs->misc_low,
+ msrs->misc_high);
+ break;
+ case MSR_IA32_VMX_CR0_FIXED0:
+ *pdata = msrs->cr0_fixed0;
+ break;
+ case MSR_IA32_VMX_CR0_FIXED1:
+ *pdata = msrs->cr0_fixed1;
+ break;
+ case MSR_IA32_VMX_CR4_FIXED0:
+ *pdata = msrs->cr4_fixed0;
+ break;
+ case MSR_IA32_VMX_CR4_FIXED1:
+ *pdata = msrs->cr4_fixed1;
+ break;
+ case MSR_IA32_VMX_VMCS_ENUM:
+ *pdata = msrs->vmcs_enum;
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ *pdata = vmx_control_msr(
+ msrs->secondary_ctls_low,
+ msrs->secondary_ctls_high);
+ break;
+ case MSR_IA32_VMX_EPT_VPID_CAP:
+ *pdata = msrs->ept_caps |
+ ((u64)msrs->vpid_caps << 32);
+ break;
+ case MSR_IA32_VMX_VMFUNC:
+ *pdata = msrs->vmfunc_controls;
+ break;
+ default:
+ return 1;
+ }
+
+ return 0;
+}
+
+static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
+ uint64_t val)
+{
+ uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
+
+ return !(val & ~valid_bits);
+}
+
+static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
+{
+ switch (msr->index) {
+ case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ if (!nested)
+ return 1;
+ return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
+ default:
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Reads an msr value (of 'msr_index') into 'pdata'.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct shared_msr_entry *msr;
+
+ switch (msr_info->index) {
+#ifdef CONFIG_X86_64
+ case MSR_FS_BASE:
+ msr_info->data = vmcs_readl(GUEST_FS_BASE);
+ break;
+ case MSR_GS_BASE:
+ msr_info->data = vmcs_readl(GUEST_GS_BASE);
+ break;
+ case MSR_KERNEL_GS_BASE:
+ msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
+ break;
+#endif
+ case MSR_EFER:
+ return kvm_get_msr_common(vcpu, msr_info);
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_has_spec_ctrl_msr(vcpu))
+ return 1;
+
+ msr_info->data = to_vmx(vcpu)->spec_ctrl;
+ break;
+ case MSR_IA32_SYSENTER_CS:
+ msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
+ break;
+ case MSR_IA32_BNDCFGS:
+ if (!kvm_mpx_supported() ||
+ (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
+ return 1;
+ msr_info->data = vmcs_read64(GUEST_BNDCFGS);
+ break;
+ case MSR_IA32_MCG_EXT_CTL:
+ if (!msr_info->host_initiated &&
+ !(vmx->msr_ia32_feature_control &
+ FEATURE_CONTROL_LMCE))
+ return 1;
+ msr_info->data = vcpu->arch.mcg_ext_ctl;
+ break;
+ case MSR_IA32_FEATURE_CONTROL:
+ msr_info->data = vmx->msr_ia32_feature_control;
+ break;
+ case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ if (!nested_vmx_allowed(vcpu))
+ return 1;
+ return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
+ &msr_info->data);
+ case MSR_IA32_XSS:
+ if (!vmx_xsaves_supported() ||
+ (!msr_info->host_initiated &&
+ !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))))
+ return 1;
+ msr_info->data = vcpu->arch.ia32_xss;
+ break;
+ case MSR_TSC_AUX:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
+ return 1;
+ /* Otherwise falls through */
+ default:
+ msr = find_msr_entry(vmx, msr_info->index);
+ if (msr) {
+ msr_info->data = msr->data;
+ break;
+ }
+ return kvm_get_msr_common(vcpu, msr_info);
+ }
+
+ return 0;
+}
+
+static void vmx_leave_nested(struct kvm_vcpu *vcpu);
+
+/*
+ * Writes msr value into into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct shared_msr_entry *msr;
+ int ret = 0;
+ u32 msr_index = msr_info->index;
+ u64 data = msr_info->data;
+
+ switch (msr_index) {
+ case MSR_EFER:
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_FS_BASE:
+ vmx_segment_cache_clear(vmx);
+ vmcs_writel(GUEST_FS_BASE, data);
+ break;
+ case MSR_GS_BASE:
+ vmx_segment_cache_clear(vmx);
+ vmcs_writel(GUEST_GS_BASE, data);
+ break;
+ case MSR_KERNEL_GS_BASE:
+ vmx_write_guest_kernel_gs_base(vmx, data);
+ break;
+#endif
+ case MSR_IA32_SYSENTER_CS:
+ vmcs_write32(GUEST_SYSENTER_CS, data);
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ vmcs_writel(GUEST_SYSENTER_EIP, data);
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ vmcs_writel(GUEST_SYSENTER_ESP, data);
+ break;
+ case MSR_IA32_BNDCFGS:
+ if (!kvm_mpx_supported() ||
+ (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
+ return 1;
+ if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
+ (data & MSR_IA32_BNDCFGS_RSVD))
+ return 1;
+ vmcs_write64(GUEST_BNDCFGS, data);
+ break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_has_spec_ctrl_msr(vcpu))
+ return 1;
+
+ /* The STIBP bit doesn't fault even if it's not advertised */
+ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
+ return 1;
+
+ vmx->spec_ctrl = data;
+
+ if (!data)
+ break;
+
+ /*
+ * For non-nested:
+ * When it's written (to non-zero) for the first time, pass
+ * it through.
+ *
+ * For nested:
+ * The handling of the MSR bitmap for L2 guests is done in
+ * nested_vmx_merge_msr_bitmap. We should not touch the
+ * vmcs02.msr_bitmap here since it gets completely overwritten
+ * in the merging. We update the vmcs01 here for L1 as well
+ * since it will end up touching the MSR anyway now.
+ */
+ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
+ MSR_IA32_SPEC_CTRL,
+ MSR_TYPE_RW);
+ break;
+ case MSR_IA32_PRED_CMD:
+ if (!msr_info->host_initiated &&
+ !guest_has_pred_cmd_msr(vcpu))
+ return 1;
+
+ if (data & ~PRED_CMD_IBPB)
+ return 1;
+
+ if (!data)
+ break;
+
+ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+
+ /*
+ * For non-nested:
+ * When it's written (to non-zero) for the first time, pass
+ * it through.
+ *
+ * For nested:
+ * The handling of the MSR bitmap for L2 guests is done in
+ * nested_vmx_merge_msr_bitmap. We should not touch the
+ * vmcs02.msr_bitmap here since it gets completely overwritten
+ * in the merging.
+ */
+ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
+ MSR_TYPE_W);
+ break;
+ case MSR_IA32_CR_PAT:
+ if (!kvm_pat_valid(data))
+ return 1;
+
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+ vmcs_write64(GUEST_IA32_PAT, data);
+ vcpu->arch.pat = data;
+ break;
+ }
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ break;
+ case MSR_IA32_TSC_ADJUST:
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ break;
+ case MSR_IA32_MCG_EXT_CTL:
+ if ((!msr_info->host_initiated &&
+ !(to_vmx(vcpu)->msr_ia32_feature_control &
+ FEATURE_CONTROL_LMCE)) ||
+ (data & ~MCG_EXT_CTL_LMCE_EN))
+ return 1;
+ vcpu->arch.mcg_ext_ctl = data;
+ break;
+ case MSR_IA32_FEATURE_CONTROL:
+ if (!vmx_feature_control_msr_valid(vcpu, data) ||
+ (to_vmx(vcpu)->msr_ia32_feature_control &
+ FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
+ return 1;
+ vmx->msr_ia32_feature_control = data;
+ if (msr_info->host_initiated && data == 0)
+ vmx_leave_nested(vcpu);
+ break;
+ case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ if (!msr_info->host_initiated)
+ return 1; /* they are read-only */
+ if (!nested_vmx_allowed(vcpu))
+ return 1;
+ return vmx_set_vmx_msr(vcpu, msr_index, data);
+ case MSR_IA32_XSS:
+ if (!vmx_xsaves_supported() ||
+ (!msr_info->host_initiated &&
+ !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))))
+ return 1;
+ /*
+ * The only supported bit as of Skylake is bit 8, but
+ * it is not supported on KVM.
+ */
+ if (data != 0)
+ return 1;
+ vcpu->arch.ia32_xss = data;
+ if (vcpu->arch.ia32_xss != host_xss)
+ add_atomic_switch_msr(vmx, MSR_IA32_XSS,
+ vcpu->arch.ia32_xss, host_xss, false);
+ else
+ clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
+ break;
+ case MSR_TSC_AUX:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
+ return 1;
+ /* Check reserved bit, higher 32 bits should be zero */
+ if ((data >> 32) != 0)
+ return 1;
+ /* Otherwise falls through */
+ default:
+ msr = find_msr_entry(vmx, msr_index);
+ if (msr) {
+ u64 old_msr_data = msr->data;
+ msr->data = data;
+ if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
+ preempt_disable();
+ ret = kvm_set_shared_msr(msr->index, msr->data,
+ msr->mask);
+ preempt_enable();
+ if (ret)
+ msr->data = old_msr_data;
+ }
+ break;
+ }
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ }
+
+ /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
+ if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
+ vmx_update_fb_clear_dis(vcpu, vmx);
+
+ return ret;
+}
+
+static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+{
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+ switch (reg) {
+ case VCPU_REGS_RSP:
+ vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
+ break;
+ case VCPU_REGS_RIP:
+ vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
+ break;
+ case VCPU_EXREG_PDPTR:
+ if (enable_ept)
+ ept_save_pdptrs(vcpu);
+ break;
+ default:
+ break;
+ }
+}
+
+static __init int cpu_has_kvm_support(void)
+{
+ return cpu_has_vmx();
+}
+
+static __init int vmx_disabled_by_bios(void)
+{
+ u64 msr;
+
+ rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
+ if (msr & FEATURE_CONTROL_LOCKED) {
+ /* launched w/ TXT and VMX disabled */
+ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
+ && tboot_enabled())
+ return 1;
+ /* launched w/o TXT and VMX only enabled w/ TXT */
+ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
+ && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
+ && !tboot_enabled()) {
+ printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
+ "activate TXT before enabling KVM\n");
+ return 1;
+ }
+ /* launched w/o TXT and VMX disabled */
+ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
+ && !tboot_enabled())
+ return 1;
+ }
+
+ return 0;
+}
+
+static void kvm_cpu_vmxon(u64 addr)
+{
+ cr4_set_bits(X86_CR4_VMXE);
+ intel_pt_handle_vmx(1);
+
+ asm volatile (ASM_VMX_VMXON_RAX
+ : : "a"(&addr), "m"(addr)
+ : "memory", "cc");
+}
+
+static int hardware_enable(void)
+{
+ int cpu = raw_smp_processor_id();
+ u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+ u64 old, test_bits;
+
+ if (cr4_read_shadow() & X86_CR4_VMXE)
+ return -EBUSY;
+
+ /*
+ * This can happen if we hot-added a CPU but failed to allocate
+ * VP assist page for it.
+ */
+ if (static_branch_unlikely(&enable_evmcs) &&
+ !hv_get_vp_assist_page(cpu))
+ return -EFAULT;
+
+ rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
+
+ test_bits = FEATURE_CONTROL_LOCKED;
+ test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+ if (tboot_enabled())
+ test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
+
+ if ((old & test_bits) != test_bits) {
+ /* enable and lock */
+ wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
+ }
+ kvm_cpu_vmxon(phys_addr);
+ if (enable_ept)
+ ept_sync_global();
+
+ return 0;
+}
+
+static void vmclear_local_loaded_vmcss(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct loaded_vmcs *v, *n;
+
+ list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
+ loaded_vmcss_on_cpu_link)
+ __loaded_vmcs_clear(v);
+}
+
+
+/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
+ * tricks.
+ */
+static void kvm_cpu_vmxoff(void)
+{
+ asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
+
+ intel_pt_handle_vmx(0);
+ cr4_clear_bits(X86_CR4_VMXE);
+}
+
+static void hardware_disable(void)
+{
+ vmclear_local_loaded_vmcss();
+ kvm_cpu_vmxoff();
+}
+
+static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
+ u32 msr, u32 *result)
+{
+ u32 vmx_msr_low, vmx_msr_high;
+ u32 ctl = ctl_min | ctl_opt;
+
+ rdmsr(msr, vmx_msr_low, vmx_msr_high);
+
+ ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
+ ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
+
+ /* Ensure minimum (required) set of control bits are supported. */
+ if (ctl_min & ~ctl)
+ return -EIO;
+
+ *result = ctl;
+ return 0;
+}
+
+static __init bool allow_1_setting(u32 msr, u32 ctl)
+{
+ u32 vmx_msr_low, vmx_msr_high;
+
+ rdmsr(msr, vmx_msr_low, vmx_msr_high);
+ return vmx_msr_high & ctl;
+}
+
+static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
+{
+ u32 vmx_msr_low, vmx_msr_high;
+ u32 min, opt, min2, opt2;
+ u32 _pin_based_exec_control = 0;
+ u32 _cpu_based_exec_control = 0;
+ u32 _cpu_based_2nd_exec_control = 0;
+ u32 _vmexit_control = 0;
+ u32 _vmentry_control = 0;
+
+ memset(vmcs_conf, 0, sizeof(*vmcs_conf));
+ min = CPU_BASED_HLT_EXITING |
+#ifdef CONFIG_X86_64
+ CPU_BASED_CR8_LOAD_EXITING |
+ CPU_BASED_CR8_STORE_EXITING |
+#endif
+ CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING |
+ CPU_BASED_UNCOND_IO_EXITING |
+ CPU_BASED_MOV_DR_EXITING |
+ CPU_BASED_USE_TSC_OFFSETING |
+ CPU_BASED_MWAIT_EXITING |
+ CPU_BASED_MONITOR_EXITING |
+ CPU_BASED_INVLPG_EXITING |
+ CPU_BASED_RDPMC_EXITING;
+
+ opt = CPU_BASED_TPR_SHADOW |
+ CPU_BASED_USE_MSR_BITMAPS |
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
+ &_cpu_based_exec_control) < 0)
+ return -EIO;
+#ifdef CONFIG_X86_64
+ if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
+ _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
+ ~CPU_BASED_CR8_STORE_EXITING;
+#endif
+ if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
+ min2 = 0;
+ opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_WBINVD_EXITING |
+ SECONDARY_EXEC_ENABLE_VPID |
+ SECONDARY_EXEC_ENABLE_EPT |
+ SECONDARY_EXEC_UNRESTRICTED_GUEST |
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING |
+ SECONDARY_EXEC_DESC |
+ SECONDARY_EXEC_RDTSCP |
+ SECONDARY_EXEC_ENABLE_INVPCID |
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
+ SECONDARY_EXEC_SHADOW_VMCS |
+ SECONDARY_EXEC_XSAVES |
+ SECONDARY_EXEC_RDSEED_EXITING |
+ SECONDARY_EXEC_RDRAND_EXITING |
+ SECONDARY_EXEC_ENABLE_PML |
+ SECONDARY_EXEC_TSC_SCALING |
+ SECONDARY_EXEC_ENABLE_VMFUNC |
+ SECONDARY_EXEC_ENCLS_EXITING;
+ if (adjust_vmx_controls(min2, opt2,
+ MSR_IA32_VMX_PROCBASED_CTLS2,
+ &_cpu_based_2nd_exec_control) < 0)
+ return -EIO;
+ }
+#ifndef CONFIG_X86_64
+ if (!(_cpu_based_2nd_exec_control &
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+ _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
+#endif
+
+ if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
+ _cpu_based_2nd_exec_control &= ~(
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+
+ rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
+ &vmx_capability.ept, &vmx_capability.vpid);
+
+ if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
+ /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
+ enabled */
+ _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING |
+ CPU_BASED_INVLPG_EXITING);
+ } else if (vmx_capability.ept) {
+ vmx_capability.ept = 0;
+ pr_warn_once("EPT CAP should not exist if not support "
+ "1-setting enable EPT VM-execution control\n");
+ }
+ if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
+ vmx_capability.vpid) {
+ vmx_capability.vpid = 0;
+ pr_warn_once("VPID CAP should not exist if not support "
+ "1-setting enable VPID VM-execution control\n");
+ }
+
+ min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
+#ifdef CONFIG_X86_64
+ min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
+#endif
+ opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
+ VM_EXIT_CLEAR_BNDCFGS;
+ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
+ &_vmexit_control) < 0)
+ return -EIO;
+
+ min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
+ opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
+ PIN_BASED_VMX_PREEMPTION_TIMER;
+ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
+ &_pin_based_exec_control) < 0)
+ return -EIO;
+
+ if (cpu_has_broken_vmx_preemption_timer())
+ _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ if (!(_cpu_based_2nd_exec_control &
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
+ _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
+
+ min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
+ opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
+ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
+ &_vmentry_control) < 0)
+ return -EIO;
+
+ rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
+
+ /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
+ if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
+ return -EIO;
+
+#ifdef CONFIG_X86_64
+ /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
+ if (vmx_msr_high & (1u<<16))
+ return -EIO;
+#endif
+
+ /* Require Write-Back (WB) memory type for VMCS accesses. */
+ if (((vmx_msr_high >> 18) & 15) != 6)
+ return -EIO;
+
+ vmcs_conf->size = vmx_msr_high & 0x1fff;
+ vmcs_conf->order = get_order(vmcs_conf->size);
+ vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
+
+ vmcs_conf->revision_id = vmx_msr_low;
+
+ vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
+ vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
+ vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
+ vmcs_conf->vmexit_ctrl = _vmexit_control;
+ vmcs_conf->vmentry_ctrl = _vmentry_control;
+
+ if (static_branch_unlikely(&enable_evmcs))
+ evmcs_sanitize_exec_ctrls(vmcs_conf);
+
+ cpu_has_load_ia32_efer =
+ allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
+ VM_ENTRY_LOAD_IA32_EFER)
+ && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
+ VM_EXIT_LOAD_IA32_EFER);
+
+ cpu_has_load_perf_global_ctrl =
+ allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
+ && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
+
+ /*
+ * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL
+ * but due to errata below it can't be used. Workaround is to use
+ * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL.
+ *
+ * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
+ *
+ * AAK155 (model 26)
+ * AAP115 (model 30)
+ * AAT100 (model 37)
+ * BC86,AAY89,BD102 (model 44)
+ * BA97 (model 46)
+ *
+ */
+ if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) {
+ switch (boot_cpu_data.x86_model) {
+ case 26:
+ case 30:
+ case 37:
+ case 44:
+ case 46:
+ cpu_has_load_perf_global_ctrl = false;
+ printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
+ "does not work properly. Using workaround\n");
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ rdmsrl(MSR_IA32_XSS, host_xss);
+
+ return 0;
+}
+
+static struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu)
+{
+ int node = cpu_to_node(cpu);
+ struct page *pages;
+ struct vmcs *vmcs;
+
+ pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
+ if (!pages)
+ return NULL;
+ vmcs = page_address(pages);
+ memset(vmcs, 0, vmcs_config.size);
+
+ /* KVM supports Enlightened VMCS v1 only */
+ if (static_branch_unlikely(&enable_evmcs))
+ vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
+ else
+ vmcs->hdr.revision_id = vmcs_config.revision_id;
+
+ if (shadow)
+ vmcs->hdr.shadow_vmcs = 1;
+ return vmcs;
+}
+
+static void free_vmcs(struct vmcs *vmcs)
+{
+ free_pages((unsigned long)vmcs, vmcs_config.order);
+}
+
+/*
+ * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
+ */
+static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
+{
+ if (!loaded_vmcs->vmcs)
+ return;
+ loaded_vmcs_clear(loaded_vmcs);
+ free_vmcs(loaded_vmcs->vmcs);
+ loaded_vmcs->vmcs = NULL;
+ if (loaded_vmcs->msr_bitmap)
+ free_page((unsigned long)loaded_vmcs->msr_bitmap);
+ WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
+}
+
+static struct vmcs *alloc_vmcs(bool shadow)
+{
+ return alloc_vmcs_cpu(shadow, raw_smp_processor_id());
+}
+
+static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
+{
+ loaded_vmcs->vmcs = alloc_vmcs(false);
+ if (!loaded_vmcs->vmcs)
+ return -ENOMEM;
+
+ loaded_vmcs->shadow_vmcs = NULL;
+ loaded_vmcs_init(loaded_vmcs);
+
+ if (cpu_has_vmx_msr_bitmap()) {
+ loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!loaded_vmcs->msr_bitmap)
+ goto out_vmcs;
+ memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
+
+ if (IS_ENABLED(CONFIG_HYPERV) &&
+ static_branch_unlikely(&enable_evmcs) &&
+ (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
+ struct hv_enlightened_vmcs *evmcs =
+ (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
+
+ evmcs->hv_enlightenments_control.msr_bitmap = 1;
+ }
+ }
+
+ memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
+
+ return 0;
+
+out_vmcs:
+ free_loaded_vmcs(loaded_vmcs);
+ return -ENOMEM;
+}
+
+static void free_kvm_area(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ free_vmcs(per_cpu(vmxarea, cpu));
+ per_cpu(vmxarea, cpu) = NULL;
+ }
+}
+
+enum vmcs_field_width {
+ VMCS_FIELD_WIDTH_U16 = 0,
+ VMCS_FIELD_WIDTH_U64 = 1,
+ VMCS_FIELD_WIDTH_U32 = 2,
+ VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3
+};
+
+static inline int vmcs_field_width(unsigned long field)
+{
+ if (0x1 & field) /* the *_HIGH fields are all 32 bit */
+ return VMCS_FIELD_WIDTH_U32;
+ return (field >> 13) & 0x3 ;
+}
+
+static inline int vmcs_field_readonly(unsigned long field)
+{
+ return (((field >> 10) & 0x3) == 1);
+}
+
+static void init_vmcs_shadow_fields(void)
+{
+ int i, j;
+
+ for (i = j = 0; i < max_shadow_read_only_fields; i++) {
+ u16 field = shadow_read_only_fields[i];
+ if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
+ (i + 1 == max_shadow_read_only_fields ||
+ shadow_read_only_fields[i + 1] != field + 1))
+ pr_err("Missing field from shadow_read_only_field %x\n",
+ field + 1);
+
+ clear_bit(field, vmx_vmread_bitmap);
+#ifdef CONFIG_X86_64
+ if (field & 1)
+ continue;
+#endif
+ if (j < i)
+ shadow_read_only_fields[j] = field;
+ j++;
+ }
+ max_shadow_read_only_fields = j;
+
+ for (i = j = 0; i < max_shadow_read_write_fields; i++) {
+ u16 field = shadow_read_write_fields[i];
+ if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
+ (i + 1 == max_shadow_read_write_fields ||
+ shadow_read_write_fields[i + 1] != field + 1))
+ pr_err("Missing field from shadow_read_write_field %x\n",
+ field + 1);
+
+ /*
+ * PML and the preemption timer can be emulated, but the
+ * processor cannot vmwrite to fields that don't exist
+ * on bare metal.
+ */
+ switch (field) {
+ case GUEST_PML_INDEX:
+ if (!cpu_has_vmx_pml())
+ continue;
+ break;
+ case VMX_PREEMPTION_TIMER_VALUE:
+ if (!cpu_has_vmx_preemption_timer())
+ continue;
+ break;
+ case GUEST_INTR_STATUS:
+ if (!cpu_has_vmx_apicv())
+ continue;
+ break;
+ default:
+ break;
+ }
+
+ clear_bit(field, vmx_vmwrite_bitmap);
+ clear_bit(field, vmx_vmread_bitmap);
+#ifdef CONFIG_X86_64
+ if (field & 1)
+ continue;
+#endif
+ if (j < i)
+ shadow_read_write_fields[j] = field;
+ j++;
+ }
+ max_shadow_read_write_fields = j;
+}
+
+static __init int alloc_kvm_area(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct vmcs *vmcs;
+
+ vmcs = alloc_vmcs_cpu(false, cpu);
+ if (!vmcs) {
+ free_kvm_area();
+ return -ENOMEM;
+ }
+
+ /*
+ * When eVMCS is enabled, alloc_vmcs_cpu() sets
+ * vmcs->revision_id to KVM_EVMCS_VERSION instead of
+ * revision_id reported by MSR_IA32_VMX_BASIC.
+ *
+ * However, even though not explictly documented by
+ * TLFS, VMXArea passed as VMXON argument should
+ * still be marked with revision_id reported by
+ * physical CPU.
+ */
+ if (static_branch_unlikely(&enable_evmcs))
+ vmcs->hdr.revision_id = vmcs_config.revision_id;
+
+ per_cpu(vmxarea, cpu) = vmcs;
+ }
+ return 0;
+}
+
+static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
+ struct kvm_segment *save)
+{
+ if (!emulate_invalid_guest_state) {
+ /*
+ * CS and SS RPL should be equal during guest entry according
+ * to VMX spec, but in reality it is not always so. Since vcpu
+ * is in the middle of the transition from real mode to
+ * protected mode it is safe to assume that RPL 0 is a good
+ * default value.
+ */
+ if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
+ save->selector &= ~SEGMENT_RPL_MASK;
+ save->dpl = save->selector & SEGMENT_RPL_MASK;
+ save->s = 1;
+ }
+ vmx_set_segment(vcpu, save, seg);
+}
+
+static void enter_pmode(struct kvm_vcpu *vcpu)
+{
+ unsigned long flags;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * Update real mode segment cache. It may be not up-to-date if sement
+ * register was written while vcpu was in a guest mode.
+ */
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
+
+ vmx->rmode.vm86_active = 0;
+
+ vmx_segment_cache_clear(vmx);
+
+ vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
+
+ flags = vmcs_readl(GUEST_RFLAGS);
+ flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
+ flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
+ vmcs_writel(GUEST_RFLAGS, flags);
+
+ vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
+ (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
+
+ update_exception_bitmap(vcpu);
+
+ fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
+ fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
+}
+
+static void fix_rmode_seg(int seg, struct kvm_segment *save)
+{
+ const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ struct kvm_segment var = *save;
+
+ var.dpl = 0x3;
+ if (seg == VCPU_SREG_CS)
+ var.type = 0x3;
+
+ if (!emulate_invalid_guest_state) {
+ var.selector = var.base >> 4;
+ var.base = var.base & 0xffff0;
+ var.limit = 0xffff;
+ var.g = 0;
+ var.db = 0;
+ var.present = 1;
+ var.s = 1;
+ var.l = 0;
+ var.unusable = 0;
+ var.type = 0x3;
+ var.avl = 0;
+ if (save->base & 0xf)
+ printk_once(KERN_WARNING "kvm: segment base is not "
+ "paragraph aligned when entering "
+ "protected mode (seg=%d)", seg);
+ }
+
+ vmcs_write16(sf->selector, var.selector);
+ vmcs_writel(sf->base, var.base);
+ vmcs_write32(sf->limit, var.limit);
+ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
+}
+
+static void enter_rmode(struct kvm_vcpu *vcpu)
+{
+ unsigned long flags;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
+
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
+
+ vmx->rmode.vm86_active = 1;
+
+ /*
+ * Very old userspace does not call KVM_SET_TSS_ADDR before entering
+ * vcpu. Warn the user that an update is overdue.
+ */
+ if (!kvm_vmx->tss_addr)
+ printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
+ "called before entering vcpu\n");
+
+ vmx_segment_cache_clear(vmx);
+
+ vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
+ vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
+ vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
+
+ flags = vmcs_readl(GUEST_RFLAGS);
+ vmx->rmode.save_rflags = flags;
+
+ flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
+
+ vmcs_writel(GUEST_RFLAGS, flags);
+ vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
+ update_exception_bitmap(vcpu);
+
+ fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
+ fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
+ fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
+ fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
+ fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
+ fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
+
+ kvm_mmu_reset_context(vcpu);
+}
+
+static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
+
+ if (!msr)
+ return;
+
+ vcpu->arch.efer = efer;
+ if (efer & EFER_LMA) {
+ vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
+ msr->data = efer;
+ } else {
+ vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
+
+ msr->data = efer & ~EFER_LME;
+ }
+ setup_msrs(vmx);
+}
+
+#ifdef CONFIG_X86_64
+
+static void enter_lmode(struct kvm_vcpu *vcpu)
+{
+ u32 guest_tr_ar;
+
+ vmx_segment_cache_clear(to_vmx(vcpu));
+
+ guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
+ if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
+ pr_debug_ratelimited("%s: tss fixup for long mode. \n",
+ __func__);
+ vmcs_write32(GUEST_TR_AR_BYTES,
+ (guest_tr_ar & ~VMX_AR_TYPE_MASK)
+ | VMX_AR_TYPE_BUSY_64_TSS);
+ }
+ vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
+}
+
+static void exit_lmode(struct kvm_vcpu *vcpu)
+{
+ vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
+ vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
+}
+
+#endif
+
+static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid,
+ bool invalidate_gpa)
+{
+ if (enable_ept && (invalidate_gpa || !enable_vpid)) {
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+ ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa));
+ } else {
+ vpid_sync_context(vpid);
+ }
+}
+
+static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
+{
+ __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa);
+}
+
+static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
+{
+ int vpid = to_vmx(vcpu)->vpid;
+
+ if (!vpid_sync_vcpu_addr(vpid, addr))
+ vpid_sync_context(vpid);
+
+ /*
+ * If VPIDs are not supported or enabled, then the above is a no-op.
+ * But we don't really need a TLB flush in that case anyway, because
+ * each VM entry/exit includes an implicit flush when VPID is 0.
+ */
+}
+
+static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
+{
+ ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
+
+ vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
+ vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
+}
+
+static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
+{
+ if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+}
+
+static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
+{
+ ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
+
+ vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
+ vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
+}
+
+static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+ if (!test_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_dirty))
+ return;
+
+ if (is_pae_paging(vcpu)) {
+ vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
+ vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
+ vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
+ vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
+ }
+}
+
+static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+ if (is_pae_paging(vcpu)) {
+ mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
+ mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
+ mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
+ mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
+ }
+
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail);
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_dirty);
+}
+
+static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
+ SECONDARY_EXEC_UNRESTRICTED_GUEST &&
+ nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
+ fixed0 &= ~(X86_CR0_PE | X86_CR0_PG);
+
+ return fixed_bits_valid(val, fixed0, fixed1);
+}
+
+static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
+
+ return fixed_bits_valid(val, fixed0, fixed1);
+}
+
+static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1;
+
+ return fixed_bits_valid(val, fixed0, fixed1);
+}
+
+/* No difference in the restrictions on guest and host CR4 in VMX operation. */
+#define nested_guest_cr4_valid nested_cr4_valid
+#define nested_host_cr4_valid nested_cr4_valid
+
+static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+
+static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
+ unsigned long cr0,
+ struct kvm_vcpu *vcpu)
+{
+ if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+ vmx_decache_cr3(vcpu);
+ if (!(cr0 & X86_CR0_PG)) {
+ /* From paging/starting to nonpaging */
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
+ vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
+ (CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING));
+ vcpu->arch.cr0 = cr0;
+ vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
+ } else if (!is_paging(vcpu)) {
+ /* From nonpaging to paging */
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
+ vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
+ ~(CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING));
+ vcpu->arch.cr0 = cr0;
+ vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
+ }
+
+ if (!(cr0 & X86_CR0_WP))
+ *hw_cr0 &= ~X86_CR0_WP;
+}
+
+static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long hw_cr0;
+
+ hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK);
+ if (enable_unrestricted_guest)
+ hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
+ else {
+ hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
+
+ if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
+ enter_pmode(vcpu);
+
+ if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
+ enter_rmode(vcpu);
+ }
+
+#ifdef CONFIG_X86_64
+ if (vcpu->arch.efer & EFER_LME) {
+ if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
+ enter_lmode(vcpu);
+ if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
+ exit_lmode(vcpu);
+ }
+#endif
+
+ if (enable_ept && !enable_unrestricted_guest)
+ ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
+
+ vmcs_writel(CR0_READ_SHADOW, cr0);
+ vmcs_writel(GUEST_CR0, hw_cr0);
+ vcpu->arch.cr0 = cr0;
+
+ /* depends on vcpu->arch.cr0 to be set to a new value */
+ vmx->emulation_required = emulation_required(vcpu);
+}
+
+static int get_ept_level(struct kvm_vcpu *vcpu)
+{
+ /* Nested EPT currently only supports 4-level walks. */
+ if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
+ return 4;
+ if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
+ return 5;
+ return 4;
+}
+
+static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
+{
+ u64 eptp = VMX_EPTP_MT_WB;
+
+ eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
+
+ if (enable_ept_ad_bits &&
+ (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
+ eptp |= VMX_EPTP_AD_ENABLE_BIT;
+ eptp |= (root_hpa & PAGE_MASK);
+
+ return eptp;
+}
+
+static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ struct kvm *kvm = vcpu->kvm;
+ unsigned long guest_cr3;
+ u64 eptp;
+
+ guest_cr3 = cr3;
+ if (enable_ept) {
+ eptp = construct_eptp(vcpu, cr3);
+ vmcs_write64(EPT_POINTER, eptp);
+
+ if (kvm_x86_ops->tlb_remote_flush) {
+ spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+ to_vmx(vcpu)->ept_pointer = eptp;
+ to_kvm_vmx(kvm)->ept_pointers_match
+ = EPT_POINTERS_CHECK;
+ spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+ }
+
+ if (enable_unrestricted_guest || is_paging(vcpu) ||
+ is_guest_mode(vcpu))
+ guest_cr3 = kvm_read_cr3(vcpu);
+ else
+ guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
+ ept_load_pdptrs(vcpu);
+ }
+
+ vmcs_writel(GUEST_CR3, guest_cr3);
+}
+
+static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ /*
+ * Pass through host's Machine Check Enable value to hw_cr4, which
+ * is in force while we are in guest mode. Do not let guests control
+ * this bit, even if host CR4.MCE == 0.
+ */
+ unsigned long hw_cr4;
+
+ hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
+ if (enable_unrestricted_guest)
+ hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
+ else if (to_vmx(vcpu)->rmode.vm86_active)
+ hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
+ else
+ hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
+
+ if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
+ if (cr4 & X86_CR4_UMIP) {
+ vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_DESC);
+ hw_cr4 &= ~X86_CR4_UMIP;
+ } else if (!is_guest_mode(vcpu) ||
+ !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC))
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_DESC);
+ }
+
+ if (cr4 & X86_CR4_VMXE) {
+ /*
+ * To use VMXON (and later other VMX instructions), a guest
+ * must first be able to turn on cr4.VMXE (see handle_vmon()).
+ * So basically the check on whether to allow nested VMX
+ * is here. We operate under the default treatment of SMM,
+ * so VMX cannot be enabled under SMM.
+ */
+ if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
+ return 1;
+ }
+
+ if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
+ return 1;
+
+ vcpu->arch.cr4 = cr4;
+
+ if (!enable_unrestricted_guest) {
+ if (enable_ept) {
+ if (!is_paging(vcpu)) {
+ hw_cr4 &= ~X86_CR4_PAE;
+ hw_cr4 |= X86_CR4_PSE;
+ } else if (!(cr4 & X86_CR4_PAE)) {
+ hw_cr4 &= ~X86_CR4_PAE;
+ }
+ }
+
+ /*
+ * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
+ * hardware. To emulate this behavior, SMEP/SMAP/PKU needs
+ * to be manually disabled when guest switches to non-paging
+ * mode.
+ *
+ * If !enable_unrestricted_guest, the CPU is always running
+ * with CR0.PG=1 and CR4 needs to be modified.
+ * If enable_unrestricted_guest, the CPU automatically
+ * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
+ */
+ if (!is_paging(vcpu))
+ hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
+ }
+
+ vmcs_writel(CR4_READ_SHADOW, cr4);
+ vmcs_writel(GUEST_CR4, hw_cr4);
+ return 0;
+}
+
+static void vmx_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 ar;
+
+ if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
+ *var = vmx->rmode.segs[seg];
+ if (seg == VCPU_SREG_TR
+ || var->selector == vmx_read_guest_seg_selector(vmx, seg))
+ return;
+ var->base = vmx_read_guest_seg_base(vmx, seg);
+ var->selector = vmx_read_guest_seg_selector(vmx, seg);
+ return;
+ }
+ var->base = vmx_read_guest_seg_base(vmx, seg);
+ var->limit = vmx_read_guest_seg_limit(vmx, seg);
+ var->selector = vmx_read_guest_seg_selector(vmx, seg);
+ ar = vmx_read_guest_seg_ar(vmx, seg);
+ var->unusable = (ar >> 16) & 1;
+ var->type = ar & 15;
+ var->s = (ar >> 4) & 1;
+ var->dpl = (ar >> 5) & 3;
+ /*
+ * Some userspaces do not preserve unusable property. Since usable
+ * segment has to be present according to VMX spec we can use present
+ * property to amend userspace bug by making unusable segment always
+ * nonpresent. vmx_segment_access_rights() already marks nonpresent
+ * segment as unusable.
+ */
+ var->present = !var->unusable;
+ var->avl = (ar >> 12) & 1;
+ var->l = (ar >> 13) & 1;
+ var->db = (ar >> 14) & 1;
+ var->g = (ar >> 15) & 1;
+}
+
+static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_segment s;
+
+ if (to_vmx(vcpu)->rmode.vm86_active) {
+ vmx_get_segment(vcpu, &s, seg);
+ return s.base;
+ }
+ return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
+}
+
+static int vmx_get_cpl(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (unlikely(vmx->rmode.vm86_active))
+ return 0;
+ else {
+ int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
+ return VMX_AR_DPL(ar);
+ }
+}
+
+static u32 vmx_segment_access_rights(struct kvm_segment *var)
+{
+ u32 ar;
+
+ if (var->unusable || !var->present)
+ ar = 1 << 16;
+ else {
+ ar = var->type & 15;
+ ar |= (var->s & 1) << 4;
+ ar |= (var->dpl & 3) << 5;
+ ar |= (var->present & 1) << 7;
+ ar |= (var->avl & 1) << 12;
+ ar |= (var->l & 1) << 13;
+ ar |= (var->db & 1) << 14;
+ ar |= (var->g & 1) << 15;
+ }
+
+ return ar;
+}
+
+static void vmx_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+
+ vmx_segment_cache_clear(vmx);
+
+ if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
+ vmx->rmode.segs[seg] = *var;
+ if (seg == VCPU_SREG_TR)
+ vmcs_write16(sf->selector, var->selector);
+ else if (var->s)
+ fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
+ goto out;
+ }
+
+ vmcs_writel(sf->base, var->base);
+ vmcs_write32(sf->limit, var->limit);
+ vmcs_write16(sf->selector, var->selector);
+
+ /*
+ * Fix the "Accessed" bit in AR field of segment registers for older
+ * qemu binaries.
+ * IA32 arch specifies that at the time of processor reset the
+ * "Accessed" bit in the AR field of segment registers is 1. And qemu
+ * is setting it to 0 in the userland code. This causes invalid guest
+ * state vmexit when "unrestricted guest" mode is turned on.
+ * Fix for this setup issue in cpu_reset is being pushed in the qemu
+ * tree. Newer qemu binaries with that qemu fix would not need this
+ * kvm hack.
+ */
+ if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
+ var->type |= 0x1; /* Accessed */
+
+ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
+
+out:
+ vmx->emulation_required = emulation_required(vcpu);
+}
+
+static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+ u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
+
+ *db = (ar >> 14) & 1;
+ *l = (ar >> 13) & 1;
+}
+
+static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
+ dt->address = vmcs_readl(GUEST_IDTR_BASE);
+}
+
+static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
+ vmcs_writel(GUEST_IDTR_BASE, dt->address);
+}
+
+static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
+ dt->address = vmcs_readl(GUEST_GDTR_BASE);
+}
+
+static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
+ vmcs_writel(GUEST_GDTR_BASE, dt->address);
+}
+
+static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_segment var;
+ u32 ar;
+
+ vmx_get_segment(vcpu, &var, seg);
+ var.dpl = 0x3;
+ if (seg == VCPU_SREG_CS)
+ var.type = 0x3;
+ ar = vmx_segment_access_rights(&var);
+
+ if (var.base != (var.selector << 4))
+ return false;
+ if (var.limit != 0xffff)
+ return false;
+ if (ar != 0xf3)
+ return false;
+
+ return true;
+}
+
+static bool code_segment_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs;
+ unsigned int cs_rpl;
+
+ vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ cs_rpl = cs.selector & SEGMENT_RPL_MASK;
+
+ if (cs.unusable)
+ return false;
+ if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
+ return false;
+ if (!cs.s)
+ return false;
+ if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
+ if (cs.dpl > cs_rpl)
+ return false;
+ } else {
+ if (cs.dpl != cs_rpl)
+ return false;
+ }
+ if (!cs.present)
+ return false;
+
+ /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
+ return true;
+}
+
+static bool stack_segment_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment ss;
+ unsigned int ss_rpl;
+
+ vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
+ ss_rpl = ss.selector & SEGMENT_RPL_MASK;
+
+ if (ss.unusable)
+ return true;
+ if (ss.type != 3 && ss.type != 7)
+ return false;
+ if (!ss.s)
+ return false;
+ if (ss.dpl != ss_rpl) /* DPL != RPL */
+ return false;
+ if (!ss.present)
+ return false;
+
+ return true;
+}
+
+static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_segment var;
+ unsigned int rpl;
+
+ vmx_get_segment(vcpu, &var, seg);
+ rpl = var.selector & SEGMENT_RPL_MASK;
+
+ if (var.unusable)
+ return true;
+ if (!var.s)
+ return false;
+ if (!var.present)
+ return false;
+ if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
+ if (var.dpl < rpl) /* DPL < RPL */
+ return false;
+ }
+
+ /* TODO: Add other members to kvm_segment_field to allow checking for other access
+ * rights flags
+ */
+ return true;
+}
+
+static bool tr_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment tr;
+
+ vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
+
+ if (tr.unusable)
+ return false;
+ if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */
+ return false;
+ if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
+ return false;
+ if (!tr.present)
+ return false;
+
+ return true;
+}
+
+static bool ldtr_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment ldtr;
+
+ vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
+
+ if (ldtr.unusable)
+ return true;
+ if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */
+ return false;
+ if (ldtr.type != 2)
+ return false;
+ if (!ldtr.present)
+ return false;
+
+ return true;
+}
+
+static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs, ss;
+
+ vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
+
+ return ((cs.selector & SEGMENT_RPL_MASK) ==
+ (ss.selector & SEGMENT_RPL_MASK));
+}
+
+static bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu,
+ unsigned int port, int size);
+static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ unsigned long exit_qualification;
+ unsigned short port;
+ int size;
+
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ port = exit_qualification >> 16;
+ size = (exit_qualification & 7) + 1;
+
+ return nested_vmx_check_io_bitmaps(vcpu, port, size);
+}
+
+/*
+ * Check if guest state is valid. Returns true if valid, false if
+ * not.
+ * We assume that registers are always usable
+ */
+static bool guest_state_valid(struct kvm_vcpu *vcpu)
+{
+ if (enable_unrestricted_guest)
+ return true;
+
+ /* real mode guest state checks */
+ if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
+ return false;
+ } else {
+ /* protected mode guest state checks */
+ if (!cs_ss_rpl_check(vcpu))
+ return false;
+ if (!code_segment_valid(vcpu))
+ return false;
+ if (!stack_segment_valid(vcpu))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_DS))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_ES))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_FS))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_GS))
+ return false;
+ if (!tr_valid(vcpu))
+ return false;
+ if (!ldtr_valid(vcpu))
+ return false;
+ }
+ /* TODO:
+ * - Add checks on RIP
+ * - Add checks on RFLAGS
+ */
+
+ return true;
+}
+
+static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
+}
+
+static int init_rmode_tss(struct kvm *kvm)
+{
+ gfn_t fn;
+ u16 data = 0;
+ int idx, r;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
+ r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
+ if (r < 0)
+ goto out;
+ data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
+ r = kvm_write_guest_page(kvm, fn++, &data,
+ TSS_IOPB_BASE_OFFSET, sizeof(u16));
+ if (r < 0)
+ goto out;
+ r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
+ if (r < 0)
+ goto out;
+ r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
+ if (r < 0)
+ goto out;
+ data = ~0;
+ r = kvm_write_guest_page(kvm, fn, &data,
+ RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
+ sizeof(u8));
+out:
+ srcu_read_unlock(&kvm->srcu, idx);
+ return r;
+}
+
+static int init_rmode_identity_map(struct kvm *kvm)
+{
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
+ int i, idx, r = 0;
+ kvm_pfn_t identity_map_pfn;
+ u32 tmp;
+
+ /* Protect kvm_vmx->ept_identity_pagetable_done. */
+ mutex_lock(&kvm->slots_lock);
+
+ if (likely(kvm_vmx->ept_identity_pagetable_done))
+ goto out2;
+
+ if (!kvm_vmx->ept_identity_map_addr)
+ kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
+ identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
+
+ r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
+ kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
+ if (r < 0)
+ goto out2;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
+ if (r < 0)
+ goto out;
+ /* Set up identity-mapping pagetable for EPT in real mode */
+ for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
+ tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
+ _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
+ r = kvm_write_guest_page(kvm, identity_map_pfn,
+ &tmp, i * sizeof(tmp), sizeof(tmp));
+ if (r < 0)
+ goto out;
+ }
+ kvm_vmx->ept_identity_pagetable_done = true;
+
+out:
+ srcu_read_unlock(&kvm->srcu, idx);
+
+out2:
+ mutex_unlock(&kvm->slots_lock);
+ return r;
+}
+
+static void seg_setup(int seg)
+{
+ const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ unsigned int ar;
+
+ vmcs_write16(sf->selector, 0);
+ vmcs_writel(sf->base, 0);
+ vmcs_write32(sf->limit, 0xffff);
+ ar = 0x93;
+ if (seg == VCPU_SREG_CS)
+ ar |= 0x08; /* code segment */
+
+ vmcs_write32(sf->ar_bytes, ar);
+}
+
+static int alloc_apic_access_page(struct kvm *kvm)
+{
+ struct page *page;
+ int r = 0;
+
+ mutex_lock(&kvm->slots_lock);
+ if (kvm->arch.apic_access_page_done)
+ goto out;
+ r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
+ if (r)
+ goto out;
+
+ page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+ if (is_error_page(page)) {
+ r = -EFAULT;
+ goto out;
+ }
+
+ /*
+ * Do not pin the page in memory, so that memory hot-unplug
+ * is able to migrate it.
+ */
+ put_page(page);
+ kvm->arch.apic_access_page_done = true;
+out:
+ mutex_unlock(&kvm->slots_lock);
+ return r;
+}
+
+static int allocate_vpid(void)
+{
+ int vpid;
+
+ if (!enable_vpid)
+ return 0;
+ spin_lock(&vmx_vpid_lock);
+ vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
+ if (vpid < VMX_NR_VPIDS)
+ __set_bit(vpid, vmx_vpid_bitmap);
+ else
+ vpid = 0;
+ spin_unlock(&vmx_vpid_lock);
+ return vpid;
+}
+
+static void free_vpid(int vpid)
+{
+ if (!enable_vpid || vpid == 0)
+ return;
+ spin_lock(&vmx_vpid_lock);
+ __clear_bit(vpid, vmx_vpid_bitmap);
+ spin_unlock(&vmx_vpid_lock);
+}
+
+static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+ u32 msr, int type)
+{
+ int f = sizeof(unsigned long);
+
+ if (!cpu_has_vmx_msr_bitmap())
+ return;
+
+ if (static_branch_unlikely(&enable_evmcs))
+ evmcs_touch_msr_bitmap();
+
+ /*
+ * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+ * have the write-low and read-high bitmap offsets the wrong way round.
+ * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+ */
+ if (msr <= 0x1fff) {
+ if (type & MSR_TYPE_R)
+ /* read-low */
+ __clear_bit(msr, msr_bitmap + 0x000 / f);
+
+ if (type & MSR_TYPE_W)
+ /* write-low */
+ __clear_bit(msr, msr_bitmap + 0x800 / f);
+
+ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+ msr &= 0x1fff;
+ if (type & MSR_TYPE_R)
+ /* read-high */
+ __clear_bit(msr, msr_bitmap + 0x400 / f);
+
+ if (type & MSR_TYPE_W)
+ /* write-high */
+ __clear_bit(msr, msr_bitmap + 0xc00 / f);
+
+ }
+}
+
+static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+ u32 msr, int type)
+{
+ int f = sizeof(unsigned long);
+
+ if (!cpu_has_vmx_msr_bitmap())
+ return;
+
+ if (static_branch_unlikely(&enable_evmcs))
+ evmcs_touch_msr_bitmap();
+
+ /*
+ * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+ * have the write-low and read-high bitmap offsets the wrong way round.
+ * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+ */
+ if (msr <= 0x1fff) {
+ if (type & MSR_TYPE_R)
+ /* read-low */
+ __set_bit(msr, msr_bitmap + 0x000 / f);
+
+ if (type & MSR_TYPE_W)
+ /* write-low */
+ __set_bit(msr, msr_bitmap + 0x800 / f);
+
+ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+ msr &= 0x1fff;
+ if (type & MSR_TYPE_R)
+ /* read-high */
+ __set_bit(msr, msr_bitmap + 0x400 / f);
+
+ if (type & MSR_TYPE_W)
+ /* write-high */
+ __set_bit(msr, msr_bitmap + 0xc00 / f);
+
+ }
+}
+
+static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
+ u32 msr, int type, bool value)
+{
+ if (value)
+ vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
+ else
+ vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
+}
+
+/*
+ * If a msr is allowed by L0, we should check whether it is allowed by L1.
+ * The corresponding bit will be cleared unless both of L0 and L1 allow it.
+ */
+static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
+ unsigned long *msr_bitmap_nested,
+ u32 msr, int type)
+{
+ int f = sizeof(unsigned long);
+
+ /*
+ * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+ * have the write-low and read-high bitmap offsets the wrong way round.
+ * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+ */
+ if (msr <= 0x1fff) {
+ if (type & MSR_TYPE_R &&
+ !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
+ /* read-low */
+ __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
+
+ if (type & MSR_TYPE_W &&
+ !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
+ /* write-low */
+ __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
+
+ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+ msr &= 0x1fff;
+ if (type & MSR_TYPE_R &&
+ !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
+ /* read-high */
+ __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
+
+ if (type & MSR_TYPE_W &&
+ !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
+ /* write-high */
+ __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
+
+ }
+}
+
+static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
+{
+ u8 mode = 0;
+
+ if (cpu_has_secondary_exec_ctrls() &&
+ (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
+ mode |= MSR_BITMAP_MODE_X2APIC;
+ if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
+ mode |= MSR_BITMAP_MODE_X2APIC_APICV;
+ }
+
+ return mode;
+}
+
+#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
+
+static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
+ u8 mode)
+{
+ int msr;
+
+ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+ unsigned word = msr / BITS_PER_LONG;
+ msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
+ msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
+ }
+
+ if (mode & MSR_BITMAP_MODE_X2APIC) {
+ /*
+ * TPR reads and writes can be virtualized even if virtual interrupt
+ * delivery is not in use.
+ */
+ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
+ if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
+ vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
+ }
+ }
+}
+
+static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+ u8 mode = vmx_msr_bitmap_mode(vcpu);
+ u8 changed = mode ^ vmx->msr_bitmap_mode;
+
+ if (!changed)
+ return;
+
+ if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
+ vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
+
+ vmx->msr_bitmap_mode = mode;
+}
+
+static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
+{
+ return enable_apicv;
+}
+
+static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ gfn_t gfn;
+
+ /*
+ * Don't need to mark the APIC access page dirty; it is never
+ * written to by the CPU during APIC virtualization.
+ */
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
+ gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ }
+
+ if (nested_cpu_has_posted_intr(vmcs12)) {
+ gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ }
+}
+
+
+static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int max_irr;
+ void *vapic_page;
+ u16 status;
+
+ if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
+ return;
+
+ vmx->nested.pi_pending = false;
+ if (!pi_test_and_clear_on(vmx->nested.pi_desc))
+ return;
+
+ max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
+ if (max_irr != 256) {
+ vapic_page = kmap(vmx->nested.virtual_apic_page);
+ __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
+ vapic_page, &max_irr);
+ kunmap(vmx->nested.virtual_apic_page);
+
+ status = vmcs_read16(GUEST_INTR_STATUS);
+ if ((u8)max_irr > ((u8)status & 0xff)) {
+ status &= ~0xff;
+ status |= (u8)max_irr;
+ vmcs_write16(GUEST_INTR_STATUS, status);
+ }
+ }
+
+ nested_mark_vmcs12_pages_dirty(vcpu);
+}
+
+static u8 vmx_get_rvi(void)
+{
+ return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
+}
+
+static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ void *vapic_page;
+ u32 vppr;
+ int rvi;
+
+ if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
+ !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
+ WARN_ON_ONCE(!vmx->nested.virtual_apic_page))
+ return false;
+
+ rvi = vmx_get_rvi();
+
+ vapic_page = kmap(vmx->nested.virtual_apic_page);
+ vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
+ kunmap(vmx->nested.virtual_apic_page);
+
+ return ((rvi & 0xf0) > (vppr & 0xf0));
+}
+
+static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
+ bool nested)
+{
+#ifdef CONFIG_SMP
+ int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
+
+ if (vcpu->mode == IN_GUEST_MODE) {
+ /*
+ * The vector of interrupt to be delivered to vcpu had
+ * been set in PIR before this function.
+ *
+ * Following cases will be reached in this block, and
+ * we always send a notification event in all cases as
+ * explained below.
+ *
+ * Case 1: vcpu keeps in non-root mode. Sending a
+ * notification event posts the interrupt to vcpu.
+ *
+ * Case 2: vcpu exits to root mode and is still
+ * runnable. PIR will be synced to vIRR before the
+ * next vcpu entry. Sending a notification event in
+ * this case has no effect, as vcpu is not in root
+ * mode.
+ *
+ * Case 3: vcpu exits to root mode and is blocked.
+ * vcpu_block() has already synced PIR to vIRR and
+ * never blocks vcpu if vIRR is not cleared. Therefore,
+ * a blocked vcpu here does not wait for any requested
+ * interrupts in PIR, and sending a notification event
+ * which has no effect is safe here.
+ */
+
+ apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
+ return true;
+ }
+#endif
+ return false;
+}
+
+static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
+ int vector)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (is_guest_mode(vcpu) &&
+ vector == vmx->nested.posted_intr_nv) {
+ /*
+ * If a posted intr is not recognized by hardware,
+ * we will accomplish it in the next vmentry.
+ */
+ vmx->nested.pi_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ /* the PIR and ON have been set by L1. */
+ if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
+ kvm_vcpu_kick(vcpu);
+ return 0;
+ }
+ return -1;
+}
+/*
+ * Send interrupt to vcpu via posted interrupt way.
+ * 1. If target vcpu is running(non-root mode), send posted interrupt
+ * notification to vcpu and hardware will sync PIR to vIRR atomically.
+ * 2. If target vcpu isn't running(root mode), kick it to pick up the
+ * interrupt from PIR in next vmentry.
+ */
+static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int r;
+
+ r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
+ if (!r)
+ return 0;
+
+ if (!vcpu->arch.apicv_active)
+ return -1;
+
+ if (pi_test_and_set_pir(vector, &vmx->pi_desc))
+ return 0;
+
+ /* If a previous notification has sent the IPI, nothing to do. */
+ if (pi_test_and_set_on(&vmx->pi_desc))
+ return 0;
+
+ if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
+ kvm_vcpu_kick(vcpu);
+
+ return 0;
+}
+
+/*
+ * Set up the vmcs's constant host-state fields, i.e., host-state fields that
+ * will not change in the lifetime of the guest.
+ * Note that host-state that does change is set elsewhere. E.g., host-state
+ * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
+ */
+static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
+{
+ u32 low32, high32;
+ unsigned long tmpl;
+ struct desc_ptr dt;
+ unsigned long cr0, cr3, cr4;
+
+ cr0 = read_cr0();
+ WARN_ON(cr0 & X86_CR0_TS);
+ vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
+
+ /*
+ * Save the most likely value for this task's CR3 in the VMCS.
+ * We can't use __get_current_cr3_fast() because we're not atomic.
+ */
+ cr3 = __read_cr3();
+ vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
+ vmx->loaded_vmcs->host_state.cr3 = cr3;
+
+ /* Save the most likely value for this task's CR4 in the VMCS. */
+ cr4 = cr4_read_shadow();
+ vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
+ vmx->loaded_vmcs->host_state.cr4 = cr4;
+
+ vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
+#ifdef CONFIG_X86_64
+ /*
+ * Load null selectors, so we can avoid reloading them in
+ * vmx_prepare_switch_to_host(), in case userspace uses
+ * the null selectors too (the expected case).
+ */
+ vmcs_write16(HOST_DS_SELECTOR, 0);
+ vmcs_write16(HOST_ES_SELECTOR, 0);
+#else
+ vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+#endif
+ vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+ vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
+
+ store_idt(&dt);
+ vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
+ vmx->host_idt_base = dt.address;
+
+ vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
+
+ rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
+ vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
+ rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
+ vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
+
+ if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
+ rdmsr(MSR_IA32_CR_PAT, low32, high32);
+ vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
+ }
+}
+
+static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
+{
+ BUILD_BUG_ON(KVM_CR4_GUEST_OWNED_BITS & ~KVM_POSSIBLE_CR4_GUEST_BITS);
+
+ vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
+ if (enable_ept)
+ vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
+ if (is_guest_mode(&vmx->vcpu))
+ vmx->vcpu.arch.cr4_guest_owned_bits &=
+ ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
+ vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
+}
+
+static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
+{
+ u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
+
+ if (!kvm_vcpu_apicv_active(&vmx->vcpu))
+ pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+
+ if (!enable_vnmi)
+ pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
+
+ /* Enable the preemption timer dynamically */
+ pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ return pin_based_exec_ctrl;
+}
+
+static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+ if (cpu_has_secondary_exec_ctrls()) {
+ if (kvm_vcpu_apicv_active(vcpu))
+ vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+ else
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+ }
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmx_update_msr_bitmap(vcpu);
+}
+
+static u32 vmx_exec_control(struct vcpu_vmx *vmx)
+{
+ u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
+
+ if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
+ exec_control &= ~CPU_BASED_MOV_DR_EXITING;
+
+ if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
+ exec_control &= ~CPU_BASED_TPR_SHADOW;
+#ifdef CONFIG_X86_64
+ exec_control |= CPU_BASED_CR8_STORE_EXITING |
+ CPU_BASED_CR8_LOAD_EXITING;
+#endif
+ }
+ if (!enable_ept)
+ exec_control |= CPU_BASED_CR3_STORE_EXITING |
+ CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_INVLPG_EXITING;
+ if (kvm_mwait_in_guest(vmx->vcpu.kvm))
+ exec_control &= ~(CPU_BASED_MWAIT_EXITING |
+ CPU_BASED_MONITOR_EXITING);
+ if (kvm_hlt_in_guest(vmx->vcpu.kvm))
+ exec_control &= ~CPU_BASED_HLT_EXITING;
+ return exec_control;
+}
+
+static bool vmx_rdrand_supported(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_RDRAND_EXITING;
+}
+
+static bool vmx_rdseed_supported(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_RDSEED_EXITING;
+}
+
+static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
+{
+ struct kvm_vcpu *vcpu = &vmx->vcpu;
+
+ u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
+
+ if (!cpu_need_virtualize_apic_accesses(vcpu))
+ exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ if (vmx->vpid == 0)
+ exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
+ if (!enable_ept) {
+ exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+ enable_unrestricted_guest = 0;
+ }
+ if (!enable_unrestricted_guest)
+ exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+ if (kvm_pause_in_guest(vmx->vcpu.kvm))
+ exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+ if (!kvm_vcpu_apicv_active(vcpu))
+ exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+ exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+
+ /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
+ * in vmx_set_cr4. */
+ exec_control &= ~SECONDARY_EXEC_DESC;
+
+ /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
+ (handle_vmptrld).
+ We can NOT enable shadow_vmcs here because we don't have yet
+ a current VMCS12
+ */
+ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
+
+ if (!enable_pml)
+ exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
+ if (vmx_xsaves_supported()) {
+ /* Exposing XSAVES only when XSAVE is exposed */
+ bool xsaves_enabled =
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
+
+ if (!xsaves_enabled)
+ exec_control &= ~SECONDARY_EXEC_XSAVES;
+
+ if (nested) {
+ if (xsaves_enabled)
+ vmx->nested.msrs.secondary_ctls_high |=
+ SECONDARY_EXEC_XSAVES;
+ else
+ vmx->nested.msrs.secondary_ctls_high &=
+ ~SECONDARY_EXEC_XSAVES;
+ }
+ }
+
+ if (vmx_rdtscp_supported()) {
+ bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
+ if (!rdtscp_enabled)
+ exec_control &= ~SECONDARY_EXEC_RDTSCP;
+
+ if (nested) {
+ if (rdtscp_enabled)
+ vmx->nested.msrs.secondary_ctls_high |=
+ SECONDARY_EXEC_RDTSCP;
+ else
+ vmx->nested.msrs.secondary_ctls_high &=
+ ~SECONDARY_EXEC_RDTSCP;
+ }
+ }
+
+ if (vmx_invpcid_supported()) {
+ /* Exposing INVPCID only when PCID is exposed */
+ bool invpcid_enabled =
+ guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_PCID);
+
+ if (!invpcid_enabled) {
+ exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
+ guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
+ }
+
+ if (nested) {
+ if (invpcid_enabled)
+ vmx->nested.msrs.secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_INVPCID;
+ else
+ vmx->nested.msrs.secondary_ctls_high &=
+ ~SECONDARY_EXEC_ENABLE_INVPCID;
+ }
+ }
+
+ if (vmx_rdrand_supported()) {
+ bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
+ if (rdrand_enabled)
+ exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
+
+ if (nested) {
+ if (rdrand_enabled)
+ vmx->nested.msrs.secondary_ctls_high |=
+ SECONDARY_EXEC_RDRAND_EXITING;
+ else
+ vmx->nested.msrs.secondary_ctls_high &=
+ ~SECONDARY_EXEC_RDRAND_EXITING;
+ }
+ }
+
+ if (vmx_rdseed_supported()) {
+ bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
+ if (rdseed_enabled)
+ exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
+
+ if (nested) {
+ if (rdseed_enabled)
+ vmx->nested.msrs.secondary_ctls_high |=
+ SECONDARY_EXEC_RDSEED_EXITING;
+ else
+ vmx->nested.msrs.secondary_ctls_high &=
+ ~SECONDARY_EXEC_RDSEED_EXITING;
+ }
+ }
+
+ vmx->secondary_exec_control = exec_control;
+}
+
+static void ept_set_mmio_spte_mask(void)
+{
+ /*
+ * EPT Misconfigurations can be generated if the value of bits 2:0
+ * of an EPT paging-structure entry is 110b (write/execute).
+ */
+ kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
+ VMX_EPT_MISCONFIG_WX_VALUE);
+}
+
+#define VMX_XSS_EXIT_BITMAP 0
+/*
+ * Sets up the vmcs for emulated real mode.
+ */
+static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
+{
+ int i;
+
+ if (enable_shadow_vmcs) {
+ /*
+ * At vCPU creation, "VMWRITE to any supported field
+ * in the VMCS" is supported, so use the more
+ * permissive vmx_vmread_bitmap to specify both read
+ * and write permissions for the shadow VMCS.
+ */
+ vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
+ vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap));
+ }
+ if (cpu_has_vmx_msr_bitmap())
+ vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
+
+ vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
+
+ /* Control */
+ vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+ vmx->hv_deadline_tsc = -1;
+
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
+
+ if (cpu_has_secondary_exec_ctrls()) {
+ vmx_compute_secondary_exec_control(vmx);
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+ vmx->secondary_exec_control);
+ }
+
+ if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
+ vmcs_write64(EOI_EXIT_BITMAP0, 0);
+ vmcs_write64(EOI_EXIT_BITMAP1, 0);
+ vmcs_write64(EOI_EXIT_BITMAP2, 0);
+ vmcs_write64(EOI_EXIT_BITMAP3, 0);
+
+ vmcs_write16(GUEST_INTR_STATUS, 0);
+
+ vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
+ vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
+ }
+
+ if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
+ vmcs_write32(PLE_GAP, ple_gap);
+ vmx->ple_window = ple_window;
+ vmx->ple_window_dirty = true;
+ }
+
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
+ vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
+
+ vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
+ vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
+ vmx_set_constant_host_state(vmx);
+ vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
+ vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
+
+ if (cpu_has_vmx_vmfunc())
+ vmcs_write64(VM_FUNCTION_CONTROL, 0);
+
+ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
+ vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
+ vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
+
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+ vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
+
+ for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
+ u32 index = vmx_msr_index[i];
+ u32 data_low, data_high;
+ int j = vmx->nmsrs;
+
+ if (rdmsr_safe(index, &data_low, &data_high) < 0)
+ continue;
+ if (wrmsr_safe(index, data_low, data_high) < 0)
+ continue;
+ vmx->guest_msrs[j].index = i;
+ vmx->guest_msrs[j].data = 0;
+ vmx->guest_msrs[j].mask = -1ull;
+ ++vmx->nmsrs;
+ }
+
+ vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
+
+ /* 22.2.1, 20.8.1 */
+ vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
+
+ vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
+
+ set_cr4_guest_host_mask(vmx);
+
+ if (vmx_xsaves_supported())
+ vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
+
+ if (enable_pml) {
+ ASSERT(vmx->pml_pg);
+ vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
+ vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ }
+
+ if (cpu_has_vmx_encls_vmexit())
+ vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+}
+
+static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct msr_data apic_base_msr;
+ u64 cr0;
+
+ vmx->rmode.vm86_active = 0;
+ vmx->spec_ctrl = 0;
+
+ vcpu->arch.microcode_version = 0x100000000ULL;
+ vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
+ kvm_set_cr8(vcpu, 0);
+
+ if (!init_event) {
+ apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
+ MSR_IA32_APICBASE_ENABLE;
+ if (kvm_vcpu_is_reset_bsp(vcpu))
+ apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
+ apic_base_msr.host_initiated = true;
+ kvm_set_apic_base(vcpu, &apic_base_msr);
+ }
+
+ vmx_segment_cache_clear(vmx);
+
+ seg_setup(VCPU_SREG_CS);
+ vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
+ vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
+
+ seg_setup(VCPU_SREG_DS);
+ seg_setup(VCPU_SREG_ES);
+ seg_setup(VCPU_SREG_FS);
+ seg_setup(VCPU_SREG_GS);
+ seg_setup(VCPU_SREG_SS);
+
+ vmcs_write16(GUEST_TR_SELECTOR, 0);
+ vmcs_writel(GUEST_TR_BASE, 0);
+ vmcs_write32(GUEST_TR_LIMIT, 0xffff);
+ vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
+
+ vmcs_write16(GUEST_LDTR_SELECTOR, 0);
+ vmcs_writel(GUEST_LDTR_BASE, 0);
+ vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
+ vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
+
+ if (!init_event) {
+ vmcs_write32(GUEST_SYSENTER_CS, 0);
+ vmcs_writel(GUEST_SYSENTER_ESP, 0);
+ vmcs_writel(GUEST_SYSENTER_EIP, 0);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+ }
+
+ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
+ kvm_rip_write(vcpu, 0xfff0);
+
+ vmcs_writel(GUEST_GDTR_BASE, 0);
+ vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
+
+ vmcs_writel(GUEST_IDTR_BASE, 0);
+ vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
+
+ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+ if (kvm_mpx_supported())
+ vmcs_write64(GUEST_BNDCFGS, 0);
+
+ setup_msrs(vmx);
+
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
+
+ if (cpu_has_vmx_tpr_shadow() && !init_event) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
+ if (cpu_need_tpr_shadow(vcpu))
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+ __pa(vcpu->arch.apic->regs));
+ vmcs_write32(TPR_THRESHOLD, 0);
+ }
+
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+
+ if (vmx->vpid != 0)
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+
+ cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
+ vmx->vcpu.arch.cr0 = cr0;
+ vmx_set_cr0(vcpu, cr0); /* enter rmode */
+ vmx_set_cr4(vcpu, 0);
+ vmx_set_efer(vcpu, 0);
+
+ update_exception_bitmap(vcpu);
+
+ vpid_sync_context(vmx->vpid);
+ if (init_event)
+ vmx_clear_hlt(vcpu);
+
+ vmx_update_fb_clear_dis(vcpu, vmx);
+}
+
+/*
+ * In nested virtualization, check if L1 asked to exit on external interrupts.
+ * For most existing hypervisors, this will always return true.
+ */
+static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
+{
+ return get_vmcs12(vcpu)->pin_based_vm_exec_control &
+ PIN_BASED_EXT_INTR_MASK;
+}
+
+/*
+ * In nested virtualization, check if L1 has set
+ * VM_EXIT_ACK_INTR_ON_EXIT
+ */
+static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
+{
+ return get_vmcs12(vcpu)->vm_exit_controls &
+ VM_EXIT_ACK_INTR_ON_EXIT;
+}
+
+static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
+{
+ return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
+}
+
+static void enable_irq_window(struct kvm_vcpu *vcpu)
+{
+ vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_VIRTUAL_INTR_PENDING);
+}
+
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+ if (!enable_vnmi ||
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+ enable_irq_window(vcpu);
+ return;
+ }
+
+ vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_VIRTUAL_NMI_PENDING);
+}
+
+static void vmx_inject_irq(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ uint32_t intr;
+ int irq = vcpu->arch.interrupt.nr;
+
+ trace_kvm_inj_virq(irq);
+
+ ++vcpu->stat.irq_injections;
+ if (vmx->rmode.vm86_active) {
+ int inc_eip = 0;
+ if (vcpu->arch.interrupt.soft)
+ inc_eip = vcpu->arch.event_exit_inst_len;
+ if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return;
+ }
+ intr = irq | INTR_INFO_VALID_MASK;
+ if (vcpu->arch.interrupt.soft) {
+ intr |= INTR_TYPE_SOFT_INTR;
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmx->vcpu.arch.event_exit_inst_len);
+ } else
+ intr |= INTR_TYPE_EXT_INTR;
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
+
+ vmx_clear_hlt(vcpu);
+}
+
+static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!enable_vnmi) {
+ /*
+ * Tracking the NMI-blocked state in software is built upon
+ * finding the next open IRQ window. This, in turn, depends on
+ * well-behaving guests: They have to keep IRQs disabled at
+ * least as long as the NMI handler runs. Otherwise we may
+ * cause NMI nesting, maybe breaking the guest. But as this is
+ * highly unlikely, we can live with the residual risk.
+ */
+ vmx->loaded_vmcs->soft_vnmi_blocked = 1;
+ vmx->loaded_vmcs->vnmi_blocked_time = 0;
+ }
+
+ ++vcpu->stat.nmi_injections;
+ vmx->loaded_vmcs->nmi_known_unmasked = false;
+
+ if (vmx->rmode.vm86_active) {
+ if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return;
+ }
+
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
+
+ vmx_clear_hlt(vcpu);
+}
+
+static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool masked;
+
+ if (!enable_vnmi)
+ return vmx->loaded_vmcs->soft_vnmi_blocked;
+ if (vmx->loaded_vmcs->nmi_known_unmasked)
+ return false;
+ masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
+ vmx->loaded_vmcs->nmi_known_unmasked = !masked;
+ return masked;
+}
+
+static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!enable_vnmi) {
+ if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
+ vmx->loaded_vmcs->soft_vnmi_blocked = masked;
+ vmx->loaded_vmcs->vnmi_blocked_time = 0;
+ }
+ } else {
+ vmx->loaded_vmcs->nmi_known_unmasked = !masked;
+ if (masked)
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ else
+ vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ }
+}
+
+static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
+{
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return 0;
+
+ if (!enable_vnmi &&
+ to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
+ return 0;
+
+ return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
+ | GUEST_INTR_STATE_NMI));
+}
+
+static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return false;
+
+ if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+ return true;
+
+ return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+ !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+}
+
+static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
+{
+ int ret;
+
+ if (enable_unrestricted_guest)
+ return 0;
+
+ ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
+ PAGE_SIZE * 3);
+ if (ret)
+ return ret;
+ to_kvm_vmx(kvm)->tss_addr = addr;
+ return init_rmode_tss(kvm);
+}
+
+static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
+{
+ to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
+ return 0;
+}
+
+static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
+{
+ switch (vec) {
+ case BP_VECTOR:
+ /*
+ * Update instruction length as we may reinject the exception
+ * from user space while in guest debugging mode.
+ */
+ to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+ return false;
+ /* fall through */
+ case DB_VECTOR:
+ if (vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+ return false;
+ /* fall through */
+ case DE_VECTOR:
+ case OF_VECTOR:
+ case BR_VECTOR:
+ case UD_VECTOR:
+ case DF_VECTOR:
+ case SS_VECTOR:
+ case GP_VECTOR:
+ case MF_VECTOR:
+ return true;
+ break;
+ }
+ return false;
+}
+
+static int handle_rmode_exception(struct kvm_vcpu *vcpu,
+ int vec, u32 err_code)
+{
+ /*
+ * Instruction with address size override prefix opcode 0x67
+ * Cause the #SS fault with 0 error code in VM86 mode.
+ */
+ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
+ if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) {
+ if (vcpu->arch.halt_request) {
+ vcpu->arch.halt_request = 0;
+ return kvm_vcpu_halt(vcpu);
+ }
+ return 1;
+ }
+ return 0;
+ }
+
+ /*
+ * Forward all other exceptions that are valid in real mode.
+ * FIXME: Breaks guest debugging in real mode, needs to be fixed with
+ * the required debugging infrastructure rework.
+ */
+ kvm_queue_exception(vcpu, vec);
+ return 1;
+}
+
+/*
+ * Trigger machine check on the host. We assume all the MSRs are already set up
+ * by the CPU and that we still run on the same CPU as the MCE occurred on.
+ * We pass a fake environment to the machine check handler because we want
+ * the guest to be always treated like user space, no matter what context
+ * it used internally.
+ */
+static void kvm_machine_check(void)
+{
+#if defined(CONFIG_X86_MCE)
+ struct pt_regs regs = {
+ .cs = 3, /* Fake ring 3 no matter what the guest ran on */
+ .flags = X86_EFLAGS_IF,
+ };
+
+ do_machine_check(&regs, 0);
+#endif
+}
+
+static int handle_machine_check(struct kvm_vcpu *vcpu)
+{
+ /* already handled by vcpu_run */
+ return 1;
+}
+
+static int handle_exception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_run *kvm_run = vcpu->run;
+ u32 intr_info, ex_no, error_code;
+ unsigned long cr2, rip, dr6;
+ u32 vect_info;
+ enum emulation_result er;
+
+ vect_info = vmx->idt_vectoring_info;
+ intr_info = vmx->exit_intr_info;
+
+ if (is_machine_check(intr_info))
+ return handle_machine_check(vcpu);
+
+ if (is_nmi(intr_info))
+ return 1; /* already handled by vmx_vcpu_run() */
+
+ if (is_invalid_opcode(intr_info))
+ return handle_ud(vcpu);
+
+ error_code = 0;
+ if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
+ error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+
+ if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
+ WARN_ON_ONCE(!enable_vmware_backdoor);
+ er = kvm_emulate_instruction(vcpu,
+ EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
+ if (er == EMULATE_USER_EXIT)
+ return 0;
+ else if (er != EMULATE_DONE)
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+ return 1;
+ }
+
+ /*
+ * The #PF with PFEC.RSVD = 1 indicates the guest is accessing
+ * MMIO, it is better to report an internal error.
+ * See the comments in vmx_handle_exit.
+ */
+ if ((vect_info & VECTORING_INFO_VALID_MASK) &&
+ !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
+ vcpu->run->internal.ndata = 3;
+ vcpu->run->internal.data[0] = vect_info;
+ vcpu->run->internal.data[1] = intr_info;
+ vcpu->run->internal.data[2] = error_code;
+ return 0;
+ }
+
+ if (is_page_fault(intr_info)) {
+ cr2 = vmcs_readl(EXIT_QUALIFICATION);
+ /* EPT won't cause page fault directly */
+ WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
+ return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
+ }
+
+ ex_no = intr_info & INTR_INFO_VECTOR_MASK;
+
+ if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
+ return handle_rmode_exception(vcpu, ex_no, error_code);
+
+ switch (ex_no) {
+ case AC_VECTOR:
+ kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
+ return 1;
+ case DB_VECTOR:
+ dr6 = vmcs_readl(EXIT_QUALIFICATION);
+ if (!(vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+ vcpu->arch.dr6 &= ~15;
+ vcpu->arch.dr6 |= dr6 | DR6_RTM;
+ if (is_icebp(intr_info))
+ skip_emulated_instruction(vcpu);
+
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ return 1;
+ }
+ kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
+ kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
+ /* fall through */
+ case BP_VECTOR:
+ /*
+ * Update instruction length as we may reinject #BP from
+ * user space while in guest debugging mode. Reading it for
+ * #DB as well causes no harm, it is not used in that case.
+ */
+ vmx->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ rip = kvm_rip_read(vcpu);
+ kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
+ kvm_run->debug.arch.exception = ex_no;
+ break;
+ default:
+ kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
+ kvm_run->ex.exception = ex_no;
+ kvm_run->ex.error_code = error_code;
+ break;
+ }
+ return 0;
+}
+
+static int handle_external_interrupt(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.irq_exits;
+ return 1;
+}
+
+static int handle_triple_fault(struct kvm_vcpu *vcpu)
+{
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ vcpu->mmio_needed = 0;
+ return 0;
+}
+
+static int handle_io(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+ int size, in, string;
+ unsigned port;
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ string = (exit_qualification & 16) != 0;
+
+ ++vcpu->stat.io_exits;
+
+ if (string)
+ return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+
+ port = exit_qualification >> 16;
+ size = (exit_qualification & 7) + 1;
+ in = (exit_qualification & 8) != 0;
+
+ return kvm_fast_pio(vcpu, size, port, in);
+}
+
+static void
+vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+{
+ /*
+ * Patch in the VMCALL instruction:
+ */
+ hypercall[0] = 0x0f;
+ hypercall[1] = 0x01;
+ hypercall[2] = 0xc1;
+}
+
+/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
+static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ if (is_guest_mode(vcpu)) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned long orig_val = val;
+
+ /*
+ * We get here when L2 changed cr0 in a way that did not change
+ * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
+ * but did change L0 shadowed bits. So we first calculate the
+ * effective cr0 value that L1 would like to write into the
+ * hardware. It consists of the L2-owned bits from the new
+ * value combined with the L1-owned bits from L1's guest_cr0.
+ */
+ val = (val & ~vmcs12->cr0_guest_host_mask) |
+ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
+
+ if (!nested_guest_cr0_valid(vcpu, val))
+ return 1;
+
+ if (kvm_set_cr0(vcpu, val))
+ return 1;
+ vmcs_writel(CR0_READ_SHADOW, orig_val);
+ return 0;
+ } else {
+ if (to_vmx(vcpu)->nested.vmxon &&
+ !nested_host_cr0_valid(vcpu, val))
+ return 1;
+
+ return kvm_set_cr0(vcpu, val);
+ }
+}
+
+static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ if (is_guest_mode(vcpu)) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned long orig_val = val;
+
+ /* analogously to handle_set_cr0 */
+ val = (val & ~vmcs12->cr4_guest_host_mask) |
+ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
+ if (kvm_set_cr4(vcpu, val))
+ return 1;
+ vmcs_writel(CR4_READ_SHADOW, orig_val);
+ return 0;
+ } else
+ return kvm_set_cr4(vcpu, val);
+}
+
+static int handle_desc(struct kvm_vcpu *vcpu)
+{
+ WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
+ return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+}
+
+static int handle_cr(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification, val;
+ int cr;
+ int reg;
+ int err;
+ int ret;
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ cr = exit_qualification & 15;
+ reg = (exit_qualification >> 8) & 15;
+ switch ((exit_qualification >> 4) & 3) {
+ case 0: /* mov to cr */
+ val = kvm_register_readl(vcpu, reg);
+ trace_kvm_cr_write(cr, val);
+ switch (cr) {
+ case 0:
+ err = handle_set_cr0(vcpu, val);
+ return kvm_complete_insn_gp(vcpu, err);
+ case 3:
+ WARN_ON_ONCE(enable_unrestricted_guest);
+ err = kvm_set_cr3(vcpu, val);
+ return kvm_complete_insn_gp(vcpu, err);
+ case 4:
+ err = handle_set_cr4(vcpu, val);
+ return kvm_complete_insn_gp(vcpu, err);
+ case 8: {
+ u8 cr8_prev = kvm_get_cr8(vcpu);
+ u8 cr8 = (u8)val;
+ err = kvm_set_cr8(vcpu, cr8);
+ ret = kvm_complete_insn_gp(vcpu, err);
+ if (lapic_in_kernel(vcpu))
+ return ret;
+ if (cr8_prev <= cr8)
+ return ret;
+ /*
+ * TODO: we might be squashing a
+ * KVM_GUESTDBG_SINGLESTEP-triggered
+ * KVM_EXIT_DEBUG here.
+ */
+ vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
+ return 0;
+ }
+ }
+ break;
+ case 2: /* clts */
+ WARN_ONCE(1, "Guest should always own CR0.TS");
+ vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+ trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
+ return kvm_skip_emulated_instruction(vcpu);
+ case 1: /*mov from cr*/
+ switch (cr) {
+ case 3:
+ WARN_ON_ONCE(enable_unrestricted_guest);
+ val = kvm_read_cr3(vcpu);
+ kvm_register_write(vcpu, reg, val);
+ trace_kvm_cr_read(cr, val);
+ return kvm_skip_emulated_instruction(vcpu);
+ case 8:
+ val = kvm_get_cr8(vcpu);
+ kvm_register_write(vcpu, reg, val);
+ trace_kvm_cr_read(cr, val);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ break;
+ case 3: /* lmsw */
+ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
+ trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
+ kvm_lmsw(vcpu, val);
+
+ return kvm_skip_emulated_instruction(vcpu);
+ default:
+ break;
+ }
+ vcpu->run->exit_reason = 0;
+ vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
+ (int)(exit_qualification >> 4) & 3, cr);
+ return 0;
+}
+
+static int handle_dr(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+ int dr, dr7, reg;
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
+
+ /* First, if DR does not exist, trigger UD */
+ if (!kvm_require_dr(vcpu, dr))
+ return 1;
+
+ /* Do not handle if the CPL > 0, will trigger GP on re-entry */
+ if (!kvm_require_cpl(vcpu, 0))
+ return 1;
+ dr7 = vmcs_readl(GUEST_DR7);
+ if (dr7 & DR7_GD) {
+ /*
+ * As the vm-exit takes precedence over the debug trap, we
+ * need to emulate the latter, either for the host or the
+ * guest debugging itself.
+ */
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
+ vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
+ vcpu->run->debug.arch.dr7 = dr7;
+ vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
+ vcpu->run->debug.arch.exception = DB_VECTOR;
+ vcpu->run->exit_reason = KVM_EXIT_DEBUG;
+ return 0;
+ } else {
+ vcpu->arch.dr6 &= ~15;
+ vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ return 1;
+ }
+ }
+
+ if (vcpu->guest_debug == 0) {
+ vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_MOV_DR_EXITING);
+
+ /*
+ * No more DR vmexits; force a reload of the debug registers
+ * and reenter on this instruction. The next vmexit will
+ * retrieve the full state of the debug registers.
+ */
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+ return 1;
+ }
+
+ reg = DEBUG_REG_ACCESS_REG(exit_qualification);
+ if (exit_qualification & TYPE_MOV_FROM_DR) {
+ unsigned long val;
+
+ if (kvm_get_dr(vcpu, dr, &val))
+ return 1;
+ kvm_register_write(vcpu, reg, val);
+ } else
+ if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
+ return 1;
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.dr6;
+}
+
+static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
+{
+}
+
+static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+{
+ get_debugreg(vcpu->arch.db[0], 0);
+ get_debugreg(vcpu->arch.db[1], 1);
+ get_debugreg(vcpu->arch.db[2], 2);
+ get_debugreg(vcpu->arch.db[3], 3);
+ get_debugreg(vcpu->arch.dr6, 6);
+ vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
+
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
+ vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
+}
+
+static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ vmcs_writel(GUEST_DR7, val);
+}
+
+static int handle_cpuid(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_cpuid(vcpu);
+}
+
+static int handle_rdmsr(struct kvm_vcpu *vcpu)
+{
+ u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
+ struct msr_data msr_info;
+
+ msr_info.index = ecx;
+ msr_info.host_initiated = false;
+ if (vmx_get_msr(vcpu, &msr_info)) {
+ trace_kvm_msr_read_ex(ecx);
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ trace_kvm_msr_read(ecx, msr_info.data);
+
+ /* FIXME: handling of bits 32:63 of rax, rdx */
+ vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u;
+ vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u;
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int handle_wrmsr(struct kvm_vcpu *vcpu)
+{
+ struct msr_data msr;
+ u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
+ u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
+ | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
+
+ msr.data = data;
+ msr.index = ecx;
+ msr.host_initiated = false;
+ if (kvm_set_msr(vcpu, &msr) != 0) {
+ trace_kvm_msr_write_ex(ecx, data);
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ trace_kvm_msr_write(ecx, data);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
+{
+ kvm_apic_update_ppr(vcpu);
+ return 1;
+}
+
+static int handle_interrupt_window(struct kvm_vcpu *vcpu)
+{
+ vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_VIRTUAL_INTR_PENDING);
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ ++vcpu->stat.irq_window_exits;
+ return 1;
+}
+
+static int handle_halt(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_halt(vcpu);
+}
+
+static int handle_vmcall(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_hypercall(vcpu);
+}
+
+static int handle_invd(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+}
+
+static int handle_invlpg(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ kvm_mmu_invlpg(vcpu, exit_qualification);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int handle_rdpmc(struct kvm_vcpu *vcpu)
+{
+ int err;
+
+ err = kvm_rdpmc(vcpu);
+ return kvm_complete_insn_gp(vcpu, err);
+}
+
+static int handle_wbinvd(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_wbinvd(vcpu);
+}
+
+static int handle_xsetbv(struct kvm_vcpu *vcpu)
+{
+ u64 new_bv = kvm_read_edx_eax(vcpu);
+ u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
+
+ if (kvm_set_xcr(vcpu, index, new_bv) == 0)
+ return kvm_skip_emulated_instruction(vcpu);
+ return 1;
+}
+
+static int handle_xsaves(struct kvm_vcpu *vcpu)
+{
+ kvm_skip_emulated_instruction(vcpu);
+ WARN(1, "this should never happen\n");
+ return 1;
+}
+
+static int handle_xrstors(struct kvm_vcpu *vcpu)
+{
+ kvm_skip_emulated_instruction(vcpu);
+ WARN(1, "this should never happen\n");
+ return 1;
+}
+
+static int handle_apic_access(struct kvm_vcpu *vcpu)
+{
+ if (likely(fasteoi)) {
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ int access_type, offset;
+
+ access_type = exit_qualification & APIC_ACCESS_TYPE;
+ offset = exit_qualification & APIC_ACCESS_OFFSET;
+ /*
+ * Sane guest uses MOV to write EOI, with written value
+ * not cared. So make a short-circuit here by avoiding
+ * heavy instruction emulation.
+ */
+ if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
+ (offset == APIC_EOI)) {
+ kvm_lapic_set_eoi(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ }
+ return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+}
+
+static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ int vector = exit_qualification & 0xff;
+
+ /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
+ kvm_apic_set_eoi_accelerated(vcpu, vector);
+ return 1;
+}
+
+static int handle_apic_write(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ u32 offset = exit_qualification & 0xfff;
+
+ /* APIC-write VM exit is trap-like and thus no need to adjust IP */
+ kvm_apic_write_nodecode(vcpu, offset);
+ return 1;
+}
+
+static int handle_task_switch(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long exit_qualification;
+ bool has_error_code = false;
+ u32 error_code = 0;
+ u16 tss_selector;
+ int reason, type, idt_v, idt_index;
+
+ idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+ idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
+ type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ reason = (u32)exit_qualification >> 30;
+ if (reason == TASK_SWITCH_GATE && idt_v) {
+ switch (type) {
+ case INTR_TYPE_NMI_INTR:
+ vcpu->arch.nmi_injected = false;
+ vmx_set_nmi_mask(vcpu, true);
+ break;
+ case INTR_TYPE_EXT_INTR:
+ case INTR_TYPE_SOFT_INTR:
+ kvm_clear_interrupt_queue(vcpu);
+ break;
+ case INTR_TYPE_HARD_EXCEPTION:
+ if (vmx->idt_vectoring_info &
+ VECTORING_INFO_DELIVER_CODE_MASK) {
+ has_error_code = true;
+ error_code =
+ vmcs_read32(IDT_VECTORING_ERROR_CODE);
+ }
+ /* fall through */
+ case INTR_TYPE_SOFT_EXCEPTION:
+ kvm_clear_exception_queue(vcpu);
+ break;
+ default:
+ break;
+ }
+ }
+ tss_selector = exit_qualification;
+
+ if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
+ type != INTR_TYPE_EXT_INTR &&
+ type != INTR_TYPE_NMI_INTR))
+ skip_emulated_instruction(vcpu);
+
+ if (kvm_task_switch(vcpu, tss_selector,
+ type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
+ has_error_code, error_code) == EMULATE_FAIL) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return 0;
+ }
+
+ /*
+ * TODO: What about debug traps on tss switch?
+ * Are we supposed to inject them and update dr6?
+ */
+
+ return 1;
+}
+
+static int handle_ept_violation(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+ gpa_t gpa;
+ u64 error_code;
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ /*
+ * EPT violation happened while executing iret from NMI,
+ * "blocked by NMI" bit has to be set before next VM entry.
+ * There are errata that may cause this bit to not be set:
+ * AAK134, BY25.
+ */
+ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ enable_vnmi &&
+ (exit_qualification & INTR_INFO_UNBLOCK_NMI))
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
+
+ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+ trace_kvm_page_fault(gpa, exit_qualification);
+
+ /* Is it a read fault? */
+ error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
+ ? PFERR_USER_MASK : 0;
+ /* Is it a write fault? */
+ error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
+ ? PFERR_WRITE_MASK : 0;
+ /* Is it a fetch fault? */
+ error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
+ ? PFERR_FETCH_MASK : 0;
+ /* ept page table entry is present? */
+ error_code |= (exit_qualification &
+ (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
+ EPT_VIOLATION_EXECUTABLE))
+ ? PFERR_PRESENT_MASK : 0;
+
+ error_code |= (exit_qualification & 0x100) != 0 ?
+ PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
+
+ vcpu->arch.exit_qualification = exit_qualification;
+ return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
+}
+
+static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
+{
+ gpa_t gpa;
+
+ /*
+ * A nested guest cannot optimize MMIO vmexits, because we have an
+ * nGPA here instead of the required GPA.
+ */
+ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+ if (!is_guest_mode(vcpu) &&
+ !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
+ trace_kvm_fast_mmio(gpa);
+ /*
+ * Doing kvm_skip_emulated_instruction() depends on undefined
+ * behavior: Intel's manual doesn't mandate
+ * VM_EXIT_INSTRUCTION_LEN to be set in VMCS when EPT MISCONFIG
+ * occurs and while on real hardware it was observed to be set,
+ * other hypervisors (namely Hyper-V) don't set it, we end up
+ * advancing IP with some random value. Disable fast mmio when
+ * running nested and keep it for real hardware in hope that
+ * VM_EXIT_INSTRUCTION_LEN will always be set correctly.
+ */
+ if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
+ return kvm_skip_emulated_instruction(vcpu);
+ else
+ return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) ==
+ EMULATE_DONE;
+ }
+
+ return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
+}
+
+static int handle_nmi_window(struct kvm_vcpu *vcpu)
+{
+ WARN_ON_ONCE(!enable_vnmi);
+ vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_VIRTUAL_NMI_PENDING);
+ ++vcpu->stat.nmi_window_exits;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ return 1;
+}
+
+static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ enum emulation_result err = EMULATE_DONE;
+ int ret = 1;
+ u32 cpu_exec_ctrl;
+ bool intr_window_requested;
+ unsigned count = 130;
+
+ /*
+ * We should never reach the point where we are emulating L2
+ * due to invalid guest state as that means we incorrectly
+ * allowed a nested VMEntry with an invalid vmcs12.
+ */
+ WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
+
+ cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
+
+ while (vmx->emulation_required && count-- != 0) {
+ if (intr_window_requested && vmx_interrupt_allowed(vcpu))
+ return handle_interrupt_window(&vmx->vcpu);
+
+ if (kvm_test_request(KVM_REQ_EVENT, vcpu))
+ return 1;
+
+ err = kvm_emulate_instruction(vcpu, 0);
+
+ if (err == EMULATE_USER_EXIT) {
+ ++vcpu->stat.mmio_exits;
+ ret = 0;
+ goto out;
+ }
+
+ if (err != EMULATE_DONE)
+ goto emulation_error;
+
+ if (vmx->emulation_required && !vmx->rmode.vm86_active &&
+ vcpu->arch.exception.pending)
+ goto emulation_error;
+
+ if (vcpu->arch.halt_request) {
+ vcpu->arch.halt_request = 0;
+ ret = kvm_vcpu_halt(vcpu);
+ goto out;
+ }
+
+ if (signal_pending(current))
+ goto out;
+ if (need_resched())
+ schedule();
+ }
+
+out:
+ return ret;
+
+emulation_error:
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return 0;
+}
+
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int old = vmx->ple_window;
+
+ vmx->ple_window = __grow_ple_window(old, ple_window,
+ ple_window_grow,
+ ple_window_max);
+
+ if (vmx->ple_window != old)
+ vmx->ple_window_dirty = true;
+
+ trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old);
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int old = vmx->ple_window;
+
+ vmx->ple_window = __shrink_ple_window(old, ple_window,
+ ple_window_shrink,
+ ple_window);
+
+ if (vmx->ple_window != old)
+ vmx->ple_window_dirty = true;
+
+ trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old);
+}
+
+/*
+ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
+ */
+static void wakeup_handler(void)
+{
+ struct kvm_vcpu *vcpu;
+ int cpu = smp_processor_id();
+
+ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
+ blocked_vcpu_list) {
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (pi_test_on(pi_desc) == 1)
+ kvm_vcpu_kick(vcpu);
+ }
+ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+}
+
+static void vmx_enable_tdp(void)
+{
+ kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
+ enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
+ enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
+ 0ull, VMX_EPT_EXECUTABLE_MASK,
+ cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
+ VMX_EPT_RWX_MASK, 0ull);
+
+ ept_set_mmio_spte_mask();
+ kvm_enable_tdp();
+}
+
+static __init int hardware_setup(void)
+{
+ unsigned long host_bndcfgs;
+ int r = -ENOMEM, i;
+
+ rdmsrl_safe(MSR_EFER, &host_efer);
+
+ for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
+ kvm_define_shared_msr(i, vmx_msr_index[i]);
+
+ for (i = 0; i < VMX_BITMAP_NR; i++) {
+ vmx_bitmap[i] = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_bitmap[i])
+ goto out;
+ }
+
+ memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
+ memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
+
+ if (setup_vmcs_config(&vmcs_config) < 0) {
+ r = -EIO;
+ goto out;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_NX))
+ kvm_enable_efer_bits(EFER_NX);
+
+ if (boot_cpu_has(X86_FEATURE_MPX)) {
+ rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
+ WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
+ }
+
+ if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
+ !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
+ enable_vpid = 0;
+
+ if (!cpu_has_vmx_ept() ||
+ !cpu_has_vmx_ept_4levels() ||
+ !cpu_has_vmx_ept_mt_wb() ||
+ !cpu_has_vmx_invept_global())
+ enable_ept = 0;
+
+ if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
+ enable_ept_ad_bits = 0;
+
+ if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
+ enable_unrestricted_guest = 0;
+
+ if (!cpu_has_vmx_flexpriority())
+ flexpriority_enabled = 0;
+
+ if (!cpu_has_virtual_nmis())
+ enable_vnmi = 0;
+
+ /*
+ * set_apic_access_page_addr() is used to reload apic access
+ * page upon invalidation. No need to do anything if not
+ * using the APIC_ACCESS_ADDR VMCS field.
+ */
+ if (!flexpriority_enabled)
+ kvm_x86_ops->set_apic_access_page_addr = NULL;
+
+ if (!cpu_has_vmx_tpr_shadow())
+ kvm_x86_ops->update_cr8_intercept = NULL;
+
+ if (enable_ept && !cpu_has_vmx_ept_2m_page())
+ kvm_disable_largepages();
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
+ && enable_ept)
+ kvm_x86_ops->tlb_remote_flush = vmx_hv_remote_flush_tlb;
+#endif
+
+ if (!cpu_has_vmx_ple()) {
+ ple_gap = 0;
+ ple_window = 0;
+ ple_window_grow = 0;
+ ple_window_max = 0;
+ ple_window_shrink = 0;
+ }
+
+ if (!cpu_has_vmx_apicv()) {
+ enable_apicv = 0;
+ kvm_x86_ops->sync_pir_to_irr = NULL;
+ }
+
+ if (cpu_has_vmx_tsc_scaling()) {
+ kvm_has_tsc_control = true;
+ kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
+ kvm_tsc_scaling_ratio_frac_bits = 48;
+ }
+
+ set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
+
+ if (enable_ept)
+ vmx_enable_tdp();
+ else
+ kvm_disable_tdp();
+
+ if (!nested) {
+ kvm_x86_ops->get_nested_state = NULL;
+ kvm_x86_ops->set_nested_state = NULL;
+ }
+
+ /*
+ * Only enable PML when hardware supports PML feature, and both EPT
+ * and EPT A/D bit features are enabled -- PML depends on them to work.
+ */
+ if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
+ enable_pml = 0;
+
+ if (!enable_pml) {
+ kvm_x86_ops->slot_enable_log_dirty = NULL;
+ kvm_x86_ops->slot_disable_log_dirty = NULL;
+ kvm_x86_ops->flush_log_dirty = NULL;
+ kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
+ }
+
+ if (!cpu_has_vmx_preemption_timer())
+ kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
+
+ if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
+ u64 vmx_msr;
+
+ rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
+ cpu_preemption_timer_multi =
+ vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+ } else {
+ kvm_x86_ops->set_hv_timer = NULL;
+ kvm_x86_ops->cancel_hv_timer = NULL;
+ }
+
+ if (!cpu_has_vmx_shadow_vmcs())
+ enable_shadow_vmcs = 0;
+ if (enable_shadow_vmcs)
+ init_vmcs_shadow_fields();
+
+ kvm_set_posted_intr_wakeup_handler(wakeup_handler);
+ nested_vmx_setup_ctls_msrs(&vmcs_config.nested, enable_apicv);
+
+ kvm_mce_cap_supported |= MCG_LMCE_P;
+
+ r = alloc_kvm_area();
+ if (r)
+ goto out;
+ return 0;
+
+out:
+ for (i = 0; i < VMX_BITMAP_NR; i++)
+ free_page((unsigned long)vmx_bitmap[i]);
+
+ return r;
+}
+
+static __exit void hardware_unsetup(void)
+{
+ int i;
+
+ for (i = 0; i < VMX_BITMAP_NR; i++)
+ free_page((unsigned long)vmx_bitmap[i]);
+
+ free_kvm_area();
+}
+
+/*
+ * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
+ * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
+ */
+static int handle_pause(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_pause_in_guest(vcpu->kvm))
+ grow_ple_window(vcpu);
+
+ /*
+ * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting"
+ * VM-execution control is ignored if CPL > 0. OTOH, KVM
+ * never set PAUSE_EXITING and just set PLE if supported,
+ * so the vcpu must be CPL=0 if it gets a PAUSE exit.
+ */
+ kvm_vcpu_on_spin(vcpu, true);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int handle_nop(struct kvm_vcpu *vcpu)
+{
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int handle_mwait(struct kvm_vcpu *vcpu)
+{
+ printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
+ return handle_nop(vcpu);
+}
+
+static int handle_invalid_op(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
+static int handle_monitor_trap(struct kvm_vcpu *vcpu)
+{
+ return 1;
+}
+
+static int handle_monitor(struct kvm_vcpu *vcpu)
+{
+ printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
+ return handle_nop(vcpu);
+}
+
+/*
+ * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
+ * set the success or error code of an emulated VMX instruction, as specified
+ * by Vol 2B, VMX Instruction Reference, "Conventions".
+ */
+static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
+{
+ vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
+ & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
+}
+
+static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
+{
+ vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+ & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
+ X86_EFLAGS_SF | X86_EFLAGS_OF))
+ | X86_EFLAGS_CF);
+}
+
+static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
+ u32 vm_instruction_error)
+{
+ if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
+ /*
+ * failValid writes the error number to the current VMCS, which
+ * can't be done there isn't a current VMCS.
+ */
+ nested_vmx_failInvalid(vcpu);
+ return;
+ }
+ vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+ & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_SF | X86_EFLAGS_OF))
+ | X86_EFLAGS_ZF);
+ get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
+ /*
+ * We don't need to force a shadow sync because
+ * VM_INSTRUCTION_ERROR is not shadowed
+ */
+}
+
+static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
+{
+ /* TODO: not to reset guest simply here. */
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
+}
+
+static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
+{
+ struct vcpu_vmx *vmx =
+ container_of(timer, struct vcpu_vmx, nested.preemption_timer);
+
+ vmx->nested.preemption_timer_expired = true;
+ kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
+ kvm_vcpu_kick(&vmx->vcpu);
+
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * Decode the memory-address operand of a vmx instruction, as recorded on an
+ * exit caused by such an instruction (run by a guest hypervisor).
+ * On success, returns 0. When the operand is invalid, returns 1 and throws
+ * #UD or #GP.
+ */
+static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
+ unsigned long exit_qualification,
+ u32 vmx_instruction_info, bool wr, gva_t *ret)
+{
+ gva_t off;
+ bool exn;
+ struct kvm_segment s;
+
+ /*
+ * According to Vol. 3B, "Information for VM Exits Due to Instruction
+ * Execution", on an exit, vmx_instruction_info holds most of the
+ * addressing components of the operand. Only the displacement part
+ * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
+ * For how an actual address is calculated from all these components,
+ * refer to Vol. 1, "Operand Addressing".
+ */
+ int scaling = vmx_instruction_info & 3;
+ int addr_size = (vmx_instruction_info >> 7) & 7;
+ bool is_reg = vmx_instruction_info & (1u << 10);
+ int seg_reg = (vmx_instruction_info >> 15) & 7;
+ int index_reg = (vmx_instruction_info >> 18) & 0xf;
+ bool index_is_valid = !(vmx_instruction_info & (1u << 22));
+ int base_reg = (vmx_instruction_info >> 23) & 0xf;
+ bool base_is_valid = !(vmx_instruction_info & (1u << 27));
+
+ if (is_reg) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ /* Addr = segment_base + offset */
+ /* offset = base + [index * scale] + displacement */
+ off = exit_qualification; /* holds the displacement */
+ if (addr_size == 1)
+ off = (gva_t)sign_extend64(off, 31);
+ else if (addr_size == 0)
+ off = (gva_t)sign_extend64(off, 15);
+ if (base_is_valid)
+ off += kvm_register_read(vcpu, base_reg);
+ if (index_is_valid)
+ off += kvm_register_read(vcpu, index_reg)<<scaling;
+ vmx_get_segment(vcpu, &s, seg_reg);
+
+ /*
+ * The effective address, i.e. @off, of a memory operand is truncated
+ * based on the address size of the instruction. Note that this is
+ * the *effective address*, i.e. the address prior to accounting for
+ * the segment's base.
+ */
+ if (addr_size == 1) /* 32 bit */
+ off &= 0xffffffff;
+ else if (addr_size == 0) /* 16 bit */
+ off &= 0xffff;
+
+ /* Checks for #GP/#SS exceptions. */
+ exn = false;
+ if (is_long_mode(vcpu)) {
+ /*
+ * The virtual/linear address is never truncated in 64-bit
+ * mode, e.g. a 32-bit address size can yield a 64-bit virtual
+ * address when using FS/GS with a non-zero base.
+ */
+ *ret = s.base + off;
+
+ /* Long mode: #GP(0)/#SS(0) if the memory address is in a
+ * non-canonical form. This is the only check on the memory
+ * destination for long mode!
+ */
+ exn = is_noncanonical_address(*ret, vcpu);
+ } else if (is_protmode(vcpu)) {
+ /*
+ * When not in long mode, the virtual/linear address is
+ * unconditionally truncated to 32 bits regardless of the
+ * address size.
+ */
+ *ret = (s.base + off) & 0xffffffff;
+
+ /* Protected mode: apply checks for segment validity in the
+ * following order:
+ * - segment type check (#GP(0) may be thrown)
+ * - usability check (#GP(0)/#SS(0))
+ * - limit check (#GP(0)/#SS(0))
+ */
+ if (wr)
+ /* #GP(0) if the destination operand is located in a
+ * read-only data segment or any code segment.
+ */
+ exn = ((s.type & 0xa) == 0 || (s.type & 8));
+ else
+ /* #GP(0) if the source operand is located in an
+ * execute-only code segment
+ */
+ exn = ((s.type & 0xa) == 8);
+ if (exn) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return 1;
+ }
+ /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
+ */
+ exn = (s.unusable != 0);
+
+ /*
+ * Protected mode: #GP(0)/#SS(0) if the memory operand is
+ * outside the segment limit. All CPUs that support VMX ignore
+ * limit checks for flat segments, i.e. segments with base==0,
+ * limit==0xffffffff and of type expand-up data or code.
+ */
+ if (!(s.base == 0 && s.limit == 0xffffffff &&
+ ((s.type & 8) || !(s.type & 4))))
+ exn = exn || (off + sizeof(u64) > s.limit);
+ }
+ if (exn) {
+ kvm_queue_exception_e(vcpu,
+ seg_reg == VCPU_SREG_SS ?
+ SS_VECTOR : GP_VECTOR,
+ 0);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
+{
+ gva_t gva;
+ struct x86_exception e;
+
+ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
+ return 1;
+
+ if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Allocate a shadow VMCS and associate it with the currently loaded
+ * VMCS, unless such a shadow VMCS already exists. The newly allocated
+ * VMCS is also VMCLEARed, so that it is ready for use.
+ */
+static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
+
+ /*
+ * We should allocate a shadow vmcs for vmcs01 only when L1
+ * executes VMXON and free it when L1 executes VMXOFF.
+ * As it is invalid to execute VMXON twice, we shouldn't reach
+ * here when vmcs01 already have an allocated shadow vmcs.
+ */
+ WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
+
+ if (!loaded_vmcs->shadow_vmcs) {
+ loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
+ if (loaded_vmcs->shadow_vmcs)
+ vmcs_clear(loaded_vmcs->shadow_vmcs);
+ }
+ return loaded_vmcs->shadow_vmcs;
+}
+
+static int enter_vmx_operation(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int r;
+
+ r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
+ if (r < 0)
+ goto out_vmcs02;
+
+ vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL);
+ if (!vmx->nested.cached_vmcs12)
+ goto out_cached_vmcs12;
+
+ vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL);
+ if (!vmx->nested.cached_shadow_vmcs12)
+ goto out_cached_shadow_vmcs12;
+
+ if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
+ goto out_shadow_vmcs;
+
+ hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_PINNED);
+ vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
+
+ vmx->nested.vpid02 = allocate_vpid();
+
+ vmx->nested.vmxon = true;
+ return 0;
+
+out_shadow_vmcs:
+ kfree(vmx->nested.cached_shadow_vmcs12);
+
+out_cached_shadow_vmcs12:
+ kfree(vmx->nested.cached_vmcs12);
+
+out_cached_vmcs12:
+ free_loaded_vmcs(&vmx->nested.vmcs02);
+
+out_vmcs02:
+ return -ENOMEM;
+}
+
+/*
+ * Emulate the VMXON instruction.
+ * Currently, we just remember that VMX is active, and do not save or even
+ * inspect the argument to VMXON (the so-called "VMXON pointer") because we
+ * do not currently need to store anything in that guest-allocated memory
+ * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their
+ * argument is different from the VMXON pointer (which the spec says they do).
+ */
+static int handle_vmon(struct kvm_vcpu *vcpu)
+{
+ int ret;
+ gpa_t vmptr;
+ struct page *page;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
+ | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+
+ /*
+ * The Intel VMX Instruction Reference lists a bunch of bits that are
+ * prerequisite to running VMXON, most notably cr4.VMXE must be set to
+ * 1 (see vmx_set_cr4() for when we allow the guest to set this).
+ * Otherwise, we should fail with #UD. But most faulting conditions
+ * have already been checked by hardware, prior to the VM-exit for
+ * VMXON. We do test guest cr4.VMXE because processor CR4 always has
+ * that bit set to 1 in non-root mode.
+ */
+ if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ /* CPL=0 must be checked manually. */
+ if (vmx_get_cpl(vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ if (vmx->nested.vmxon) {
+ nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
+ != VMXON_NEEDED_FEATURES) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ if (nested_vmx_get_vmptr(vcpu, &vmptr))
+ return 1;
+
+ /*
+ * SDM 3: 24.11.5
+ * The first 4 bytes of VMXON region contain the supported
+ * VMCS revision identifier
+ *
+ * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
+ * which replaces physical address width with 32
+ */
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
+ nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
+ if (is_error_page(page)) {
+ nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ if (*(u32 *)kmap(page) != VMCS12_REVISION) {
+ kunmap(page);
+ kvm_release_page_clean(page);
+ nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ kunmap(page);
+ kvm_release_page_clean(page);
+
+ vmx->nested.vmxon_ptr = vmptr;
+ ret = enter_vmx_operation(vcpu);
+ if (ret)
+ return ret;
+
+ nested_vmx_succeed(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+/*
+ * Intel's VMX Instruction Reference specifies a common set of prerequisites
+ * for running VMX instructions (except VMXON, whose prerequisites are
+ * slightly different). It also specifies what exception to inject otherwise.
+ * Note that many of these exceptions have priority over VM exits, so they
+ * don't have to be checked again here.
+ */
+static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
+{
+ if (!to_vmx(vcpu)->nested.vmxon) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 0;
+ }
+
+ if (vmx_get_cpl(vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return 0;
+ }
+
+ return 1;
+}
+
+static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
+{
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
+ vmcs_write64(VMCS_LINK_POINTER, -1ull);
+ vmx->nested.sync_shadow_vmcs = false;
+}
+
+static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
+{
+ if (vmx->nested.current_vmptr == -1ull)
+ return;
+
+ if (enable_shadow_vmcs) {
+ /* copy to memory all shadowed fields in case
+ they were modified */
+ copy_shadow_to_vmcs12(vmx);
+ vmx_disable_shadow_vmcs(vmx);
+ }
+ vmx->nested.posted_intr_nv = -1;
+
+ /* Flush VMCS12 to guest memory */
+ kvm_vcpu_write_guest_page(&vmx->vcpu,
+ vmx->nested.current_vmptr >> PAGE_SHIFT,
+ vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
+
+ vmx->nested.current_vmptr = -1ull;
+}
+
+/*
+ * Free whatever needs to be freed from vmx->nested when L1 goes down, or
+ * just stops using VMX.
+ */
+static void free_nested(struct vcpu_vmx *vmx)
+{
+ if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
+ return;
+
+ kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, &vmx->vcpu);
+
+ hrtimer_cancel(&vmx->nested.preemption_timer);
+ vmx->nested.vmxon = false;
+ vmx->nested.smm.vmxon = false;
+ free_vpid(vmx->nested.vpid02);
+ vmx->nested.posted_intr_nv = -1;
+ vmx->nested.current_vmptr = -1ull;
+ if (enable_shadow_vmcs) {
+ vmx_disable_shadow_vmcs(vmx);
+ vmcs_clear(vmx->vmcs01.shadow_vmcs);
+ free_vmcs(vmx->vmcs01.shadow_vmcs);
+ vmx->vmcs01.shadow_vmcs = NULL;
+ }
+ kfree(vmx->nested.cached_vmcs12);
+ kfree(vmx->nested.cached_shadow_vmcs12);
+ /* Unpin physical memory we referred to in the vmcs02 */
+ if (vmx->nested.apic_access_page) {
+ kvm_release_page_dirty(vmx->nested.apic_access_page);
+ vmx->nested.apic_access_page = NULL;
+ }
+ if (vmx->nested.virtual_apic_page) {
+ kvm_release_page_dirty(vmx->nested.virtual_apic_page);
+ vmx->nested.virtual_apic_page = NULL;
+ }
+ if (vmx->nested.pi_desc_page) {
+ kunmap(vmx->nested.pi_desc_page);
+ kvm_release_page_dirty(vmx->nested.pi_desc_page);
+ vmx->nested.pi_desc_page = NULL;
+ vmx->nested.pi_desc = NULL;
+ }
+
+ free_loaded_vmcs(&vmx->nested.vmcs02);
+}
+
+/* Emulate the VMXOFF instruction */
+static int handle_vmoff(struct kvm_vcpu *vcpu)
+{
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+ free_nested(to_vmx(vcpu));
+ nested_vmx_succeed(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+/* Emulate the VMCLEAR instruction */
+static int handle_vmclear(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 zero = 0;
+ gpa_t vmptr;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (nested_vmx_get_vmptr(vcpu, &vmptr))
+ return 1;
+
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
+ nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ if (vmptr == vmx->nested.vmxon_ptr) {
+ nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ if (vmptr == vmx->nested.current_vmptr)
+ nested_release_vmcs12(vmx);
+
+ kvm_vcpu_write_guest(vcpu,
+ vmptr + offsetof(struct vmcs12, launch_state),
+ &zero, sizeof(zero));
+
+ nested_vmx_succeed(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
+
+/* Emulate the VMLAUNCH instruction */
+static int handle_vmlaunch(struct kvm_vcpu *vcpu)
+{
+ return nested_vmx_run(vcpu, true);
+}
+
+/* Emulate the VMRESUME instruction */
+static int handle_vmresume(struct kvm_vcpu *vcpu)
+{
+
+ return nested_vmx_run(vcpu, false);
+}
+
+/*
+ * Read a vmcs12 field. Since these can have varying lengths and we return
+ * one type, we chose the biggest type (u64) and zero-extend the return value
+ * to that size. Note that the caller, handle_vmread, might need to use only
+ * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
+ * 64-bit fields are to be returned).
+ */
+static inline int vmcs12_read_any(struct vmcs12 *vmcs12,
+ unsigned long field, u64 *ret)
+{
+ short offset = vmcs_field_to_offset(field);
+ char *p;
+
+ if (offset < 0)
+ return offset;
+
+ p = (char *)vmcs12 + offset;
+
+ switch (vmcs_field_width(field)) {
+ case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
+ *ret = *((natural_width *)p);
+ return 0;
+ case VMCS_FIELD_WIDTH_U16:
+ *ret = *((u16 *)p);
+ return 0;
+ case VMCS_FIELD_WIDTH_U32:
+ *ret = *((u32 *)p);
+ return 0;
+ case VMCS_FIELD_WIDTH_U64:
+ *ret = *((u64 *)p);
+ return 0;
+ default:
+ WARN_ON(1);
+ return -ENOENT;
+ }
+}
+
+
+static inline int vmcs12_write_any(struct vmcs12 *vmcs12,
+ unsigned long field, u64 field_value){
+ short offset = vmcs_field_to_offset(field);
+ char *p = (char *)vmcs12 + offset;
+ if (offset < 0)
+ return offset;
+
+ switch (vmcs_field_width(field)) {
+ case VMCS_FIELD_WIDTH_U16:
+ *(u16 *)p = field_value;
+ return 0;
+ case VMCS_FIELD_WIDTH_U32:
+ *(u32 *)p = field_value;
+ return 0;
+ case VMCS_FIELD_WIDTH_U64:
+ *(u64 *)p = field_value;
+ return 0;
+ case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
+ *(natural_width *)p = field_value;
+ return 0;
+ default:
+ WARN_ON(1);
+ return -ENOENT;
+ }
+
+}
+
+/*
+ * Copy the writable VMCS shadow fields back to the VMCS12, in case
+ * they have been modified by the L1 guest. Note that the "read-only"
+ * VM-exit information fields are actually writable if the vCPU is
+ * configured to support "VMWRITE to any supported field in the VMCS."
+ */
+static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
+{
+ const u16 *fields[] = {
+ shadow_read_write_fields,
+ shadow_read_only_fields
+ };
+ const int max_fields[] = {
+ max_shadow_read_write_fields,
+ max_shadow_read_only_fields
+ };
+ int i, q;
+ unsigned long field;
+ u64 field_value;
+ struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+
+ if (WARN_ON(!shadow_vmcs))
+ return;
+
+ preempt_disable();
+
+ vmcs_load(shadow_vmcs);
+
+ for (q = 0; q < ARRAY_SIZE(fields); q++) {
+ for (i = 0; i < max_fields[q]; i++) {
+ field = fields[q][i];
+ field_value = __vmcs_readl(field);
+ vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value);
+ }
+ /*
+ * Skip the VM-exit information fields if they are read-only.
+ */
+ if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
+ break;
+ }
+
+ vmcs_clear(shadow_vmcs);
+ vmcs_load(vmx->loaded_vmcs->vmcs);
+
+ preempt_enable();
+}
+
+static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
+{
+ const u16 *fields[] = {
+ shadow_read_write_fields,
+ shadow_read_only_fields
+ };
+ const int max_fields[] = {
+ max_shadow_read_write_fields,
+ max_shadow_read_only_fields
+ };
+ int i, q;
+ unsigned long field;
+ u64 field_value = 0;
+ struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+
+ if (WARN_ON(!shadow_vmcs))
+ return;
+
+ vmcs_load(shadow_vmcs);
+
+ for (q = 0; q < ARRAY_SIZE(fields); q++) {
+ for (i = 0; i < max_fields[q]; i++) {
+ field = fields[q][i];
+ vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value);
+ __vmcs_writel(field, field_value);
+ }
+ }
+
+ vmcs_clear(shadow_vmcs);
+ vmcs_load(vmx->loaded_vmcs->vmcs);
+}
+
+/*
+ * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
+ * used before) all generate the same failure when it is missing.
+ */
+static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (vmx->nested.current_vmptr == -1ull) {
+ nested_vmx_failInvalid(vcpu);
+ return 0;
+ }
+ return 1;
+}
+
+static int handle_vmread(struct kvm_vcpu *vcpu)
+{
+ unsigned long field;
+ u64 field_value;
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ gva_t gva = 0;
+ struct vmcs12 *vmcs12;
+ struct x86_exception e;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (!nested_vmx_check_vmcs12(vcpu))
+ return kvm_skip_emulated_instruction(vcpu);
+
+ if (!is_guest_mode(vcpu))
+ vmcs12 = get_vmcs12(vcpu);
+ else {
+ /*
+ * When vmcs->vmcs_link_pointer is -1ull, any VMREAD
+ * to shadowed-field sets the ALU flags for VMfailInvalid.
+ */
+ if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
+ nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ vmcs12 = get_shadow_vmcs12(vcpu);
+ }
+
+ /* Decode instruction info and find the field to read */
+ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+ /* Read the field, zero-extended to a u64 field_value */
+ if (vmcs12_read_any(vmcs12, field, &field_value) < 0) {
+ nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ /*
+ * Now copy part of this value to register or memory, as requested.
+ * Note that the number of bits actually copied is 32 or 64 depending
+ * on the guest's mode (32 or 64 bit), not on the given field's length.
+ */
+ if (vmx_instruction_info & (1u << 10)) {
+ kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
+ field_value);
+ } else {
+ if (get_vmx_mem_address(vcpu, exit_qualification,
+ vmx_instruction_info, true, &gva))
+ return 1;
+ /* _system ok, nested_vmx_check_permission has verified cpl=0 */
+ if (kvm_write_guest_virt_system(vcpu, gva, &field_value,
+ (is_long_mode(vcpu) ? 8 : 4),
+ &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+ }
+
+ nested_vmx_succeed(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+
+static int handle_vmwrite(struct kvm_vcpu *vcpu)
+{
+ unsigned long field;
+ gva_t gva;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+
+ /* The value to write might be 32 or 64 bits, depending on L1's long
+ * mode, and eventually we need to write that into a field of several
+ * possible lengths. The code below first zero-extends the value to 64
+ * bit (field_value), and then copies only the appropriate number of
+ * bits into the vmcs12 field.
+ */
+ u64 field_value = 0;
+ struct x86_exception e;
+ struct vmcs12 *vmcs12;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (!nested_vmx_check_vmcs12(vcpu))
+ return kvm_skip_emulated_instruction(vcpu);
+
+ if (vmx_instruction_info & (1u << 10))
+ field_value = kvm_register_readl(vcpu,
+ (((vmx_instruction_info) >> 3) & 0xf));
+ else {
+ if (get_vmx_mem_address(vcpu, exit_qualification,
+ vmx_instruction_info, false, &gva))
+ return 1;
+ if (kvm_read_guest_virt(vcpu, gva, &field_value,
+ (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+ }
+
+
+ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+ /*
+ * If the vCPU supports "VMWRITE to any supported field in the
+ * VMCS," then the "read-only" fields are actually read/write.
+ */
+ if (vmcs_field_readonly(field) &&
+ !nested_cpu_has_vmwrite_any_field(vcpu)) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ if (!is_guest_mode(vcpu))
+ vmcs12 = get_vmcs12(vcpu);
+ else {
+ /*
+ * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
+ * to shadowed-field sets the ALU flags for VMfailInvalid.
+ */
+ if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
+ nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ vmcs12 = get_shadow_vmcs12(vcpu);
+
+ }
+
+ if (vmcs12_write_any(vmcs12, field, field_value) < 0) {
+ nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ /*
+ * Do not track vmcs12 dirty-state if in guest-mode
+ * as we actually dirty shadow vmcs12 instead of vmcs12.
+ */
+ if (!is_guest_mode(vcpu)) {
+ switch (field) {
+#define SHADOW_FIELD_RW(x) case x:
+#include "vmx_shadow_fields.h"
+ /*
+ * The fields that can be updated by L1 without a vmexit are
+ * always updated in the vmcs02, the others go down the slow
+ * path of prepare_vmcs02.
+ */
+ break;
+ default:
+ vmx->nested.dirty_vmcs12 = true;
+ break;
+ }
+ }
+
+ nested_vmx_succeed(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
+{
+ vmx->nested.current_vmptr = vmptr;
+ if (enable_shadow_vmcs) {
+ vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_SHADOW_VMCS);
+ vmcs_write64(VMCS_LINK_POINTER,
+ __pa(vmx->vmcs01.shadow_vmcs));
+ vmx->nested.sync_shadow_vmcs = true;
+ }
+ vmx->nested.dirty_vmcs12 = true;
+}
+
+/* Emulate the VMPTRLD instruction */
+static int handle_vmptrld(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ gpa_t vmptr;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (nested_vmx_get_vmptr(vcpu, &vmptr))
+ return 1;
+
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
+ nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ if (vmptr == vmx->nested.vmxon_ptr) {
+ nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ if (vmx->nested.current_vmptr != vmptr) {
+ struct vmcs12 *new_vmcs12;
+ struct page *page;
+ page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
+ if (is_error_page(page)) {
+ nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ new_vmcs12 = kmap(page);
+ if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
+ (new_vmcs12->hdr.shadow_vmcs &&
+ !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
+ kunmap(page);
+ kvm_release_page_clean(page);
+ nested_vmx_failValid(vcpu,
+ VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ nested_release_vmcs12(vmx);
+ /*
+ * Load VMCS12 from guest memory since it is not already
+ * cached.
+ */
+ memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
+ kunmap(page);
+ kvm_release_page_clean(page);
+
+ set_current_vmptr(vmx, vmptr);
+ }
+
+ nested_vmx_succeed(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+/* Emulate the VMPTRST instruction */
+static int handle_vmptrst(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
+ u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
+ struct x86_exception e;
+ gva_t gva;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva))
+ return 1;
+ /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
+ if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
+ sizeof(gpa_t), &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+ nested_vmx_succeed(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+/* Emulate the INVEPT instruction */
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 vmx_instruction_info, types;
+ unsigned long type;
+ gva_t gva;
+ struct x86_exception e;
+ struct {
+ u64 eptp, gpa;
+ } operand;
+
+ if (!(vmx->nested.msrs.secondary_ctls_high &
+ SECONDARY_EXEC_ENABLE_EPT) ||
+ !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+ types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
+
+ if (type >= 32 || !(types & (1 << type))) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ /* According to the Intel VMX instruction reference, the memory
+ * operand is read even if it isn't needed (e.g., for type==global)
+ */
+ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ vmx_instruction_info, false, &gva))
+ return 1;
+ if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+
+ switch (type) {
+ case VMX_EPT_EXTENT_GLOBAL:
+ /*
+ * TODO: track mappings and invalidate
+ * single context requests appropriately
+ */
+ case VMX_EPT_EXTENT_CONTEXT:
+ kvm_mmu_sync_roots(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ nested_vmx_succeed(vcpu);
+ break;
+ default:
+ BUG_ON(1);
+ break;
+ }
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int handle_invvpid(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 vmx_instruction_info;
+ unsigned long type, types;
+ gva_t gva;
+ struct x86_exception e;
+ struct {
+ u64 vpid;
+ u64 gla;
+ } operand;
+
+ if (!(vmx->nested.msrs.secondary_ctls_high &
+ SECONDARY_EXEC_ENABLE_VPID) ||
+ !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+ types = (vmx->nested.msrs.vpid_caps &
+ VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
+
+ if (type >= 32 || !(types & (1 << type))) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ /* according to the intel vmx instruction reference, the memory
+ * operand is read even if it isn't needed (e.g., for type==global)
+ */
+ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ vmx_instruction_info, false, &gva))
+ return 1;
+ if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+ if (operand.vpid >> 16) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ switch (type) {
+ case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
+ if (!operand.vpid ||
+ is_noncanonical_address(operand.gla, vcpu)) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ if (cpu_has_vmx_invvpid_individual_addr() &&
+ vmx->nested.vpid02) {
+ __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
+ vmx->nested.vpid02, operand.gla);
+ } else
+ __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+ break;
+ case VMX_VPID_EXTENT_SINGLE_CONTEXT:
+ case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
+ if (!operand.vpid) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+ break;
+ case VMX_VPID_EXTENT_ALL_CONTEXT:
+ __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ nested_vmx_succeed(vcpu);
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int handle_invpcid(struct kvm_vcpu *vcpu)
+{
+ u32 vmx_instruction_info;
+ unsigned long type;
+ bool pcid_enabled;
+ gva_t gva;
+ struct x86_exception e;
+ unsigned i;
+ unsigned long roots_to_free = 0;
+ struct {
+ u64 pcid;
+ u64 gla;
+ } operand;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+ if (type > 3) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ /* According to the Intel instruction reference, the memory operand
+ * is read even if it isn't needed (e.g., for type==all)
+ */
+ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ vmx_instruction_info, false, &gva))
+ return 1;
+
+ if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+
+ if (operand.pcid >> 12 != 0) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
+
+ switch (type) {
+ case INVPCID_TYPE_INDIV_ADDR:
+ if ((!pcid_enabled && (operand.pcid != 0)) ||
+ is_noncanonical_address(operand.gla, vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
+ return kvm_skip_emulated_instruction(vcpu);
+
+ case INVPCID_TYPE_SINGLE_CTXT:
+ if (!pcid_enabled && (operand.pcid != 0)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ if (kvm_get_active_pcid(vcpu) == operand.pcid) {
+ kvm_mmu_sync_roots(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ }
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_roots[i].cr3)
+ == operand.pcid)
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+ kvm_mmu_free_roots(vcpu, roots_to_free);
+ /*
+ * If neither the current cr3 nor any of the prev_roots use the
+ * given PCID, then nothing needs to be done here because a
+ * resync will happen anyway before switching to any other CR3.
+ */
+
+ return kvm_skip_emulated_instruction(vcpu);
+
+ case INVPCID_TYPE_ALL_NON_GLOBAL:
+ /*
+ * Currently, KVM doesn't mark global entries in the shadow
+ * page tables, so a non-global flush just degenerates to a
+ * global flush. If needed, we could optimize this later by
+ * keeping track of global entries in shadow page tables.
+ */
+
+ /* fall-through */
+ case INVPCID_TYPE_ALL_INCL_GLOBAL:
+ kvm_mmu_unload(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+
+ default:
+ BUG(); /* We have already checked above that type <= 3 */
+ }
+}
+
+static int handle_pml_full(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+
+ trace_kvm_pml_full(vcpu->vcpu_id);
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ /*
+ * PML buffer FULL happened while executing iret from NMI,
+ * "blocked by NMI" bit has to be set before next VM entry.
+ */
+ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ enable_vnmi &&
+ (exit_qualification & INTR_INFO_UNBLOCK_NMI))
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+
+ /*
+ * PML buffer already flushed at beginning of VMEXIT. Nothing to do
+ * here.., and there's no userspace involvement needed for PML.
+ */
+ return 1;
+}
+
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+ if (!to_vmx(vcpu)->req_immediate_exit)
+ kvm_lapic_expired_hv_timer(vcpu);
+ return 1;
+}
+
+static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+ /* Check for memory type validity */
+ switch (address & VMX_EPTP_MT_MASK) {
+ case VMX_EPTP_MT_UC:
+ if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))
+ return false;
+ break;
+ case VMX_EPTP_MT_WB:
+ if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+ /* only 4 levels page-walk length are valid */
+ if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)
+ return false;
+
+ /* Reserved bits should not be set */
+ if (address >> maxphyaddr || ((address >> 7) & 0x1f))
+ return false;
+
+ /* AD, if set, should be supported */
+ if (address & VMX_EPTP_AD_ENABLE_BIT) {
+ if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))
+ return false;
+ }
+
+ return true;
+}
+
+static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
+ u64 address;
+ bool accessed_dirty;
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+ if (!nested_cpu_has_eptp_switching(vmcs12) ||
+ !nested_cpu_has_ept(vmcs12))
+ return 1;
+
+ if (index >= VMFUNC_EPTP_ENTRIES)
+ return 1;
+
+
+ if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
+ &address, index * 8, 8))
+ return 1;
+
+ accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);
+
+ /*
+ * If the (L2) guest does a vmfunc to the currently
+ * active ept pointer, we don't have to do anything else
+ */
+ if (vmcs12->ept_pointer != address) {
+ if (!valid_ept_address(vcpu, address))
+ return 1;
+
+ kvm_mmu_unload(vcpu);
+ mmu->ept_ad = accessed_dirty;
+ mmu->base_role.ad_disabled = !accessed_dirty;
+ vmcs12->ept_pointer = address;
+ /*
+ * TODO: Check what's the correct approach in case
+ * mmu reload fails. Currently, we just let the next
+ * reload potentially fail
+ */
+ kvm_mmu_reload(vcpu);
+ }
+
+ return 0;
+}
+
+static int handle_vmfunc(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12;
+ u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
+
+ /*
+ * VMFUNC is only supported for nested guests, but we always enable the
+ * secondary control for simplicity; for non-nested mode, fake that we
+ * didn't by injecting #UD.
+ */
+ if (!is_guest_mode(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ vmcs12 = get_vmcs12(vcpu);
+ if ((vmcs12->vm_function_control & (1 << function)) == 0)
+ goto fail;
+
+ switch (function) {
+ case 0:
+ if (nested_vmx_eptp_switching(vcpu, vmcs12))
+ goto fail;
+ break;
+ default:
+ goto fail;
+ }
+ return kvm_skip_emulated_instruction(vcpu);
+
+fail:
+ nested_vmx_vmexit(vcpu, vmx->exit_reason,
+ vmcs_read32(VM_EXIT_INTR_INFO),
+ vmcs_readl(EXIT_QUALIFICATION));
+ return 1;
+}
+
+static int handle_encls(struct kvm_vcpu *vcpu)
+{
+ /*
+ * SGX virtualization is not yet supported. There is no software
+ * enable bit for SGX, so we have to trap ENCLS and inject a #UD
+ * to prevent the guest from executing ENCLS.
+ */
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
+/*
+ * The exit handlers return 1 if the exit was handled fully and guest execution
+ * may resume. Otherwise they set the kvm_run parameter to indicate what needs
+ * to be done to userspace and return 0.
+ */
+static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
+ [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
+ [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
+ [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
+ [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
+ [EXIT_REASON_IO_INSTRUCTION] = handle_io,
+ [EXIT_REASON_CR_ACCESS] = handle_cr,
+ [EXIT_REASON_DR_ACCESS] = handle_dr,
+ [EXIT_REASON_CPUID] = handle_cpuid,
+ [EXIT_REASON_MSR_READ] = handle_rdmsr,
+ [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
+ [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
+ [EXIT_REASON_HLT] = handle_halt,
+ [EXIT_REASON_INVD] = handle_invd,
+ [EXIT_REASON_INVLPG] = handle_invlpg,
+ [EXIT_REASON_RDPMC] = handle_rdpmc,
+ [EXIT_REASON_VMCALL] = handle_vmcall,
+ [EXIT_REASON_VMCLEAR] = handle_vmclear,
+ [EXIT_REASON_VMLAUNCH] = handle_vmlaunch,
+ [EXIT_REASON_VMPTRLD] = handle_vmptrld,
+ [EXIT_REASON_VMPTRST] = handle_vmptrst,
+ [EXIT_REASON_VMREAD] = handle_vmread,
+ [EXIT_REASON_VMRESUME] = handle_vmresume,
+ [EXIT_REASON_VMWRITE] = handle_vmwrite,
+ [EXIT_REASON_VMOFF] = handle_vmoff,
+ [EXIT_REASON_VMON] = handle_vmon,
+ [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
+ [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
+ [EXIT_REASON_APIC_WRITE] = handle_apic_write,
+ [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
+ [EXIT_REASON_WBINVD] = handle_wbinvd,
+ [EXIT_REASON_XSETBV] = handle_xsetbv,
+ [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
+ [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
+ [EXIT_REASON_GDTR_IDTR] = handle_desc,
+ [EXIT_REASON_LDTR_TR] = handle_desc,
+ [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
+ [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
+ [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
+ [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait,
+ [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
+ [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor,
+ [EXIT_REASON_INVEPT] = handle_invept,
+ [EXIT_REASON_INVVPID] = handle_invvpid,
+ [EXIT_REASON_RDRAND] = handle_invalid_op,
+ [EXIT_REASON_RDSEED] = handle_invalid_op,
+ [EXIT_REASON_XSAVES] = handle_xsaves,
+ [EXIT_REASON_XRSTORS] = handle_xrstors,
+ [EXIT_REASON_PML_FULL] = handle_pml_full,
+ [EXIT_REASON_INVPCID] = handle_invpcid,
+ [EXIT_REASON_VMFUNC] = handle_vmfunc,
+ [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
+ [EXIT_REASON_ENCLS] = handle_encls,
+};
+
+static const int kvm_vmx_max_exit_handlers =
+ ARRAY_SIZE(kvm_vmx_exit_handlers);
+
+/*
+ * Return true if an IO instruction with the specified port and size should cause
+ * a VM-exit into L1.
+ */
+bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
+ int size)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ gpa_t bitmap, last_bitmap;
+ u8 b;
+
+ last_bitmap = (gpa_t)-1;
+ b = -1;
+
+ while (size > 0) {
+ if (port < 0x8000)
+ bitmap = vmcs12->io_bitmap_a;
+ else if (port < 0x10000)
+ bitmap = vmcs12->io_bitmap_b;
+ else
+ return true;
+ bitmap += (port & 0x7fff) / 8;
+
+ if (last_bitmap != bitmap)
+ if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
+ return true;
+ if (b & (1 << (port & 7)))
+ return true;
+
+ port++;
+ size--;
+ last_bitmap = bitmap;
+ }
+
+ return false;
+}
+
+/*
+ * Return 1 if we should exit from L2 to L1 to handle an MSR access access,
+ * rather than handle it ourselves in L0. I.e., check whether L1 expressed
+ * disinterest in the current event (read or write a specific MSR) by using an
+ * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
+ */
+static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12, u32 exit_reason)
+{
+ u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
+ gpa_t bitmap;
+
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+ return true;
+
+ /*
+ * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
+ * for the four combinations of read/write and low/high MSR numbers.
+ * First we need to figure out which of the four to use:
+ */
+ bitmap = vmcs12->msr_bitmap;
+ if (exit_reason == EXIT_REASON_MSR_WRITE)
+ bitmap += 2048;
+ if (msr_index >= 0xc0000000) {
+ msr_index -= 0xc0000000;
+ bitmap += 1024;
+ }
+
+ /* Then read the msr_index'th bit from this bitmap: */
+ if (msr_index < 1024*8) {
+ unsigned char b;
+ if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
+ return true;
+ return 1 & (b >> (msr_index & 7));
+ } else
+ return true; /* let L1 handle the wrong parameter */
+}
+
+/*
+ * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
+ * rather than handle it ourselves in L0. I.e., check if L1 wanted to
+ * intercept (via guest_host_mask etc.) the current event.
+ */
+static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ int cr = exit_qualification & 15;
+ int reg;
+ unsigned long val;
+
+ switch ((exit_qualification >> 4) & 3) {
+ case 0: /* mov to cr */
+ reg = (exit_qualification >> 8) & 15;
+ val = kvm_register_readl(vcpu, reg);
+ switch (cr) {
+ case 0:
+ if (vmcs12->cr0_guest_host_mask &
+ (val ^ vmcs12->cr0_read_shadow))
+ return true;
+ break;
+ case 3:
+ if ((vmcs12->cr3_target_count >= 1 &&
+ vmcs12->cr3_target_value0 == val) ||
+ (vmcs12->cr3_target_count >= 2 &&
+ vmcs12->cr3_target_value1 == val) ||
+ (vmcs12->cr3_target_count >= 3 &&
+ vmcs12->cr3_target_value2 == val) ||
+ (vmcs12->cr3_target_count >= 4 &&
+ vmcs12->cr3_target_value3 == val))
+ return false;
+ if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
+ return true;
+ break;
+ case 4:
+ if (vmcs12->cr4_guest_host_mask &
+ (vmcs12->cr4_read_shadow ^ val))
+ return true;
+ break;
+ case 8:
+ if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
+ return true;
+ break;
+ }
+ break;
+ case 2: /* clts */
+ if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
+ (vmcs12->cr0_read_shadow & X86_CR0_TS))
+ return true;
+ break;
+ case 1: /* mov from cr */
+ switch (cr) {
+ case 3:
+ if (vmcs12->cpu_based_vm_exec_control &
+ CPU_BASED_CR3_STORE_EXITING)
+ return true;
+ break;
+ case 8:
+ if (vmcs12->cpu_based_vm_exec_control &
+ CPU_BASED_CR8_STORE_EXITING)
+ return true;
+ break;
+ }
+ break;
+ case 3: /* lmsw */
+ /*
+ * lmsw can change bits 1..3 of cr0, and only set bit 0 of
+ * cr0. Other attempted changes are ignored, with no exit.
+ */
+ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
+ if (vmcs12->cr0_guest_host_mask & 0xe &
+ (val ^ vmcs12->cr0_read_shadow))
+ return true;
+ if ((vmcs12->cr0_guest_host_mask & 0x1) &&
+ !(vmcs12->cr0_read_shadow & 0x1) &&
+ (val & 0x1))
+ return true;
+ break;
+ }
+ return false;
+}
+
+static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12, gpa_t bitmap)
+{
+ u32 vmx_instruction_info;
+ unsigned long field;
+ u8 b;
+
+ if (!nested_cpu_has_shadow_vmcs(vmcs12))
+ return true;
+
+ /* Decode instruction info and find the field to access */
+ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+
+ /* Out-of-range fields always cause a VM exit from L2 to L1 */
+ if (field >> 15)
+ return true;
+
+ if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
+ return true;
+
+ return 1 & (b >> (field & 7));
+}
+
+/*
+ * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
+ * should handle it ourselves in L0 (and then continue L2). Only call this
+ * when in is_guest_mode (L2).
+ */
+static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
+{
+ u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ if (vmx->nested.nested_run_pending)
+ return false;
+
+ if (unlikely(vmx->fail)) {
+ pr_info_ratelimited("%s failed vm entry %x\n", __func__,
+ vmcs_read32(VM_INSTRUCTION_ERROR));
+ return true;
+ }
+
+ /*
+ * The host physical addresses of some pages of guest memory
+ * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
+ * Page). The CPU may write to these pages via their host
+ * physical address while L2 is running, bypassing any
+ * address-translation-based dirty tracking (e.g. EPT write
+ * protection).
+ *
+ * Mark them dirty on every exit from L2 to prevent them from
+ * getting out of sync with dirty tracking.
+ */
+ nested_mark_vmcs12_pages_dirty(vcpu);
+
+ trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
+ vmcs_readl(EXIT_QUALIFICATION),
+ vmx->idt_vectoring_info,
+ intr_info,
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
+ KVM_ISA_VMX);
+
+ switch ((u16)exit_reason) {
+ case EXIT_REASON_EXCEPTION_NMI:
+ if (is_nmi(intr_info))
+ return false;
+ else if (is_page_fault(intr_info))
+ return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
+ else if (is_no_device(intr_info) &&
+ !(vmcs12->guest_cr0 & X86_CR0_TS))
+ return false;
+ else if (is_debug(intr_info) &&
+ vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+ return false;
+ else if (is_breakpoint(intr_info) &&
+ vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+ return false;
+ return vmcs12->exception_bitmap &
+ (1u << (intr_info & INTR_INFO_VECTOR_MASK));
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ return false;
+ case EXIT_REASON_TRIPLE_FAULT:
+ return true;
+ case EXIT_REASON_PENDING_INTERRUPT:
+ return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
+ case EXIT_REASON_NMI_WINDOW:
+ return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
+ case EXIT_REASON_TASK_SWITCH:
+ return true;
+ case EXIT_REASON_CPUID:
+ return true;
+ case EXIT_REASON_HLT:
+ return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
+ case EXIT_REASON_INVD:
+ return true;
+ case EXIT_REASON_INVLPG:
+ return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
+ case EXIT_REASON_RDPMC:
+ return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
+ case EXIT_REASON_RDRAND:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
+ case EXIT_REASON_RDSEED:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
+ case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
+ return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
+ case EXIT_REASON_VMREAD:
+ return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
+ vmcs12->vmread_bitmap);
+ case EXIT_REASON_VMWRITE:
+ return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
+ vmcs12->vmwrite_bitmap);
+ case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
+ case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
+ case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
+ case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
+ case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
+ /*
+ * VMX instructions trap unconditionally. This allows L1 to
+ * emulate them for its L2 guest, i.e., allows 3-level nesting!
+ */
+ return true;
+ case EXIT_REASON_CR_ACCESS:
+ return nested_vmx_exit_handled_cr(vcpu, vmcs12);
+ case EXIT_REASON_DR_ACCESS:
+ return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
+ case EXIT_REASON_IO_INSTRUCTION:
+ return nested_vmx_exit_handled_io(vcpu, vmcs12);
+ case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
+ case EXIT_REASON_MSR_READ:
+ case EXIT_REASON_MSR_WRITE:
+ return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
+ case EXIT_REASON_INVALID_STATE:
+ return true;
+ case EXIT_REASON_MWAIT_INSTRUCTION:
+ return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
+ case EXIT_REASON_MONITOR_TRAP_FLAG:
+ return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
+ case EXIT_REASON_MONITOR_INSTRUCTION:
+ return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
+ case EXIT_REASON_PAUSE_INSTRUCTION:
+ return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
+ nested_cpu_has2(vmcs12,
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING);
+ case EXIT_REASON_MCE_DURING_VMENTRY:
+ return false;
+ case EXIT_REASON_TPR_BELOW_THRESHOLD:
+ return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
+ case EXIT_REASON_APIC_ACCESS:
+ case EXIT_REASON_APIC_WRITE:
+ case EXIT_REASON_EOI_INDUCED:
+ /*
+ * The controls for "virtualize APIC accesses," "APIC-
+ * register virtualization," and "virtual-interrupt
+ * delivery" only come from vmcs12.
+ */
+ return true;
+ case EXIT_REASON_EPT_VIOLATION:
+ /*
+ * L0 always deals with the EPT violation. If nested EPT is
+ * used, and the nested mmu code discovers that the address is
+ * missing in the guest EPT table (EPT12), the EPT violation
+ * will be injected with nested_ept_inject_page_fault()
+ */
+ return false;
+ case EXIT_REASON_EPT_MISCONFIG:
+ /*
+ * L2 never uses directly L1's EPT, but rather L0's own EPT
+ * table (shadow on EPT) or a merged EPT table that L0 built
+ * (EPT on EPT). So any problems with the structure of the
+ * table is L0's fault.
+ */
+ return false;
+ case EXIT_REASON_INVPCID:
+ return
+ nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
+ nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
+ case EXIT_REASON_WBINVD:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
+ case EXIT_REASON_XSETBV:
+ return true;
+ case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
+ /*
+ * This should never happen, since it is not possible to
+ * set XSS to a non-zero value---neither in L1 nor in L2.
+ * If if it were, XSS would have to be checked against
+ * the XSS exit bitmap in vmcs12.
+ */
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
+ case EXIT_REASON_PREEMPTION_TIMER:
+ return false;
+ case EXIT_REASON_PML_FULL:
+ /* We emulate PML support to L1. */
+ return false;
+ case EXIT_REASON_VMFUNC:
+ /* VM functions are emulated through L2->L0 vmexits. */
+ return false;
+ case EXIT_REASON_ENCLS:
+ /* SGX is never exposed to L1 */
+ return false;
+ default:
+ return true;
+ }
+}
+
+static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
+{
+ u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ /*
+ * At this point, the exit interruption info in exit_intr_info
+ * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT
+ * we need to query the in-kernel LAPIC.
+ */
+ WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
+ if ((exit_intr_info &
+ (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
+ (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ vmcs12->vm_exit_intr_error_code =
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ }
+
+ nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
+ vmcs_readl(EXIT_QUALIFICATION));
+ return 1;
+}
+
+static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+{
+ *info1 = vmcs_readl(EXIT_QUALIFICATION);
+ *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
+}
+
+static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
+{
+ if (vmx->pml_pg) {
+ __free_page(vmx->pml_pg);
+ vmx->pml_pg = NULL;
+ }
+}
+
+static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 *pml_buf;
+ u16 pml_idx;
+
+ pml_idx = vmcs_read16(GUEST_PML_INDEX);
+
+ /* Do nothing if PML buffer is empty */
+ if (pml_idx == (PML_ENTITY_NUM - 1))
+ return;
+
+ /* PML index always points to next available PML buffer entity */
+ if (pml_idx >= PML_ENTITY_NUM)
+ pml_idx = 0;
+ else
+ pml_idx++;
+
+ pml_buf = page_address(vmx->pml_pg);
+ for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
+ u64 gpa;
+
+ gpa = pml_buf[pml_idx];
+ WARN_ON(gpa & (PAGE_SIZE - 1));
+ kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
+ }
+
+ /* reset PML index */
+ vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+}
+
+/*
+ * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
+ * Called before reporting dirty_bitmap to userspace.
+ */
+static void kvm_flush_pml_buffers(struct kvm *kvm)
+{
+ int i;
+ struct kvm_vcpu *vcpu;
+ /*
+ * We only need to kick vcpu out of guest mode here, as PML buffer
+ * is flushed at beginning of all VMEXITs, and it's obvious that only
+ * vcpus running in guest are possible to have unflushed GPAs in PML
+ * buffer.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_vcpu_kick(vcpu);
+}
+
+static void vmx_dump_sel(char *name, uint32_t sel)
+{
+ pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
+ name, vmcs_read16(sel),
+ vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
+ vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
+ vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
+}
+
+static void vmx_dump_dtsel(char *name, uint32_t limit)
+{
+ pr_err("%s limit=0x%08x, base=0x%016lx\n",
+ name, vmcs_read32(limit),
+ vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
+}
+
+static void dump_vmcs(void)
+{
+ u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
+ u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
+ u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
+ u32 secondary_exec_control = 0;
+ unsigned long cr4 = vmcs_readl(GUEST_CR4);
+ u64 efer = vmcs_read64(GUEST_IA32_EFER);
+ int i, n;
+
+ if (cpu_has_secondary_exec_ctrls())
+ secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+
+ pr_err("*** Guest State ***\n");
+ pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
+ vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
+ vmcs_readl(CR0_GUEST_HOST_MASK));
+ pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
+ cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
+ pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
+ if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
+ (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
+ {
+ pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
+ vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
+ pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
+ vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
+ }
+ pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
+ vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
+ pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
+ vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
+ pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
+ vmcs_readl(GUEST_SYSENTER_ESP),
+ vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
+ vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
+ vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
+ vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
+ vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
+ vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
+ vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
+ vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
+ vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
+ vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
+ vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
+ if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
+ (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
+ pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
+ efer, vmcs_read64(GUEST_IA32_PAT));
+ pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
+ vmcs_read64(GUEST_IA32_DEBUGCTL),
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
+ if (cpu_has_load_perf_global_ctrl &&
+ vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
+ pr_err("PerfGlobCtl = 0x%016llx\n",
+ vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
+ if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
+ pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
+ pr_err("Interruptibility = %08x ActivityState = %08x\n",
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
+ vmcs_read32(GUEST_ACTIVITY_STATE));
+ if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
+ pr_err("InterruptStatus = %04x\n",
+ vmcs_read16(GUEST_INTR_STATUS));
+
+ pr_err("*** Host State ***\n");
+ pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
+ vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
+ pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
+ vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
+ vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
+ vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
+ vmcs_read16(HOST_TR_SELECTOR));
+ pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
+ vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
+ vmcs_readl(HOST_TR_BASE));
+ pr_err("GDTBase=%016lx IDTBase=%016lx\n",
+ vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
+ pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
+ vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
+ vmcs_readl(HOST_CR4));
+ pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
+ vmcs_readl(HOST_IA32_SYSENTER_ESP),
+ vmcs_read32(HOST_IA32_SYSENTER_CS),
+ vmcs_readl(HOST_IA32_SYSENTER_EIP));
+ if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
+ pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
+ vmcs_read64(HOST_IA32_EFER),
+ vmcs_read64(HOST_IA32_PAT));
+ if (cpu_has_load_perf_global_ctrl &&
+ vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+ pr_err("PerfGlobCtl = 0x%016llx\n",
+ vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
+
+ pr_err("*** Control State ***\n");
+ pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
+ pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
+ pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
+ pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
+ vmcs_read32(EXCEPTION_BITMAP),
+ vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
+ vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
+ pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
+ vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
+ vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
+ vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
+ pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
+ vmcs_read32(VM_EXIT_INTR_INFO),
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
+ pr_err(" reason=%08x qualification=%016lx\n",
+ vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
+ pr_err("IDTVectoring: info=%08x errcode=%08x\n",
+ vmcs_read32(IDT_VECTORING_INFO_FIELD),
+ vmcs_read32(IDT_VECTORING_ERROR_CODE));
+ pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
+ if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
+ pr_err("TSC Multiplier = 0x%016llx\n",
+ vmcs_read64(TSC_MULTIPLIER));
+ if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
+ pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
+ if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
+ pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
+ if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
+ pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
+ n = vmcs_read32(CR3_TARGET_COUNT);
+ for (i = 0; i + 1 < n; i += 4)
+ pr_err("CR3 target%u=%016lx target%u=%016lx\n",
+ i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
+ i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
+ if (i < n)
+ pr_err("CR3 target%u=%016lx\n",
+ i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
+ if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
+ pr_err("PLE Gap=%08x Window=%08x\n",
+ vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
+ if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
+ pr_err("Virtual processor ID = 0x%04x\n",
+ vmcs_read16(VIRTUAL_PROCESSOR_ID));
+}
+
+/*
+ * The guest has exited. See if we can fix it or if we need userspace
+ * assistance.
+ */
+static int vmx_handle_exit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 exit_reason = vmx->exit_reason;
+ u32 vectoring_info = vmx->idt_vectoring_info;
+
+ trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
+
+ /*
+ * Flush logged GPAs PML buffer, this will make dirty_bitmap more
+ * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
+ * querying dirty_bitmap, we only need to kick all vcpus out of guest
+ * mode as if vcpus is in root mode, the PML buffer must has been
+ * flushed already.
+ */
+ if (enable_pml)
+ vmx_flush_pml_buffer(vcpu);
+
+ /* If guest state is invalid, start emulating */
+ if (vmx->emulation_required)
+ return handle_invalid_guest_state(vcpu);
+
+ if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
+ return nested_vmx_reflect_vmexit(vcpu, exit_reason);
+
+ if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
+ dump_vmcs();
+ vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ vcpu->run->fail_entry.hardware_entry_failure_reason
+ = exit_reason;
+ return 0;
+ }
+
+ if (unlikely(vmx->fail)) {
+ vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ vcpu->run->fail_entry.hardware_entry_failure_reason
+ = vmcs_read32(VM_INSTRUCTION_ERROR);
+ return 0;
+ }
+
+ /*
+ * Note:
+ * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by
+ * delivery event since it indicates guest is accessing MMIO.
+ * The vm-exit can be triggered again after return to guest that
+ * will cause infinite loop.
+ */
+ if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
+ exit_reason != EXIT_REASON_EPT_VIOLATION &&
+ exit_reason != EXIT_REASON_PML_FULL &&
+ exit_reason != EXIT_REASON_APIC_ACCESS &&
+ exit_reason != EXIT_REASON_TASK_SWITCH)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
+ vcpu->run->internal.ndata = 3;
+ vcpu->run->internal.data[0] = vectoring_info;
+ vcpu->run->internal.data[1] = exit_reason;
+ vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
+ if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
+ vcpu->run->internal.ndata++;
+ vcpu->run->internal.data[3] =
+ vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+ }
+ return 0;
+ }
+
+ if (unlikely(!enable_vnmi &&
+ vmx->loaded_vmcs->soft_vnmi_blocked)) {
+ if (vmx_interrupt_allowed(vcpu)) {
+ vmx->loaded_vmcs->soft_vnmi_blocked = 0;
+ } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
+ vcpu->arch.nmi_pending) {
+ /*
+ * This CPU don't support us in finding the end of an
+ * NMI-blocked window if the guest runs with IRQs
+ * disabled. So we pull the trigger after 1 s of
+ * futile waiting, but inform the user about this.
+ */
+ printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
+ "state on VCPU %d after 1 s timeout\n",
+ __func__, vcpu->vcpu_id);
+ vmx->loaded_vmcs->soft_vnmi_blocked = 0;
+ }
+ }
+
+ if (exit_reason < kvm_vmx_max_exit_handlers
+ && kvm_vmx_exit_handlers[exit_reason])
+ return kvm_vmx_exit_handlers[exit_reason](vcpu);
+ else {
+ vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
+ exit_reason);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+}
+
+/*
+ * Software based L1D cache flush which is used when microcode providing
+ * the cache control MSR is not loaded.
+ *
+ * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
+ * flush it is required to read in 64 KiB because the replacement algorithm
+ * is not exactly LRU. This could be sized at runtime via topology
+ * information but as all relevant affected CPUs have 32KiB L1D cache size
+ * there is no point in doing so.
+ */
+static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+{
+ int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+ /*
+ * This code is only executed when the the flush mode is 'cond' or
+ * 'always'
+ */
+ if (static_branch_likely(&vmx_l1d_flush_cond)) {
+ bool flush_l1d;
+
+ /*
+ * Clear the per-vcpu flush bit, it gets set again
+ * either from vcpu_run() or from one of the unsafe
+ * VMEXIT handlers.
+ */
+ flush_l1d = vcpu->arch.l1tf_flush_l1d;
+ vcpu->arch.l1tf_flush_l1d = false;
+
+ /*
+ * Clear the per-cpu flush bit, it gets set again from
+ * the interrupt handlers.
+ */
+ flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
+ kvm_clear_cpu_l1tf_flush_l1d();
+
+ if (!flush_l1d)
+ return;
+ }
+
+ vcpu->stat.l1d_flush++;
+
+ if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ return;
+ }
+
+ asm volatile(
+ /* First ensure the pages are in the TLB */
+ "xorl %%eax, %%eax\n"
+ ".Lpopulate_tlb:\n\t"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $4096, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lpopulate_tlb\n\t"
+ "xorl %%eax, %%eax\n\t"
+ "cpuid\n\t"
+ /* Now fill the cache */
+ "xorl %%eax, %%eax\n"
+ ".Lfill_cache:\n"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $64, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lfill_cache\n\t"
+ "lfence\n"
+ :: [flush_pages] "r" (vmx_l1d_flush_pages),
+ [size] "r" (size)
+ : "eax", "ebx", "ecx", "edx");
+}
+
+static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ if (is_guest_mode(vcpu) &&
+ nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
+ return;
+
+ if (irr == -1 || tpr < irr) {
+ vmcs_write32(TPR_THRESHOLD, 0);
+ return;
+ }
+
+ vmcs_write32(TPR_THRESHOLD, irr);
+}
+
+static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
+{
+ u32 sec_exec_control;
+
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ if (!flexpriority_enabled &&
+ !cpu_has_vmx_virtualize_x2apic_mode())
+ return;
+
+ /* Postpone execution until vmcs01 is the current VMCS. */
+ if (is_guest_mode(vcpu)) {
+ to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true;
+ return;
+ }
+
+ sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
+
+ switch (kvm_get_apic_mode(vcpu)) {
+ case LAPIC_MODE_INVALID:
+ WARN_ONCE(true, "Invalid local APIC state");
+ case LAPIC_MODE_DISABLED:
+ break;
+ case LAPIC_MODE_XAPIC:
+ if (flexpriority_enabled) {
+ sec_exec_control |=
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ vmx_flush_tlb(vcpu, true);
+ }
+ break;
+ case LAPIC_MODE_X2APIC:
+ if (cpu_has_vmx_virtualize_x2apic_mode())
+ sec_exec_control |=
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+ break;
+ }
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
+
+ vmx_update_msr_bitmap(vcpu);
+}
+
+static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
+{
+ if (!is_guest_mode(vcpu)) {
+ vmcs_write64(APIC_ACCESS_ADDR, hpa);
+ vmx_flush_tlb(vcpu, true);
+ }
+}
+
+static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+{
+ u16 status;
+ u8 old;
+
+ if (max_isr == -1)
+ max_isr = 0;
+
+ status = vmcs_read16(GUEST_INTR_STATUS);
+ old = status >> 8;
+ if (max_isr != old) {
+ status &= 0xff;
+ status |= max_isr << 8;
+ vmcs_write16(GUEST_INTR_STATUS, status);
+ }
+}
+
+static void vmx_set_rvi(int vector)
+{
+ u16 status;
+ u8 old;
+
+ if (vector == -1)
+ vector = 0;
+
+ status = vmcs_read16(GUEST_INTR_STATUS);
+ old = (u8)status & 0xff;
+ if ((u8)vector != old) {
+ status &= ~0xff;
+ status |= (u8)vector;
+ vmcs_write16(GUEST_INTR_STATUS, status);
+ }
+}
+
+static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+{
+ /*
+ * When running L2, updating RVI is only relevant when
+ * vmcs12 virtual-interrupt-delivery enabled.
+ * However, it can be enabled only when L1 also
+ * intercepts external-interrupts and in that case
+ * we should not update vmcs02 RVI but instead intercept
+ * interrupt. Therefore, do nothing when running L2.
+ */
+ if (!is_guest_mode(vcpu))
+ vmx_set_rvi(max_irr);
+}
+
+static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int max_irr;
+ bool max_irr_updated;
+
+ WARN_ON(!vcpu->arch.apicv_active);
+ if (pi_test_on(&vmx->pi_desc)) {
+ pi_clear_on(&vmx->pi_desc);
+ /*
+ * IOMMU can write to PIR.ON, so the barrier matters even on UP.
+ * But on x86 this is just a compiler barrier anyway.
+ */
+ smp_mb__after_atomic();
+ max_irr_updated =
+ kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
+
+ /*
+ * If we are running L2 and L1 has a new pending interrupt
+ * which can be injected, we should re-evaluate
+ * what should be done with this new L1 interrupt.
+ * If L1 intercepts external-interrupts, we should
+ * exit from L2 to L1. Otherwise, interrupt should be
+ * delivered directly to L2.
+ */
+ if (is_guest_mode(vcpu) && max_irr_updated) {
+ if (nested_exit_on_intr(vcpu))
+ kvm_vcpu_exiting_guest_mode(vcpu);
+ else
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
+ } else {
+ max_irr = kvm_lapic_find_highest_irr(vcpu);
+ }
+ vmx_hwapic_irr_update(vcpu, max_irr);
+ return max_irr;
+}
+
+static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
+{
+ u8 rvi = vmx_get_rvi();
+ u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
+
+ return ((rvi & 0xf0) > (vppr & 0xf0));
+}
+
+static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+ return pi_test_on(vcpu_to_pi_desc(vcpu));
+}
+
+static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
+ vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
+ vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
+ vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
+}
+
+static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ pi_clear_on(&vmx->pi_desc);
+ memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
+}
+
+static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
+{
+ if (vmx->exit_reason != EXIT_REASON_EXCEPTION_NMI)
+ return;
+
+ vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ /* if exit due to PF check for async PF */
+ if (is_page_fault(vmx->exit_intr_info))
+ vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
+
+ /* Handle machine checks before interrupts are enabled */
+ if (is_machine_check(vmx->exit_intr_info))
+ kvm_machine_check();
+
+ /* We need to handle NMIs before interrupts are enabled */
+ if (is_nmi(vmx->exit_intr_info)) {
+ kvm_before_interrupt(&vmx->vcpu);
+ asm("int $2");
+ kvm_after_interrupt(&vmx->vcpu);
+ }
+}
+
+static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
+{
+ u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
+ == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
+ unsigned int vector;
+ unsigned long entry;
+ gate_desc *desc;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+#ifdef CONFIG_X86_64
+ unsigned long tmp;
+#endif
+
+ vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
+ desc = (gate_desc *)vmx->host_idt_base + vector;
+ entry = gate_offset(desc);
+ asm volatile(
+#ifdef CONFIG_X86_64
+ "mov %%" _ASM_SP ", %[sp]\n\t"
+ "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
+ "push $%c[ss]\n\t"
+ "push %[sp]\n\t"
+#endif
+ "pushf\n\t"
+ __ASM_SIZE(push) " $%c[cs]\n\t"
+ CALL_NOSPEC
+ :
+#ifdef CONFIG_X86_64
+ [sp]"=&r"(tmp),
+#endif
+ ASM_CALL_CONSTRAINT
+ :
+ THUNK_TARGET(entry),
+ [ss]"i"(__KERNEL_DS),
+ [cs]"i"(__KERNEL_CS)
+ );
+ }
+}
+STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
+
+static bool vmx_has_emulated_msr(int index)
+{
+ switch (index) {
+ case MSR_IA32_SMBASE:
+ /*
+ * We cannot do SMM unless we can run the guest in big
+ * real mode.
+ */
+ return enable_unrestricted_guest || emulate_invalid_guest_state;
+ case MSR_AMD64_VIRT_SPEC_CTRL:
+ /* This is AMD only. */
+ return false;
+ default:
+ return true;
+ }
+}
+
+static bool vmx_mpx_supported(void)
+{
+ return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
+ (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
+}
+
+static bool vmx_xsaves_supported(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_XSAVES;
+}
+
+static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
+{
+ u32 exit_intr_info;
+ bool unblock_nmi;
+ u8 vector;
+ bool idtv_info_valid;
+
+ idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+
+ if (enable_vnmi) {
+ if (vmx->loaded_vmcs->nmi_known_unmasked)
+ return;
+ /*
+ * Can't use vmx->exit_intr_info since we're not sure what
+ * the exit reason is.
+ */
+ exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
+ vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
+ /*
+ * SDM 3: 27.7.1.2 (September 2008)
+ * Re-set bit "block by NMI" before VM entry if vmexit caused by
+ * a guest IRET fault.
+ * SDM 3: 23.2.2 (September 2008)
+ * Bit 12 is undefined in any of the following cases:
+ * If the VM exit sets the valid bit in the IDT-vectoring
+ * information field.
+ * If the VM exit is due to a double fault.
+ */
+ if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
+ vector != DF_VECTOR && !idtv_info_valid)
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ else
+ vmx->loaded_vmcs->nmi_known_unmasked =
+ !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
+ & GUEST_INTR_STATE_NMI);
+ } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
+ vmx->loaded_vmcs->vnmi_blocked_time +=
+ ktime_to_ns(ktime_sub(ktime_get(),
+ vmx->loaded_vmcs->entry_time));
+}
+
+static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
+ u32 idt_vectoring_info,
+ int instr_len_field,
+ int error_code_field)
+{
+ u8 vector;
+ int type;
+ bool idtv_info_valid;
+
+ idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+
+ vcpu->arch.nmi_injected = false;
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
+
+ if (!idtv_info_valid)
+ return;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
+ type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
+
+ switch (type) {
+ case INTR_TYPE_NMI_INTR:
+ vcpu->arch.nmi_injected = true;
+ /*
+ * SDM 3: 27.7.1.2 (September 2008)
+ * Clear bit "block by NMI" before VM entry if a NMI
+ * delivery faulted.
+ */
+ vmx_set_nmi_mask(vcpu, false);
+ break;
+ case INTR_TYPE_SOFT_EXCEPTION:
+ vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
+ /* fall through */
+ case INTR_TYPE_HARD_EXCEPTION:
+ if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
+ u32 err = vmcs_read32(error_code_field);
+ kvm_requeue_exception_e(vcpu, vector, err);
+ } else
+ kvm_requeue_exception(vcpu, vector);
+ break;
+ case INTR_TYPE_SOFT_INTR:
+ vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
+ /* fall through */
+ case INTR_TYPE_EXT_INTR:
+ kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
+ break;
+ default:
+ break;
+ }
+}
+
+static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
+{
+ __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
+ VM_EXIT_INSTRUCTION_LEN,
+ IDT_VECTORING_ERROR_CODE);
+}
+
+static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
+{
+ __vmx_complete_interrupts(vcpu,
+ vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
+ VM_ENTRY_INSTRUCTION_LEN,
+ VM_ENTRY_EXCEPTION_ERROR_CODE);
+
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
+}
+
+static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
+{
+ int i, nr_msrs;
+ struct perf_guest_switch_msr *msrs;
+
+ msrs = perf_guest_get_msrs(&nr_msrs);
+
+ if (!msrs)
+ return;
+
+ for (i = 0; i < nr_msrs; i++)
+ if (msrs[i].host == msrs[i].guest)
+ clear_atomic_switch_msr(vmx, msrs[i].msr);
+ else
+ add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
+ msrs[i].host, false);
+}
+
+static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val)
+{
+ vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val);
+ if (!vmx->loaded_vmcs->hv_timer_armed)
+ vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
+ vmx->loaded_vmcs->hv_timer_armed = true;
+}
+
+static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 tscl;
+ u32 delta_tsc;
+
+ if (vmx->req_immediate_exit) {
+ vmx_arm_hv_timer(vmx, 0);
+ return;
+ }
+
+ if (vmx->hv_deadline_tsc != -1) {
+ tscl = rdtsc();
+ if (vmx->hv_deadline_tsc > tscl)
+ /* set_hv_timer ensures the delta fits in 32-bits */
+ delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
+ cpu_preemption_timer_multi);
+ else
+ delta_tsc = 0;
+
+ vmx_arm_hv_timer(vmx, delta_tsc);
+ return;
+ }
+
+ if (vmx->loaded_vmcs->hv_timer_armed)
+ vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
+ vmx->loaded_vmcs->hv_timer_armed = false;
+}
+
+static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long cr3, cr4, evmcs_rsp;
+
+ /* Record the guest's net vcpu time for enforced NMI injections. */
+ if (unlikely(!enable_vnmi &&
+ vmx->loaded_vmcs->soft_vnmi_blocked))
+ vmx->loaded_vmcs->entry_time = ktime_get();
+
+ /* Don't enter VMX if guest state is invalid, let the exit handler
+ start emulation until we arrive back to a valid state */
+ if (vmx->emulation_required)
+ return;
+
+ if (vmx->ple_window_dirty) {
+ vmx->ple_window_dirty = false;
+ vmcs_write32(PLE_WINDOW, vmx->ple_window);
+ }
+
+ if (vmx->nested.sync_shadow_vmcs) {
+ copy_vmcs12_to_shadow(vmx);
+ vmx->nested.sync_shadow_vmcs = false;
+ }
+
+ if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+ vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+ if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+ vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+
+ cr3 = __get_current_cr3_fast();
+ if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
+ vmcs_writel(HOST_CR3, cr3);
+ vmx->loaded_vmcs->host_state.cr3 = cr3;
+ }
+
+ cr4 = cr4_read_shadow();
+ if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
+ vmcs_writel(HOST_CR4, cr4);
+ vmx->loaded_vmcs->host_state.cr4 = cr4;
+ }
+
+ /* When single-stepping over STI and MOV SS, we must clear the
+ * corresponding interruptibility bits in the guest state. Otherwise
+ * vmentry fails as it then expects bit 14 (BS) in pending debug
+ * exceptions being set, but that's not correct for the guest debugging
+ * case. */
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ vmx_set_interrupt_shadow(vcpu, 0);
+
+ kvm_load_guest_xcr0(vcpu);
+
+ if (static_cpu_has(X86_FEATURE_PKU) &&
+ kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
+ vcpu->arch.pkru != vmx->host_pkru)
+ __write_pkru(vcpu->arch.pkru);
+
+ atomic_switch_perf_msrs(vmx);
+
+ vmx_update_hv_timer(vcpu);
+
+ /*
+ * If this vCPU has touched SPEC_CTRL, restore the guest's value if
+ * it's non-zero. Since vmentry is serialising on affected CPUs, there
+ * is no need to worry about the conditional branch over the wrmsr
+ * being speculatively taken.
+ */
+ x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
+
+ vmx->__launched = vmx->loaded_vmcs->launched;
+
+ evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
+ (unsigned long)&current_evmcs->host_rsp : 0;
+
+ /* L1D Flush includes CPU buffer clear to mitigate MDS */
+ if (static_branch_unlikely(&vmx_l1d_should_flush))
+ vmx_l1d_flush(vcpu);
+ else if (static_branch_unlikely(&mds_user_clear))
+ mds_clear_cpu_buffers();
+ else if (static_branch_unlikely(&mmio_stale_data_clear) &&
+ kvm_arch_has_assigned_device(vcpu->kvm))
+ mds_clear_cpu_buffers();
+
+ vmx_disable_fb_clear(vmx);
+
+ asm volatile (
+ /* Store host registers */
+ "push %%" _ASM_DX "; push %%" _ASM_BP ";"
+ "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
+ "push %%" _ASM_CX " \n\t"
+ "cmp %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t"
+ "je 1f \n\t"
+ "mov %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t"
+ /* Avoid VMWRITE when Enlightened VMCS is in use */
+ "test %%" _ASM_SI ", %%" _ASM_SI " \n\t"
+ "jz 2f \n\t"
+ "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t"
+ "jmp 1f \n\t"
+ "2: \n\t"
+ __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
+ "1: \n\t"
+ /* Reload cr2 if changed */
+ "mov %c[cr2](%%" _ASM_CX "), %%" _ASM_AX " \n\t"
+ "mov %%cr2, %%" _ASM_DX " \n\t"
+ "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
+ "je 3f \n\t"
+ "mov %%" _ASM_AX", %%cr2 \n\t"
+ "3: \n\t"
+ /* Check if vmlaunch of vmresume is needed */
+ "cmpb $0, %c[launched](%%" _ASM_CX ") \n\t"
+ /* Load guest registers. Don't clobber flags. */
+ "mov %c[rax](%%" _ASM_CX "), %%" _ASM_AX " \n\t"
+ "mov %c[rbx](%%" _ASM_CX "), %%" _ASM_BX " \n\t"
+ "mov %c[rdx](%%" _ASM_CX "), %%" _ASM_DX " \n\t"
+ "mov %c[rsi](%%" _ASM_CX "), %%" _ASM_SI " \n\t"
+ "mov %c[rdi](%%" _ASM_CX "), %%" _ASM_DI " \n\t"
+ "mov %c[rbp](%%" _ASM_CX "), %%" _ASM_BP " \n\t"
+#ifdef CONFIG_X86_64
+ "mov %c[r8](%%" _ASM_CX "), %%r8 \n\t"
+ "mov %c[r9](%%" _ASM_CX "), %%r9 \n\t"
+ "mov %c[r10](%%" _ASM_CX "), %%r10 \n\t"
+ "mov %c[r11](%%" _ASM_CX "), %%r11 \n\t"
+ "mov %c[r12](%%" _ASM_CX "), %%r12 \n\t"
+ "mov %c[r13](%%" _ASM_CX "), %%r13 \n\t"
+ "mov %c[r14](%%" _ASM_CX "), %%r14 \n\t"
+ "mov %c[r15](%%" _ASM_CX "), %%r15 \n\t"
+#endif
+ /* Load guest RCX. This kills the vmx_vcpu pointer! */
+ "mov %c[rcx](%%" _ASM_CX "), %%" _ASM_CX " \n\t"
+
+ /* Enter guest mode */
+ "jne 1f \n\t"
+ __ex(ASM_VMX_VMLAUNCH) "\n\t"
+ "jmp 2f \n\t"
+ "1: " __ex(ASM_VMX_VMRESUME) "\n\t"
+ "2: "
+
+ /* Save guest's RCX to the stack placeholder (see above) */
+ "mov %%" _ASM_CX ", %c[wordsize](%%" _ASM_SP ") \n\t"
+
+ /* Load host's RCX, i.e. the vmx_vcpu pointer */
+ "pop %%" _ASM_CX " \n\t"
+
+ /* Set vmx->fail based on EFLAGS.{CF,ZF} */
+ "setbe %c[fail](%%" _ASM_CX ")\n\t"
+
+ /* Save all guest registers, including RCX from the stack */
+ "mov %%" _ASM_AX ", %c[rax](%%" _ASM_CX ") \n\t"
+ "mov %%" _ASM_BX ", %c[rbx](%%" _ASM_CX ") \n\t"
+ __ASM_SIZE(pop) " %c[rcx](%%" _ASM_CX ") \n\t"
+ "mov %%" _ASM_DX ", %c[rdx](%%" _ASM_CX ") \n\t"
+ "mov %%" _ASM_SI ", %c[rsi](%%" _ASM_CX ") \n\t"
+ "mov %%" _ASM_DI ", %c[rdi](%%" _ASM_CX ") \n\t"
+ "mov %%" _ASM_BP ", %c[rbp](%%" _ASM_CX ") \n\t"
+#ifdef CONFIG_X86_64
+ "mov %%r8, %c[r8](%%" _ASM_CX ") \n\t"
+ "mov %%r9, %c[r9](%%" _ASM_CX ") \n\t"
+ "mov %%r10, %c[r10](%%" _ASM_CX ") \n\t"
+ "mov %%r11, %c[r11](%%" _ASM_CX ") \n\t"
+ "mov %%r12, %c[r12](%%" _ASM_CX ") \n\t"
+ "mov %%r13, %c[r13](%%" _ASM_CX ") \n\t"
+ "mov %%r14, %c[r14](%%" _ASM_CX ") \n\t"
+ "mov %%r15, %c[r15](%%" _ASM_CX ") \n\t"
+
+ /*
+ * Clear all general purpose registers (except RSP, which is loaded by
+ * the CPU during VM-Exit) to prevent speculative use of the guest's
+ * values, even those that are saved/loaded via the stack. In theory,
+ * an L1 cache miss when restoring registers could lead to speculative
+ * execution with the guest's values. Zeroing XORs are dirt cheap,
+ * i.e. the extra paranoia is essentially free.
+ */
+ "xor %%r8d, %%r8d \n\t"
+ "xor %%r9d, %%r9d \n\t"
+ "xor %%r10d, %%r10d \n\t"
+ "xor %%r11d, %%r11d \n\t"
+ "xor %%r12d, %%r12d \n\t"
+ "xor %%r13d, %%r13d \n\t"
+ "xor %%r14d, %%r14d \n\t"
+ "xor %%r15d, %%r15d \n\t"
+#endif
+ "mov %%cr2, %%" _ASM_AX " \n\t"
+ "mov %%" _ASM_AX ", %c[cr2](%%" _ASM_CX ") \n\t"
+
+ "xor %%eax, %%eax \n\t"
+ "xor %%ebx, %%ebx \n\t"
+ "xor %%ecx, %%ecx \n\t"
+ "xor %%edx, %%edx \n\t"
+ "xor %%esi, %%esi \n\t"
+ "xor %%edi, %%edi \n\t"
+ "xor %%ebp, %%ebp \n\t"
+ "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
+ ".pushsection .rodata \n\t"
+ ".global vmx_return \n\t"
+ "vmx_return: " _ASM_PTR " 2b \n\t"
+ ".popsection"
+ : "=c"((int){0}), "=d"((int){0}), "=S"((int){0})
+ : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp),
+ [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
+ [fail]"i"(offsetof(struct vcpu_vmx, fail)),
+ [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
+ [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
+ [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
+ [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
+ [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
+ [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
+ [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
+ [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
+#ifdef CONFIG_X86_64
+ [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
+ [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
+ [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
+ [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
+ [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
+ [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
+ [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
+ [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
+#endif
+ [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
+ [wordsize]"i"(sizeof(ulong))
+ : "cc", "memory"
+#ifdef CONFIG_X86_64
+ , "rax", "rbx", "rdi"
+ , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
+#else
+ , "eax", "ebx", "edi"
+#endif
+ );
+
+ vmx_enable_fb_clear(vmx);
+
+ /*
+ * We do not use IBRS in the kernel. If this vCPU has used the
+ * SPEC_CTRL MSR it may have left it on; save the value and
+ * turn it off. This is much more efficient than blindly adding
+ * it to the atomic save/restore list. Especially as the former
+ * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
+ *
+ * For non-nested case:
+ * If the L01 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ *
+ * For nested case:
+ * If the L02 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ */
+ if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
+ vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
+
+ x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
+
+ /* Eliminate branch target predictions from guest mode */
+ vmexit_fill_RSB();
+
+ /* All fields are clean at this point */
+ if (static_branch_unlikely(&enable_evmcs))
+ current_evmcs->hv_clean_fields |=
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+
+ /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
+ if (vmx->host_debugctlmsr)
+ update_debugctlmsr(vmx->host_debugctlmsr);
+
+#ifndef CONFIG_X86_64
+ /*
+ * The sysexit path does not restore ds/es, so we must set them to
+ * a reasonable value ourselves.
+ *
+ * We can't defer this to vmx_prepare_switch_to_host() since that
+ * function may be executed in interrupt context, which saves and
+ * restore segments around it, nullifying its effect.
+ */
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+#endif
+
+ vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
+ | (1 << VCPU_EXREG_RFLAGS)
+ | (1 << VCPU_EXREG_PDPTR)
+ | (1 << VCPU_EXREG_SEGMENTS)
+ | (1 << VCPU_EXREG_CR3));
+ vcpu->arch.regs_dirty = 0;
+
+ /*
+ * eager fpu is enabled if PKEY is supported and CR4 is switched
+ * back on host, so it is safe to read guest PKRU from current
+ * XSAVE.
+ */
+ if (static_cpu_has(X86_FEATURE_PKU) &&
+ kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
+ vcpu->arch.pkru = __read_pkru();
+ if (vcpu->arch.pkru != vmx->host_pkru)
+ __write_pkru(vmx->host_pkru);
+ }
+
+ kvm_put_guest_xcr0(vcpu);
+
+ vmx->nested.nested_run_pending = 0;
+ vmx->idt_vectoring_info = 0;
+
+ vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
+ if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
+ kvm_machine_check();
+
+ if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+ return;
+
+ vmx->loaded_vmcs->launched = 1;
+ vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+
+ vmx_complete_atomic_exit(vmx);
+ vmx_recover_nmi_blocking(vmx);
+ vmx_complete_interrupts(vmx);
+}
+STACK_FRAME_NON_STANDARD(vmx_vcpu_run);
+
+static struct kvm *vmx_vm_alloc(void)
+{
+ struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx));
+
+ if (!kvm_vmx)
+ return NULL;
+
+ return &kvm_vmx->kvm;
+}
+
+static void vmx_vm_free(struct kvm *kvm)
+{
+ vfree(to_kvm_vmx(kvm));
+}
+
+static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int cpu;
+
+ if (vmx->loaded_vmcs == vmcs)
+ return;
+
+ cpu = get_cpu();
+ vmx_vcpu_put(vcpu);
+ vmx->loaded_vmcs = vmcs;
+ vmx_vcpu_load(vcpu, cpu);
+ put_cpu();
+}
+
+/*
+ * Ensure that the current vmcs of the logical processor is the
+ * vmcs01 of the vcpu before calling free_nested().
+ */
+static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ vcpu_load(vcpu);
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+ free_nested(vmx);
+ vcpu_put(vcpu);
+}
+
+static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (enable_pml)
+ vmx_destroy_pml_buffer(vmx);
+ free_vpid(vmx->vpid);
+ leave_guest_mode(vcpu);
+ vmx_free_vcpu_nested(vcpu);
+ free_loaded_vmcs(vmx->loaded_vmcs);
+ kfree(vmx->guest_msrs);
+ kvm_vcpu_uninit(vcpu);
+ kmem_cache_free(kvm_vcpu_cache, vmx);
+}
+
+static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
+{
+ int err;
+ struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+ unsigned long *msr_bitmap;
+ int cpu;
+
+ if (!vmx)
+ return ERR_PTR(-ENOMEM);
+
+ vmx->vpid = allocate_vpid();
+
+ err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
+ if (err)
+ goto free_vcpu;
+
+ err = -ENOMEM;
+
+ /*
+ * If PML is turned on, failure on enabling PML just results in failure
+ * of creating the vcpu, therefore we can simplify PML logic (by
+ * avoiding dealing with cases, such as enabling PML partially on vcpus
+ * for the guest, etc.
+ */
+ if (enable_pml) {
+ vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!vmx->pml_pg)
+ goto uninit_vcpu;
+ }
+
+ vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
+ > PAGE_SIZE);
+
+ if (!vmx->guest_msrs)
+ goto free_pml;
+
+ err = alloc_loaded_vmcs(&vmx->vmcs01);
+ if (err < 0)
+ goto free_msrs;
+
+ msr_bitmap = vmx->vmcs01.msr_bitmap;
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+ vmx->msr_bitmap_mode = 0;
+
+ vmx->loaded_vmcs = &vmx->vmcs01;
+ cpu = get_cpu();
+ vmx_vcpu_load(&vmx->vcpu, cpu);
+ vmx->vcpu.cpu = cpu;
+ vmx_vcpu_setup(vmx);
+ vmx_vcpu_put(&vmx->vcpu);
+ put_cpu();
+ if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
+ err = alloc_apic_access_page(kvm);
+ if (err)
+ goto free_vmcs;
+ }
+
+ if (enable_ept && !enable_unrestricted_guest) {
+ err = init_rmode_identity_map(kvm);
+ if (err)
+ goto free_vmcs;
+ }
+
+ if (nested)
+ nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
+ kvm_vcpu_apicv_active(&vmx->vcpu));
+
+ vmx->nested.posted_intr_nv = -1;
+ vmx->nested.current_vmptr = -1ull;
+
+ vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
+
+ /*
+ * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
+ * or POSTED_INTR_WAKEUP_VECTOR.
+ */
+ vmx->pi_desc.nv = POSTED_INTR_VECTOR;
+ vmx->pi_desc.sn = 1;
+
+ return &vmx->vcpu;
+
+free_vmcs:
+ free_loaded_vmcs(vmx->loaded_vmcs);
+free_msrs:
+ kfree(vmx->guest_msrs);
+free_pml:
+ vmx_destroy_pml_buffer(vmx);
+uninit_vcpu:
+ kvm_vcpu_uninit(&vmx->vcpu);
+free_vcpu:
+ free_vpid(vmx->vpid);
+ kmem_cache_free(kvm_vcpu_cache, vmx);
+ return ERR_PTR(err);
+}
+
+#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
+#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
+
+static int vmx_vm_init(struct kvm *kvm)
+{
+ spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
+
+ if (!ple_gap)
+ kvm->arch.pause_in_guest = true;
+
+ if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
+ switch (l1tf_mitigation) {
+ case L1TF_MITIGATION_OFF:
+ case L1TF_MITIGATION_FLUSH_NOWARN:
+ /* 'I explicitly don't care' is set */
+ break;
+ case L1TF_MITIGATION_FLUSH:
+ case L1TF_MITIGATION_FLUSH_NOSMT:
+ case L1TF_MITIGATION_FULL:
+ /*
+ * Warn upon starting the first VM in a potentially
+ * insecure environment.
+ */
+ if (sched_smt_active())
+ pr_warn_once(L1TF_MSG_SMT);
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
+ pr_warn_once(L1TF_MSG_L1D);
+ break;
+ case L1TF_MITIGATION_FULL_FORCE:
+ /* Flush is enforced */
+ break;
+ }
+ }
+ return 0;
+}
+
+static void __init vmx_check_processor_compat(void *rtn)
+{
+ struct vmcs_config vmcs_conf;
+
+ *(int *)rtn = 0;
+ if (setup_vmcs_config(&vmcs_conf) < 0)
+ *(int *)rtn = -EIO;
+ nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, enable_apicv);
+ if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
+ printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
+ smp_processor_id());
+ *(int *)rtn = -EIO;
+ }
+}
+
+static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+{
+ u8 cache;
+ u64 ipat = 0;
+
+ /* For VT-d and EPT combination
+ * 1. MMIO: always map as UC
+ * 2. EPT with VT-d:
+ * a. VT-d without snooping control feature: can't guarantee the
+ * result, try to trust guest.
+ * b. VT-d with snooping control feature: snooping control feature of
+ * VT-d engine can guarantee the cache correctness. Just set it
+ * to WB to keep consistent with host. So the same as item 3.
+ * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
+ * consistent with host MTRR
+ */
+ if (is_mmio) {
+ cache = MTRR_TYPE_UNCACHABLE;
+ goto exit;
+ }
+
+ if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
+ ipat = VMX_EPT_IPAT_BIT;
+ cache = MTRR_TYPE_WRBACK;
+ goto exit;
+ }
+
+ if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
+ ipat = VMX_EPT_IPAT_BIT;
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
+ cache = MTRR_TYPE_WRBACK;
+ else
+ cache = MTRR_TYPE_UNCACHABLE;
+ goto exit;
+ }
+
+ cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
+
+exit:
+ return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
+}
+
+static int vmx_get_lpage_level(void)
+{
+ if (enable_ept && !cpu_has_vmx_ept_1g_page())
+ return PT_DIRECTORY_LEVEL;
+ else
+ /* For shadow and EPT supported 1GB page */
+ return PT_PDPE_LEVEL;
+}
+
+static void vmcs_set_secondary_exec_control(u32 new_ctl)
+{
+ /*
+ * These bits in the secondary execution controls field
+ * are dynamic, the others are mostly based on the hypervisor
+ * architecture and the guest's CPUID. Do not touch the
+ * dynamic bits.
+ */
+ u32 mask =
+ SECONDARY_EXEC_SHADOW_VMCS |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_DESC;
+
+ u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+ (new_ctl & ~mask) | (cur_ctl & mask));
+}
+
+/*
+ * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
+ * (indicating "allowed-1") if they are supported in the guest's CPUID.
+ */
+static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_cpuid_entry2 *entry;
+
+ vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
+ vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
+
+#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
+ if (entry && (entry->_reg & (_cpuid_mask))) \
+ vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
+} while (0)
+
+ entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME));
+ cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME));
+ cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC));
+ cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE));
+ cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE));
+ cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE));
+ cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE));
+ cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE));
+ cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR));
+ cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM));
+ cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX));
+ cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX));
+ cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID));
+ cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE));
+
+ entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
+ cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE));
+ cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP));
+ cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP));
+ cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU));
+ cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP));
+
+#undef cr4_fixed1_update
+}
+
+static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (kvm_mpx_supported()) {
+ bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
+
+ if (mpx_enabled) {
+ vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
+ vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
+ } else {
+ vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
+ vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
+ }
+ }
+}
+
+static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (cpu_has_secondary_exec_ctrls()) {
+ vmx_compute_secondary_exec_control(vmx);
+ vmcs_set_secondary_exec_control(vmx->secondary_exec_control);
+ }
+
+ if (nested_vmx_allowed(vcpu))
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+ FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+ else
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+ ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+
+ if (nested_vmx_allowed(vcpu)) {
+ nested_vmx_cr_fixed1_bits_update(vcpu);
+ nested_vmx_entry_exit_ctls_update(vcpu);
+ }
+}
+
+static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
+{
+ if (func == 1 && nested)
+ entry->ecx |= bit(X86_FEATURE_VMX);
+}
+
+static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 exit_reason;
+ unsigned long exit_qualification = vcpu->arch.exit_qualification;
+
+ if (vmx->nested.pml_full) {
+ exit_reason = EXIT_REASON_PML_FULL;
+ vmx->nested.pml_full = false;
+ exit_qualification &= INTR_INFO_UNBLOCK_NMI;
+ } else if (fault->error_code & PFERR_RSVD_MASK)
+ exit_reason = EXIT_REASON_EPT_MISCONFIG;
+ else
+ exit_reason = EXIT_REASON_EPT_VIOLATION;
+
+ nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
+ vmcs12->guest_physical_address = fault->address;
+}
+
+static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
+{
+ return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
+}
+
+/* Callbacks for nested_ept_init_mmu_context: */
+
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
+{
+ /* return the page table to be shadowed - in our case, EPT12 */
+ return get_vmcs12(vcpu)->ept_pointer;
+}
+
+static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+ WARN_ON(mmu_is_nested(vcpu));
+ if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu)))
+ return 1;
+
+ kvm_init_shadow_ept_mmu(vcpu,
+ to_vmx(vcpu)->nested.msrs.ept_caps &
+ VMX_EPT_EXECUTE_ONLY_BIT,
+ nested_ept_ad_enabled(vcpu),
+ nested_ept_get_cr3(vcpu));
+ vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
+ vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
+ vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
+
+ vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
+ return 0;
+}
+
+static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+}
+
+static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
+ u16 error_code)
+{
+ bool inequality, bit;
+
+ bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
+ inequality =
+ (error_code & vmcs12->page_fault_error_code_mask) !=
+ vmcs12->page_fault_error_code_match;
+ return inequality ^ bit;
+}
+
+static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ WARN_ON(!is_guest_mode(vcpu));
+
+ if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
+ !to_vmx(vcpu)->nested.nested_run_pending) {
+ vmcs12->vm_exit_intr_error_code = fault->error_code;
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
+ PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
+ INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
+ fault->address);
+ } else {
+ kvm_inject_page_fault(vcpu, fault);
+ }
+}
+
+static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12);
+
+static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct page *page;
+ u64 hpa;
+
+ if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+ /*
+ * Translate L1 physical address to host physical
+ * address for vmcs02. Keep the page pinned, so this
+ * physical address remains valid. We keep a reference
+ * to it so we can release it later.
+ */
+ if (vmx->nested.apic_access_page) { /* shouldn't happen */
+ kvm_release_page_dirty(vmx->nested.apic_access_page);
+ vmx->nested.apic_access_page = NULL;
+ }
+ page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
+ /*
+ * If translation failed, no matter: This feature asks
+ * to exit when accessing the given address, and if it
+ * can never be accessed, this feature won't do
+ * anything anyway.
+ */
+ if (!is_error_page(page)) {
+ vmx->nested.apic_access_page = page;
+ hpa = page_to_phys(vmx->nested.apic_access_page);
+ vmcs_write64(APIC_ACCESS_ADDR, hpa);
+ } else {
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+ }
+ }
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
+ if (vmx->nested.virtual_apic_page) { /* shouldn't happen */
+ kvm_release_page_dirty(vmx->nested.virtual_apic_page);
+ vmx->nested.virtual_apic_page = NULL;
+ }
+ page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr);
+
+ /*
+ * If translation failed, VM entry will fail because
+ * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
+ * Failing the vm entry is _not_ what the processor
+ * does but it's basically the only possibility we
+ * have. We could still enter the guest if CR8 load
+ * exits are enabled, CR8 store exits are enabled, and
+ * virtualize APIC access is disabled; in this case
+ * the processor would never use the TPR shadow and we
+ * could simply clear the bit from the execution
+ * control. But such a configuration is useless, so
+ * let's keep the code simple.
+ */
+ if (!is_error_page(page)) {
+ vmx->nested.virtual_apic_page = page;
+ hpa = page_to_phys(vmx->nested.virtual_apic_page);
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
+ }
+ }
+
+ if (nested_cpu_has_posted_intr(vmcs12)) {
+ if (vmx->nested.pi_desc_page) { /* shouldn't happen */
+ kunmap(vmx->nested.pi_desc_page);
+ kvm_release_page_dirty(vmx->nested.pi_desc_page);
+ vmx->nested.pi_desc_page = NULL;
+ vmx->nested.pi_desc = NULL;
+ vmcs_write64(POSTED_INTR_DESC_ADDR, -1ull);
+ }
+ page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr);
+ if (is_error_page(page))
+ return;
+ vmx->nested.pi_desc_page = page;
+ vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page);
+ vmx->nested.pi_desc =
+ (struct pi_desc *)((void *)vmx->nested.pi_desc +
+ (unsigned long)(vmcs12->posted_intr_desc_addr &
+ (PAGE_SIZE - 1)));
+ vmcs_write64(POSTED_INTR_DESC_ADDR,
+ page_to_phys(vmx->nested.pi_desc_page) +
+ (unsigned long)(vmcs12->posted_intr_desc_addr &
+ (PAGE_SIZE - 1)));
+ }
+ if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
+ vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_USE_MSR_BITMAPS);
+ else
+ vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_USE_MSR_BITMAPS);
+}
+
+static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
+{
+ u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * A timer value of zero is architecturally guaranteed to cause
+ * a VMExit prior to executing any instructions in the guest.
+ */
+ if (preemption_timeout == 0) {
+ vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
+ return;
+ }
+
+ if (vcpu->arch.virtual_tsc_khz == 0)
+ return;
+
+ preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+ preemption_timeout *= 1000000;
+ do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
+ hrtimer_start(&vmx->nested.preemption_timer,
+ ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
+}
+
+static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ return 0;
+
+ if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) ||
+ !page_address_valid(vcpu, vmcs12->io_bitmap_b))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+ return 0;
+
+ if (!page_address_valid(vcpu, vmcs12->msr_bitmap))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
+ return 0;
+
+ if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))
+ return -EINVAL;
+
+ return 0;
+}
+
+static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) {
+ int msr;
+
+ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+ unsigned word = msr / BITS_PER_LONG;
+
+ msr_bitmap[word] = ~0;
+ msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
+ }
+}
+
+/*
+ * Merge L0's and L1's MSR bitmap, return false to indicate that
+ * we do not use the hardware.
+ */
+static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ int msr;
+ struct page *page;
+ unsigned long *msr_bitmap_l1;
+ unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
+ /*
+ * pred_cmd & spec_ctrl are trying to verify two things:
+ *
+ * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
+ * ensures that we do not accidentally generate an L02 MSR bitmap
+ * from the L12 MSR bitmap that is too permissive.
+ * 2. That L1 or L2s have actually used the MSR. This avoids
+ * unnecessarily merging of the bitmap if the MSR is unused. This
+ * works properly because we only update the L01 MSR bitmap lazily.
+ * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
+ * updated to reflect this when L1 (or its L2s) actually write to
+ * the MSR.
+ */
+ bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
+ bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
+
+ /* Nothing to do if the MSR bitmap is not in use. */
+ if (!cpu_has_vmx_msr_bitmap() ||
+ !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+ return false;
+
+ if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+ !pred_cmd && !spec_ctrl)
+ return false;
+
+ page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
+ if (is_error_page(page))
+ return false;
+
+ msr_bitmap_l1 = (unsigned long *)kmap(page);
+
+ /*
+ * To keep the control flow simple, pay eight 8-byte writes (sixteen
+ * 4-byte writes on 32-bit systems) up front to enable intercepts for
+ * the x2APIC MSR range and selectively disable them below.
+ */
+ enable_x2apic_msr_intercepts(msr_bitmap_l0);
+
+ if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
+ if (nested_cpu_has_apic_reg_virt(vmcs12)) {
+ /*
+ * L0 need not intercept reads for MSRs between 0x800
+ * and 0x8ff, it just lets the processor take the value
+ * from the virtual-APIC page; take those 256 bits
+ * directly from the L1 bitmap.
+ */
+ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+ unsigned word = msr / BITS_PER_LONG;
+
+ msr_bitmap_l0[word] = msr_bitmap_l1[word];
+ }
+ }
+
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ X2APIC_MSR(APIC_TASKPRI),
+ MSR_TYPE_R | MSR_TYPE_W);
+
+ if (nested_cpu_has_vid(vmcs12)) {
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ X2APIC_MSR(APIC_EOI),
+ MSR_TYPE_W);
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ X2APIC_MSR(APIC_SELF_IPI),
+ MSR_TYPE_W);
+ }
+ }
+
+ if (spec_ctrl)
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_SPEC_CTRL,
+ MSR_TYPE_R | MSR_TYPE_W);
+
+ if (pred_cmd)
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_PRED_CMD,
+ MSR_TYPE_W);
+
+ kunmap(page);
+ kvm_release_page_clean(page);
+
+ return true;
+}
+
+static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vmcs12 *shadow;
+ struct page *page;
+
+ if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
+ vmcs12->vmcs_link_pointer == -1ull)
+ return;
+
+ shadow = get_shadow_vmcs12(vcpu);
+ page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
+
+ memcpy(shadow, kmap(page), VMCS12_SIZE);
+
+ kunmap(page);
+ kvm_release_page_clean(page);
+}
+
+static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
+ vmcs12->vmcs_link_pointer == -1ull)
+ return;
+
+ kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
+ get_shadow_vmcs12(vcpu), VMCS12_SIZE);
+}
+
+static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+ !page_address_valid(vcpu, vmcs12->apic_access_addr))
+ return -EINVAL;
+ else
+ return 0;
+}
+
+static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+ !nested_cpu_has_apic_reg_virt(vmcs12) &&
+ !nested_cpu_has_vid(vmcs12) &&
+ !nested_cpu_has_posted_intr(vmcs12))
+ return 0;
+
+ /*
+ * If virtualize x2apic mode is enabled,
+ * virtualize apic access must be disabled.
+ */
+ if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+ nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+ return -EINVAL;
+
+ /*
+ * If virtual interrupt delivery is enabled,
+ * we must exit on external interrupts.
+ */
+ if (nested_cpu_has_vid(vmcs12) &&
+ !nested_exit_on_intr(vcpu))
+ return -EINVAL;
+
+ /*
+ * bits 15:8 should be zero in posted_intr_nv,
+ * the descriptor address has been already checked
+ * in nested_get_vmcs12_pages.
+ *
+ * bits 5:0 of posted_intr_desc_addr should be zero.
+ */
+ if (nested_cpu_has_posted_intr(vmcs12) &&
+ (!nested_cpu_has_vid(vmcs12) ||
+ !nested_exit_intr_ack_set(vcpu) ||
+ (vmcs12->posted_intr_nv & 0xff00) ||
+ (vmcs12->posted_intr_desc_addr & 0x3f) ||
+ (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu))))
+ return -EINVAL;
+
+ /* tpr shadow is needed by all apicv features. */
+ if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
+ unsigned long count_field,
+ unsigned long addr_field)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ int maxphyaddr;
+ u64 count, addr;
+
+ if (vmcs12_read_any(vmcs12, count_field, &count) ||
+ vmcs12_read_any(vmcs12, addr_field, &addr)) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+ if (count == 0)
+ return 0;
+ maxphyaddr = cpuid_maxphyaddr(vcpu);
+ if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
+ (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
+ pr_debug_ratelimited(
+ "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
+ addr_field, maxphyaddr, count, addr);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (vmcs12->vm_exit_msr_load_count == 0 &&
+ vmcs12->vm_exit_msr_store_count == 0 &&
+ vmcs12->vm_entry_msr_load_count == 0)
+ return 0; /* Fast path */
+ if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
+ VM_EXIT_MSR_LOAD_ADDR) ||
+ nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
+ VM_EXIT_MSR_STORE_ADDR) ||
+ nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
+ VM_ENTRY_MSR_LOAD_ADDR))
+ return -EINVAL;
+ return 0;
+}
+
+static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ u64 address = vmcs12->pml_address;
+ int maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+ if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) {
+ if (!nested_cpu_has_ept(vmcs12) ||
+ !IS_ALIGNED(address, 4096) ||
+ address >> maxphyaddr)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has_shadow_vmcs(vmcs12))
+ return 0;
+
+ if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) ||
+ !page_address_valid(vcpu, vmcs12->vmwrite_bitmap))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
+ struct vmx_msr_entry *e)
+{
+ /* x2APIC MSR accesses are not allowed */
+ if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)
+ return -EINVAL;
+ if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
+ e->index == MSR_IA32_UCODE_REV)
+ return -EINVAL;
+ if (e->reserved != 0)
+ return -EINVAL;
+ return 0;
+}
+
+static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
+ struct vmx_msr_entry *e)
+{
+ if (e->index == MSR_FS_BASE ||
+ e->index == MSR_GS_BASE ||
+ e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
+ nested_vmx_msr_check_common(vcpu, e))
+ return -EINVAL;
+ return 0;
+}
+
+static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
+ struct vmx_msr_entry *e)
+{
+ if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
+ nested_vmx_msr_check_common(vcpu, e))
+ return -EINVAL;
+ return 0;
+}
+
+/*
+ * Load guest's/host's msr at nested entry/exit.
+ * return 0 for success, entry index for failure.
+ */
+static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+ u32 i;
+ struct vmx_msr_entry e;
+ struct msr_data msr;
+
+ msr.host_initiated = false;
+ for (i = 0; i < count; i++) {
+ if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
+ &e, sizeof(e))) {
+ pr_debug_ratelimited(
+ "%s cannot read MSR entry (%u, 0x%08llx)\n",
+ __func__, i, gpa + i * sizeof(e));
+ goto fail;
+ }
+ if (nested_vmx_load_msr_check(vcpu, &e)) {
+ pr_debug_ratelimited(
+ "%s check failed (%u, 0x%x, 0x%x)\n",
+ __func__, i, e.index, e.reserved);
+ goto fail;
+ }
+ msr.index = e.index;
+ msr.data = e.value;
+ if (kvm_set_msr(vcpu, &msr)) {
+ pr_debug_ratelimited(
+ "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
+ __func__, i, e.index, e.value);
+ goto fail;
+ }
+ }
+ return 0;
+fail:
+ return i + 1;
+}
+
+static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+ u32 i;
+ struct vmx_msr_entry e;
+
+ for (i = 0; i < count; i++) {
+ struct msr_data msr_info;
+ if (kvm_vcpu_read_guest(vcpu,
+ gpa + i * sizeof(e),
+ &e, 2 * sizeof(u32))) {
+ pr_debug_ratelimited(
+ "%s cannot read MSR entry (%u, 0x%08llx)\n",
+ __func__, i, gpa + i * sizeof(e));
+ return -EINVAL;
+ }
+ if (nested_vmx_store_msr_check(vcpu, &e)) {
+ pr_debug_ratelimited(
+ "%s check failed (%u, 0x%x, 0x%x)\n",
+ __func__, i, e.index, e.reserved);
+ return -EINVAL;
+ }
+ msr_info.host_initiated = false;
+ msr_info.index = e.index;
+ if (kvm_get_msr(vcpu, &msr_info)) {
+ pr_debug_ratelimited(
+ "%s cannot read MSR (%u, 0x%x)\n",
+ __func__, i, e.index);
+ return -EINVAL;
+ }
+ if (kvm_vcpu_write_guest(vcpu,
+ gpa + i * sizeof(e) +
+ offsetof(struct vmx_msr_entry, value),
+ &msr_info.data, sizeof(msr_info.data))) {
+ pr_debug_ratelimited(
+ "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
+ __func__, i, e.index, msr_info.data);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ unsigned long invalid_mask;
+
+ invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
+ return (val & invalid_mask) == 0;
+}
+
+/*
+ * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
+ * emulating VM entry into a guest with EPT enabled.
+ * Returns 0 on success, 1 on failure. Invalid state exit qualification code
+ * is assigned to entry_failure_code on failure.
+ */
+static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
+ u32 *entry_failure_code)
+{
+ if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
+ if (!nested_cr3_valid(vcpu, cr3)) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
+ return 1;
+ }
+
+ /*
+ * If PAE paging and EPT are both on, CR3 is not used by the CPU and
+ * must not be dereferenced.
+ */
+ if (is_pae_paging(vcpu) && !nested_ept) {
+ if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
+ *entry_failure_code = ENTRY_FAIL_PDPTE;
+ return 1;
+ }
+ }
+ }
+
+ if (!nested_ept)
+ kvm_mmu_new_cr3(vcpu, cr3, false);
+
+ vcpu->arch.cr3 = cr3;
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+
+ kvm_init_mmu(vcpu, false);
+
+ return 0;
+}
+
+static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
+ vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
+ vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
+ vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
+ vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
+ vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
+ vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
+ vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
+ vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
+ vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
+ vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
+ vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
+ vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
+ vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
+ vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
+ vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
+ vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
+ vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
+ vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
+ vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
+ vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
+ vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
+ vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
+ vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
+ vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
+ vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
+ vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
+ vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
+ vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
+ vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
+ vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
+
+ vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+ vmcs12->guest_pending_dbg_exceptions);
+ vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
+ vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
+
+ if (nested_cpu_has_xsaves(vmcs12))
+ vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
+ vmcs_write64(VMCS_LINK_POINTER, -1ull);
+
+ if (cpu_has_vmx_posted_intr())
+ vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
+
+ /*
+ * Whether page-faults are trapped is determined by a combination of
+ * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
+ * If enable_ept, L0 doesn't care about page faults and we should
+ * set all of these to L1's desires. However, if !enable_ept, L0 does
+ * care about (at least some) page faults, and because it is not easy
+ * (if at all possible?) to merge L0 and L1's desires, we simply ask
+ * to exit on each and every L2 page fault. This is done by setting
+ * MASK=MATCH=0 and (see below) EB.PF=1.
+ * Note that below we don't need special code to set EB.PF beyond the
+ * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
+ * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
+ * !enable_ept, EB.PF is 1, so the "or" will always be 1.
+ */
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+ enable_ept ? vmcs12->page_fault_error_code_mask : 0);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+ enable_ept ? vmcs12->page_fault_error_code_match : 0);
+
+ /* All VMFUNCs are currently emulated through L0 vmexits. */
+ if (cpu_has_vmx_vmfunc())
+ vmcs_write64(VM_FUNCTION_CONTROL, 0);
+
+ if (cpu_has_vmx_apicv()) {
+ vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
+ vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
+ vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
+ vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
+ }
+
+ /*
+ * Set host-state according to L0's settings (vmcs12 is irrelevant here)
+ * Some constant fields are set here by vmx_set_constant_host_state().
+ * Other fields are different per CPU, and will be set later when
+ * vmx_vcpu_load() is called, and when vmx_prepare_switch_to_guest()
+ * is called.
+ */
+ vmx_set_constant_host_state(vmx);
+
+ /*
+ * Set the MSR load/store lists to match L0's settings.
+ */
+ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+ vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+ vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
+
+ set_cr4_guest_host_mask(vmx);
+
+ if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+ vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+
+ if (enable_vpid) {
+ if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
+ else
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+ }
+
+ /*
+ * L1 may access the L2's PDPTR, so save them to construct vmcs12
+ */
+ if (enable_ept) {
+ vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+ vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+ vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+ vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+ }
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
+}
+
+/*
+ * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
+ * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
+ * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
+ * guest in a way that will both be appropriate to L1's requests, and our
+ * needs. In addition to modifying the active vmcs (which is vmcs02), this
+ * function also has additional necessary side-effects, like setting various
+ * vcpu->arch fields.
+ * Returns 0 on success, 1 on failure. Invalid state exit qualification code
+ * is assigned to entry_failure_code on failure.
+ */
+static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ u32 *entry_failure_code)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 exec_control, vmcs12_exec_ctrl;
+
+ if (vmx->nested.dirty_vmcs12) {
+ prepare_vmcs02_full(vcpu, vmcs12);
+ vmx->nested.dirty_vmcs12 = false;
+ }
+
+ /*
+ * First, the fields that are shadowed. This must be kept in sync
+ * with vmx_shadow_fields.h.
+ */
+
+ vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
+ vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
+ vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
+ vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
+ vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
+
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
+ kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+ } else {
+ kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
+ }
+ if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
+ vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
+ if (vmx->nested.nested_run_pending) {
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ vmcs12->vm_entry_intr_info_field);
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+ vmcs12->vm_entry_exception_error_code);
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmcs12->vm_entry_instruction_len);
+ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+ vmcs12->guest_interruptibility_info);
+ vmx->loaded_vmcs->nmi_known_unmasked =
+ !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
+ } else {
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
+ }
+ vmx_set_rflags(vcpu, vmcs12->guest_rflags);
+
+ exec_control = vmcs12->pin_based_vm_exec_control;
+
+ /* Preemption timer setting is computed directly in vmx_vcpu_run. */
+ exec_control |= vmcs_config.pin_based_exec_ctrl;
+ exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ vmx->loaded_vmcs->hv_timer_armed = false;
+
+ /* Posted interrupts setting is only taken from vmcs12. */
+ if (nested_cpu_has_posted_intr(vmcs12)) {
+ vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
+ vmx->nested.pi_pending = false;
+ } else {
+ exec_control &= ~PIN_BASED_POSTED_INTR;
+ }
+
+ vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
+
+ vmx->nested.preemption_timer_expired = false;
+ if (nested_cpu_has_preemption_timer(vmcs12))
+ vmx_start_preemption_timer(vcpu);
+
+ if (cpu_has_secondary_exec_ctrls()) {
+ exec_control = vmx->secondary_exec_control;
+
+ /* Take the following fields only from vmcs12 */
+ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_ENABLE_INVPCID |
+ SECONDARY_EXEC_RDTSCP |
+ SECONDARY_EXEC_XSAVES |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_ENABLE_VMFUNC);
+ if (nested_cpu_has(vmcs12,
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
+ vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
+ ~SECONDARY_EXEC_ENABLE_PML;
+ exec_control |= vmcs12_exec_ctrl;
+ }
+
+ /* VMCS shadowing for L2 is emulated for now */
+ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
+
+ if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
+ vmcs_write16(GUEST_INTR_STATUS,
+ vmcs12->guest_intr_status);
+
+ /*
+ * Write an illegal value to APIC_ACCESS_ADDR. Later,
+ * nested_get_vmcs12_pages will either fix it up or
+ * remove the VM execution control.
+ */
+ if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
+ vmcs_write64(APIC_ACCESS_ADDR, -1ull);
+
+ if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
+ vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ }
+
+ /*
+ * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
+ * entry, but only if the current (host) sp changed from the value
+ * we wrote last (vmx->host_rsp). This cache is no longer relevant
+ * if we switch vmcs, and rather than hold a separate cache per vmcs,
+ * here we just force the write to happen on entry.
+ */
+ vmx->host_rsp = 0;
+
+ exec_control = vmx_exec_control(vmx); /* L0's desires */
+ exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+ exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+ exec_control &= ~CPU_BASED_TPR_SHADOW;
+ exec_control |= vmcs12->cpu_based_vm_exec_control;
+
+ /*
+ * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
+ * nested_get_vmcs12_pages can't fix it up, the illegal value
+ * will result in a VM entry failure.
+ */
+ if (exec_control & CPU_BASED_TPR_SHADOW) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
+ vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
+ } else {
+#ifdef CONFIG_X86_64
+ exec_control |= CPU_BASED_CR8_LOAD_EXITING |
+ CPU_BASED_CR8_STORE_EXITING;
+#endif
+ }
+
+ /*
+ * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
+ * for I/O port accesses.
+ */
+ exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
+ exec_control |= CPU_BASED_UNCOND_IO_EXITING;
+
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+
+ /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
+ * bitwise-or of what L1 wants to trap for L2, and what we want to
+ * trap. Note that CR0.TS also needs updating - we do this later.
+ */
+ update_exception_bitmap(vcpu);
+ vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+
+ /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
+ * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
+ * bits are further modified by vmx_set_efer() below.
+ */
+ vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+
+ /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
+ * emulated by vmx_set_efer(), below.
+ */
+ vm_entry_controls_init(vmx,
+ (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
+ ~VM_ENTRY_IA32E_MODE) |
+ (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
+
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
+ vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
+ vcpu->arch.pat = vmcs12->guest_ia32_pat;
+ } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+ vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
+ }
+
+ vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+
+ if (kvm_has_tsc_control)
+ decache_tsc_multiplier(vmx);
+
+ if (enable_vpid) {
+ /*
+ * There is no direct mapping between vpid02 and vpid12, the
+ * vpid02 is per-vCPU for L0 and reused while the value of
+ * vpid12 is changed w/ one invvpid during nested vmentry.
+ * The vpid12 is allocated by L1 for L2, so it will not
+ * influence global bitmap(for vpid01 and vpid02 allocation)
+ * even if spawn a lot of nested vCPUs.
+ */
+ if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
+ if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
+ vmx->nested.last_vpid = vmcs12->virtual_processor_id;
+ __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+ }
+ } else {
+ vmx_flush_tlb(vcpu, true);
+ }
+ }
+
+ if (enable_pml) {
+ /*
+ * Conceptually we want to copy the PML address and index from
+ * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
+ * since we always flush the log on each vmexit, this happens
+ * to be equivalent to simply resetting the fields in vmcs02.
+ */
+ ASSERT(vmx->pml_pg);
+ vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
+ vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ }
+
+ if (nested_cpu_has_ept(vmcs12)) {
+ if (nested_ept_init_mmu_context(vcpu)) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
+ return 1;
+ }
+ } else if (nested_cpu_has2(vmcs12,
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+ vmx_flush_tlb(vcpu, true);
+ }
+
+ /*
+ * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
+ * bits which we consider mandatory enabled.
+ * The CR0_READ_SHADOW is what L2 should have expected to read given
+ * the specifications by L1; It's not enough to take
+ * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
+ * have more bits than L1 expected.
+ */
+ vmx_set_cr0(vcpu, vmcs12->guest_cr0);
+ vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
+
+ vmx_set_cr4(vcpu, vmcs12->guest_cr4);
+ vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
+
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
+ vcpu->arch.efer = vmcs12->guest_ia32_efer;
+ else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
+ vcpu->arch.efer |= (EFER_LMA | EFER_LME);
+ else
+ vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
+ /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
+ vmx_set_efer(vcpu, vcpu->arch.efer);
+
+ /*
+ * Guest state is invalid and unrestricted guest is disabled,
+ * which means L1 attempted VMEntry to L2 with invalid state.
+ * Fail the VMEntry.
+ */
+ if (vmx->emulation_required) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
+ return 1;
+ }
+
+ /* Shadow page tables on either EPT or shadow page tables. */
+ if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
+ entry_failure_code))
+ return 1;
+
+ if (!enable_ept)
+ vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
+
+ kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
+ kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
+ return 0;
+}
+
+static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has_nmi_exiting(vmcs12) &&
+ nested_cpu_has_virtual_nmis(vmcs12))
+ return -EINVAL;
+
+ if (!nested_cpu_has_virtual_nmis(vmcs12) &&
+ nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
+ vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_vmx_check_apic_access_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_vmx_check_apicv_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_vmx_check_pml_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
+ vmx->nested.msrs.procbased_ctls_low,
+ vmx->nested.msrs.procbased_ctls_high) ||
+ (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
+ !vmx_control_verify(vmcs12->secondary_vm_exec_control,
+ vmx->nested.msrs.secondary_ctls_low,
+ vmx->nested.msrs.secondary_ctls_high)) ||
+ !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
+ vmx->nested.msrs.pinbased_ctls_low,
+ vmx->nested.msrs.pinbased_ctls_high) ||
+ !vmx_control_verify(vmcs12->vm_exit_controls,
+ vmx->nested.msrs.exit_ctls_low,
+ vmx->nested.msrs.exit_ctls_high) ||
+ !vmx_control_verify(vmcs12->vm_entry_controls,
+ vmx->nested.msrs.entry_ctls_low,
+ vmx->nested.msrs.entry_ctls_high))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_vmx_check_nmi_controls(vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_cpu_has_vmfunc(vmcs12)) {
+ if (vmcs12->vm_function_control &
+ ~vmx->nested.msrs.vmfunc_controls)
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_cpu_has_eptp_switching(vmcs12)) {
+ if (!nested_cpu_has_ept(vmcs12) ||
+ !page_address_valid(vcpu, vmcs12->eptp_list_address))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+ }
+ }
+
+ if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
+ !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
+ !nested_cr3_valid(vcpu, vmcs12->host_cr3))
+ return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
+
+ /*
+ * From the Intel SDM, volume 3:
+ * Fields relevant to VM-entry event injection must be set properly.
+ * These fields are the VM-entry interruption-information field, the
+ * VM-entry exception error code, and the VM-entry instruction length.
+ */
+ if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
+ u32 intr_info = vmcs12->vm_entry_intr_info_field;
+ u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
+ u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
+ bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
+ bool should_have_error_code;
+ bool urg = nested_cpu_has2(vmcs12,
+ SECONDARY_EXEC_UNRESTRICTED_GUEST);
+ bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
+
+ /* VM-entry interruption-info field: interruption type */
+ if (intr_type == INTR_TYPE_RESERVED ||
+ (intr_type == INTR_TYPE_OTHER_EVENT &&
+ !nested_cpu_supports_monitor_trap_flag(vcpu)))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ /* VM-entry interruption-info field: vector */
+ if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
+ (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
+ (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ /* VM-entry interruption-info field: deliver error code */
+ should_have_error_code =
+ intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
+ x86_exception_has_error_code(vector);
+ if (has_error_code != should_have_error_code)
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ /* VM-entry exception error code */
+ if (has_error_code &&
+ vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ /* VM-entry interruption-info field: reserved bits */
+ if (intr_info & INTR_INFO_RESVD_BITS_MASK)
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ /* VM-entry instruction length */
+ switch (intr_type) {
+ case INTR_TYPE_SOFT_EXCEPTION:
+ case INTR_TYPE_SOFT_INTR:
+ case INTR_TYPE_PRIV_SW_EXCEPTION:
+ if ((vmcs12->vm_entry_instruction_len > 15) ||
+ (vmcs12->vm_entry_instruction_len == 0 &&
+ !nested_cpu_has_zero_length_injection(vcpu)))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+ }
+ }
+
+ return 0;
+}
+
+static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ int r;
+ struct page *page;
+ struct vmcs12 *shadow;
+
+ if (vmcs12->vmcs_link_pointer == -1ull)
+ return 0;
+
+ if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))
+ return -EINVAL;
+
+ page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
+ if (is_error_page(page))
+ return -EINVAL;
+
+ r = 0;
+ shadow = kmap(page);
+ if (shadow->hdr.revision_id != VMCS12_REVISION ||
+ shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
+ r = -EINVAL;
+ kunmap(page);
+ kvm_release_page_clean(page);
+ return r;
+}
+
+static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ u32 *exit_qual)
+{
+ bool ia32e;
+
+ *exit_qual = ENTRY_FAIL_DEFAULT;
+
+ if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
+ !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
+ return 1;
+
+ if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
+ *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
+ return 1;
+ }
+
+ /*
+ * If the load IA32_EFER VM-entry control is 1, the following checks
+ * are performed on the field for the IA32_EFER MSR:
+ * - Bits reserved in the IA32_EFER MSR must be 0.
+ * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
+ * the IA-32e mode guest VM-exit control. It must also be identical
+ * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
+ * CR0.PG) is 1.
+ */
+ if (to_vmx(vcpu)->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
+ ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
+ if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
+ ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
+ ((vmcs12->guest_cr0 & X86_CR0_PG) &&
+ ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
+ return 1;
+ }
+
+ /*
+ * If the load IA32_EFER VM-exit control is 1, bits reserved in the
+ * IA32_EFER MSR must be 0 in the field for that register. In addition,
+ * the values of the LMA and LME bits in the field must each be that of
+ * the host address-space size VM-exit control.
+ */
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
+ ia32e = (vmcs12->vm_exit_controls &
+ VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
+ if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
+ ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
+ ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
+ return 1;
+ }
+
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
+ (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
+ (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * If exit_qual is NULL, this is being called from state restore (either RSM
+ * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
+ */
+static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ bool from_vmentry = !!exit_qual;
+ u32 dummy_exit_qual;
+ bool evaluate_pending_interrupts;
+ int r = 0;
+
+ evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
+ (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
+ if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
+ evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
+
+ enter_guest_mode(vcpu);
+
+ if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
+ vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ if (kvm_mpx_supported() &&
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+ vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+
+ vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
+ vmx_segment_cache_clear(vmx);
+
+ if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+ vcpu->arch.tsc_offset += vmcs12->tsc_offset;
+
+ r = EXIT_REASON_INVALID_STATE;
+ if (prepare_vmcs02(vcpu, vmcs12, from_vmentry ? exit_qual : &dummy_exit_qual))
+ goto fail;
+
+ if (from_vmentry) {
+ nested_get_vmcs12_pages(vcpu);
+
+ r = EXIT_REASON_MSR_LOAD_FAIL;
+ *exit_qual = nested_vmx_load_msr(vcpu,
+ vmcs12->vm_entry_msr_load_addr,
+ vmcs12->vm_entry_msr_load_count);
+ if (*exit_qual)
+ goto fail;
+ } else {
+ /*
+ * The MMU is not initialized to point at the right entities yet and
+ * "get pages" would need to read data from the guest (i.e. we will
+ * need to perform gpa to hpa translation). Request a call
+ * to nested_get_vmcs12_pages before the next VM-entry. The MSRs
+ * have already been set at vmentry time and should not be reset.
+ */
+ kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
+ }
+
+ /*
+ * If L1 had a pending IRQ/NMI until it executed
+ * VMLAUNCH/VMRESUME which wasn't delivered because it was
+ * disallowed (e.g. interrupts disabled), L0 needs to
+ * evaluate if this pending event should cause an exit from L2
+ * to L1 or delivered directly to L2 (e.g. In case L1 don't
+ * intercept EXTERNAL_INTERRUPT).
+ *
+ * Usually this would be handled by the processor noticing an
+ * IRQ/NMI window request, or checking RVI during evaluation of
+ * pending virtual interrupts. However, this setting was done
+ * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
+ * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
+ */
+ if (unlikely(evaluate_pending_interrupts))
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
+ * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
+ * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
+ * returned as far as L1 is concerned. It will only return (and set
+ * the success flag) when L2 exits (see nested_vmx_vmexit()).
+ */
+ return 0;
+
+fail:
+ if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+ vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
+ leave_guest_mode(vcpu);
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+ return r;
+}
+
+/*
+ * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
+ * for running an L2 nested guest.
+ */
+static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
+{
+ struct vmcs12 *vmcs12;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
+ u32 exit_qual;
+ int ret;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (!nested_vmx_check_vmcs12(vcpu))
+ goto out;
+
+ vmcs12 = get_vmcs12(vcpu);
+
+ /*
+ * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
+ * that there *is* a valid VMCS pointer, RFLAGS.CF is set
+ * rather than RFLAGS.ZF, and no error number is stored to the
+ * VM-instruction error field.
+ */
+ if (vmcs12->hdr.shadow_vmcs) {
+ nested_vmx_failInvalid(vcpu);
+ goto out;
+ }
+
+ if (enable_shadow_vmcs)
+ copy_shadow_to_vmcs12(vmx);
+
+ /*
+ * The nested entry process starts with enforcing various prerequisites
+ * on vmcs12 as required by the Intel SDM, and act appropriately when
+ * they fail: As the SDM explains, some conditions should cause the
+ * instruction to fail, while others will cause the instruction to seem
+ * to succeed, but return an EXIT_REASON_INVALID_STATE.
+ * To speed up the normal (success) code path, we should avoid checking
+ * for misconfigurations which will anyway be caught by the processor
+ * when using the merged vmcs02.
+ */
+ if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
+ goto out;
+ }
+
+ if (vmcs12->launch_state == launch) {
+ nested_vmx_failValid(vcpu,
+ launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
+ : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
+ goto out;
+ }
+
+ ret = check_vmentry_prereqs(vcpu, vmcs12);
+ if (ret) {
+ nested_vmx_failValid(vcpu, ret);
+ goto out;
+ }
+
+ /*
+ * After this point, the trap flag no longer triggers a singlestep trap
+ * on the vm entry instructions; don't call kvm_skip_emulated_instruction.
+ * This is not 100% correct; for performance reasons, we delegate most
+ * of the checks on host state to the processor. If those fail,
+ * the singlestep trap is missed.
+ */
+ skip_emulated_instruction(vcpu);
+
+ ret = check_vmentry_postreqs(vcpu, vmcs12, &exit_qual);
+ if (ret) {
+ nested_vmx_entry_failure(vcpu, vmcs12,
+ EXIT_REASON_INVALID_STATE, exit_qual);
+ return 1;
+ }
+
+ /*
+ * We're finally done with prerequisite checking, and can start with
+ * the nested entry.
+ */
+
+ vmx->nested.nested_run_pending = 1;
+ ret = enter_vmx_non_root_mode(vcpu, &exit_qual);
+ if (ret) {
+ nested_vmx_entry_failure(vcpu, vmcs12, ret, exit_qual);
+ vmx->nested.nested_run_pending = 0;
+ return 1;
+ }
+
+ /* Hide L1D cache contents from the nested guest. */
+ vmx->vcpu.arch.l1tf_flush_l1d = true;
+
+ /*
+ * Must happen outside of enter_vmx_non_root_mode() as it will
+ * also be used as part of restoring nVMX state for
+ * snapshot restore (migration).
+ *
+ * In this flow, it is assumed that vmcs12 cache was
+ * trasferred as part of captured nVMX state and should
+ * therefore not be read from guest memory (which may not
+ * exist on destination host yet).
+ */
+ nested_cache_shadow_vmcs12(vcpu, vmcs12);
+
+ /*
+ * If we're entering a halted L2 vcpu and the L2 vcpu won't be
+ * awakened by event injection or by an NMI-window VM-exit or
+ * by an interrupt-window VM-exit, halt the vcpu.
+ */
+ if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
+ !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
+ !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) &&
+ !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) &&
+ (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
+ vmx->nested.nested_run_pending = 0;
+ return kvm_vcpu_halt(vcpu);
+ }
+ return 1;
+
+out:
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+/*
+ * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
+ * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK).
+ * This function returns the new value we should put in vmcs12.guest_cr0.
+ * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
+ * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
+ * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
+ * didn't trap the bit, because if L1 did, so would L0).
+ * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
+ * been modified by L2, and L1 knows it. So just leave the old value of
+ * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
+ * isn't relevant, because if L0 traps this bit it can set it to anything.
+ * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
+ * changed these bits, and therefore they need to be updated, but L0
+ * didn't necessarily allow them to be changed in GUEST_CR0 - and rather
+ * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
+ */
+static inline unsigned long
+vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ return
+ /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
+ /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
+ /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
+ vcpu->arch.cr0_guest_owned_bits));
+}
+
+static inline unsigned long
+vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ return
+ /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
+ /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
+ /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
+ vcpu->arch.cr4_guest_owned_bits));
+}
+
+static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ u32 idt_vectoring;
+ unsigned int nr;
+
+ if (vcpu->arch.exception.injected) {
+ nr = vcpu->arch.exception.nr;
+ idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
+
+ if (kvm_exception_is_soft(nr)) {
+ vmcs12->vm_exit_instruction_len =
+ vcpu->arch.event_exit_inst_len;
+ idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
+ } else
+ idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
+
+ if (vcpu->arch.exception.has_error_code) {
+ idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
+ vmcs12->idt_vectoring_error_code =
+ vcpu->arch.exception.error_code;
+ }
+
+ vmcs12->idt_vectoring_info_field = idt_vectoring;
+ } else if (vcpu->arch.nmi_injected) {
+ vmcs12->idt_vectoring_info_field =
+ INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
+ } else if (vcpu->arch.interrupt.injected) {
+ nr = vcpu->arch.interrupt.nr;
+ idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
+
+ if (vcpu->arch.interrupt.soft) {
+ idt_vectoring |= INTR_TYPE_SOFT_INTR;
+ vmcs12->vm_entry_instruction_len =
+ vcpu->arch.event_exit_inst_len;
+ } else
+ idt_vectoring |= INTR_TYPE_EXT_INTR;
+
+ vmcs12->idt_vectoring_info_field = idt_vectoring;
+ }
+}
+
+static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long exit_qual;
+ bool block_nested_events =
+ vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
+
+ if (vcpu->arch.exception.pending &&
+ nested_vmx_check_exception(vcpu, &exit_qual)) {
+ if (block_nested_events)
+ return -EBUSY;
+ nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+ return 0;
+ }
+
+ if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
+ vmx->nested.preemption_timer_expired) {
+ if (block_nested_events)
+ return -EBUSY;
+ nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
+ return 0;
+ }
+
+ if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
+ NMI_VECTOR | INTR_TYPE_NMI_INTR |
+ INTR_INFO_VALID_MASK, 0);
+ /*
+ * The NMI-triggered VM exit counts as injection:
+ * clear this one and block further NMIs.
+ */
+ vcpu->arch.nmi_pending = 0;
+ vmx_set_nmi_mask(vcpu, true);
+ return 0;
+ }
+
+ if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+ return 0;
+ }
+
+ vmx_complete_nested_posted_interrupt(vcpu);
+ return 0;
+}
+
+static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
+{
+ to_vmx(vcpu)->req_immediate_exit = true;
+}
+
+static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
+{
+ ktime_t remaining =
+ hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
+ u64 value;
+
+ if (ktime_to_ns(remaining) <= 0)
+ return 0;
+
+ value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
+ do_div(value, 1000000);
+ return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+}
+
+/*
+ * Update the guest state fields of vmcs12 to reflect changes that
+ * occurred while L2 was running. (The "IA-32e mode guest" bit of the
+ * VM-entry controls is also updated, since this is really a guest
+ * state bit.)
+ */
+static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
+ vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
+
+ vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+ vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
+ vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+
+ vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+ vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+ vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+ vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+ vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+ vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+ vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+ vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+ vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+ vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+ vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+ vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+ vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+ vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+ vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+ vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+ vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+ vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+ vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+ vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+ vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+ vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+ vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+ vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+ vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+ vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+ vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+ vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+ vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+ vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+ vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+ vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+ vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+ vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+ vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+ vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+
+ vmcs12->guest_interruptibility_info =
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+ vmcs12->guest_pending_dbg_exceptions =
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+ if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+ vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
+ else
+ vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
+
+ if (nested_cpu_has_preemption_timer(vmcs12)) {
+ if (vmcs12->vm_exit_controls &
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
+ vmcs12->vmx_preemption_timer_value =
+ vmx_get_preemption_timer_value(vcpu);
+ hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
+ }
+
+ /*
+ * In some cases (usually, nested EPT), L2 is allowed to change its
+ * own CR3 without exiting. If it has changed it, we must keep it.
+ * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
+ * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
+ *
+ * Additionally, restore L2's PDPTR to vmcs12.
+ */
+ if (enable_ept) {
+ vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
+ vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+ vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+ vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+ vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+ }
+
+ vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+
+ if (nested_cpu_has_vid(vmcs12))
+ vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
+
+ vmcs12->vm_entry_controls =
+ (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
+ (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
+
+ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) {
+ kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
+ vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ }
+
+ /* TODO: These cannot have changed unless we have MSR bitmaps and
+ * the relevant bit asks not to trap the change */
+ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
+ vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
+ vmcs12->guest_ia32_efer = vcpu->arch.efer;
+ vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+ vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+ vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+ if (kvm_mpx_supported())
+ vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+}
+
+/*
+ * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
+ * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
+ * and this function updates it to reflect the changes to the guest state while
+ * L2 was running (and perhaps made some exits which were handled directly by L0
+ * without going back to L1), and to reflect the exit reason.
+ * Note that we do not have to copy here all VMCS fields, just those that
+ * could have changed by the L2 guest or the exit - i.e., the guest-state and
+ * exit-information fields only. Other fields are modified by L1 with VMWRITE,
+ * which already writes to vmcs12 directly.
+ */
+static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ u32 exit_reason, u32 exit_intr_info,
+ unsigned long exit_qualification)
+{
+ /* update guest state fields: */
+ sync_vmcs12(vcpu, vmcs12);
+
+ /* update exit information fields: */
+
+ vmcs12->vm_exit_reason = exit_reason;
+ vmcs12->exit_qualification = exit_qualification;
+ vmcs12->vm_exit_intr_info = exit_intr_info;
+
+ vmcs12->idt_vectoring_info_field = 0;
+ vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+
+ if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
+ vmcs12->launch_state = 1;
+
+ /* vm_entry_intr_info_field is cleared on exit. Emulate this
+ * instead of reading the real value. */
+ vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
+
+ /*
+ * Transfer the event that L0 or L1 may wanted to inject into
+ * L2 to IDT_VECTORING_INFO_FIELD.
+ */
+ vmcs12_save_pending_event(vcpu, vmcs12);
+ }
+
+ /*
+ * Drop what we picked up for L2 via vmx_complete_interrupts. It is
+ * preserved above and would only end up incorrectly in L1.
+ */
+ vcpu->arch.nmi_injected = false;
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
+}
+
+/*
+ * A part of what we need to when the nested L2 guest exits and we want to
+ * run its L1 parent, is to reset L1's guest state to the host state specified
+ * in vmcs12.
+ * This function is to be called not only on normal nested exit, but also on
+ * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
+ * Failures During or After Loading Guest State").
+ * This function should be called when the active VMCS is L1's (vmcs01).
+ */
+static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct kvm_segment seg;
+ u32 entry_failure_code;
+
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
+ vcpu->arch.efer = vmcs12->host_ia32_efer;
+ else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
+ vcpu->arch.efer |= (EFER_LMA | EFER_LME);
+ else
+ vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
+ vmx_set_efer(vcpu, vcpu->arch.efer);
+
+ kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
+ kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
+ vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
+ /*
+ * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
+ * actually changed, because vmx_set_cr0 refers to efer set above.
+ *
+ * CR0_GUEST_HOST_MASK is already set in the original vmcs01
+ * (KVM doesn't change it);
+ */
+ vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+ vmx_set_cr0(vcpu, vmcs12->host_cr0);
+
+ /* Same as above - no reason to call set_cr4_guest_host_mask(). */
+ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
+ vmx_set_cr4(vcpu, vmcs12->host_cr4);
+
+ nested_ept_uninit_mmu_context(vcpu);
+
+ /*
+ * Only PDPTE load can fail as the value of cr3 was checked on entry and
+ * couldn't have changed.
+ */
+ if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
+ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
+
+ if (!enable_ept)
+ vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
+
+ /*
+ * If vmcs01 don't use VPID, CPU flushes TLB on every
+ * VMEntry/VMExit. Thus, no need to flush TLB.
+ *
+ * If vmcs12 uses VPID, TLB entries populated by L2 are
+ * tagged with vmx->nested.vpid02 while L1 entries are tagged
+ * with vmx->vpid. Thus, no need to flush TLB.
+ *
+ * Therefore, flush TLB only in case vmcs01 uses VPID and
+ * vmcs12 don't use VPID as in this case L1 & L2 TLB entries
+ * are both tagged with vmx->vpid.
+ */
+ if (enable_vpid &&
+ !(nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02)) {
+ vmx_flush_tlb(vcpu, true);
+ }
+
+ vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
+ vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
+ vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
+ vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
+ vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
+ vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
+ vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
+
+ /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
+ if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
+ vmcs_write64(GUEST_BNDCFGS, 0);
+
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
+ vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
+ vcpu->arch.pat = vmcs12->host_ia32_pat;
+ }
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+ vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
+ vmcs12->host_ia32_perf_global_ctrl);
+
+ /* Set L1 segment info according to Intel SDM
+ 27.5.2 Loading Host Segment and Descriptor-Table Registers */
+ seg = (struct kvm_segment) {
+ .base = 0,
+ .limit = 0xFFFFFFFF,
+ .selector = vmcs12->host_cs_selector,
+ .type = 11,
+ .present = 1,
+ .s = 1,
+ .g = 1
+ };
+ if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
+ seg.l = 1;
+ else
+ seg.db = 1;
+ vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
+ seg = (struct kvm_segment) {
+ .base = 0,
+ .limit = 0xFFFFFFFF,
+ .type = 3,
+ .present = 1,
+ .s = 1,
+ .db = 1,
+ .g = 1
+ };
+ seg.selector = vmcs12->host_ds_selector;
+ vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
+ seg.selector = vmcs12->host_es_selector;
+ vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
+ seg.selector = vmcs12->host_ss_selector;
+ vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
+ seg.selector = vmcs12->host_fs_selector;
+ seg.base = vmcs12->host_fs_base;
+ vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
+ seg.selector = vmcs12->host_gs_selector;
+ seg.base = vmcs12->host_gs_base;
+ vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
+ seg = (struct kvm_segment) {
+ .base = vmcs12->host_tr_base,
+ .limit = 0x67,
+ .selector = vmcs12->host_tr_selector,
+ .type = 11,
+ .present = 1
+ };
+ vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
+
+ kvm_set_dr(vcpu, 7, 0x400);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmx_update_msr_bitmap(vcpu);
+
+ if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
+ vmcs12->vm_exit_msr_load_count))
+ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
+}
+
+static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
+{
+ struct shared_msr_entry *efer_msr;
+ unsigned int i;
+
+ if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
+ return vmcs_read64(GUEST_IA32_EFER);
+
+ if (cpu_has_load_ia32_efer)
+ return host_efer;
+
+ for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
+ if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
+ return vmx->msr_autoload.guest.val[i].value;
+ }
+
+ efer_msr = find_msr_entry(vmx, MSR_EFER);
+ if (efer_msr)
+ return efer_msr->data;
+
+ return host_efer;
+}
+
+static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmx_msr_entry g, h;
+ struct msr_data msr;
+ gpa_t gpa;
+ u32 i, j;
+
+ vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
+
+ if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
+ /*
+ * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
+ * as vmcs01.GUEST_DR7 contains a userspace defined value
+ * and vcpu->arch.dr7 is not squirreled away before the
+ * nested VMENTER (not worth adding a variable in nested_vmx).
+ */
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ kvm_set_dr(vcpu, 7, DR7_FIXED_1);
+ else
+ WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
+ }
+
+ /*
+ * Note that calling vmx_set_{efer,cr0,cr4} is important as they
+ * handle a variety of side effects to KVM's software model.
+ */
+ vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
+
+ vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+ vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
+
+ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
+ vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
+
+ nested_ept_uninit_mmu_context(vcpu);
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+
+ /*
+ * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
+ * from vmcs01 (if necessary). The PDPTRs are not loaded on
+ * VMFail, like everything else we just need to ensure our
+ * software model is up-to-date.
+ */
+ ept_save_pdptrs(vcpu);
+
+ kvm_mmu_reset_context(vcpu);
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmx_update_msr_bitmap(vcpu);
+
+ /*
+ * This nasty bit of open coding is a compromise between blindly
+ * loading L1's MSRs using the exit load lists (incorrect emulation
+ * of VMFail), leaving the nested VM's MSRs in the software model
+ * (incorrect behavior) and snapshotting the modified MSRs (too
+ * expensive since the lists are unbound by hardware). For each
+ * MSR that was (prematurely) loaded from the nested VMEntry load
+ * list, reload it from the exit load list if it exists and differs
+ * from the guest value. The intent is to stuff host state as
+ * silently as possible, not to fully process the exit load list.
+ */
+ msr.host_initiated = false;
+ for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
+ gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
+ if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
+ pr_debug_ratelimited(
+ "%s read MSR index failed (%u, 0x%08llx)\n",
+ __func__, i, gpa);
+ goto vmabort;
+ }
+
+ for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
+ gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
+ if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
+ pr_debug_ratelimited(
+ "%s read MSR failed (%u, 0x%08llx)\n",
+ __func__, j, gpa);
+ goto vmabort;
+ }
+ if (h.index != g.index)
+ continue;
+ if (h.value == g.value)
+ break;
+
+ if (nested_vmx_load_msr_check(vcpu, &h)) {
+ pr_debug_ratelimited(
+ "%s check failed (%u, 0x%x, 0x%x)\n",
+ __func__, j, h.index, h.reserved);
+ goto vmabort;
+ }
+
+ msr.index = h.index;
+ msr.data = h.value;
+ if (kvm_set_msr(vcpu, &msr)) {
+ pr_debug_ratelimited(
+ "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
+ __func__, j, h.index, h.value);
+ goto vmabort;
+ }
+ }
+ }
+
+ return;
+
+vmabort:
+ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
+}
+
+/*
+ * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
+ * and modify vmcs12 to make it see what it would expect to see there if
+ * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
+ */
+static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
+ u32 exit_intr_info,
+ unsigned long exit_qualification)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ /* trying to cancel vmlaunch/vmresume is a bug */
+ WARN_ON_ONCE(vmx->nested.nested_run_pending);
+
+ /*
+ * The only expected VM-instruction error is "VM entry with
+ * invalid control field(s)." Anything else indicates a
+ * problem with L0.
+ */
+ WARN_ON_ONCE(vmx->fail && (vmcs_read32(VM_INSTRUCTION_ERROR) !=
+ VMXERR_ENTRY_INVALID_CONTROL_FIELD));
+
+ leave_guest_mode(vcpu);
+
+ if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+ vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
+
+ if (likely(!vmx->fail)) {
+ if (exit_reason == -1)
+ sync_vmcs12(vcpu, vmcs12);
+ else
+ prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
+ exit_qualification);
+
+ /*
+ * Must happen outside of sync_vmcs12() as it will
+ * also be used to capture vmcs12 cache as part of
+ * capturing nVMX state for snapshot (migration).
+ *
+ * Otherwise, this flush will dirty guest memory at a
+ * point it is already assumed by user-space to be
+ * immutable.
+ */
+ nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
+
+ if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
+ vmcs12->vm_exit_msr_store_count))
+ nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
+ }
+
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+ vm_entry_controls_reset_shadow(vmx);
+ vm_exit_controls_reset_shadow(vmx);
+ vmx_segment_cache_clear(vmx);
+
+ /* Update any VMCS fields that might have changed while L2 ran */
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+ vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+
+ if (kvm_has_tsc_control)
+ decache_tsc_multiplier(vmx);
+
+ if (vmx->nested.change_vmcs01_virtual_apic_mode) {
+ vmx->nested.change_vmcs01_virtual_apic_mode = false;
+ vmx_set_virtual_apic_mode(vcpu);
+ } else if (!nested_cpu_has_ept(vmcs12) &&
+ nested_cpu_has2(vmcs12,
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+ vmx_flush_tlb(vcpu, true);
+ }
+
+ /* This is needed for same reason as it was needed in prepare_vmcs02 */
+ vmx->host_rsp = 0;
+
+ /* Unpin physical memory we referred to in vmcs02 */
+ if (vmx->nested.apic_access_page) {
+ kvm_release_page_dirty(vmx->nested.apic_access_page);
+ vmx->nested.apic_access_page = NULL;
+ }
+ if (vmx->nested.virtual_apic_page) {
+ kvm_release_page_dirty(vmx->nested.virtual_apic_page);
+ vmx->nested.virtual_apic_page = NULL;
+ }
+ if (vmx->nested.pi_desc_page) {
+ kunmap(vmx->nested.pi_desc_page);
+ kvm_release_page_dirty(vmx->nested.pi_desc_page);
+ vmx->nested.pi_desc_page = NULL;
+ vmx->nested.pi_desc = NULL;
+ }
+
+ /*
+ * We are now running in L2, mmu_notifier will force to reload the
+ * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1.
+ */
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+
+ if (enable_shadow_vmcs && exit_reason != -1)
+ vmx->nested.sync_shadow_vmcs = true;
+
+ /* in case we halted in L2 */
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+ if (likely(!vmx->fail)) {
+ if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+ nested_exit_intr_ack_set(vcpu)) {
+ int irq = kvm_cpu_get_interrupt(vcpu);
+ WARN_ON(irq < 0);
+ vmcs12->vm_exit_intr_info = irq |
+ INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
+ }
+
+ if (exit_reason != -1)
+ trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
+ vmcs12->exit_qualification,
+ vmcs12->idt_vectoring_info_field,
+ vmcs12->vm_exit_intr_info,
+ vmcs12->vm_exit_intr_error_code,
+ KVM_ISA_VMX);
+
+ load_vmcs12_host_state(vcpu, vmcs12);
+
+ return;
+ }
+
+ /*
+ * After an early L2 VM-entry failure, we're now back
+ * in L1 which thinks it just finished a VMLAUNCH or
+ * VMRESUME instruction, so we need to set the failure
+ * flag and the VM-instruction error field of the VMCS
+ * accordingly.
+ */
+ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+
+ /*
+ * Restore L1's host state to KVM's software model. We're here
+ * because a consistency check was caught by hardware, which
+ * means some amount of guest state has been propagated to KVM's
+ * model and needs to be unwound to the host's state.
+ */
+ nested_vmx_restore_host_state(vcpu);
+
+ /*
+ * The emulated instruction was already skipped in
+ * nested_vmx_run, but the updated RIP was never
+ * written back to the vmcs01.
+ */
+ skip_emulated_instruction(vcpu);
+ vmx->fail = 0;
+}
+
+/*
+ * Forcibly leave nested mode in order to be able to reset the VCPU later on.
+ */
+static void vmx_leave_nested(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu)) {
+ to_vmx(vcpu)->nested.nested_run_pending = 0;
+ nested_vmx_vmexit(vcpu, -1, 0, 0);
+ }
+ free_nested(to_vmx(vcpu));
+}
+
+/*
+ * L1's failure to enter L2 is a subset of a normal exit, as explained in
+ * 23.7 "VM-entry failures during or after loading guest state" (this also
+ * lists the acceptable exit-reason and exit-qualification parameters).
+ * It should only be called before L2 actually succeeded to run, and when
+ * vmcs01 is current (it doesn't leave_guest_mode() or switch vmcss).
+ */
+static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12,
+ u32 reason, unsigned long qualification)
+{
+ load_vmcs12_host_state(vcpu, vmcs12);
+ vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
+ vmcs12->exit_qualification = qualification;
+ nested_vmx_succeed(vcpu);
+ if (enable_shadow_vmcs)
+ to_vmx(vcpu)->nested.sync_shadow_vmcs = true;
+}
+
+static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned short port;
+ bool intercept;
+ int size;
+
+ if (info->intercept == x86_intercept_in ||
+ info->intercept == x86_intercept_ins) {
+ port = info->src_val;
+ size = info->dst_bytes;
+ } else {
+ port = info->dst_val;
+ size = info->src_bytes;
+ }
+
+ /*
+ * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
+ * VM-exits depend on the 'unconditional IO exiting' VM-execution
+ * control.
+ *
+ * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
+ */
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ intercept = nested_cpu_has(vmcs12,
+ CPU_BASED_UNCOND_IO_EXITING);
+ else
+ intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
+
+ /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
+ return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
+}
+
+static int vmx_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+
+ switch (info->intercept) {
+ /*
+ * RDPID causes #UD if disabled through secondary execution controls.
+ * Because it is marked as EmulateOnUD, we need to intercept it here.
+ */
+ case x86_intercept_rdtscp:
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
+ ctxt->exception.vector = UD_VECTOR;
+ ctxt->exception.error_code_valid = false;
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ break;
+
+ case x86_intercept_in:
+ case x86_intercept_ins:
+ case x86_intercept_out:
+ case x86_intercept_outs:
+ return vmx_check_intercept_io(vcpu, info);
+
+ case x86_intercept_lgdt:
+ case x86_intercept_lidt:
+ case x86_intercept_lldt:
+ case x86_intercept_ltr:
+ case x86_intercept_sgdt:
+ case x86_intercept_sidt:
+ case x86_intercept_sldt:
+ case x86_intercept_str:
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
+ return X86EMUL_CONTINUE;
+
+ /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
+ break;
+
+ /* TODO: check more intercepts... */
+ default:
+ break;
+ }
+
+ return X86EMUL_UNHANDLEABLE;
+}
+
+#ifdef CONFIG_X86_64
+/* (a << shift) / divisor, return 1 if overflow otherwise 0 */
+static inline int u64_shl_div_u64(u64 a, unsigned int shift,
+ u64 divisor, u64 *result)
+{
+ u64 low = a << shift, high = a >> (64 - shift);
+
+ /* To avoid the overflow on divq */
+ if (high >= divisor)
+ return 1;
+
+ /* Low hold the result, high hold rem which is discarded */
+ asm("divq %2\n\t" : "=a" (low), "=d" (high) :
+ "rm" (divisor), "0" (low), "1" (high));
+ *result = low;
+
+ return 0;
+}
+
+static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
+{
+ struct vcpu_vmx *vmx;
+ u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
+
+ if (kvm_mwait_in_guest(vcpu->kvm))
+ return -EOPNOTSUPP;
+
+ vmx = to_vmx(vcpu);
+ tscl = rdtsc();
+ guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+ delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
+ lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns);
+
+ if (delta_tsc > lapic_timer_advance_cycles)
+ delta_tsc -= lapic_timer_advance_cycles;
+ else
+ delta_tsc = 0;
+
+ /* Convert to host delta tsc if tsc scaling is enabled */
+ if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
+ u64_shl_div_u64(delta_tsc,
+ kvm_tsc_scaling_ratio_frac_bits,
+ vcpu->arch.tsc_scaling_ratio,
+ &delta_tsc))
+ return -ERANGE;
+
+ /*
+ * If the delta tsc can't fit in the 32 bit after the multi shift,
+ * we can't use the preemption timer.
+ * It's possible that it fits on later vmentries, but checking
+ * on every vmentry is costly so we just use an hrtimer.
+ */
+ if (delta_tsc >> (cpu_preemption_timer_multi + 32))
+ return -ERANGE;
+
+ vmx->hv_deadline_tsc = tscl + delta_tsc;
+ return delta_tsc == 0;
+}
+
+static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+{
+ to_vmx(vcpu)->hv_deadline_tsc = -1;
+}
+#endif
+
+static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+ if (!kvm_pause_in_guest(vcpu->kvm))
+ shrink_ple_window(vcpu);
+}
+
+static void vmx_slot_enable_log_dirty(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
+ kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
+}
+
+static void vmx_slot_disable_log_dirty(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ kvm_mmu_slot_set_dirty(kvm, slot);
+}
+
+static void vmx_flush_log_dirty(struct kvm *kvm)
+{
+ kvm_flush_pml_buffers(kvm);
+}
+
+static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ struct vmcs12 *vmcs12;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct page *page = NULL;
+ u64 *pml_address;
+
+ if (is_guest_mode(vcpu)) {
+ WARN_ON_ONCE(vmx->nested.pml_full);
+
+ /*
+ * Check if PML is enabled for the nested guest.
+ * Whether eptp bit 6 is set is already checked
+ * as part of A/D emulation.
+ */
+ vmcs12 = get_vmcs12(vcpu);
+ if (!nested_cpu_has_pml(vmcs12))
+ return 0;
+
+ if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
+ vmx->nested.pml_full = true;
+ return 1;
+ }
+
+ gpa &= ~0xFFFull;
+
+ page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address);
+ if (is_error_page(page))
+ return 0;
+
+ pml_address = kmap(page);
+ pml_address[vmcs12->guest_pml_index--] = gpa;
+ kunmap(page);
+ kvm_release_page_clean(page);
+ }
+
+ return 0;
+}
+
+static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
+ gfn_t offset, unsigned long mask)
+{
+ kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
+}
+
+static void __pi_post_block(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old, new;
+ unsigned int dest;
+
+ do {
+ old.control = new.control = pi_desc->control;
+ WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
+ "Wakeup handler not enabled while the VCPU is blocked\n");
+
+ dest = cpu_physical_id(vcpu->cpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ /* set 'NV' to 'notification vector' */
+ new.nv = POSTED_INTR_VECTOR;
+ } while (cmpxchg64(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+ if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
+ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ list_del(&vcpu->blocked_vcpu_list);
+ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ vcpu->pre_pcpu = -1;
+ }
+}
+
+/*
+ * This routine does the following things for vCPU which is going
+ * to be blocked if VT-d PI is enabled.
+ * - Store the vCPU to the wakeup list, so when interrupts happen
+ * we can find the right vCPU to wake up.
+ * - Change the Posted-interrupt descriptor as below:
+ * 'NDST' <-- vcpu->pre_pcpu
+ * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
+ * - If 'ON' is set during this process, which means at least one
+ * interrupt is posted for this vCPU, we cannot block it, in
+ * this case, return 1, otherwise, return 0.
+ *
+ */
+static int pi_pre_block(struct kvm_vcpu *vcpu)
+{
+ unsigned int dest;
+ struct pi_desc old, new;
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP) ||
+ !kvm_vcpu_apicv_active(vcpu))
+ return 0;
+
+ WARN_ON(irqs_disabled());
+ local_irq_disable();
+ if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
+ vcpu->pre_pcpu = vcpu->cpu;
+ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ list_add_tail(&vcpu->blocked_vcpu_list,
+ &per_cpu(blocked_vcpu_on_cpu,
+ vcpu->pre_pcpu));
+ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ }
+
+ do {
+ old.control = new.control = pi_desc->control;
+
+ WARN((pi_desc->sn == 1),
+ "Warning: SN field of posted-interrupts "
+ "is set before blocking\n");
+
+ /*
+ * Since vCPU can be preempted during this process,
+ * vcpu->cpu could be different with pre_pcpu, we
+ * need to set pre_pcpu as the destination of wakeup
+ * notification event, then we can find the right vCPU
+ * to wakeup in wakeup handler if interrupts happen
+ * when the vCPU is in blocked state.
+ */
+ dest = cpu_physical_id(vcpu->pre_pcpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ /* set 'NV' to 'wakeup vector' */
+ new.nv = POSTED_INTR_WAKEUP_VECTOR;
+ } while (cmpxchg64(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+ /* We should not block the vCPU if an interrupt is posted for it. */
+ if (pi_test_on(pi_desc) == 1)
+ __pi_post_block(vcpu);
+
+ local_irq_enable();
+ return (vcpu->pre_pcpu == -1);
+}
+
+static int vmx_pre_block(struct kvm_vcpu *vcpu)
+{
+ if (pi_pre_block(vcpu))
+ return 1;
+
+ if (kvm_lapic_hv_timer_in_use(vcpu))
+ kvm_lapic_switch_to_sw_timer(vcpu);
+
+ return 0;
+}
+
+static void pi_post_block(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->pre_pcpu == -1)
+ return;
+
+ WARN_ON(irqs_disabled());
+ local_irq_disable();
+ __pi_post_block(vcpu);
+ local_irq_enable();
+}
+
+static void vmx_post_block(struct kvm_vcpu *vcpu)
+{
+ if (kvm_x86_ops->set_hv_timer)
+ kvm_lapic_switch_to_hv_timer(vcpu);
+
+ pi_post_block(vcpu);
+}
+
+/*
+ * vmx_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ struct kvm_kernel_irq_routing_entry *e;
+ struct kvm_irq_routing_table *irq_rt;
+ struct kvm_lapic_irq irq;
+ struct kvm_vcpu *vcpu;
+ struct vcpu_data vcpu_info;
+ int idx, ret = 0;
+
+ if (!kvm_arch_has_assigned_device(kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP) ||
+ !kvm_vcpu_apicv_active(kvm->vcpus[0]))
+ return 0;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ if (guest_irq >= irq_rt->nr_rt_entries ||
+ hlist_empty(&irq_rt->map[guest_irq])) {
+ pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
+ guest_irq, irq_rt->nr_rt_entries);
+ goto out;
+ }
+
+ hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+ if (e->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+ /*
+ * VT-d PI cannot support posting multicast/broadcast
+ * interrupts to a vCPU, we still use interrupt remapping
+ * for these kind of interrupts.
+ *
+ * For lowest-priority interrupts, we only support
+ * those with single CPU as the destination, e.g. user
+ * configures the interrupts via /proc/irq or uses
+ * irqbalance to make the interrupts single-CPU.
+ *
+ * We will support full lowest-priority interrupt later.
+ */
+
+ kvm_set_msi_irq(kvm, e, &irq);
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+ /*
+ * Make sure the IRTE is in remapped mode if
+ * we don't handle it in posted mode.
+ */
+ ret = irq_set_vcpu_affinity(host_irq, NULL);
+ if (ret < 0) {
+ printk(KERN_INFO
+ "failed to back to remapped mode, irq: %u\n",
+ host_irq);
+ goto out;
+ }
+
+ continue;
+ }
+
+ vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
+ vcpu_info.vector = irq.vector;
+
+ trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
+ vcpu_info.vector, vcpu_info.pi_desc_addr, set);
+
+ if (set)
+ ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
+ else
+ ret = irq_set_vcpu_affinity(host_irq, NULL);
+
+ if (ret < 0) {
+ printk(KERN_INFO "%s: failed to update PI IRTE\n",
+ __func__);
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+ return ret;
+}
+
+static void vmx_setup_mce(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.mcg_cap & MCG_LMCE_P)
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+ FEATURE_CONTROL_LMCE;
+ else
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+ ~FEATURE_CONTROL_LMCE;
+}
+
+static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
+{
+ /* we need a nested vmexit to enter SMM, postpone if run is pending */
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return 0;
+ return 1;
+}
+
+static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
+ if (vmx->nested.smm.guest_mode)
+ nested_vmx_vmexit(vcpu, -1, 0, 0);
+
+ vmx->nested.smm.vmxon = vmx->nested.vmxon;
+ vmx->nested.vmxon = false;
+ vmx_clear_hlt(vcpu);
+ return 0;
+}
+
+static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int ret;
+
+ if (vmx->nested.smm.vmxon) {
+ vmx->nested.vmxon = true;
+ vmx->nested.smm.vmxon = false;
+ }
+
+ if (vmx->nested.smm.guest_mode) {
+ vcpu->arch.hflags &= ~HF_SMM_MASK;
+ ret = enter_vmx_non_root_mode(vcpu, NULL);
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ if (ret)
+ return ret;
+
+ vmx->nested.smm.guest_mode = false;
+ }
+ return 0;
+}
+
+static int enable_smi_window(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ u32 user_data_size)
+{
+ struct vcpu_vmx *vmx;
+ struct vmcs12 *vmcs12;
+ struct kvm_nested_state kvm_state = {
+ .flags = 0,
+ .format = 0,
+ .size = sizeof(kvm_state),
+ .vmx.vmxon_pa = -1ull,
+ .vmx.vmcs_pa = -1ull,
+ };
+
+ if (!vcpu)
+ return kvm_state.size + 2 * VMCS12_SIZE;
+
+ vmx = to_vmx(vcpu);
+ vmcs12 = get_vmcs12(vcpu);
+ if (nested_vmx_allowed(vcpu) &&
+ (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
+ kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
+ kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;
+
+ if (vmx->nested.current_vmptr != -1ull) {
+ kvm_state.size += VMCS12_SIZE;
+
+ if (is_guest_mode(vcpu) &&
+ nested_cpu_has_shadow_vmcs(vmcs12) &&
+ vmcs12->vmcs_link_pointer != -1ull)
+ kvm_state.size += VMCS12_SIZE;
+ }
+
+ if (vmx->nested.smm.vmxon)
+ kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
+
+ if (vmx->nested.smm.guest_mode)
+ kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
+
+ if (is_guest_mode(vcpu)) {
+ kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
+
+ if (vmx->nested.nested_run_pending)
+ kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
+ }
+ }
+
+ if (user_data_size < kvm_state.size)
+ goto out;
+
+ if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
+ return -EFAULT;
+
+ if (vmx->nested.current_vmptr == -1ull)
+ goto out;
+
+ /*
+ * When running L2, the authoritative vmcs12 state is in the
+ * vmcs02. When running L1, the authoritative vmcs12 state is
+ * in the shadow vmcs linked to vmcs01, unless
+ * sync_shadow_vmcs is set, in which case, the authoritative
+ * vmcs12 state is in the vmcs12 already.
+ */
+ if (is_guest_mode(vcpu))
+ sync_vmcs12(vcpu, vmcs12);
+ else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs)
+ copy_shadow_to_vmcs12(vmx);
+
+ /*
+ * Copy over the full allocated size of vmcs12 rather than just the size
+ * of the struct.
+ */
+ if (copy_to_user(user_kvm_nested_state->data, vmcs12, VMCS12_SIZE))
+ return -EFAULT;
+
+ if (nested_cpu_has_shadow_vmcs(vmcs12) &&
+ vmcs12->vmcs_link_pointer != -1ull) {
+ if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE,
+ get_shadow_vmcs12(vcpu), VMCS12_SIZE))
+ return -EFAULT;
+ }
+
+out:
+ return kvm_state.size;
+}
+
+static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ struct kvm_nested_state *kvm_state)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12;
+ u32 exit_qual;
+ int ret;
+
+ if (kvm_state->format != 0)
+ return -EINVAL;
+
+ if (!nested_vmx_allowed(vcpu))
+ return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
+
+ if (kvm_state->vmx.vmxon_pa == -1ull) {
+ if (kvm_state->vmx.smm.flags)
+ return -EINVAL;
+
+ if (kvm_state->vmx.vmcs_pa != -1ull)
+ return -EINVAL;
+
+ vmx_leave_nested(vcpu);
+ return 0;
+ }
+
+ if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa))
+ return -EINVAL;
+
+ if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
+ (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
+ return -EINVAL;
+
+ if (kvm_state->vmx.smm.flags &
+ ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
+ return -EINVAL;
+
+ /*
+ * SMM temporarily disables VMX, so we cannot be in guest mode,
+ * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
+ * must be zero.
+ */
+ if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags)
+ return -EINVAL;
+
+ if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
+ !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
+ return -EINVAL;
+
+ vmx_leave_nested(vcpu);
+ if (kvm_state->vmx.vmxon_pa == -1ull)
+ return 0;
+
+ vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa;
+ ret = enter_vmx_operation(vcpu);
+ if (ret)
+ return ret;
+
+ /* Empty 'VMXON' state is permitted */
+ if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12))
+ return 0;
+
+ if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
+ !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
+ return -EINVAL;
+
+ set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
+
+ if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
+ vmx->nested.smm.vmxon = true;
+ vmx->nested.vmxon = false;
+
+ if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
+ vmx->nested.smm.guest_mode = true;
+ }
+
+ vmcs12 = get_vmcs12(vcpu);
+ if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12)))
+ return -EFAULT;
+
+ if (vmcs12->hdr.revision_id != VMCS12_REVISION)
+ return -EINVAL;
+
+ if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
+ return 0;
+
+ vmx->nested.nested_run_pending =
+ !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
+
+ if (nested_cpu_has_shadow_vmcs(vmcs12) &&
+ vmcs12->vmcs_link_pointer != -1ull) {
+ struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
+ if (kvm_state->size < sizeof(*kvm_state) + 2 * sizeof(*vmcs12))
+ return -EINVAL;
+
+ if (copy_from_user(shadow_vmcs12,
+ user_kvm_nested_state->data + VMCS12_SIZE,
+ sizeof(*vmcs12)))
+ return -EFAULT;
+
+ if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
+ !shadow_vmcs12->hdr.shadow_vmcs)
+ return -EINVAL;
+ }
+
+ if (check_vmentry_prereqs(vcpu, vmcs12) ||
+ check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
+ return -EINVAL;
+
+ vmx->nested.dirty_vmcs12 = true;
+ ret = enter_vmx_non_root_mode(vcpu, NULL);
+ if (ret)
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
+ .cpu_has_kvm_support = cpu_has_kvm_support,
+ .disabled_by_bios = vmx_disabled_by_bios,
+ .hardware_setup = hardware_setup,
+ .hardware_unsetup = hardware_unsetup,
+ .check_processor_compatibility = vmx_check_processor_compat,
+ .hardware_enable = hardware_enable,
+ .hardware_disable = hardware_disable,
+ .cpu_has_accelerated_tpr = report_flexpriority,
+ .has_emulated_msr = vmx_has_emulated_msr,
+
+ .vm_init = vmx_vm_init,
+ .vm_alloc = vmx_vm_alloc,
+ .vm_free = vmx_vm_free,
+
+ .vcpu_create = vmx_create_vcpu,
+ .vcpu_free = vmx_free_vcpu,
+ .vcpu_reset = vmx_vcpu_reset,
+
+ .prepare_guest_switch = vmx_prepare_switch_to_guest,
+ .vcpu_load = vmx_vcpu_load,
+ .vcpu_put = vmx_vcpu_put,
+
+ .update_bp_intercept = update_exception_bitmap,
+ .get_msr_feature = vmx_get_msr_feature,
+ .get_msr = vmx_get_msr,
+ .set_msr = vmx_set_msr,
+ .get_segment_base = vmx_get_segment_base,
+ .get_segment = vmx_get_segment,
+ .set_segment = vmx_set_segment,
+ .get_cpl = vmx_get_cpl,
+ .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
+ .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
+ .decache_cr3 = vmx_decache_cr3,
+ .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
+ .set_cr0 = vmx_set_cr0,
+ .set_cr3 = vmx_set_cr3,
+ .set_cr4 = vmx_set_cr4,
+ .set_efer = vmx_set_efer,
+ .get_idt = vmx_get_idt,
+ .set_idt = vmx_set_idt,
+ .get_gdt = vmx_get_gdt,
+ .set_gdt = vmx_set_gdt,
+ .get_dr6 = vmx_get_dr6,
+ .set_dr6 = vmx_set_dr6,
+ .set_dr7 = vmx_set_dr7,
+ .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
+ .cache_reg = vmx_cache_reg,
+ .get_rflags = vmx_get_rflags,
+ .set_rflags = vmx_set_rflags,
+
+ .tlb_flush = vmx_flush_tlb,
+ .tlb_flush_gva = vmx_flush_tlb_gva,
+
+ .run = vmx_vcpu_run,
+ .handle_exit = vmx_handle_exit,
+ .skip_emulated_instruction = skip_emulated_instruction,
+ .set_interrupt_shadow = vmx_set_interrupt_shadow,
+ .get_interrupt_shadow = vmx_get_interrupt_shadow,
+ .patch_hypercall = vmx_patch_hypercall,
+ .set_irq = vmx_inject_irq,
+ .set_nmi = vmx_inject_nmi,
+ .queue_exception = vmx_queue_exception,
+ .cancel_injection = vmx_cancel_injection,
+ .interrupt_allowed = vmx_interrupt_allowed,
+ .nmi_allowed = vmx_nmi_allowed,
+ .get_nmi_mask = vmx_get_nmi_mask,
+ .set_nmi_mask = vmx_set_nmi_mask,
+ .enable_nmi_window = enable_nmi_window,
+ .enable_irq_window = enable_irq_window,
+ .update_cr8_intercept = update_cr8_intercept,
+ .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
+ .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
+ .get_enable_apicv = vmx_get_enable_apicv,
+ .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
+ .load_eoi_exitmap = vmx_load_eoi_exitmap,
+ .apicv_post_state_restore = vmx_apicv_post_state_restore,
+ .hwapic_irr_update = vmx_hwapic_irr_update,
+ .hwapic_isr_update = vmx_hwapic_isr_update,
+ .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
+ .sync_pir_to_irr = vmx_sync_pir_to_irr,
+ .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
+ .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt,
+
+ .set_tss_addr = vmx_set_tss_addr,
+ .set_identity_map_addr = vmx_set_identity_map_addr,
+ .get_tdp_level = get_ept_level,
+ .get_mt_mask = vmx_get_mt_mask,
+
+ .get_exit_info = vmx_get_exit_info,
+
+ .get_lpage_level = vmx_get_lpage_level,
+
+ .cpuid_update = vmx_cpuid_update,
+
+ .rdtscp_supported = vmx_rdtscp_supported,
+ .invpcid_supported = vmx_invpcid_supported,
+
+ .set_supported_cpuid = vmx_set_supported_cpuid,
+
+ .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
+
+ .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
+ .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
+
+ .set_tdp_cr3 = vmx_set_cr3,
+
+ .check_intercept = vmx_check_intercept,
+ .handle_external_intr = vmx_handle_external_intr,
+ .mpx_supported = vmx_mpx_supported,
+ .xsaves_supported = vmx_xsaves_supported,
+ .umip_emulated = vmx_umip_emulated,
+
+ .check_nested_events = vmx_check_nested_events,
+ .request_immediate_exit = vmx_request_immediate_exit,
+
+ .sched_in = vmx_sched_in,
+
+ .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
+ .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
+ .flush_log_dirty = vmx_flush_log_dirty,
+ .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
+ .write_log_dirty = vmx_write_pml_buffer,
+
+ .pre_block = vmx_pre_block,
+ .post_block = vmx_post_block,
+
+ .pmu_ops = &intel_pmu_ops,
+
+ .update_pi_irte = vmx_update_pi_irte,
+
+#ifdef CONFIG_X86_64
+ .set_hv_timer = vmx_set_hv_timer,
+ .cancel_hv_timer = vmx_cancel_hv_timer,
+#endif
+
+ .setup_mce = vmx_setup_mce,
+
+ .get_nested_state = vmx_get_nested_state,
+ .set_nested_state = vmx_set_nested_state,
+ .get_vmcs12_pages = nested_get_vmcs12_pages,
+
+ .smi_allowed = vmx_smi_allowed,
+ .pre_enter_smm = vmx_pre_enter_smm,
+ .pre_leave_smm = vmx_pre_leave_smm,
+ .enable_smi_window = enable_smi_window,
+};
+
+static void vmx_cleanup_l1d_flush(void)
+{
+ if (vmx_l1d_flush_pages) {
+ free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
+ vmx_l1d_flush_pages = NULL;
+ }
+ /* Restore state so sysfs ignores VMX */
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+}
+
+static void vmx_exit(void)
+{
+#ifdef CONFIG_KEXEC_CORE
+ RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
+ synchronize_rcu();
+#endif
+
+ kvm_exit();
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (static_branch_unlikely(&enable_evmcs)) {
+ int cpu;
+ struct hv_vp_assist_page *vp_ap;
+ /*
+ * Reset everything to support using non-enlightened VMCS
+ * access later (e.g. when we reload the module with
+ * enlightened_vmcs=0)
+ */
+ for_each_online_cpu(cpu) {
+ vp_ap = hv_get_vp_assist_page(cpu);
+
+ if (!vp_ap)
+ continue;
+
+ vp_ap->current_nested_vmcs = 0;
+ vp_ap->enlighten_vmentry = 0;
+ }
+
+ static_branch_disable(&enable_evmcs);
+ }
+#endif
+ vmx_cleanup_l1d_flush();
+}
+module_exit(vmx_exit);
+
+static int __init vmx_init(void)
+{
+ int r, cpu;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ /*
+ * Enlightened VMCS usage should be recommended and the host needs
+ * to support eVMCS v1 or above. We can also disable eVMCS support
+ * with module parameter.
+ */
+ if (enlightened_vmcs &&
+ ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
+ (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
+ KVM_EVMCS_VERSION) {
+ int cpu;
+
+ /* Check that we have assist pages on all online CPUs */
+ for_each_online_cpu(cpu) {
+ if (!hv_get_vp_assist_page(cpu)) {
+ enlightened_vmcs = false;
+ break;
+ }
+ }
+
+ if (enlightened_vmcs) {
+ pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
+ static_branch_enable(&enable_evmcs);
+ }
+ } else {
+ enlightened_vmcs = false;
+ }
+#endif
+
+ r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+ __alignof__(struct vcpu_vmx), THIS_MODULE);
+ if (r)
+ return r;
+
+ /*
+ * Must be called after kvm_init() so enable_ept is properly set
+ * up. Hand the parameter mitigation value in which was stored in
+ * the pre module init parser. If no parameter was given, it will
+ * contain 'auto' which will be turned into the default 'cond'
+ * mitigation mode.
+ */
+ if (boot_cpu_has(X86_BUG_L1TF)) {
+ r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
+ if (r) {
+ vmx_exit();
+ return r;
+ }
+ }
+
+ vmx_setup_fb_clear_ctrl();
+
+ for_each_possible_cpu(cpu) {
+ INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+
+ INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
+ spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ }
+
+#ifdef CONFIG_KEXEC_CORE
+ rcu_assign_pointer(crash_vmclear_loaded_vmcss,
+ crash_vmclear_local_loaded_vmcss);
+#endif
+ vmx_check_vmcs12_offsets();
+
+ return 0;
+}
+module_init(vmx_init);
diff --git a/arch/x86/kvm/vmx_evmcs.h b/arch/x86/kvm/vmx_evmcs.h
new file mode 100644
index 000000000..210a88409
--- /dev/null
+++ b/arch/x86/kvm/vmx_evmcs.h
@@ -0,0 +1,324 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_EVMCS_H
+#define __KVM_X86_VMX_EVMCS_H
+
+#include <asm/hyperv-tlfs.h>
+
+#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
+#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
+#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \
+ {EVMCS1_OFFSET(name), clean_field}
+
+struct evmcs_field {
+ u16 offset;
+ u16 clean_field;
+};
+
+static const struct evmcs_field vmcs_field_to_evmcs_1[] = {
+ /* 64 bit rw */
+ EVMCS1_FIELD(GUEST_RIP, guest_rip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(GUEST_RSP, guest_rsp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR0, host_cr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR3, host_cr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR4, host_cr4,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_RIP, host_rip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP),
+ EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP),
+ EVMCS1_FIELD(MSR_BITMAP, msr_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP),
+ EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(TSC_OFFSET, tsc_offset,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR3, guest_pdptr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR0, guest_cr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR3, guest_cr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR4, guest_cr4,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_DR7, guest_dr7,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(HOST_FS_BASE, host_fs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_GS_BASE, host_gs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_TR_BASE, host_tr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_RSP, host_rsp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(EPT_POINTER, ept_pointer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT),
+ EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+
+ /* 64 bit read only */
+ EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ /*
+ * Not defined in KVM:
+ *
+ * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006406, exit_io_instruction_esi,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ */
+ EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+
+ /*
+ * No mask defined in the spec as Hyper-V doesn't currently support
+ * these. Future proof by resetting the whole clean field mask on
+ * access.
+ */
+ EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+
+ /* 32 bit rw */
+ EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC),
+ EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN),
+ EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY),
+ EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE,
+ vm_entry_exception_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+
+ /* 32 bit read only */
+ EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+
+ /* No mask defined in the spec (not used) */
+ EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+
+ /* 16 bit rw */
+ EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_FS_SELECTOR, host_fs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT),
+};
+
+static __always_inline int get_evmcs_offset(unsigned long field,
+ u16 *clean_field)
+{
+ unsigned int index = ROL16(field, 6);
+ const struct evmcs_field *evmcs_field;
+
+ if (unlikely(index >= ARRAY_SIZE(vmcs_field_to_evmcs_1))) {
+ WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n",
+ field);
+ return -ENOENT;
+ }
+
+ evmcs_field = &vmcs_field_to_evmcs_1[index];
+
+ if (clean_field)
+ *clean_field = evmcs_field->clean_field;
+
+ return evmcs_field->offset;
+}
+
+#undef ROL16
+
+#endif /* __KVM_X86_VMX_EVMCS_H */
diff --git a/arch/x86/kvm/vmx_shadow_fields.h b/arch/x86/kvm/vmx_shadow_fields.h
new file mode 100644
index 000000000..cd0c75f6d
--- /dev/null
+++ b/arch/x86/kvm/vmx_shadow_fields.h
@@ -0,0 +1,77 @@
+#ifndef SHADOW_FIELD_RO
+#define SHADOW_FIELD_RO(x)
+#endif
+#ifndef SHADOW_FIELD_RW
+#define SHADOW_FIELD_RW(x)
+#endif
+
+/*
+ * We do NOT shadow fields that are modified when L0
+ * traps and emulates any vmx instruction (e.g. VMPTRLD,
+ * VMXON...) executed by L1.
+ * For example, VM_INSTRUCTION_ERROR is read
+ * by L1 if a vmx instruction fails (part of the error path).
+ * Note the code assumes this logic. If for some reason
+ * we start shadowing these fields then we need to
+ * force a shadow sync when L0 emulates vmx instructions
+ * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
+ * by nested_vmx_failValid)
+ *
+ * When adding or removing fields here, note that shadowed
+ * fields must always be synced by prepare_vmcs02, not just
+ * prepare_vmcs02_full.
+ */
+
+/*
+ * Keeping the fields ordered by size is an attempt at improving
+ * branch prediction in vmcs_read_any and vmcs_write_any.
+ */
+
+/* 16-bits */
+SHADOW_FIELD_RW(GUEST_CS_SELECTOR)
+SHADOW_FIELD_RW(GUEST_INTR_STATUS)
+SHADOW_FIELD_RW(GUEST_PML_INDEX)
+SHADOW_FIELD_RW(HOST_FS_SELECTOR)
+SHADOW_FIELD_RW(HOST_GS_SELECTOR)
+
+/* 32-bits */
+SHADOW_FIELD_RO(VM_EXIT_REASON)
+SHADOW_FIELD_RO(VM_EXIT_INTR_INFO)
+SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN)
+SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD)
+SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE)
+SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE)
+SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL)
+SHADOW_FIELD_RW(EXCEPTION_BITMAP)
+SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE)
+SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD)
+SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN)
+SHADOW_FIELD_RW(TPR_THRESHOLD)
+SHADOW_FIELD_RW(GUEST_CS_LIMIT)
+SHADOW_FIELD_RW(GUEST_CS_AR_BYTES)
+SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO)
+SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE)
+
+/* Natural width */
+SHADOW_FIELD_RO(EXIT_QUALIFICATION)
+SHADOW_FIELD_RO(GUEST_LINEAR_ADDRESS)
+SHADOW_FIELD_RW(GUEST_RIP)
+SHADOW_FIELD_RW(GUEST_RSP)
+SHADOW_FIELD_RW(GUEST_CR0)
+SHADOW_FIELD_RW(GUEST_CR3)
+SHADOW_FIELD_RW(GUEST_CR4)
+SHADOW_FIELD_RW(GUEST_RFLAGS)
+SHADOW_FIELD_RW(GUEST_CS_BASE)
+SHADOW_FIELD_RW(GUEST_ES_BASE)
+SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK)
+SHADOW_FIELD_RW(CR0_READ_SHADOW)
+SHADOW_FIELD_RW(CR4_READ_SHADOW)
+SHADOW_FIELD_RW(HOST_FS_BASE)
+SHADOW_FIELD_RW(HOST_GS_BASE)
+
+/* 64-bit */
+SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS)
+SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH)
+
+#undef SHADOW_FIELD_RO
+#undef SHADOW_FIELD_RW
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
new file mode 100644
index 000000000..be4697d91
--- /dev/null
+++ b/arch/x86/kvm/x86.c
@@ -0,0 +1,9835 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * derived from drivers/kvm/kvm_main.c
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Amit Shah <amit.shah@qumranet.com>
+ * Ben-Ami Yassour <benami@il.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/kvm_host.h>
+#include "irq.h"
+#include "mmu.h"
+#include "i8254.h"
+#include "tss.h"
+#include "kvm_cache_regs.h"
+#include "x86.h"
+#include "cpuid.h"
+#include "pmu.h"
+#include "hyperv.h"
+
+#include <linux/clocksource.h>
+#include <linux/interrupt.h>
+#include <linux/kvm.h>
+#include <linux/fs.h>
+#include <linux/vmalloc.h>
+#include <linux/export.h>
+#include <linux/moduleparam.h>
+#include <linux/mman.h>
+#include <linux/highmem.h>
+#include <linux/iommu.h>
+#include <linux/intel-iommu.h>
+#include <linux/cpufreq.h>
+#include <linux/user-return-notifier.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <linux/uaccess.h>
+#include <linux/hash.h>
+#include <linux/pci.h>
+#include <linux/timekeeper_internal.h>
+#include <linux/pvclock_gtod.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/irqbypass.h>
+#include <linux/sched/stat.h>
+#include <linux/mem_encrypt.h>
+
+#include <trace/events/kvm.h>
+
+#include <asm/debugreg.h>
+#include <asm/msr.h>
+#include <asm/desc.h>
+#include <asm/mce.h>
+#include <linux/kernel_stat.h>
+#include <asm/fpu/internal.h> /* Ugh! */
+#include <asm/pvclock.h>
+#include <asm/div64.h>
+#include <asm/irq_remapping.h>
+#include <asm/mshyperv.h>
+#include <asm/hypervisor.h>
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+#define MAX_IO_MSRS 256
+#define KVM_MAX_MCE_BANKS 32
+u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
+EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
+
+#define emul_to_vcpu(ctxt) \
+ container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
+
+/* EFER defaults:
+ * - enable syscall per default because its emulated by KVM
+ * - enable LME and LMA per default on 64 bit KVM
+ */
+#ifdef CONFIG_X86_64
+static
+u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
+#else
+static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
+#endif
+
+static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
+
+#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
+#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
+
+#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
+ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+
+static void update_cr8_intercept(struct kvm_vcpu *vcpu);
+static void process_nmi(struct kvm_vcpu *vcpu);
+static void process_smi(struct kvm_vcpu *vcpu);
+static void enter_smm(struct kvm_vcpu *vcpu);
+static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+static void store_regs(struct kvm_vcpu *vcpu);
+static int sync_regs(struct kvm_vcpu *vcpu);
+
+struct kvm_x86_ops *kvm_x86_ops __read_mostly;
+EXPORT_SYMBOL_GPL(kvm_x86_ops);
+
+static bool __read_mostly ignore_msrs = 0;
+module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
+
+static bool __read_mostly report_ignored_msrs = true;
+module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
+
+unsigned int min_timer_period_us = 200;
+module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
+
+static bool __read_mostly kvmclock_periodic_sync = true;
+module_param(kvmclock_periodic_sync, bool, S_IRUGO);
+
+bool __read_mostly kvm_has_tsc_control;
+EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
+u32 __read_mostly kvm_max_guest_tsc_khz;
+EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
+u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits;
+EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
+u64 __read_mostly kvm_max_tsc_scaling_ratio;
+EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
+u64 __read_mostly kvm_default_tsc_scaling_ratio;
+EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
+
+/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
+static u32 __read_mostly tsc_tolerance_ppm = 250;
+module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+
+/* lapic timer advance (tscdeadline mode only) in nanoseconds */
+unsigned int __read_mostly lapic_timer_advance_ns = 0;
+module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
+EXPORT_SYMBOL_GPL(lapic_timer_advance_ns);
+
+static bool __read_mostly vector_hashing = true;
+module_param(vector_hashing, bool, S_IRUGO);
+
+bool __read_mostly enable_vmware_backdoor = false;
+module_param(enable_vmware_backdoor, bool, S_IRUGO);
+EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
+
+static bool __read_mostly force_emulation_prefix = false;
+module_param(force_emulation_prefix, bool, S_IRUGO);
+
+#define KVM_NR_SHARED_MSRS 16
+
+struct kvm_shared_msrs_global {
+ int nr;
+ u32 msrs[KVM_NR_SHARED_MSRS];
+};
+
+struct kvm_shared_msrs {
+ struct user_return_notifier urn;
+ bool registered;
+ struct kvm_shared_msr_values {
+ u64 host;
+ u64 curr;
+ } values[KVM_NR_SHARED_MSRS];
+};
+
+static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
+static struct kvm_shared_msrs __percpu *shared_msrs;
+
+struct kvm_stats_debugfs_item debugfs_entries[] = {
+ { "pf_fixed", VCPU_STAT(pf_fixed) },
+ { "pf_guest", VCPU_STAT(pf_guest) },
+ { "tlb_flush", VCPU_STAT(tlb_flush) },
+ { "invlpg", VCPU_STAT(invlpg) },
+ { "exits", VCPU_STAT(exits) },
+ { "io_exits", VCPU_STAT(io_exits) },
+ { "mmio_exits", VCPU_STAT(mmio_exits) },
+ { "signal_exits", VCPU_STAT(signal_exits) },
+ { "irq_window", VCPU_STAT(irq_window_exits) },
+ { "nmi_window", VCPU_STAT(nmi_window_exits) },
+ { "halt_exits", VCPU_STAT(halt_exits) },
+ { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
+ { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
+ { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
+ { "halt_wakeup", VCPU_STAT(halt_wakeup) },
+ { "hypercalls", VCPU_STAT(hypercalls) },
+ { "request_irq", VCPU_STAT(request_irq_exits) },
+ { "irq_exits", VCPU_STAT(irq_exits) },
+ { "host_state_reload", VCPU_STAT(host_state_reload) },
+ { "fpu_reload", VCPU_STAT(fpu_reload) },
+ { "insn_emulation", VCPU_STAT(insn_emulation) },
+ { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
+ { "irq_injections", VCPU_STAT(irq_injections) },
+ { "nmi_injections", VCPU_STAT(nmi_injections) },
+ { "req_event", VCPU_STAT(req_event) },
+ { "l1d_flush", VCPU_STAT(l1d_flush) },
+ { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
+ { "mmu_pte_write", VM_STAT(mmu_pte_write) },
+ { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
+ { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
+ { "mmu_flooded", VM_STAT(mmu_flooded) },
+ { "mmu_recycled", VM_STAT(mmu_recycled) },
+ { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
+ { "mmu_unsync", VM_STAT(mmu_unsync) },
+ { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
+ { "largepages", VM_STAT(lpages, .mode = 0444) },
+ { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
+ { "max_mmu_page_hash_collisions",
+ VM_STAT(max_mmu_page_hash_collisions) },
+ { NULL }
+};
+
+u64 __read_mostly host_xcr0;
+
+static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
+
+static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
+{
+ int i;
+ for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
+ vcpu->arch.apf.gfns[i] = ~0;
+}
+
+static void kvm_on_user_return(struct user_return_notifier *urn)
+{
+ unsigned slot;
+ struct kvm_shared_msrs *locals
+ = container_of(urn, struct kvm_shared_msrs, urn);
+ struct kvm_shared_msr_values *values;
+ unsigned long flags;
+
+ /*
+ * Disabling irqs at this point since the following code could be
+ * interrupted and executed through kvm_arch_hardware_disable()
+ */
+ local_irq_save(flags);
+ if (locals->registered) {
+ locals->registered = false;
+ user_return_notifier_unregister(urn);
+ }
+ local_irq_restore(flags);
+ for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
+ values = &locals->values[slot];
+ if (values->host != values->curr) {
+ wrmsrl(shared_msrs_global.msrs[slot], values->host);
+ values->curr = values->host;
+ }
+ }
+}
+
+static void shared_msr_update(unsigned slot, u32 msr)
+{
+ u64 value;
+ unsigned int cpu = smp_processor_id();
+ struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+
+ /* only read, and nobody should modify it at this time,
+ * so don't need lock */
+ if (slot >= shared_msrs_global.nr) {
+ printk(KERN_ERR "kvm: invalid MSR slot!");
+ return;
+ }
+ rdmsrl_safe(msr, &value);
+ smsr->values[slot].host = value;
+ smsr->values[slot].curr = value;
+}
+
+void kvm_define_shared_msr(unsigned slot, u32 msr)
+{
+ BUG_ON(slot >= KVM_NR_SHARED_MSRS);
+ shared_msrs_global.msrs[slot] = msr;
+ if (slot >= shared_msrs_global.nr)
+ shared_msrs_global.nr = slot + 1;
+}
+EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
+
+static void kvm_shared_msr_cpu_online(void)
+{
+ unsigned i;
+
+ for (i = 0; i < shared_msrs_global.nr; ++i)
+ shared_msr_update(i, shared_msrs_global.msrs[i]);
+}
+
+int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
+{
+ unsigned int cpu = smp_processor_id();
+ struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+ int err;
+
+ value = (value & mask) | (smsr->values[slot].host & ~mask);
+ if (value == smsr->values[slot].curr)
+ return 0;
+ err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
+ if (err)
+ return 1;
+
+ smsr->values[slot].curr = value;
+ if (!smsr->registered) {
+ smsr->urn.on_user_return = kvm_on_user_return;
+ user_return_notifier_register(&smsr->urn);
+ smsr->registered = true;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
+
+static void drop_user_return_notifiers(void)
+{
+ unsigned int cpu = smp_processor_id();
+ struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+
+ if (smsr->registered)
+ kvm_on_user_return(&smsr->urn);
+}
+
+u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.apic_base;
+}
+EXPORT_SYMBOL_GPL(kvm_get_apic_base);
+
+enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
+{
+ return kvm_apic_mode(kvm_get_apic_base(vcpu));
+}
+EXPORT_SYMBOL_GPL(kvm_get_apic_mode);
+
+int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
+ enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
+ u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
+ (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
+
+ if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
+ return 1;
+ if (!msr_info->host_initiated) {
+ if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
+ return 1;
+ if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
+ return 1;
+ }
+
+ kvm_lapic_set_base(vcpu, msr_info->data);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_apic_base);
+
+asmlinkage __visible void kvm_spurious_fault(void)
+{
+ /* Fault while not rebooting. We want the trace. */
+ BUG();
+}
+EXPORT_SYMBOL_GPL(kvm_spurious_fault);
+
+#define EXCPT_BENIGN 0
+#define EXCPT_CONTRIBUTORY 1
+#define EXCPT_PF 2
+
+static int exception_class(int vector)
+{
+ switch (vector) {
+ case PF_VECTOR:
+ return EXCPT_PF;
+ case DE_VECTOR:
+ case TS_VECTOR:
+ case NP_VECTOR:
+ case SS_VECTOR:
+ case GP_VECTOR:
+ return EXCPT_CONTRIBUTORY;
+ default:
+ break;
+ }
+ return EXCPT_BENIGN;
+}
+
+#define EXCPT_FAULT 0
+#define EXCPT_TRAP 1
+#define EXCPT_ABORT 2
+#define EXCPT_INTERRUPT 3
+
+static int exception_type(int vector)
+{
+ unsigned int mask;
+
+ if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
+ return EXCPT_INTERRUPT;
+
+ mask = 1 << vector;
+
+ /* #DB is trap, as instruction watchpoints are handled elsewhere */
+ if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
+ return EXCPT_TRAP;
+
+ if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
+ return EXCPT_ABORT;
+
+ /* Reserved exceptions will result in fault */
+ return EXCPT_FAULT;
+}
+
+static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
+ unsigned nr, bool has_error, u32 error_code,
+ bool reinject)
+{
+ u32 prev_nr;
+ int class1, class2;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
+ queue:
+ if (reinject) {
+ /*
+ * On vmentry, vcpu->arch.exception.pending is only
+ * true if an event injection was blocked by
+ * nested_run_pending. In that case, however,
+ * vcpu_enter_guest requests an immediate exit,
+ * and the guest shouldn't proceed far enough to
+ * need reinjection.
+ */
+ WARN_ON_ONCE(vcpu->arch.exception.pending);
+ vcpu->arch.exception.injected = true;
+ } else {
+ vcpu->arch.exception.pending = true;
+ vcpu->arch.exception.injected = false;
+ }
+ vcpu->arch.exception.has_error_code = has_error;
+ vcpu->arch.exception.nr = nr;
+ vcpu->arch.exception.error_code = error_code;
+ return;
+ }
+
+ /* to check exception */
+ prev_nr = vcpu->arch.exception.nr;
+ if (prev_nr == DF_VECTOR) {
+ /* triple fault -> shutdown */
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return;
+ }
+ class1 = exception_class(prev_nr);
+ class2 = exception_class(nr);
+ if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
+ || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
+ /*
+ * Generate double fault per SDM Table 5-5. Set
+ * exception.pending = true so that the double fault
+ * can trigger a nested vmexit.
+ */
+ vcpu->arch.exception.pending = true;
+ vcpu->arch.exception.injected = false;
+ vcpu->arch.exception.has_error_code = true;
+ vcpu->arch.exception.nr = DF_VECTOR;
+ vcpu->arch.exception.error_code = 0;
+ } else
+ /* replace previous exception with a new one in a hope
+ that instruction re-execution will regenerate lost
+ exception */
+ goto queue;
+}
+
+void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
+{
+ kvm_multiple_exception(vcpu, nr, false, 0, false);
+}
+EXPORT_SYMBOL_GPL(kvm_queue_exception);
+
+void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
+{
+ kvm_multiple_exception(vcpu, nr, false, 0, true);
+}
+EXPORT_SYMBOL_GPL(kvm_requeue_exception);
+
+int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
+{
+ if (err)
+ kvm_inject_gp(vcpu, 0);
+ else
+ return kvm_skip_emulated_instruction(vcpu);
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
+
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+{
+ ++vcpu->stat.pf_guest;
+ vcpu->arch.exception.nested_apf =
+ is_guest_mode(vcpu) && fault->async_page_fault;
+ if (vcpu->arch.exception.nested_apf)
+ vcpu->arch.apf.nested_apf_token = fault->address;
+ else
+ vcpu->arch.cr2 = fault->address;
+ kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
+}
+EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
+
+static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+{
+ if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
+ vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
+ else
+ vcpu->arch.mmu.inject_page_fault(vcpu, fault);
+
+ return fault->nested_page_fault;
+}
+
+void kvm_inject_nmi(struct kvm_vcpu *vcpu)
+{
+ atomic_inc(&vcpu->arch.nmi_queued);
+ kvm_make_request(KVM_REQ_NMI, vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_inject_nmi);
+
+void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
+{
+ kvm_multiple_exception(vcpu, nr, true, error_code, false);
+}
+EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
+
+void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
+{
+ kvm_multiple_exception(vcpu, nr, true, error_code, true);
+}
+EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
+
+/*
+ * Checks if cpl <= required_cpl; if true, return true. Otherwise queue
+ * a #GP and return false.
+ */
+bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
+{
+ if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
+ return true;
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return false;
+}
+EXPORT_SYMBOL_GPL(kvm_require_cpl);
+
+bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
+{
+ if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return true;
+
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return false;
+}
+EXPORT_SYMBOL_GPL(kvm_require_dr);
+
+/*
+ * This function will be used to read from the physical memory of the currently
+ * running guest. The difference to kvm_vcpu_read_guest_page is that this function
+ * can read from guest physical or from the guest's guest physical memory.
+ */
+int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gfn_t ngfn, void *data, int offset, int len,
+ u32 access)
+{
+ struct x86_exception exception;
+ gfn_t real_gfn;
+ gpa_t ngpa;
+
+ ngpa = gfn_to_gpa(ngfn);
+ real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
+ if (real_gfn == UNMAPPED_GVA)
+ return -EFAULT;
+
+ real_gfn = gpa_to_gfn(real_gfn);
+
+ return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
+
+static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
+ void *data, int offset, int len, u32 access)
+{
+ return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
+ data, offset, len, access);
+}
+
+static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
+{
+ return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) |
+ rsvd_bits(1, 2);
+}
+
+/*
+ * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise.
+ */
+int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
+{
+ gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
+ unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
+ int i;
+ int ret;
+ u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
+
+ ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
+ offset * sizeof(u64), sizeof(pdpte),
+ PFERR_USER_MASK|PFERR_WRITE_MASK);
+ if (ret < 0) {
+ ret = 0;
+ goto out;
+ }
+ for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
+ if ((pdpte[i] & PT_PRESENT_MASK) &&
+ (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
+ ret = 0;
+ goto out;
+ }
+ }
+ ret = 1;
+
+ memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail);
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_dirty);
+out:
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(load_pdptrs);
+
+bool pdptrs_changed(struct kvm_vcpu *vcpu)
+{
+ u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
+ bool changed = true;
+ int offset;
+ gfn_t gfn;
+ int r;
+
+ if (!is_pae_paging(vcpu))
+ return false;
+
+ if (!test_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail))
+ return true;
+
+ gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
+ offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
+ r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
+ PFERR_USER_MASK | PFERR_WRITE_MASK);
+ if (r < 0)
+ goto out;
+ changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
+out:
+
+ return changed;
+}
+EXPORT_SYMBOL_GPL(pdptrs_changed);
+
+int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ unsigned long old_cr0 = kvm_read_cr0(vcpu);
+ unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
+
+ cr0 |= X86_CR0_ET;
+
+#ifdef CONFIG_X86_64
+ if (cr0 & 0xffffffff00000000UL)
+ return 1;
+#endif
+
+ cr0 &= ~CR0_RESERVED_BITS;
+
+ if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
+ return 1;
+
+ if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
+ return 1;
+
+ if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
+#ifdef CONFIG_X86_64
+ if ((vcpu->arch.efer & EFER_LME)) {
+ int cs_db, cs_l;
+
+ if (!is_pae(vcpu))
+ return 1;
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ if (cs_l)
+ return 1;
+ } else
+#endif
+ if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
+ kvm_read_cr3(vcpu)))
+ return 1;
+ }
+
+ if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
+ return 1;
+
+ kvm_x86_ops->set_cr0(vcpu, cr0);
+
+ if ((cr0 ^ old_cr0) & X86_CR0_PG) {
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_async_pf_hash_reset(vcpu);
+ }
+
+ if ((cr0 ^ old_cr0) & update_bits)
+ kvm_mmu_reset_context(vcpu);
+
+ if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
+ kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
+ !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
+ kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_cr0);
+
+void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
+{
+ (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
+}
+EXPORT_SYMBOL_GPL(kvm_lmsw);
+
+void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
+ !vcpu->guest_xcr0_loaded) {
+ /* kvm_set_xcr() also depends on this */
+ if (vcpu->arch.xcr0 != host_xcr0)
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
+ vcpu->guest_xcr0_loaded = 1;
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0);
+
+void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->guest_xcr0_loaded) {
+ if (vcpu->arch.xcr0 != host_xcr0)
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
+ vcpu->guest_xcr0_loaded = 0;
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0);
+
+static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+{
+ u64 xcr0 = xcr;
+ u64 old_xcr0 = vcpu->arch.xcr0;
+ u64 valid_bits;
+
+ /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
+ if (index != XCR_XFEATURE_ENABLED_MASK)
+ return 1;
+ if (!(xcr0 & XFEATURE_MASK_FP))
+ return 1;
+ if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
+ return 1;
+
+ /*
+ * Do not allow the guest to set bits that we do not support
+ * saving. However, xcr0 bit 0 is always set, even if the
+ * emulated CPU does not support XSAVE (see fx_init).
+ */
+ valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
+ if (xcr0 & ~valid_bits)
+ return 1;
+
+ if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
+ (!(xcr0 & XFEATURE_MASK_BNDCSR)))
+ return 1;
+
+ if (xcr0 & XFEATURE_MASK_AVX512) {
+ if (!(xcr0 & XFEATURE_MASK_YMM))
+ return 1;
+ if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
+ return 1;
+ }
+ vcpu->arch.xcr0 = xcr0;
+
+ if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
+ kvm_update_cpuid(vcpu);
+ return 0;
+}
+
+int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+{
+ if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
+ __kvm_set_xcr(vcpu, index, xcr)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_xcr);
+
+static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c)
+{
+ u64 reserved_bits = CR4_RESERVED_BITS;
+
+ if (!cpu_has(c, X86_FEATURE_XSAVE))
+ reserved_bits |= X86_CR4_OSXSAVE;
+
+ if (!cpu_has(c, X86_FEATURE_SMEP))
+ reserved_bits |= X86_CR4_SMEP;
+
+ if (!cpu_has(c, X86_FEATURE_SMAP))
+ reserved_bits |= X86_CR4_SMAP;
+
+ if (!cpu_has(c, X86_FEATURE_FSGSBASE))
+ reserved_bits |= X86_CR4_FSGSBASE;
+
+ if (!cpu_has(c, X86_FEATURE_PKU))
+ reserved_bits |= X86_CR4_PKE;
+
+ if (!cpu_has(c, X86_FEATURE_LA57) &&
+ !(cpuid_ecx(0x7) & bit(X86_FEATURE_LA57)))
+ reserved_bits |= X86_CR4_LA57;
+
+ if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated())
+ reserved_bits |= X86_CR4_UMIP;
+
+ return reserved_bits;
+}
+
+static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ if (cr4 & cr4_reserved_bits)
+ return -EINVAL;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
+ return -EINVAL;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
+ return -EINVAL;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
+ return -EINVAL;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
+ return -EINVAL;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
+ return -EINVAL;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
+ return -EINVAL;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
+ return -EINVAL;
+
+ return 0;
+}
+
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long old_cr4 = kvm_read_cr4(vcpu);
+ unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
+ X86_CR4_SMEP;
+ unsigned long mmu_role_bits = pdptr_bits | X86_CR4_SMAP | X86_CR4_PKE;
+
+ if (kvm_valid_cr4(vcpu, cr4))
+ return 1;
+
+ if (is_long_mode(vcpu)) {
+ if (!(cr4 & X86_CR4_PAE))
+ return 1;
+ if ((cr4 ^ old_cr4) & X86_CR4_LA57)
+ return 1;
+ } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
+ && ((cr4 ^ old_cr4) & pdptr_bits)
+ && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
+ kvm_read_cr3(vcpu)))
+ return 1;
+
+ if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
+ return 1;
+
+ /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
+ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
+ return 1;
+ }
+
+ if (kvm_x86_ops->set_cr4(vcpu, cr4))
+ return 1;
+
+ if (((cr4 ^ old_cr4) & mmu_role_bits) ||
+ (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
+ kvm_mmu_reset_context(vcpu);
+
+ if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
+ kvm_update_cpuid(vcpu);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_cr4);
+
+int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ bool skip_tlb_flush = false;
+#ifdef CONFIG_X86_64
+ bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
+
+ if (pcid_enabled) {
+ skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
+ cr3 &= ~X86_CR3_PCID_NOFLUSH;
+ }
+#endif
+
+ if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
+ if (!skip_tlb_flush) {
+ kvm_mmu_sync_roots(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ }
+ return 0;
+ }
+
+ if (is_long_mode(vcpu) &&
+ (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
+ return 1;
+ else if (is_pae_paging(vcpu) &&
+ !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+ return 1;
+
+ kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
+ vcpu->arch.cr3 = cr3;
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_cr3);
+
+int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+ if (cr8 & CR8_RESERVED_BITS)
+ return 1;
+ if (lapic_in_kernel(vcpu))
+ kvm_lapic_set_tpr(vcpu, cr8);
+ else
+ vcpu->arch.cr8 = cr8;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_cr8);
+
+unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
+{
+ if (lapic_in_kernel(vcpu))
+ return kvm_lapic_get_cr8(vcpu);
+ else
+ return vcpu->arch.cr8;
+}
+EXPORT_SYMBOL_GPL(kvm_get_cr8);
+
+static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
+{
+ int i;
+
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
+ for (i = 0; i < KVM_NR_DB_REGS; i++)
+ vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
+ }
+}
+
+static void kvm_update_dr6(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+ kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
+}
+
+static void kvm_update_dr7(struct kvm_vcpu *vcpu)
+{
+ unsigned long dr7;
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ dr7 = vcpu->arch.guest_debug_dr7;
+ else
+ dr7 = vcpu->arch.dr7;
+ kvm_x86_ops->set_dr7(vcpu, dr7);
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
+ if (dr7 & DR7_BP_EN_MASK)
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
+}
+
+static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
+{
+ u64 fixed = DR6_FIXED_1;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
+ fixed |= DR6_RTM;
+ return fixed;
+}
+
+static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+{
+ size_t size = ARRAY_SIZE(vcpu->arch.db);
+
+ switch (dr) {
+ case 0 ... 3:
+ vcpu->arch.db[array_index_nospec(dr, size)] = val;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+ vcpu->arch.eff_db[dr] = val;
+ break;
+ case 4:
+ /* fall through */
+ case 6:
+ if (val & 0xffffffff00000000ULL)
+ return -1; /* #GP */
+ vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
+ kvm_update_dr6(vcpu);
+ break;
+ case 5:
+ /* fall through */
+ default: /* 7 */
+ if (val & 0xffffffff00000000ULL)
+ return -1; /* #GP */
+ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
+ kvm_update_dr7(vcpu);
+ break;
+ }
+
+ return 0;
+}
+
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+{
+ if (__kvm_set_dr(vcpu, dr, val)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_dr);
+
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+{
+ size_t size = ARRAY_SIZE(vcpu->arch.db);
+
+ switch (dr) {
+ case 0 ... 3:
+ *val = vcpu->arch.db[array_index_nospec(dr, size)];
+ break;
+ case 4:
+ /* fall through */
+ case 6:
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ *val = vcpu->arch.dr6;
+ else
+ *val = kvm_x86_ops->get_dr6(vcpu);
+ break;
+ case 5:
+ /* fall through */
+ default: /* 7 */
+ *val = vcpu->arch.dr7;
+ break;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_get_dr);
+
+bool kvm_rdpmc(struct kvm_vcpu *vcpu)
+{
+ u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ u64 data;
+ int err;
+
+ err = kvm_pmu_rdpmc(vcpu, ecx, &data);
+ if (err)
+ return err;
+ kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
+ return err;
+}
+EXPORT_SYMBOL_GPL(kvm_rdpmc);
+
+/*
+ * List of msr numbers which we expose to userspace through KVM_GET_MSRS
+ * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
+ *
+ * This list is modified at module load time to reflect the
+ * capabilities of the host cpu. This capabilities test skips MSRs that are
+ * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
+ * may depend on host virtualization features rather than host cpu features.
+ */
+
+static u32 msrs_to_save[] = {
+ MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+ MSR_STAR,
+#ifdef CONFIG_X86_64
+ MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
+#endif
+ MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+ MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
+ MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
+};
+
+static unsigned num_msrs_to_save;
+
+static u32 emulated_msrs[] = {
+ MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
+ HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
+ HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
+ HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
+ HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
+ HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
+ HV_X64_MSR_RESET,
+ HV_X64_MSR_VP_INDEX,
+ HV_X64_MSR_VP_RUNTIME,
+ HV_X64_MSR_SCONTROL,
+ HV_X64_MSR_STIMER0_CONFIG,
+ HV_X64_MSR_VP_ASSIST_PAGE,
+ HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
+ HV_X64_MSR_TSC_EMULATION_STATUS,
+
+ MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
+ MSR_KVM_PV_EOI_EN,
+
+ MSR_IA32_TSC_ADJUST,
+ MSR_IA32_TSCDEADLINE,
+ MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MCG_STATUS,
+ MSR_IA32_MCG_CTL,
+ MSR_IA32_MCG_EXT_CTL,
+ MSR_IA32_SMBASE,
+ MSR_SMI_COUNT,
+ MSR_PLATFORM_INFO,
+ MSR_MISC_FEATURES_ENABLES,
+ MSR_AMD64_VIRT_SPEC_CTRL,
+};
+
+static unsigned num_emulated_msrs;
+
+/*
+ * List of msr numbers which are used to expose MSR-based features that
+ * can be used by a hypervisor to validate requested CPU features.
+ */
+static u32 msr_based_features[] = {
+ MSR_IA32_VMX_BASIC,
+ MSR_IA32_VMX_TRUE_PINBASED_CTLS,
+ MSR_IA32_VMX_PINBASED_CTLS,
+ MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
+ MSR_IA32_VMX_PROCBASED_CTLS,
+ MSR_IA32_VMX_TRUE_EXIT_CTLS,
+ MSR_IA32_VMX_EXIT_CTLS,
+ MSR_IA32_VMX_TRUE_ENTRY_CTLS,
+ MSR_IA32_VMX_ENTRY_CTLS,
+ MSR_IA32_VMX_MISC,
+ MSR_IA32_VMX_CR0_FIXED0,
+ MSR_IA32_VMX_CR0_FIXED1,
+ MSR_IA32_VMX_CR4_FIXED0,
+ MSR_IA32_VMX_CR4_FIXED1,
+ MSR_IA32_VMX_VMCS_ENUM,
+ MSR_IA32_VMX_PROCBASED_CTLS2,
+ MSR_IA32_VMX_EPT_VPID_CAP,
+ MSR_IA32_VMX_VMFUNC,
+
+ MSR_F10H_DECFG,
+ MSR_IA32_UCODE_REV,
+ MSR_IA32_ARCH_CAPABILITIES,
+};
+
+static unsigned int num_msr_based_features;
+
+u64 kvm_get_arch_capabilities(void)
+{
+ u64 data;
+
+ rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
+
+ /*
+ * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
+ * the nested hypervisor runs with NX huge pages. If it is not,
+ * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other
+ * L1 guests, so it need not worry about its own (L2) guests.
+ */
+ data |= ARCH_CAP_PSCHANGE_MC_NO;
+
+ /*
+ * If we're doing cache flushes (either "always" or "cond")
+ * we will do one whenever the guest does a vmlaunch/vmresume.
+ * If an outer hypervisor is doing the cache flush for us
+ * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
+ * capability to the guest too, and if EPT is disabled we're not
+ * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will
+ * require a nested hypervisor to do a flush of its own.
+ */
+ if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
+ data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
+
+ if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+ data |= ARCH_CAP_RDCL_NO;
+ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ data |= ARCH_CAP_SSB_NO;
+ if (!boot_cpu_has_bug(X86_BUG_MDS))
+ data |= ARCH_CAP_MDS_NO;
+
+ /*
+ * On TAA affected systems, export MDS_NO=0 when:
+ * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
+ * - Updated microcode is present. This is detected by
+ * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
+ * that VERW clears CPU buffers.
+ *
+ * When MDS_NO=0 is exported, guests deploy clear CPU buffer
+ * mitigation and don't complain:
+ *
+ * "Vulnerable: Clear CPU buffers attempted, no microcode"
+ *
+ * If TSX is disabled on the system, guests are also mitigated against
+ * TAA and clear CPU buffer mitigation is not required for guests.
+ */
+ if (!boot_cpu_has(X86_FEATURE_RTM))
+ data &= ~ARCH_CAP_TAA_NO;
+ else if (!boot_cpu_has_bug(X86_BUG_TAA))
+ data |= ARCH_CAP_TAA_NO;
+ else if (data & ARCH_CAP_TSX_CTRL_MSR)
+ data &= ~ARCH_CAP_MDS_NO;
+
+ /* KVM does not emulate MSR_IA32_TSX_CTRL. */
+ data &= ~ARCH_CAP_TSX_CTRL_MSR;
+
+ /* Guests don't need to know "Fill buffer clear control" exists */
+ data &= ~ARCH_CAP_FB_CLEAR_CTRL;
+
+ return data;
+}
+
+EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
+
+static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
+{
+ switch (msr->index) {
+ case MSR_IA32_ARCH_CAPABILITIES:
+ msr->data = kvm_get_arch_capabilities();
+ break;
+ case MSR_IA32_UCODE_REV:
+ rdmsrl_safe(msr->index, &msr->data);
+ break;
+ default:
+ if (kvm_x86_ops->get_msr_feature(msr))
+ return 1;
+ }
+ return 0;
+}
+
+static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+ struct kvm_msr_entry msr;
+ int r;
+
+ msr.index = index;
+ r = kvm_get_msr_feature(&msr);
+ if (r)
+ return r;
+
+ *data = msr.data;
+
+ return 0;
+}
+
+static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
+ return false;
+
+ if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+ return false;
+
+ return true;
+
+}
+bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ if (efer & efer_reserved_bits)
+ return false;
+
+ return __kvm_valid_efer(vcpu, efer);
+}
+EXPORT_SYMBOL_GPL(kvm_valid_efer);
+
+static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ u64 old_efer = vcpu->arch.efer;
+ u64 efer = msr_info->data;
+
+ if (efer & efer_reserved_bits)
+ return 1;
+
+ if (!msr_info->host_initiated) {
+ if (!__kvm_valid_efer(vcpu, efer))
+ return 1;
+
+ if (is_paging(vcpu) &&
+ (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
+ return 1;
+ }
+
+ efer &= ~EFER_LMA;
+ efer |= vcpu->arch.efer & EFER_LMA;
+
+ kvm_x86_ops->set_efer(vcpu, efer);
+
+ /* Update reserved bits */
+ if ((efer ^ old_efer) & EFER_NX)
+ kvm_mmu_reset_context(vcpu);
+
+ return 0;
+}
+
+void kvm_enable_efer_bits(u64 mask)
+{
+ efer_reserved_bits &= ~mask;
+}
+EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
+
+/*
+ * Writes msr value into into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+{
+ switch (msr->index) {
+ case MSR_FS_BASE:
+ case MSR_GS_BASE:
+ case MSR_KERNEL_GS_BASE:
+ case MSR_CSTAR:
+ case MSR_LSTAR:
+ if (is_noncanonical_address(msr->data, vcpu))
+ return 1;
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ case MSR_IA32_SYSENTER_ESP:
+ /*
+ * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
+ * non-canonical address is written on Intel but not on
+ * AMD (which ignores the top 32-bits, because it does
+ * not implement 64-bit SYSENTER).
+ *
+ * 64-bit code should hence be able to write a non-canonical
+ * value on AMD. Making the address canonical ensures that
+ * vmentry does not fail on Intel after writing a non-canonical
+ * value, and that something deterministic happens if the guest
+ * invokes 64-bit SYSENTER.
+ */
+ msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu));
+ }
+ return kvm_x86_ops->set_msr(vcpu, msr);
+}
+EXPORT_SYMBOL_GPL(kvm_set_msr);
+
+/*
+ * Adapt set_msr() to msr_io()'s calling convention
+ */
+static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+ struct msr_data msr;
+ int r;
+
+ msr.index = index;
+ msr.host_initiated = true;
+ r = kvm_get_msr(vcpu, &msr);
+ if (r)
+ return r;
+
+ *data = msr.data;
+ return 0;
+}
+
+static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+ struct msr_data msr;
+
+ msr.data = *data;
+ msr.index = index;
+ msr.host_initiated = true;
+ return kvm_set_msr(vcpu, &msr);
+}
+
+#ifdef CONFIG_X86_64
+struct pvclock_gtod_data {
+ seqcount_t seq;
+
+ struct { /* extract of a clocksource struct */
+ int vclock_mode;
+ u64 cycle_last;
+ u64 mask;
+ u32 mult;
+ u32 shift;
+ } clock;
+
+ u64 boot_ns;
+ u64 nsec_base;
+ u64 wall_time_sec;
+};
+
+static struct pvclock_gtod_data pvclock_gtod_data;
+
+static void update_pvclock_gtod(struct timekeeper *tk)
+{
+ struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
+ u64 boot_ns;
+
+ boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
+
+ write_seqcount_begin(&vdata->seq);
+
+ /* copy pvclock gtod data */
+ vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
+ vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
+ vdata->clock.mask = tk->tkr_mono.mask;
+ vdata->clock.mult = tk->tkr_mono.mult;
+ vdata->clock.shift = tk->tkr_mono.shift;
+
+ vdata->boot_ns = boot_ns;
+ vdata->nsec_base = tk->tkr_mono.xtime_nsec;
+
+ vdata->wall_time_sec = tk->xtime_sec;
+
+ write_seqcount_end(&vdata->seq);
+}
+#endif
+
+void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
+ * vcpu_enter_guest. This function is only called from
+ * the physical CPU that is running vcpu.
+ */
+ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+}
+
+static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
+{
+ int version;
+ int r;
+ struct pvclock_wall_clock wc;
+ struct timespec64 boot;
+
+ if (!wall_clock)
+ return;
+
+ r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
+ if (r)
+ return;
+
+ if (version & 1)
+ ++version; /* first time write, random junk */
+
+ ++version;
+
+ if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
+ return;
+
+ /*
+ * The guest calculates current wall clock time by adding
+ * system time (updated by kvm_guest_time_update below) to the
+ * wall clock specified here. guest system time equals host
+ * system time for us, thus we must fill in host boot time here.
+ */
+ getboottime64(&boot);
+
+ if (kvm->arch.kvmclock_offset) {
+ struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
+ boot = timespec64_sub(boot, ts);
+ }
+ wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
+ wc.nsec = boot.tv_nsec;
+ wc.version = version;
+
+ kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
+
+ version++;
+ kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+}
+
+static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
+{
+ do_shl32_div32(dividend, divisor);
+ return dividend;
+}
+
+static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
+ s8 *pshift, u32 *pmultiplier)
+{
+ uint64_t scaled64;
+ int32_t shift = 0;
+ uint64_t tps64;
+ uint32_t tps32;
+
+ tps64 = base_hz;
+ scaled64 = scaled_hz;
+ while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
+ tps64 >>= 1;
+ shift--;
+ }
+
+ tps32 = (uint32_t)tps64;
+ while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
+ if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
+ scaled64 >>= 1;
+ else
+ tps32 <<= 1;
+ shift++;
+ }
+
+ *pshift = shift;
+ *pmultiplier = div_frac(scaled64, tps32);
+
+ pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
+ __func__, base_hz, scaled_hz, shift, *pmultiplier);
+}
+
+#ifdef CONFIG_X86_64
+static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
+#endif
+
+static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
+static unsigned long max_tsc_khz;
+
+static u32 adjust_tsc_khz(u32 khz, s32 ppm)
+{
+ u64 v = (u64)khz * (1000000 + ppm);
+ do_div(v, 1000000);
+ return v;
+}
+
+static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
+{
+ u64 ratio;
+
+ /* Guest TSC same frequency as host TSC? */
+ if (!scale) {
+ vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+ return 0;
+ }
+
+ /* TSC scaling supported? */
+ if (!kvm_has_tsc_control) {
+ if (user_tsc_khz > tsc_khz) {
+ vcpu->arch.tsc_catchup = 1;
+ vcpu->arch.tsc_always_catchup = 1;
+ return 0;
+ } else {
+ pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
+ return -1;
+ }
+ }
+
+ /* TSC scaling required - calculate ratio */
+ ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
+ user_tsc_khz, tsc_khz);
+
+ if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
+ pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
+ user_tsc_khz);
+ return -1;
+ }
+
+ vcpu->arch.tsc_scaling_ratio = ratio;
+ return 0;
+}
+
+static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+{
+ u32 thresh_lo, thresh_hi;
+ int use_scaling = 0;
+
+ /* tsc_khz can be zero if TSC calibration fails */
+ if (user_tsc_khz == 0) {
+ /* set tsc_scaling_ratio to a safe value */
+ vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+ return -1;
+ }
+
+ /* Compute a scale to convert nanoseconds in TSC cycles */
+ kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
+ &vcpu->arch.virtual_tsc_shift,
+ &vcpu->arch.virtual_tsc_mult);
+ vcpu->arch.virtual_tsc_khz = user_tsc_khz;
+
+ /*
+ * Compute the variation in TSC rate which is acceptable
+ * within the range of tolerance and decide if the
+ * rate being applied is within that bounds of the hardware
+ * rate. If so, no scaling or compensation need be done.
+ */
+ thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
+ thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
+ if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
+ pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
+ use_scaling = 1;
+ }
+ return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
+}
+
+static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
+{
+ u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
+ vcpu->arch.virtual_tsc_mult,
+ vcpu->arch.virtual_tsc_shift);
+ tsc += vcpu->arch.this_tsc_write;
+ return tsc;
+}
+
+static inline int gtod_is_based_on_tsc(int mode)
+{
+ return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK;
+}
+
+static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ bool vcpus_matched;
+ struct kvm_arch *ka = &vcpu->kvm->arch;
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+ vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
+ atomic_read(&vcpu->kvm->online_vcpus));
+
+ /*
+ * Once the masterclock is enabled, always perform request in
+ * order to update it.
+ *
+ * In order to enable masterclock, the host clocksource must be TSC
+ * and the vcpus need to have matched TSCs. When that happens,
+ * perform request to enable masterclock.
+ */
+ if (ka->use_master_clock ||
+ (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+
+ trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
+ atomic_read(&vcpu->kvm->online_vcpus),
+ ka->use_master_clock, gtod->clock.vclock_mode);
+#endif
+}
+
+static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
+{
+ u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
+ vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
+}
+
+/*
+ * Multiply tsc by a fixed point number represented by ratio.
+ *
+ * The most significant 64-N bits (mult) of ratio represent the
+ * integral part of the fixed point number; the remaining N bits
+ * (frac) represent the fractional part, ie. ratio represents a fixed
+ * point number (mult + frac * 2^(-N)).
+ *
+ * N equals to kvm_tsc_scaling_ratio_frac_bits.
+ */
+static inline u64 __scale_tsc(u64 ratio, u64 tsc)
+{
+ return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
+}
+
+u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
+{
+ u64 _tsc = tsc;
+ u64 ratio = vcpu->arch.tsc_scaling_ratio;
+
+ if (ratio != kvm_default_tsc_scaling_ratio)
+ _tsc = __scale_tsc(ratio, tsc);
+
+ return _tsc;
+}
+EXPORT_SYMBOL_GPL(kvm_scale_tsc);
+
+static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
+{
+ u64 tsc;
+
+ tsc = kvm_scale_tsc(vcpu, rdtsc());
+
+ return target_tsc - tsc;
+}
+
+u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
+{
+ u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
+
+ return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
+}
+EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
+
+static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+ vcpu->arch.tsc_offset = kvm_x86_ops->write_l1_tsc_offset(vcpu, offset);
+}
+
+static inline bool kvm_check_tsc_unstable(void)
+{
+#ifdef CONFIG_X86_64
+ /*
+ * TSC is marked unstable when we're running on Hyper-V,
+ * 'TSC page' clocksource is good.
+ */
+ if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK)
+ return false;
+#endif
+ return check_tsc_unstable();
+}
+
+void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u64 offset, ns, elapsed;
+ unsigned long flags;
+ bool matched;
+ bool already_matched;
+ u64 data = msr->data;
+ bool synchronizing = false;
+
+ raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
+ offset = kvm_compute_tsc_offset(vcpu, data);
+ ns = ktime_get_boot_ns();
+ elapsed = ns - kvm->arch.last_tsc_nsec;
+
+ if (vcpu->arch.virtual_tsc_khz) {
+ if (data == 0 && msr->host_initiated) {
+ /*
+ * detection of vcpu initialization -- need to sync
+ * with other vCPUs. This particularly helps to keep
+ * kvm_clock stable after CPU hotplug
+ */
+ synchronizing = true;
+ } else {
+ u64 tsc_exp = kvm->arch.last_tsc_write +
+ nsec_to_cycles(vcpu, elapsed);
+ u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
+ /*
+ * Special case: TSC write with a small delta (1 second)
+ * of virtual cycle time against real time is
+ * interpreted as an attempt to synchronize the CPU.
+ */
+ synchronizing = data < tsc_exp + tsc_hz &&
+ data + tsc_hz > tsc_exp;
+ }
+ }
+
+ /*
+ * For a reliable TSC, we can match TSC offsets, and for an unstable
+ * TSC, we add elapsed time in this computation. We could let the
+ * compensation code attempt to catch up if we fall behind, but
+ * it's better to try to match offsets from the beginning.
+ */
+ if (synchronizing &&
+ vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
+ if (!kvm_check_tsc_unstable()) {
+ offset = kvm->arch.cur_tsc_offset;
+ pr_debug("kvm: matched tsc offset for %llu\n", data);
+ } else {
+ u64 delta = nsec_to_cycles(vcpu, elapsed);
+ data += delta;
+ offset = kvm_compute_tsc_offset(vcpu, data);
+ pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
+ }
+ matched = true;
+ already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
+ } else {
+ /*
+ * We split periods of matched TSC writes into generations.
+ * For each generation, we track the original measured
+ * nanosecond time, offset, and write, so if TSCs are in
+ * sync, we can match exact offset, and if not, we can match
+ * exact software computation in compute_guest_tsc()
+ *
+ * These values are tracked in kvm->arch.cur_xxx variables.
+ */
+ kvm->arch.cur_tsc_generation++;
+ kvm->arch.cur_tsc_nsec = ns;
+ kvm->arch.cur_tsc_write = data;
+ kvm->arch.cur_tsc_offset = offset;
+ matched = false;
+ pr_debug("kvm: new tsc generation %llu, clock %llu\n",
+ kvm->arch.cur_tsc_generation, data);
+ }
+
+ /*
+ * We also track th most recent recorded KHZ, write and time to
+ * allow the matching interval to be extended at each write.
+ */
+ kvm->arch.last_tsc_nsec = ns;
+ kvm->arch.last_tsc_write = data;
+ kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
+
+ vcpu->arch.last_guest_tsc = data;
+
+ /* Keep track of which generation this VCPU has synchronized to */
+ vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
+ vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
+ vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
+
+ if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
+ update_ia32_tsc_adjust_msr(vcpu, offset);
+
+ kvm_vcpu_write_tsc_offset(vcpu, offset);
+ raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+
+ spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
+ if (!matched) {
+ kvm->arch.nr_vcpus_matched_tsc = 0;
+ } else if (!already_matched) {
+ kvm->arch.nr_vcpus_matched_tsc++;
+ }
+
+ kvm_track_tsc_matching(vcpu);
+ spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
+}
+
+EXPORT_SYMBOL_GPL(kvm_write_tsc);
+
+static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
+ s64 adjustment)
+{
+ u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
+ kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
+}
+
+static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
+{
+ if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
+ WARN_ON(adjustment < 0);
+ adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
+ adjust_tsc_offset_guest(vcpu, adjustment);
+}
+
+#ifdef CONFIG_X86_64
+
+static u64 read_tsc(void)
+{
+ u64 ret = (u64)rdtsc_ordered();
+ u64 last = pvclock_gtod_data.clock.cycle_last;
+
+ if (likely(ret >= last))
+ return ret;
+
+ /*
+ * GCC likes to generate cmov here, but this branch is extremely
+ * predictable (it's just a function of time and the likely is
+ * very likely) and there's a data dependence, so force GCC
+ * to generate a branch instead. I don't barrier() because
+ * we don't actually need a barrier, and if this function
+ * ever gets inlined it will generate worse code.
+ */
+ asm volatile ("");
+ return last;
+}
+
+static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
+{
+ long v;
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ u64 tsc_pg_val;
+
+ switch (gtod->clock.vclock_mode) {
+ case VCLOCK_HVCLOCK:
+ tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
+ tsc_timestamp);
+ if (tsc_pg_val != U64_MAX) {
+ /* TSC page valid */
+ *mode = VCLOCK_HVCLOCK;
+ v = (tsc_pg_val - gtod->clock.cycle_last) &
+ gtod->clock.mask;
+ } else {
+ /* TSC page invalid */
+ *mode = VCLOCK_NONE;
+ }
+ break;
+ case VCLOCK_TSC:
+ *mode = VCLOCK_TSC;
+ *tsc_timestamp = read_tsc();
+ v = (*tsc_timestamp - gtod->clock.cycle_last) &
+ gtod->clock.mask;
+ break;
+ default:
+ *mode = VCLOCK_NONE;
+ }
+
+ if (*mode == VCLOCK_NONE)
+ *tsc_timestamp = v = 0;
+
+ return v * gtod->clock.mult;
+}
+
+static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
+{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ unsigned long seq;
+ int mode;
+ u64 ns;
+
+ do {
+ seq = read_seqcount_begin(&gtod->seq);
+ ns = gtod->nsec_base;
+ ns += vgettsc(tsc_timestamp, &mode);
+ ns >>= gtod->clock.shift;
+ ns += gtod->boot_ns;
+ } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+ *t = ns;
+
+ return mode;
+}
+
+static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
+{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ unsigned long seq;
+ int mode;
+ u64 ns;
+
+ do {
+ seq = read_seqcount_begin(&gtod->seq);
+ ts->tv_sec = gtod->wall_time_sec;
+ ns = gtod->nsec_base;
+ ns += vgettsc(tsc_timestamp, &mode);
+ ns >>= gtod->clock.shift;
+ } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+
+ ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+ ts->tv_nsec = ns;
+
+ return mode;
+}
+
+/* returns true if host is using TSC based clocksource */
+static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
+{
+ /* checked again under seqlock below */
+ if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
+ return false;
+
+ return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
+ tsc_timestamp));
+}
+
+/* returns true if host is using TSC based clocksource */
+static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
+ u64 *tsc_timestamp)
+{
+ /* checked again under seqlock below */
+ if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
+ return false;
+
+ return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
+}
+#endif
+
+/*
+ *
+ * Assuming a stable TSC across physical CPUS, and a stable TSC
+ * across virtual CPUs, the following condition is possible.
+ * Each numbered line represents an event visible to both
+ * CPUs at the next numbered event.
+ *
+ * "timespecX" represents host monotonic time. "tscX" represents
+ * RDTSC value.
+ *
+ * VCPU0 on CPU0 | VCPU1 on CPU1
+ *
+ * 1. read timespec0,tsc0
+ * 2. | timespec1 = timespec0 + N
+ * | tsc1 = tsc0 + M
+ * 3. transition to guest | transition to guest
+ * 4. ret0 = timespec0 + (rdtsc - tsc0) |
+ * 5. | ret1 = timespec1 + (rdtsc - tsc1)
+ * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
+ *
+ * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
+ *
+ * - ret0 < ret1
+ * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
+ * ...
+ * - 0 < N - M => M < N
+ *
+ * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
+ * always the case (the difference between two distinct xtime instances
+ * might be smaller then the difference between corresponding TSC reads,
+ * when updating guest vcpus pvclock areas).
+ *
+ * To avoid that problem, do not allow visibility of distinct
+ * system_timestamp/tsc_timestamp values simultaneously: use a master
+ * copy of host monotonic time values. Update that master copy
+ * in lockstep.
+ *
+ * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
+ *
+ */
+
+static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+ struct kvm_arch *ka = &kvm->arch;
+ int vclock_mode;
+ bool host_tsc_clocksource, vcpus_matched;
+
+ vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
+ atomic_read(&kvm->online_vcpus));
+
+ /*
+ * If the host uses TSC clock, then passthrough TSC as stable
+ * to the guest.
+ */
+ host_tsc_clocksource = kvm_get_time_and_clockread(
+ &ka->master_kernel_ns,
+ &ka->master_cycle_now);
+
+ ka->use_master_clock = host_tsc_clocksource && vcpus_matched
+ && !ka->backwards_tsc_observed
+ && !ka->boot_vcpu_runs_old_kvmclock;
+
+ if (ka->use_master_clock)
+ atomic_set(&kvm_guest_has_master_clock, 1);
+
+ vclock_mode = pvclock_gtod_data.clock.vclock_mode;
+ trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
+ vcpus_matched);
+#endif
+}
+
+void kvm_make_mclock_inprogress_request(struct kvm *kvm)
+{
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
+}
+
+static void kvm_gen_update_masterclock(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+ int i;
+ struct kvm_vcpu *vcpu;
+ struct kvm_arch *ka = &kvm->arch;
+
+ spin_lock(&ka->pvclock_gtod_sync_lock);
+ kvm_make_mclock_inprogress_request(kvm);
+ /* no guest entries from this point */
+ pvclock_update_vm_gtod_copy(kvm);
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+
+ /* guest entries allowed */
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
+
+ spin_unlock(&ka->pvclock_gtod_sync_lock);
+#endif
+}
+
+u64 get_kvmclock_ns(struct kvm *kvm)
+{
+ struct kvm_arch *ka = &kvm->arch;
+ struct pvclock_vcpu_time_info hv_clock;
+ u64 ret;
+
+ spin_lock(&ka->pvclock_gtod_sync_lock);
+ if (!ka->use_master_clock) {
+ spin_unlock(&ka->pvclock_gtod_sync_lock);
+ return ktime_get_boot_ns() + ka->kvmclock_offset;
+ }
+
+ hv_clock.tsc_timestamp = ka->master_cycle_now;
+ hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
+ spin_unlock(&ka->pvclock_gtod_sync_lock);
+
+ /* both __this_cpu_read() and rdtsc() should be on the same cpu */
+ get_cpu();
+
+ if (__this_cpu_read(cpu_tsc_khz)) {
+ kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
+ &hv_clock.tsc_shift,
+ &hv_clock.tsc_to_system_mul);
+ ret = __pvclock_read_cycles(&hv_clock, rdtsc());
+ } else
+ ret = ktime_get_boot_ns() + ka->kvmclock_offset;
+
+ put_cpu();
+
+ return ret;
+}
+
+static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
+{
+ struct kvm_vcpu_arch *vcpu = &v->arch;
+ struct pvclock_vcpu_time_info guest_hv_clock;
+
+ if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+ &guest_hv_clock, sizeof(guest_hv_clock))))
+ return;
+
+ /* This VCPU is paused, but it's legal for a guest to read another
+ * VCPU's kvmclock, so we really have to follow the specification where
+ * it says that version is odd if data is being modified, and even after
+ * it is consistent.
+ *
+ * Version field updates must be kept separate. This is because
+ * kvm_write_guest_cached might use a "rep movs" instruction, and
+ * writes within a string instruction are weakly ordered. So there
+ * are three writes overall.
+ *
+ * As a small optimization, only write the version field in the first
+ * and third write. The vcpu->pv_time cache is still valid, because the
+ * version field is the first in the struct.
+ */
+ BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+ if (guest_hv_clock.version & 1)
+ ++guest_hv_clock.version; /* first time write, random junk */
+
+ vcpu->hv_clock.version = guest_hv_clock.version + 1;
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
+
+ smp_wmb();
+
+ /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+ vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
+
+ if (vcpu->pvclock_set_guest_stopped_request) {
+ vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
+ vcpu->pvclock_set_guest_stopped_request = false;
+ }
+
+ trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock));
+
+ smp_wmb();
+
+ vcpu->hv_clock.version++;
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
+}
+
+static int kvm_guest_time_update(struct kvm_vcpu *v)
+{
+ unsigned long flags, tgt_tsc_khz;
+ struct kvm_vcpu_arch *vcpu = &v->arch;
+ struct kvm_arch *ka = &v->kvm->arch;
+ s64 kernel_ns;
+ u64 tsc_timestamp, host_tsc;
+ u8 pvclock_flags;
+ bool use_master_clock;
+
+ kernel_ns = 0;
+ host_tsc = 0;
+
+ /*
+ * If the host uses TSC clock, then passthrough TSC as stable
+ * to the guest.
+ */
+ spin_lock(&ka->pvclock_gtod_sync_lock);
+ use_master_clock = ka->use_master_clock;
+ if (use_master_clock) {
+ host_tsc = ka->master_cycle_now;
+ kernel_ns = ka->master_kernel_ns;
+ }
+ spin_unlock(&ka->pvclock_gtod_sync_lock);
+
+ /* Keep irq disabled to prevent changes to the clock */
+ local_irq_save(flags);
+ tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
+ if (unlikely(tgt_tsc_khz == 0)) {
+ local_irq_restore(flags);
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
+ return 1;
+ }
+ if (!use_master_clock) {
+ host_tsc = rdtsc();
+ kernel_ns = ktime_get_boot_ns();
+ }
+
+ tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
+
+ /*
+ * We may have to catch up the TSC to match elapsed wall clock
+ * time for two reasons, even if kvmclock is used.
+ * 1) CPU could have been running below the maximum TSC rate
+ * 2) Broken TSC compensation resets the base at each VCPU
+ * entry to avoid unknown leaps of TSC even when running
+ * again on the same CPU. This may cause apparent elapsed
+ * time to disappear, and the guest to stand still or run
+ * very slowly.
+ */
+ if (vcpu->tsc_catchup) {
+ u64 tsc = compute_guest_tsc(v, kernel_ns);
+ if (tsc > tsc_timestamp) {
+ adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
+ tsc_timestamp = tsc;
+ }
+ }
+
+ local_irq_restore(flags);
+
+ /* With all the info we got, fill in the values */
+
+ if (kvm_has_tsc_control)
+ tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
+
+ if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
+ kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
+ &vcpu->hv_clock.tsc_shift,
+ &vcpu->hv_clock.tsc_to_system_mul);
+ vcpu->hw_tsc_khz = tgt_tsc_khz;
+ }
+
+ vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
+ vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
+ vcpu->last_guest_tsc = tsc_timestamp;
+
+ /* If the host uses TSC clocksource, then it is stable */
+ pvclock_flags = 0;
+ if (use_master_clock)
+ pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
+
+ vcpu->hv_clock.flags = pvclock_flags;
+
+ if (vcpu->pv_time_enabled)
+ kvm_setup_pvclock_page(v);
+ if (v == kvm_get_vcpu(v->kvm, 0))
+ kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
+ return 0;
+}
+
+/*
+ * kvmclock updates which are isolated to a given vcpu, such as
+ * vcpu->cpu migration, should not allow system_timestamp from
+ * the rest of the vcpus to remain static. Otherwise ntp frequency
+ * correction applies to one vcpu's system_timestamp but not
+ * the others.
+ *
+ * So in those cases, request a kvmclock update for all vcpus.
+ * We need to rate-limit these requests though, as they can
+ * considerably slow guests that have a large number of vcpus.
+ * The time for a remote vcpu to update its kvmclock is bound
+ * by the delay we use to rate-limit the updates.
+ */
+
+#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
+
+static void kvmclock_update_fn(struct work_struct *work)
+{
+ int i;
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
+ kvmclock_update_work);
+ struct kvm *kvm = container_of(ka, struct kvm, arch);
+ struct kvm_vcpu *vcpu;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
+}
+
+static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
+{
+ struct kvm *kvm = v->kvm;
+
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
+ schedule_delayed_work(&kvm->arch.kvmclock_update_work,
+ KVMCLOCK_UPDATE_DELAY);
+}
+
+#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
+
+static void kvmclock_sync_fn(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
+ kvmclock_sync_work);
+ struct kvm *kvm = container_of(ka, struct kvm, arch);
+
+ if (!kvmclock_periodic_sync)
+ return;
+
+ schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
+ schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
+ KVMCLOCK_SYNC_PERIOD);
+}
+
+static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+
+ switch (msr) {
+ case MSR_IA32_MCG_STATUS:
+ vcpu->arch.mcg_status = data;
+ break;
+ case MSR_IA32_MCG_CTL:
+ if (!(mcg_cap & MCG_CTL_P) &&
+ (data || !msr_info->host_initiated))
+ return 1;
+ if (data != 0 && data != ~(u64)0)
+ return 1;
+ vcpu->arch.mcg_ctl = data;
+ break;
+ default:
+ if (msr >= MSR_IA32_MC0_CTL &&
+ msr < MSR_IA32_MCx_CTL(bank_num)) {
+ u32 offset = array_index_nospec(
+ msr - MSR_IA32_MC0_CTL,
+ MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+
+ /* only 0 or all 1s can be written to IA32_MCi_CTL
+ * some Linux kernels though clear bit 10 in bank 4 to
+ * workaround a BIOS/GART TBL issue on AMD K8s, ignore
+ * this to avoid an uncatched #GP in the guest
+ */
+ if ((offset & 0x3) == 0 &&
+ data != 0 && (data | (1 << 10)) != ~(u64)0)
+ return -1;
+ if (!msr_info->host_initiated &&
+ (offset & 0x3) == 1 && data != 0)
+ return -1;
+ vcpu->arch.mce_banks[offset] = data;
+ break;
+ }
+ return 1;
+ }
+ return 0;
+}
+
+static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct kvm *kvm = vcpu->kvm;
+ int lm = is_long_mode(vcpu);
+ u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
+ : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
+ u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
+ : kvm->arch.xen_hvm_config.blob_size_32;
+ u32 page_num = data & ~PAGE_MASK;
+ u64 page_addr = data & PAGE_MASK;
+ u8 *page;
+ int r;
+
+ r = -E2BIG;
+ if (page_num >= blob_size)
+ goto out;
+ r = -ENOMEM;
+ page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
+ if (IS_ERR(page)) {
+ r = PTR_ERR(page);
+ goto out;
+ }
+ if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
+ goto out_free;
+ r = 0;
+out_free:
+ kfree(page);
+out:
+ return r;
+}
+
+static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
+{
+ gpa_t gpa = data & ~0x3f;
+
+ /* Bits 3:5 are reserved, Should be zero */
+ if (data & 0x38)
+ return 1;
+
+ vcpu->arch.apf.msr_val = data;
+
+ if (!(data & KVM_ASYNC_PF_ENABLED)) {
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_async_pf_hash_reset(vcpu);
+ return 0;
+ }
+
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
+ sizeof(u32)))
+ return 1;
+
+ vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
+ vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
+ kvm_async_pf_wakeup_all(vcpu);
+ return 0;
+}
+
+static void kvmclock_reset(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pv_time_enabled = false;
+}
+
+static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
+{
+ ++vcpu->stat.tlb_flush;
+ kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa);
+}
+
+static void record_steal_time(struct kvm_vcpu *vcpu)
+{
+ struct kvm_host_map map;
+ struct kvm_steal_time *st;
+
+ if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+ return;
+
+ /* -EAGAIN is returned in atomic context so we can just return. */
+ if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
+ &map, &vcpu->arch.st.cache, false))
+ return;
+
+ st = map.hva +
+ offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+
+ /*
+ * Doing a TLB flush here, on the guest's behalf, can avoid
+ * expensive IPIs.
+ */
+ if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
+ kvm_vcpu_flush_tlb(vcpu, false);
+
+ vcpu->arch.st.preempted = 0;
+
+ if (st->version & 1)
+ st->version += 1; /* first time write, random junk */
+
+ st->version += 1;
+
+ smp_wmb();
+
+ st->steal += current->sched_info.run_delay -
+ vcpu->arch.st.last_steal;
+ vcpu->arch.st.last_steal = current->sched_info.run_delay;
+
+ smp_wmb();
+
+ st->version += 1;
+
+ kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
+}
+
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ bool pr = false;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+
+ switch (msr) {
+ case MSR_AMD64_NB_CFG:
+ case MSR_IA32_UCODE_WRITE:
+ case MSR_VM_HSAVE_PA:
+ case MSR_AMD64_PATCH_LOADER:
+ case MSR_AMD64_BU_CFG2:
+ case MSR_AMD64_DC_CFG:
+ case MSR_F15H_EX_CFG:
+ break;
+
+ case MSR_IA32_UCODE_REV:
+ if (msr_info->host_initiated)
+ vcpu->arch.microcode_version = data;
+ break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated)
+ return 1;
+ vcpu->arch.arch_capabilities = data;
+ break;
+ case MSR_EFER:
+ return set_efer(vcpu, msr_info);
+ case MSR_K7_HWCR:
+ data &= ~(u64)0x40; /* ignore flush filter disable */
+ data &= ~(u64)0x100; /* ignore ignne emulation enable */
+ data &= ~(u64)0x8; /* ignore TLB cache disable */
+ data &= ~(u64)0x40000; /* ignore Mc status write enable */
+ if (data != 0) {
+ vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
+ data);
+ return 1;
+ }
+ break;
+ case MSR_FAM10H_MMIO_CONF_BASE:
+ if (data != 0) {
+ vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
+ "0x%llx\n", data);
+ return 1;
+ }
+ break;
+ case MSR_IA32_DEBUGCTLMSR:
+ if (!data) {
+ /* We support the non-activated case already */
+ break;
+ } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
+ /* Values other than LBR and BTF are vendor-specific,
+ thus reserved and should throw a #GP */
+ return 1;
+ }
+ vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
+ __func__, data);
+ break;
+ case 0x200 ... 0x2ff:
+ return kvm_mtrr_set_msr(vcpu, msr, data);
+ case MSR_IA32_APICBASE:
+ return kvm_set_apic_base(vcpu, msr_info);
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
+ return kvm_x2apic_msr_write(vcpu, msr, data);
+ case MSR_IA32_TSCDEADLINE:
+ kvm_set_lapic_tscdeadline_msr(vcpu, data);
+ break;
+ case MSR_IA32_TSC_ADJUST:
+ if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
+ if (!msr_info->host_initiated) {
+ s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
+ adjust_tsc_offset_guest(vcpu, adj);
+ /* Before back to guest, tsc_timestamp must be adjusted
+ * as well, otherwise guest's percpu pvclock time could jump.
+ */
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ }
+ vcpu->arch.ia32_tsc_adjust_msr = data;
+ }
+ break;
+ case MSR_IA32_MISC_ENABLE:
+ vcpu->arch.ia32_misc_enable_msr = data;
+ break;
+ case MSR_IA32_SMBASE:
+ if (!msr_info->host_initiated)
+ return 1;
+ vcpu->arch.smbase = data;
+ break;
+ case MSR_IA32_TSC:
+ kvm_write_tsc(vcpu, msr_info);
+ break;
+ case MSR_SMI_COUNT:
+ if (!msr_info->host_initiated)
+ return 1;
+ vcpu->arch.smi_count = data;
+ break;
+ case MSR_KVM_WALL_CLOCK_NEW:
+ case MSR_KVM_WALL_CLOCK:
+ vcpu->kvm->arch.wall_clock = data;
+ kvm_write_wall_clock(vcpu->kvm, data);
+ break;
+ case MSR_KVM_SYSTEM_TIME_NEW:
+ case MSR_KVM_SYSTEM_TIME: {
+ struct kvm_arch *ka = &vcpu->kvm->arch;
+
+ kvmclock_reset(vcpu);
+
+ if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
+ bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
+
+ if (ka->boot_vcpu_runs_old_kvmclock != tmp)
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+
+ ka->boot_vcpu_runs_old_kvmclock = tmp;
+ }
+
+ vcpu->arch.time = data;
+ kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
+
+ /* we verify if the enable bit is set... */
+ if (!(data & 1))
+ break;
+
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ &vcpu->arch.pv_time, data & ~1ULL,
+ sizeof(struct pvclock_vcpu_time_info)))
+ vcpu->arch.pv_time_enabled = false;
+ else
+ vcpu->arch.pv_time_enabled = true;
+
+ break;
+ }
+ case MSR_KVM_ASYNC_PF_EN:
+ if (kvm_pv_enable_async_pf(vcpu, data))
+ return 1;
+ break;
+ case MSR_KVM_STEAL_TIME:
+
+ if (unlikely(!sched_info_on()))
+ return 1;
+
+ if (data & KVM_STEAL_RESERVED_MASK)
+ return 1;
+
+ vcpu->arch.st.msr_val = data;
+
+ if (!(data & KVM_MSR_ENABLED))
+ break;
+
+ kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
+
+ break;
+ case MSR_KVM_PV_EOI_EN:
+ if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
+ return 1;
+ break;
+
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ return set_msr_mce(vcpu, msr_info);
+
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+ case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
+ pr = true; /* fall through */
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+ case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
+ if (kvm_pmu_is_valid_msr(vcpu, msr))
+ return kvm_pmu_set_msr(vcpu, msr_info);
+
+ if (pr || data != 0)
+ vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
+ "0x%x data 0x%llx\n", msr, data);
+ break;
+ case MSR_K7_CLK_CTL:
+ /*
+ * Ignore all writes to this no longer documented MSR.
+ * Writes are only relevant for old K7 processors,
+ * all pre-dating SVM, but a recommended workaround from
+ * AMD for these chips. It is possible to specify the
+ * affected processor models on the command line, hence
+ * the need to ignore the workaround.
+ */
+ break;
+ case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_CRASH_CTL:
+ case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ return kvm_hv_set_msr_common(vcpu, msr, data,
+ msr_info->host_initiated);
+ case MSR_IA32_BBL_CR_CTL3:
+ /* Drop writes to this legacy MSR -- see rdmsr
+ * counterpart for further detail.
+ */
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
+ msr, data);
+ break;
+ case MSR_AMD64_OSVW_ID_LENGTH:
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
+ return 1;
+ vcpu->arch.osvw.length = data;
+ break;
+ case MSR_AMD64_OSVW_STATUS:
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
+ return 1;
+ vcpu->arch.osvw.status = data;
+ break;
+ case MSR_PLATFORM_INFO:
+ if (!msr_info->host_initiated ||
+ (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
+ cpuid_fault_enabled(vcpu)))
+ return 1;
+ vcpu->arch.msr_platform_info = data;
+ break;
+ case MSR_MISC_FEATURES_ENABLES:
+ if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
+ (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
+ !supports_cpuid_fault(vcpu)))
+ return 1;
+ vcpu->arch.msr_misc_features_enables = data;
+ break;
+ default:
+ if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
+ return xen_hvm_config(vcpu, data);
+ if (kvm_pmu_is_valid_msr(vcpu, msr))
+ return kvm_pmu_set_msr(vcpu, msr_info);
+ if (!ignore_msrs) {
+ vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
+ msr, data);
+ return 1;
+ } else {
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu,
+ "ignored wrmsr: 0x%x data 0x%llx\n",
+ msr, data);
+ break;
+ }
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_msr_common);
+
+
+/*
+ * Reads an msr value (of 'msr_index') into 'pdata'.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+{
+ return kvm_x86_ops->get_msr(vcpu, msr);
+}
+EXPORT_SYMBOL_GPL(kvm_get_msr);
+
+static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
+{
+ u64 data;
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+
+ switch (msr) {
+ case MSR_IA32_P5_MC_ADDR:
+ case MSR_IA32_P5_MC_TYPE:
+ data = 0;
+ break;
+ case MSR_IA32_MCG_CAP:
+ data = vcpu->arch.mcg_cap;
+ break;
+ case MSR_IA32_MCG_CTL:
+ if (!(mcg_cap & MCG_CTL_P) && !host)
+ return 1;
+ data = vcpu->arch.mcg_ctl;
+ break;
+ case MSR_IA32_MCG_STATUS:
+ data = vcpu->arch.mcg_status;
+ break;
+ default:
+ if (msr >= MSR_IA32_MC0_CTL &&
+ msr < MSR_IA32_MCx_CTL(bank_num)) {
+ u32 offset = array_index_nospec(
+ msr - MSR_IA32_MC0_CTL,
+ MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+
+ data = vcpu->arch.mce_banks[offset];
+ break;
+ }
+ return 1;
+ }
+ *pdata = data;
+ return 0;
+}
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ switch (msr_info->index) {
+ case MSR_IA32_PLATFORM_ID:
+ case MSR_IA32_EBL_CR_POWERON:
+ case MSR_IA32_DEBUGCTLMSR:
+ case MSR_IA32_LASTBRANCHFROMIP:
+ case MSR_IA32_LASTBRANCHTOIP:
+ case MSR_IA32_LASTINTFROMIP:
+ case MSR_IA32_LASTINTTOIP:
+ case MSR_K8_SYSCFG:
+ case MSR_K8_TSEG_ADDR:
+ case MSR_K8_TSEG_MASK:
+ case MSR_K7_HWCR:
+ case MSR_VM_HSAVE_PA:
+ case MSR_K8_INT_PENDING_MSG:
+ case MSR_AMD64_NB_CFG:
+ case MSR_FAM10H_MMIO_CONF_BASE:
+ case MSR_AMD64_BU_CFG2:
+ case MSR_IA32_PERF_CTL:
+ case MSR_AMD64_DC_CFG:
+ case MSR_F15H_EX_CFG:
+ msr_info->data = 0;
+ break;
+ case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+ case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
+ case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
+ if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+ return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
+ msr_info->data = 0;
+ break;
+ case MSR_IA32_UCODE_REV:
+ msr_info->data = vcpu->arch.microcode_version;
+ break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
+ return 1;
+ msr_info->data = vcpu->arch.arch_capabilities;
+ break;
+ case MSR_IA32_TSC:
+ msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
+ break;
+ case MSR_MTRRcap:
+ case 0x200 ... 0x2ff:
+ return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
+ case 0xcd: /* fsb frequency */
+ msr_info->data = 3;
+ break;
+ /*
+ * MSR_EBC_FREQUENCY_ID
+ * Conservative value valid for even the basic CPU models.
+ * Models 0,1: 000 in bits 23:21 indicating a bus speed of
+ * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
+ * and 266MHz for model 3, or 4. Set Core Clock
+ * Frequency to System Bus Frequency Ratio to 1 (bits
+ * 31:24) even though these are only valid for CPU
+ * models > 2, however guests may end up dividing or
+ * multiplying by zero otherwise.
+ */
+ case MSR_EBC_FREQUENCY_ID:
+ msr_info->data = 1 << 24;
+ break;
+ case MSR_IA32_APICBASE:
+ msr_info->data = kvm_get_apic_base(vcpu);
+ break;
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
+ return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
+ break;
+ case MSR_IA32_TSCDEADLINE:
+ msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
+ break;
+ case MSR_IA32_TSC_ADJUST:
+ msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
+ break;
+ case MSR_IA32_MISC_ENABLE:
+ msr_info->data = vcpu->arch.ia32_misc_enable_msr;
+ break;
+ case MSR_IA32_SMBASE:
+ if (!msr_info->host_initiated)
+ return 1;
+ msr_info->data = vcpu->arch.smbase;
+ break;
+ case MSR_SMI_COUNT:
+ msr_info->data = vcpu->arch.smi_count;
+ break;
+ case MSR_IA32_PERF_STATUS:
+ /* TSC increment by tick */
+ msr_info->data = 1000ULL;
+ /* CPU multiplier */
+ msr_info->data |= (((uint64_t)4ULL) << 40);
+ break;
+ case MSR_EFER:
+ msr_info->data = vcpu->arch.efer;
+ break;
+ case MSR_KVM_WALL_CLOCK:
+ case MSR_KVM_WALL_CLOCK_NEW:
+ msr_info->data = vcpu->kvm->arch.wall_clock;
+ break;
+ case MSR_KVM_SYSTEM_TIME:
+ case MSR_KVM_SYSTEM_TIME_NEW:
+ msr_info->data = vcpu->arch.time;
+ break;
+ case MSR_KVM_ASYNC_PF_EN:
+ msr_info->data = vcpu->arch.apf.msr_val;
+ break;
+ case MSR_KVM_STEAL_TIME:
+ msr_info->data = vcpu->arch.st.msr_val;
+ break;
+ case MSR_KVM_PV_EOI_EN:
+ msr_info->data = vcpu->arch.pv_eoi.msr_val;
+ break;
+ case MSR_IA32_P5_MC_ADDR:
+ case MSR_IA32_P5_MC_TYPE:
+ case MSR_IA32_MCG_CAP:
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
+ msr_info->host_initiated);
+ case MSR_K7_CLK_CTL:
+ /*
+ * Provide expected ramp-up count for K7. All other
+ * are set to zero, indicating minimum divisors for
+ * every field.
+ *
+ * This prevents guest kernels on AMD host with CPU
+ * type 6, model 8 and higher from exploding due to
+ * the rdmsr failing.
+ */
+ msr_info->data = 0x20000000;
+ break;
+ case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_CRASH_CTL:
+ case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ return kvm_hv_get_msr_common(vcpu,
+ msr_info->index, &msr_info->data,
+ msr_info->host_initiated);
+ break;
+ case MSR_IA32_BBL_CR_CTL3:
+ /* This legacy MSR exists but isn't fully documented in current
+ * silicon. It is however accessed by winxp in very narrow
+ * scenarios where it sets bit #19, itself documented as
+ * a "reserved" bit. Best effort attempt to source coherent
+ * read data here should the balance of the register be
+ * interpreted by the guest:
+ *
+ * L2 cache control register 3: 64GB range, 256KB size,
+ * enabled, latency 0x1, configured
+ */
+ msr_info->data = 0xbe702111;
+ break;
+ case MSR_AMD64_OSVW_ID_LENGTH:
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
+ return 1;
+ msr_info->data = vcpu->arch.osvw.length;
+ break;
+ case MSR_AMD64_OSVW_STATUS:
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
+ return 1;
+ msr_info->data = vcpu->arch.osvw.status;
+ break;
+ case MSR_PLATFORM_INFO:
+ if (!msr_info->host_initiated &&
+ !vcpu->kvm->arch.guest_can_read_msr_platform_info)
+ return 1;
+ msr_info->data = vcpu->arch.msr_platform_info;
+ break;
+ case MSR_MISC_FEATURES_ENABLES:
+ msr_info->data = vcpu->arch.msr_misc_features_enables;
+ break;
+ default:
+ if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+ return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
+ if (!ignore_msrs) {
+ vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
+ msr_info->index);
+ return 1;
+ } else {
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n",
+ msr_info->index);
+ msr_info->data = 0;
+ }
+ break;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_get_msr_common);
+
+/*
+ * Read or write a bunch of msrs. All parameters are kernel addresses.
+ *
+ * @return number of msrs set successfully.
+ */
+static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
+ struct kvm_msr_entry *entries,
+ int (*do_msr)(struct kvm_vcpu *vcpu,
+ unsigned index, u64 *data))
+{
+ int i;
+
+ for (i = 0; i < msrs->nmsrs; ++i)
+ if (do_msr(vcpu, entries[i].index, &entries[i].data))
+ break;
+
+ return i;
+}
+
+/*
+ * Read or write a bunch of msrs. Parameters are user addresses.
+ *
+ * @return number of msrs set successfully.
+ */
+static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
+ int (*do_msr)(struct kvm_vcpu *vcpu,
+ unsigned index, u64 *data),
+ int writeback)
+{
+ struct kvm_msrs msrs;
+ struct kvm_msr_entry *entries;
+ int r, n;
+ unsigned size;
+
+ r = -EFAULT;
+ if (copy_from_user(&msrs, user_msrs, sizeof msrs))
+ goto out;
+
+ r = -E2BIG;
+ if (msrs.nmsrs >= MAX_IO_MSRS)
+ goto out;
+
+ size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
+ entries = memdup_user(user_msrs->entries, size);
+ if (IS_ERR(entries)) {
+ r = PTR_ERR(entries);
+ goto out;
+ }
+
+ r = n = __msr_io(vcpu, &msrs, entries, do_msr);
+ if (r < 0)
+ goto out_free;
+
+ r = -EFAULT;
+ if (writeback && copy_to_user(user_msrs->entries, entries, size))
+ goto out_free;
+
+ r = n;
+
+out_free:
+ kfree(entries);
+out:
+ return r;
+}
+
+static inline bool kvm_can_mwait_in_guest(void)
+{
+ return boot_cpu_has(X86_FEATURE_MWAIT) &&
+ !boot_cpu_has_bug(X86_BUG_MONITOR) &&
+ boot_cpu_has(X86_FEATURE_ARAT);
+}
+
+int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
+{
+ int r = 0;
+
+ switch (ext) {
+ case KVM_CAP_IRQCHIP:
+ case KVM_CAP_HLT:
+ case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
+ case KVM_CAP_SET_TSS_ADDR:
+ case KVM_CAP_EXT_CPUID:
+ case KVM_CAP_EXT_EMUL_CPUID:
+ case KVM_CAP_CLOCKSOURCE:
+ case KVM_CAP_PIT:
+ case KVM_CAP_NOP_IO_DELAY:
+ case KVM_CAP_MP_STATE:
+ case KVM_CAP_SYNC_MMU:
+ case KVM_CAP_USER_NMI:
+ case KVM_CAP_REINJECT_CONTROL:
+ case KVM_CAP_IRQ_INJECT_STATUS:
+ case KVM_CAP_IOEVENTFD:
+ case KVM_CAP_IOEVENTFD_NO_LENGTH:
+ case KVM_CAP_PIT2:
+ case KVM_CAP_PIT_STATE2:
+ case KVM_CAP_SET_IDENTITY_MAP_ADDR:
+ case KVM_CAP_XEN_HVM:
+ case KVM_CAP_VCPU_EVENTS:
+ case KVM_CAP_HYPERV:
+ case KVM_CAP_HYPERV_VAPIC:
+ case KVM_CAP_HYPERV_SPIN:
+ case KVM_CAP_HYPERV_SYNIC:
+ case KVM_CAP_HYPERV_SYNIC2:
+ case KVM_CAP_HYPERV_VP_INDEX:
+ case KVM_CAP_HYPERV_EVENTFD:
+ case KVM_CAP_HYPERV_TLBFLUSH:
+ case KVM_CAP_PCI_SEGMENT:
+ case KVM_CAP_DEBUGREGS:
+ case KVM_CAP_X86_ROBUST_SINGLESTEP:
+ case KVM_CAP_XSAVE:
+ case KVM_CAP_ASYNC_PF:
+ case KVM_CAP_GET_TSC_KHZ:
+ case KVM_CAP_KVMCLOCK_CTRL:
+ case KVM_CAP_READONLY_MEM:
+ case KVM_CAP_HYPERV_TIME:
+ case KVM_CAP_IOAPIC_POLARITY_IGNORED:
+ case KVM_CAP_TSC_DEADLINE_TIMER:
+ case KVM_CAP_ENABLE_CAP_VM:
+ case KVM_CAP_DISABLE_QUIRKS:
+ case KVM_CAP_SET_BOOT_CPU_ID:
+ case KVM_CAP_SPLIT_IRQCHIP:
+ case KVM_CAP_IMMEDIATE_EXIT:
+ case KVM_CAP_GET_MSR_FEATURES:
+ case KVM_CAP_MSR_PLATFORM_INFO:
+ r = 1;
+ break;
+ case KVM_CAP_SYNC_REGS:
+ r = KVM_SYNC_X86_VALID_FIELDS;
+ break;
+ case KVM_CAP_ADJUST_CLOCK:
+ r = KVM_CLOCK_TSC_STABLE;
+ break;
+ case KVM_CAP_X86_DISABLE_EXITS:
+ r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE;
+ if(kvm_can_mwait_in_guest())
+ r |= KVM_X86_DISABLE_EXITS_MWAIT;
+ break;
+ case KVM_CAP_X86_SMM:
+ /* SMBASE is usually relocated above 1M on modern chipsets,
+ * and SMM handlers might indeed rely on 4G segment limits,
+ * so do not report SMM to be available if real mode is
+ * emulated via vm86 mode. Still, do not go to great lengths
+ * to avoid userspace's usage of the feature, because it is a
+ * fringe case that is not enabled except via specific settings
+ * of the module parameters.
+ */
+ r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE);
+ break;
+ case KVM_CAP_VAPIC:
+ r = !kvm_x86_ops->cpu_has_accelerated_tpr();
+ break;
+ case KVM_CAP_NR_VCPUS:
+ r = KVM_SOFT_MAX_VCPUS;
+ break;
+ case KVM_CAP_MAX_VCPUS:
+ r = KVM_MAX_VCPUS;
+ break;
+ case KVM_CAP_MAX_VCPU_ID:
+ r = KVM_MAX_VCPU_ID;
+ break;
+ case KVM_CAP_NR_MEMSLOTS:
+ r = KVM_USER_MEM_SLOTS;
+ break;
+ case KVM_CAP_PV_MMU: /* obsolete */
+ r = 0;
+ break;
+ case KVM_CAP_MCE:
+ r = KVM_MAX_MCE_BANKS;
+ break;
+ case KVM_CAP_XCRS:
+ r = boot_cpu_has(X86_FEATURE_XSAVE);
+ break;
+ case KVM_CAP_TSC_CONTROL:
+ r = kvm_has_tsc_control;
+ break;
+ case KVM_CAP_X2APIC_API:
+ r = KVM_X2APIC_API_VALID_FLAGS;
+ break;
+ case KVM_CAP_NESTED_STATE:
+ r = kvm_x86_ops->get_nested_state ?
+ kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0;
+ break;
+ default:
+ break;
+ }
+ return r;
+
+}
+
+long kvm_arch_dev_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ void __user *argp = (void __user *)arg;
+ long r;
+
+ switch (ioctl) {
+ case KVM_GET_MSR_INDEX_LIST: {
+ struct kvm_msr_list __user *user_msr_list = argp;
+ struct kvm_msr_list msr_list;
+ unsigned n;
+
+ r = -EFAULT;
+ if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
+ goto out;
+ n = msr_list.nmsrs;
+ msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
+ if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
+ goto out;
+ r = -E2BIG;
+ if (n < msr_list.nmsrs)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(user_msr_list->indices, &msrs_to_save,
+ num_msrs_to_save * sizeof(u32)))
+ goto out;
+ if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
+ &emulated_msrs,
+ num_emulated_msrs * sizeof(u32)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_GET_SUPPORTED_CPUID:
+ case KVM_GET_EMULATED_CPUID: {
+ struct kvm_cpuid2 __user *cpuid_arg = argp;
+ struct kvm_cpuid2 cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ goto out;
+
+ r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
+ ioctl);
+ if (r)
+ goto out;
+
+ r = -EFAULT;
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_X86_GET_MCE_CAP_SUPPORTED: {
+ r = -EFAULT;
+ if (copy_to_user(argp, &kvm_mce_cap_supported,
+ sizeof(kvm_mce_cap_supported)))
+ goto out;
+ r = 0;
+ break;
+ case KVM_GET_MSR_FEATURE_INDEX_LIST: {
+ struct kvm_msr_list __user *user_msr_list = argp;
+ struct kvm_msr_list msr_list;
+ unsigned int n;
+
+ r = -EFAULT;
+ if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
+ goto out;
+ n = msr_list.nmsrs;
+ msr_list.nmsrs = num_msr_based_features;
+ if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
+ goto out;
+ r = -E2BIG;
+ if (n < msr_list.nmsrs)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(user_msr_list->indices, &msr_based_features,
+ num_msr_based_features * sizeof(u32)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_GET_MSRS:
+ r = msr_io(NULL, argp, do_get_msr_feature, 1);
+ break;
+ }
+ default:
+ r = -EINVAL;
+ }
+out:
+ return r;
+}
+
+static void wbinvd_ipi(void *garbage)
+{
+ wbinvd();
+}
+
+static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+ return kvm_arch_has_noncoherent_dma(vcpu->kvm);
+}
+
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ /* Address WBINVD may be executed by guest */
+ if (need_emulate_wbinvd(vcpu)) {
+ if (kvm_x86_ops->has_wbinvd_exit())
+ cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
+ else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
+ smp_call_function_single(vcpu->cpu,
+ wbinvd_ipi, NULL, 1);
+ }
+
+ kvm_x86_ops->vcpu_load(vcpu, cpu);
+
+ /* Apply any externally detected TSC adjustments (due to suspend) */
+ if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+ adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
+ vcpu->arch.tsc_offset_adjustment = 0;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ }
+
+ if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
+ s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
+ rdtsc() - vcpu->arch.last_host_tsc;
+ if (tsc_delta < 0)
+ mark_tsc_unstable("KVM discovered backwards TSC");
+
+ if (kvm_check_tsc_unstable()) {
+ u64 offset = kvm_compute_tsc_offset(vcpu,
+ vcpu->arch.last_guest_tsc);
+ kvm_vcpu_write_tsc_offset(vcpu, offset);
+ vcpu->arch.tsc_catchup = 1;
+ }
+
+ if (kvm_lapic_hv_timer_in_use(vcpu))
+ kvm_lapic_restart_hv_timer(vcpu);
+
+ /*
+ * On a host with synchronized TSC, there is no need to update
+ * kvmclock on vcpu->cpu migration
+ */
+ if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
+ kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
+ if (vcpu->cpu != cpu)
+ kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
+ vcpu->cpu = cpu;
+ }
+
+ kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
+}
+
+static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
+{
+ struct kvm_host_map map;
+ struct kvm_steal_time *st;
+
+ if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+ return;
+
+ if (vcpu->arch.st.preempted)
+ return;
+
+ if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
+ &vcpu->arch.st.cache, true))
+ return;
+
+ st = map.hva +
+ offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+
+ st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+
+ kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+}
+
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ int idx;
+
+ if (vcpu->preempted)
+ vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu);
+
+ /*
+ * Disable page faults because we're in atomic context here.
+ * kvm_write_guest_offset_cached() would call might_fault()
+ * that relies on pagefault_disable() to tell if there's a
+ * bug. NOTE: the write to guest memory may not go through if
+ * during postcopy live migration or if there's heavy guest
+ * paging.
+ */
+ pagefault_disable();
+ /*
+ * kvm_memslots() will be called by
+ * kvm_write_guest_offset_cached() so take the srcu lock.
+ */
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_steal_time_set_preempted(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ pagefault_enable();
+ kvm_x86_ops->vcpu_put(vcpu);
+ vcpu->arch.last_host_tsc = rdtsc();
+ /*
+ * If userspace has set any breakpoints or watchpoints, dr6 is restored
+ * on every vmexit, but if not, we might have a stale dr6 from the
+ * guest. do_debug expects dr6 to be cleared after it runs, do the same.
+ */
+ set_debugreg(0, 6);
+}
+
+static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s)
+{
+ if (vcpu->arch.apicv_active)
+ kvm_x86_ops->sync_pir_to_irr(vcpu);
+
+ return kvm_apic_get_state(vcpu, s);
+}
+
+static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s)
+{
+ int r;
+
+ r = kvm_apic_set_state(vcpu, s);
+ if (r)
+ return r;
+ update_cr8_intercept(vcpu);
+
+ return 0;
+}
+
+static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
+{
+ /*
+ * We can accept userspace's request for interrupt injection
+ * as long as we have a place to store the interrupt number.
+ * The actual injection will happen when the CPU is able to
+ * deliver the interrupt.
+ */
+ if (kvm_cpu_has_extint(vcpu))
+ return false;
+
+ /* Acknowledging ExtINT does not happen if LINT0 is masked. */
+ return (!lapic_in_kernel(vcpu) ||
+ kvm_apic_accept_pic_intr(vcpu));
+}
+
+static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Do not cause an interrupt window exit if an exception
+ * is pending or an event needs reinjection; userspace
+ * might want to inject the interrupt manually using KVM_SET_REGS
+ * or KVM_SET_SREGS. For that to work, we must be at an
+ * instruction boundary and with no events half-injected.
+ */
+ return (kvm_arch_interrupt_allowed(vcpu) &&
+ kvm_cpu_accept_dm_intr(vcpu) &&
+ !kvm_event_needs_reinjection(vcpu) &&
+ !vcpu->arch.exception.pending);
+}
+
+static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
+ struct kvm_interrupt *irq)
+{
+ if (irq->irq >= KVM_NR_INTERRUPTS)
+ return -EINVAL;
+
+ if (!irqchip_in_kernel(vcpu->kvm)) {
+ kvm_queue_interrupt(vcpu, irq->irq, false);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return 0;
+ }
+
+ /*
+ * With in-kernel LAPIC, we only use this to inject EXTINT, so
+ * fail for in-kernel 8259.
+ */
+ if (pic_in_kernel(vcpu->kvm))
+ return -ENXIO;
+
+ if (vcpu->arch.pending_external_vector != -1)
+ return -EEXIST;
+
+ vcpu->arch.pending_external_vector = irq->irq;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
+{
+ kvm_inject_nmi(vcpu);
+
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
+{
+ kvm_make_request(KVM_REQ_SMI, vcpu);
+
+ return 0;
+}
+
+static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
+ struct kvm_tpr_access_ctl *tac)
+{
+ if (tac->flags)
+ return -EINVAL;
+ vcpu->arch.tpr_access_reporting = !!tac->enabled;
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
+ u64 mcg_cap)
+{
+ int r;
+ unsigned bank_num = mcg_cap & 0xff, bank;
+
+ r = -EINVAL;
+ if (!bank_num || bank_num > KVM_MAX_MCE_BANKS)
+ goto out;
+ if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
+ goto out;
+ r = 0;
+ vcpu->arch.mcg_cap = mcg_cap;
+ /* Init IA32_MCG_CTL to all 1s */
+ if (mcg_cap & MCG_CTL_P)
+ vcpu->arch.mcg_ctl = ~(u64)0;
+ /* Init IA32_MCi_CTL to all 1s */
+ for (bank = 0; bank < bank_num; bank++)
+ vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+
+ if (kvm_x86_ops->setup_mce)
+ kvm_x86_ops->setup_mce(vcpu);
+out:
+ return r;
+}
+
+static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
+ struct kvm_x86_mce *mce)
+{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+ u64 *banks = vcpu->arch.mce_banks;
+
+ if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
+ return -EINVAL;
+ /*
+ * if IA32_MCG_CTL is not all 1s, the uncorrected error
+ * reporting is disabled
+ */
+ if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
+ vcpu->arch.mcg_ctl != ~(u64)0)
+ return 0;
+ banks += 4 * mce->bank;
+ /*
+ * if IA32_MCi_CTL is not all 1s, the uncorrected error
+ * reporting is disabled for the bank
+ */
+ if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
+ return 0;
+ if (mce->status & MCI_STATUS_UC) {
+ if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
+ !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return 0;
+ }
+ if (banks[1] & MCI_STATUS_VAL)
+ mce->status |= MCI_STATUS_OVER;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ vcpu->arch.mcg_status = mce->mcg_status;
+ banks[1] = mce->status;
+ kvm_queue_exception(vcpu, MC_VECTOR);
+ } else if (!(banks[1] & MCI_STATUS_VAL)
+ || !(banks[1] & MCI_STATUS_UC)) {
+ if (banks[1] & MCI_STATUS_VAL)
+ mce->status |= MCI_STATUS_OVER;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ banks[1] = mce->status;
+ } else
+ banks[1] |= MCI_STATUS_OVER;
+ return 0;
+}
+
+static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_events *events)
+{
+ process_nmi(vcpu);
+
+ if (kvm_check_request(KVM_REQ_SMI, vcpu))
+ process_smi(vcpu);
+
+ /*
+ * FIXME: pass injected and pending separately. This is only
+ * needed for nested virtualization, whose state cannot be
+ * migrated yet. For now we can combine them.
+ */
+ events->exception.injected =
+ (vcpu->arch.exception.pending ||
+ vcpu->arch.exception.injected) &&
+ !kvm_exception_is_soft(vcpu->arch.exception.nr);
+ events->exception.nr = vcpu->arch.exception.nr;
+ events->exception.has_error_code = vcpu->arch.exception.has_error_code;
+ events->exception.pad = 0;
+ events->exception.error_code = vcpu->arch.exception.error_code;
+
+ events->interrupt.injected =
+ vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
+ events->interrupt.nr = vcpu->arch.interrupt.nr;
+ events->interrupt.soft = 0;
+ events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
+
+ events->nmi.injected = vcpu->arch.nmi_injected;
+ events->nmi.pending = vcpu->arch.nmi_pending != 0;
+ events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
+ events->nmi.pad = 0;
+
+ events->sipi_vector = 0; /* never valid when reporting to user space */
+
+ events->smi.smm = is_smm(vcpu);
+ events->smi.pending = vcpu->arch.smi_pending;
+ events->smi.smm_inside_nmi =
+ !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
+ events->smi.latched_init = kvm_lapic_latched_init(vcpu);
+
+ events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
+ | KVM_VCPUEVENT_VALID_SHADOW
+ | KVM_VCPUEVENT_VALID_SMM);
+ memset(&events->reserved, 0, sizeof(events->reserved));
+}
+
+static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags);
+
+static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_events *events)
+{
+ if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
+ | KVM_VCPUEVENT_VALID_SIPI_VECTOR
+ | KVM_VCPUEVENT_VALID_SHADOW
+ | KVM_VCPUEVENT_VALID_SMM))
+ return -EINVAL;
+
+ if (events->exception.injected &&
+ (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR ||
+ is_guest_mode(vcpu)))
+ return -EINVAL;
+
+ /* INITs are latched while in SMM */
+ if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
+ (events->smi.smm || events->smi.pending) &&
+ vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
+ return -EINVAL;
+
+ process_nmi(vcpu);
+ vcpu->arch.exception.injected = false;
+ vcpu->arch.exception.pending = events->exception.injected;
+ vcpu->arch.exception.nr = events->exception.nr;
+ vcpu->arch.exception.has_error_code = events->exception.has_error_code;
+ vcpu->arch.exception.error_code = events->exception.error_code;
+
+ vcpu->arch.interrupt.injected = events->interrupt.injected;
+ vcpu->arch.interrupt.nr = events->interrupt.nr;
+ vcpu->arch.interrupt.soft = events->interrupt.soft;
+ if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
+ kvm_x86_ops->set_interrupt_shadow(vcpu,
+ events->interrupt.shadow);
+
+ vcpu->arch.nmi_injected = events->nmi.injected;
+ if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
+ vcpu->arch.nmi_pending = events->nmi.pending;
+ kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
+
+ if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
+ lapic_in_kernel(vcpu))
+ vcpu->arch.apic->sipi_vector = events->sipi_vector;
+
+ if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
+ u32 hflags = vcpu->arch.hflags;
+ if (events->smi.smm)
+ hflags |= HF_SMM_MASK;
+ else
+ hflags &= ~HF_SMM_MASK;
+ kvm_set_hflags(vcpu, hflags);
+
+ vcpu->arch.smi_pending = events->smi.pending;
+
+ if (events->smi.smm) {
+ if (events->smi.smm_inside_nmi)
+ vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
+ else
+ vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
+ if (lapic_in_kernel(vcpu)) {
+ if (events->smi.latched_init)
+ set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ else
+ clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ }
+ }
+ }
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ return 0;
+}
+
+static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
+ struct kvm_debugregs *dbgregs)
+{
+ unsigned long val;
+
+ memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
+ kvm_get_dr(vcpu, 6, &val);
+ dbgregs->dr6 = val;
+ dbgregs->dr7 = vcpu->arch.dr7;
+ dbgregs->flags = 0;
+ memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
+}
+
+static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
+ struct kvm_debugregs *dbgregs)
+{
+ if (dbgregs->flags)
+ return -EINVAL;
+
+ if (dbgregs->dr6 & ~0xffffffffull)
+ return -EINVAL;
+ if (dbgregs->dr7 & ~0xffffffffull)
+ return -EINVAL;
+
+ memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
+ kvm_update_dr0123(vcpu);
+ vcpu->arch.dr6 = dbgregs->dr6;
+ kvm_update_dr6(vcpu);
+ vcpu->arch.dr7 = dbgregs->dr7;
+ kvm_update_dr7(vcpu);
+
+ return 0;
+}
+
+#define XSTATE_COMPACTION_ENABLED (1ULL << 63)
+
+static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
+{
+ struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
+ u64 xstate_bv = xsave->header.xfeatures;
+ u64 valid;
+
+ /*
+ * Copy legacy XSAVE area, to avoid complications with CPUID
+ * leaves 0 and 1 in the loop below.
+ */
+ memcpy(dest, xsave, XSAVE_HDR_OFFSET);
+
+ /* Set XSTATE_BV */
+ xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE;
+ *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv;
+
+ /*
+ * Copy each region from the possibly compacted offset to the
+ * non-compacted offset.
+ */
+ valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
+ while (valid) {
+ u64 feature = valid & -valid;
+ int index = fls64(feature) - 1;
+ void *src = get_xsave_addr(xsave, feature);
+
+ if (src) {
+ u32 size, offset, ecx, edx;
+ cpuid_count(XSTATE_CPUID, index,
+ &size, &offset, &ecx, &edx);
+ if (feature == XFEATURE_MASK_PKRU)
+ memcpy(dest + offset, &vcpu->arch.pkru,
+ sizeof(vcpu->arch.pkru));
+ else
+ memcpy(dest + offset, src, size);
+
+ }
+
+ valid -= feature;
+ }
+}
+
+static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
+{
+ struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
+ u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
+ u64 valid;
+
+ /*
+ * Copy legacy XSAVE area, to avoid complications with CPUID
+ * leaves 0 and 1 in the loop below.
+ */
+ memcpy(xsave, src, XSAVE_HDR_OFFSET);
+
+ /* Set XSTATE_BV and possibly XCOMP_BV. */
+ xsave->header.xfeatures = xstate_bv;
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
+
+ /*
+ * Copy each region from the non-compacted offset to the
+ * possibly compacted offset.
+ */
+ valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
+ while (valid) {
+ u64 feature = valid & -valid;
+ int index = fls64(feature) - 1;
+ void *dest = get_xsave_addr(xsave, feature);
+
+ if (dest) {
+ u32 size, offset, ecx, edx;
+ cpuid_count(XSTATE_CPUID, index,
+ &size, &offset, &ecx, &edx);
+ if (feature == XFEATURE_MASK_PKRU)
+ memcpy(&vcpu->arch.pkru, src + offset,
+ sizeof(vcpu->arch.pkru));
+ else
+ memcpy(dest, src + offset, size);
+ }
+
+ valid -= feature;
+ }
+}
+
+static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave)
+{
+ if (boot_cpu_has(X86_FEATURE_XSAVE)) {
+ memset(guest_xsave, 0, sizeof(struct kvm_xsave));
+ fill_xsave((u8 *) guest_xsave->region, vcpu);
+ } else {
+ memcpy(guest_xsave->region,
+ &vcpu->arch.guest_fpu.state.fxsave,
+ sizeof(struct fxregs_state));
+ *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
+ XFEATURE_MASK_FPSSE;
+ }
+}
+
+#define XSAVE_MXCSR_OFFSET 24
+
+static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave)
+{
+ u64 xstate_bv =
+ *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
+ u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
+
+ if (boot_cpu_has(X86_FEATURE_XSAVE)) {
+ /*
+ * Here we allow setting states that are not present in
+ * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
+ * with old userspace.
+ */
+ if (xstate_bv & ~kvm_supported_xcr0() ||
+ mxcsr & ~mxcsr_feature_mask)
+ return -EINVAL;
+ load_xsave(vcpu, (u8 *)guest_xsave->region);
+ } else {
+ if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
+ mxcsr & ~mxcsr_feature_mask)
+ return -EINVAL;
+ memcpy(&vcpu->arch.guest_fpu.state.fxsave,
+ guest_xsave->region, sizeof(struct fxregs_state));
+ }
+ return 0;
+}
+
+static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
+ struct kvm_xcrs *guest_xcrs)
+{
+ if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
+ guest_xcrs->nr_xcrs = 0;
+ return;
+ }
+
+ guest_xcrs->nr_xcrs = 1;
+ guest_xcrs->flags = 0;
+ guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
+ guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
+}
+
+static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
+ struct kvm_xcrs *guest_xcrs)
+{
+ int i, r = 0;
+
+ if (!boot_cpu_has(X86_FEATURE_XSAVE))
+ return -EINVAL;
+
+ if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
+ return -EINVAL;
+
+ for (i = 0; i < guest_xcrs->nr_xcrs; i++)
+ /* Only support XCR0 currently */
+ if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
+ r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
+ guest_xcrs->xcrs[i].value);
+ break;
+ }
+ if (r)
+ r = -EINVAL;
+ return r;
+}
+
+/*
+ * kvm_set_guest_paused() indicates to the guest kernel that it has been
+ * stopped by the hypervisor. This function will be called from the host only.
+ * EINVAL is returned when the host attempts to set the flag for a guest that
+ * does not support pv clocks.
+ */
+static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu->arch.pv_time_enabled)
+ return -EINVAL;
+ vcpu->arch.pvclock_set_guest_stopped_request = true;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
+ struct kvm_enable_cap *cap)
+{
+ if (cap->flags)
+ return -EINVAL;
+
+ switch (cap->cap) {
+ case KVM_CAP_HYPERV_SYNIC2:
+ if (cap->args[0])
+ return -EINVAL;
+ case KVM_CAP_HYPERV_SYNIC:
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return -EINVAL;
+ return kvm_hv_activate_synic(vcpu, cap->cap ==
+ KVM_CAP_HYPERV_SYNIC2);
+ default:
+ return -EINVAL;
+ }
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ struct kvm_vcpu *vcpu = filp->private_data;
+ void __user *argp = (void __user *)arg;
+ int r;
+ union {
+ struct kvm_lapic_state *lapic;
+ struct kvm_xsave *xsave;
+ struct kvm_xcrs *xcrs;
+ void *buffer;
+ } u;
+
+ vcpu_load(vcpu);
+
+ u.buffer = NULL;
+ switch (ioctl) {
+ case KVM_GET_LAPIC: {
+ r = -EINVAL;
+ if (!lapic_in_kernel(vcpu))
+ goto out;
+ u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+
+ r = -ENOMEM;
+ if (!u.lapic)
+ goto out;
+ r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_LAPIC: {
+ r = -EINVAL;
+ if (!lapic_in_kernel(vcpu))
+ goto out;
+ u.lapic = memdup_user(argp, sizeof(*u.lapic));
+ if (IS_ERR(u.lapic)) {
+ r = PTR_ERR(u.lapic);
+ goto out_nofree;
+ }
+
+ r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
+ break;
+ }
+ case KVM_INTERRUPT: {
+ struct kvm_interrupt irq;
+
+ r = -EFAULT;
+ if (copy_from_user(&irq, argp, sizeof irq))
+ goto out;
+ r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
+ break;
+ }
+ case KVM_NMI: {
+ r = kvm_vcpu_ioctl_nmi(vcpu);
+ break;
+ }
+ case KVM_SMI: {
+ r = kvm_vcpu_ioctl_smi(vcpu);
+ break;
+ }
+ case KVM_SET_CPUID: {
+ struct kvm_cpuid __user *cpuid_arg = argp;
+ struct kvm_cpuid cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ goto out;
+ r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
+ break;
+ }
+ case KVM_SET_CPUID2: {
+ struct kvm_cpuid2 __user *cpuid_arg = argp;
+ struct kvm_cpuid2 cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ goto out;
+ r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
+ cpuid_arg->entries);
+ break;
+ }
+ case KVM_GET_CPUID2: {
+ struct kvm_cpuid2 __user *cpuid_arg = argp;
+ struct kvm_cpuid2 cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ goto out;
+ r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
+ cpuid_arg->entries);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_GET_MSRS: {
+ int idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = msr_io(vcpu, argp, do_get_msr, 1);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ break;
+ }
+ case KVM_SET_MSRS: {
+ int idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = msr_io(vcpu, argp, do_set_msr, 0);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ break;
+ }
+ case KVM_TPR_ACCESS_REPORTING: {
+ struct kvm_tpr_access_ctl tac;
+
+ r = -EFAULT;
+ if (copy_from_user(&tac, argp, sizeof tac))
+ goto out;
+ r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &tac, sizeof tac))
+ goto out;
+ r = 0;
+ break;
+ };
+ case KVM_SET_VAPIC_ADDR: {
+ struct kvm_vapic_addr va;
+ int idx;
+
+ r = -EINVAL;
+ if (!lapic_in_kernel(vcpu))
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(&va, argp, sizeof va))
+ goto out;
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ break;
+ }
+ case KVM_X86_SETUP_MCE: {
+ u64 mcg_cap;
+
+ r = -EFAULT;
+ if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
+ goto out;
+ r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
+ break;
+ }
+ case KVM_X86_SET_MCE: {
+ struct kvm_x86_mce mce;
+
+ r = -EFAULT;
+ if (copy_from_user(&mce, argp, sizeof mce))
+ goto out;
+ r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
+ break;
+ }
+ case KVM_GET_VCPU_EVENTS: {
+ struct kvm_vcpu_events events;
+
+ kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_VCPU_EVENTS: {
+ struct kvm_vcpu_events events;
+
+ r = -EFAULT;
+ if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
+ break;
+
+ r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
+ break;
+ }
+ case KVM_GET_DEBUGREGS: {
+ struct kvm_debugregs dbgregs;
+
+ kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, &dbgregs,
+ sizeof(struct kvm_debugregs)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_DEBUGREGS: {
+ struct kvm_debugregs dbgregs;
+
+ r = -EFAULT;
+ if (copy_from_user(&dbgregs, argp,
+ sizeof(struct kvm_debugregs)))
+ break;
+
+ r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
+ break;
+ }
+ case KVM_GET_XSAVE: {
+ u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.xsave)
+ break;
+
+ kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_XSAVE: {
+ u.xsave = memdup_user(argp, sizeof(*u.xsave));
+ if (IS_ERR(u.xsave)) {
+ r = PTR_ERR(u.xsave);
+ goto out_nofree;
+ }
+
+ r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
+ break;
+ }
+ case KVM_GET_XCRS: {
+ u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.xcrs)
+ break;
+
+ kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, u.xcrs,
+ sizeof(struct kvm_xcrs)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_XCRS: {
+ u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
+ if (IS_ERR(u.xcrs)) {
+ r = PTR_ERR(u.xcrs);
+ goto out_nofree;
+ }
+
+ r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
+ break;
+ }
+ case KVM_SET_TSC_KHZ: {
+ u32 user_tsc_khz;
+
+ r = -EINVAL;
+ user_tsc_khz = (u32)arg;
+
+ if (user_tsc_khz >= kvm_max_guest_tsc_khz)
+ goto out;
+
+ if (user_tsc_khz == 0)
+ user_tsc_khz = tsc_khz;
+
+ if (!kvm_set_tsc_khz(vcpu, user_tsc_khz))
+ r = 0;
+
+ goto out;
+ }
+ case KVM_GET_TSC_KHZ: {
+ r = vcpu->arch.virtual_tsc_khz;
+ goto out;
+ }
+ case KVM_KVMCLOCK_CTRL: {
+ r = kvm_set_guest_paused(vcpu);
+ goto out;
+ }
+ case KVM_ENABLE_CAP: {
+ struct kvm_enable_cap cap;
+
+ r = -EFAULT;
+ if (copy_from_user(&cap, argp, sizeof(cap)))
+ goto out;
+ r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
+ break;
+ }
+ case KVM_GET_NESTED_STATE: {
+ struct kvm_nested_state __user *user_kvm_nested_state = argp;
+ u32 user_data_size;
+
+ r = -EINVAL;
+ if (!kvm_x86_ops->get_nested_state)
+ break;
+
+ BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
+ r = -EFAULT;
+ if (get_user(user_data_size, &user_kvm_nested_state->size))
+ break;
+
+ r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state,
+ user_data_size);
+ if (r < 0)
+ break;
+
+ if (r > user_data_size) {
+ if (put_user(r, &user_kvm_nested_state->size))
+ r = -EFAULT;
+ else
+ r = -E2BIG;
+ break;
+ }
+
+ r = 0;
+ break;
+ }
+ case KVM_SET_NESTED_STATE: {
+ struct kvm_nested_state __user *user_kvm_nested_state = argp;
+ struct kvm_nested_state kvm_state;
+ int idx;
+
+ r = -EINVAL;
+ if (!kvm_x86_ops->set_nested_state)
+ break;
+
+ r = -EFAULT;
+ if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
+ break;
+
+ r = -EINVAL;
+ if (kvm_state.size < sizeof(kvm_state))
+ break;
+
+ if (kvm_state.flags &
+ ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE))
+ break;
+
+ /* nested_run_pending implies guest_mode. */
+ if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING)
+ break;
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ break;
+ }
+ default:
+ r = -EINVAL;
+ }
+out:
+ kfree(u.buffer);
+out_nofree:
+ vcpu_put(vcpu);
+ return r;
+}
+
+vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+ return VM_FAULT_SIGBUS;
+}
+
+static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
+{
+ int ret;
+
+ if (addr > (unsigned int)(-3 * PAGE_SIZE))
+ return -EINVAL;
+ ret = kvm_x86_ops->set_tss_addr(kvm, addr);
+ return ret;
+}
+
+static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
+ u64 ident_addr)
+{
+ return kvm_x86_ops->set_identity_map_addr(kvm, ident_addr);
+}
+
+static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
+ unsigned long kvm_nr_mmu_pages)
+{
+ if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
+ return -EINVAL;
+
+ mutex_lock(&kvm->slots_lock);
+
+ kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
+ kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
+
+ mutex_unlock(&kvm->slots_lock);
+ return 0;
+}
+
+static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
+{
+ return kvm->arch.n_max_mmu_pages;
+}
+
+static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ memcpy(&chip->chip.pic, &pic->pics[0],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ memcpy(&chip->chip.pic, &pic->pics[1],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ kvm_get_ioapic(kvm, &chip->chip.ioapic);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ return r;
+}
+
+static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ spin_lock(&pic->lock);
+ memcpy(&pic->pics[0], &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ spin_unlock(&pic->lock);
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ spin_lock(&pic->lock);
+ memcpy(&pic->pics[1], &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ spin_unlock(&pic->lock);
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ kvm_set_ioapic(kvm, &chip->chip.ioapic);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ kvm_pic_update_irq(pic);
+ return r;
+}
+
+static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
+
+ BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
+
+ mutex_lock(&kps->lock);
+ memcpy(ps, &kps->channels, sizeof(*ps));
+ mutex_unlock(&kps->lock);
+ return 0;
+}
+
+static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ int i;
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
+ memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
+ mutex_unlock(&pit->pit_state.lock);
+ return 0;
+}
+
+static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
+ sizeof(ps->channels));
+ ps->flags = kvm->arch.vpit->pit_state.flags;
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ memset(&ps->reserved, 0, sizeof(ps->reserved));
+ return 0;
+}
+
+static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ int start = 0;
+ int i;
+ u32 prev_legacy, cur_legacy;
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
+ prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ if (!prev_legacy && cur_legacy)
+ start = 1;
+ memcpy(&pit->pit_state.channels, &ps->channels,
+ sizeof(pit->pit_state.channels));
+ pit->pit_state.flags = ps->flags;
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
+ start && i == 0);
+ mutex_unlock(&pit->pit_state.lock);
+ return 0;
+}
+
+static int kvm_vm_ioctl_reinject(struct kvm *kvm,
+ struct kvm_reinject_control *control)
+{
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ if (!pit)
+ return -ENXIO;
+
+ /* pit->pit_state.lock was overloaded to prevent userspace from getting
+ * an inconsistent state after running multiple KVM_REINJECT_CONTROL
+ * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
+ */
+ mutex_lock(&pit->pit_state.lock);
+ kvm_pit_set_reinject(pit, control->pit_reinject);
+ mutex_unlock(&pit->pit_state.lock);
+
+ return 0;
+}
+
+/**
+ * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
+ * @kvm: kvm instance
+ * @log: slot id and address to which we copy the log
+ *
+ * Steps 1-4 below provide general overview of dirty page logging. See
+ * kvm_get_dirty_log_protect() function description for additional details.
+ *
+ * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
+ * always flush the TLB (step 4) even if previous step failed and the dirty
+ * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
+ * does not preclude user space subsequent dirty log read. Flushing TLB ensures
+ * writes will be marked dirty for next log read.
+ *
+ * 1. Take a snapshot of the bit and clear it if needed.
+ * 2. Write protect the corresponding page.
+ * 3. Copy the snapshot to the userspace.
+ * 4. Flush TLB's if needed.
+ */
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
+{
+ bool is_dirty = false;
+ int r;
+
+ mutex_lock(&kvm->slots_lock);
+
+ /*
+ * Flush potentially hardware-cached dirty pages to dirty_bitmap.
+ */
+ if (kvm_x86_ops->flush_log_dirty)
+ kvm_x86_ops->flush_log_dirty(kvm);
+
+ r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
+
+ /*
+ * All the TLBs can be flushed out of mmu lock, see the comments in
+ * kvm_mmu_slot_remove_write_access().
+ */
+ lockdep_assert_held(&kvm->slots_lock);
+ if (is_dirty)
+ kvm_flush_remote_tlbs(kvm);
+
+ mutex_unlock(&kvm->slots_lock);
+ return r;
+}
+
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
+ bool line_status)
+{
+ if (!irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ irq_event->irq, irq_event->level,
+ line_status);
+ return 0;
+}
+
+static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+ struct kvm_enable_cap *cap)
+{
+ int r;
+
+ if (cap->flags)
+ return -EINVAL;
+
+ switch (cap->cap) {
+ case KVM_CAP_DISABLE_QUIRKS:
+ kvm->arch.disabled_quirks = cap->args[0];
+ r = 0;
+ break;
+ case KVM_CAP_SPLIT_IRQCHIP: {
+ mutex_lock(&kvm->lock);
+ r = -EINVAL;
+ if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
+ goto split_irqchip_unlock;
+ r = -EEXIST;
+ if (irqchip_in_kernel(kvm))
+ goto split_irqchip_unlock;
+ if (kvm->created_vcpus)
+ goto split_irqchip_unlock;
+ r = kvm_setup_empty_irq_routing(kvm);
+ if (r)
+ goto split_irqchip_unlock;
+ /* Pairs with irqchip_in_kernel. */
+ smp_wmb();
+ kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
+ kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
+ r = 0;
+split_irqchip_unlock:
+ mutex_unlock(&kvm->lock);
+ break;
+ }
+ case KVM_CAP_X2APIC_API:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
+ break;
+
+ if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
+ kvm->arch.x2apic_format = true;
+ if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+ kvm->arch.x2apic_broadcast_quirk_disabled = true;
+
+ r = 0;
+ break;
+ case KVM_CAP_X86_DISABLE_EXITS:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
+ break;
+
+ if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
+ kvm_can_mwait_in_guest())
+ kvm->arch.mwait_in_guest = true;
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
+ kvm->arch.hlt_in_guest = true;
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
+ kvm->arch.pause_in_guest = true;
+ r = 0;
+ break;
+ case KVM_CAP_MSR_PLATFORM_INFO:
+ kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
+ r = 0;
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ return r;
+}
+
+long kvm_arch_vm_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ struct kvm *kvm = filp->private_data;
+ void __user *argp = (void __user *)arg;
+ int r = -ENOTTY;
+ /*
+ * This union makes it completely explicit to gcc-3.x
+ * that these two variables' stack usage should be
+ * combined, not added together.
+ */
+ union {
+ struct kvm_pit_state ps;
+ struct kvm_pit_state2 ps2;
+ struct kvm_pit_config pit_config;
+ } u;
+
+ switch (ioctl) {
+ case KVM_SET_TSS_ADDR:
+ r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
+ break;
+ case KVM_SET_IDENTITY_MAP_ADDR: {
+ u64 ident_addr;
+
+ mutex_lock(&kvm->lock);
+ r = -EINVAL;
+ if (kvm->created_vcpus)
+ goto set_identity_unlock;
+ r = -EFAULT;
+ if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
+ goto set_identity_unlock;
+ r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
+set_identity_unlock:
+ mutex_unlock(&kvm->lock);
+ break;
+ }
+ case KVM_SET_NR_MMU_PAGES:
+ r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
+ break;
+ case KVM_GET_NR_MMU_PAGES:
+ r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
+ break;
+ case KVM_CREATE_IRQCHIP: {
+ mutex_lock(&kvm->lock);
+
+ r = -EEXIST;
+ if (irqchip_in_kernel(kvm))
+ goto create_irqchip_unlock;
+
+ r = -EINVAL;
+ if (kvm->created_vcpus)
+ goto create_irqchip_unlock;
+
+ r = kvm_pic_init(kvm);
+ if (r)
+ goto create_irqchip_unlock;
+
+ r = kvm_ioapic_init(kvm);
+ if (r) {
+ kvm_pic_destroy(kvm);
+ goto create_irqchip_unlock;
+ }
+
+ r = kvm_setup_default_irq_routing(kvm);
+ if (r) {
+ kvm_ioapic_destroy(kvm);
+ kvm_pic_destroy(kvm);
+ goto create_irqchip_unlock;
+ }
+ /* Write kvm->irq_routing before enabling irqchip_in_kernel. */
+ smp_wmb();
+ kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
+ create_irqchip_unlock:
+ mutex_unlock(&kvm->lock);
+ break;
+ }
+ case KVM_CREATE_PIT:
+ u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
+ goto create_pit;
+ case KVM_CREATE_PIT2:
+ r = -EFAULT;
+ if (copy_from_user(&u.pit_config, argp,
+ sizeof(struct kvm_pit_config)))
+ goto out;
+ create_pit:
+ mutex_lock(&kvm->lock);
+ r = -EEXIST;
+ if (kvm->arch.vpit)
+ goto create_pit_unlock;
+ r = -ENOMEM;
+ kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
+ if (kvm->arch.vpit)
+ r = 0;
+ create_pit_unlock:
+ mutex_unlock(&kvm->lock);
+ break;
+ case KVM_GET_IRQCHIP: {
+ /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
+ struct kvm_irqchip *chip;
+
+ chip = memdup_user(argp, sizeof(*chip));
+ if (IS_ERR(chip)) {
+ r = PTR_ERR(chip);
+ goto out;
+ }
+
+ r = -ENXIO;
+ if (!irqchip_kernel(kvm))
+ goto get_irqchip_out;
+ r = kvm_vm_ioctl_get_irqchip(kvm, chip);
+ if (r)
+ goto get_irqchip_out;
+ r = -EFAULT;
+ if (copy_to_user(argp, chip, sizeof *chip))
+ goto get_irqchip_out;
+ r = 0;
+ get_irqchip_out:
+ kfree(chip);
+ break;
+ }
+ case KVM_SET_IRQCHIP: {
+ /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
+ struct kvm_irqchip *chip;
+
+ chip = memdup_user(argp, sizeof(*chip));
+ if (IS_ERR(chip)) {
+ r = PTR_ERR(chip);
+ goto out;
+ }
+
+ r = -ENXIO;
+ if (!irqchip_kernel(kvm))
+ goto set_irqchip_out;
+ r = kvm_vm_ioctl_set_irqchip(kvm, chip);
+ if (r)
+ goto set_irqchip_out;
+ r = 0;
+ set_irqchip_out:
+ kfree(chip);
+ break;
+ }
+ case KVM_GET_PIT: {
+ r = -EFAULT;
+ if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
+ goto out;
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto out;
+ r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_PIT: {
+ r = -EFAULT;
+ if (copy_from_user(&u.ps, argp, sizeof u.ps))
+ goto out;
+ mutex_lock(&kvm->lock);
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto set_pit_out;
+ r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
+set_pit_out:
+ mutex_unlock(&kvm->lock);
+ break;
+ }
+ case KVM_GET_PIT2: {
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto out;
+ r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_PIT2: {
+ r = -EFAULT;
+ if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
+ goto out;
+ mutex_lock(&kvm->lock);
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto set_pit2_out;
+ r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
+set_pit2_out:
+ mutex_unlock(&kvm->lock);
+ break;
+ }
+ case KVM_REINJECT_CONTROL: {
+ struct kvm_reinject_control control;
+ r = -EFAULT;
+ if (copy_from_user(&control, argp, sizeof(control)))
+ goto out;
+ r = kvm_vm_ioctl_reinject(kvm, &control);
+ break;
+ }
+ case KVM_SET_BOOT_CPU_ID:
+ r = 0;
+ mutex_lock(&kvm->lock);
+ if (kvm->created_vcpus)
+ r = -EBUSY;
+ else
+ kvm->arch.bsp_vcpu_id = arg;
+ mutex_unlock(&kvm->lock);
+ break;
+ case KVM_XEN_HVM_CONFIG: {
+ struct kvm_xen_hvm_config xhc;
+ r = -EFAULT;
+ if (copy_from_user(&xhc, argp, sizeof(xhc)))
+ goto out;
+ r = -EINVAL;
+ if (xhc.flags)
+ goto out;
+ memcpy(&kvm->arch.xen_hvm_config, &xhc, sizeof(xhc));
+ r = 0;
+ break;
+ }
+ case KVM_SET_CLOCK: {
+ struct kvm_clock_data user_ns;
+ u64 now_ns;
+
+ r = -EFAULT;
+ if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
+ goto out;
+
+ r = -EINVAL;
+ if (user_ns.flags)
+ goto out;
+
+ r = 0;
+ /*
+ * TODO: userspace has to take care of races with VCPU_RUN, so
+ * kvm_gen_update_masterclock() can be cut down to locked
+ * pvclock_update_vm_gtod_copy().
+ */
+ kvm_gen_update_masterclock(kvm);
+ now_ns = get_kvmclock_ns(kvm);
+ kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
+ kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
+ break;
+ }
+ case KVM_GET_CLOCK: {
+ struct kvm_clock_data user_ns;
+ u64 now_ns;
+
+ now_ns = get_kvmclock_ns(kvm);
+ user_ns.clock = now_ns;
+ user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0;
+ memset(&user_ns.pad, 0, sizeof(user_ns.pad));
+
+ r = -EFAULT;
+ if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_ENABLE_CAP: {
+ struct kvm_enable_cap cap;
+
+ r = -EFAULT;
+ if (copy_from_user(&cap, argp, sizeof(cap)))
+ goto out;
+ r = kvm_vm_ioctl_enable_cap(kvm, &cap);
+ break;
+ }
+ case KVM_MEMORY_ENCRYPT_OP: {
+ r = -ENOTTY;
+ if (kvm_x86_ops->mem_enc_op)
+ r = kvm_x86_ops->mem_enc_op(kvm, argp);
+ break;
+ }
+ case KVM_MEMORY_ENCRYPT_REG_REGION: {
+ struct kvm_enc_region region;
+
+ r = -EFAULT;
+ if (copy_from_user(&region, argp, sizeof(region)))
+ goto out;
+
+ r = -ENOTTY;
+ if (kvm_x86_ops->mem_enc_reg_region)
+ r = kvm_x86_ops->mem_enc_reg_region(kvm, &region);
+ break;
+ }
+ case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
+ struct kvm_enc_region region;
+
+ r = -EFAULT;
+ if (copy_from_user(&region, argp, sizeof(region)))
+ goto out;
+
+ r = -ENOTTY;
+ if (kvm_x86_ops->mem_enc_unreg_region)
+ r = kvm_x86_ops->mem_enc_unreg_region(kvm, &region);
+ break;
+ }
+ case KVM_HYPERV_EVENTFD: {
+ struct kvm_hyperv_eventfd hvevfd;
+
+ r = -EFAULT;
+ if (copy_from_user(&hvevfd, argp, sizeof(hvevfd)))
+ goto out;
+ r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
+ break;
+ }
+ default:
+ r = -ENOTTY;
+ }
+out:
+ return r;
+}
+
+static void kvm_init_msr_list(void)
+{
+ u32 dummy[2];
+ unsigned i, j;
+
+ for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
+ if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
+ continue;
+
+ /*
+ * Even MSRs that are valid in the host may not be exposed
+ * to the guests in some cases.
+ */
+ switch (msrs_to_save[i]) {
+ case MSR_IA32_BNDCFGS:
+ if (!kvm_mpx_supported())
+ continue;
+ break;
+ case MSR_TSC_AUX:
+ if (!kvm_x86_ops->rdtscp_supported())
+ continue;
+ break;
+ default:
+ break;
+ }
+
+ if (j < i)
+ msrs_to_save[j] = msrs_to_save[i];
+ j++;
+ }
+ num_msrs_to_save = j;
+
+ for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
+ if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i]))
+ continue;
+
+ if (j < i)
+ emulated_msrs[j] = emulated_msrs[i];
+ j++;
+ }
+ num_emulated_msrs = j;
+
+ for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) {
+ struct kvm_msr_entry msr;
+
+ msr.index = msr_based_features[i];
+ if (kvm_get_msr_feature(&msr))
+ continue;
+
+ if (j < i)
+ msr_based_features[j] = msr_based_features[i];
+ j++;
+ }
+ num_msr_based_features = j;
+}
+
+static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
+ const void *v)
+{
+ int handled = 0;
+ int n;
+
+ do {
+ n = min(len, 8);
+ if (!(lapic_in_kernel(vcpu) &&
+ !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
+ && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
+ break;
+ handled += n;
+ addr += n;
+ len -= n;
+ v += n;
+ } while (len);
+
+ return handled;
+}
+
+static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
+{
+ int handled = 0;
+ int n;
+
+ do {
+ n = min(len, 8);
+ if (!(lapic_in_kernel(vcpu) &&
+ !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
+ addr, n, v))
+ && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
+ break;
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
+ handled += n;
+ addr += n;
+ len -= n;
+ v += n;
+ } while (len);
+
+ return handled;
+}
+
+static void kvm_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ kvm_x86_ops->set_segment(vcpu, var, seg);
+}
+
+void kvm_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ kvm_x86_ops->get_segment(vcpu, var, seg);
+}
+
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+ struct x86_exception *exception)
+{
+ gpa_t t_gpa;
+
+ BUG_ON(!mmu_is_nested(vcpu));
+
+ /* NPT walks are always user-walks */
+ access |= PFERR_USER_MASK;
+ t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, exception);
+
+ return t_gpa;
+}
+
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+}
+
+ gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ access |= PFERR_FETCH_MASK;
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+}
+
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ access |= PFERR_WRITE_MASK;
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+}
+
+/* uses this to access any guest's mapped memory without checking CPL */
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
+{
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
+}
+
+static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u32 access,
+ struct x86_exception *exception)
+{
+ void *data = val;
+ int r = X86EMUL_CONTINUE;
+
+ while (bytes) {
+ gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
+ exception);
+ unsigned offset = addr & (PAGE_SIZE-1);
+ unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
+ int ret;
+
+ if (gpa == UNMAPPED_GVA)
+ return X86EMUL_PROPAGATE_FAULT;
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
+ offset, toread);
+ if (ret < 0) {
+ r = X86EMUL_IO_NEEDED;
+ goto out;
+ }
+
+ bytes -= toread;
+ data += toread;
+ addr += toread;
+ }
+out:
+ return r;
+}
+
+/* used for instruction fetching */
+static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, void *val, unsigned int bytes,
+ struct x86_exception *exception)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ unsigned offset;
+ int ret;
+
+ /* Inline kvm_read_guest_virt_helper for speed. */
+ gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK,
+ exception);
+ if (unlikely(gpa == UNMAPPED_GVA))
+ return X86EMUL_PROPAGATE_FAULT;
+
+ offset = addr & (PAGE_SIZE-1);
+ if (WARN_ON(offset + bytes > PAGE_SIZE))
+ bytes = (unsigned)PAGE_SIZE - offset;
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val,
+ offset, bytes);
+ if (unlikely(ret < 0))
+ return X86EMUL_IO_NEEDED;
+
+ return X86EMUL_CONTINUE;
+}
+
+int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
+ gva_t addr, void *val, unsigned int bytes,
+ struct x86_exception *exception)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+
+ /*
+ * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
+ * is returned, but our callers are not ready for that and they blindly
+ * call kvm_inject_page_fault. Ensure that they at least do not leak
+ * uninitialized kernel stack memory into cr2 and error code.
+ */
+ memset(exception, 0, sizeof(*exception));
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
+ exception);
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
+
+static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, void *val, unsigned int bytes,
+ struct x86_exception *exception, bool system)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ u32 access = 0;
+
+ if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
+ access |= PFERR_USER_MASK;
+
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
+}
+
+static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val, unsigned int bytes)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes);
+
+ return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
+}
+
+static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u32 access,
+ struct x86_exception *exception)
+{
+ void *data = val;
+ int r = X86EMUL_CONTINUE;
+
+ while (bytes) {
+ gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
+ access,
+ exception);
+ unsigned offset = addr & (PAGE_SIZE-1);
+ unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
+ int ret;
+
+ if (gpa == UNMAPPED_GVA)
+ return X86EMUL_PROPAGATE_FAULT;
+ ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
+ if (ret < 0) {
+ r = X86EMUL_IO_NEEDED;
+ goto out;
+ }
+
+ bytes -= towrite;
+ data += towrite;
+ addr += towrite;
+ }
+out:
+ return r;
+}
+
+static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val,
+ unsigned int bytes, struct x86_exception *exception,
+ bool system)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ u32 access = PFERR_WRITE_MASK;
+
+ if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
+ access |= PFERR_USER_MASK;
+
+ return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
+ access, exception);
+}
+
+int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
+ unsigned int bytes, struct x86_exception *exception)
+{
+ /* kvm_write_guest_virt_system can pull in tons of pages. */
+ vcpu->arch.l1tf_flush_l1d = true;
+
+ /*
+ * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
+ * is returned, but our callers are not ready for that and they blindly
+ * call kvm_inject_page_fault. Ensure that they at least do not leak
+ * uninitialized kernel stack memory into cr2 and error code.
+ */
+ memset(exception, 0, sizeof(*exception));
+ return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
+ PFERR_WRITE_MASK, exception);
+}
+EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
+
+int handle_ud(struct kvm_vcpu *vcpu)
+{
+ int emul_type = EMULTYPE_TRAP_UD;
+ enum emulation_result er;
+ char sig[5]; /* ud2; .ascii "kvm" */
+ struct x86_exception e;
+
+ if (force_emulation_prefix &&
+ kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
+ sig, sizeof(sig), &e) == 0 &&
+ memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) {
+ kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
+ emul_type = 0;
+ }
+
+ er = kvm_emulate_instruction(vcpu, emul_type);
+ if (er == EMULATE_USER_EXIT)
+ return 0;
+ if (er != EMULATE_DONE)
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(handle_ud);
+
+static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
+ gpa_t gpa, bool write)
+{
+ /* For APIC access vmexit */
+ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ return 1;
+
+ if (vcpu_match_mmio_gpa(vcpu, gpa)) {
+ trace_vcpu_match_mmio(gva, gpa, write, true);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
+ gpa_t *gpa, struct x86_exception *exception,
+ bool write)
+{
+ u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
+ | (write ? PFERR_WRITE_MASK : 0);
+
+ /*
+ * currently PKRU is only applied to ept enabled guest so
+ * there is no pkey in EPT page table for L1 guest or EPT
+ * shadow page table for L2 guest.
+ */
+ if (vcpu_match_mmio_gva(vcpu, gva)
+ && !permission_fault(vcpu, vcpu->arch.walk_mmu,
+ vcpu->arch.access, 0, access)) {
+ *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
+ (gva & (PAGE_SIZE - 1));
+ trace_vcpu_match_mmio(gva, *gpa, write, false);
+ return 1;
+ }
+
+ *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+
+ if (*gpa == UNMAPPED_GVA)
+ return -1;
+
+ return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
+}
+
+int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const void *val, int bytes)
+{
+ int ret;
+
+ ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
+ if (ret < 0)
+ return 0;
+ kvm_page_track_write(vcpu, gpa, val, bytes);
+ return 1;
+}
+
+struct read_write_emulator_ops {
+ int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
+ int bytes);
+ int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes);
+ int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
+ int bytes, void *val);
+ int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes);
+ bool write;
+};
+
+static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
+{
+ if (vcpu->mmio_read_completed) {
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
+ vcpu->mmio_fragments[0].gpa, val);
+ vcpu->mmio_read_completed = 0;
+ return 1;
+ }
+
+ return 0;
+}
+
+static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes)
+{
+ return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes);
+}
+
+static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes)
+{
+ return emulator_write_phys(vcpu, gpa, val, bytes);
+}
+
+static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
+{
+ trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
+ return vcpu_mmio_write(vcpu, gpa, bytes, val);
+}
+
+static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes)
+{
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
+ return X86EMUL_IO_NEEDED;
+}
+
+static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes)
+{
+ struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
+
+ memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
+ return X86EMUL_CONTINUE;
+}
+
+static const struct read_write_emulator_ops read_emultor = {
+ .read_write_prepare = read_prepare,
+ .read_write_emulate = read_emulate,
+ .read_write_mmio = vcpu_mmio_read,
+ .read_write_exit_mmio = read_exit_mmio,
+};
+
+static const struct read_write_emulator_ops write_emultor = {
+ .read_write_emulate = write_emulate,
+ .read_write_mmio = write_mmio,
+ .read_write_exit_mmio = write_exit_mmio,
+ .write = true,
+};
+
+static int emulator_read_write_onepage(unsigned long addr, void *val,
+ unsigned int bytes,
+ struct x86_exception *exception,
+ struct kvm_vcpu *vcpu,
+ const struct read_write_emulator_ops *ops)
+{
+ gpa_t gpa;
+ int handled, ret;
+ bool write = ops->write;
+ struct kvm_mmio_fragment *frag;
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+
+ /*
+ * If the exit was due to a NPF we may already have a GPA.
+ * If the GPA is present, use it to avoid the GVA to GPA table walk.
+ * Note, this cannot be used on string operations since string
+ * operation using rep will only have the initial GPA from the NPF
+ * occurred.
+ */
+ if (vcpu->arch.gpa_available &&
+ emulator_can_use_gpa(ctxt) &&
+ (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
+ gpa = vcpu->arch.gpa_val;
+ ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
+ } else {
+ ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
+ if (ret < 0)
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
+ if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes))
+ return X86EMUL_CONTINUE;
+
+ /*
+ * Is this MMIO handled locally?
+ */
+ handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
+ if (handled == bytes)
+ return X86EMUL_CONTINUE;
+
+ gpa += handled;
+ bytes -= handled;
+ val += handled;
+
+ WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
+ frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
+ frag->gpa = gpa;
+ frag->data = val;
+ frag->len = bytes;
+ return X86EMUL_CONTINUE;
+}
+
+static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
+ void *val, unsigned int bytes,
+ struct x86_exception *exception,
+ const struct read_write_emulator_ops *ops)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ gpa_t gpa;
+ int rc;
+
+ if (ops->read_write_prepare &&
+ ops->read_write_prepare(vcpu, val, bytes))
+ return X86EMUL_CONTINUE;
+
+ vcpu->mmio_nr_fragments = 0;
+
+ /* Crossing a page boundary? */
+ if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
+ int now;
+
+ now = -addr & ~PAGE_MASK;
+ rc = emulator_read_write_onepage(addr, val, now, exception,
+ vcpu, ops);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ addr += now;
+ if (ctxt->mode != X86EMUL_MODE_PROT64)
+ addr = (u32)addr;
+ val += now;
+ bytes -= now;
+ }
+
+ rc = emulator_read_write_onepage(addr, val, bytes, exception,
+ vcpu, ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (!vcpu->mmio_nr_fragments)
+ return rc;
+
+ gpa = vcpu->mmio_fragments[0].gpa;
+
+ vcpu->mmio_needed = 1;
+ vcpu->mmio_cur_fragment = 0;
+
+ vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
+ vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+ vcpu->run->mmio.phys_addr = gpa;
+
+ return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
+}
+
+static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
+ void *val,
+ unsigned int bytes,
+ struct x86_exception *exception)
+{
+ return emulator_read_write(ctxt, addr, val, bytes,
+ exception, &read_emultor);
+}
+
+static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
+ const void *val,
+ unsigned int bytes,
+ struct x86_exception *exception)
+{
+ return emulator_read_write(ctxt, addr, (void *)val, bytes,
+ exception, &write_emultor);
+}
+
+#define CMPXCHG_TYPE(t, ptr, old, new) \
+ (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
+
+#ifdef CONFIG_X86_64
+# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
+#else
+# define CMPXCHG64(ptr, old, new) \
+ (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
+#endif
+
+static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
+ const void *old,
+ const void *new,
+ unsigned int bytes,
+ struct x86_exception *exception)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ gpa_t gpa;
+ struct page *page;
+ char *kaddr;
+ bool exchanged;
+
+ /* guests cmpxchg8b have to be emulated atomically */
+ if (bytes > 8 || (bytes & (bytes - 1)))
+ goto emul_write;
+
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
+
+ if (gpa == UNMAPPED_GVA ||
+ (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ goto emul_write;
+
+ if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
+ goto emul_write;
+
+ page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
+ if (is_error_page(page))
+ goto emul_write;
+
+ kaddr = kmap_atomic(page);
+ kaddr += offset_in_page(gpa);
+ switch (bytes) {
+ case 1:
+ exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
+ break;
+ case 2:
+ exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
+ break;
+ case 4:
+ exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
+ break;
+ case 8:
+ exchanged = CMPXCHG64(kaddr, old, new);
+ break;
+ default:
+ BUG();
+ }
+ kunmap_atomic(kaddr);
+ kvm_release_page_dirty(page);
+
+ if (!exchanged)
+ return X86EMUL_CMPXCHG_FAILED;
+
+ kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
+ kvm_page_track_write(vcpu, gpa, new, bytes);
+
+ return X86EMUL_CONTINUE;
+
+emul_write:
+ printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
+
+ return emulator_write_emulated(ctxt, addr, new, bytes, exception);
+}
+
+static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
+{
+ int r = 0, i;
+
+ for (i = 0; i < vcpu->arch.pio.count; i++) {
+ if (vcpu->arch.pio.in)
+ r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
+ vcpu->arch.pio.size, pd);
+ else
+ r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
+ vcpu->arch.pio.port, vcpu->arch.pio.size,
+ pd);
+ if (r)
+ break;
+ pd += vcpu->arch.pio.size;
+ }
+ return r;
+}
+
+static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
+ unsigned short port, void *val,
+ unsigned int count, bool in)
+{
+ vcpu->arch.pio.port = port;
+ vcpu->arch.pio.in = in;
+ vcpu->arch.pio.count = count;
+ vcpu->arch.pio.size = size;
+
+ if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
+ vcpu->arch.pio.count = 0;
+ return 1;
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_IO;
+ vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
+ vcpu->run->io.size = size;
+ vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+ vcpu->run->io.count = count;
+ vcpu->run->io.port = port;
+
+ return 0;
+}
+
+static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port, void *val,
+ unsigned int count)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ int ret;
+
+ if (vcpu->arch.pio.count)
+ goto data_avail;
+
+ memset(vcpu->arch.pio_data, 0, size * count);
+
+ ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
+ if (ret) {
+data_avail:
+ memcpy(val, vcpu->arch.pio_data, size * count);
+ trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
+ vcpu->arch.pio.count = 0;
+ return 1;
+ }
+
+ return 0;
+}
+
+static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port,
+ const void *val, unsigned int count)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+ memcpy(vcpu->arch.pio_data, val, size * count);
+ trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
+ return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
+}
+
+static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ return kvm_x86_ops->get_segment_base(vcpu, seg);
+}
+
+static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
+{
+ kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
+}
+
+static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
+{
+ if (!need_emulate_wbinvd(vcpu))
+ return X86EMUL_CONTINUE;
+
+ if (kvm_x86_ops->has_wbinvd_exit()) {
+ int cpu = get_cpu();
+
+ cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
+ smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
+ wbinvd_ipi, NULL, 1);
+ put_cpu();
+ cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
+ } else
+ wbinvd();
+ return X86EMUL_CONTINUE;
+}
+
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+ kvm_emulate_wbinvd_noskip(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
+
+
+
+static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
+{
+ kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
+}
+
+static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
+ unsigned long *dest)
+{
+ return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
+}
+
+static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
+ unsigned long value)
+{
+
+ return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
+}
+
+static u64 mk_cr_64(u64 curr_cr, u32 new_val)
+{
+ return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
+}
+
+static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ unsigned long value;
+
+ switch (cr) {
+ case 0:
+ value = kvm_read_cr0(vcpu);
+ break;
+ case 2:
+ value = vcpu->arch.cr2;
+ break;
+ case 3:
+ value = kvm_read_cr3(vcpu);
+ break;
+ case 4:
+ value = kvm_read_cr4(vcpu);
+ break;
+ case 8:
+ value = kvm_get_cr8(vcpu);
+ break;
+ default:
+ kvm_err("%s: unexpected cr %u\n", __func__, cr);
+ return 0;
+ }
+
+ return value;
+}
+
+static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ int res = 0;
+
+ switch (cr) {
+ case 0:
+ res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
+ break;
+ case 2:
+ vcpu->arch.cr2 = val;
+ break;
+ case 3:
+ res = kvm_set_cr3(vcpu, val);
+ break;
+ case 4:
+ res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+ break;
+ case 8:
+ res = kvm_set_cr8(vcpu, val);
+ break;
+ default:
+ kvm_err("%s: unexpected cr %u\n", __func__, cr);
+ res = -1;
+ }
+
+ return res;
+}
+
+static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
+{
+ return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
+}
+
+static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
+{
+ kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
+}
+
+static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
+{
+ kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
+}
+
+static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
+{
+ kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
+}
+
+static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
+{
+ kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
+}
+
+static unsigned long emulator_get_cached_segment_base(
+ struct x86_emulate_ctxt *ctxt, int seg)
+{
+ return get_segment_base(emul_to_vcpu(ctxt), seg);
+}
+
+static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
+ struct desc_struct *desc, u32 *base3,
+ int seg)
+{
+ struct kvm_segment var;
+
+ kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
+ *selector = var.selector;
+
+ if (var.unusable) {
+ memset(desc, 0, sizeof(*desc));
+ if (base3)
+ *base3 = 0;
+ return false;
+ }
+
+ if (var.g)
+ var.limit >>= 12;
+ set_desc_limit(desc, var.limit);
+ set_desc_base(desc, (unsigned long)var.base);
+#ifdef CONFIG_X86_64
+ if (base3)
+ *base3 = var.base >> 32;
+#endif
+ desc->type = var.type;
+ desc->s = var.s;
+ desc->dpl = var.dpl;
+ desc->p = var.present;
+ desc->avl = var.avl;
+ desc->l = var.l;
+ desc->d = var.db;
+ desc->g = var.g;
+
+ return true;
+}
+
+static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
+ struct desc_struct *desc, u32 base3,
+ int seg)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ struct kvm_segment var;
+
+ var.selector = selector;
+ var.base = get_desc_base(desc);
+#ifdef CONFIG_X86_64
+ var.base |= ((u64)base3) << 32;
+#endif
+ var.limit = get_desc_limit(desc);
+ if (desc->g)
+ var.limit = (var.limit << 12) | 0xfff;
+ var.type = desc->type;
+ var.dpl = desc->dpl;
+ var.db = desc->d;
+ var.s = desc->s;
+ var.l = desc->l;
+ var.g = desc->g;
+ var.avl = desc->avl;
+ var.present = desc->p;
+ var.unusable = !var.present;
+ var.padding = 0;
+
+ kvm_set_segment(vcpu, &var, seg);
+ return;
+}
+
+static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
+ u32 msr_index, u64 *pdata)
+{
+ struct msr_data msr;
+ int r;
+
+ msr.index = msr_index;
+ msr.host_initiated = false;
+ r = kvm_get_msr(emul_to_vcpu(ctxt), &msr);
+ if (r)
+ return r;
+
+ *pdata = msr.data;
+ return 0;
+}
+
+static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
+ u32 msr_index, u64 data)
+{
+ struct msr_data msr;
+
+ msr.data = data;
+ msr.index = msr_index;
+ msr.host_initiated = false;
+ return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
+}
+
+static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+ return vcpu->arch.smbase;
+}
+
+static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+ vcpu->arch.smbase = smbase;
+}
+
+static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
+ u32 pmc)
+{
+ return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc);
+}
+
+static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
+ u32 pmc, u64 *pdata)
+{
+ return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata);
+}
+
+static void emulator_halt(struct x86_emulate_ctxt *ctxt)
+{
+ emul_to_vcpu(ctxt)->arch.halt_request = 1;
+}
+
+static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage)
+{
+ return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
+}
+
+static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
+ u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit)
+{
+ return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
+}
+
+static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
+{
+ return kvm_register_read(emul_to_vcpu(ctxt), reg);
+}
+
+static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
+{
+ kvm_register_write(emul_to_vcpu(ctxt), reg, val);
+}
+
+static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
+{
+ kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
+}
+
+static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
+{
+ return emul_to_vcpu(ctxt)->arch.hflags;
+}
+
+static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
+{
+ kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags);
+}
+
+static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+ return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smbase);
+}
+
+static const struct x86_emulate_ops emulate_ops = {
+ .read_gpr = emulator_read_gpr,
+ .write_gpr = emulator_write_gpr,
+ .read_std = emulator_read_std,
+ .write_std = emulator_write_std,
+ .read_phys = kvm_read_guest_phys_system,
+ .fetch = kvm_fetch_guest_virt,
+ .read_emulated = emulator_read_emulated,
+ .write_emulated = emulator_write_emulated,
+ .cmpxchg_emulated = emulator_cmpxchg_emulated,
+ .invlpg = emulator_invlpg,
+ .pio_in_emulated = emulator_pio_in_emulated,
+ .pio_out_emulated = emulator_pio_out_emulated,
+ .get_segment = emulator_get_segment,
+ .set_segment = emulator_set_segment,
+ .get_cached_segment_base = emulator_get_cached_segment_base,
+ .get_gdt = emulator_get_gdt,
+ .get_idt = emulator_get_idt,
+ .set_gdt = emulator_set_gdt,
+ .set_idt = emulator_set_idt,
+ .get_cr = emulator_get_cr,
+ .set_cr = emulator_set_cr,
+ .cpl = emulator_get_cpl,
+ .get_dr = emulator_get_dr,
+ .set_dr = emulator_set_dr,
+ .get_smbase = emulator_get_smbase,
+ .set_smbase = emulator_set_smbase,
+ .set_msr = emulator_set_msr,
+ .get_msr = emulator_get_msr,
+ .check_pmc = emulator_check_pmc,
+ .read_pmc = emulator_read_pmc,
+ .halt = emulator_halt,
+ .wbinvd = emulator_wbinvd,
+ .fix_hypercall = emulator_fix_hypercall,
+ .intercept = emulator_intercept,
+ .get_cpuid = emulator_get_cpuid,
+ .set_nmi_mask = emulator_set_nmi_mask,
+ .get_hflags = emulator_get_hflags,
+ .set_hflags = emulator_set_hflags,
+ .pre_leave_smm = emulator_pre_leave_smm,
+};
+
+static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
+{
+ u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
+ /*
+ * an sti; sti; sequence only disable interrupts for the first
+ * instruction. So, if the last instruction, be it emulated or
+ * not, left the system with the INT_STI flag enabled, it
+ * means that the last instruction is an sti. We should not
+ * leave the flag on in this case. The same goes for mov ss
+ */
+ if (int_shadow & mask)
+ mask = 0;
+ if (unlikely(int_shadow || mask)) {
+ kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
+ if (!mask)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
+}
+
+static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ if (ctxt->exception.vector == PF_VECTOR)
+ return kvm_propagate_fault(vcpu, &ctxt->exception);
+
+ if (ctxt->exception.error_code_valid)
+ kvm_queue_exception_e(vcpu, ctxt->exception.vector,
+ ctxt->exception.error_code);
+ else
+ kvm_queue_exception(vcpu, ctxt->exception.vector);
+ return false;
+}
+
+static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ int cs_db, cs_l;
+
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+
+ ctxt->eflags = kvm_get_rflags(vcpu);
+ ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
+
+ ctxt->eip = kvm_rip_read(vcpu);
+ ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
+ (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
+ (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 :
+ cs_db ? X86EMUL_MODE_PROT32 :
+ X86EMUL_MODE_PROT16;
+ BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
+ BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
+ BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
+
+ init_decode_cache(ctxt);
+ vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
+}
+
+int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
+{
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ int ret;
+
+ init_emulate_ctxt(vcpu);
+
+ ctxt->op_bytes = 2;
+ ctxt->ad_bytes = 2;
+ ctxt->_eip = ctxt->eip + inc_eip;
+ ret = emulate_int_real(ctxt, irq);
+
+ if (ret != X86EMUL_CONTINUE)
+ return EMULATE_FAIL;
+
+ ctxt->eip = ctxt->_eip;
+ kvm_rip_write(vcpu, ctxt->eip);
+ kvm_set_rflags(vcpu, ctxt->eflags);
+
+ return EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
+
+static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
+{
+ int r = EMULATE_DONE;
+
+ ++vcpu->stat.insn_emulation_fail;
+ trace_kvm_emulate_insn_failed(vcpu);
+
+ if (emulation_type & EMULTYPE_NO_UD_ON_FAIL)
+ return EMULATE_FAIL;
+
+ if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ r = EMULATE_USER_EXIT;
+ }
+
+ kvm_queue_exception(vcpu, UD_VECTOR);
+
+ return r;
+}
+
+static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ bool write_fault_to_shadow_pgtable,
+ int emulation_type)
+{
+ gpa_t gpa = cr2_or_gpa;
+ kvm_pfn_t pfn;
+
+ if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
+ return false;
+
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+ return false;
+
+ if (!vcpu->arch.mmu.direct_map) {
+ /*
+ * Write permission should be allowed since only
+ * write access need to be emulated.
+ */
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
+
+ /*
+ * If the mapping is invalid in guest, let cpu retry
+ * it to generate fault.
+ */
+ if (gpa == UNMAPPED_GVA)
+ return true;
+ }
+
+ /*
+ * Do not retry the unhandleable instruction if it faults on the
+ * readonly host memory, otherwise it will goto a infinite loop:
+ * retry instruction -> write #PF -> emulation fail -> retry
+ * instruction -> ...
+ */
+ pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
+
+ /*
+ * If the instruction failed on the error pfn, it can not be fixed,
+ * report the error to userspace.
+ */
+ if (is_error_noslot_pfn(pfn))
+ return false;
+
+ kvm_release_pfn_clean(pfn);
+
+ /* The instructions are well-emulated on direct mmu. */
+ if (vcpu->arch.mmu.direct_map) {
+ unsigned int indirect_shadow_pages;
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+
+ if (indirect_shadow_pages)
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+
+ return true;
+ }
+
+ /*
+ * if emulation was due to access to shadowed page table
+ * and it failed try to unshadow page and re-enter the
+ * guest to let CPU execute the instruction.
+ */
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+
+ /*
+ * If the access faults on its page table, it can not
+ * be fixed by unprotecting shadow page and it should
+ * be reported to userspace.
+ */
+ return !write_fault_to_shadow_pgtable;
+}
+
+static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
+ gpa_t cr2_or_gpa, int emulation_type)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa;
+
+ last_retry_eip = vcpu->arch.last_retry_eip;
+ last_retry_addr = vcpu->arch.last_retry_addr;
+
+ /*
+ * If the emulation is caused by #PF and it is non-page_table
+ * writing instruction, it means the VM-EXIT is caused by shadow
+ * page protected, we can zap the shadow page and retry this
+ * instruction directly.
+ *
+ * Note: if the guest uses a non-page-table modifying instruction
+ * on the PDE that points to the instruction, then we will unmap
+ * the instruction and go to an infinite loop. So, we cache the
+ * last retried eip and the last fault address, if we meet the eip
+ * and the address again, we can break out of the potential infinite
+ * loop.
+ */
+ vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
+
+ if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
+ return false;
+
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+ return false;
+
+ if (x86_page_table_writing_insn(ctxt))
+ return false;
+
+ if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa)
+ return false;
+
+ vcpu->arch.last_retry_eip = ctxt->eip;
+ vcpu->arch.last_retry_addr = cr2_or_gpa;
+
+ if (!vcpu->arch.mmu.direct_map)
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
+
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+
+ return true;
+}
+
+static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
+static int complete_emulated_pio(struct kvm_vcpu *vcpu);
+
+static void kvm_smm_changed(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->arch.hflags & HF_SMM_MASK)) {
+ /* This is a good place to trace that we are exiting SMM. */
+ trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
+
+ /* Process a latched INIT or SMI, if any. */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
+
+ kvm_mmu_reset_context(vcpu);
+}
+
+static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags)
+{
+ unsigned changed = vcpu->arch.hflags ^ emul_flags;
+
+ vcpu->arch.hflags = emul_flags;
+
+ if (changed & HF_SMM_MASK)
+ kvm_smm_changed(vcpu);
+}
+
+static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
+ unsigned long *db)
+{
+ u32 dr6 = 0;
+ int i;
+ u32 enable, rwlen;
+
+ enable = dr7;
+ rwlen = dr7 >> 16;
+ for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
+ if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
+ dr6 |= (1 << i);
+ return dr6;
+}
+
+static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+ kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
+ kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ *r = EMULATE_USER_EXIT;
+ } else {
+ /*
+ * "Certain debug exceptions may clear bit 0-3. The
+ * remaining contents of the DR6 register are never
+ * cleared by the processor".
+ */
+ vcpu->arch.dr6 &= ~15;
+ vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ }
+}
+
+int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
+ int r = EMULATE_DONE;
+
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+
+ /*
+ * rflags is the old, "raw" value of the flags. The new value has
+ * not been saved yet.
+ *
+ * This is correct even for TF set by the guest, because "the
+ * processor will not generate this exception after the instruction
+ * that sets the TF flag".
+ */
+ if (unlikely(rflags & X86_EFLAGS_TF))
+ kvm_vcpu_do_singlestep(vcpu, &r);
+ return r == EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
+
+static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
+{
+ if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
+ (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
+ struct kvm_run *kvm_run = vcpu->run;
+ unsigned long eip = kvm_get_linear_rip(vcpu);
+ u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
+ vcpu->arch.guest_debug_dr7,
+ vcpu->arch.eff_db);
+
+ if (dr6 != 0) {
+ kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
+ kvm_run->debug.arch.pc = eip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ *r = EMULATE_USER_EXIT;
+ return true;
+ }
+ }
+
+ if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
+ !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) {
+ unsigned long eip = kvm_get_linear_rip(vcpu);
+ u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
+ vcpu->arch.dr7,
+ vcpu->arch.db);
+
+ if (dr6 != 0) {
+ vcpu->arch.dr6 &= ~15;
+ vcpu->arch.dr6 |= dr6 | DR6_RTM;
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ *r = EMULATE_DONE;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
+{
+ switch (ctxt->opcode_len) {
+ case 1:
+ switch (ctxt->b) {
+ case 0xe4: /* IN */
+ case 0xe5:
+ case 0xec:
+ case 0xed:
+ case 0xe6: /* OUT */
+ case 0xe7:
+ case 0xee:
+ case 0xef:
+ case 0x6c: /* INS */
+ case 0x6d:
+ case 0x6e: /* OUTS */
+ case 0x6f:
+ return true;
+ }
+ break;
+ case 2:
+ switch (ctxt->b) {
+ case 0x33: /* RDPMC */
+ return true;
+ }
+ break;
+ }
+
+ return false;
+}
+
+int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ int emulation_type, void *insn, int insn_len)
+{
+ int r;
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ bool writeback = true;
+ bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
+
+ vcpu->arch.l1tf_flush_l1d = true;
+
+ /*
+ * Clear write_fault_to_shadow_pgtable here to ensure it is
+ * never reused.
+ */
+ vcpu->arch.write_fault_to_shadow_pgtable = false;
+ kvm_clear_exception_queue(vcpu);
+
+ if (!(emulation_type & EMULTYPE_NO_DECODE)) {
+ init_emulate_ctxt(vcpu);
+
+ /*
+ * We will reenter on the same instruction since
+ * we do not set complete_userspace_io. This does not
+ * handle watchpoints yet, those would be handled in
+ * the emulate_ops.
+ */
+ if (!(emulation_type & EMULTYPE_SKIP) &&
+ kvm_vcpu_check_breakpoint(vcpu, &r))
+ return r;
+
+ ctxt->interruptibility = 0;
+ ctxt->have_exception = false;
+ ctxt->exception.vector = -1;
+ ctxt->perm_ok = false;
+
+ ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
+
+ r = x86_decode_insn(ctxt, insn, insn_len);
+
+ trace_kvm_emulate_insn_start(vcpu);
+ ++vcpu->stat.insn_emulation;
+ if (r != EMULATION_OK) {
+ if (emulation_type & EMULTYPE_TRAP_UD)
+ return EMULATE_FAIL;
+ if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
+ emulation_type))
+ return EMULATE_DONE;
+ if (ctxt->have_exception) {
+ /*
+ * #UD should result in just EMULATION_FAILED, and trap-like
+ * exception should not be encountered during decode.
+ */
+ WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
+ exception_type(ctxt->exception.vector) == EXCPT_TRAP);
+ inject_emulated_exception(vcpu);
+ return EMULATE_DONE;
+ }
+ if (emulation_type & EMULTYPE_SKIP)
+ return EMULATE_FAIL;
+ return handle_emulation_failure(vcpu, emulation_type);
+ }
+ }
+
+ if ((emulation_type & EMULTYPE_VMWARE) &&
+ !is_vmware_backdoor_opcode(ctxt))
+ return EMULATE_FAIL;
+
+ if (emulation_type & EMULTYPE_SKIP) {
+ kvm_rip_write(vcpu, ctxt->_eip);
+ if (ctxt->eflags & X86_EFLAGS_RF)
+ kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
+ return EMULATE_DONE;
+ }
+
+ if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
+ return EMULATE_DONE;
+
+ /* this is needed for vmware backdoor interface to work since it
+ changes registers values during IO operation */
+ if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
+ vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
+ emulator_invalidate_register_cache(ctxt);
+ }
+
+restart:
+ /* Save the faulting GPA (cr2) in the address field */
+ ctxt->exception.address = cr2_or_gpa;
+
+ r = x86_emulate_insn(ctxt);
+
+ if (r == EMULATION_INTERCEPTED)
+ return EMULATE_DONE;
+
+ if (r == EMULATION_FAILED) {
+ if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
+ emulation_type))
+ return EMULATE_DONE;
+
+ return handle_emulation_failure(vcpu, emulation_type);
+ }
+
+ if (ctxt->have_exception) {
+ r = EMULATE_DONE;
+ if (inject_emulated_exception(vcpu))
+ return r;
+ } else if (vcpu->arch.pio.count) {
+ if (!vcpu->arch.pio.in) {
+ /* FIXME: return into emulator if single-stepping. */
+ vcpu->arch.pio.count = 0;
+ } else {
+ writeback = false;
+ vcpu->arch.complete_userspace_io = complete_emulated_pio;
+ }
+ r = EMULATE_USER_EXIT;
+ } else if (vcpu->mmio_needed) {
+ if (!vcpu->mmio_is_write)
+ writeback = false;
+ r = EMULATE_USER_EXIT;
+ vcpu->arch.complete_userspace_io = complete_emulated_mmio;
+ } else if (r == EMULATION_RESTART)
+ goto restart;
+ else
+ r = EMULATE_DONE;
+
+ if (writeback) {
+ unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
+ toggle_interruptibility(vcpu, ctxt->interruptibility);
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+ if (!ctxt->have_exception ||
+ exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
+ kvm_rip_write(vcpu, ctxt->eip);
+ if (r == EMULATE_DONE && ctxt->tf)
+ kvm_vcpu_do_singlestep(vcpu, &r);
+ __kvm_set_rflags(vcpu, ctxt->eflags);
+ }
+
+ /*
+ * For STI, interrupts are shadowed; so KVM_REQ_EVENT will
+ * do nothing, and it will be requested again as soon as
+ * the shadow expires. But we still need to check here,
+ * because POPF has no interrupt shadow.
+ */
+ if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ } else
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
+
+ return r;
+}
+
+int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type)
+{
+ return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_instruction);
+
+int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
+ void *insn, int insn_len)
+{
+ return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
+
+static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pio.count = 0;
+ return 1;
+}
+
+static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pio.count = 0;
+
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip)))
+ return 1;
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
+ unsigned short port)
+{
+ unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
+ size, port, &val, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Workaround userspace that relies on old KVM behavior of %rip being
+ * incremented prior to exiting to userspace to handle "OUT 0x7e".
+ */
+ if (port == 0x7e &&
+ kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
+ vcpu->arch.complete_userspace_io =
+ complete_fast_pio_out_port_0x7e;
+ kvm_skip_emulated_instruction(vcpu);
+ } else {
+ vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.complete_userspace_io = complete_fast_pio_out;
+ }
+ return 0;
+}
+
+static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
+{
+ unsigned long val;
+
+ /* We should only ever be called with arch.pio.count equal to 1 */
+ BUG_ON(vcpu->arch.pio.count != 1);
+
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) {
+ vcpu->arch.pio.count = 0;
+ return 1;
+ }
+
+ /* For size less than 4 we merge, else we zero extend */
+ val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX)
+ : 0;
+
+ /*
+ * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform
+ * the copy and tracing
+ */
+ emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size,
+ vcpu->arch.pio.port, &val, 1);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, val);
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
+ unsigned short port)
+{
+ unsigned long val;
+ int ret;
+
+ /* For size less than 4 we merge, else we zero extend */
+ val = (size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) : 0;
+
+ ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port,
+ &val, 1);
+ if (ret) {
+ kvm_register_write(vcpu, VCPU_REGS_RAX, val);
+ return ret;
+ }
+
+ vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.complete_userspace_io = complete_fast_pio_in;
+
+ return 0;
+}
+
+int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
+{
+ int ret;
+
+ if (in)
+ ret = kvm_fast_pio_in(vcpu, size, port);
+ else
+ ret = kvm_fast_pio_out(vcpu, size, port);
+ return ret && kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_fast_pio);
+
+static int kvmclock_cpu_down_prep(unsigned int cpu)
+{
+ __this_cpu_write(cpu_tsc_khz, 0);
+ return 0;
+}
+
+static void tsc_khz_changed(void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ unsigned long khz = 0;
+
+ if (data)
+ khz = freq->new;
+ else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ khz = cpufreq_quick_get(raw_smp_processor_id());
+ if (!khz)
+ khz = tsc_khz;
+ __this_cpu_write(cpu_tsc_khz, khz);
+}
+
+#ifdef CONFIG_X86_64
+static void kvm_hyperv_tsc_notifier(void)
+{
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+ int cpu;
+
+ mutex_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ kvm_make_mclock_inprogress_request(kvm);
+
+ hyperv_stop_tsc_emulation();
+
+ /* TSC frequency always matches when on Hyper-V */
+ for_each_present_cpu(cpu)
+ per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+ kvm_max_guest_tsc_khz = tsc_khz;
+
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ struct kvm_arch *ka = &kvm->arch;
+
+ spin_lock(&ka->pvclock_gtod_sync_lock);
+
+ pvclock_update_vm_gtod_copy(kvm);
+
+ kvm_for_each_vcpu(cpu, vcpu, kvm)
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+
+ kvm_for_each_vcpu(cpu, vcpu, kvm)
+ kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
+
+ spin_unlock(&ka->pvclock_gtod_sync_lock);
+ }
+ mutex_unlock(&kvm_lock);
+}
+#endif
+
+static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+ int i, send_ipi = 0;
+
+ /*
+ * We allow guests to temporarily run on slowing clocks,
+ * provided we notify them after, or to run on accelerating
+ * clocks, provided we notify them before. Thus time never
+ * goes backwards.
+ *
+ * However, we have a problem. We can't atomically update
+ * the frequency of a given CPU from this function; it is
+ * merely a notifier, which can be called from any CPU.
+ * Changing the TSC frequency at arbitrary points in time
+ * requires a recomputation of local variables related to
+ * the TSC for each VCPU. We must flag these local variables
+ * to be updated and be sure the update takes place with the
+ * new frequency before any guests proceed.
+ *
+ * Unfortunately, the combination of hotplug CPU and frequency
+ * change creates an intractable locking scenario; the order
+ * of when these callouts happen is undefined with respect to
+ * CPU hotplug, and they can race with each other. As such,
+ * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
+ * undefined; you can actually have a CPU frequency change take
+ * place in between the computation of X and the setting of the
+ * variable. To protect against this problem, all updates of
+ * the per_cpu tsc_khz variable are done in an interrupt
+ * protected IPI, and all callers wishing to update the value
+ * must wait for a synchronous IPI to complete (which is trivial
+ * if the caller is on the CPU already). This establishes the
+ * necessary total order on variable updates.
+ *
+ * Note that because a guest time update may take place
+ * anytime after the setting of the VCPU's request bit, the
+ * correct TSC value must be set before the request. However,
+ * to ensure the update actually makes it to any guest which
+ * starts running in hardware virtualization between the set
+ * and the acquisition of the spinlock, we must also ping the
+ * CPU after setting the request bit.
+ *
+ */
+
+ if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
+ return 0;
+ if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
+ return 0;
+
+ smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
+
+ mutex_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (vcpu->cpu != freq->cpu)
+ continue;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ if (vcpu->cpu != raw_smp_processor_id())
+ send_ipi = 1;
+ }
+ }
+ mutex_unlock(&kvm_lock);
+
+ if (freq->old < freq->new && send_ipi) {
+ /*
+ * We upscale the frequency. Must make the guest
+ * doesn't see old kvmclock values while running with
+ * the new frequency, otherwise we risk the guest sees
+ * time go backwards.
+ *
+ * In case we update the frequency for another cpu
+ * (which might be in guest context) send an interrupt
+ * to kick the cpu out of guest context. Next time
+ * guest context is entered kvmclock will be updated,
+ * so the guest will not see stale values.
+ */
+ smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
+ }
+ return 0;
+}
+
+static struct notifier_block kvmclock_cpufreq_notifier_block = {
+ .notifier_call = kvmclock_cpufreq_notifier
+};
+
+static int kvmclock_cpu_online(unsigned int cpu)
+{
+ tsc_khz_changed(NULL);
+ return 0;
+}
+
+static void kvm_timer_init(void)
+{
+ max_tsc_khz = tsc_khz;
+
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+#ifdef CONFIG_CPU_FREQ
+ struct cpufreq_policy policy;
+ int cpu;
+
+ memset(&policy, 0, sizeof(policy));
+ cpu = get_cpu();
+ cpufreq_get_policy(&policy, cpu);
+ if (policy.cpuinfo.max_freq)
+ max_tsc_khz = policy.cpuinfo.max_freq;
+ put_cpu();
+#endif
+ cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ }
+ pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
+
+ cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
+ kvmclock_cpu_online, kvmclock_cpu_down_prep);
+}
+
+DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu);
+
+int kvm_is_in_guest(void)
+{
+ return __this_cpu_read(current_vcpu) != NULL;
+}
+
+static int kvm_is_user_mode(void)
+{
+ int user_mode = 3;
+
+ if (__this_cpu_read(current_vcpu))
+ user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
+
+ return user_mode != 0;
+}
+
+static unsigned long kvm_get_guest_ip(void)
+{
+ unsigned long ip = 0;
+
+ if (__this_cpu_read(current_vcpu))
+ ip = kvm_rip_read(__this_cpu_read(current_vcpu));
+
+ return ip;
+}
+
+static struct perf_guest_info_callbacks kvm_guest_cbs = {
+ .is_in_guest = kvm_is_in_guest,
+ .is_user_mode = kvm_is_user_mode,
+ .get_guest_ip = kvm_get_guest_ip,
+};
+
+#ifdef CONFIG_X86_64
+static void pvclock_gtod_update_fn(struct work_struct *work)
+{
+ struct kvm *kvm;
+
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ mutex_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+ atomic_set(&kvm_guest_has_master_clock, 0);
+ mutex_unlock(&kvm_lock);
+}
+
+static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
+
+/*
+ * Notification about pvclock gtod data update.
+ */
+static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
+ void *priv)
+{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ struct timekeeper *tk = priv;
+
+ update_pvclock_gtod(tk);
+
+ /* disable master clock if host does not trust, or does not
+ * use, TSC based clocksource.
+ */
+ if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
+ atomic_read(&kvm_guest_has_master_clock) != 0)
+ queue_work(system_long_wq, &pvclock_gtod_work);
+
+ return 0;
+}
+
+static struct notifier_block pvclock_gtod_notifier = {
+ .notifier_call = pvclock_gtod_notify,
+};
+#endif
+
+int kvm_arch_init(void *opaque)
+{
+ int r;
+ struct kvm_x86_ops *ops = opaque;
+
+ if (kvm_x86_ops) {
+ printk(KERN_ERR "kvm: already loaded the other module\n");
+ r = -EEXIST;
+ goto out;
+ }
+
+ if (!ops->cpu_has_kvm_support()) {
+ printk(KERN_ERR "kvm: no hardware support\n");
+ r = -EOPNOTSUPP;
+ goto out;
+ }
+ if (ops->disabled_by_bios()) {
+ printk(KERN_ERR "kvm: disabled by bios\n");
+ r = -EOPNOTSUPP;
+ goto out;
+ }
+
+ r = -ENOMEM;
+ shared_msrs = alloc_percpu(struct kvm_shared_msrs);
+ if (!shared_msrs) {
+ printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
+ goto out;
+ }
+
+ r = kvm_mmu_module_init();
+ if (r)
+ goto out_free_percpu;
+
+ kvm_x86_ops = ops;
+
+ kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
+ PT_DIRTY_MASK, PT64_NX_MASK, 0,
+ PT_PRESENT_MASK, 0, sme_me_mask);
+ kvm_timer_init();
+
+ perf_register_guest_info_callbacks(&kvm_guest_cbs);
+
+ if (boot_cpu_has(X86_FEATURE_XSAVE))
+ host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+
+ kvm_lapic_init();
+#ifdef CONFIG_X86_64
+ pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
+
+ if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
+ set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
+#endif
+
+ return 0;
+
+out_free_percpu:
+ free_percpu(shared_msrs);
+out:
+ return r;
+}
+
+void kvm_arch_exit(void)
+{
+#ifdef CONFIG_X86_64
+ if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
+ clear_hv_tscchange_cb();
+#endif
+ kvm_lapic_exit();
+ perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
+
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
+#ifdef CONFIG_X86_64
+ pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
+ cancel_work_sync(&pvclock_gtod_work);
+#endif
+ kvm_x86_ops = NULL;
+ kvm_mmu_module_exit();
+ free_percpu(shared_msrs);
+}
+
+int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.halt_exits;
+ if (lapic_in_kernel(vcpu)) {
+ vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
+ return 1;
+ } else {
+ vcpu->run->exit_reason = KVM_EXIT_HLT;
+ return 0;
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
+
+int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+{
+ int ret = kvm_skip_emulated_instruction(vcpu);
+ /*
+ * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
+ * KVM_EXIT_DEBUG here.
+ */
+ return kvm_vcpu_halt(vcpu) && ret;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_halt);
+
+#ifdef CONFIG_X86_64
+static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
+ unsigned long clock_type)
+{
+ struct kvm_clock_pairing clock_pairing;
+ struct timespec64 ts;
+ u64 cycle;
+ int ret;
+
+ if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
+ return -KVM_EOPNOTSUPP;
+
+ if (kvm_get_walltime_and_clockread(&ts, &cycle) == false)
+ return -KVM_EOPNOTSUPP;
+
+ clock_pairing.sec = ts.tv_sec;
+ clock_pairing.nsec = ts.tv_nsec;
+ clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle);
+ clock_pairing.flags = 0;
+ memset(&clock_pairing.pad, 0, sizeof(clock_pairing.pad));
+
+ ret = 0;
+ if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
+ sizeof(struct kvm_clock_pairing)))
+ ret = -KVM_EFAULT;
+
+ return ret;
+}
+#endif
+
+/*
+ * kvm_pv_kick_cpu_op: Kick a vcpu.
+ *
+ * @apicid - apicid of vcpu to be kicked.
+ */
+static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
+{
+ struct kvm_lapic_irq lapic_irq;
+
+ lapic_irq.shorthand = 0;
+ lapic_irq.dest_mode = 0;
+ lapic_irq.level = 0;
+ lapic_irq.dest_id = apicid;
+ lapic_irq.msi_redir_hint = false;
+
+ lapic_irq.delivery_mode = APIC_DM_REMRD;
+ kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
+}
+
+void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.apicv_active = false;
+ kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
+}
+
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
+{
+ unsigned long nr, a0, a1, a2, a3, ret;
+ int op_64_bit;
+
+ if (kvm_hv_hypercall_enabled(vcpu->kvm))
+ return kvm_hv_hypercall(vcpu);
+
+ nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
+
+ trace_kvm_hypercall(nr, a0, a1, a2, a3);
+
+ op_64_bit = is_64_bit_mode(vcpu);
+ if (!op_64_bit) {
+ nr &= 0xFFFFFFFF;
+ a0 &= 0xFFFFFFFF;
+ a1 &= 0xFFFFFFFF;
+ a2 &= 0xFFFFFFFF;
+ a3 &= 0xFFFFFFFF;
+ }
+
+ if (kvm_x86_ops->get_cpl(vcpu) != 0) {
+ ret = -KVM_EPERM;
+ goto out;
+ }
+
+ switch (nr) {
+ case KVM_HC_VAPIC_POLL_IRQ:
+ ret = 0;
+ break;
+ case KVM_HC_KICK_CPU:
+ kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
+ ret = 0;
+ break;
+#ifdef CONFIG_X86_64
+ case KVM_HC_CLOCK_PAIRING:
+ ret = kvm_pv_clock_pairing(vcpu, a0, a1);
+ break;
+#endif
+ case KVM_HC_SEND_IPI:
+ ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
+ break;
+ default:
+ ret = -KVM_ENOSYS;
+ break;
+ }
+out:
+ if (!op_64_bit)
+ ret = (u32)ret;
+ kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
+
+ ++vcpu->stat.hypercalls;
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
+
+static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ char instruction[3];
+ unsigned long rip = kvm_rip_read(vcpu);
+
+ kvm_x86_ops->patch_hypercall(vcpu, instruction);
+
+ return emulator_write_emulated(ctxt, rip, instruction, 3,
+ &ctxt->exception);
+}
+
+static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
+{
+ return vcpu->run->request_interrupt_window &&
+ likely(!pic_in_kernel(vcpu->kvm));
+}
+
+static void post_kvm_run_save(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+
+ kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
+ kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
+ kvm_run->cr8 = kvm_get_cr8(vcpu);
+ kvm_run->apic_base = kvm_get_apic_base(vcpu);
+ kvm_run->ready_for_interrupt_injection =
+ pic_in_kernel(vcpu->kvm) ||
+ kvm_vcpu_ready_for_interrupt_injection(vcpu);
+}
+
+static void update_cr8_intercept(struct kvm_vcpu *vcpu)
+{
+ int max_irr, tpr;
+
+ if (!kvm_x86_ops->update_cr8_intercept)
+ return;
+
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ if (vcpu->arch.apicv_active)
+ return;
+
+ if (!vcpu->arch.apic->vapic_addr)
+ max_irr = kvm_lapic_find_highest_irr(vcpu);
+ else
+ max_irr = -1;
+
+ if (max_irr != -1)
+ max_irr >>= 4;
+
+ tpr = kvm_lapic_get_cr8(vcpu);
+
+ kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
+}
+
+static void kvm_inject_exception(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
+ vcpu->arch.exception.error_code = false;
+ kvm_x86_ops->queue_exception(vcpu);
+}
+
+static int inject_pending_event(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ /* try to reinject previous events if any */
+
+ if (vcpu->arch.exception.injected)
+ kvm_inject_exception(vcpu);
+ /*
+ * Do not inject an NMI or interrupt if there is a pending
+ * exception. Exceptions and interrupts are recognized at
+ * instruction boundaries, i.e. the start of an instruction.
+ * Trap-like exceptions, e.g. #DB, have higher priority than
+ * NMIs and interrupts, i.e. traps are recognized before an
+ * NMI/interrupt that's pending on the same instruction.
+ * Fault-like exceptions, e.g. #GP and #PF, are the lowest
+ * priority, but are only generated (pended) during instruction
+ * execution, i.e. a pending fault-like exception means the
+ * fault occurred on the *previous* instruction and must be
+ * serviced prior to recognizing any new events in order to
+ * fully complete the previous instruction.
+ */
+ else if (!vcpu->arch.exception.pending) {
+ if (vcpu->arch.nmi_injected)
+ kvm_x86_ops->set_nmi(vcpu);
+ else if (vcpu->arch.interrupt.injected)
+ kvm_x86_ops->set_irq(vcpu);
+ }
+
+ /*
+ * Call check_nested_events() even if we reinjected a previous event
+ * in order for caller to determine if it should require immediate-exit
+ * from L2 to L1 due to pending L1 events which require exit
+ * from L2 to L1.
+ */
+ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
+ r = kvm_x86_ops->check_nested_events(vcpu);
+ if (r != 0)
+ return r;
+ }
+
+ /* try to inject new event if pending */
+ if (vcpu->arch.exception.pending) {
+ trace_kvm_inj_exception(vcpu->arch.exception.nr,
+ vcpu->arch.exception.has_error_code,
+ vcpu->arch.exception.error_code);
+
+ WARN_ON_ONCE(vcpu->arch.exception.injected);
+ vcpu->arch.exception.pending = false;
+ vcpu->arch.exception.injected = true;
+
+ if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
+ __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
+ X86_EFLAGS_RF);
+
+ if (vcpu->arch.exception.nr == DB_VECTOR &&
+ (vcpu->arch.dr7 & DR7_GD)) {
+ vcpu->arch.dr7 &= ~DR7_GD;
+ kvm_update_dr7(vcpu);
+ }
+
+ kvm_inject_exception(vcpu);
+ }
+
+ /* Don't consider new event if we re-injected an event */
+ if (kvm_event_needs_reinjection(vcpu))
+ return 0;
+
+ if (vcpu->arch.smi_pending && !is_smm(vcpu) &&
+ kvm_x86_ops->smi_allowed(vcpu)) {
+ vcpu->arch.smi_pending = false;
+ ++vcpu->arch.smi_count;
+ enter_smm(vcpu);
+ } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
+ --vcpu->arch.nmi_pending;
+ vcpu->arch.nmi_injected = true;
+ kvm_x86_ops->set_nmi(vcpu);
+ } else if (kvm_cpu_has_injectable_intr(vcpu)) {
+ /*
+ * Because interrupts can be injected asynchronously, we are
+ * calling check_nested_events again here to avoid a race condition.
+ * See https://lkml.org/lkml/2014/7/2/60 for discussion about this
+ * proposal and current concerns. Perhaps we should be setting
+ * KVM_REQ_EVENT only on certain events and not unconditionally?
+ */
+ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
+ r = kvm_x86_ops->check_nested_events(vcpu);
+ if (r != 0)
+ return r;
+ }
+ if (kvm_x86_ops->interrupt_allowed(vcpu)) {
+ kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
+ false);
+ kvm_x86_ops->set_irq(vcpu);
+ }
+ }
+
+ return 0;
+}
+
+static void process_nmi(struct kvm_vcpu *vcpu)
+{
+ unsigned limit = 2;
+
+ /*
+ * x86 is limited to one NMI running, and one NMI pending after it.
+ * If an NMI is already in progress, limit further NMIs to just one.
+ * Otherwise, allow two (and we'll inject the first one immediately).
+ */
+ if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
+ limit = 1;
+
+ vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
+ vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
+static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
+{
+ u32 flags = 0;
+ flags |= seg->g << 23;
+ flags |= seg->db << 22;
+ flags |= seg->l << 21;
+ flags |= seg->avl << 20;
+ flags |= seg->present << 15;
+ flags |= seg->dpl << 13;
+ flags |= seg->s << 12;
+ flags |= seg->type << 8;
+ return flags;
+}
+
+static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
+{
+ struct kvm_segment seg;
+ int offset;
+
+ kvm_get_segment(vcpu, &seg, n);
+ put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector);
+
+ if (n < 3)
+ offset = 0x7f84 + n * 12;
+ else
+ offset = 0x7f2c + (n - 3) * 12;
+
+ put_smstate(u32, buf, offset + 8, seg.base);
+ put_smstate(u32, buf, offset + 4, seg.limit);
+ put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg));
+}
+
+#ifdef CONFIG_X86_64
+static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
+{
+ struct kvm_segment seg;
+ int offset;
+ u16 flags;
+
+ kvm_get_segment(vcpu, &seg, n);
+ offset = 0x7e00 + n * 16;
+
+ flags = enter_smm_get_segment_flags(&seg) >> 8;
+ put_smstate(u16, buf, offset, seg.selector);
+ put_smstate(u16, buf, offset + 2, flags);
+ put_smstate(u32, buf, offset + 4, seg.limit);
+ put_smstate(u64, buf, offset + 8, seg.base);
+}
+#endif
+
+static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
+{
+ struct desc_ptr dt;
+ struct kvm_segment seg;
+ unsigned long val;
+ int i;
+
+ put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu));
+ put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu));
+ put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu));
+ put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu));
+
+ for (i = 0; i < 8; i++)
+ put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i));
+
+ kvm_get_dr(vcpu, 6, &val);
+ put_smstate(u32, buf, 0x7fcc, (u32)val);
+ kvm_get_dr(vcpu, 7, &val);
+ put_smstate(u32, buf, 0x7fc8, (u32)val);
+
+ kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
+ put_smstate(u32, buf, 0x7fc4, seg.selector);
+ put_smstate(u32, buf, 0x7f64, seg.base);
+ put_smstate(u32, buf, 0x7f60, seg.limit);
+ put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg));
+
+ kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
+ put_smstate(u32, buf, 0x7fc0, seg.selector);
+ put_smstate(u32, buf, 0x7f80, seg.base);
+ put_smstate(u32, buf, 0x7f7c, seg.limit);
+ put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
+
+ kvm_x86_ops->get_gdt(vcpu, &dt);
+ put_smstate(u32, buf, 0x7f74, dt.address);
+ put_smstate(u32, buf, 0x7f70, dt.size);
+
+ kvm_x86_ops->get_idt(vcpu, &dt);
+ put_smstate(u32, buf, 0x7f58, dt.address);
+ put_smstate(u32, buf, 0x7f54, dt.size);
+
+ for (i = 0; i < 6; i++)
+ enter_smm_save_seg_32(vcpu, buf, i);
+
+ put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
+
+ /* revision id */
+ put_smstate(u32, buf, 0x7efc, 0x00020000);
+ put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
+}
+
+#ifdef CONFIG_X86_64
+static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
+{
+ struct desc_ptr dt;
+ struct kvm_segment seg;
+ unsigned long val;
+ int i;
+
+ for (i = 0; i < 16; i++)
+ put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read(vcpu, i));
+
+ put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu));
+ put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu));
+
+ kvm_get_dr(vcpu, 6, &val);
+ put_smstate(u64, buf, 0x7f68, val);
+ kvm_get_dr(vcpu, 7, &val);
+ put_smstate(u64, buf, 0x7f60, val);
+
+ put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu));
+ put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu));
+ put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu));
+
+ put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase);
+
+ /* revision id */
+ put_smstate(u32, buf, 0x7efc, 0x00020064);
+
+ put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer);
+
+ kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
+ put_smstate(u16, buf, 0x7e90, seg.selector);
+ put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8);
+ put_smstate(u32, buf, 0x7e94, seg.limit);
+ put_smstate(u64, buf, 0x7e98, seg.base);
+
+ kvm_x86_ops->get_idt(vcpu, &dt);
+ put_smstate(u32, buf, 0x7e84, dt.size);
+ put_smstate(u64, buf, 0x7e88, dt.address);
+
+ kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
+ put_smstate(u16, buf, 0x7e70, seg.selector);
+ put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8);
+ put_smstate(u32, buf, 0x7e74, seg.limit);
+ put_smstate(u64, buf, 0x7e78, seg.base);
+
+ kvm_x86_ops->get_gdt(vcpu, &dt);
+ put_smstate(u32, buf, 0x7e64, dt.size);
+ put_smstate(u64, buf, 0x7e68, dt.address);
+
+ for (i = 0; i < 6; i++)
+ enter_smm_save_seg_64(vcpu, buf, i);
+}
+#endif
+
+static void enter_smm(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs, ds;
+ struct desc_ptr dt;
+ char buf[512];
+ u32 cr0;
+
+ trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
+ memset(buf, 0, 512);
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ enter_smm_save_state_64(vcpu, buf);
+ else
+#endif
+ enter_smm_save_state_32(vcpu, buf);
+
+ /*
+ * Give pre_enter_smm() a chance to make ISA-specific changes to the
+ * vCPU state (e.g. leave guest mode) after we've saved the state into
+ * the SMM state-save area.
+ */
+ kvm_x86_ops->pre_enter_smm(vcpu, buf);
+
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
+
+ if (kvm_x86_ops->get_nmi_mask(vcpu))
+ vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
+ else
+ kvm_x86_ops->set_nmi_mask(vcpu, true);
+
+ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
+ kvm_rip_write(vcpu, 0x8000);
+
+ cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
+ kvm_x86_ops->set_cr0(vcpu, cr0);
+ vcpu->arch.cr0 = cr0;
+
+ kvm_x86_ops->set_cr4(vcpu, 0);
+
+ /* Undocumented: IDT limit is set to zero on entry to SMM. */
+ dt.address = dt.size = 0;
+ kvm_x86_ops->set_idt(vcpu, &dt);
+
+ __kvm_set_dr(vcpu, 7, DR7_FIXED_1);
+
+ cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
+ cs.base = vcpu->arch.smbase;
+
+ ds.selector = 0;
+ ds.base = 0;
+
+ cs.limit = ds.limit = 0xffffffff;
+ cs.type = ds.type = 0x3;
+ cs.dpl = ds.dpl = 0;
+ cs.db = ds.db = 0;
+ cs.s = ds.s = 1;
+ cs.l = ds.l = 0;
+ cs.g = ds.g = 1;
+ cs.avl = ds.avl = 0;
+ cs.present = ds.present = 1;
+ cs.unusable = ds.unusable = 0;
+ cs.padding = ds.padding = 0;
+
+ kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
+
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ kvm_x86_ops->set_efer(vcpu, 0);
+#endif
+
+ kvm_update_cpuid(vcpu);
+ kvm_mmu_reset_context(vcpu);
+}
+
+static void process_smi(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.smi_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
+void kvm_make_scan_ioapic_request(struct kvm *kvm)
+{
+ kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
+}
+
+static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_apic_present(vcpu))
+ return;
+
+ bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
+
+ if (irqchip_split(vcpu->kvm))
+ kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
+ else {
+ if (vcpu->arch.apicv_active)
+ kvm_x86_ops->sync_pir_to_irr(vcpu);
+ if (ioapic_in_kernel(vcpu->kvm))
+ kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
+ }
+
+ if (is_guest_mode(vcpu))
+ vcpu->arch.load_eoi_exitmap_pending = true;
+ else
+ kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
+}
+
+static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
+{
+ u64 eoi_exit_bitmap[4];
+
+ if (!kvm_apic_hw_enabled(vcpu->arch.apic))
+ return;
+
+ bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
+ vcpu_to_synic(vcpu)->vec_bitmap, 256);
+ kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+}
+
+void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+ unsigned long start, unsigned long end)
+{
+ unsigned long apic_address;
+
+ /*
+ * The physical address of apic access page is stored in the VMCS.
+ * Update it when it becomes invalid.
+ */
+ apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+ if (start <= apic_address && apic_address < end)
+ kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
+}
+
+void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
+{
+ struct page *page = NULL;
+
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ if (!kvm_x86_ops->set_apic_access_page_addr)
+ return;
+
+ page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+ if (is_error_page(page))
+ return;
+ kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page));
+
+ /*
+ * Do not pin apic access page in memory, the MMU notifier
+ * will call us again if it is migrated or swapped out.
+ */
+ put_page(page);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
+
+void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
+{
+ smp_send_reschedule(vcpu->cpu);
+}
+EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
+
+/*
+ * Returns 1 to let vcpu_run() continue the guest execution loop without
+ * exiting to the userspace. Otherwise, the value will be returned to the
+ * userspace.
+ */
+static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+{
+ int r;
+ bool req_int_win =
+ dm_request_for_irq_injection(vcpu) &&
+ kvm_cpu_accept_dm_intr(vcpu);
+
+ bool req_immediate_exit = false;
+
+ if (kvm_request_pending(vcpu)) {
+ if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu))
+ kvm_x86_ops->get_vmcs12_pages(vcpu);
+ if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
+ kvm_mmu_unload(vcpu);
+ if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
+ __kvm_migrate_timers(vcpu);
+ if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
+ kvm_gen_update_masterclock(vcpu->kvm);
+ if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
+ kvm_gen_kvmclock_update(vcpu);
+ if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
+ r = kvm_guest_time_update(vcpu);
+ if (unlikely(r))
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
+ kvm_mmu_sync_roots(vcpu);
+ if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu))
+ kvm_mmu_load_cr3(vcpu);
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
+ kvm_vcpu_flush_tlb(vcpu, true);
+ if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
+ r = 0;
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ vcpu->mmio_needed = 0;
+ r = 0;
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
+ /* Page is swapped out. Do synthetic halt */
+ vcpu->arch.apf.halted = true;
+ r = 1;
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
+ record_steal_time(vcpu);
+ if (kvm_check_request(KVM_REQ_SMI, vcpu))
+ process_smi(vcpu);
+ if (kvm_check_request(KVM_REQ_NMI, vcpu))
+ process_nmi(vcpu);
+ if (kvm_check_request(KVM_REQ_PMU, vcpu))
+ kvm_pmu_handle_event(vcpu);
+ if (kvm_check_request(KVM_REQ_PMI, vcpu))
+ kvm_pmu_deliver_pmi(vcpu);
+ if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
+ BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
+ if (test_bit(vcpu->arch.pending_ioapic_eoi,
+ vcpu->arch.ioapic_handled_vectors)) {
+ vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
+ vcpu->run->eoi.vector =
+ vcpu->arch.pending_ioapic_eoi;
+ r = 0;
+ goto out;
+ }
+ }
+ if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
+ vcpu_scan_ioapic(vcpu);
+ if (kvm_check_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu))
+ vcpu_load_eoi_exitmap(vcpu);
+ if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
+ kvm_vcpu_reload_apic_access_page(vcpu);
+ if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
+ r = 0;
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
+ r = 0;
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+ vcpu->run->hyperv = vcpu->arch.hyperv.exit;
+ r = 0;
+ goto out;
+ }
+
+ /*
+ * KVM_REQ_HV_STIMER has to be processed after
+ * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers
+ * depend on the guest clock being up-to-date
+ */
+ if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
+ kvm_hv_process_stimers(vcpu);
+ }
+
+ if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+ ++vcpu->stat.req_event;
+ kvm_apic_accept_events(vcpu);
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+ r = 1;
+ goto out;
+ }
+
+ if (inject_pending_event(vcpu) != 0)
+ req_immediate_exit = true;
+ else {
+ /* Enable SMI/NMI/IRQ window open exits if needed.
+ *
+ * SMIs have three cases:
+ * 1) They can be nested, and then there is nothing to
+ * do here because RSM will cause a vmexit anyway.
+ * 2) There is an ISA-specific reason why SMI cannot be
+ * injected, and the moment when this changes can be
+ * intercepted.
+ * 3) Or the SMI can be pending because
+ * inject_pending_event has completed the injection
+ * of an IRQ or NMI from the previous vmexit, and
+ * then we request an immediate exit to inject the
+ * SMI.
+ */
+ if (vcpu->arch.smi_pending && !is_smm(vcpu))
+ if (!kvm_x86_ops->enable_smi_window(vcpu))
+ req_immediate_exit = true;
+ if (vcpu->arch.nmi_pending)
+ kvm_x86_ops->enable_nmi_window(vcpu);
+ if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
+ kvm_x86_ops->enable_irq_window(vcpu);
+ WARN_ON(vcpu->arch.exception.pending);
+ }
+
+ if (kvm_lapic_enabled(vcpu)) {
+ update_cr8_intercept(vcpu);
+ kvm_lapic_sync_to_vapic(vcpu);
+ }
+ }
+
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r)) {
+ goto cancel_injection;
+ }
+
+ preempt_disable();
+
+ kvm_x86_ops->prepare_guest_switch(vcpu);
+
+ /*
+ * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
+ * IPI are then delayed after guest entry, which ensures that they
+ * result in virtual interrupt delivery.
+ */
+ local_irq_disable();
+ vcpu->mode = IN_GUEST_MODE;
+
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+
+ /*
+ * 1) We should set ->mode before checking ->requests. Please see
+ * the comment in kvm_vcpu_exiting_guest_mode().
+ *
+ * 2) For APICv, we should set ->mode before checking PIR.ON. This
+ * pairs with the memory barrier implicit in pi_test_and_set_on
+ * (see vmx_deliver_posted_interrupt).
+ *
+ * 3) This also orders the write to mode from any reads to the page
+ * tables done while the VCPU is running. Please see the comment
+ * in kvm_flush_remote_tlbs.
+ */
+ smp_mb__after_srcu_read_unlock();
+
+ /*
+ * This handles the case where a posted interrupt was
+ * notified with kvm_vcpu_kick.
+ */
+ if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
+ kvm_x86_ops->sync_pir_to_irr(vcpu);
+
+ if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
+ || need_resched() || signal_pending(current)) {
+ vcpu->mode = OUTSIDE_GUEST_MODE;
+ smp_wmb();
+ local_irq_enable();
+ preempt_enable();
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = 1;
+ goto cancel_injection;
+ }
+
+ if (req_immediate_exit) {
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_x86_ops->request_immediate_exit(vcpu);
+ }
+
+ trace_kvm_entry(vcpu->vcpu_id);
+ if (lapic_timer_advance_ns)
+ wait_lapic_expire(vcpu);
+ guest_enter_irqoff();
+
+ if (unlikely(vcpu->arch.switch_db_regs)) {
+ set_debugreg(0, 7);
+ set_debugreg(vcpu->arch.eff_db[0], 0);
+ set_debugreg(vcpu->arch.eff_db[1], 1);
+ set_debugreg(vcpu->arch.eff_db[2], 2);
+ set_debugreg(vcpu->arch.eff_db[3], 3);
+ set_debugreg(vcpu->arch.dr6, 6);
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
+ } else if (unlikely(hw_breakpoint_active())) {
+ set_debugreg(0, 7);
+ }
+
+ kvm_x86_ops->run(vcpu);
+
+ /*
+ * Do this here before restoring debug registers on the host. And
+ * since we do this before handling the vmexit, a DR access vmexit
+ * can (a) read the correct value of the debug registers, (b) set
+ * KVM_DEBUGREG_WONT_EXIT again.
+ */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
+ WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
+ kvm_x86_ops->sync_dirty_debug_regs(vcpu);
+ kvm_update_dr0123(vcpu);
+ kvm_update_dr6(vcpu);
+ kvm_update_dr7(vcpu);
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
+ }
+
+ /*
+ * If the guest has used debug registers, at least dr7
+ * will be disabled while returning to the host.
+ * If we don't have active breakpoints in the host, we don't
+ * care about the messed up debug address registers. But if
+ * we have some of them active, restore the old state.
+ */
+ if (hw_breakpoint_active())
+ hw_breakpoint_restore();
+
+ vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+
+ vcpu->mode = OUTSIDE_GUEST_MODE;
+ smp_wmb();
+
+ kvm_before_interrupt(vcpu);
+ kvm_x86_ops->handle_external_intr(vcpu);
+ kvm_after_interrupt(vcpu);
+
+ ++vcpu->stat.exits;
+
+ guest_exit_irqoff();
+
+ local_irq_enable();
+ preempt_enable();
+
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ /*
+ * Profile KVM exit RIPs:
+ */
+ if (unlikely(prof_on == KVM_PROFILING)) {
+ unsigned long rip = kvm_rip_read(vcpu);
+ profile_hit(KVM_PROFILING, (void *)rip);
+ }
+
+ if (unlikely(vcpu->arch.tsc_always_catchup))
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+
+ if (vcpu->arch.apic_attention)
+ kvm_lapic_sync_from_vapic(vcpu);
+
+ vcpu->arch.gpa_available = false;
+ r = kvm_x86_ops->handle_exit(vcpu);
+ return r;
+
+cancel_injection:
+ kvm_x86_ops->cancel_injection(vcpu);
+ if (unlikely(vcpu->arch.apic_attention))
+ kvm_lapic_sync_from_vapic(vcpu);
+out:
+ return r;
+}
+
+static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
+{
+ if (!kvm_arch_vcpu_runnable(vcpu) &&
+ (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) {
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kvm_vcpu_block(vcpu);
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+
+ if (kvm_x86_ops->post_block)
+ kvm_x86_ops->post_block(vcpu);
+
+ if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
+ return 1;
+ }
+
+ kvm_apic_accept_events(vcpu);
+ switch(vcpu->arch.mp_state) {
+ case KVM_MP_STATE_HALTED:
+ vcpu->arch.pv.pv_unhalted = false;
+ vcpu->arch.mp_state =
+ KVM_MP_STATE_RUNNABLE;
+ case KVM_MP_STATE_RUNNABLE:
+ vcpu->arch.apf.halted = false;
+ break;
+ case KVM_MP_STATE_INIT_RECEIVED:
+ break;
+ default:
+ return -EINTR;
+ break;
+ }
+ return 1;
+}
+
+static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
+ kvm_x86_ops->check_nested_events(vcpu);
+
+ return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
+ !vcpu->arch.apf.halted);
+}
+
+static int vcpu_run(struct kvm_vcpu *vcpu)
+{
+ int r;
+ struct kvm *kvm = vcpu->kvm;
+
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ vcpu->arch.l1tf_flush_l1d = true;
+
+ for (;;) {
+ if (kvm_vcpu_running(vcpu)) {
+ r = vcpu_enter_guest(vcpu);
+ } else {
+ r = vcpu_block(kvm, vcpu);
+ }
+
+ if (r <= 0)
+ break;
+
+ kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu);
+ if (kvm_cpu_has_pending_timer(vcpu))
+ kvm_inject_pending_timer_irqs(vcpu);
+
+ if (dm_request_for_irq_injection(vcpu) &&
+ kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
+ r = 0;
+ vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
+ ++vcpu->stat.request_irq_exits;
+ break;
+ }
+
+ kvm_check_async_pf_completion(vcpu);
+
+ if (signal_pending(current)) {
+ r = -EINTR;
+ vcpu->run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.signal_exits;
+ break;
+ }
+ if (need_resched()) {
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ cond_resched();
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ }
+ }
+
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+
+ return r;
+}
+
+static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
+{
+ int r;
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ if (r != EMULATE_DONE)
+ return 0;
+ return 1;
+}
+
+static int complete_emulated_pio(struct kvm_vcpu *vcpu)
+{
+ BUG_ON(!vcpu->arch.pio.count);
+
+ return complete_emulated_io(vcpu);
+}
+
+/*
+ * Implements the following, as a state machine:
+ *
+ * read:
+ * for each fragment
+ * for each mmio piece in the fragment
+ * write gpa, len
+ * exit
+ * copy data
+ * execute insn
+ *
+ * write:
+ * for each fragment
+ * for each mmio piece in the fragment
+ * write gpa, len
+ * copy data
+ * exit
+ */
+static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+ struct kvm_mmio_fragment *frag;
+ unsigned len;
+
+ BUG_ON(!vcpu->mmio_needed);
+
+ /* Complete previous fragment */
+ frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
+ len = min(8u, frag->len);
+ if (!vcpu->mmio_is_write)
+ memcpy(frag->data, run->mmio.data, len);
+
+ if (frag->len <= 8) {
+ /* Switch to the next fragment. */
+ frag++;
+ vcpu->mmio_cur_fragment++;
+ } else {
+ /* Go forward to the next mmio piece. */
+ frag->data += len;
+ frag->gpa += len;
+ frag->len -= len;
+ }
+
+ if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
+ vcpu->mmio_needed = 0;
+
+ /* FIXME: return into emulator if single-stepping. */
+ if (vcpu->mmio_is_write)
+ return 1;
+ vcpu->mmio_read_completed = 1;
+ return complete_emulated_io(vcpu);
+ }
+
+ run->exit_reason = KVM_EXIT_MMIO;
+ run->mmio.phys_addr = frag->gpa;
+ if (vcpu->mmio_is_write)
+ memcpy(run->mmio.data, frag->data, min(8u, frag->len));
+ run->mmio.len = min(8u, frag->len);
+ run->mmio.is_write = vcpu->mmio_is_write;
+ vcpu->arch.complete_userspace_io = complete_emulated_mmio;
+ return 0;
+}
+
+/* Swap (qemu) user FPU context for the guest FPU context. */
+static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
+{
+ preempt_disable();
+ copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);
+ /* PKRU is separately restored in kvm_x86_ops->run. */
+ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,
+ ~XFEATURE_MASK_PKRU);
+ preempt_enable();
+ trace_kvm_fpu(1);
+}
+
+/* When vcpu_run ends, restore user space FPU context. */
+static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
+{
+ preempt_disable();
+ copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
+ copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state);
+ preempt_enable();
+ ++vcpu->stat.fpu_reload;
+ trace_kvm_fpu(0);
+}
+
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ int r;
+
+ vcpu_load(vcpu);
+ kvm_sigset_activate(vcpu);
+ kvm_load_guest_fpu(vcpu);
+
+ if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
+ if (kvm_run->immediate_exit) {
+ r = -EINTR;
+ goto out;
+ }
+ kvm_vcpu_block(vcpu);
+ kvm_apic_accept_events(vcpu);
+ kvm_clear_request(KVM_REQ_UNHALT, vcpu);
+ r = -EAGAIN;
+ if (signal_pending(current)) {
+ r = -EINTR;
+ vcpu->run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.signal_exits;
+ }
+ goto out;
+ }
+
+ if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (vcpu->run->kvm_dirty_regs) {
+ r = sync_regs(vcpu);
+ if (r != 0)
+ goto out;
+ }
+
+ /* re-sync apic's tpr */
+ if (!lapic_in_kernel(vcpu)) {
+ if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
+ r = -EINVAL;
+ goto out;
+ }
+ }
+
+ if (unlikely(vcpu->arch.complete_userspace_io)) {
+ int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
+ vcpu->arch.complete_userspace_io = NULL;
+ r = cui(vcpu);
+ if (r <= 0)
+ goto out;
+ } else
+ WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
+
+ if (kvm_run->immediate_exit)
+ r = -EINTR;
+ else
+ r = vcpu_run(vcpu);
+
+out:
+ kvm_put_guest_fpu(vcpu);
+ if (vcpu->run->kvm_valid_regs)
+ store_regs(vcpu);
+ post_kvm_run_save(vcpu);
+ kvm_sigset_deactivate(vcpu);
+
+ vcpu_put(vcpu);
+ return r;
+}
+
+static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
+ /*
+ * We are here if userspace calls get_regs() in the middle of
+ * instruction emulation. Registers state needs to be copied
+ * back from emulation context to vcpu. Userspace shouldn't do
+ * that usually, but some bad designed PV devices (vmware
+ * backdoor interface) need this to work
+ */
+ emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+ }
+ regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
+ regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+ regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+#ifdef CONFIG_X86_64
+ regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
+ regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
+ regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
+ regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
+ regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
+ regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
+ regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
+ regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
+#endif
+
+ regs->rip = kvm_rip_read(vcpu);
+ regs->rflags = kvm_get_rflags(vcpu);
+}
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ vcpu_load(vcpu);
+ __get_regs(vcpu, regs);
+ vcpu_put(vcpu);
+ return 0;
+}
+
+static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+
+ kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
+ kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
+ kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
+ kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
+ kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
+#ifdef CONFIG_X86_64
+ kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
+ kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
+ kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
+ kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
+ kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
+ kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
+ kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
+ kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
+#endif
+
+ kvm_rip_write(vcpu, regs->rip);
+ kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
+
+ vcpu->arch.exception.pending = false;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ vcpu_load(vcpu);
+ __set_regs(vcpu, regs);
+ vcpu_put(vcpu);
+ return 0;
+}
+
+void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+ struct kvm_segment cs;
+
+ kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ *db = cs.db;
+ *l = cs.l;
+}
+EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
+
+static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+ struct desc_ptr dt;
+
+ kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
+ kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+ kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+ kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+ kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+ kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+
+ kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+ kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+
+ kvm_x86_ops->get_idt(vcpu, &dt);
+ sregs->idt.limit = dt.size;
+ sregs->idt.base = dt.address;
+ kvm_x86_ops->get_gdt(vcpu, &dt);
+ sregs->gdt.limit = dt.size;
+ sregs->gdt.base = dt.address;
+
+ sregs->cr0 = kvm_read_cr0(vcpu);
+ sregs->cr2 = vcpu->arch.cr2;
+ sregs->cr3 = kvm_read_cr3(vcpu);
+ sregs->cr4 = kvm_read_cr4(vcpu);
+ sregs->cr8 = kvm_get_cr8(vcpu);
+ sregs->efer = vcpu->arch.efer;
+ sregs->apic_base = kvm_get_apic_base(vcpu);
+
+ memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
+
+ if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
+ set_bit(vcpu->arch.interrupt.nr,
+ (unsigned long *)sregs->interrupt_bitmap);
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ vcpu_load(vcpu);
+ __get_sregs(vcpu, sregs);
+ vcpu_put(vcpu);
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state)
+{
+ vcpu_load(vcpu);
+ if (kvm_mpx_supported())
+ kvm_load_guest_fpu(vcpu);
+
+ kvm_apic_accept_events(vcpu);
+ if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
+ vcpu->arch.pv.pv_unhalted)
+ mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
+ else
+ mp_state->mp_state = vcpu->arch.mp_state;
+
+ if (kvm_mpx_supported())
+ kvm_put_guest_fpu(vcpu);
+ vcpu_put(vcpu);
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state)
+{
+ int ret = -EINVAL;
+
+ vcpu_load(vcpu);
+
+ if (!lapic_in_kernel(vcpu) &&
+ mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
+ goto out;
+
+ /* INITs are latched while in SMM */
+ if ((is_smm(vcpu) || vcpu->arch.smi_pending) &&
+ (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
+ mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
+ goto out;
+
+ if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
+ vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+ set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
+ } else
+ vcpu->arch.mp_state = mp_state->mp_state;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ ret = 0;
+out:
+ vcpu_put(vcpu);
+ return ret;
+}
+
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
+ int reason, bool has_error_code, u32 error_code)
+{
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ int ret;
+
+ init_emulate_ctxt(vcpu);
+
+ ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
+ has_error_code, error_code);
+
+ if (ret)
+ return EMULATE_FAIL;
+
+ kvm_rip_write(vcpu, ctxt->eip);
+ kvm_set_rflags(vcpu, ctxt->eflags);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(kvm_task_switch);
+
+static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+ if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
+ /*
+ * When EFER.LME and CR0.PG are set, the processor is in
+ * 64-bit mode (though maybe in a 32-bit code segment).
+ * CR4.PAE and EFER.LMA must be set.
+ */
+ if (!(sregs->cr4 & X86_CR4_PAE)
+ || !(sregs->efer & EFER_LMA))
+ return -EINVAL;
+ } else {
+ /*
+ * Not in 64-bit mode: EFER.LMA is clear and the code
+ * segment cannot be 64-bit.
+ */
+ if (sregs->efer & EFER_LMA || sregs->cs.l)
+ return -EINVAL;
+ }
+
+ return kvm_valid_cr4(vcpu, sregs->cr4);
+}
+
+static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+ struct msr_data apic_base_msr;
+ int mmu_reset_needed = 0;
+ int cpuid_update_needed = 0;
+ int pending_vec, max_bits, idx;
+ struct desc_ptr dt;
+ int ret = -EINVAL;
+
+ if (kvm_valid_sregs(vcpu, sregs))
+ goto out;
+
+ apic_base_msr.data = sregs->apic_base;
+ apic_base_msr.host_initiated = true;
+ if (kvm_set_apic_base(vcpu, &apic_base_msr))
+ goto out;
+
+ dt.size = sregs->idt.limit;
+ dt.address = sregs->idt.base;
+ kvm_x86_ops->set_idt(vcpu, &dt);
+ dt.size = sregs->gdt.limit;
+ dt.address = sregs->gdt.base;
+ kvm_x86_ops->set_gdt(vcpu, &dt);
+
+ vcpu->arch.cr2 = sregs->cr2;
+ mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
+ vcpu->arch.cr3 = sregs->cr3;
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+
+ kvm_set_cr8(vcpu, sregs->cr8);
+
+ mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
+ kvm_x86_ops->set_efer(vcpu, sregs->efer);
+
+ mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
+ kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
+ vcpu->arch.cr0 = sregs->cr0;
+
+ mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
+ cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) &
+ (X86_CR4_OSXSAVE | X86_CR4_PKE));
+ kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
+ if (cpuid_update_needed)
+ kvm_update_cpuid(vcpu);
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ if (is_pae_paging(vcpu)) {
+ load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
+ mmu_reset_needed = 1;
+ }
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ if (mmu_reset_needed)
+ kvm_mmu_reset_context(vcpu);
+
+ max_bits = KVM_NR_INTERRUPTS;
+ pending_vec = find_first_bit(
+ (const unsigned long *)sregs->interrupt_bitmap, max_bits);
+ if (pending_vec < max_bits) {
+ kvm_queue_interrupt(vcpu, pending_vec, false);
+ pr_debug("Set back pending irq %d\n", pending_vec);
+ }
+
+ kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
+ kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+ kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+ kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+ kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+ kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+
+ kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+ kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+
+ update_cr8_intercept(vcpu);
+
+ /* Older userspace won't unhalt the vcpu on reset. */
+ if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
+ sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
+ !is_protmode(vcpu))
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ int ret;
+
+ vcpu_load(vcpu);
+ ret = __set_sregs(vcpu, sregs);
+ vcpu_put(vcpu);
+ return ret;
+}
+
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg)
+{
+ unsigned long rflags;
+ int i, r;
+
+ vcpu_load(vcpu);
+
+ if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
+ r = -EBUSY;
+ if (vcpu->arch.exception.pending)
+ goto out;
+ if (dbg->control & KVM_GUESTDBG_INJECT_DB)
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ else
+ kvm_queue_exception(vcpu, BP_VECTOR);
+ }
+
+ /*
+ * Read rflags as long as potentially injected trace flags are still
+ * filtered out.
+ */
+ rflags = kvm_get_rflags(vcpu);
+
+ vcpu->guest_debug = dbg->control;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
+ vcpu->guest_debug = 0;
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
+ for (i = 0; i < KVM_NR_DB_REGS; ++i)
+ vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
+ vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
+ } else {
+ for (i = 0; i < KVM_NR_DB_REGS; i++)
+ vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+ }
+ kvm_update_dr7(vcpu);
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
+ get_segment_base(vcpu, VCPU_SREG_CS);
+
+ /*
+ * Trigger an rflags update that will inject or remove the trace
+ * flags.
+ */
+ kvm_set_rflags(vcpu, rflags);
+
+ kvm_x86_ops->update_bp_intercept(vcpu);
+
+ r = 0;
+
+out:
+ vcpu_put(vcpu);
+ return r;
+}
+
+/*
+ * Translate a guest virtual address to a guest physical address.
+ */
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+ struct kvm_translation *tr)
+{
+ unsigned long vaddr = tr->linear_address;
+ gpa_t gpa;
+ int idx;
+
+ vcpu_load(vcpu);
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ tr->physical_address = gpa;
+ tr->valid = gpa != UNMAPPED_GVA;
+ tr->writeable = 1;
+ tr->usermode = 0;
+
+ vcpu_put(vcpu);
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+ struct fxregs_state *fxsave;
+
+ vcpu_load(vcpu);
+
+ fxsave = &vcpu->arch.guest_fpu.state.fxsave;
+ memcpy(fpu->fpr, fxsave->st_space, 128);
+ fpu->fcw = fxsave->cwd;
+ fpu->fsw = fxsave->swd;
+ fpu->ftwx = fxsave->twd;
+ fpu->last_opcode = fxsave->fop;
+ fpu->last_ip = fxsave->rip;
+ fpu->last_dp = fxsave->rdp;
+ memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
+
+ vcpu_put(vcpu);
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+ struct fxregs_state *fxsave;
+
+ vcpu_load(vcpu);
+
+ fxsave = &vcpu->arch.guest_fpu.state.fxsave;
+
+ memcpy(fxsave->st_space, fpu->fpr, 128);
+ fxsave->cwd = fpu->fcw;
+ fxsave->swd = fpu->fsw;
+ fxsave->twd = fpu->ftwx;
+ fxsave->fop = fpu->last_opcode;
+ fxsave->rip = fpu->last_ip;
+ fxsave->rdp = fpu->last_dp;
+ memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
+
+ vcpu_put(vcpu);
+ return 0;
+}
+
+static void store_regs(struct kvm_vcpu *vcpu)
+{
+ BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES);
+
+ if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS)
+ __get_regs(vcpu, &vcpu->run->s.regs.regs);
+
+ if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS)
+ __get_sregs(vcpu, &vcpu->run->s.regs.sregs);
+
+ if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS)
+ kvm_vcpu_ioctl_x86_get_vcpu_events(
+ vcpu, &vcpu->run->s.regs.events);
+}
+
+static int sync_regs(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)
+ return -EINVAL;
+
+ if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
+ __set_regs(vcpu, &vcpu->run->s.regs.regs);
+ vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
+ }
+ if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
+ if (__set_sregs(vcpu, &vcpu->run->s.regs.sregs))
+ return -EINVAL;
+ vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
+ }
+ if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
+ if (kvm_vcpu_ioctl_x86_set_vcpu_events(
+ vcpu, &vcpu->run->s.regs.events))
+ return -EINVAL;
+ vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
+ }
+
+ return 0;
+}
+
+static void fx_init(struct kvm_vcpu *vcpu)
+{
+ fpstate_init(&vcpu->arch.guest_fpu.state);
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv =
+ host_xcr0 | XSTATE_COMPACTION_ENABLED;
+
+ /*
+ * Ensure guest xcr0 is valid for loading
+ */
+ vcpu->arch.xcr0 = XFEATURE_MASK_FP;
+
+ vcpu->arch.cr0 |= X86_CR0_ET;
+}
+
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask;
+ struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
+
+ kvm_release_pfn(cache->pfn, cache->dirty, cache);
+
+ kvmclock_reset(vcpu);
+
+ kvm_x86_ops->vcpu_free(vcpu);
+ free_cpumask_var(wbinvd_dirty_mask);
+}
+
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
+ unsigned int id)
+{
+ struct kvm_vcpu *vcpu;
+
+ if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
+ printk_once(KERN_WARNING
+ "kvm: SMP vm created on host with unstable TSC; "
+ "guest TSC will not be reliable\n");
+
+ vcpu = kvm_x86_ops->vcpu_create(kvm, id);
+
+ return vcpu;
+}
+
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
+ kvm_vcpu_mtrr_init(vcpu);
+ vcpu_load(vcpu);
+ kvm_vcpu_reset(vcpu, false);
+ kvm_mmu_setup(vcpu);
+ vcpu_put(vcpu);
+ return 0;
+}
+
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+ struct msr_data msr;
+ struct kvm *kvm = vcpu->kvm;
+
+ kvm_hv_vcpu_postcreate(vcpu);
+
+ if (mutex_lock_killable(&vcpu->mutex))
+ return;
+ vcpu_load(vcpu);
+ msr.data = 0x0;
+ msr.index = MSR_IA32_TSC;
+ msr.host_initiated = true;
+ kvm_write_tsc(vcpu, &msr);
+ vcpu_put(vcpu);
+ mutex_unlock(&vcpu->mutex);
+
+ if (!kvmclock_periodic_sync)
+ return;
+
+ schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
+ KVMCLOCK_SYNC_PERIOD);
+}
+
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+ kvm_arch_vcpu_free(vcpu);
+}
+
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ kvm_lapic_reset(vcpu, init_event);
+
+ vcpu->arch.hflags = 0;
+
+ vcpu->arch.smi_pending = 0;
+ vcpu->arch.smi_count = 0;
+ atomic_set(&vcpu->arch.nmi_queued, 0);
+ vcpu->arch.nmi_pending = 0;
+ vcpu->arch.nmi_injected = false;
+ kvm_clear_interrupt_queue(vcpu);
+ kvm_clear_exception_queue(vcpu);
+ vcpu->arch.exception.pending = false;
+
+ memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
+ kvm_update_dr0123(vcpu);
+ vcpu->arch.dr6 = DR6_INIT;
+ kvm_update_dr6(vcpu);
+ vcpu->arch.dr7 = DR7_FIXED_1;
+ kvm_update_dr7(vcpu);
+
+ vcpu->arch.cr2 = 0;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ vcpu->arch.apf.msr_val = 0;
+ vcpu->arch.st.msr_val = 0;
+
+ kvmclock_reset(vcpu);
+
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_async_pf_hash_reset(vcpu);
+ vcpu->arch.apf.halted = false;
+
+ if (kvm_mpx_supported()) {
+ void *mpx_state_buffer;
+
+ /*
+ * To avoid have the INIT path from kvm_apic_has_events() that be
+ * called with loaded FPU and does not let userspace fix the state.
+ */
+ if (init_event)
+ kvm_put_guest_fpu(vcpu);
+ mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
+ XFEATURE_MASK_BNDREGS);
+ if (mpx_state_buffer)
+ memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
+ mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
+ XFEATURE_MASK_BNDCSR);
+ if (mpx_state_buffer)
+ memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
+ if (init_event)
+ kvm_load_guest_fpu(vcpu);
+ }
+
+ if (!init_event) {
+ kvm_pmu_reset(vcpu);
+ vcpu->arch.smbase = 0x30000;
+
+ vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
+ vcpu->arch.msr_misc_features_enables = 0;
+
+ vcpu->arch.xcr0 = XFEATURE_MASK_FP;
+ }
+
+ memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
+ vcpu->arch.regs_avail = ~0;
+ vcpu->arch.regs_dirty = ~0;
+
+ vcpu->arch.ia32_xss = 0;
+
+ kvm_x86_ops->vcpu_reset(vcpu, init_event);
+}
+
+void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
+{
+ struct kvm_segment cs;
+
+ kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ cs.selector = vector << 8;
+ cs.base = vector << 12;
+ kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
+ kvm_rip_write(vcpu, 0);
+}
+
+int kvm_arch_hardware_enable(void)
+{
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+ int i;
+ int ret;
+ u64 local_tsc;
+ u64 max_tsc = 0;
+ bool stable, backwards_tsc = false;
+
+ kvm_shared_msr_cpu_online();
+ ret = kvm_x86_ops->hardware_enable();
+ if (ret != 0)
+ return ret;
+
+ local_tsc = rdtsc();
+ stable = !kvm_check_tsc_unstable();
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!stable && vcpu->cpu == smp_processor_id())
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ if (stable && vcpu->arch.last_host_tsc > local_tsc) {
+ backwards_tsc = true;
+ if (vcpu->arch.last_host_tsc > max_tsc)
+ max_tsc = vcpu->arch.last_host_tsc;
+ }
+ }
+ }
+
+ /*
+ * Sometimes, even reliable TSCs go backwards. This happens on
+ * platforms that reset TSC during suspend or hibernate actions, but
+ * maintain synchronization. We must compensate. Fortunately, we can
+ * detect that condition here, which happens early in CPU bringup,
+ * before any KVM threads can be running. Unfortunately, we can't
+ * bring the TSCs fully up to date with real time, as we aren't yet far
+ * enough into CPU bringup that we know how much real time has actually
+ * elapsed; our helper function, ktime_get_boot_ns() will be using boot
+ * variables that haven't been updated yet.
+ *
+ * So we simply find the maximum observed TSC above, then record the
+ * adjustment to TSC in each VCPU. When the VCPU later gets loaded,
+ * the adjustment will be applied. Note that we accumulate
+ * adjustments, in case multiple suspend cycles happen before some VCPU
+ * gets a chance to run again. In the event that no KVM threads get a
+ * chance to run, we will miss the entire elapsed period, as we'll have
+ * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
+ * loose cycle time. This isn't too big a deal, since the loss will be
+ * uniform across all VCPUs (not to mention the scenario is extremely
+ * unlikely). It is possible that a second hibernate recovery happens
+ * much faster than a first, causing the observed TSC here to be
+ * smaller; this would require additional padding adjustment, which is
+ * why we set last_host_tsc to the local tsc observed here.
+ *
+ * N.B. - this code below runs only on platforms with reliable TSC,
+ * as that is the only way backwards_tsc is set above. Also note
+ * that this runs for ALL vcpus, which is not a bug; all VCPUs should
+ * have the same delta_cyc adjustment applied if backwards_tsc
+ * is detected. Note further, this adjustment is only done once,
+ * as we reset last_host_tsc on all VCPUs to stop this from being
+ * called multiple times (one for each physical CPU bringup).
+ *
+ * Platforms with unreliable TSCs don't have to deal with this, they
+ * will be compensated by the logic in vcpu_load, which sets the TSC to
+ * catchup mode. This will catchup all VCPUs to real time, but cannot
+ * guarantee that they stay in perfect synchronization.
+ */
+ if (backwards_tsc) {
+ u64 delta_cyc = max_tsc - local_tsc;
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm->arch.backwards_tsc_observed = true;
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ vcpu->arch.tsc_offset_adjustment += delta_cyc;
+ vcpu->arch.last_host_tsc = local_tsc;
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+ }
+
+ /*
+ * We have to disable TSC offset matching.. if you were
+ * booting a VM while issuing an S4 host suspend....
+ * you may have some problem. Solving this issue is
+ * left as an exercise to the reader.
+ */
+ kvm->arch.last_tsc_nsec = 0;
+ kvm->arch.last_tsc_write = 0;
+ }
+
+ }
+ return 0;
+}
+
+void kvm_arch_hardware_disable(void)
+{
+ kvm_x86_ops->hardware_disable();
+ drop_user_return_notifiers();
+}
+
+int kvm_arch_hardware_setup(void)
+{
+ int r;
+
+ r = kvm_x86_ops->hardware_setup();
+ if (r != 0)
+ return r;
+
+ cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data);
+
+ if (kvm_has_tsc_control) {
+ /*
+ * Make sure the user can only configure tsc_khz values that
+ * fit into a signed integer.
+ * A min value is not calculated because it will always
+ * be 1 on all machines.
+ */
+ u64 max = min(0x7fffffffULL,
+ __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz));
+ kvm_max_guest_tsc_khz = max;
+
+ kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
+ }
+
+ kvm_init_msr_list();
+ return 0;
+}
+
+void kvm_arch_hardware_unsetup(void)
+{
+ kvm_x86_ops->hardware_unsetup();
+}
+
+void kvm_arch_check_processor_compat(void *rtn)
+{
+ kvm_x86_ops->check_processor_compatibility(rtn);
+}
+
+bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
+{
+ return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp);
+
+bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
+{
+ return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
+}
+
+struct static_key kvm_no_apic_vcpu __read_mostly;
+EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
+
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ struct page *page;
+ int r;
+
+ vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
+ vcpu->arch.emulate_ctxt.ops = &emulate_ops;
+ if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ else
+ vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
+
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page) {
+ r = -ENOMEM;
+ goto fail;
+ }
+ vcpu->arch.pio_data = page_address(page);
+
+ kvm_set_tsc_khz(vcpu, max_tsc_khz);
+
+ r = kvm_mmu_create(vcpu);
+ if (r < 0)
+ goto fail_free_pio_data;
+
+ if (irqchip_in_kernel(vcpu->kvm)) {
+ r = kvm_create_lapic(vcpu);
+ if (r < 0)
+ goto fail_mmu_destroy;
+ } else
+ static_key_slow_inc(&kvm_no_apic_vcpu);
+
+ vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
+ GFP_KERNEL);
+ if (!vcpu->arch.mce_banks) {
+ r = -ENOMEM;
+ goto fail_free_lapic;
+ }
+ vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+
+ if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
+ r = -ENOMEM;
+ goto fail_free_mce_banks;
+ }
+
+ fx_init(vcpu);
+
+ vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
+
+ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+
+ vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
+
+ kvm_async_pf_hash_reset(vcpu);
+ kvm_pmu_init(vcpu);
+
+ vcpu->arch.pending_external_vector = -1;
+ vcpu->arch.preempted_in_kernel = false;
+
+ kvm_hv_vcpu_init(vcpu);
+
+ return 0;
+
+fail_free_mce_banks:
+ kfree(vcpu->arch.mce_banks);
+fail_free_lapic:
+ kvm_free_lapic(vcpu);
+fail_mmu_destroy:
+ kvm_mmu_destroy(vcpu);
+fail_free_pio_data:
+ free_page((unsigned long)vcpu->arch.pio_data);
+fail:
+ return r;
+}
+
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+ int idx;
+
+ kvm_hv_vcpu_uninit(vcpu);
+ kvm_pmu_destroy(vcpu);
+ kfree(vcpu->arch.mce_banks);
+ kvm_free_lapic(vcpu);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_mmu_destroy(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ free_page((unsigned long)vcpu->arch.pio_data);
+ if (!lapic_in_kernel(vcpu))
+ static_key_slow_dec(&kvm_no_apic_vcpu);
+}
+
+void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+ vcpu->arch.l1tf_flush_l1d = true;
+ kvm_x86_ops->sched_in(vcpu, cpu);
+}
+
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
+{
+ if (type)
+ return -EINVAL;
+
+ INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
+ INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
+ INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
+ INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
+ atomic_set(&kvm->arch.noncoherent_dma_count, 0);
+
+ /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
+ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
+ /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
+ set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+ &kvm->arch.irq_sources_bitmap);
+
+ raw_spin_lock_init(&kvm->arch.tsc_write_lock);
+ mutex_init(&kvm->arch.apic_map_lock);
+ spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+
+ kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
+ pvclock_update_vm_gtod_copy(kvm);
+
+ kvm->arch.guest_can_read_msr_platform_info = true;
+
+ INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
+ INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
+
+ kvm_hv_init_vm(kvm);
+ kvm_page_track_init(kvm);
+ kvm_mmu_init_vm(kvm);
+
+ if (kvm_x86_ops->vm_init)
+ return kvm_x86_ops->vm_init(kvm);
+
+ return 0;
+}
+
+int kvm_arch_post_init_vm(struct kvm *kvm)
+{
+ return kvm_mmu_post_init_vm(kvm);
+}
+
+static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
+{
+ vcpu_load(vcpu);
+ kvm_mmu_unload(vcpu);
+ vcpu_put(vcpu);
+}
+
+static void kvm_free_vcpus(struct kvm *kvm)
+{
+ unsigned int i;
+ struct kvm_vcpu *vcpu;
+
+ /*
+ * Unpin any mmu pages first.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_unload_vcpu_mmu(vcpu);
+ }
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_arch_vcpu_free(vcpu);
+
+ mutex_lock(&kvm->lock);
+ for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+ kvm->vcpus[i] = NULL;
+
+ atomic_set(&kvm->online_vcpus, 0);
+ mutex_unlock(&kvm->lock);
+}
+
+void kvm_arch_sync_events(struct kvm *kvm)
+{
+ cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
+ cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
+ kvm_free_pit(kvm);
+}
+
+int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
+{
+ int i, r;
+ unsigned long hva;
+ struct kvm_memslots *slots = kvm_memslots(kvm);
+ struct kvm_memory_slot *slot, old;
+
+ /* Called with kvm->slots_lock held. */
+ if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
+ return -EINVAL;
+
+ slot = id_to_memslot(slots, id);
+ if (size) {
+ if (slot->npages)
+ return -EEXIST;
+
+ /*
+ * MAP_SHARED to prevent internal slot pages from being moved
+ * by fork()/COW.
+ */
+ hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS, 0);
+ if (IS_ERR((void *)hva))
+ return PTR_ERR((void *)hva);
+ } else {
+ if (!slot->npages)
+ return 0;
+
+ hva = 0;
+ }
+
+ old = *slot;
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ struct kvm_userspace_memory_region m;
+
+ m.slot = id | (i << 16);
+ m.flags = 0;
+ m.guest_phys_addr = gpa;
+ m.userspace_addr = hva;
+ m.memory_size = size;
+ r = __kvm_set_memory_region(kvm, &m);
+ if (r < 0)
+ return r;
+ }
+
+ if (!size)
+ vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__x86_set_memory_region);
+
+int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
+{
+ int r;
+
+ mutex_lock(&kvm->slots_lock);
+ r = __x86_set_memory_region(kvm, id, gpa, size);
+ mutex_unlock(&kvm->slots_lock);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(x86_set_memory_region);
+
+void kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+ kvm_mmu_pre_destroy_vm(kvm);
+}
+
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+ if (current->mm == kvm->mm) {
+ /*
+ * Free memory regions allocated on behalf of userspace,
+ * unless the the memory map has changed due to process exit
+ * or fd copying.
+ */
+ x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0);
+ x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0);
+ x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
+ }
+ if (kvm_x86_ops->vm_destroy)
+ kvm_x86_ops->vm_destroy(kvm);
+ kvm_pic_destroy(kvm);
+ kvm_ioapic_destroy(kvm);
+ kvm_free_vcpus(kvm);
+ kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+ kvm_mmu_uninit_vm(kvm);
+ kvm_page_track_cleanup(kvm);
+ kvm_hv_destroy_vm(kvm);
+}
+
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont)
+{
+ int i;
+
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
+ kvfree(free->arch.rmap[i]);
+ free->arch.rmap[i] = NULL;
+ }
+ if (i == 0)
+ continue;
+
+ if (!dont || free->arch.lpage_info[i - 1] !=
+ dont->arch.lpage_info[i - 1]) {
+ kvfree(free->arch.lpage_info[i - 1]);
+ free->arch.lpage_info[i - 1] = NULL;
+ }
+ }
+
+ kvm_page_track_free_memslot(free, dont);
+}
+
+int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
+ unsigned long npages)
+{
+ int i;
+
+ /*
+ * Clear out the previous array pointers for the KVM_MR_MOVE case. The
+ * old arrays will be freed by __kvm_set_memory_region() if installing
+ * the new memslot is successful.
+ */
+ memset(&slot->arch, 0, sizeof(slot->arch));
+
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ struct kvm_lpage_info *linfo;
+ unsigned long ugfn;
+ int lpages;
+ int level = i + 1;
+
+ lpages = gfn_to_index(slot->base_gfn + npages - 1,
+ slot->base_gfn, level) + 1;
+
+ slot->arch.rmap[i] =
+ kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
+ GFP_KERNEL);
+ if (!slot->arch.rmap[i])
+ goto out_free;
+ if (i == 0)
+ continue;
+
+ linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL);
+ if (!linfo)
+ goto out_free;
+
+ slot->arch.lpage_info[i - 1] = linfo;
+
+ if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
+ linfo[0].disallow_lpage = 1;
+ if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
+ linfo[lpages - 1].disallow_lpage = 1;
+ ugfn = slot->userspace_addr >> PAGE_SHIFT;
+ /*
+ * If the gfn and userspace address are not aligned wrt each
+ * other, or if explicitly asked to, disable large page
+ * support for this slot
+ */
+ if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
+ !kvm_largepages_enabled()) {
+ unsigned long j;
+
+ for (j = 0; j < lpages; ++j)
+ linfo[j].disallow_lpage = 1;
+ }
+ }
+
+ if (kvm_page_track_create_memslot(slot, npages))
+ goto out_free;
+
+ return 0;
+
+out_free:
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ kvfree(slot->arch.rmap[i]);
+ slot->arch.rmap[i] = NULL;
+ if (i == 0)
+ continue;
+
+ kvfree(slot->arch.lpage_info[i - 1]);
+ slot->arch.lpage_info[i - 1] = NULL;
+ }
+ return -ENOMEM;
+}
+
+void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ /*
+ * memslots->generation has been incremented.
+ * mmio generation may have reached its maximum value.
+ */
+ kvm_mmu_invalidate_mmio_sptes(kvm, gen);
+
+ /* Force re-initialization of steal_time cache */
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_vcpu_kick(vcpu);
+}
+
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
+ const struct kvm_userspace_memory_region *mem,
+ enum kvm_mr_change change)
+{
+ if (change == KVM_MR_MOVE)
+ return kvm_arch_create_memslot(kvm, memslot,
+ mem->memory_size >> PAGE_SHIFT);
+
+ return 0;
+}
+
+static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
+ struct kvm_memory_slot *new)
+{
+ /* Still write protect RO slot */
+ if (new->flags & KVM_MEM_READONLY) {
+ kvm_mmu_slot_remove_write_access(kvm, new);
+ return;
+ }
+
+ /*
+ * Call kvm_x86_ops dirty logging hooks when they are valid.
+ *
+ * kvm_x86_ops->slot_disable_log_dirty is called when:
+ *
+ * - KVM_MR_CREATE with dirty logging is disabled
+ * - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag
+ *
+ * The reason is, in case of PML, we need to set D-bit for any slots
+ * with dirty logging disabled in order to eliminate unnecessary GPA
+ * logging in PML buffer (and potential PML buffer full VMEXT). This
+ * guarantees leaving PML enabled during guest's lifetime won't have
+ * any additonal overhead from PML when guest is running with dirty
+ * logging disabled for memory slots.
+ *
+ * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot
+ * to dirty logging mode.
+ *
+ * If kvm_x86_ops dirty logging hooks are invalid, use write protect.
+ *
+ * In case of write protect:
+ *
+ * Write protect all pages for dirty logging.
+ *
+ * All the sptes including the large sptes which point to this
+ * slot are set to readonly. We can not create any new large
+ * spte on this slot until the end of the logging.
+ *
+ * See the comments in fast_page_fault().
+ */
+ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+ if (kvm_x86_ops->slot_enable_log_dirty)
+ kvm_x86_ops->slot_enable_log_dirty(kvm, new);
+ else
+ kvm_mmu_slot_remove_write_access(kvm, new);
+ } else {
+ if (kvm_x86_ops->slot_disable_log_dirty)
+ kvm_x86_ops->slot_disable_log_dirty(kvm, new);
+ }
+}
+
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+ const struct kvm_userspace_memory_region *mem,
+ const struct kvm_memory_slot *old,
+ const struct kvm_memory_slot *new,
+ enum kvm_mr_change change)
+{
+ int nr_mmu_pages = 0;
+
+ if (!kvm->arch.n_requested_mmu_pages)
+ nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
+
+ if (nr_mmu_pages)
+ kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+
+ /*
+ * Dirty logging tracks sptes in 4k granularity, meaning that large
+ * sptes have to be split. If live migration is successful, the guest
+ * in the source machine will be destroyed and large sptes will be
+ * created in the destination. However, if the guest continues to run
+ * in the source machine (for example if live migration fails), small
+ * sptes will remain around and cause bad performance.
+ *
+ * Scan sptes if dirty logging has been stopped, dropping those
+ * which can be collapsed into a single large-page spte. Later
+ * page faults will create the large-page sptes.
+ */
+ if ((change != KVM_MR_DELETE) &&
+ (old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
+ !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
+ kvm_mmu_zap_collapsible_sptes(kvm, new);
+
+ /*
+ * Set up write protection and/or dirty logging for the new slot.
+ *
+ * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have
+ * been zapped so no dirty logging staff is needed for old slot. For
+ * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
+ * new and it's also covered when dealing with the new slot.
+ *
+ * FIXME: const-ify all uses of struct kvm_memory_slot.
+ */
+ if (change != KVM_MR_DELETE)
+ kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
+}
+
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+ kvm_mmu_invalidate_zap_all_pages(kvm);
+}
+
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ kvm_page_track_flush_slot(kvm, slot);
+}
+
+static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ return (is_guest_mode(vcpu) &&
+ kvm_x86_ops->guest_apic_has_interrupt &&
+ kvm_x86_ops->guest_apic_has_interrupt(vcpu));
+}
+
+static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
+{
+ if (!list_empty_careful(&vcpu->async_pf.done))
+ return true;
+
+ if (kvm_apic_has_events(vcpu))
+ return true;
+
+ if (vcpu->arch.pv.pv_unhalted)
+ return true;
+
+ if (vcpu->arch.exception.pending)
+ return true;
+
+ if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+ (vcpu->arch.nmi_pending &&
+ kvm_x86_ops->nmi_allowed(vcpu)))
+ return true;
+
+ if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
+ (vcpu->arch.smi_pending && !is_smm(vcpu)))
+ return true;
+
+ if (kvm_arch_interrupt_allowed(vcpu) &&
+ (kvm_cpu_has_interrupt(vcpu) ||
+ kvm_guest_apic_has_interrupt(vcpu)))
+ return true;
+
+ if (kvm_hv_has_stimer_pending(vcpu))
+ return true;
+
+ return false;
+}
+
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
+}
+
+bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
+{
+ if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
+ return true;
+
+ if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+ kvm_test_request(KVM_REQ_SMI, vcpu) ||
+ kvm_test_request(KVM_REQ_EVENT, vcpu))
+ return true;
+
+ if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu))
+ return true;
+
+ return false;
+}
+
+bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.preempted_in_kernel;
+}
+
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
+}
+
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+ return kvm_x86_ops->interrupt_allowed(vcpu);
+}
+
+unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
+{
+ if (is_64_bit_mode(vcpu))
+ return kvm_rip_read(vcpu);
+ return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
+ kvm_rip_read(vcpu));
+}
+EXPORT_SYMBOL_GPL(kvm_get_linear_rip);
+
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
+{
+ return kvm_get_linear_rip(vcpu) == linear_rip;
+}
+EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
+
+unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
+{
+ unsigned long rflags;
+
+ rflags = kvm_x86_ops->get_rflags(vcpu);
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ rflags &= ~X86_EFLAGS_TF;
+ return rflags;
+}
+EXPORT_SYMBOL_GPL(kvm_get_rflags);
+
+static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
+ kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
+ rflags |= X86_EFLAGS_TF;
+ kvm_x86_ops->set_rflags(vcpu, rflags);
+}
+
+void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ __kvm_set_rflags(vcpu, rflags);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_set_rflags);
+
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
+{
+ int r;
+
+ if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
+ work->wakeup_all)
+ return;
+
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r))
+ return;
+
+ if (!vcpu->arch.mmu.direct_map &&
+ work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
+ return;
+
+ vcpu->arch.mmu.page_fault(vcpu, work->cr2_or_gpa, 0, true);
+}
+
+static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
+{
+ return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
+}
+
+static inline u32 kvm_async_pf_next_probe(u32 key)
+{
+ return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
+}
+
+static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ u32 key = kvm_async_pf_hash_fn(gfn);
+
+ while (vcpu->arch.apf.gfns[key] != ~0)
+ key = kvm_async_pf_next_probe(key);
+
+ vcpu->arch.apf.gfns[key] = gfn;
+}
+
+static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ int i;
+ u32 key = kvm_async_pf_hash_fn(gfn);
+
+ for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
+ (vcpu->arch.apf.gfns[key] != gfn &&
+ vcpu->arch.apf.gfns[key] != ~0); i++)
+ key = kvm_async_pf_next_probe(key);
+
+ return key;
+}
+
+bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
+}
+
+static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ u32 i, j, k;
+
+ i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
+ while (true) {
+ vcpu->arch.apf.gfns[i] = ~0;
+ do {
+ j = kvm_async_pf_next_probe(j);
+ if (vcpu->arch.apf.gfns[j] == ~0)
+ return;
+ k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
+ /*
+ * k lies cyclically in ]i,j]
+ * | i.k.j |
+ * |....j i.k.| or |.k..j i...|
+ */
+ } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
+ vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
+ i = j;
+ }
+}
+
+static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
+{
+
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
+ sizeof(val));
+}
+
+static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val)
+{
+
+ return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val,
+ sizeof(u32));
+}
+
+void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+ struct x86_exception fault;
+
+ trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
+ kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
+
+ if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
+ (vcpu->arch.apf.send_user_only &&
+ kvm_x86_ops->get_cpl(vcpu) == 0))
+ kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+ else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
+ fault.vector = PF_VECTOR;
+ fault.error_code_valid = true;
+ fault.error_code = 0;
+ fault.nested_page_fault = false;
+ fault.address = work->arch.token;
+ fault.async_page_fault = true;
+ kvm_inject_page_fault(vcpu, &fault);
+ }
+}
+
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+ struct x86_exception fault;
+ u32 val;
+
+ if (work->wakeup_all)
+ work->arch.token = ~0; /* broadcast wakeup */
+ else
+ kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
+ trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
+
+ if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
+ !apf_get_user(vcpu, &val)) {
+ if (val == KVM_PV_REASON_PAGE_NOT_PRESENT &&
+ vcpu->arch.exception.pending &&
+ vcpu->arch.exception.nr == PF_VECTOR &&
+ !apf_put_user(vcpu, 0)) {
+ vcpu->arch.exception.injected = false;
+ vcpu->arch.exception.pending = false;
+ vcpu->arch.exception.nr = 0;
+ vcpu->arch.exception.has_error_code = false;
+ vcpu->arch.exception.error_code = 0;
+ } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
+ fault.vector = PF_VECTOR;
+ fault.error_code_valid = true;
+ fault.error_code = 0;
+ fault.nested_page_fault = false;
+ fault.address = work->arch.token;
+ fault.async_page_fault = true;
+ kvm_inject_page_fault(vcpu, &fault);
+ }
+ }
+ vcpu->arch.apf.halted = false;
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+}
+
+bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
+ return true;
+ else
+ return kvm_can_do_async_pf(vcpu);
+}
+
+void kvm_arch_start_assignment(struct kvm *kvm)
+{
+ atomic_inc(&kvm->arch.assigned_device_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
+
+void kvm_arch_end_assignment(struct kvm *kvm)
+{
+ atomic_dec(&kvm->arch.assigned_device_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
+
+bool kvm_arch_has_assigned_device(struct kvm *kvm)
+{
+ return atomic_read(&kvm->arch.assigned_device_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
+
+void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
+{
+ atomic_inc(&kvm->arch.noncoherent_dma_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
+
+void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
+{
+ atomic_dec(&kvm->arch.noncoherent_dma_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
+
+bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
+{
+ return atomic_read(&kvm->arch.noncoherent_dma_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
+
+bool kvm_arch_has_irq_bypass(void)
+{
+ return kvm_x86_ops->update_pi_irte != NULL;
+}
+
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+ irqfd->producer = prod;
+
+ return kvm_x86_ops->update_pi_irte(irqfd->kvm,
+ prod->irq, irqfd->gsi, 1);
+}
+
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ int ret;
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+ WARN_ON(irqfd->producer != prod);
+ irqfd->producer = NULL;
+
+ /*
+ * When producer of consumer is unregistered, we change back to
+ * remapped mode, so we can re-use the current implementation
+ * when the irq is masked/disabled or the consumer side (KVM
+ * int this case doesn't want to receive the interrupts.
+ */
+ ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+ if (ret)
+ printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
+ " fails: %d\n", irqfd->consumer.token, ret);
+}
+
+int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ if (!kvm_x86_ops->update_pi_irte)
+ return -EINVAL;
+
+ return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
+}
+
+bool kvm_vector_hashing_enabled(void)
+{
+ return vector_hashing;
+}
+EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
new file mode 100644
index 000000000..422331b25
--- /dev/null
+++ b/arch/x86/kvm/x86.h
@@ -0,0 +1,365 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_KVM_X86_H
+#define ARCH_X86_KVM_X86_H
+
+#include <linux/kvm_host.h>
+#include <asm/pvclock.h>
+#include "kvm_cache_regs.h"
+
+#define KVM_DEFAULT_PLE_GAP 128
+#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
+#define KVM_DEFAULT_PLE_WINDOW_GROW 2
+#define KVM_DEFAULT_PLE_WINDOW_SHRINK 0
+#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX UINT_MAX
+#define KVM_SVM_DEFAULT_PLE_WINDOW_MAX USHRT_MAX
+#define KVM_SVM_DEFAULT_PLE_WINDOW 3000
+
+static inline unsigned int __grow_ple_window(unsigned int val,
+ unsigned int base, unsigned int modifier, unsigned int max)
+{
+ u64 ret = val;
+
+ if (modifier < 1)
+ return base;
+
+ if (modifier < base)
+ ret *= modifier;
+ else
+ ret += modifier;
+
+ return min(ret, (u64)max);
+}
+
+static inline unsigned int __shrink_ple_window(unsigned int val,
+ unsigned int base, unsigned int modifier, unsigned int min)
+{
+ if (modifier < 1)
+ return base;
+
+ if (modifier < base)
+ val /= modifier;
+ else
+ val -= modifier;
+
+ return max(val, min);
+}
+
+#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL
+
+static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.exception.pending = false;
+ vcpu->arch.exception.injected = false;
+}
+
+static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
+ bool soft)
+{
+ vcpu->arch.interrupt.injected = true;
+ vcpu->arch.interrupt.soft = soft;
+ vcpu->arch.interrupt.nr = vector;
+}
+
+static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.interrupt.injected = false;
+}
+
+static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.exception.injected || vcpu->arch.interrupt.injected ||
+ vcpu->arch.nmi_injected;
+}
+
+static inline bool kvm_exception_is_soft(unsigned int nr)
+{
+ return (nr == BP_VECTOR) || (nr == OF_VECTOR);
+}
+
+static inline bool is_protmode(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, X86_CR0_PE);
+}
+
+static inline int is_long_mode(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ return vcpu->arch.efer & EFER_LMA;
+#else
+ return 0;
+#endif
+}
+
+static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
+{
+ int cs_db, cs_l;
+
+ if (!is_long_mode(vcpu))
+ return false;
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ return cs_l;
+}
+
+static inline bool is_la57_mode(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ return (vcpu->arch.efer & EFER_LMA) &&
+ kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
+#else
+ return 0;
+#endif
+}
+
+static inline bool x86_exception_has_error_code(unsigned int vector)
+{
+ static u32 exception_has_error_code = BIT(DF_VECTOR) | BIT(TS_VECTOR) |
+ BIT(NP_VECTOR) | BIT(SS_VECTOR) | BIT(GP_VECTOR) |
+ BIT(PF_VECTOR) | BIT(AC_VECTOR);
+
+ return (1U << vector) & exception_has_error_code;
+}
+
+static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
+}
+
+static inline int is_pae(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
+}
+
+static inline int is_pse(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, X86_CR4_PSE);
+}
+
+static inline int is_paging(struct kvm_vcpu *vcpu)
+{
+ return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG));
+}
+
+static inline bool is_pae_paging(struct kvm_vcpu *vcpu)
+{
+ return !is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu);
+}
+
+static inline u32 bit(int bitno)
+{
+ return 1 << (bitno & 31);
+}
+
+static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48;
+}
+
+static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt)
+{
+ return (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_LA57) ? 57 : 48;
+}
+
+static inline u64 get_canonical(u64 la, u8 vaddr_bits)
+{
+ return ((int64_t)la << (64 - vaddr_bits)) >> (64 - vaddr_bits);
+}
+
+static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ return get_canonical(la, vcpu_virt_addr_bits(vcpu)) != la;
+#else
+ return false;
+#endif
+}
+
+static inline bool emul_is_noncanonical_address(u64 la,
+ struct x86_emulate_ctxt *ctxt)
+{
+#ifdef CONFIG_X86_64
+ return get_canonical(la, ctxt_virt_addr_bits(ctxt)) != la;
+#else
+ return false;
+#endif
+}
+
+static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
+ gva_t gva, gfn_t gfn, unsigned access)
+{
+ u64 gen = kvm_memslots(vcpu->kvm)->generation;
+
+ if (unlikely(gen & 1))
+ return;
+
+ /*
+ * If this is a shadow nested page table, the "GVA" is
+ * actually a nGPA.
+ */
+ vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 0 : gva & PAGE_MASK;
+ vcpu->arch.access = access;
+ vcpu->arch.mmio_gfn = gfn;
+ vcpu->arch.mmio_gen = gen;
+}
+
+static inline bool vcpu_match_mmio_gen(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.mmio_gen == kvm_memslots(vcpu->kvm)->generation;
+}
+
+/*
+ * Clear the mmio cache info for the given gva. If gva is MMIO_GVA_ANY, we
+ * clear all mmio cache info.
+ */
+#define MMIO_GVA_ANY (~(gva_t)0)
+
+static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ if (gva != MMIO_GVA_ANY && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
+ return;
+
+ vcpu->arch.mmio_gva = 0;
+}
+
+static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
+{
+ if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gva &&
+ vcpu->arch.mmio_gva == (gva & PAGE_MASK))
+ return true;
+
+ return false;
+}
+
+static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gfn &&
+ vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
+ return true;
+
+ return false;
+}
+
+static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ unsigned long val = kvm_register_read(vcpu, reg);
+
+ return is_64_bit_mode(vcpu) ? val : (u32)val;
+}
+
+static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg,
+ unsigned long val)
+{
+ if (!is_64_bit_mode(vcpu))
+ val = (u32)val;
+ return kvm_register_write(vcpu, reg, val);
+}
+
+static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
+{
+ return !(kvm->arch.disabled_quirks & quirk);
+}
+
+void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
+int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
+
+void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
+u64 get_kvmclock_ns(struct kvm *kvm);
+
+int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
+ gva_t addr, void *val, unsigned int bytes,
+ struct x86_exception *exception);
+
+int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu,
+ gva_t addr, void *val, unsigned int bytes,
+ struct x86_exception *exception);
+
+int handle_ud(struct kvm_vcpu *vcpu);
+
+void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
+u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
+bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
+ int page_num);
+bool kvm_vector_hashing_enabled(void);
+int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ int emulation_type, void *insn, int insn_len);
+
+#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
+ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
+ | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
+ | XFEATURE_MASK_PKRU)
+extern u64 host_xcr0;
+
+extern u64 kvm_supported_xcr0(void);
+
+extern unsigned int min_timer_period_us;
+
+extern unsigned int lapic_timer_advance_ns;
+
+extern bool enable_vmware_backdoor;
+
+extern struct static_key kvm_no_apic_vcpu;
+
+static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
+{
+ return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
+ vcpu->arch.virtual_tsc_shift);
+}
+
+/* Same "calling convention" as do_div:
+ * - divide (n << 32) by base
+ * - put result in n
+ * - return remainder
+ */
+#define do_shl32_div32(n, base) \
+ ({ \
+ u32 __quot, __rem; \
+ asm("divl %2" : "=a" (__quot), "=d" (__rem) \
+ : "rm" (base), "0" (0), "1" ((u32) n)); \
+ n = __quot; \
+ __rem; \
+ })
+
+static inline bool kvm_mwait_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.mwait_in_guest;
+}
+
+static inline bool kvm_hlt_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.hlt_in_guest;
+}
+
+static inline bool kvm_pause_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.pause_in_guest;
+}
+
+DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+
+static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu)
+{
+ __this_cpu_write(current_vcpu, vcpu);
+}
+
+static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
+{
+ __this_cpu_write(current_vcpu, NULL);
+}
+
+
+static inline bool kvm_pat_valid(u64 data)
+{
+ if (data & 0xF8F8F8F8F8F8F8F8ull)
+ return false;
+ /* 0, 1, 4, 5, 6, 7 are valid values. */
+ return (data | ((data & 0x0202020202020202ull) << 1)) == data;
+}
+
+void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu);
+void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu);
+
+#endif