diff options
Diffstat (limited to '')
-rw-r--r-- | arch/x86/kvm/vmx/capabilities.h | 404 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/evmcs.c | 509 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/evmcs.h | 234 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 7028 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.h | 292 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/pmu_intel.c | 814 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/posted_intr.c | 351 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/posted_intr.h | 106 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/run_flags.h | 8 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/sgx.c | 498 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/sgx.h | 34 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmcs.h | 193 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmcs12.c | 154 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmcs12.h | 430 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmcs_shadow_fields.h | 79 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmenter.S | 352 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 8628 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.h | 773 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx_ops.h | 347 |
19 files changed, 21234 insertions, 0 deletions
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h new file mode 100644 index 000000000..cd2ac9536 --- /dev/null +++ b/arch/x86/kvm/vmx/capabilities.h @@ -0,0 +1,404 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_CAPS_H +#define __KVM_X86_VMX_CAPS_H + +#include <asm/vmx.h> + +#include "../lapic.h" +#include "../x86.h" +#include "../pmu.h" +#include "../cpuid.h" + +extern bool __read_mostly enable_vpid; +extern bool __read_mostly flexpriority_enabled; +extern bool __read_mostly enable_ept; +extern bool __read_mostly enable_unrestricted_guest; +extern bool __read_mostly enable_ept_ad_bits; +extern bool __read_mostly enable_pml; +extern bool __read_mostly enable_ipiv; +extern int __read_mostly pt_mode; + +#define PT_MODE_SYSTEM 0 +#define PT_MODE_HOST_GUEST 1 + +#define PMU_CAP_FW_WRITES (1ULL << 13) +#define PMU_CAP_LBR_FMT 0x3f + +struct nested_vmx_msrs { + /* + * We only store the "true" versions of the VMX capability MSRs. We + * generate the "non-true" versions by setting the must-be-1 bits + * according to the SDM. + */ + u32 procbased_ctls_low; + u32 procbased_ctls_high; + u32 secondary_ctls_low; + u32 secondary_ctls_high; + u32 pinbased_ctls_low; + u32 pinbased_ctls_high; + u32 exit_ctls_low; + u32 exit_ctls_high; + u32 entry_ctls_low; + u32 entry_ctls_high; + u32 misc_low; + u32 misc_high; + u32 ept_caps; + u32 vpid_caps; + u64 basic; + u64 cr0_fixed0; + u64 cr0_fixed1; + u64 cr4_fixed0; + u64 cr4_fixed1; + u64 vmcs_enum; + u64 vmfunc_controls; +}; + +struct vmcs_config { + int size; + u32 basic_cap; + u32 revision_id; + u32 pin_based_exec_ctrl; + u32 cpu_based_exec_ctrl; + u32 cpu_based_2nd_exec_ctrl; + u64 cpu_based_3rd_exec_ctrl; + u32 vmexit_ctrl; + u32 vmentry_ctrl; + u64 misc; + struct nested_vmx_msrs nested; +}; +extern struct vmcs_config vmcs_config; + +struct vmx_capability { + u32 ept; + u32 vpid; +}; +extern struct vmx_capability vmx_capability; + +static inline bool cpu_has_vmx_basic_inout(void) +{ + return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT); +} + +static inline bool cpu_has_virtual_nmis(void) +{ + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS && + vmcs_config.cpu_based_exec_ctrl & CPU_BASED_NMI_WINDOW_EXITING; +} + +static inline bool cpu_has_vmx_preemption_timer(void) +{ + return vmcs_config.pin_based_exec_ctrl & + PIN_BASED_VMX_PREEMPTION_TIMER; +} + +static inline bool cpu_has_vmx_posted_intr(void) +{ + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; +} + +static inline bool cpu_has_load_ia32_efer(void) +{ + return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER; +} + +static inline bool cpu_has_load_perf_global_ctrl(void) +{ + return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; +} + +static inline bool cpu_has_vmx_mpx(void) +{ + return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS; +} + +static inline bool cpu_has_vmx_tpr_shadow(void) +{ + return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; +} + +static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu) +{ + return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu); +} + +static inline bool cpu_has_vmx_msr_bitmap(void) +{ + return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; +} + +static inline bool cpu_has_secondary_exec_ctrls(void) +{ + return vmcs_config.cpu_based_exec_ctrl & + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; +} + +static inline bool cpu_has_tertiary_exec_ctrls(void) +{ + return vmcs_config.cpu_based_exec_ctrl & + CPU_BASED_ACTIVATE_TERTIARY_CONTROLS; +} + +static inline bool cpu_has_vmx_virtualize_apic_accesses(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; +} + +static inline bool cpu_has_vmx_ept(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_EPT; +} + +static inline bool vmx_umip_emulated(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_DESC; +} + +static inline bool cpu_has_vmx_rdtscp(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_RDTSCP; +} + +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; +} + +static inline bool cpu_has_vmx_vpid(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_VPID; +} + +static inline bool cpu_has_vmx_wbinvd_exit(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_WBINVD_EXITING; +} + +static inline bool cpu_has_vmx_unrestricted_guest(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_UNRESTRICTED_GUEST; +} + +static inline bool cpu_has_vmx_apic_register_virt(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_APIC_REGISTER_VIRT; +} + +static inline bool cpu_has_vmx_virtual_intr_delivery(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; +} + +static inline bool cpu_has_vmx_ple(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_PAUSE_LOOP_EXITING; +} + +static inline bool cpu_has_vmx_rdrand(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_RDRAND_EXITING; +} + +static inline bool cpu_has_vmx_invpcid(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_INVPCID; +} + +static inline bool cpu_has_vmx_vmfunc(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_VMFUNC; +} + +static inline bool cpu_has_vmx_shadow_vmcs(void) +{ + /* check if the cpu supports writing r/o exit information fields */ + if (!(vmcs_config.misc & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) + return false; + + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_SHADOW_VMCS; +} + +static inline bool cpu_has_vmx_encls_vmexit(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENCLS_EXITING; +} + +static inline bool cpu_has_vmx_rdseed(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_RDSEED_EXITING; +} + +static inline bool cpu_has_vmx_pml(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; +} + +static inline bool cpu_has_vmx_xsaves(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_XSAVES; +} + +static inline bool cpu_has_vmx_waitpkg(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; +} + +static inline bool cpu_has_vmx_tsc_scaling(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_TSC_SCALING; +} + +static inline bool cpu_has_vmx_bus_lock_detection(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_BUS_LOCK_DETECTION; +} + +static inline bool cpu_has_vmx_apicv(void) +{ + return cpu_has_vmx_apic_register_virt() && + cpu_has_vmx_virtual_intr_delivery() && + cpu_has_vmx_posted_intr(); +} + +static inline bool cpu_has_vmx_ipiv(void) +{ + return vmcs_config.cpu_based_3rd_exec_ctrl & TERTIARY_EXEC_IPI_VIRT; +} + +static inline bool cpu_has_vmx_flexpriority(void) +{ + return cpu_has_vmx_tpr_shadow() && + cpu_has_vmx_virtualize_apic_accesses(); +} + +static inline bool cpu_has_vmx_ept_execute_only(void) +{ + return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; +} + +static inline bool cpu_has_vmx_ept_4levels(void) +{ + return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; +} + +static inline bool cpu_has_vmx_ept_5levels(void) +{ + return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT; +} + +static inline bool cpu_has_vmx_ept_mt_wb(void) +{ + return vmx_capability.ept & VMX_EPTP_WB_BIT; +} + +static inline bool cpu_has_vmx_ept_2m_page(void) +{ + return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; +} + +static inline bool cpu_has_vmx_ept_1g_page(void) +{ + return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; +} + +static inline int ept_caps_to_lpage_level(u32 ept_caps) +{ + if (ept_caps & VMX_EPT_1GB_PAGE_BIT) + return PG_LEVEL_1G; + if (ept_caps & VMX_EPT_2MB_PAGE_BIT) + return PG_LEVEL_2M; + return PG_LEVEL_4K; +} + +static inline bool cpu_has_vmx_ept_ad_bits(void) +{ + return vmx_capability.ept & VMX_EPT_AD_BIT; +} + +static inline bool cpu_has_vmx_invept_context(void) +{ + return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; +} + +static inline bool cpu_has_vmx_invept_global(void) +{ + return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; +} + +static inline bool cpu_has_vmx_invvpid(void) +{ + return vmx_capability.vpid & VMX_VPID_INVVPID_BIT; +} + +static inline bool cpu_has_vmx_invvpid_individual_addr(void) +{ + return vmx_capability.vpid & VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT; +} + +static inline bool cpu_has_vmx_invvpid_single(void) +{ + return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; +} + +static inline bool cpu_has_vmx_invvpid_global(void) +{ + return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; +} + +static inline bool cpu_has_vmx_intel_pt(void) +{ + return (vmcs_config.misc & MSR_IA32_VMX_MISC_INTEL_PT) && + (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) && + (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL); +} + +/* + * Processor Trace can operate in one of three modes: + * a. system-wide: trace both host/guest and output to host buffer + * b. host-only: only trace host and output to host buffer + * c. host-guest: trace host and guest simultaneously and output to their + * respective buffer + * + * KVM currently only supports (a) and (c). + */ +static inline bool vmx_pt_mode_is_system(void) +{ + return pt_mode == PT_MODE_SYSTEM; +} +static inline bool vmx_pt_mode_is_host_guest(void) +{ + return pt_mode == PT_MODE_HOST_GUEST; +} + +static inline bool vmx_pebs_supported(void) +{ + return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept; +} + +static inline bool cpu_has_notify_vmexit(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_NOTIFY_VM_EXITING; +} + +#endif /* __KVM_X86_VMX_CAPS_H */ diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c new file mode 100644 index 000000000..d8b23c96d --- /dev/null +++ b/arch/x86/kvm/vmx/evmcs.c @@ -0,0 +1,509 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/errno.h> +#include <linux/smp.h> + +#include "../hyperv.h" +#include "../cpuid.h" +#include "evmcs.h" +#include "vmcs.h" +#include "vmx.h" +#include "trace.h" + +#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK + +DEFINE_STATIC_KEY_FALSE(enable_evmcs); + +#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) +#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ + {EVMCS1_OFFSET(name), clean_field} + +const struct evmcs_field vmcs_field_to_evmcs_1[] = { + /* 64 bit rw */ + EVMCS1_FIELD(GUEST_RIP, guest_rip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(GUEST_RSP, guest_rsp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR0, host_cr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR3, host_cr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CR4, host_cr4, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_RIP, host_rip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), + EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP), + EVMCS1_FIELD(MSR_BITMAP, msr_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP), + EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(TSC_OFFSET, tsc_offset, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PDPTR3, guest_pdptr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR0, guest_cr0, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR3, guest_cr3, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_CR4, guest_cr4, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(GUEST_DR7, guest_dr7, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR), + EVMCS1_FIELD(HOST_FS_BASE, host_fs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_GS_BASE, host_gs_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_TR_BASE, host_tr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(HOST_RSP, host_rsp, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER), + EVMCS1_FIELD(EPT_POINTER, ept_pointer, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), + EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(ENCLS_EXITING_BITMAP, encls_exiting_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(TSC_MULTIPLIER, tsc_multiplier, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + /* + * Not used by KVM: + * + * EVMCS1_FIELD(0x00006828, guest_ia32_s_cet, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + * EVMCS1_FIELD(0x0000682A, guest_ssp, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + * EVMCS1_FIELD(0x0000682C, guest_ia32_int_ssp_table_addr, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + * EVMCS1_FIELD(0x00002816, guest_ia32_lbr_ctl, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + * EVMCS1_FIELD(0x00006C18, host_ia32_s_cet, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + * EVMCS1_FIELD(0x00006C1A, host_ssp, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + * EVMCS1_FIELD(0x00006C1C, host_ia32_int_ssp_table_addr, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + */ + + /* 64 bit read only */ + EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + /* + * Not defined in KVM: + * + * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006406, exit_io_instruction_esi, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE); + */ + EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + + /* + * No mask defined in the spec as Hyper-V doesn't currently support + * these. Future proof by resetting the whole clean field mask on + * access. + */ + EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + + /* 32 bit rw */ + EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC), + EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN), + EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY), + EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, + vm_entry_exception_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT), + EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1), + EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + + /* 32 bit read only */ + EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), + + /* No mask defined in the spec (not used) */ + EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), + + /* 16 bit rw */ + EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_FS_SELECTOR, host_fs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2), + EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT), +}; +const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); + +bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa) +{ + struct hv_vp_assist_page assist_page; + + *evmcs_gpa = -1ull; + + if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page))) + return false; + + if (unlikely(!assist_page.enlighten_vmentry)) + return false; + + if (unlikely(!evmptr_is_valid(assist_page.current_nested_vmcs))) + return false; + + *evmcs_gpa = assist_page.current_nested_vmcs; + + return true; +} + +uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu) +{ + /* + * vmcs_version represents the range of supported Enlightened VMCS + * versions: lower 8 bits is the minimal version, higher 8 bits is the + * maximum supported version. KVM supports versions from 1 to + * KVM_EVMCS_VERSION. + * + * Note, do not check the Hyper-V is fully enabled in guest CPUID, this + * helper is used to _get_ the vCPU's supported CPUID. + */ + if (kvm_cpu_cap_get(X86_FEATURE_VMX) && + (!vcpu || to_vmx(vcpu)->nested.enlightened_vmcs_enabled)) + return (KVM_EVMCS_VERSION << 8) | 1; + + return 0; +} + +enum evmcs_revision { + EVMCSv1_LEGACY, + NR_EVMCS_REVISIONS, +}; + +enum evmcs_ctrl_type { + EVMCS_EXIT_CTRLS, + EVMCS_ENTRY_CTRLS, + EVMCS_2NDEXEC, + EVMCS_PINCTRL, + EVMCS_VMFUNC, + NR_EVMCS_CTRLS, +}; + +static const u32 evmcs_unsupported_ctrls[NR_EVMCS_CTRLS][NR_EVMCS_REVISIONS] = { + [EVMCS_EXIT_CTRLS] = { + [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMEXIT_CTRL, + }, + [EVMCS_ENTRY_CTRLS] = { + [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMENTRY_CTRL, + }, + [EVMCS_2NDEXEC] = { + [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_2NDEXEC, + }, + [EVMCS_PINCTRL] = { + [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_PINCTRL, + }, + [EVMCS_VMFUNC] = { + [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMFUNC, + }, +}; + +static u32 evmcs_get_unsupported_ctls(enum evmcs_ctrl_type ctrl_type) +{ + enum evmcs_revision evmcs_rev = EVMCSv1_LEGACY; + + return evmcs_unsupported_ctrls[ctrl_type][evmcs_rev]; +} + +static bool evmcs_has_perf_global_ctrl(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + /* + * PERF_GLOBAL_CTRL has a quirk where some Windows guests may fail to + * boot if a PV CPUID feature flag is not also set. Treat the fields + * as unsupported if the flag is not set in guest CPUID. This should + * be called only for guest accesses, and all guest accesses should be + * gated on Hyper-V being enabled and initialized. + */ + if (WARN_ON_ONCE(!hv_vcpu)) + return false; + + return hv_vcpu->cpuid_cache.nested_ebx & HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL; +} + +void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +{ + u32 ctl_low = (u32)*pdata; + u32 ctl_high = (u32)(*pdata >> 32); + u32 unsupported_ctrls; + + /* + * Hyper-V 2016 and 2019 try using these features even when eVMCS + * is enabled but there are no corresponding fields. + */ + switch (msr_index) { + case MSR_IA32_VMX_EXIT_CTLS: + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + unsupported_ctrls = evmcs_get_unsupported_ctls(EVMCS_EXIT_CTRLS); + if (!evmcs_has_perf_global_ctrl(vcpu)) + unsupported_ctrls |= VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + ctl_high &= ~unsupported_ctrls; + break; + case MSR_IA32_VMX_ENTRY_CTLS: + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + unsupported_ctrls = evmcs_get_unsupported_ctls(EVMCS_ENTRY_CTRLS); + if (!evmcs_has_perf_global_ctrl(vcpu)) + unsupported_ctrls |= VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + ctl_high &= ~unsupported_ctrls; + break; + case MSR_IA32_VMX_PROCBASED_CTLS2: + ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_2NDEXEC); + break; + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: + case MSR_IA32_VMX_PINBASED_CTLS: + ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_PINCTRL); + break; + case MSR_IA32_VMX_VMFUNC: + ctl_low &= ~evmcs_get_unsupported_ctls(EVMCS_VMFUNC); + break; + } + + *pdata = ctl_low | ((u64)ctl_high << 32); +} + +static bool nested_evmcs_is_valid_controls(enum evmcs_ctrl_type ctrl_type, + u32 val) +{ + return !(val & evmcs_get_unsupported_ctls(ctrl_type)); +} + +int nested_evmcs_check_controls(struct vmcs12 *vmcs12) +{ + if (CC(!nested_evmcs_is_valid_controls(EVMCS_PINCTRL, + vmcs12->pin_based_vm_exec_control))) + return -EINVAL; + + if (CC(!nested_evmcs_is_valid_controls(EVMCS_2NDEXEC, + vmcs12->secondary_vm_exec_control))) + return -EINVAL; + + if (CC(!nested_evmcs_is_valid_controls(EVMCS_EXIT_CTRLS, + vmcs12->vm_exit_controls))) + return -EINVAL; + + if (CC(!nested_evmcs_is_valid_controls(EVMCS_ENTRY_CTRLS, + vmcs12->vm_entry_controls))) + return -EINVAL; + + /* + * VM-Func controls are 64-bit, but KVM currently doesn't support any + * controls in bits 63:32, i.e. dropping those bits on the consistency + * check is intentional. + */ + if (WARN_ON_ONCE(vmcs12->vm_function_control >> 32)) + return -EINVAL; + + if (CC(!nested_evmcs_is_valid_controls(EVMCS_VMFUNC, + vmcs12->vm_function_control))) + return -EINVAL; + + return 0; +} + +int nested_enable_evmcs(struct kvm_vcpu *vcpu, + uint16_t *vmcs_version) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + vmx->nested.enlightened_vmcs_enabled = true; + + if (vmcs_version) + *vmcs_version = nested_get_evmcs_version(vcpu); + + return 0; +} diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h new file mode 100644 index 000000000..1bc4e8408 --- /dev/null +++ b/arch/x86/kvm/vmx/evmcs.h @@ -0,0 +1,234 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_EVMCS_H +#define __KVM_X86_VMX_EVMCS_H + +#include <linux/jump_label.h> + +#include <asm/hyperv-tlfs.h> +#include <asm/mshyperv.h> +#include <asm/vmx.h> + +#include "capabilities.h" +#include "vmcs.h" +#include "vmcs12.h" + +struct vmcs_config; + +DECLARE_STATIC_KEY_FALSE(enable_evmcs); + +#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs)) + +#define KVM_EVMCS_VERSION 1 + +/* + * Enlightened VMCSv1 doesn't support these: + * + * POSTED_INTR_NV = 0x00000002, + * GUEST_INTR_STATUS = 0x00000810, + * APIC_ACCESS_ADDR = 0x00002014, + * POSTED_INTR_DESC_ADDR = 0x00002016, + * EOI_EXIT_BITMAP0 = 0x0000201c, + * EOI_EXIT_BITMAP1 = 0x0000201e, + * EOI_EXIT_BITMAP2 = 0x00002020, + * EOI_EXIT_BITMAP3 = 0x00002022, + * GUEST_PML_INDEX = 0x00000812, + * PML_ADDRESS = 0x0000200e, + * VM_FUNCTION_CONTROL = 0x00002018, + * EPTP_LIST_ADDRESS = 0x00002024, + * VMREAD_BITMAP = 0x00002026, + * VMWRITE_BITMAP = 0x00002028, + * + * TSC_MULTIPLIER = 0x00002032, + * PLE_GAP = 0x00004020, + * PLE_WINDOW = 0x00004022, + * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, + * + * Currently unsupported in KVM: + * GUEST_IA32_RTIT_CTL = 0x00002814, + */ +#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \ + PIN_BASED_VMX_PREEMPTION_TIMER) +#define EVMCS1_UNSUPPORTED_EXEC_CTRL (CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) +#define EVMCS1_UNSUPPORTED_2NDEXEC \ + (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ + SECONDARY_EXEC_APIC_REGISTER_VIRT | \ + SECONDARY_EXEC_ENABLE_PML | \ + SECONDARY_EXEC_ENABLE_VMFUNC | \ + SECONDARY_EXEC_SHADOW_VMCS | \ + SECONDARY_EXEC_TSC_SCALING | \ + SECONDARY_EXEC_PAUSE_LOOP_EXITING) +#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL \ + (VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) +#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (0) +#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING) + +struct evmcs_field { + u16 offset; + u16 clean_field; +}; + +extern const struct evmcs_field vmcs_field_to_evmcs_1[]; +extern const unsigned int nr_evmcs_1_fields; + +static __always_inline int evmcs_field_offset(unsigned long field, + u16 *clean_field) +{ + unsigned int index = ROL16(field, 6); + const struct evmcs_field *evmcs_field; + + if (unlikely(index >= nr_evmcs_1_fields)) + return -ENOENT; + + evmcs_field = &vmcs_field_to_evmcs_1[index]; + + /* + * Use offset=0 to detect holes in eVMCS. This offset belongs to + * 'revision_id' but this field has no encoding and is supposed to + * be accessed directly. + */ + if (unlikely(!evmcs_field->offset)) + return -ENOENT; + + if (clean_field) + *clean_field = evmcs_field->clean_field; + + return evmcs_field->offset; +} + +static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs, + unsigned long field, u16 offset) +{ + /* + * vmcs12_read_any() doesn't care whether the supplied structure + * is 'struct vmcs12' or 'struct hv_enlightened_vmcs' as it takes + * the exact offset of the required field, use it for convenience + * here. + */ + return vmcs12_read_any((void *)evmcs, field, offset); +} + +#if IS_ENABLED(CONFIG_HYPERV) + +static __always_inline int get_evmcs_offset(unsigned long field, + u16 *clean_field) +{ + int offset = evmcs_field_offset(field, clean_field); + + WARN_ONCE(offset < 0, "KVM: accessing unsupported EVMCS field %lx\n", + field); + + return offset; +} + +static __always_inline void evmcs_write64(unsigned long field, u64 value) +{ + u16 clean_field; + int offset = get_evmcs_offset(field, &clean_field); + + if (offset < 0) + return; + + *(u64 *)((char *)current_evmcs + offset) = value; + + current_evmcs->hv_clean_fields &= ~clean_field; +} + +static inline void evmcs_write32(unsigned long field, u32 value) +{ + u16 clean_field; + int offset = get_evmcs_offset(field, &clean_field); + + if (offset < 0) + return; + + *(u32 *)((char *)current_evmcs + offset) = value; + current_evmcs->hv_clean_fields &= ~clean_field; +} + +static inline void evmcs_write16(unsigned long field, u16 value) +{ + u16 clean_field; + int offset = get_evmcs_offset(field, &clean_field); + + if (offset < 0) + return; + + *(u16 *)((char *)current_evmcs + offset) = value; + current_evmcs->hv_clean_fields &= ~clean_field; +} + +static inline u64 evmcs_read64(unsigned long field) +{ + int offset = get_evmcs_offset(field, NULL); + + if (offset < 0) + return 0; + + return *(u64 *)((char *)current_evmcs + offset); +} + +static inline u32 evmcs_read32(unsigned long field) +{ + int offset = get_evmcs_offset(field, NULL); + + if (offset < 0) + return 0; + + return *(u32 *)((char *)current_evmcs + offset); +} + +static inline u16 evmcs_read16(unsigned long field) +{ + int offset = get_evmcs_offset(field, NULL); + + if (offset < 0) + return 0; + + return *(u16 *)((char *)current_evmcs + offset); +} + +static inline void evmcs_load(u64 phys_addr) +{ + struct hv_vp_assist_page *vp_ap = + hv_get_vp_assist_page(smp_processor_id()); + + if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall) + vp_ap->nested_control.features.directhypercall = 1; + vp_ap->current_nested_vmcs = phys_addr; + vp_ap->enlighten_vmentry = 1; +} + +#else /* !IS_ENABLED(CONFIG_HYPERV) */ +static __always_inline void evmcs_write64(unsigned long field, u64 value) {} +static inline void evmcs_write32(unsigned long field, u32 value) {} +static inline void evmcs_write16(unsigned long field, u16 value) {} +static inline u64 evmcs_read64(unsigned long field) { return 0; } +static inline u32 evmcs_read32(unsigned long field) { return 0; } +static inline u16 evmcs_read16(unsigned long field) { return 0; } +static inline void evmcs_load(u64 phys_addr) {} +#endif /* IS_ENABLED(CONFIG_HYPERV) */ + +#define EVMPTR_INVALID (-1ULL) +#define EVMPTR_MAP_PENDING (-2ULL) + +static inline bool evmptr_is_valid(u64 evmptr) +{ + return evmptr != EVMPTR_INVALID && evmptr != EVMPTR_MAP_PENDING; +} + +enum nested_evmptrld_status { + EVMPTRLD_DISABLED, + EVMPTRLD_SUCCEEDED, + EVMPTRLD_VMFAIL, + EVMPTRLD_ERROR, +}; + +bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa); +uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu); +int nested_enable_evmcs(struct kvm_vcpu *vcpu, + uint16_t *vmcs_version); +void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); +int nested_evmcs_check_controls(struct vmcs12 *vmcs12); + +#endif /* __KVM_X86_VMX_EVMCS_H */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c new file mode 100644 index 000000000..9d683b606 --- /dev/null +++ b/arch/x86/kvm/vmx/nested.c @@ -0,0 +1,7028 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/objtool.h> +#include <linux/percpu.h> + +#include <asm/debugreg.h> +#include <asm/mmu_context.h> + +#include "cpuid.h" +#include "evmcs.h" +#include "hyperv.h" +#include "mmu.h" +#include "nested.h" +#include "pmu.h" +#include "sgx.h" +#include "trace.h" +#include "vmx.h" +#include "x86.h" + +static bool __read_mostly enable_shadow_vmcs = 1; +module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); + +static bool __read_mostly nested_early_check = 0; +module_param(nested_early_check, bool, S_IRUGO); + +#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK + +/* + * Hyper-V requires all of these, so mark them as supported even though + * they are just treated the same as all-context. + */ +#define VMX_VPID_EXTENT_SUPPORTED_MASK \ + (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ + VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ + VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ + VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) + +#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 + +enum { + VMX_VMREAD_BITMAP, + VMX_VMWRITE_BITMAP, + VMX_BITMAP_NR +}; +static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; + +#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) +#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) + +struct shadow_vmcs_field { + u16 encoding; + u16 offset; +}; +static struct shadow_vmcs_field shadow_read_only_fields[] = { +#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) }, +#include "vmcs_shadow_fields.h" +}; +static int max_shadow_read_only_fields = + ARRAY_SIZE(shadow_read_only_fields); + +static struct shadow_vmcs_field shadow_read_write_fields[] = { +#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, +#include "vmcs_shadow_fields.h" +}; +static int max_shadow_read_write_fields = + ARRAY_SIZE(shadow_read_write_fields); + +static void init_vmcs_shadow_fields(void) +{ + int i, j; + + memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); + memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); + + for (i = j = 0; i < max_shadow_read_only_fields; i++) { + struct shadow_vmcs_field entry = shadow_read_only_fields[i]; + u16 field = entry.encoding; + + if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && + (i + 1 == max_shadow_read_only_fields || + shadow_read_only_fields[i + 1].encoding != field + 1)) + pr_err("Missing field from shadow_read_only_field %x\n", + field + 1); + + clear_bit(field, vmx_vmread_bitmap); + if (field & 1) +#ifdef CONFIG_X86_64 + continue; +#else + entry.offset += sizeof(u32); +#endif + shadow_read_only_fields[j++] = entry; + } + max_shadow_read_only_fields = j; + + for (i = j = 0; i < max_shadow_read_write_fields; i++) { + struct shadow_vmcs_field entry = shadow_read_write_fields[i]; + u16 field = entry.encoding; + + if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && + (i + 1 == max_shadow_read_write_fields || + shadow_read_write_fields[i + 1].encoding != field + 1)) + pr_err("Missing field from shadow_read_write_field %x\n", + field + 1); + + WARN_ONCE(field >= GUEST_ES_AR_BYTES && + field <= GUEST_TR_AR_BYTES, + "Update vmcs12_write_any() to drop reserved bits from AR_BYTES"); + + /* + * PML and the preemption timer can be emulated, but the + * processor cannot vmwrite to fields that don't exist + * on bare metal. + */ + switch (field) { + case GUEST_PML_INDEX: + if (!cpu_has_vmx_pml()) + continue; + break; + case VMX_PREEMPTION_TIMER_VALUE: + if (!cpu_has_vmx_preemption_timer()) + continue; + break; + case GUEST_INTR_STATUS: + if (!cpu_has_vmx_apicv()) + continue; + break; + default: + break; + } + + clear_bit(field, vmx_vmwrite_bitmap); + clear_bit(field, vmx_vmread_bitmap); + if (field & 1) +#ifdef CONFIG_X86_64 + continue; +#else + entry.offset += sizeof(u32); +#endif + shadow_read_write_fields[j++] = entry; + } + max_shadow_read_write_fields = j; +} + +/* + * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), + * set the success or error code of an emulated VMX instruction (as specified + * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated + * instruction. + */ +static int nested_vmx_succeed(struct kvm_vcpu *vcpu) +{ + vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); + return kvm_skip_emulated_instruction(vcpu); +} + +static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) +{ + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | + X86_EFLAGS_SF | X86_EFLAGS_OF)) + | X86_EFLAGS_CF); + return kvm_skip_emulated_instruction(vcpu); +} + +static int nested_vmx_failValid(struct kvm_vcpu *vcpu, + u32 vm_instruction_error) +{ + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_SF | X86_EFLAGS_OF)) + | X86_EFLAGS_ZF); + get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; + /* + * We don't need to force sync to shadow VMCS because + * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all + * fields and thus must be synced. + */ + if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID) + to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true; + + return kvm_skip_emulated_instruction(vcpu); +} + +static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * failValid writes the error number to the current VMCS, which + * can't be done if there isn't a current VMCS. + */ + if (vmx->nested.current_vmptr == INVALID_GPA && + !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + return nested_vmx_failInvalid(vcpu); + + return nested_vmx_failValid(vcpu, vm_instruction_error); +} + +static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) +{ + /* TODO: not to reset guest simply here. */ + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); +} + +static inline bool vmx_control_verify(u32 control, u32 low, u32 high) +{ + return fixed_bits_valid(control, low, high); +} + +static inline u64 vmx_control_msr(u32 low, u32 high) +{ + return low | ((u64)high << 32); +} + +static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) +{ + secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); + vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); + vmx->nested.need_vmcs12_to_shadow_sync = false; +} + +static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { + kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); + vmx->nested.hv_evmcs = NULL; + } + + vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; +} + +static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, + struct loaded_vmcs *prev) +{ + struct vmcs_host_state *dest, *src; + + if (unlikely(!vmx->guest_state_loaded)) + return; + + src = &prev->host_state; + dest = &vmx->loaded_vmcs->host_state; + + vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); + dest->ldt_sel = src->ldt_sel; +#ifdef CONFIG_X86_64 + dest->ds_sel = src->ds_sel; + dest->es_sel = src->es_sel; +#endif +} + +static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct loaded_vmcs *prev; + int cpu; + + if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs)) + return; + + cpu = get_cpu(); + prev = vmx->loaded_vmcs; + vmx->loaded_vmcs = vmcs; + vmx_vcpu_load_vmcs(vcpu, cpu, prev); + vmx_sync_vmcs_host_state(vmx, prev); + put_cpu(); + + vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET; + + /* + * All lazily updated registers will be reloaded from VMCS12 on both + * vmentry and vmexit. + */ + vcpu->arch.regs_dirty = 0; +} + +/* + * Free whatever needs to be freed from vmx->nested when L1 goes down, or + * just stops using VMX. + */ +static void free_nested(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01)) + vmx_switch_vmcs(vcpu, &vmx->vmcs01); + + if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) + return; + + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); + + vmx->nested.vmxon = false; + vmx->nested.smm.vmxon = false; + vmx->nested.vmxon_ptr = INVALID_GPA; + free_vpid(vmx->nested.vpid02); + vmx->nested.posted_intr_nv = -1; + vmx->nested.current_vmptr = INVALID_GPA; + if (enable_shadow_vmcs) { + vmx_disable_shadow_vmcs(vmx); + vmcs_clear(vmx->vmcs01.shadow_vmcs); + free_vmcs(vmx->vmcs01.shadow_vmcs); + vmx->vmcs01.shadow_vmcs = NULL; + } + kfree(vmx->nested.cached_vmcs12); + vmx->nested.cached_vmcs12 = NULL; + kfree(vmx->nested.cached_shadow_vmcs12); + vmx->nested.cached_shadow_vmcs12 = NULL; + /* + * Unpin physical memory we referred to in the vmcs02. The APIC access + * page's backing page (yeah, confusing) shouldn't actually be accessed, + * and if it is written, the contents are irrelevant. + */ + kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); + kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); + kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); + vmx->nested.pi_desc = NULL; + + kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); + + nested_release_evmcs(vcpu); + + free_loaded_vmcs(&vmx->nested.vmcs02); +} + +/* + * Ensure that the current vmcs of the logical processor is the + * vmcs01 of the vcpu before calling free_nested(). + */ +void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) +{ + vcpu_load(vcpu); + vmx_leave_nested(vcpu); + vcpu_put(vcpu); +} + +#define EPTP_PA_MASK GENMASK_ULL(51, 12) + +static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) +{ + return VALID_PAGE(root_hpa) && + ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); +} + +static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, + gpa_t addr) +{ + uint i; + struct kvm_mmu_root_info *cached_root; + + WARN_ON_ONCE(!mmu_is_nested(vcpu)); + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + cached_root = &vcpu->arch.mmu->prev_roots[i]; + + if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, + eptp)) + vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa); + } +} + +static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 vm_exit_reason; + unsigned long exit_qualification = vcpu->arch.exit_qualification; + + if (vmx->nested.pml_full) { + vm_exit_reason = EXIT_REASON_PML_FULL; + vmx->nested.pml_full = false; + exit_qualification &= INTR_INFO_UNBLOCK_NMI; + } else { + if (fault->error_code & PFERR_RSVD_MASK) + vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; + else + vm_exit_reason = EXIT_REASON_EPT_VIOLATION; + + /* + * Although the caller (kvm_inject_emulated_page_fault) would + * have already synced the faulting address in the shadow EPT + * tables for the current EPTP12, we also need to sync it for + * any other cached EPTP02s based on the same EP4TA, since the + * TLB associates mappings to the EP4TA rather than the full EPTP. + */ + nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer, + fault->address); + } + + nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); + vmcs12->guest_physical_address = fault->address; +} + +static void nested_ept_new_eptp(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT; + int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps); + + kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level, + nested_ept_ad_enabled(vcpu), + nested_ept_get_eptp(vcpu)); +} + +static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) +{ + WARN_ON(mmu_is_nested(vcpu)); + + vcpu->arch.mmu = &vcpu->arch.guest_mmu; + nested_ept_new_eptp(vcpu); + vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp; + vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; + vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; + + vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; +} + +static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) +{ + vcpu->arch.mmu = &vcpu->arch.root_mmu; + vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; +} + +static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, + u16 error_code) +{ + bool inequality, bit; + + bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; + inequality = + (error_code & vmcs12->page_fault_error_code_mask) != + vmcs12->page_fault_error_code_match; + return inequality ^ bit; +} + +static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector, + u32 error_code) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + /* + * Drop bits 31:16 of the error code when performing the #PF mask+match + * check. All VMCS fields involved are 32 bits, but Intel CPUs never + * set bits 31:16 and VMX disallows setting bits 31:16 in the injected + * error code. Including the to-be-dropped bits in the check might + * result in an "impossible" or missed exit from L1's perspective. + */ + if (vector == PF_VECTOR) + return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code); + + return (vmcs12->exception_bitmap & (1u << vector)); +} + +static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + return 0; + + if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) || + CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b))) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) + return 0; + + if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap))) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) + return 0; + + if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) + return -EINVAL; + + return 0; +} + +/* + * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1 + * itself utilizing x2APIC. All MSRs were previously set to be intercepted, + * only the "disable intercept" case needs to be handled. + */ +static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1, + unsigned long *msr_bitmap_l0, + u32 msr, int type) +{ + if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr)) + vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr); + + if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr)) + vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr); +} + +static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) +{ + int msr; + + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { + unsigned word = msr / BITS_PER_LONG; + + msr_bitmap[word] = ~0; + msr_bitmap[word + (0x800 / sizeof(long))] = ~0; + } +} + +#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \ +static inline \ +void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \ + unsigned long *msr_bitmap_l1, \ + unsigned long *msr_bitmap_l0, u32 msr) \ +{ \ + if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \ + vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \ + vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \ + else \ + vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \ +} +BUILD_NVMX_MSR_INTERCEPT_HELPER(read) +BUILD_NVMX_MSR_INTERCEPT_HELPER(write) + +static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, + unsigned long *msr_bitmap_l1, + unsigned long *msr_bitmap_l0, + u32 msr, int types) +{ + if (types & MSR_TYPE_R) + nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1, + msr_bitmap_l0, msr); + if (types & MSR_TYPE_W) + nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1, + msr_bitmap_l0, msr); +} + +/* + * Merge L0's and L1's MSR bitmap, return false to indicate that + * we do not use the hardware. + */ +static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int msr; + unsigned long *msr_bitmap_l1; + unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; + struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; + struct kvm_host_map *map = &vmx->nested.msr_bitmap_map; + + /* Nothing to do if the MSR bitmap is not in use. */ + if (!cpu_has_vmx_msr_bitmap() || + !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) + return false; + + /* + * MSR bitmap update can be skipped when: + * - MSR bitmap for L1 hasn't changed. + * - Nested hypervisor (L1) is attempting to launch the same L2 as + * before. + * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature + * and tells KVM (L0) there were no changes in MSR bitmap for L2. + */ + if (!vmx->nested.force_msr_bitmap_recalc && evmcs && + evmcs->hv_enlightenments_control.msr_bitmap && + evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) + return true; + + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) + return false; + + msr_bitmap_l1 = (unsigned long *)map->hva; + + /* + * To keep the control flow simple, pay eight 8-byte writes (sixteen + * 4-byte writes on 32-bit systems) up front to enable intercepts for + * the x2APIC MSR range and selectively toggle those relevant to L2. + */ + enable_x2apic_msr_intercepts(msr_bitmap_l0); + + if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { + if (nested_cpu_has_apic_reg_virt(vmcs12)) { + /* + * L0 need not intercept reads for MSRs between 0x800 + * and 0x8ff, it just lets the processor take the value + * from the virtual-APIC page; take those 256 bits + * directly from the L1 bitmap. + */ + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { + unsigned word = msr / BITS_PER_LONG; + + msr_bitmap_l0[word] = msr_bitmap_l1[word]; + } + } + + nested_vmx_disable_intercept_for_x2apic_msr( + msr_bitmap_l1, msr_bitmap_l0, + X2APIC_MSR(APIC_TASKPRI), + MSR_TYPE_R | MSR_TYPE_W); + + if (nested_cpu_has_vid(vmcs12)) { + nested_vmx_disable_intercept_for_x2apic_msr( + msr_bitmap_l1, msr_bitmap_l0, + X2APIC_MSR(APIC_EOI), + MSR_TYPE_W); + nested_vmx_disable_intercept_for_x2apic_msr( + msr_bitmap_l1, msr_bitmap_l0, + X2APIC_MSR(APIC_SELF_IPI), + MSR_TYPE_W); + } + } + + /* + * Always check vmcs01's bitmap to honor userspace MSR filters and any + * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. + */ +#ifdef CONFIG_X86_64 + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_FS_BASE, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_GS_BASE, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_KERNEL_GS_BASE, MSR_TYPE_RW); +#endif + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PRED_CMD, MSR_TYPE_W); + + kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false); + + vmx->nested.force_msr_bitmap_recalc = false; + + return true; +} + +static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; + + if (!nested_cpu_has_shadow_vmcs(vmcs12) || + vmcs12->vmcs_link_pointer == INVALID_GPA) + return; + + if (ghc->gpa != vmcs12->vmcs_link_pointer && + kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, + vmcs12->vmcs_link_pointer, VMCS12_SIZE)) + return; + + kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), + VMCS12_SIZE); +} + +static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; + + if (!nested_cpu_has_shadow_vmcs(vmcs12) || + vmcs12->vmcs_link_pointer == INVALID_GPA) + return; + + if (ghc->gpa != vmcs12->vmcs_link_pointer && + kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, + vmcs12->vmcs_link_pointer, VMCS12_SIZE)) + return; + + kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), + VMCS12_SIZE); +} + +/* + * In nested virtualization, check if L1 has set + * VM_EXIT_ACK_INTR_ON_EXIT + */ +static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) +{ + return get_vmcs12(vcpu)->vm_exit_controls & + VM_EXIT_ACK_INTR_ON_EXIT; +} + +static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && + CC(!page_address_valid(vcpu, vmcs12->apic_access_addr))) + return -EINVAL; + else + return 0; +} + +static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && + !nested_cpu_has_apic_reg_virt(vmcs12) && + !nested_cpu_has_vid(vmcs12) && + !nested_cpu_has_posted_intr(vmcs12)) + return 0; + + /* + * If virtualize x2apic mode is enabled, + * virtualize apic access must be disabled. + */ + if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))) + return -EINVAL; + + /* + * If virtual interrupt delivery is enabled, + * we must exit on external interrupts. + */ + if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu))) + return -EINVAL; + + /* + * bits 15:8 should be zero in posted_intr_nv, + * the descriptor address has been already checked + * in nested_get_vmcs12_pages. + * + * bits 5:0 of posted_intr_desc_addr should be zero. + */ + if (nested_cpu_has_posted_intr(vmcs12) && + (CC(!nested_cpu_has_vid(vmcs12)) || + CC(!nested_exit_intr_ack_set(vcpu)) || + CC((vmcs12->posted_intr_nv & 0xff00)) || + CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64)))) + return -EINVAL; + + /* tpr shadow is needed by all apicv features. */ + if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, + u32 count, u64 addr) +{ + if (count == 0) + return 0; + + if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || + !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (CC(nested_vmx_check_msr_switch(vcpu, + vmcs12->vm_exit_msr_load_count, + vmcs12->vm_exit_msr_load_addr)) || + CC(nested_vmx_check_msr_switch(vcpu, + vmcs12->vm_exit_msr_store_count, + vmcs12->vm_exit_msr_store_addr))) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (CC(nested_vmx_check_msr_switch(vcpu, + vmcs12->vm_entry_msr_load_count, + vmcs12->vm_entry_msr_load_addr))) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_pml(vmcs12)) + return 0; + + if (CC(!nested_cpu_has_ept(vmcs12)) || + CC(!page_address_valid(vcpu, vmcs12->pml_address))) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && + !nested_cpu_has_ept(vmcs12))) + return -EINVAL; + return 0; +} + +static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && + !nested_cpu_has_ept(vmcs12))) + return -EINVAL; + return 0; +} + +static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_shadow_vmcs(vmcs12)) + return 0; + + if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) || + CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap))) + return -EINVAL; + + return 0; +} + +static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + /* x2APIC MSR accesses are not allowed */ + if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)) + return -EINVAL; + if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */ + CC(e->index == MSR_IA32_UCODE_REV)) + return -EINVAL; + if (CC(e->reserved != 0)) + return -EINVAL; + return 0; +} + +static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + if (CC(e->index == MSR_FS_BASE) || + CC(e->index == MSR_GS_BASE) || + CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */ + nested_vmx_msr_check_common(vcpu, e)) + return -EINVAL; + return 0; +} + +static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */ + nested_vmx_msr_check_common(vcpu, e)) + return -EINVAL; + return 0; +} + +static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, + vmx->nested.msrs.misc_high); + + return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; +} + +/* + * Load guest's/host's msr at nested entry/exit. + * return 0 for success, entry index for failure. + * + * One of the failure modes for MSR load/store is when a list exceeds the + * virtual hardware's capacity. To maintain compatibility with hardware inasmuch + * as possible, process all valid entries before failing rather than precheck + * for a capacity violation. + */ +static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) +{ + u32 i; + struct vmx_msr_entry e; + u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); + + for (i = 0; i < count; i++) { + if (unlikely(i >= max_msr_list_size)) + goto fail; + + if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), + &e, sizeof(e))) { + pr_debug_ratelimited( + "%s cannot read MSR entry (%u, 0x%08llx)\n", + __func__, i, gpa + i * sizeof(e)); + goto fail; + } + if (nested_vmx_load_msr_check(vcpu, &e)) { + pr_debug_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, i, e.index, e.reserved); + goto fail; + } + if (kvm_set_msr(vcpu, e.index, e.value)) { + pr_debug_ratelimited( + "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", + __func__, i, e.index, e.value); + goto fail; + } + } + return 0; +fail: + /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */ + return i + 1; +} + +static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, + u32 msr_index, + u64 *data) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * If the L0 hypervisor stored a more accurate value for the TSC that + * does not include the time taken for emulation of the L2->L1 + * VM-exit in L0, use the more accurate value. + */ + if (msr_index == MSR_IA32_TSC) { + int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest, + MSR_IA32_TSC); + + if (i >= 0) { + u64 val = vmx->msr_autostore.guest.val[i].value; + + *data = kvm_read_l1_tsc(vcpu, val); + return true; + } + } + + if (kvm_get_msr(vcpu, msr_index, data)) { + pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, + msr_index); + return false; + } + return true; +} + +static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i, + struct vmx_msr_entry *e) +{ + if (kvm_vcpu_read_guest(vcpu, + gpa + i * sizeof(*e), + e, 2 * sizeof(u32))) { + pr_debug_ratelimited( + "%s cannot read MSR entry (%u, 0x%08llx)\n", + __func__, i, gpa + i * sizeof(*e)); + return false; + } + if (nested_vmx_store_msr_check(vcpu, e)) { + pr_debug_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, i, e->index, e->reserved); + return false; + } + return true; +} + +static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) +{ + u64 data; + u32 i; + struct vmx_msr_entry e; + u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); + + for (i = 0; i < count; i++) { + if (unlikely(i >= max_msr_list_size)) + return -EINVAL; + + if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) + return -EINVAL; + + if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data)) + return -EINVAL; + + if (kvm_vcpu_write_guest(vcpu, + gpa + i * sizeof(e) + + offsetof(struct vmx_msr_entry, value), + &data, sizeof(data))) { + pr_debug_ratelimited( + "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", + __func__, i, e.index, data); + return -EINVAL; + } + } + return 0; +} + +static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + u32 count = vmcs12->vm_exit_msr_store_count; + u64 gpa = vmcs12->vm_exit_msr_store_addr; + struct vmx_msr_entry e; + u32 i; + + for (i = 0; i < count; i++) { + if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) + return false; + + if (e.index == msr_index) + return true; + } + return false; +} + +static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, + u32 msr_index) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmx_msrs *autostore = &vmx->msr_autostore.guest; + bool in_vmcs12_store_list; + int msr_autostore_slot; + bool in_autostore_list; + int last; + + msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index); + in_autostore_list = msr_autostore_slot >= 0; + in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); + + if (in_vmcs12_store_list && !in_autostore_list) { + if (autostore->nr == MAX_NR_LOADSTORE_MSRS) { + /* + * Emulated VMEntry does not fail here. Instead a less + * accurate value will be returned by + * nested_vmx_get_vmexit_msr_value() using kvm_get_msr() + * instead of reading the value from the vmcs02 VMExit + * MSR-store area. + */ + pr_warn_ratelimited( + "Not enough msr entries in msr_autostore. Can't add msr %x\n", + msr_index); + return; + } + last = autostore->nr++; + autostore->val[last].index = msr_index; + } else if (!in_vmcs12_store_list && in_autostore_list) { + last = --autostore->nr; + autostore->val[msr_autostore_slot] = autostore->val[last]; + } +} + +/* + * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are + * emulating VM-Entry into a guest with EPT enabled. On failure, the expected + * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to + * @entry_failure_code. + */ +static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, + bool nested_ept, bool reload_pdptrs, + enum vm_entry_failure_code *entry_failure_code) +{ + if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; + return -EINVAL; + } + + /* + * If PAE paging and EPT are both on, CR3 is not used by the CPU and + * must not be dereferenced. + */ + if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) && + CC(!load_pdptrs(vcpu, cr3))) { + *entry_failure_code = ENTRY_FAIL_PDPTE; + return -EINVAL; + } + + vcpu->arch.cr3 = cr3; + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); + + /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ + kvm_init_mmu(vcpu); + + if (!nested_ept) + kvm_mmu_new_pgd(vcpu, cr3); + + return 0; +} + +/* + * Returns if KVM is able to config CPU to tag TLB entries + * populated by L2 differently than TLB entries populated + * by L1. + * + * If L0 uses EPT, L1 and L2 run with different EPTP because + * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries + * are tagged with different EPTP. + * + * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged + * with different VPID (L1 entries are tagged with vmx->vpid + * while L2 entries are tagged with vmx->nested.vpid02). + */ +static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + return enable_ept || + (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); +} + +static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, + bool is_vmenter) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings + * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a + * full TLB flush from the guest's perspective. This is required even + * if VPID is disabled in the host as KVM may need to synchronize the + * MMU in response to the guest TLB flush. + * + * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use. + * EPT is a special snowflake, as guest-physical mappings aren't + * flushed on VPID invalidations, including VM-Enter or VM-Exit with + * VPID disabled. As a result, KVM _never_ needs to sync nEPT + * entries on VM-Enter because L1 can't rely on VM-Enter to flush + * those mappings. + */ + if (!nested_cpu_has_vpid(vmcs12)) { + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + return; + } + + /* L2 should never have a VPID if VPID is disabled. */ + WARN_ON(!enable_vpid); + + /* + * VPID is enabled and in use by vmcs12. If vpid12 is changing, then + * emulate a guest TLB flush as KVM does not track vpid12 history nor + * is the VPID incorporated into the MMU context. I.e. KVM must assume + * that the new vpid12 has never been used and thus represents a new + * guest ASID that cannot have entries in the TLB. + */ + if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) { + vmx->nested.last_vpid = vmcs12->virtual_processor_id; + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + return; + } + + /* + * If VPID is enabled, used by vmc12, and vpid12 is not changing but + * does not have a unique TLB tag (ASID), i.e. EPT is disabled and + * KVM was unable to allocate a VPID for L2, flush the current context + * as the effective ASID is common to both L1 and L2. + */ + if (!nested_has_guest_tlb_tag(vcpu)) + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); +} + +static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) +{ + superset &= mask; + subset &= mask; + + return (superset | subset) == superset; +} + +static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) +{ + const u64 feature_and_reserved = + /* feature (except bit 48; see below) */ + BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | + /* reserved */ + BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); + u64 vmx_basic = vmcs_config.nested.basic; + + if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) + return -EINVAL; + + /* + * KVM does not emulate a version of VMX that constrains physical + * addresses of VMX structures (e.g. VMCS) to 32-bits. + */ + if (data & BIT_ULL(48)) + return -EINVAL; + + if (vmx_basic_vmcs_revision_id(vmx_basic) != + vmx_basic_vmcs_revision_id(data)) + return -EINVAL; + + if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) + return -EINVAL; + + vmx->nested.msrs.basic = data; + return 0; +} + +static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index, + u32 **low, u32 **high) +{ + switch (msr_index) { + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: + *low = &msrs->pinbased_ctls_low; + *high = &msrs->pinbased_ctls_high; + break; + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + *low = &msrs->procbased_ctls_low; + *high = &msrs->procbased_ctls_high; + break; + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + *low = &msrs->exit_ctls_low; + *high = &msrs->exit_ctls_high; + break; + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + *low = &msrs->entry_ctls_low; + *high = &msrs->entry_ctls_high; + break; + case MSR_IA32_VMX_PROCBASED_CTLS2: + *low = &msrs->secondary_ctls_low; + *high = &msrs->secondary_ctls_high; + break; + default: + BUG(); + } +} + +static int +vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + u32 *lowp, *highp; + u64 supported; + + vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp); + + supported = vmx_control_msr(*lowp, *highp); + + /* Check must-be-1 bits are still 1. */ + if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) + return -EINVAL; + + /* Check must-be-0 bits are still 0. */ + if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) + return -EINVAL; + + vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp); + *lowp = data; + *highp = data >> 32; + return 0; +} + +static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) +{ + const u64 feature_and_reserved_bits = + /* feature */ + BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | + BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | + /* reserved */ + GENMASK_ULL(13, 9) | BIT_ULL(31); + u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, + vmcs_config.nested.misc_high); + + if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) + return -EINVAL; + + if ((vmx->nested.msrs.pinbased_ctls_high & + PIN_BASED_VMX_PREEMPTION_TIMER) && + vmx_misc_preemption_timer_rate(data) != + vmx_misc_preemption_timer_rate(vmx_misc)) + return -EINVAL; + + if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) + return -EINVAL; + + if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) + return -EINVAL; + + if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) + return -EINVAL; + + vmx->nested.msrs.misc_low = data; + vmx->nested.msrs.misc_high = data >> 32; + + return 0; +} + +static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) +{ + u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps, + vmcs_config.nested.vpid_caps); + + /* Every bit is either reserved or a feature bit. */ + if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) + return -EINVAL; + + vmx->nested.msrs.ept_caps = data; + vmx->nested.msrs.vpid_caps = data >> 32; + return 0; +} + +static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index) +{ + switch (msr_index) { + case MSR_IA32_VMX_CR0_FIXED0: + return &msrs->cr0_fixed0; + case MSR_IA32_VMX_CR4_FIXED0: + return &msrs->cr4_fixed0; + default: + BUG(); + } +} + +static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index); + + /* + * 1 bits (which indicates bits which "must-be-1" during VMX operation) + * must be 1 in the restored value. + */ + if (!is_bitwise_subset(data, *msr, -1ULL)) + return -EINVAL; + + *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data; + return 0; +} + +/* + * Called when userspace is restoring VMX MSRs. + * + * Returns 0 on success, non-0 otherwise. + */ +int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * Don't allow changes to the VMX capability MSRs while the vCPU + * is in VMX operation. + */ + if (vmx->nested.vmxon) + return -EBUSY; + + switch (msr_index) { + case MSR_IA32_VMX_BASIC: + return vmx_restore_vmx_basic(vmx, data); + case MSR_IA32_VMX_PINBASED_CTLS: + case MSR_IA32_VMX_PROCBASED_CTLS: + case MSR_IA32_VMX_EXIT_CTLS: + case MSR_IA32_VMX_ENTRY_CTLS: + /* + * The "non-true" VMX capability MSRs are generated from the + * "true" MSRs, so we do not support restoring them directly. + * + * If userspace wants to emulate VMX_BASIC[55]=0, userspace + * should restore the "true" MSRs with the must-be-1 bits + * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND + * DEFAULT SETTINGS". + */ + return -EINVAL; + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + case MSR_IA32_VMX_PROCBASED_CTLS2: + return vmx_restore_control_msr(vmx, msr_index, data); + case MSR_IA32_VMX_MISC: + return vmx_restore_vmx_misc(vmx, data); + case MSR_IA32_VMX_CR0_FIXED0: + case MSR_IA32_VMX_CR4_FIXED0: + return vmx_restore_fixed0_msr(vmx, msr_index, data); + case MSR_IA32_VMX_CR0_FIXED1: + case MSR_IA32_VMX_CR4_FIXED1: + /* + * These MSRs are generated based on the vCPU's CPUID, so we + * do not support restoring them directly. + */ + return -EINVAL; + case MSR_IA32_VMX_EPT_VPID_CAP: + return vmx_restore_vmx_ept_vpid_cap(vmx, data); + case MSR_IA32_VMX_VMCS_ENUM: + vmx->nested.msrs.vmcs_enum = data; + return 0; + case MSR_IA32_VMX_VMFUNC: + if (data & ~vmcs_config.nested.vmfunc_controls) + return -EINVAL; + vmx->nested.msrs.vmfunc_controls = data; + return 0; + default: + /* + * The rest of the VMX capability MSRs do not support restore. + */ + return -EINVAL; + } +} + +/* Returns 0 on success, non-0 otherwise. */ +int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) +{ + switch (msr_index) { + case MSR_IA32_VMX_BASIC: + *pdata = msrs->basic; + break; + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: + case MSR_IA32_VMX_PINBASED_CTLS: + *pdata = vmx_control_msr( + msrs->pinbased_ctls_low, + msrs->pinbased_ctls_high); + if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) + *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + break; + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + case MSR_IA32_VMX_PROCBASED_CTLS: + *pdata = vmx_control_msr( + msrs->procbased_ctls_low, + msrs->procbased_ctls_high); + if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) + *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + break; + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + case MSR_IA32_VMX_EXIT_CTLS: + *pdata = vmx_control_msr( + msrs->exit_ctls_low, + msrs->exit_ctls_high); + if (msr_index == MSR_IA32_VMX_EXIT_CTLS) + *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; + break; + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + case MSR_IA32_VMX_ENTRY_CTLS: + *pdata = vmx_control_msr( + msrs->entry_ctls_low, + msrs->entry_ctls_high); + if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) + *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; + break; + case MSR_IA32_VMX_MISC: + *pdata = vmx_control_msr( + msrs->misc_low, + msrs->misc_high); + break; + case MSR_IA32_VMX_CR0_FIXED0: + *pdata = msrs->cr0_fixed0; + break; + case MSR_IA32_VMX_CR0_FIXED1: + *pdata = msrs->cr0_fixed1; + break; + case MSR_IA32_VMX_CR4_FIXED0: + *pdata = msrs->cr4_fixed0; + break; + case MSR_IA32_VMX_CR4_FIXED1: + *pdata = msrs->cr4_fixed1; + break; + case MSR_IA32_VMX_VMCS_ENUM: + *pdata = msrs->vmcs_enum; + break; + case MSR_IA32_VMX_PROCBASED_CTLS2: + *pdata = vmx_control_msr( + msrs->secondary_ctls_low, + msrs->secondary_ctls_high); + break; + case MSR_IA32_VMX_EPT_VPID_CAP: + *pdata = msrs->ept_caps | + ((u64)msrs->vpid_caps << 32); + break; + case MSR_IA32_VMX_VMFUNC: + *pdata = msrs->vmfunc_controls; + break; + default: + return 1; + } + + return 0; +} + +/* + * Copy the writable VMCS shadow fields back to the VMCS12, in case they have + * been modified by the L1 guest. Note, "writable" in this context means + * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of + * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only" + * VM-exit information fields (which are actually writable if the vCPU is + * configured to support "VMWRITE to any supported field in the VMCS"). + */ +static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) +{ + struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; + struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); + struct shadow_vmcs_field field; + unsigned long val; + int i; + + if (WARN_ON(!shadow_vmcs)) + return; + + preempt_disable(); + + vmcs_load(shadow_vmcs); + + for (i = 0; i < max_shadow_read_write_fields; i++) { + field = shadow_read_write_fields[i]; + val = __vmcs_readl(field.encoding); + vmcs12_write_any(vmcs12, field.encoding, field.offset, val); + } + + vmcs_clear(shadow_vmcs); + vmcs_load(vmx->loaded_vmcs->vmcs); + + preempt_enable(); +} + +static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) +{ + const struct shadow_vmcs_field *fields[] = { + shadow_read_write_fields, + shadow_read_only_fields + }; + const int max_fields[] = { + max_shadow_read_write_fields, + max_shadow_read_only_fields + }; + struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; + struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); + struct shadow_vmcs_field field; + unsigned long val; + int i, q; + + if (WARN_ON(!shadow_vmcs)) + return; + + vmcs_load(shadow_vmcs); + + for (q = 0; q < ARRAY_SIZE(fields); q++) { + for (i = 0; i < max_fields[q]; i++) { + field = fields[q][i]; + val = vmcs12_read_any(vmcs12, field.encoding, + field.offset); + __vmcs_writel(field.encoding, val); + } + } + + vmcs_clear(shadow_vmcs); + vmcs_load(vmx->loaded_vmcs->vmcs); +} + +static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields) +{ + struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; + struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; + + /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ + vmcs12->tpr_threshold = evmcs->tpr_threshold; + vmcs12->guest_rip = evmcs->guest_rip; + + if (unlikely(!(hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { + vmcs12->guest_rsp = evmcs->guest_rsp; + vmcs12->guest_rflags = evmcs->guest_rflags; + vmcs12->guest_interruptibility_info = + evmcs->guest_interruptibility_info; + /* + * Not present in struct vmcs12: + * vmcs12->guest_ssp = evmcs->guest_ssp; + */ + } + + if (unlikely(!(hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { + vmcs12->cpu_based_vm_exec_control = + evmcs->cpu_based_vm_exec_control; + } + + if (unlikely(!(hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) { + vmcs12->exception_bitmap = evmcs->exception_bitmap; + } + + if (unlikely(!(hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { + vmcs12->vm_entry_controls = evmcs->vm_entry_controls; + } + + if (unlikely(!(hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { + vmcs12->vm_entry_intr_info_field = + evmcs->vm_entry_intr_info_field; + vmcs12->vm_entry_exception_error_code = + evmcs->vm_entry_exception_error_code; + vmcs12->vm_entry_instruction_len = + evmcs->vm_entry_instruction_len; + } + + if (unlikely(!(hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { + vmcs12->host_ia32_pat = evmcs->host_ia32_pat; + vmcs12->host_ia32_efer = evmcs->host_ia32_efer; + vmcs12->host_cr0 = evmcs->host_cr0; + vmcs12->host_cr3 = evmcs->host_cr3; + vmcs12->host_cr4 = evmcs->host_cr4; + vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; + vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; + vmcs12->host_rip = evmcs->host_rip; + vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; + vmcs12->host_es_selector = evmcs->host_es_selector; + vmcs12->host_cs_selector = evmcs->host_cs_selector; + vmcs12->host_ss_selector = evmcs->host_ss_selector; + vmcs12->host_ds_selector = evmcs->host_ds_selector; + vmcs12->host_fs_selector = evmcs->host_fs_selector; + vmcs12->host_gs_selector = evmcs->host_gs_selector; + vmcs12->host_tr_selector = evmcs->host_tr_selector; + vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl; + /* + * Not present in struct vmcs12: + * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet; + * vmcs12->host_ssp = evmcs->host_ssp; + * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr; + */ + } + + if (unlikely(!(hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) { + vmcs12->pin_based_vm_exec_control = + evmcs->pin_based_vm_exec_control; + vmcs12->vm_exit_controls = evmcs->vm_exit_controls; + vmcs12->secondary_vm_exec_control = + evmcs->secondary_vm_exec_control; + } + + if (unlikely(!(hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { + vmcs12->io_bitmap_a = evmcs->io_bitmap_a; + vmcs12->io_bitmap_b = evmcs->io_bitmap_b; + } + + if (unlikely(!(hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { + vmcs12->msr_bitmap = evmcs->msr_bitmap; + } + + if (unlikely(!(hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { + vmcs12->guest_es_base = evmcs->guest_es_base; + vmcs12->guest_cs_base = evmcs->guest_cs_base; + vmcs12->guest_ss_base = evmcs->guest_ss_base; + vmcs12->guest_ds_base = evmcs->guest_ds_base; + vmcs12->guest_fs_base = evmcs->guest_fs_base; + vmcs12->guest_gs_base = evmcs->guest_gs_base; + vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; + vmcs12->guest_tr_base = evmcs->guest_tr_base; + vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; + vmcs12->guest_idtr_base = evmcs->guest_idtr_base; + vmcs12->guest_es_limit = evmcs->guest_es_limit; + vmcs12->guest_cs_limit = evmcs->guest_cs_limit; + vmcs12->guest_ss_limit = evmcs->guest_ss_limit; + vmcs12->guest_ds_limit = evmcs->guest_ds_limit; + vmcs12->guest_fs_limit = evmcs->guest_fs_limit; + vmcs12->guest_gs_limit = evmcs->guest_gs_limit; + vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; + vmcs12->guest_tr_limit = evmcs->guest_tr_limit; + vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; + vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; + vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; + vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; + vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; + vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; + vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; + vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; + vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; + vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; + vmcs12->guest_es_selector = evmcs->guest_es_selector; + vmcs12->guest_cs_selector = evmcs->guest_cs_selector; + vmcs12->guest_ss_selector = evmcs->guest_ss_selector; + vmcs12->guest_ds_selector = evmcs->guest_ds_selector; + vmcs12->guest_fs_selector = evmcs->guest_fs_selector; + vmcs12->guest_gs_selector = evmcs->guest_gs_selector; + vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; + vmcs12->guest_tr_selector = evmcs->guest_tr_selector; + } + + if (unlikely(!(hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { + vmcs12->tsc_offset = evmcs->tsc_offset; + vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; + vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; + vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap; + vmcs12->tsc_multiplier = evmcs->tsc_multiplier; + } + + if (unlikely(!(hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { + vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; + vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; + vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; + vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; + vmcs12->guest_cr0 = evmcs->guest_cr0; + vmcs12->guest_cr3 = evmcs->guest_cr3; + vmcs12->guest_cr4 = evmcs->guest_cr4; + vmcs12->guest_dr7 = evmcs->guest_dr7; + } + + if (unlikely(!(hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { + vmcs12->host_fs_base = evmcs->host_fs_base; + vmcs12->host_gs_base = evmcs->host_gs_base; + vmcs12->host_tr_base = evmcs->host_tr_base; + vmcs12->host_gdtr_base = evmcs->host_gdtr_base; + vmcs12->host_idtr_base = evmcs->host_idtr_base; + vmcs12->host_rsp = evmcs->host_rsp; + } + + if (unlikely(!(hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { + vmcs12->ept_pointer = evmcs->ept_pointer; + vmcs12->virtual_processor_id = evmcs->virtual_processor_id; + } + + if (unlikely(!(hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { + vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; + vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; + vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; + vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; + vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; + vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; + vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; + vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; + vmcs12->guest_pending_dbg_exceptions = + evmcs->guest_pending_dbg_exceptions; + vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; + vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; + vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; + vmcs12->guest_activity_state = evmcs->guest_activity_state; + vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; + vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl; + /* + * Not present in struct vmcs12: + * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet; + * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl; + * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr; + */ + } + + /* + * Not used? + * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; + * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; + * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; + * vmcs12->page_fault_error_code_mask = + * evmcs->page_fault_error_code_mask; + * vmcs12->page_fault_error_code_match = + * evmcs->page_fault_error_code_match; + * vmcs12->cr3_target_count = evmcs->cr3_target_count; + * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; + * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; + * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; + */ + + /* + * Read only fields: + * vmcs12->guest_physical_address = evmcs->guest_physical_address; + * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; + * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; + * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; + * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; + * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; + * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; + * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; + * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; + * vmcs12->exit_qualification = evmcs->exit_qualification; + * vmcs12->guest_linear_address = evmcs->guest_linear_address; + * + * Not present in struct vmcs12: + * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; + * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; + * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; + * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; + */ + + return; +} + +static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) +{ + struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; + struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; + + /* + * Should not be changed by KVM: + * + * evmcs->host_es_selector = vmcs12->host_es_selector; + * evmcs->host_cs_selector = vmcs12->host_cs_selector; + * evmcs->host_ss_selector = vmcs12->host_ss_selector; + * evmcs->host_ds_selector = vmcs12->host_ds_selector; + * evmcs->host_fs_selector = vmcs12->host_fs_selector; + * evmcs->host_gs_selector = vmcs12->host_gs_selector; + * evmcs->host_tr_selector = vmcs12->host_tr_selector; + * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; + * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; + * evmcs->host_cr0 = vmcs12->host_cr0; + * evmcs->host_cr3 = vmcs12->host_cr3; + * evmcs->host_cr4 = vmcs12->host_cr4; + * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; + * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; + * evmcs->host_rip = vmcs12->host_rip; + * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; + * evmcs->host_fs_base = vmcs12->host_fs_base; + * evmcs->host_gs_base = vmcs12->host_gs_base; + * evmcs->host_tr_base = vmcs12->host_tr_base; + * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; + * evmcs->host_idtr_base = vmcs12->host_idtr_base; + * evmcs->host_rsp = vmcs12->host_rsp; + * sync_vmcs02_to_vmcs12() doesn't read these: + * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; + * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; + * evmcs->msr_bitmap = vmcs12->msr_bitmap; + * evmcs->ept_pointer = vmcs12->ept_pointer; + * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; + * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; + * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; + * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; + * evmcs->tpr_threshold = vmcs12->tpr_threshold; + * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; + * evmcs->exception_bitmap = vmcs12->exception_bitmap; + * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; + * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; + * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; + * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; + * evmcs->page_fault_error_code_mask = + * vmcs12->page_fault_error_code_mask; + * evmcs->page_fault_error_code_match = + * vmcs12->page_fault_error_code_match; + * evmcs->cr3_target_count = vmcs12->cr3_target_count; + * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; + * evmcs->tsc_offset = vmcs12->tsc_offset; + * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; + * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; + * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; + * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; + * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; + * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; + * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; + * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; + * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl; + * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl; + * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap; + * evmcs->tsc_multiplier = vmcs12->tsc_multiplier; + * + * Not present in struct vmcs12: + * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; + * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; + * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; + * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; + * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet; + * evmcs->host_ssp = vmcs12->host_ssp; + * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr; + * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet; + * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl; + * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr; + * evmcs->guest_ssp = vmcs12->guest_ssp; + */ + + evmcs->guest_es_selector = vmcs12->guest_es_selector; + evmcs->guest_cs_selector = vmcs12->guest_cs_selector; + evmcs->guest_ss_selector = vmcs12->guest_ss_selector; + evmcs->guest_ds_selector = vmcs12->guest_ds_selector; + evmcs->guest_fs_selector = vmcs12->guest_fs_selector; + evmcs->guest_gs_selector = vmcs12->guest_gs_selector; + evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; + evmcs->guest_tr_selector = vmcs12->guest_tr_selector; + + evmcs->guest_es_limit = vmcs12->guest_es_limit; + evmcs->guest_cs_limit = vmcs12->guest_cs_limit; + evmcs->guest_ss_limit = vmcs12->guest_ss_limit; + evmcs->guest_ds_limit = vmcs12->guest_ds_limit; + evmcs->guest_fs_limit = vmcs12->guest_fs_limit; + evmcs->guest_gs_limit = vmcs12->guest_gs_limit; + evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; + evmcs->guest_tr_limit = vmcs12->guest_tr_limit; + evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; + evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; + + evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; + evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; + evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; + evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; + evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; + evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; + evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; + evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; + + evmcs->guest_es_base = vmcs12->guest_es_base; + evmcs->guest_cs_base = vmcs12->guest_cs_base; + evmcs->guest_ss_base = vmcs12->guest_ss_base; + evmcs->guest_ds_base = vmcs12->guest_ds_base; + evmcs->guest_fs_base = vmcs12->guest_fs_base; + evmcs->guest_gs_base = vmcs12->guest_gs_base; + evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; + evmcs->guest_tr_base = vmcs12->guest_tr_base; + evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; + evmcs->guest_idtr_base = vmcs12->guest_idtr_base; + + evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; + evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; + + evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; + evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; + evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; + evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; + + evmcs->guest_pending_dbg_exceptions = + vmcs12->guest_pending_dbg_exceptions; + evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; + evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; + + evmcs->guest_activity_state = vmcs12->guest_activity_state; + evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; + + evmcs->guest_cr0 = vmcs12->guest_cr0; + evmcs->guest_cr3 = vmcs12->guest_cr3; + evmcs->guest_cr4 = vmcs12->guest_cr4; + evmcs->guest_dr7 = vmcs12->guest_dr7; + + evmcs->guest_physical_address = vmcs12->guest_physical_address; + + evmcs->vm_instruction_error = vmcs12->vm_instruction_error; + evmcs->vm_exit_reason = vmcs12->vm_exit_reason; + evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; + evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; + evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; + evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; + evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; + evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; + + evmcs->exit_qualification = vmcs12->exit_qualification; + + evmcs->guest_linear_address = vmcs12->guest_linear_address; + evmcs->guest_rsp = vmcs12->guest_rsp; + evmcs->guest_rflags = vmcs12->guest_rflags; + + evmcs->guest_interruptibility_info = + vmcs12->guest_interruptibility_info; + evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; + evmcs->vm_entry_controls = vmcs12->vm_entry_controls; + evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; + evmcs->vm_entry_exception_error_code = + vmcs12->vm_entry_exception_error_code; + evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; + + evmcs->guest_rip = vmcs12->guest_rip; + + evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; + + return; +} + +/* + * This is an equivalent of the nested hypervisor executing the vmptrld + * instruction. + */ +static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( + struct kvm_vcpu *vcpu, bool from_launch) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool evmcs_gpa_changed = false; + u64 evmcs_gpa; + + if (likely(!guest_cpuid_has_evmcs(vcpu))) + return EVMPTRLD_DISABLED; + + if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) { + nested_release_evmcs(vcpu); + return EVMPTRLD_DISABLED; + } + + if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { + vmx->nested.current_vmptr = INVALID_GPA; + + nested_release_evmcs(vcpu); + + if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa), + &vmx->nested.hv_evmcs_map)) + return EVMPTRLD_ERROR; + + vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; + + /* + * Currently, KVM only supports eVMCS version 1 + * (== KVM_EVMCS_VERSION) and thus we expect guest to set this + * value to first u32 field of eVMCS which should specify eVMCS + * VersionNumber. + * + * Guest should be aware of supported eVMCS versions by host by + * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is + * expected to set this CPUID leaf according to the value + * returned in vmcs_version from nested_enable_evmcs(). + * + * However, it turns out that Microsoft Hyper-V fails to comply + * to their own invented interface: When Hyper-V use eVMCS, it + * just sets first u32 field of eVMCS to revision_id specified + * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number + * which is one of the supported versions specified in + * CPUID.0x4000000A.EAX[0:15]. + * + * To overcome Hyper-V bug, we accept here either a supported + * eVMCS version or VMCS12 revision_id as valid values for first + * u32 field of eVMCS. + */ + if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && + (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { + nested_release_evmcs(vcpu); + return EVMPTRLD_VMFAIL; + } + + vmx->nested.hv_evmcs_vmptr = evmcs_gpa; + + evmcs_gpa_changed = true; + /* + * Unlike normal vmcs12, enlightened vmcs12 is not fully + * reloaded from guest's memory (read only fields, fields not + * present in struct hv_enlightened_vmcs, ...). Make sure there + * are no leftovers. + */ + if (from_launch) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + memset(vmcs12, 0, sizeof(*vmcs12)); + vmcs12->hdr.revision_id = VMCS12_REVISION; + } + + } + + /* + * Clean fields data can't be used on VMLAUNCH and when we switch + * between different L2 guests as KVM keeps a single VMCS12 per L1. + */ + if (from_launch || evmcs_gpa_changed) { + vmx->nested.hv_evmcs->hv_clean_fields &= + ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + + vmx->nested.force_msr_bitmap_recalc = true; + } + + return EVMPTRLD_SUCCEEDED; +} + +void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + copy_vmcs12_to_enlightened(vmx); + else + copy_vmcs12_to_shadow(vmx); + + vmx->nested.need_vmcs12_to_shadow_sync = false; +} + +static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) +{ + struct vcpu_vmx *vmx = + container_of(timer, struct vcpu_vmx, nested.preemption_timer); + + vmx->nested.preemption_timer_expired = true; + kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); + kvm_vcpu_kick(&vmx->vcpu); + + return HRTIMER_NORESTART; +} + +static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> + VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; + + if (!vmx->nested.has_preemption_timer_deadline) { + vmx->nested.preemption_timer_deadline = + vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; + vmx->nested.has_preemption_timer_deadline = true; + } + return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; +} + +static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, + u64 preemption_timeout) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * A timer value of zero is architecturally guaranteed to cause + * a VMExit prior to executing any instructions in the guest. + */ + if (preemption_timeout == 0) { + vmx_preemption_timer_fn(&vmx->nested.preemption_timer); + return; + } + + if (vcpu->arch.virtual_tsc_khz == 0) + return; + + preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; + preemption_timeout *= 1000000; + do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); + hrtimer_start(&vmx->nested.preemption_timer, + ktime_add_ns(ktime_get(), preemption_timeout), + HRTIMER_MODE_ABS_PINNED); +} + +static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) +{ + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) + return vmcs12->guest_ia32_efer; + else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) + return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); + else + return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); +} + +static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) +{ + struct kvm *kvm = vmx->vcpu.kvm; + + /* + * If vmcs02 hasn't been initialized, set the constant vmcs02 state + * according to L0's settings (vmcs12 is irrelevant here). Host + * fields that come from L0 and are not constant, e.g. HOST_CR3, + * will be set as needed prior to VMLAUNCH/VMRESUME. + */ + if (vmx->nested.vmcs02_initialized) + return; + vmx->nested.vmcs02_initialized = true; + + /* + * We don't care what the EPTP value is we just need to guarantee + * it's valid so we don't get a false positive when doing early + * consistency checks. + */ + if (enable_ept && nested_early_check) + vmcs_write64(EPT_POINTER, + construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); + + /* All VMFUNCs are currently emulated through L0 vmexits. */ + if (cpu_has_vmx_vmfunc()) + vmcs_write64(VM_FUNCTION_CONTROL, 0); + + if (cpu_has_vmx_posted_intr()) + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); + + if (cpu_has_vmx_msr_bitmap()) + vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); + + /* + * PML is emulated for L2, but never enabled in hardware as the MMU + * handles A/D emulation. Disabling PML for L2 also avoids having to + * deal with filtering out L2 GPAs from the buffer. + */ + if (enable_pml) { + vmcs_write64(PML_ADDRESS, 0); + vmcs_write16(GUEST_PML_INDEX, -1); + } + + if (cpu_has_vmx_encls_vmexit()) + vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); + + if (kvm_notify_vmexit_enabled(kvm)) + vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); + + /* + * Set the MSR load/store lists to match L0's settings. Only the + * addresses are constant (for vmcs02), the counts can change based + * on L2's behavior, e.g. switching to/from long mode. + */ + vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); + + vmx_set_constant_host_state(vmx); +} + +static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, + struct vmcs12 *vmcs12) +{ + prepare_vmcs02_constant_state(vmx); + + vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); + + if (enable_vpid) { + if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); + else + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + } +} + +static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, + struct vmcs12 *vmcs12) +{ + u32 exec_control; + u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); + + if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + prepare_vmcs02_early_rare(vmx, vmcs12); + + /* + * PIN CONTROLS + */ + exec_control = __pin_controls_get(vmcs01); + exec_control |= (vmcs12->pin_based_vm_exec_control & + ~PIN_BASED_VMX_PREEMPTION_TIMER); + + /* Posted interrupts setting is only taken from vmcs12. */ + vmx->nested.pi_pending = false; + if (nested_cpu_has_posted_intr(vmcs12)) + vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; + else + exec_control &= ~PIN_BASED_POSTED_INTR; + pin_controls_set(vmx, exec_control); + + /* + * EXEC CONTROLS + */ + exec_control = __exec_controls_get(vmcs01); /* L0's desires */ + exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; + exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; + exec_control &= ~CPU_BASED_TPR_SHADOW; + exec_control |= vmcs12->cpu_based_vm_exec_control; + + vmx->nested.l1_tpr_threshold = -1; + if (exec_control & CPU_BASED_TPR_SHADOW) + vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); +#ifdef CONFIG_X86_64 + else + exec_control |= CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING; +#endif + + /* + * A vmexit (to either L1 hypervisor or L0 userspace) is always needed + * for I/O port accesses. + */ + exec_control |= CPU_BASED_UNCOND_IO_EXITING; + exec_control &= ~CPU_BASED_USE_IO_BITMAPS; + + /* + * This bit will be computed in nested_get_vmcs12_pages, because + * we do not have access to L1's MSR bitmap yet. For now, keep + * the same bit as before, hoping to avoid multiple VMWRITEs that + * only set/clear this bit. + */ + exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; + exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; + + exec_controls_set(vmx, exec_control); + + /* + * SECONDARY EXEC CONTROLS + */ + if (cpu_has_secondary_exec_ctrls()) { + exec_control = __secondary_exec_controls_get(vmcs01); + + /* Take the following fields only from vmcs12 */ + exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_ENABLE_INVPCID | + SECONDARY_EXEC_ENABLE_RDTSCP | + SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_ENABLE_VMFUNC | + SECONDARY_EXEC_DESC); + + if (nested_cpu_has(vmcs12, + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) + exec_control |= vmcs12->secondary_vm_exec_control; + + /* PML is emulated and never enabled in hardware for L2. */ + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + + /* VMCS shadowing for L2 is emulated for now */ + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + + /* + * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() + * will not have to rewrite the controls just for this bit. + */ + if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() && + (vmcs12->guest_cr4 & X86_CR4_UMIP)) + exec_control |= SECONDARY_EXEC_DESC; + + if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) + vmcs_write16(GUEST_INTR_STATUS, + vmcs12->guest_intr_status); + + if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) + exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; + + if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) + vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); + + secondary_exec_controls_set(vmx, exec_control); + } + + /* + * ENTRY CONTROLS + * + * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE + * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate + * on the related bits (if supported by the CPU) in the hope that + * we can avoid VMWrites during vmx_set_efer(). + * + * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is + * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to + * do the same for L2. + */ + exec_control = __vm_entry_controls_get(vmcs01); + exec_control |= (vmcs12->vm_entry_controls & + ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); + exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); + if (cpu_has_load_ia32_efer()) { + if (guest_efer & EFER_LMA) + exec_control |= VM_ENTRY_IA32E_MODE; + if (guest_efer != host_efer) + exec_control |= VM_ENTRY_LOAD_IA32_EFER; + } + vm_entry_controls_set(vmx, exec_control); + + /* + * EXIT CONTROLS + * + * L2->L1 exit controls are emulated - the hardware exit is to L0 so + * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER + * bits may be modified by vmx_set_efer() in prepare_vmcs02(). + */ + exec_control = __vm_exit_controls_get(vmcs01); + if (cpu_has_load_ia32_efer() && guest_efer != host_efer) + exec_control |= VM_EXIT_LOAD_IA32_EFER; + else + exec_control &= ~VM_EXIT_LOAD_IA32_EFER; + vm_exit_controls_set(vmx, exec_control); + + /* + * Interrupt/Exception Fields + */ + if (vmx->nested.nested_run_pending) { + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + vmcs12->vm_entry_intr_info_field); + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, + vmcs12->vm_entry_exception_error_code); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmcs12->vm_entry_instruction_len); + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, + vmcs12->guest_interruptibility_info); + vmx->loaded_vmcs->nmi_known_unmasked = + !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); + } else { + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); + } +} + +static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) +{ + struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; + + if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { + vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); + vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); + vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); + vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); + vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); + vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); + vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); + vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); + vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); + vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); + vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); + vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); + vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); + vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); + vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); + vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); + vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); + vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); + vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); + vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); + vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); + vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); + vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); + vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); + vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); + vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); + vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); + vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); + vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); + vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); + vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); + vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); + vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); + vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); + vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); + vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); + + vmx->segment_cache.bitmask = 0; + } + + if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { + vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, + vmcs12->guest_pending_dbg_exceptions); + vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); + vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); + + /* + * L1 may access the L2's PDPTR, so save them to construct + * vmcs12 + */ + if (enable_ept) { + vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); + vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); + vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); + vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); + } + + if (kvm_mpx_supported() && vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + } + + if (nested_cpu_has_xsaves(vmcs12)) + vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); + + /* + * Whether page-faults are trapped is determined by a combination of + * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 + * doesn't care about page faults then we should set all of these to + * L1's desires. However, if L0 does care about (some) page faults, it + * is not easy (if at all possible?) to merge L0 and L1's desires, we + * simply ask to exit on each and every L2 page fault. This is done by + * setting MASK=MATCH=0 and (see below) EB.PF=1. + * Note that below we don't need special code to set EB.PF beyond the + * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, + * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when + * !enable_ept, EB.PF is 1, so the "or" will always be 1. + */ + if (vmx_need_pf_intercept(&vmx->vcpu)) { + /* + * TODO: if both L0 and L1 need the same MASK and MATCH, + * go ahead and use it? + */ + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); + } else { + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match); + } + + if (cpu_has_vmx_apicv()) { + vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); + vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); + vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); + vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); + } + + /* + * Make sure the msr_autostore list is up to date before we set the + * count in the vmcs02. + */ + prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); + + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); + + set_cr4_guest_host_mask(vmx); +} + +/* + * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested + * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it + * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 + * guest in a way that will both be appropriate to L1's requests, and our + * needs. In addition to modifying the active vmcs (which is vmcs02), this + * function also has additional necessary side-effects, like setting various + * vcpu->arch fields. + * Returns 0 on success, 1 on failure. Invalid state exit qualification code + * is assigned to entry_failure_code on failure. + */ +static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + bool from_vmentry, + enum vm_entry_failure_code *entry_failure_code) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool load_guest_pdptrs_vmcs12 = false; + + if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { + prepare_vmcs02_rare(vmx, vmcs12); + vmx->nested.dirty_vmcs12 = false; + + load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) || + !(vmx->nested.hv_evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); + } + + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { + kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); + } else { + kvm_set_dr(vcpu, 7, vcpu->arch.dr7); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); + } + if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) + vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); + vmx_set_rflags(vcpu, vmcs12->guest_rflags); + + /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the + * bitwise-or of what L1 wants to trap for L2, and what we want to + * trap. Note that CR0.TS also needs updating - we do this later. + */ + vmx_update_exception_bitmap(vcpu); + vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); + + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { + vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); + vcpu->arch.pat = vmcs12->guest_ia32_pat; + } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); + } + + vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( + vcpu->arch.l1_tsc_offset, + vmx_get_l2_tsc_offset(vcpu), + vmx_get_l2_tsc_multiplier(vcpu)); + + vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( + vcpu->arch.l1_tsc_scaling_ratio, + vmx_get_l2_tsc_multiplier(vcpu)); + + vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); + if (kvm_caps.has_tsc_control) + vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); + + nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); + + if (nested_cpu_has_ept(vmcs12)) + nested_ept_init_mmu_context(vcpu); + + /* + * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those + * bits which we consider mandatory enabled. + * The CR0_READ_SHADOW is what L2 should have expected to read given + * the specifications by L1; It's not enough to take + * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we + * have more bits than L1 expected. + */ + vmx_set_cr0(vcpu, vmcs12->guest_cr0); + vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); + + vmx_set_cr4(vcpu, vmcs12->guest_cr4); + vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); + + vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); + /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ + vmx_set_efer(vcpu, vcpu->arch.efer); + + /* + * Guest state is invalid and unrestricted guest is disabled, + * which means L1 attempted VMEntry to L2 with invalid state. + * Fail the VMEntry. + * + * However when force loading the guest state (SMM exit or + * loading nested state after migration, it is possible to + * have invalid guest state now, which will be later fixed by + * restoring L2 register state + */ + if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; + return -EINVAL; + } + + /* Shadow page tables on either EPT or shadow page tables. */ + if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), + from_vmentry, entry_failure_code)) + return -EINVAL; + + /* + * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12 + * on nested VM-Exit, which can occur without actually running L2 and + * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with + * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the + * transition to HLT instead of running L2. + */ + if (enable_ept) + vmcs_writel(GUEST_CR3, vmcs12->guest_cr3); + + /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ + if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && + is_pae_paging(vcpu)) { + vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); + vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); + vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); + vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); + } + + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && + intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && + WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->guest_ia32_perf_global_ctrl))) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; + return -EINVAL; + } + + kvm_rsp_write(vcpu, vmcs12->guest_rsp); + kvm_rip_write(vcpu, vmcs12->guest_rip); + + /* + * It was observed that genuine Hyper-V running in L1 doesn't reset + * 'hv_clean_fields' by itself, it only sets the corresponding dirty + * bits when it changes a field in eVMCS. Mark all fields as clean + * here. + */ + if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + vmx->nested.hv_evmcs->hv_clean_fields |= + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + + return 0; +} + +static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) +{ + if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && + nested_cpu_has_virtual_nmis(vmcs12))) + return -EINVAL; + + if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && + nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) + return -EINVAL; + + return 0; +} + +static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* Check for memory type validity */ + switch (new_eptp & VMX_EPTP_MT_MASK) { + case VMX_EPTP_MT_UC: + if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) + return false; + break; + case VMX_EPTP_MT_WB: + if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) + return false; + break; + default: + return false; + } + + /* Page-walk levels validity. */ + switch (new_eptp & VMX_EPTP_PWL_MASK) { + case VMX_EPTP_PWL_5: + if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) + return false; + break; + case VMX_EPTP_PWL_4: + if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) + return false; + break; + default: + return false; + } + + /* Reserved bits should not be set */ + if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) + return false; + + /* AD, if set, should be supported */ + if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { + if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) + return false; + } + + return true; +} + +/* + * Checks related to VM-Execution Control Fields + */ +static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, + vmx->nested.msrs.pinbased_ctls_low, + vmx->nested.msrs.pinbased_ctls_high)) || + CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, + vmx->nested.msrs.procbased_ctls_low, + vmx->nested.msrs.procbased_ctls_high))) + return -EINVAL; + + if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && + CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, + vmx->nested.msrs.secondary_ctls_low, + vmx->nested.msrs.secondary_ctls_high))) + return -EINVAL; + + if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || + nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || + nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || + nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || + nested_vmx_check_apic_access_controls(vcpu, vmcs12) || + nested_vmx_check_apicv_controls(vcpu, vmcs12) || + nested_vmx_check_nmi_controls(vmcs12) || + nested_vmx_check_pml_controls(vcpu, vmcs12) || + nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || + nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || + nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || + CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) + return -EINVAL; + + if (!nested_cpu_has_preemption_timer(vmcs12) && + nested_cpu_has_save_preemption_timer(vmcs12)) + return -EINVAL; + + if (nested_cpu_has_ept(vmcs12) && + CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) + return -EINVAL; + + if (nested_cpu_has_vmfunc(vmcs12)) { + if (CC(vmcs12->vm_function_control & + ~vmx->nested.msrs.vmfunc_controls)) + return -EINVAL; + + if (nested_cpu_has_eptp_switching(vmcs12)) { + if (CC(!nested_cpu_has_ept(vmcs12)) || + CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) + return -EINVAL; + } + } + + return 0; +} + +/* + * Checks related to VM-Exit Control Fields + */ +static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, + vmx->nested.msrs.exit_ctls_low, + vmx->nested.msrs.exit_ctls_high)) || + CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) + return -EINVAL; + + return 0; +} + +/* + * Checks related to VM-Entry Control Fields + */ +static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, + vmx->nested.msrs.entry_ctls_low, + vmx->nested.msrs.entry_ctls_high))) + return -EINVAL; + + /* + * From the Intel SDM, volume 3: + * Fields relevant to VM-entry event injection must be set properly. + * These fields are the VM-entry interruption-information field, the + * VM-entry exception error code, and the VM-entry instruction length. + */ + if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { + u32 intr_info = vmcs12->vm_entry_intr_info_field; + u8 vector = intr_info & INTR_INFO_VECTOR_MASK; + u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; + bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; + bool should_have_error_code; + bool urg = nested_cpu_has2(vmcs12, + SECONDARY_EXEC_UNRESTRICTED_GUEST); + bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; + + /* VM-entry interruption-info field: interruption type */ + if (CC(intr_type == INTR_TYPE_RESERVED) || + CC(intr_type == INTR_TYPE_OTHER_EVENT && + !nested_cpu_supports_monitor_trap_flag(vcpu))) + return -EINVAL; + + /* VM-entry interruption-info field: vector */ + if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || + CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || + CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) + return -EINVAL; + + /* VM-entry interruption-info field: deliver error code */ + should_have_error_code = + intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && + x86_exception_has_error_code(vector); + if (CC(has_error_code != should_have_error_code)) + return -EINVAL; + + /* VM-entry exception error code */ + if (CC(has_error_code && + vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) + return -EINVAL; + + /* VM-entry interruption-info field: reserved bits */ + if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) + return -EINVAL; + + /* VM-entry instruction length */ + switch (intr_type) { + case INTR_TYPE_SOFT_EXCEPTION: + case INTR_TYPE_SOFT_INTR: + case INTR_TYPE_PRIV_SW_EXCEPTION: + if (CC(vmcs12->vm_entry_instruction_len > 15) || + CC(vmcs12->vm_entry_instruction_len == 0 && + CC(!nested_cpu_has_zero_length_injection(vcpu)))) + return -EINVAL; + } + } + + if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (nested_check_vm_execution_controls(vcpu, vmcs12) || + nested_check_vm_exit_controls(vcpu, vmcs12) || + nested_check_vm_entry_controls(vcpu, vmcs12)) + return -EINVAL; + + if (guest_cpuid_has_evmcs(vcpu)) + return nested_evmcs_check_controls(vmcs12); + + return 0; +} + +static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ +#ifdef CONFIG_X86_64 + if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != + !!(vcpu->arch.efer & EFER_LMA))) + return -EINVAL; +#endif + return 0; +} + +static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + bool ia32e; + + if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || + CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || + CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3))) + return -EINVAL; + + if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || + CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) + return -EINVAL; + + if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && + CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) + return -EINVAL; + + if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && + CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), + vmcs12->host_ia32_perf_global_ctrl))) + return -EINVAL; + +#ifdef CONFIG_X86_64 + ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); +#else + ia32e = false; +#endif + + if (ia32e) { + if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) + return -EINVAL; + } else { + if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || + CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || + CC((vmcs12->host_rip) >> 32)) + return -EINVAL; + } + + if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || + CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || + CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || + CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || + CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || + CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || + CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || + CC(vmcs12->host_cs_selector == 0) || + CC(vmcs12->host_tr_selector == 0) || + CC(vmcs12->host_ss_selector == 0 && !ia32e)) + return -EINVAL; + + if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || + CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || + CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || + CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || + CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || + CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) + return -EINVAL; + + /* + * If the load IA32_EFER VM-exit control is 1, bits reserved in the + * IA32_EFER MSR must be 0 in the field for that register. In addition, + * the values of the LMA and LME bits in the field must each be that of + * the host address-space size VM-exit control. + */ + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { + if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || + CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || + CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) + return -EINVAL; + } + + return 0; +} + +static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; + struct vmcs_hdr hdr; + + if (vmcs12->vmcs_link_pointer == INVALID_GPA) + return 0; + + if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) + return -EINVAL; + + if (ghc->gpa != vmcs12->vmcs_link_pointer && + CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, + vmcs12->vmcs_link_pointer, VMCS12_SIZE))) + return -EINVAL; + + if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, + offsetof(struct vmcs12, hdr), + sizeof(hdr)))) + return -EINVAL; + + if (CC(hdr.revision_id != VMCS12_REVISION) || + CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) + return -EINVAL; + + return 0; +} + +/* + * Checks related to Guest Non-register State + */ +static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) +{ + if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && + vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && + vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, + enum vm_entry_failure_code *entry_failure_code) +{ + bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE); + + *entry_failure_code = ENTRY_FAIL_DEFAULT; + + if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || + CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) + return -EINVAL; + + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && + CC(!kvm_dr7_valid(vmcs12->guest_dr7))) + return -EINVAL; + + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && + CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) + return -EINVAL; + + if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { + *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; + return -EINVAL; + } + + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && + CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), + vmcs12->guest_ia32_perf_global_ctrl))) + return -EINVAL; + + if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG)) + return -EINVAL; + + if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) || + CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG))) + return -EINVAL; + + /* + * If the load IA32_EFER VM-entry control is 1, the following checks + * are performed on the field for the IA32_EFER MSR: + * - Bits reserved in the IA32_EFER MSR must be 0. + * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of + * the IA-32e mode guest VM-exit control. It must also be identical + * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to + * CR0.PG) is 1. + */ + if (to_vmx(vcpu)->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { + if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || + CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || + CC(((vmcs12->guest_cr0 & X86_CR0_PG) && + ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) + return -EINVAL; + } + + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && + (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || + CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) + return -EINVAL; + + if (nested_check_guest_non_reg_state(vmcs12)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long cr3, cr4; + bool vm_fail; + + if (!nested_early_check) + return 0; + + if (vmx->msr_autoload.host.nr) + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); + if (vmx->msr_autoload.guest.nr) + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); + + preempt_disable(); + + vmx_prepare_switch_to_guest(vcpu); + + /* + * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, + * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to + * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. + * there is no need to preserve other bits or save/restore the field. + */ + vmcs_writel(GUEST_RFLAGS, 0); + + cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { + vmcs_writel(HOST_CR3, cr3); + vmx->loaded_vmcs->host_state.cr3 = cr3; + } + + cr4 = cr4_read_shadow(); + if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { + vmcs_writel(HOST_CR4, cr4); + vmx->loaded_vmcs->host_state.cr4 = cr4; + } + + vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, + __vmx_vcpu_run_flags(vmx)); + + if (vmx->msr_autoload.host.nr) + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); + if (vmx->msr_autoload.guest.nr) + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); + + if (vm_fail) { + u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); + + preempt_enable(); + + trace_kvm_nested_vmenter_failed( + "early hardware check VM-instruction error: ", error); + WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); + return 1; + } + + /* + * VMExit clears RFLAGS.IF and DR7, even on a consistency check. + */ + if (hw_breakpoint_active()) + set_debugreg(__this_cpu_read(cpu_dr7), 7); + local_irq_enable(); + preempt_enable(); + + /* + * A non-failing VMEntry means we somehow entered guest mode with + * an illegal RIP, and that's just the tip of the iceberg. There + * is no telling what memory has been modified or what state has + * been exposed to unknown code. Hitting this all but guarantees + * a (very critical) hardware issue. + */ + WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & + VMX_EXIT_REASONS_FAILED_VMENTRY)); + + return 0; +} + +static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * hv_evmcs may end up being not mapped after migration (when + * L2 was running), map it here to make sure vmcs12 changes are + * properly reflected. + */ + if (guest_cpuid_has_evmcs(vcpu) && + vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { + enum nested_evmptrld_status evmptrld_status = + nested_vmx_handle_enlightened_vmptrld(vcpu, false); + + if (evmptrld_status == EVMPTRLD_VMFAIL || + evmptrld_status == EVMPTRLD_ERROR) + return false; + + /* + * Post migration VMCS12 always provides the most actual + * information, copy it to eVMCS upon entry. + */ + vmx->nested.need_vmcs12_to_shadow_sync = true; + } + + return true; +} + +static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_host_map *map; + + if (!vcpu->arch.pdptrs_from_userspace && + !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { + /* + * Reload the guest's PDPTRs since after a migration + * the guest CR3 might be restored prior to setting the nested + * state which can lead to a load of wrong PDPTRs. + */ + if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) + return false; + } + + + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + map = &vmx->nested.apic_access_page_map; + + if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) { + vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn)); + } else { + pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n", + __func__); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return false; + } + } + + if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { + map = &vmx->nested.virtual_apic_map; + + if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); + } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && + nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && + !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + /* + * The processor will never use the TPR shadow, simply + * clear the bit from the execution control. Such a + * configuration is useless, but it happens in tests. + * For any other configuration, failing the vm entry is + * _not_ what the processor does but it's basically the + * only possibility we have. + */ + exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); + } else { + /* + * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to + * force VM-Entry to fail. + */ + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); + } + } + + if (nested_cpu_has_posted_intr(vmcs12)) { + map = &vmx->nested.pi_desc_map; + + if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { + vmx->nested.pi_desc = + (struct pi_desc *)(((void *)map->hva) + + offset_in_page(vmcs12->posted_intr_desc_addr)); + vmcs_write64(POSTED_INTR_DESC_ADDR, + pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); + } else { + /* + * Defer the KVM_INTERNAL_EXIT until KVM tries to + * access the contents of the VMCS12 posted interrupt + * descriptor. (Note that KVM may do this when it + * should not, per the architectural specification.) + */ + vmx->nested.pi_desc = NULL; + pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); + } + } + if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) + exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); + else + exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); + + return true; +} + +static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) +{ + if (!nested_get_evmcs_page(vcpu)) { + pr_debug_ratelimited("%s: enlightened vmptrld failed\n", + __func__); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + + return false; + } + + if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) + return false; + + return true; +} + +static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + struct vmcs12 *vmcs12; + struct vcpu_vmx *vmx = to_vmx(vcpu); + gpa_t dst; + + if (WARN_ON_ONCE(!is_guest_mode(vcpu))) + return 0; + + if (WARN_ON_ONCE(vmx->nested.pml_full)) + return 1; + + /* + * Check if PML is enabled for the nested guest. Whether eptp bit 6 is + * set is already checked as part of A/D emulation. + */ + vmcs12 = get_vmcs12(vcpu); + if (!nested_cpu_has_pml(vmcs12)) + return 0; + + if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { + vmx->nested.pml_full = true; + return 1; + } + + gpa &= ~0xFFFull; + dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; + + if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, + offset_in_page(dst), sizeof(gpa))) + return 0; + + vmcs12->guest_pml_index--; + + return 0; +} + +/* + * Intel's VMX Instruction Reference specifies a common set of prerequisites + * for running VMX instructions (except VMXON, whose prerequisites are + * slightly different). It also specifies what exception to inject otherwise. + * Note that many of these exceptions have priority over VM exits, so they + * don't have to be checked again here. + */ +static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) +{ + if (!to_vmx(vcpu)->nested.vmxon) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 0; + } + + if (vmx_get_cpl(vcpu)) { + kvm_inject_gp(vcpu, 0); + return 0; + } + + return 1; +} + +static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) +{ + u8 rvi = vmx_get_rvi(); + u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); + + return ((rvi & 0xf0) > (vppr & 0xf0)); +} + +static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12); + +/* + * If from_vmentry is false, this is being called from state restore (either RSM + * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. + * + * Returns: + * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode + * NVMX_VMENTRY_VMFAIL: Consistency check VMFail + * NVMX_VMENTRY_VMEXIT: Consistency check VMExit + * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error + */ +enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, + bool from_vmentry) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + enum vm_entry_failure_code entry_failure_code; + bool evaluate_pending_interrupts; + union vmx_exit_reason exit_reason = { + .basic = EXIT_REASON_INVALID_STATE, + .failed_vmentry = 1, + }; + u32 failed_index; + + trace_kvm_nested_vmenter(kvm_rip_read(vcpu), + vmx->nested.current_vmptr, + vmcs12->guest_rip, + vmcs12->guest_intr_status, + vmcs12->vm_entry_intr_info_field, + vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT, + vmcs12->ept_pointer, + vmcs12->guest_cr3, + KVM_ISA_VMX); + + kvm_service_local_tlb_flush_requests(vcpu); + + evaluate_pending_interrupts = exec_controls_get(vmx) & + (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); + if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) + evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); + if (!evaluate_pending_interrupts) + evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu); + + if (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) + vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + if (kvm_mpx_supported() && + (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) + vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + + /* + * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* + * nested early checks are disabled. In the event of a "late" VM-Fail, + * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its + * software model to the pre-VMEntry host state. When EPT is disabled, + * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes + * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing + * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to + * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested + * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is + * guaranteed to be overwritten with a shadow CR3 prior to re-entering + * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as + * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks + * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail + * path would need to manually save/restore vmcs01.GUEST_CR3. + */ + if (!enable_ept && !nested_early_check) + vmcs_writel(GUEST_CR3, vcpu->arch.cr3); + + vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); + + prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); + + if (from_vmentry) { + if (unlikely(!nested_get_vmcs12_pages(vcpu))) { + vmx_switch_vmcs(vcpu, &vmx->vmcs01); + return NVMX_VMENTRY_KVM_INTERNAL_ERROR; + } + + if (nested_vmx_check_vmentry_hw(vcpu)) { + vmx_switch_vmcs(vcpu, &vmx->vmcs01); + return NVMX_VMENTRY_VMFAIL; + } + + if (nested_vmx_check_guest_state(vcpu, vmcs12, + &entry_failure_code)) { + exit_reason.basic = EXIT_REASON_INVALID_STATE; + vmcs12->exit_qualification = entry_failure_code; + goto vmentry_fail_vmexit; + } + } + + enter_guest_mode(vcpu); + + if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { + exit_reason.basic = EXIT_REASON_INVALID_STATE; + vmcs12->exit_qualification = entry_failure_code; + goto vmentry_fail_vmexit_guest_mode; + } + + if (from_vmentry) { + failed_index = nested_vmx_load_msr(vcpu, + vmcs12->vm_entry_msr_load_addr, + vmcs12->vm_entry_msr_load_count); + if (failed_index) { + exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; + vmcs12->exit_qualification = failed_index; + goto vmentry_fail_vmexit_guest_mode; + } + } else { + /* + * The MMU is not initialized to point at the right entities yet and + * "get pages" would need to read data from the guest (i.e. we will + * need to perform gpa to hpa translation). Request a call + * to nested_get_vmcs12_pages before the next VM-entry. The MSRs + * have already been set at vmentry time and should not be reset. + */ + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); + } + + /* + * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI + * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can + * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit + * unconditionally. + */ + if (unlikely(evaluate_pending_interrupts)) + kvm_make_request(KVM_REQ_EVENT, vcpu); + + /* + * Do not start the preemption timer hrtimer until after we know + * we are successful, so that only nested_vmx_vmexit needs to cancel + * the timer. + */ + vmx->nested.preemption_timer_expired = false; + if (nested_cpu_has_preemption_timer(vmcs12)) { + u64 timer_value = vmx_calc_preemption_timer_value(vcpu); + vmx_start_preemption_timer(vcpu, timer_value); + } + + /* + * Note no nested_vmx_succeed or nested_vmx_fail here. At this point + * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet + * returned as far as L1 is concerned. It will only return (and set + * the success flag) when L2 exits (see nested_vmx_vmexit()). + */ + return NVMX_VMENTRY_SUCCESS; + + /* + * A failed consistency check that leads to a VMExit during L1's + * VMEnter to L2 is a variation of a normal VMexit, as explained in + * 26.7 "VM-entry failures during or after loading guest state". + */ +vmentry_fail_vmexit_guest_mode: + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) + vcpu->arch.tsc_offset -= vmcs12->tsc_offset; + leave_guest_mode(vcpu); + +vmentry_fail_vmexit: + vmx_switch_vmcs(vcpu, &vmx->vmcs01); + + if (!from_vmentry) + return NVMX_VMENTRY_VMEXIT; + + load_vmcs12_host_state(vcpu, vmcs12); + vmcs12->vm_exit_reason = exit_reason.full; + if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + vmx->nested.need_vmcs12_to_shadow_sync = true; + return NVMX_VMENTRY_VMEXIT; +} + +/* + * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 + * for running an L2 nested guest. + */ +static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) +{ + struct vmcs12 *vmcs12; + enum nvmx_vmentry_status status; + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); + enum nested_evmptrld_status evmptrld_status; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); + if (evmptrld_status == EVMPTRLD_ERROR) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); + + if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) + return nested_vmx_failInvalid(vcpu); + + if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) && + vmx->nested.current_vmptr == INVALID_GPA)) + return nested_vmx_failInvalid(vcpu); + + vmcs12 = get_vmcs12(vcpu); + + /* + * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact + * that there *is* a valid VMCS pointer, RFLAGS.CF is set + * rather than RFLAGS.ZF, and no error number is stored to the + * VM-instruction error field. + */ + if (CC(vmcs12->hdr.shadow_vmcs)) + return nested_vmx_failInvalid(vcpu); + + if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { + copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields); + /* Enlightened VMCS doesn't have launch state */ + vmcs12->launch_state = !launch; + } else if (enable_shadow_vmcs) { + copy_shadow_to_vmcs12(vmx); + } + + /* + * The nested entry process starts with enforcing various prerequisites + * on vmcs12 as required by the Intel SDM, and act appropriately when + * they fail: As the SDM explains, some conditions should cause the + * instruction to fail, while others will cause the instruction to seem + * to succeed, but return an EXIT_REASON_INVALID_STATE. + * To speed up the normal (success) code path, we should avoid checking + * for misconfigurations which will anyway be caught by the processor + * when using the merged vmcs02. + */ + if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) + return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); + + if (CC(vmcs12->launch_state == launch)) + return nested_vmx_fail(vcpu, + launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS + : VMXERR_VMRESUME_NONLAUNCHED_VMCS); + + if (nested_vmx_check_controls(vcpu, vmcs12)) + return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + + if (nested_vmx_check_address_space_size(vcpu, vmcs12)) + return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); + + if (nested_vmx_check_host_state(vcpu, vmcs12)) + return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); + + /* + * We're finally done with prerequisite checking, and can start with + * the nested entry. + */ + vmx->nested.nested_run_pending = 1; + vmx->nested.has_preemption_timer_deadline = false; + status = nested_vmx_enter_non_root_mode(vcpu, true); + if (unlikely(status != NVMX_VMENTRY_SUCCESS)) + goto vmentry_failed; + + /* Emulate processing of posted interrupts on VM-Enter. */ + if (nested_cpu_has_posted_intr(vmcs12) && + kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { + vmx->nested.pi_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); + } + + /* Hide L1D cache contents from the nested guest. */ + vmx->vcpu.arch.l1tf_flush_l1d = true; + + /* + * Must happen outside of nested_vmx_enter_non_root_mode() as it will + * also be used as part of restoring nVMX state for + * snapshot restore (migration). + * + * In this flow, it is assumed that vmcs12 cache was + * transferred as part of captured nVMX state and should + * therefore not be read from guest memory (which may not + * exist on destination host yet). + */ + nested_cache_shadow_vmcs12(vcpu, vmcs12); + + switch (vmcs12->guest_activity_state) { + case GUEST_ACTIVITY_HLT: + /* + * If we're entering a halted L2 vcpu and the L2 vcpu won't be + * awakened by event injection or by an NMI-window VM-exit or + * by an interrupt-window VM-exit, halt the vcpu. + */ + if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && + !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && + !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && + (vmcs12->guest_rflags & X86_EFLAGS_IF))) { + vmx->nested.nested_run_pending = 0; + return kvm_emulate_halt_noskip(vcpu); + } + break; + case GUEST_ACTIVITY_WAIT_SIPI: + vmx->nested.nested_run_pending = 0; + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + break; + default: + break; + } + + return 1; + +vmentry_failed: + vmx->nested.nested_run_pending = 0; + if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) + return 0; + if (status == NVMX_VMENTRY_VMEXIT) + return 1; + WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); + return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); +} + +/* + * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date + * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). + * This function returns the new value we should put in vmcs12.guest_cr0. + * It's not enough to just return the vmcs02 GUEST_CR0. Rather, + * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now + * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 + * didn't trap the bit, because if L1 did, so would L0). + * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have + * been modified by L2, and L1 knows it. So just leave the old value of + * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 + * isn't relevant, because if L0 traps this bit it can set it to anything. + * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have + * changed these bits, and therefore they need to be updated, but L0 + * didn't necessarily allow them to be changed in GUEST_CR0 - and rather + * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. + */ +static inline unsigned long +vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + return + /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | + /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | + /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | + vcpu->arch.cr0_guest_owned_bits)); +} + +static inline unsigned long +vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + return + /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | + /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | + /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | + vcpu->arch.cr4_guest_owned_bits)); +} + +static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, + u32 vm_exit_reason, u32 exit_intr_info) +{ + u32 idt_vectoring; + unsigned int nr; + + /* + * Per the SDM, VM-Exits due to double and triple faults are never + * considered to occur during event delivery, even if the double/triple + * fault is the result of an escalating vectoring issue. + * + * Note, the SDM qualifies the double fault behavior with "The original + * event results in a double-fault exception". It's unclear why the + * qualification exists since exits due to double fault can occur only + * while vectoring a different exception (injected events are never + * subject to interception), i.e. there's _always_ an original event. + * + * The SDM also uses NMI as a confusing example for the "original event + * causes the VM exit directly" clause. NMI isn't special in any way, + * the same rule applies to all events that cause an exit directly. + * NMI is an odd choice for the example because NMIs can only occur on + * instruction boundaries, i.e. they _can't_ occur during vectoring. + */ + if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT || + ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI && + is_double_fault(exit_intr_info))) { + vmcs12->idt_vectoring_info_field = 0; + } else if (vcpu->arch.exception.injected) { + nr = vcpu->arch.exception.vector; + idt_vectoring = nr | VECTORING_INFO_VALID_MASK; + + if (kvm_exception_is_soft(nr)) { + vmcs12->vm_exit_instruction_len = + vcpu->arch.event_exit_inst_len; + idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; + } else + idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; + + if (vcpu->arch.exception.has_error_code) { + idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; + vmcs12->idt_vectoring_error_code = + vcpu->arch.exception.error_code; + } + + vmcs12->idt_vectoring_info_field = idt_vectoring; + } else if (vcpu->arch.nmi_injected) { + vmcs12->idt_vectoring_info_field = + INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; + } else if (vcpu->arch.interrupt.injected) { + nr = vcpu->arch.interrupt.nr; + idt_vectoring = nr | VECTORING_INFO_VALID_MASK; + + if (vcpu->arch.interrupt.soft) { + idt_vectoring |= INTR_TYPE_SOFT_INTR; + vmcs12->vm_entry_instruction_len = + vcpu->arch.event_exit_inst_len; + } else + idt_vectoring |= INTR_TYPE_EXT_INTR; + + vmcs12->idt_vectoring_info_field = idt_vectoring; + } else { + vmcs12->idt_vectoring_info_field = 0; + } +} + + +void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + gfn_t gfn; + + /* + * Don't need to mark the APIC access page dirty; it is never + * written to by the CPU during APIC virtualization. + */ + + if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { + gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; + kvm_vcpu_mark_page_dirty(vcpu, gfn); + } + + if (nested_cpu_has_posted_intr(vmcs12)) { + gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; + kvm_vcpu_mark_page_dirty(vcpu, gfn); + } +} + +static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int max_irr; + void *vapic_page; + u16 status; + + if (!vmx->nested.pi_pending) + return 0; + + if (!vmx->nested.pi_desc) + goto mmio_needed; + + vmx->nested.pi_pending = false; + + if (!pi_test_and_clear_on(vmx->nested.pi_desc)) + return 0; + + max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); + if (max_irr != 256) { + vapic_page = vmx->nested.virtual_apic_map.hva; + if (!vapic_page) + goto mmio_needed; + + __kvm_apic_update_irr(vmx->nested.pi_desc->pir, + vapic_page, &max_irr); + status = vmcs_read16(GUEST_INTR_STATUS); + if ((u8)max_irr > ((u8)status & 0xff)) { + status &= ~0xff; + status |= (u8)max_irr; + vmcs_write16(GUEST_INTR_STATUS, status); + } + } + + nested_mark_vmcs12_pages_dirty(vcpu); + return 0; + +mmio_needed: + kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); + return -ENXIO; +} + +static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) +{ + struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; + u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned long exit_qual; + + if (ex->has_payload) { + exit_qual = ex->payload; + } else if (ex->vector == PF_VECTOR) { + exit_qual = vcpu->arch.cr2; + } else if (ex->vector == DB_VECTOR) { + exit_qual = vcpu->arch.dr6; + exit_qual &= ~DR6_BT; + exit_qual ^= DR6_ACTIVE_LOW; + } else { + exit_qual = 0; + } + + /* + * Unlike AMD's Paged Real Mode, which reports an error code on #PF + * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the + * "has error code" flags on VM-Exit if the CPU is in Real Mode. + */ + if (ex->has_error_code && is_protmode(vcpu)) { + /* + * Intel CPUs do not generate error codes with bits 31:16 set, + * and more importantly VMX disallows setting bits 31:16 in the + * injected error code for VM-Entry. Drop the bits to mimic + * hardware and avoid inducing failure on nested VM-Entry if L1 + * chooses to inject the exception back to L2. AMD CPUs _do_ + * generate "full" 32-bit error codes, so KVM allows userspace + * to inject exception error codes with bits 31:16 set. + */ + vmcs12->vm_exit_intr_error_code = (u16)ex->error_code; + intr_info |= INTR_INFO_DELIVER_CODE_MASK; + } + + if (kvm_exception_is_soft(ex->vector)) + intr_info |= INTR_TYPE_SOFT_EXCEPTION; + else + intr_info |= INTR_TYPE_HARD_EXCEPTION; + + if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && + vmx_get_nmi_mask(vcpu)) + intr_info |= INTR_INFO_UNBLOCK_NMI; + + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); +} + +/* + * Returns true if a debug trap is (likely) pending delivery. Infer the class + * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6). + * Using the payload is flawed because code breakpoints (fault-like) and data + * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e. + * this will return false positives if a to-be-injected code breakpoint #DB is + * pending (from KVM's perspective, but not "pending" across an instruction + * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it + * too is trap-like. + * + * KVM "works" despite these flaws as ICEBP isn't currently supported by the + * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the + * #DB has already happened), and MTF isn't marked pending on code breakpoints + * from the emulator (because such #DBs are fault-like and thus don't trigger + * actions that fire on instruction retire). + */ +static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex) +{ + if (!ex->pending || ex->vector != DB_VECTOR) + return 0; + + /* General Detect #DBs are always fault-like. */ + return ex->payload & ~DR6_BD; +} + +/* + * Returns true if there's a pending #DB exception that is lower priority than + * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by + * KVM, but could theoretically be injected by userspace. Note, this code is + * imperfect, see above. + */ +static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex) +{ + return vmx_get_pending_dbg_trap(ex) & ~DR6_BT; +} + +/* + * Certain VM-exits set the 'pending debug exceptions' field to indicate a + * recognized #DB (data or single-step) that has yet to be delivered. Since KVM + * represents these debug traps with a payload that is said to be compatible + * with the 'pending debug exceptions' field, write the payload to the VMCS + * field if a VM-exit is delivered before the debug trap. + */ +static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) +{ + unsigned long pending_dbg; + + pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception); + if (pending_dbg) + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg); +} + +static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) +{ + return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && + to_vmx(vcpu)->nested.preemption_timer_expired; +} + +static bool vmx_has_nested_events(struct kvm_vcpu *vcpu) +{ + return nested_vmx_preemption_timer_pending(vcpu) || + to_vmx(vcpu)->nested.mtf_pending; +} + +/* + * Per the Intel SDM's table "Priority Among Concurrent Events", with minor + * edits to fill in missing examples, e.g. #DB due to split-lock accesses, + * and less minor edits to splice in the priority of VMX Non-Root specific + * events, e.g. MTF and NMI/INTR-window exiting. + * + * 1 Hardware Reset and Machine Checks + * - RESET + * - Machine Check + * + * 2 Trap on Task Switch + * - T flag in TSS is set (on task switch) + * + * 3 External Hardware Interventions + * - FLUSH + * - STOPCLK + * - SMI + * - INIT + * + * 3.5 Monitor Trap Flag (MTF) VM-exit[1] + * + * 4 Traps on Previous Instruction + * - Breakpoints + * - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O + * breakpoint, or #DB due to a split-lock access) + * + * 4.3 VMX-preemption timer expired VM-exit + * + * 4.6 NMI-window exiting VM-exit[2] + * + * 5 Nonmaskable Interrupts (NMI) + * + * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery + * + * 6 Maskable Hardware Interrupts + * + * 7 Code Breakpoint Fault + * + * 8 Faults from Fetching Next Instruction + * - Code-Segment Limit Violation + * - Code Page Fault + * - Control protection exception (missing ENDBRANCH at target of indirect + * call or jump) + * + * 9 Faults from Decoding Next Instruction + * - Instruction length > 15 bytes + * - Invalid Opcode + * - Coprocessor Not Available + * + *10 Faults on Executing Instruction + * - Overflow + * - Bound error + * - Invalid TSS + * - Segment Not Present + * - Stack fault + * - General Protection + * - Data Page Fault + * - Alignment Check + * - x86 FPU Floating-point exception + * - SIMD floating-point exception + * - Virtualization exception + * - Control protection exception + * + * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs), + * INIT signals, and higher priority events take priority over MTF VM exits. + * MTF VM exits take priority over debug-trap exceptions and lower priority + * events. + * + * [2] Debug-trap exceptions and higher priority events take priority over VM exits + * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption + * timer take priority over VM exits caused by the "NMI-window exiting" + * VM-execution control and lower priority events. + * + * [3] Debug-trap exceptions and higher priority events take priority over VM exits + * caused by "NMI-window exiting". VM exits caused by this control take + * priority over non-maskable interrupts (NMIs) and lower priority events. + * + * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to + * the 1-setting of the "interrupt-window exiting" VM-execution control. Thus, + * non-maskable interrupts (NMIs) and higher priority events take priority over + * delivery of a virtual interrupt; delivery of a virtual interrupt takes + * priority over external interrupts and lower priority events. + */ +static int vmx_check_nested_events(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + struct vcpu_vmx *vmx = to_vmx(vcpu); + /* + * Only a pending nested run blocks a pending exception. If there is a + * previously injected event, the pending exception occurred while said + * event was being delivered and thus needs to be handled. + */ + bool block_nested_exceptions = vmx->nested.nested_run_pending; + /* + * New events (not exceptions) are only recognized at instruction + * boundaries. If an event needs reinjection, then KVM is handling a + * VM-Exit that occurred _during_ instruction execution; new events are + * blocked until the instruction completes. + */ + bool block_nested_events = block_nested_exceptions || + kvm_event_needs_reinjection(vcpu); + + if (lapic_in_kernel(vcpu) && + test_bit(KVM_APIC_INIT, &apic->pending_events)) { + if (block_nested_events) + return -EBUSY; + nested_vmx_update_pending_dbg(vcpu); + clear_bit(KVM_APIC_INIT, &apic->pending_events); + if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) + nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); + + /* MTF is discarded if the vCPU is in WFS. */ + vmx->nested.mtf_pending = false; + return 0; + } + + if (lapic_in_kernel(vcpu) && + test_bit(KVM_APIC_SIPI, &apic->pending_events)) { + if (block_nested_events) + return -EBUSY; + + clear_bit(KVM_APIC_SIPI, &apic->pending_events); + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { + nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, + apic->sipi_vector & 0xFFUL); + return 0; + } + /* Fallthrough, the SIPI is completely ignored. */ + } + + /* + * Process exceptions that are higher priority than Monitor Trap Flag: + * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but + * could theoretically come in from userspace), and ICEBP (INT1). + * + * TODO: SMIs have higher priority than MTF and trap-like #DBs (except + * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF + * across SMI/RSM as it should; that needs to be addressed in order to + * prioritize SMI over MTF and trap-like #DBs. + */ + if (vcpu->arch.exception_vmexit.pending && + !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) { + if (block_nested_exceptions) + return -EBUSY; + + nested_vmx_inject_exception_vmexit(vcpu); + return 0; + } + + if (vcpu->arch.exception.pending && + !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) { + if (block_nested_exceptions) + return -EBUSY; + goto no_vmexit; + } + + if (vmx->nested.mtf_pending) { + if (block_nested_events) + return -EBUSY; + nested_vmx_update_pending_dbg(vcpu); + nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); + return 0; + } + + if (vcpu->arch.exception_vmexit.pending) { + if (block_nested_exceptions) + return -EBUSY; + + nested_vmx_inject_exception_vmexit(vcpu); + return 0; + } + + if (vcpu->arch.exception.pending) { + if (block_nested_exceptions) + return -EBUSY; + goto no_vmexit; + } + + if (nested_vmx_preemption_timer_pending(vcpu)) { + if (block_nested_events) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); + return 0; + } + + if (vcpu->arch.smi_pending && !is_smm(vcpu)) { + if (block_nested_events) + return -EBUSY; + goto no_vmexit; + } + + if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { + if (block_nested_events) + return -EBUSY; + if (!nested_exit_on_nmi(vcpu)) + goto no_vmexit; + + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, + NMI_VECTOR | INTR_TYPE_NMI_INTR | + INTR_INFO_VALID_MASK, 0); + /* + * The NMI-triggered VM exit counts as injection: + * clear this one and block further NMIs. + */ + vcpu->arch.nmi_pending = 0; + vmx_set_nmi_mask(vcpu, true); + return 0; + } + + if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { + if (block_nested_events) + return -EBUSY; + if (!nested_exit_on_intr(vcpu)) + goto no_vmexit; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + return 0; + } + +no_vmexit: + return vmx_complete_nested_posted_interrupt(vcpu); +} + +static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) +{ + ktime_t remaining = + hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); + u64 value; + + if (ktime_to_ns(remaining) <= 0) + return 0; + + value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; + do_div(value, 1000000); + return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; +} + +static bool is_vmcs12_ext_field(unsigned long field) +{ + switch (field) { + case GUEST_ES_SELECTOR: + case GUEST_CS_SELECTOR: + case GUEST_SS_SELECTOR: + case GUEST_DS_SELECTOR: + case GUEST_FS_SELECTOR: + case GUEST_GS_SELECTOR: + case GUEST_LDTR_SELECTOR: + case GUEST_TR_SELECTOR: + case GUEST_ES_LIMIT: + case GUEST_CS_LIMIT: + case GUEST_SS_LIMIT: + case GUEST_DS_LIMIT: + case GUEST_FS_LIMIT: + case GUEST_GS_LIMIT: + case GUEST_LDTR_LIMIT: + case GUEST_TR_LIMIT: + case GUEST_GDTR_LIMIT: + case GUEST_IDTR_LIMIT: + case GUEST_ES_AR_BYTES: + case GUEST_DS_AR_BYTES: + case GUEST_FS_AR_BYTES: + case GUEST_GS_AR_BYTES: + case GUEST_LDTR_AR_BYTES: + case GUEST_TR_AR_BYTES: + case GUEST_ES_BASE: + case GUEST_CS_BASE: + case GUEST_SS_BASE: + case GUEST_DS_BASE: + case GUEST_FS_BASE: + case GUEST_GS_BASE: + case GUEST_LDTR_BASE: + case GUEST_TR_BASE: + case GUEST_GDTR_BASE: + case GUEST_IDTR_BASE: + case GUEST_PENDING_DBG_EXCEPTIONS: + case GUEST_BNDCFGS: + return true; + default: + break; + } + + return false; +} + +static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); + vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); + vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); + vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); + vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); + vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); + vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); + vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); + vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); + vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); + vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); + vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); + vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); + vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); + vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); + vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); + vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); + vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); + vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); + vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); + vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); + vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); + vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); + vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); + vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); + vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); + vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); + vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); + vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); + vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); + vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); + vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); + vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); + vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); + vmcs12->guest_pending_dbg_exceptions = + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); + + vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; +} + +static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int cpu; + + if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) + return; + + + WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); + + cpu = get_cpu(); + vmx->loaded_vmcs = &vmx->nested.vmcs02; + vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); + + sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); + + vmx->loaded_vmcs = &vmx->vmcs01; + vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); + put_cpu(); +} + +/* + * Update the guest state fields of vmcs12 to reflect changes that + * occurred while L2 was running. (The "IA-32e mode guest" bit of the + * VM-entry controls is also updated, since this is really a guest + * state bit.) + */ +static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); + + vmx->nested.need_sync_vmcs02_to_vmcs12_rare = + !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr); + + vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); + vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); + + vmcs12->guest_rsp = kvm_rsp_read(vcpu); + vmcs12->guest_rip = kvm_rip_read(vcpu); + vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); + + vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); + vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); + + vmcs12->guest_interruptibility_info = + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) + vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; + else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) + vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; + else + vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; + + if (nested_cpu_has_preemption_timer(vmcs12) && + vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && + !vmx->nested.nested_run_pending) + vmcs12->vmx_preemption_timer_value = + vmx_get_preemption_timer_value(vcpu); + + /* + * In some cases (usually, nested EPT), L2 is allowed to change its + * own CR3 without exiting. If it has changed it, we must keep it. + * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined + * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. + * + * Additionally, restore L2's PDPTR to vmcs12. + */ + if (enable_ept) { + vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); + if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { + vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); + vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); + vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); + vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); + } + } + + vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); + + if (nested_cpu_has_vid(vmcs12)) + vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); + + vmcs12->vm_entry_controls = + (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | + (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); + + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) + kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); + + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) + vmcs12->guest_ia32_efer = vcpu->arch.efer; +} + +/* + * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits + * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), + * and this function updates it to reflect the changes to the guest state while + * L2 was running (and perhaps made some exits which were handled directly by L0 + * without going back to L1), and to reflect the exit reason. + * Note that we do not have to copy here all VMCS fields, just those that + * could have changed by the L2 guest or the exit - i.e., the guest-state and + * exit-information fields only. Other fields are modified by L1 with VMWRITE, + * which already writes to vmcs12 directly. + */ +static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + u32 vm_exit_reason, u32 exit_intr_info, + unsigned long exit_qualification) +{ + /* update exit information fields: */ + vmcs12->vm_exit_reason = vm_exit_reason; + if (to_vmx(vcpu)->exit_reason.enclave_mode) + vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; + vmcs12->exit_qualification = exit_qualification; + + /* + * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched + * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other + * exit info fields are unmodified. + */ + if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { + vmcs12->launch_state = 1; + + /* vm_entry_intr_info_field is cleared on exit. Emulate this + * instead of reading the real value. */ + vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; + + /* + * Transfer the event that L0 or L1 may wanted to inject into + * L2 to IDT_VECTORING_INFO_FIELD. + */ + vmcs12_save_pending_event(vcpu, vmcs12, + vm_exit_reason, exit_intr_info); + + vmcs12->vm_exit_intr_info = exit_intr_info; + vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + + /* + * According to spec, there's no need to store the guest's + * MSRs if the exit is due to a VM-entry failure that occurs + * during or after loading the guest state. Since this exit + * does not fall in that category, we need to save the MSRs. + */ + if (nested_vmx_store_msr(vcpu, + vmcs12->vm_exit_msr_store_addr, + vmcs12->vm_exit_msr_store_count)) + nested_vmx_abort(vcpu, + VMX_ABORT_SAVE_GUEST_MSR_FAIL); + } +} + +/* + * A part of what we need to when the nested L2 guest exits and we want to + * run its L1 parent, is to reset L1's guest state to the host state specified + * in vmcs12. + * This function is to be called not only on normal nested exit, but also on + * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry + * Failures During or After Loading Guest State"). + * This function should be called when the active VMCS is L1's (vmcs01). + */ +static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + enum vm_entry_failure_code ignored; + struct kvm_segment seg; + + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) + vcpu->arch.efer = vmcs12->host_ia32_efer; + else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) + vcpu->arch.efer |= (EFER_LMA | EFER_LME); + else + vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); + vmx_set_efer(vcpu, vcpu->arch.efer); + + kvm_rsp_write(vcpu, vmcs12->host_rsp); + kvm_rip_write(vcpu, vmcs12->host_rip); + vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); + vmx_set_interrupt_shadow(vcpu, 0); + + /* + * Note that calling vmx_set_cr0 is important, even if cr0 hasn't + * actually changed, because vmx_set_cr0 refers to efer set above. + * + * CR0_GUEST_HOST_MASK is already set in the original vmcs01 + * (KVM doesn't change it); + */ + vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); + vmx_set_cr0(vcpu, vmcs12->host_cr0); + + /* Same as above - no reason to call set_cr4_guest_host_mask(). */ + vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); + vmx_set_cr4(vcpu, vmcs12->host_cr4); + + nested_ept_uninit_mmu_context(vcpu); + + /* + * Only PDPTE load can fail as the value of cr3 was checked on entry and + * couldn't have changed. + */ + if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); + + nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); + + vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); + vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); + vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); + vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); + vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); + vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); + vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); + + /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ + if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) + vmcs_write64(GUEST_BNDCFGS, 0); + + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { + vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); + vcpu->arch.pat = vmcs12->host_ia32_pat; + } + if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && + intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) + WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->host_ia32_perf_global_ctrl)); + + /* Set L1 segment info according to Intel SDM + 27.5.2 Loading Host Segment and Descriptor-Table Registers */ + seg = (struct kvm_segment) { + .base = 0, + .limit = 0xFFFFFFFF, + .selector = vmcs12->host_cs_selector, + .type = 11, + .present = 1, + .s = 1, + .g = 1 + }; + if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) + seg.l = 1; + else + seg.db = 1; + __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); + seg = (struct kvm_segment) { + .base = 0, + .limit = 0xFFFFFFFF, + .type = 3, + .present = 1, + .s = 1, + .db = 1, + .g = 1 + }; + seg.selector = vmcs12->host_ds_selector; + __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); + seg.selector = vmcs12->host_es_selector; + __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); + seg.selector = vmcs12->host_ss_selector; + __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); + seg.selector = vmcs12->host_fs_selector; + seg.base = vmcs12->host_fs_base; + __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); + seg.selector = vmcs12->host_gs_selector; + seg.base = vmcs12->host_gs_base; + __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); + seg = (struct kvm_segment) { + .base = vmcs12->host_tr_base, + .limit = 0x67, + .selector = vmcs12->host_tr_selector, + .type = 11, + .present = 1 + }; + __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); + + memset(&seg, 0, sizeof(seg)); + seg.unusable = 1; + __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); + + kvm_set_dr(vcpu, 7, 0x400); + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, + vmcs12->vm_exit_msr_load_count)) + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); + + to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); +} + +static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) +{ + struct vmx_uret_msr *efer_msr; + unsigned int i; + + if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) + return vmcs_read64(GUEST_IA32_EFER); + + if (cpu_has_load_ia32_efer()) + return host_efer; + + for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { + if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) + return vmx->msr_autoload.guest.val[i].value; + } + + efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); + if (efer_msr) + return efer_msr->data; + + return host_efer; +} + +static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmx_msr_entry g, h; + gpa_t gpa; + u32 i, j; + + vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); + + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { + /* + * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set + * as vmcs01.GUEST_DR7 contains a userspace defined value + * and vcpu->arch.dr7 is not squirreled away before the + * nested VMENTER (not worth adding a variable in nested_vmx). + */ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + kvm_set_dr(vcpu, 7, DR7_FIXED_1); + else + WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); + } + + /* + * Note that calling vmx_set_{efer,cr0,cr4} is important as they + * handle a variety of side effects to KVM's software model. + */ + vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); + + vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); + vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); + + vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); + vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); + + nested_ept_uninit_mmu_context(vcpu); + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); + + /* + * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs + * from vmcs01 (if necessary). The PDPTRs are not loaded on + * VMFail, like everything else we just need to ensure our + * software model is up-to-date. + */ + if (enable_ept && is_pae_paging(vcpu)) + ept_save_pdptrs(vcpu); + + kvm_mmu_reset_context(vcpu); + + /* + * This nasty bit of open coding is a compromise between blindly + * loading L1's MSRs using the exit load lists (incorrect emulation + * of VMFail), leaving the nested VM's MSRs in the software model + * (incorrect behavior) and snapshotting the modified MSRs (too + * expensive since the lists are unbound by hardware). For each + * MSR that was (prematurely) loaded from the nested VMEntry load + * list, reload it from the exit load list if it exists and differs + * from the guest value. The intent is to stuff host state as + * silently as possible, not to fully process the exit load list. + */ + for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { + gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); + if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { + pr_debug_ratelimited( + "%s read MSR index failed (%u, 0x%08llx)\n", + __func__, i, gpa); + goto vmabort; + } + + for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { + gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); + if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { + pr_debug_ratelimited( + "%s read MSR failed (%u, 0x%08llx)\n", + __func__, j, gpa); + goto vmabort; + } + if (h.index != g.index) + continue; + if (h.value == g.value) + break; + + if (nested_vmx_load_msr_check(vcpu, &h)) { + pr_debug_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, j, h.index, h.reserved); + goto vmabort; + } + + if (kvm_set_msr(vcpu, h.index, h.value)) { + pr_debug_ratelimited( + "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", + __func__, j, h.index, h.value); + goto vmabort; + } + } + } + + return; + +vmabort: + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); +} + +/* + * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 + * and modify vmcs12 to make it see what it would expect to see there if + * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) + */ +void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, + u32 exit_intr_info, unsigned long exit_qualification) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + /* Pending MTF traps are discarded on VM-Exit. */ + vmx->nested.mtf_pending = false; + + /* trying to cancel vmlaunch/vmresume is a bug */ + WARN_ON_ONCE(vmx->nested.nested_run_pending); + + if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { + /* + * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map + * Enlightened VMCS after migration and we still need to + * do that when something is forcing L2->L1 exit prior to + * the first L2 run. + */ + (void)nested_get_evmcs_page(vcpu); + } + + /* Service pending TLB flush requests for L2 before switching to L1. */ + kvm_service_local_tlb_flush_requests(vcpu); + + /* + * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between + * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are + * up-to-date before switching to L1. + */ + if (enable_ept && is_pae_paging(vcpu)) + vmx_ept_load_pdptrs(vcpu); + + leave_guest_mode(vcpu); + + if (nested_cpu_has_preemption_timer(vmcs12)) + hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); + + if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { + vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) + vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; + } + + if (likely(!vmx->fail)) { + sync_vmcs02_to_vmcs12(vcpu, vmcs12); + + if (vm_exit_reason != -1) + prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, + exit_intr_info, exit_qualification); + + /* + * Must happen outside of sync_vmcs02_to_vmcs12() as it will + * also be used to capture vmcs12 cache as part of + * capturing nVMX state for snapshot (migration). + * + * Otherwise, this flush will dirty guest memory at a + * point it is already assumed by user-space to be + * immutable. + */ + nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); + } else { + /* + * The only expected VM-instruction error is "VM entry with + * invalid control field(s)." Anything else indicates a + * problem with L0. And we should never get here with a + * VMFail of any type if early consistency checks are enabled. + */ + WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != + VMXERR_ENTRY_INVALID_CONTROL_FIELD); + WARN_ON_ONCE(nested_early_check); + } + + /* + * Drop events/exceptions that were queued for re-injection to L2 + * (picked up via vmx_complete_interrupts()), as well as exceptions + * that were pending for L2. Note, this must NOT be hoisted above + * prepare_vmcs12(), events/exceptions queued for re-injection need to + * be captured in vmcs12 (see vmcs12_save_pending_event()). + */ + vcpu->arch.nmi_injected = false; + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); + + vmx_switch_vmcs(vcpu, &vmx->vmcs01); + + /* + * If IBRS is advertised to the vCPU, KVM must flush the indirect + * branch predictors when transitioning from L2 to L1, as L1 expects + * hardware (KVM in this case) to provide separate predictor modes. + * Bare metal isolates VMX root (host) from VMX non-root (guest), but + * doesn't isolate different VMCSs, i.e. in this case, doesn't provide + * separate modes for L2 vs L1. + */ + if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + indirect_branch_prediction_barrier(); + + /* Update any VMCS fields that might have changed while L2 ran */ + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); + vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); + if (kvm_caps.has_tsc_control) + vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); + + if (vmx->nested.l1_tpr_threshold != -1) + vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); + + if (vmx->nested.change_vmcs01_virtual_apic_mode) { + vmx->nested.change_vmcs01_virtual_apic_mode = false; + vmx_set_virtual_apic_mode(vcpu); + } + + if (vmx->nested.update_vmcs01_cpu_dirty_logging) { + vmx->nested.update_vmcs01_cpu_dirty_logging = false; + vmx_update_cpu_dirty_logging(vcpu); + } + + /* Unpin physical memory we referred to in vmcs02 */ + kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); + kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); + kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); + vmx->nested.pi_desc = NULL; + + if (vmx->nested.reload_vmcs01_apic_access_page) { + vmx->nested.reload_vmcs01_apic_access_page = false; + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); + } + + if (vmx->nested.update_vmcs01_apicv_status) { + vmx->nested.update_vmcs01_apicv_status = false; + kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); + } + + if ((vm_exit_reason != -1) && + (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))) + vmx->nested.need_vmcs12_to_shadow_sync = true; + + /* in case we halted in L2 */ + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + + if (likely(!vmx->fail)) { + if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && + nested_exit_intr_ack_set(vcpu)) { + int irq = kvm_cpu_get_interrupt(vcpu); + WARN_ON(irq < 0); + vmcs12->vm_exit_intr_info = irq | + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; + } + + if (vm_exit_reason != -1) + trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, + vmcs12->exit_qualification, + vmcs12->idt_vectoring_info_field, + vmcs12->vm_exit_intr_info, + vmcs12->vm_exit_intr_error_code, + KVM_ISA_VMX); + + load_vmcs12_host_state(vcpu, vmcs12); + + return; + } + + /* + * After an early L2 VM-entry failure, we're now back + * in L1 which thinks it just finished a VMLAUNCH or + * VMRESUME instruction, so we need to set the failure + * flag and the VM-instruction error field of the VMCS + * accordingly, and skip the emulated instruction. + */ + (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + + /* + * Restore L1's host state to KVM's software model. We're here + * because a consistency check was caught by hardware, which + * means some amount of guest state has been propagated to KVM's + * model and needs to be unwound to the host's state. + */ + nested_vmx_restore_host_state(vcpu); + + vmx->fail = 0; +} + +static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) +{ + kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); + nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); +} + +/* + * Decode the memory-address operand of a vmx instruction, as recorded on an + * exit caused by such an instruction (run by a guest hypervisor). + * On success, returns 0. When the operand is invalid, returns 1 and throws + * #UD, #GP, or #SS. + */ +int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, + u32 vmx_instruction_info, bool wr, int len, gva_t *ret) +{ + gva_t off; + bool exn; + struct kvm_segment s; + + /* + * According to Vol. 3B, "Information for VM Exits Due to Instruction + * Execution", on an exit, vmx_instruction_info holds most of the + * addressing components of the operand. Only the displacement part + * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). + * For how an actual address is calculated from all these components, + * refer to Vol. 1, "Operand Addressing". + */ + int scaling = vmx_instruction_info & 3; + int addr_size = (vmx_instruction_info >> 7) & 7; + bool is_reg = vmx_instruction_info & (1u << 10); + int seg_reg = (vmx_instruction_info >> 15) & 7; + int index_reg = (vmx_instruction_info >> 18) & 0xf; + bool index_is_valid = !(vmx_instruction_info & (1u << 22)); + int base_reg = (vmx_instruction_info >> 23) & 0xf; + bool base_is_valid = !(vmx_instruction_info & (1u << 27)); + + if (is_reg) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + /* Addr = segment_base + offset */ + /* offset = base + [index * scale] + displacement */ + off = exit_qualification; /* holds the displacement */ + if (addr_size == 1) + off = (gva_t)sign_extend64(off, 31); + else if (addr_size == 0) + off = (gva_t)sign_extend64(off, 15); + if (base_is_valid) + off += kvm_register_read(vcpu, base_reg); + if (index_is_valid) + off += kvm_register_read(vcpu, index_reg) << scaling; + vmx_get_segment(vcpu, &s, seg_reg); + + /* + * The effective address, i.e. @off, of a memory operand is truncated + * based on the address size of the instruction. Note that this is + * the *effective address*, i.e. the address prior to accounting for + * the segment's base. + */ + if (addr_size == 1) /* 32 bit */ + off &= 0xffffffff; + else if (addr_size == 0) /* 16 bit */ + off &= 0xffff; + + /* Checks for #GP/#SS exceptions. */ + exn = false; + if (is_long_mode(vcpu)) { + /* + * The virtual/linear address is never truncated in 64-bit + * mode, e.g. a 32-bit address size can yield a 64-bit virtual + * address when using FS/GS with a non-zero base. + */ + if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) + *ret = s.base + off; + else + *ret = off; + + /* Long mode: #GP(0)/#SS(0) if the memory address is in a + * non-canonical form. This is the only check on the memory + * destination for long mode! + */ + exn = is_noncanonical_address(*ret, vcpu); + } else { + /* + * When not in long mode, the virtual/linear address is + * unconditionally truncated to 32 bits regardless of the + * address size. + */ + *ret = (s.base + off) & 0xffffffff; + + /* Protected mode: apply checks for segment validity in the + * following order: + * - segment type check (#GP(0) may be thrown) + * - usability check (#GP(0)/#SS(0)) + * - limit check (#GP(0)/#SS(0)) + */ + if (wr) + /* #GP(0) if the destination operand is located in a + * read-only data segment or any code segment. + */ + exn = ((s.type & 0xa) == 0 || (s.type & 8)); + else + /* #GP(0) if the source operand is located in an + * execute-only code segment + */ + exn = ((s.type & 0xa) == 8); + if (exn) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } + /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. + */ + exn = (s.unusable != 0); + + /* + * Protected mode: #GP(0)/#SS(0) if the memory operand is + * outside the segment limit. All CPUs that support VMX ignore + * limit checks for flat segments, i.e. segments with base==0, + * limit==0xffffffff and of type expand-up data or code. + */ + if (!(s.base == 0 && s.limit == 0xffffffff && + ((s.type & 8) || !(s.type & 4)))) + exn = exn || ((u64)off + len - 1 > s.limit); + } + if (exn) { + kvm_queue_exception_e(vcpu, + seg_reg == VCPU_SREG_SS ? + SS_VECTOR : GP_VECTOR, + 0); + return 1; + } + + return 0; +} + +static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, + int *ret) +{ + gva_t gva; + struct x86_exception e; + int r; + + if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), + vmcs_read32(VMX_INSTRUCTION_INFO), false, + sizeof(*vmpointer), &gva)) { + *ret = 1; + return -EINVAL; + } + + r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); + if (r != X86EMUL_CONTINUE) { + *ret = kvm_handle_memory_failure(vcpu, r, &e); + return -EINVAL; + } + + return 0; +} + +/* + * Allocate a shadow VMCS and associate it with the currently loaded + * VMCS, unless such a shadow VMCS already exists. The newly allocated + * VMCS is also VMCLEARed, so that it is ready for use. + */ +static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; + + /* + * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it + * when L1 executes VMXOFF or the vCPU is forced out of nested + * operation. VMXON faults if the CPU is already post-VMXON, so it + * should be impossible to already have an allocated shadow VMCS. KVM + * doesn't support virtualization of VMCS shadowing, so vmcs01 should + * always be the loaded VMCS. + */ + if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) + return loaded_vmcs->shadow_vmcs; + + loaded_vmcs->shadow_vmcs = alloc_vmcs(true); + if (loaded_vmcs->shadow_vmcs) + vmcs_clear(loaded_vmcs->shadow_vmcs); + + return loaded_vmcs->shadow_vmcs; +} + +static int enter_vmx_operation(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int r; + + r = alloc_loaded_vmcs(&vmx->nested.vmcs02); + if (r < 0) + goto out_vmcs02; + + vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); + if (!vmx->nested.cached_vmcs12) + goto out_cached_vmcs12; + + vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; + vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); + if (!vmx->nested.cached_shadow_vmcs12) + goto out_cached_shadow_vmcs12; + + if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) + goto out_shadow_vmcs; + + hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED); + vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + + vmx->nested.vpid02 = allocate_vpid(); + + vmx->nested.vmcs02_initialized = false; + vmx->nested.vmxon = true; + + if (vmx_pt_mode_is_host_guest()) { + vmx->pt_desc.guest.ctl = 0; + pt_update_intercept_for_msr(vcpu); + } + + return 0; + +out_shadow_vmcs: + kfree(vmx->nested.cached_shadow_vmcs12); + +out_cached_shadow_vmcs12: + kfree(vmx->nested.cached_vmcs12); + +out_cached_vmcs12: + free_loaded_vmcs(&vmx->nested.vmcs02); + +out_vmcs02: + return -ENOMEM; +} + +/* Emulate the VMXON instruction. */ +static int handle_vmxon(struct kvm_vcpu *vcpu) +{ + int ret; + gpa_t vmptr; + uint32_t revision; + struct vcpu_vmx *vmx = to_vmx(vcpu); + const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED + | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; + + /* + * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter + * the guest and so cannot rely on hardware to perform the check, + * which has higher priority than VM-Exit (see Intel SDM's pseudocode + * for VMXON). + * + * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 + * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't + * force any of the relevant guest state. For a restricted guest, KVM + * does force CR0.PE=1, but only to also force VM86 in order to emulate + * Real Mode, and so there's no need to check CR0.PE manually. + */ + if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + /* + * The CPL is checked for "not in VMX operation" and for "in VMX root", + * and has higher priority than the VM-Fail due to being post-VMXON, + * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root, + * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits + * from L2 to L1, i.e. there's no need to check for the vCPU being in + * VMX non-root. + * + * Forwarding the VM-Exit unconditionally, i.e. without performing the + * #UD checks (see above), is functionally ok because KVM doesn't allow + * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's + * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are + * missed by hardware due to shadowing CR0 and/or CR4. + */ + if (vmx_get_cpl(vcpu)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + if (vmx->nested.vmxon) + return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); + + /* + * Invalid CR0/CR4 generates #GP. These checks are performed if and + * only if the vCPU isn't already in VMX operation, i.e. effectively + * have lower priority than the VM-Fail above. + */ + if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || + !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) + != VMXON_NEEDED_FEATURES) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) + return ret; + + /* + * SDM 3: 24.11.5 + * The first 4 bytes of VMXON region contain the supported + * VMCS revision identifier + * + * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; + * which replaces physical address width with 32 + */ + if (!page_address_valid(vcpu, vmptr)) + return nested_vmx_failInvalid(vcpu); + + if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || + revision != VMCS12_REVISION) + return nested_vmx_failInvalid(vcpu); + + vmx->nested.vmxon_ptr = vmptr; + ret = enter_vmx_operation(vcpu); + if (ret) + return ret; + + return nested_vmx_succeed(vcpu); +} + +static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vmx->nested.current_vmptr == INVALID_GPA) + return; + + copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); + + if (enable_shadow_vmcs) { + /* copy to memory all shadowed fields in case + they were modified */ + copy_shadow_to_vmcs12(vmx); + vmx_disable_shadow_vmcs(vmx); + } + vmx->nested.posted_intr_nv = -1; + + /* Flush VMCS12 to guest memory */ + kvm_vcpu_write_guest_page(vcpu, + vmx->nested.current_vmptr >> PAGE_SHIFT, + vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); + + kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); + + vmx->nested.current_vmptr = INVALID_GPA; +} + +/* Emulate the VMXOFF instruction */ +static int handle_vmxoff(struct kvm_vcpu *vcpu) +{ + if (!nested_vmx_check_permission(vcpu)) + return 1; + + free_nested(vcpu); + + if (kvm_apic_has_pending_init_or_sipi(vcpu)) + kvm_make_request(KVM_REQ_EVENT, vcpu); + + return nested_vmx_succeed(vcpu); +} + +/* Emulate the VMCLEAR instruction */ +static int handle_vmclear(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 zero = 0; + gpa_t vmptr; + u64 evmcs_gpa; + int r; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) + return r; + + if (!page_address_valid(vcpu, vmptr)) + return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); + + if (vmptr == vmx->nested.vmxon_ptr) + return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); + + /* + * When Enlightened VMEntry is enabled on the calling CPU we treat + * memory area pointer by vmptr as Enlightened VMCS (as there's no good + * way to distinguish it from VMCS12) and we must not corrupt it by + * writing to the non-existent 'launch_state' field. The area doesn't + * have to be the currently active EVMCS on the calling CPU and there's + * nothing KVM has to do to transition it from 'active' to 'non-active' + * state. It is possible that the area will stay mapped as + * vmx->nested.hv_evmcs but this shouldn't be a problem. + */ + if (likely(!guest_cpuid_has_evmcs(vcpu) || + !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) { + if (vmptr == vmx->nested.current_vmptr) + nested_release_vmcs12(vcpu); + + kvm_vcpu_write_guest(vcpu, + vmptr + offsetof(struct vmcs12, + launch_state), + &zero, sizeof(zero)); + } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) { + nested_release_evmcs(vcpu); + } + + return nested_vmx_succeed(vcpu); +} + +/* Emulate the VMLAUNCH instruction */ +static int handle_vmlaunch(struct kvm_vcpu *vcpu) +{ + return nested_vmx_run(vcpu, true); +} + +/* Emulate the VMRESUME instruction */ +static int handle_vmresume(struct kvm_vcpu *vcpu) +{ + + return nested_vmx_run(vcpu, false); +} + +static int handle_vmread(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) + : get_vmcs12(vcpu); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); + u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct x86_exception e; + unsigned long field; + u64 value; + gva_t gva = 0; + short offset; + int len, r; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + /* Decode instruction info and find the field to read */ + field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); + + if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { + /* + * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, + * any VMREAD sets the ALU flags for VMfailInvalid. + */ + if (vmx->nested.current_vmptr == INVALID_GPA || + (is_guest_mode(vcpu) && + get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) + return nested_vmx_failInvalid(vcpu); + + offset = get_vmcs12_field_offset(field); + if (offset < 0) + return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); + + if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) + copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); + + /* Read the field, zero-extended to a u64 value */ + value = vmcs12_read_any(vmcs12, field, offset); + } else { + /* + * Hyper-V TLFS (as of 6.0b) explicitly states, that while an + * enlightened VMCS is active VMREAD/VMWRITE instructions are + * unsupported. Unfortunately, certain versions of Windows 11 + * don't comply with this requirement which is not enforced in + * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a + * workaround, as misbehaving guests will panic on VM-Fail. + * Note, enlightened VMCS is incompatible with shadow VMCS so + * all VMREADs from L2 should go to L1. + */ + if (WARN_ON_ONCE(is_guest_mode(vcpu))) + return nested_vmx_failInvalid(vcpu); + + offset = evmcs_field_offset(field, NULL); + if (offset < 0) + return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); + + /* Read the field, zero-extended to a u64 value */ + value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset); + } + + /* + * Now copy part of this value to register or memory, as requested. + * Note that the number of bits actually copied is 32 or 64 depending + * on the guest's mode (32 or 64 bit), not on the given field's length. + */ + if (instr_info & BIT(10)) { + kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); + } else { + len = is_64_bit_mode(vcpu) ? 8 : 4; + if (get_vmx_mem_address(vcpu, exit_qualification, + instr_info, true, len, &gva)) + return 1; + /* _system ok, nested_vmx_check_permission has verified cpl=0 */ + r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); + if (r != X86EMUL_CONTINUE) + return kvm_handle_memory_failure(vcpu, r, &e); + } + + return nested_vmx_succeed(vcpu); +} + +static bool is_shadow_field_rw(unsigned long field) +{ + switch (field) { +#define SHADOW_FIELD_RW(x, y) case x: +#include "vmcs_shadow_fields.h" + return true; + default: + break; + } + return false; +} + +static bool is_shadow_field_ro(unsigned long field) +{ + switch (field) { +#define SHADOW_FIELD_RO(x, y) case x: +#include "vmcs_shadow_fields.h" + return true; + default: + break; + } + return false; +} + +static int handle_vmwrite(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) + : get_vmcs12(vcpu); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); + u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct x86_exception e; + unsigned long field; + short offset; + gva_t gva; + int len, r; + + /* + * The value to write might be 32 or 64 bits, depending on L1's long + * mode, and eventually we need to write that into a field of several + * possible lengths. The code below first zero-extends the value to 64 + * bit (value), and then copies only the appropriate number of + * bits into the vmcs12 field. + */ + u64 value = 0; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + /* + * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, + * any VMWRITE sets the ALU flags for VMfailInvalid. + */ + if (vmx->nested.current_vmptr == INVALID_GPA || + (is_guest_mode(vcpu) && + get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) + return nested_vmx_failInvalid(vcpu); + + if (instr_info & BIT(10)) + value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); + else { + len = is_64_bit_mode(vcpu) ? 8 : 4; + if (get_vmx_mem_address(vcpu, exit_qualification, + instr_info, false, len, &gva)) + return 1; + r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); + if (r != X86EMUL_CONTINUE) + return kvm_handle_memory_failure(vcpu, r, &e); + } + + field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); + + offset = get_vmcs12_field_offset(field); + if (offset < 0) + return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); + + /* + * If the vCPU supports "VMWRITE to any supported field in the + * VMCS," then the "read-only" fields are actually read/write. + */ + if (vmcs_field_readonly(field) && + !nested_cpu_has_vmwrite_any_field(vcpu)) + return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); + + /* + * Ensure vmcs12 is up-to-date before any VMWRITE that dirties + * vmcs12, else we may crush a field or consume a stale value. + */ + if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) + copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); + + /* + * Some Intel CPUs intentionally drop the reserved bits of the AR byte + * fields on VMWRITE. Emulate this behavior to ensure consistent KVM + * behavior regardless of the underlying hardware, e.g. if an AR_BYTE + * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD + * from L1 will return a different value than VMREAD from L2 (L1 sees + * the stripped down value, L2 sees the full value as stored by KVM). + */ + if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) + value &= 0x1f0ff; + + vmcs12_write_any(vmcs12, field, offset, value); + + /* + * Do not track vmcs12 dirty-state if in guest-mode as we actually + * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated + * by L1 without a vmexit are always updated in the vmcs02, i.e. don't + * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. + */ + if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { + /* + * L1 can read these fields without exiting, ensure the + * shadow VMCS is up-to-date. + */ + if (enable_shadow_vmcs && is_shadow_field_ro(field)) { + preempt_disable(); + vmcs_load(vmx->vmcs01.shadow_vmcs); + + __vmcs_writel(field, value); + + vmcs_clear(vmx->vmcs01.shadow_vmcs); + vmcs_load(vmx->loaded_vmcs->vmcs); + preempt_enable(); + } + vmx->nested.dirty_vmcs12 = true; + } + + return nested_vmx_succeed(vcpu); +} + +static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) +{ + vmx->nested.current_vmptr = vmptr; + if (enable_shadow_vmcs) { + secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); + vmcs_write64(VMCS_LINK_POINTER, + __pa(vmx->vmcs01.shadow_vmcs)); + vmx->nested.need_vmcs12_to_shadow_sync = true; + } + vmx->nested.dirty_vmcs12 = true; + vmx->nested.force_msr_bitmap_recalc = true; +} + +/* Emulate the VMPTRLD instruction */ +static int handle_vmptrld(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + gpa_t vmptr; + int r; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) + return r; + + if (!page_address_valid(vcpu, vmptr)) + return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); + + if (vmptr == vmx->nested.vmxon_ptr) + return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); + + /* Forbid normal VMPTRLD if Enlightened version was used */ + if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + return 1; + + if (vmx->nested.current_vmptr != vmptr) { + struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; + struct vmcs_hdr hdr; + + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { + /* + * Reads from an unbacked page return all 1s, + * which means that the 32 bits located at the + * given physical address won't match the required + * VMCS12_REVISION identifier. + */ + return nested_vmx_fail(vcpu, + VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); + } + + if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, + offsetof(struct vmcs12, hdr), + sizeof(hdr))) { + return nested_vmx_fail(vcpu, + VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); + } + + if (hdr.revision_id != VMCS12_REVISION || + (hdr.shadow_vmcs && + !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { + return nested_vmx_fail(vcpu, + VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); + } + + nested_release_vmcs12(vcpu); + + /* + * Load VMCS12 from guest memory since it is not already + * cached. + */ + if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, + VMCS12_SIZE)) { + return nested_vmx_fail(vcpu, + VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); + } + + set_current_vmptr(vmx, vmptr); + } + + return nested_vmx_succeed(vcpu); +} + +/* Emulate the VMPTRST instruction */ +static int handle_vmptrst(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qual = vmx_get_exit_qual(vcpu); + u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); + gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; + struct x86_exception e; + gva_t gva; + int r; + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr))) + return 1; + + if (get_vmx_mem_address(vcpu, exit_qual, instr_info, + true, sizeof(gpa_t), &gva)) + return 1; + /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ + r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, + sizeof(gpa_t), &e); + if (r != X86EMUL_CONTINUE) + return kvm_handle_memory_failure(vcpu, r, &e); + + return nested_vmx_succeed(vcpu); +} + +/* Emulate the INVEPT instruction */ +static int handle_invept(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 vmx_instruction_info, types; + unsigned long type, roots_to_free; + struct kvm_mmu *mmu; + gva_t gva; + struct x86_exception e; + struct { + u64 eptp, gpa; + } operand; + int i, r, gpr_index; + + if (!(vmx->nested.msrs.secondary_ctls_high & + SECONDARY_EXEC_ENABLE_EPT) || + !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); + type = kvm_register_read(vcpu, gpr_index); + + types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; + + if (type >= 32 || !(types & (1 << type))) + return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + + /* According to the Intel VMX instruction reference, the memory + * operand is read even if it isn't needed (e.g., for type==global) + */ + if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), + vmx_instruction_info, false, sizeof(operand), &gva)) + return 1; + r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); + if (r != X86EMUL_CONTINUE) + return kvm_handle_memory_failure(vcpu, r, &e); + + /* + * Nested EPT roots are always held through guest_mmu, + * not root_mmu. + */ + mmu = &vcpu->arch.guest_mmu; + + switch (type) { + case VMX_EPT_EXTENT_CONTEXT: + if (!nested_vmx_check_eptp(vcpu, operand.eptp)) + return nested_vmx_fail(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + + roots_to_free = 0; + if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd, + operand.eptp)) + roots_to_free |= KVM_MMU_ROOT_CURRENT; + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + if (nested_ept_root_matches(mmu->prev_roots[i].hpa, + mmu->prev_roots[i].pgd, + operand.eptp)) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + } + break; + case VMX_EPT_EXTENT_GLOBAL: + roots_to_free = KVM_MMU_ROOTS_ALL; + break; + default: + BUG(); + break; + } + + if (roots_to_free) + kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); + + return nested_vmx_succeed(vcpu); +} + +static int handle_invvpid(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 vmx_instruction_info; + unsigned long type, types; + gva_t gva; + struct x86_exception e; + struct { + u64 vpid; + u64 gla; + } operand; + u16 vpid02; + int r, gpr_index; + + if (!(vmx->nested.msrs.secondary_ctls_high & + SECONDARY_EXEC_ENABLE_VPID) || + !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); + type = kvm_register_read(vcpu, gpr_index); + + types = (vmx->nested.msrs.vpid_caps & + VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; + + if (type >= 32 || !(types & (1 << type))) + return nested_vmx_fail(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + + /* according to the intel vmx instruction reference, the memory + * operand is read even if it isn't needed (e.g., for type==global) + */ + if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), + vmx_instruction_info, false, sizeof(operand), &gva)) + return 1; + r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); + if (r != X86EMUL_CONTINUE) + return kvm_handle_memory_failure(vcpu, r, &e); + + if (operand.vpid >> 16) + return nested_vmx_fail(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + + vpid02 = nested_get_vpid02(vcpu); + switch (type) { + case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: + if (!operand.vpid || + is_noncanonical_address(operand.gla, vcpu)) + return nested_vmx_fail(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + vpid_sync_vcpu_addr(vpid02, operand.gla); + break; + case VMX_VPID_EXTENT_SINGLE_CONTEXT: + case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: + if (!operand.vpid) + return nested_vmx_fail(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + vpid_sync_context(vpid02); + break; + case VMX_VPID_EXTENT_ALL_CONTEXT: + vpid_sync_context(vpid02); + break; + default: + WARN_ON_ONCE(1); + return kvm_skip_emulated_instruction(vcpu); + } + + /* + * Sync the shadow page tables if EPT is disabled, L1 is invalidating + * linear mappings for L2 (tagged with L2's VPID). Free all guest + * roots as VPIDs are not tracked in the MMU role. + * + * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share + * an MMU when EPT is disabled. + * + * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. + */ + if (!enable_ept) + kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu); + + return nested_vmx_succeed(vcpu); +} + +static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + u32 index = kvm_rcx_read(vcpu); + u64 new_eptp; + + if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) + return 1; + if (index >= VMFUNC_EPTP_ENTRIES) + return 1; + + if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, + &new_eptp, index * 8, 8)) + return 1; + + /* + * If the (L2) guest does a vmfunc to the currently + * active ept pointer, we don't have to do anything else + */ + if (vmcs12->ept_pointer != new_eptp) { + if (!nested_vmx_check_eptp(vcpu, new_eptp)) + return 1; + + vmcs12->ept_pointer = new_eptp; + nested_ept_new_eptp(vcpu); + + if (!nested_cpu_has_vpid(vmcs12)) + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + } + + return 0; +} + +static int handle_vmfunc(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12; + u32 function = kvm_rax_read(vcpu); + + /* + * VMFUNC is only supported for nested guests, but we always enable the + * secondary control for simplicity; for non-nested mode, fake that we + * didn't by injecting #UD. + */ + if (!is_guest_mode(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + vmcs12 = get_vmcs12(vcpu); + + /* + * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC + * is enabled in vmcs02 if and only if it's enabled in vmcs12. + */ + if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + if (!(vmcs12->vm_function_control & BIT_ULL(function))) + goto fail; + + switch (function) { + case 0: + if (nested_vmx_eptp_switching(vcpu, vmcs12)) + goto fail; + break; + default: + goto fail; + } + return kvm_skip_emulated_instruction(vcpu); + +fail: + /* + * This is effectively a reflected VM-Exit, as opposed to a synthesized + * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode + * EXIT_REASON_VMFUNC as the exit reason. + */ + nested_vmx_vmexit(vcpu, vmx->exit_reason.full, + vmx_get_intr_info(vcpu), + vmx_get_exit_qual(vcpu)); + return 1; +} + +/* + * Return true if an IO instruction with the specified port and size should cause + * a VM-exit into L1. + */ +bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, + int size) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + gpa_t bitmap, last_bitmap; + u8 b; + + last_bitmap = INVALID_GPA; + b = -1; + + while (size > 0) { + if (port < 0x8000) + bitmap = vmcs12->io_bitmap_a; + else if (port < 0x10000) + bitmap = vmcs12->io_bitmap_b; + else + return true; + bitmap += (port & 0x7fff) / 8; + + if (last_bitmap != bitmap) + if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) + return true; + if (b & (1 << (port & 7))) + return true; + + port++; + size--; + last_bitmap = bitmap; + } + + return false; +} + +static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + unsigned long exit_qualification; + unsigned short port; + int size; + + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); + + exit_qualification = vmx_get_exit_qual(vcpu); + + port = exit_qualification >> 16; + size = (exit_qualification & 7) + 1; + + return nested_vmx_check_io_bitmaps(vcpu, port, size); +} + +/* + * Return 1 if we should exit from L2 to L1 to handle an MSR access, + * rather than handle it ourselves in L0. I.e., check whether L1 expressed + * disinterest in the current event (read or write a specific MSR) by using an + * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. + */ +static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, + union vmx_exit_reason exit_reason) +{ + u32 msr_index = kvm_rcx_read(vcpu); + gpa_t bitmap; + + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) + return true; + + /* + * The MSR_BITMAP page is divided into four 1024-byte bitmaps, + * for the four combinations of read/write and low/high MSR numbers. + * First we need to figure out which of the four to use: + */ + bitmap = vmcs12->msr_bitmap; + if (exit_reason.basic == EXIT_REASON_MSR_WRITE) + bitmap += 2048; + if (msr_index >= 0xc0000000) { + msr_index -= 0xc0000000; + bitmap += 1024; + } + + /* Then read the msr_index'th bit from this bitmap: */ + if (msr_index < 1024*8) { + unsigned char b; + if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) + return true; + return 1 & (b >> (msr_index & 7)); + } else + return true; /* let L1 handle the wrong parameter */ +} + +/* + * Return 1 if we should exit from L2 to L1 to handle a CR access exit, + * rather than handle it ourselves in L0. I.e., check if L1 wanted to + * intercept (via guest_host_mask etc.) the current event. + */ +static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); + int cr = exit_qualification & 15; + int reg; + unsigned long val; + + switch ((exit_qualification >> 4) & 3) { + case 0: /* mov to cr */ + reg = (exit_qualification >> 8) & 15; + val = kvm_register_read(vcpu, reg); + switch (cr) { + case 0: + if (vmcs12->cr0_guest_host_mask & + (val ^ vmcs12->cr0_read_shadow)) + return true; + break; + case 3: + if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) + return true; + break; + case 4: + if (vmcs12->cr4_guest_host_mask & + (vmcs12->cr4_read_shadow ^ val)) + return true; + break; + case 8: + if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) + return true; + break; + } + break; + case 2: /* clts */ + if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && + (vmcs12->cr0_read_shadow & X86_CR0_TS)) + return true; + break; + case 1: /* mov from cr */ + switch (cr) { + case 3: + if (vmcs12->cpu_based_vm_exec_control & + CPU_BASED_CR3_STORE_EXITING) + return true; + break; + case 8: + if (vmcs12->cpu_based_vm_exec_control & + CPU_BASED_CR8_STORE_EXITING) + return true; + break; + } + break; + case 3: /* lmsw */ + /* + * lmsw can change bits 1..3 of cr0, and only set bit 0 of + * cr0. Other attempted changes are ignored, with no exit. + */ + val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; + if (vmcs12->cr0_guest_host_mask & 0xe & + (val ^ vmcs12->cr0_read_shadow)) + return true; + if ((vmcs12->cr0_guest_host_mask & 0x1) && + !(vmcs12->cr0_read_shadow & 0x1) && + (val & 0x1)) + return true; + break; + } + return false; +} + +static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + u32 encls_leaf; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || + !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) + return false; + + encls_leaf = kvm_rax_read(vcpu); + if (encls_leaf > 62) + encls_leaf = 63; + return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); +} + +static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, gpa_t bitmap) +{ + u32 vmx_instruction_info; + unsigned long field; + u8 b; + + if (!nested_cpu_has_shadow_vmcs(vmcs12)) + return true; + + /* Decode instruction info and find the field to access */ + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + + /* Out-of-range fields always cause a VM exit from L2 to L1 */ + if (field >> 15) + return true; + + if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) + return true; + + return 1 & (b >> (field & 7)); +} + +static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) +{ + u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; + + if (nested_cpu_has_mtf(vmcs12)) + return true; + + /* + * An MTF VM-exit may be injected into the guest by setting the + * interruption-type to 7 (other event) and the vector field to 0. Such + * is the case regardless of the 'monitor trap flag' VM-execution + * control. + */ + return entry_intr_info == (INTR_INFO_VALID_MASK + | INTR_TYPE_OTHER_EVENT); +} + +/* + * Return true if L0 wants to handle an exit from L2 regardless of whether or not + * L1 wants the exit. Only call this when in is_guest_mode (L2). + */ +static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, + union vmx_exit_reason exit_reason) +{ + u32 intr_info; + + switch ((u16)exit_reason.basic) { + case EXIT_REASON_EXCEPTION_NMI: + intr_info = vmx_get_intr_info(vcpu); + if (is_nmi(intr_info)) + return true; + else if (is_page_fault(intr_info)) + return vcpu->arch.apf.host_apf_flags || + vmx_need_pf_intercept(vcpu); + else if (is_debug(intr_info) && + vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + return true; + else if (is_breakpoint(intr_info) && + vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + return true; + else if (is_alignment_check(intr_info) && + !vmx_guest_inject_ac(vcpu)) + return true; + return false; + case EXIT_REASON_EXTERNAL_INTERRUPT: + return true; + case EXIT_REASON_MCE_DURING_VMENTRY: + return true; + case EXIT_REASON_EPT_VIOLATION: + /* + * L0 always deals with the EPT violation. If nested EPT is + * used, and the nested mmu code discovers that the address is + * missing in the guest EPT table (EPT12), the EPT violation + * will be injected with nested_ept_inject_page_fault() + */ + return true; + case EXIT_REASON_EPT_MISCONFIG: + /* + * L2 never uses directly L1's EPT, but rather L0's own EPT + * table (shadow on EPT) or a merged EPT table that L0 built + * (EPT on EPT). So any problems with the structure of the + * table is L0's fault. + */ + return true; + case EXIT_REASON_PREEMPTION_TIMER: + return true; + case EXIT_REASON_PML_FULL: + /* + * PML is emulated for an L1 VMM and should never be enabled in + * vmcs02, always "handle" PML_FULL by exiting to userspace. + */ + return true; + case EXIT_REASON_VMFUNC: + /* VM functions are emulated through L2->L0 vmexits. */ + return true; + case EXIT_REASON_BUS_LOCK: + /* + * At present, bus lock VM exit is never exposed to L1. + * Handle L2's bus locks in L0 directly. + */ + return true; + default: + break; + } + return false; +} + +/* + * Return 1 if L1 wants to intercept an exit from L2. Only call this when in + * is_guest_mode (L2). + */ +static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, + union vmx_exit_reason exit_reason) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + u32 intr_info; + + switch ((u16)exit_reason.basic) { + case EXIT_REASON_EXCEPTION_NMI: + intr_info = vmx_get_intr_info(vcpu); + if (is_nmi(intr_info)) + return true; + else if (is_page_fault(intr_info)) + return true; + return vmcs12->exception_bitmap & + (1u << (intr_info & INTR_INFO_VECTOR_MASK)); + case EXIT_REASON_EXTERNAL_INTERRUPT: + return nested_exit_on_intr(vcpu); + case EXIT_REASON_TRIPLE_FAULT: + return true; + case EXIT_REASON_INTERRUPT_WINDOW: + return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); + case EXIT_REASON_NMI_WINDOW: + return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); + case EXIT_REASON_TASK_SWITCH: + return true; + case EXIT_REASON_CPUID: + return true; + case EXIT_REASON_HLT: + return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); + case EXIT_REASON_INVD: + return true; + case EXIT_REASON_INVLPG: + return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); + case EXIT_REASON_RDPMC: + return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); + case EXIT_REASON_RDRAND: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); + case EXIT_REASON_RDSEED: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); + case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: + return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); + case EXIT_REASON_VMREAD: + return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, + vmcs12->vmread_bitmap); + case EXIT_REASON_VMWRITE: + return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, + vmcs12->vmwrite_bitmap); + case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: + case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: + case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: + case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: + case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: + /* + * VMX instructions trap unconditionally. This allows L1 to + * emulate them for its L2 guest, i.e., allows 3-level nesting! + */ + return true; + case EXIT_REASON_CR_ACCESS: + return nested_vmx_exit_handled_cr(vcpu, vmcs12); + case EXIT_REASON_DR_ACCESS: + return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); + case EXIT_REASON_IO_INSTRUCTION: + return nested_vmx_exit_handled_io(vcpu, vmcs12); + case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); + case EXIT_REASON_MSR_READ: + case EXIT_REASON_MSR_WRITE: + return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); + case EXIT_REASON_INVALID_STATE: + return true; + case EXIT_REASON_MWAIT_INSTRUCTION: + return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); + case EXIT_REASON_MONITOR_TRAP_FLAG: + return nested_vmx_exit_handled_mtf(vmcs12); + case EXIT_REASON_MONITOR_INSTRUCTION: + return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); + case EXIT_REASON_PAUSE_INSTRUCTION: + return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || + nested_cpu_has2(vmcs12, + SECONDARY_EXEC_PAUSE_LOOP_EXITING); + case EXIT_REASON_MCE_DURING_VMENTRY: + return true; + case EXIT_REASON_TPR_BELOW_THRESHOLD: + return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); + case EXIT_REASON_APIC_ACCESS: + case EXIT_REASON_APIC_WRITE: + case EXIT_REASON_EOI_INDUCED: + /* + * The controls for "virtualize APIC accesses," "APIC- + * register virtualization," and "virtual-interrupt + * delivery" only come from vmcs12. + */ + return true; + case EXIT_REASON_INVPCID: + return + nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && + nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); + case EXIT_REASON_WBINVD: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); + case EXIT_REASON_XSETBV: + return true; + case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: + /* + * This should never happen, since it is not possible to + * set XSS to a non-zero value---neither in L1 nor in L2. + * If if it were, XSS would have to be checked against + * the XSS exit bitmap in vmcs12. + */ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); + case EXIT_REASON_UMWAIT: + case EXIT_REASON_TPAUSE: + return nested_cpu_has2(vmcs12, + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); + case EXIT_REASON_ENCLS: + return nested_vmx_exit_handled_encls(vcpu, vmcs12); + case EXIT_REASON_NOTIFY: + /* Notify VM exit is not exposed to L1 */ + return false; + default: + return true; + } +} + +/* + * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was + * reflected into L1. + */ +bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + union vmx_exit_reason exit_reason = vmx->exit_reason; + unsigned long exit_qual; + u32 exit_intr_info; + + WARN_ON_ONCE(vmx->nested.nested_run_pending); + + /* + * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM + * has already loaded L2's state. + */ + if (unlikely(vmx->fail)) { + trace_kvm_nested_vmenter_failed( + "hardware VM-instruction error: ", + vmcs_read32(VM_INSTRUCTION_ERROR)); + exit_intr_info = 0; + exit_qual = 0; + goto reflect_vmexit; + } + + trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); + + /* If L0 (KVM) wants the exit, it trumps L1's desires. */ + if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) + return false; + + /* If L1 doesn't want the exit, handle it in L0. */ + if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) + return false; + + /* + * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For + * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would + * need to be synthesized by querying the in-kernel LAPIC, but external + * interrupts are never reflected to L1 so it's a non-issue. + */ + exit_intr_info = vmx_get_intr_info(vcpu); + if (is_exception_with_error_code(exit_intr_info)) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + vmcs12->vm_exit_intr_error_code = + vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + } + exit_qual = vmx_get_exit_qual(vcpu); + +reflect_vmexit: + nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); + return true; +} + +static int vmx_get_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + u32 user_data_size) +{ + struct vcpu_vmx *vmx; + struct vmcs12 *vmcs12; + struct kvm_nested_state kvm_state = { + .flags = 0, + .format = KVM_STATE_NESTED_FORMAT_VMX, + .size = sizeof(kvm_state), + .hdr.vmx.flags = 0, + .hdr.vmx.vmxon_pa = INVALID_GPA, + .hdr.vmx.vmcs12_pa = INVALID_GPA, + .hdr.vmx.preemption_timer_deadline = 0, + }; + struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = + &user_kvm_nested_state->data.vmx[0]; + + if (!vcpu) + return kvm_state.size + sizeof(*user_vmx_nested_state); + + vmx = to_vmx(vcpu); + vmcs12 = get_vmcs12(vcpu); + + if (nested_vmx_allowed(vcpu) && + (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { + kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; + kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; + + if (vmx_has_valid_vmcs12(vcpu)) { + kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); + + /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ + if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID) + kvm_state.flags |= KVM_STATE_NESTED_EVMCS; + + if (is_guest_mode(vcpu) && + nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != INVALID_GPA) + kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); + } + + if (vmx->nested.smm.vmxon) + kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; + + if (vmx->nested.smm.guest_mode) + kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; + + if (is_guest_mode(vcpu)) { + kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; + + if (vmx->nested.nested_run_pending) + kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; + + if (vmx->nested.mtf_pending) + kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; + + if (nested_cpu_has_preemption_timer(vmcs12) && + vmx->nested.has_preemption_timer_deadline) { + kvm_state.hdr.vmx.flags |= + KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; + kvm_state.hdr.vmx.preemption_timer_deadline = + vmx->nested.preemption_timer_deadline; + } + } + } + + if (user_data_size < kvm_state.size) + goto out; + + if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) + return -EFAULT; + + if (!vmx_has_valid_vmcs12(vcpu)) + goto out; + + /* + * When running L2, the authoritative vmcs12 state is in the + * vmcs02. When running L1, the authoritative vmcs12 state is + * in the shadow or enlightened vmcs linked to vmcs01, unless + * need_vmcs12_to_shadow_sync is set, in which case, the authoritative + * vmcs12 state is in the vmcs12 already. + */ + if (is_guest_mode(vcpu)) { + sync_vmcs02_to_vmcs12(vcpu, vmcs12); + sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); + } else { + copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); + if (!vmx->nested.need_vmcs12_to_shadow_sync) { + if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + /* + * L1 hypervisor is not obliged to keep eVMCS + * clean fields data always up-to-date while + * not in guest mode, 'hv_clean_fields' is only + * supposed to be actual upon vmentry so we need + * to ignore it here and do full copy. + */ + copy_enlightened_to_vmcs12(vmx, 0); + else if (enable_shadow_vmcs) + copy_shadow_to_vmcs12(vmx); + } + } + + BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); + BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); + + /* + * Copy over the full allocated size of vmcs12 rather than just the size + * of the struct. + */ + if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) + return -EFAULT; + + if (nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != INVALID_GPA) { + if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, + get_shadow_vmcs12(vcpu), VMCS12_SIZE)) + return -EFAULT; + } +out: + return kvm_state.size; +} + +void vmx_leave_nested(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu)) { + to_vmx(vcpu)->nested.nested_run_pending = 0; + nested_vmx_vmexit(vcpu, -1, 0, 0); + } + free_nested(vcpu); +} + +static int vmx_set_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + struct kvm_nested_state *kvm_state) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12; + enum vm_entry_failure_code ignored; + struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = + &user_kvm_nested_state->data.vmx[0]; + int ret; + + if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) + return -EINVAL; + + if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { + if (kvm_state->hdr.vmx.smm.flags) + return -EINVAL; + + if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) + return -EINVAL; + + /* + * KVM_STATE_NESTED_EVMCS used to signal that KVM should + * enable eVMCS capability on vCPU. However, since then + * code was changed such that flag signals vmcs12 should + * be copied into eVMCS in guest memory. + * + * To preserve backwards compatability, allow user + * to set this flag even when there is no VMXON region. + */ + if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) + return -EINVAL; + } else { + if (!nested_vmx_allowed(vcpu)) + return -EINVAL; + + if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) + return -EINVAL; + } + + if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && + (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) + return -EINVAL; + + if (kvm_state->hdr.vmx.smm.flags & + ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) + return -EINVAL; + + if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) + return -EINVAL; + + /* + * SMM temporarily disables VMX, so we cannot be in guest mode, + * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags + * must be zero. + */ + if (is_smm(vcpu) ? + (kvm_state->flags & + (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) + : kvm_state->hdr.vmx.smm.flags) + return -EINVAL; + + if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && + !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) + return -EINVAL; + + if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && + (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled)) + return -EINVAL; + + vmx_leave_nested(vcpu); + + if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) + return 0; + + vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; + ret = enter_vmx_operation(vcpu); + if (ret) + return ret; + + /* Empty 'VMXON' state is permitted if no VMCS loaded */ + if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { + /* See vmx_has_valid_vmcs12. */ + if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || + (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || + (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) + return -EINVAL; + else + return 0; + } + + if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { + if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || + !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) + return -EINVAL; + + set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); + } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { + /* + * nested_vmx_handle_enlightened_vmptrld() cannot be called + * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be + * restored yet. EVMCS will be mapped from + * nested_get_vmcs12_pages(). + */ + vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING; + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); + } else { + return -EINVAL; + } + + if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { + vmx->nested.smm.vmxon = true; + vmx->nested.vmxon = false; + + if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) + vmx->nested.smm.guest_mode = true; + } + + vmcs12 = get_vmcs12(vcpu); + if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12))) + return -EFAULT; + + if (vmcs12->hdr.revision_id != VMCS12_REVISION) + return -EINVAL; + + if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) + return 0; + + vmx->nested.nested_run_pending = + !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + + vmx->nested.mtf_pending = + !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); + + ret = -EINVAL; + if (nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != INVALID_GPA) { + struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); + + if (kvm_state->size < + sizeof(*kvm_state) + + sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) + goto error_guest_mode; + + if (copy_from_user(shadow_vmcs12, + user_vmx_nested_state->shadow_vmcs12, + sizeof(*shadow_vmcs12))) { + ret = -EFAULT; + goto error_guest_mode; + } + + if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || + !shadow_vmcs12->hdr.shadow_vmcs) + goto error_guest_mode; + } + + vmx->nested.has_preemption_timer_deadline = false; + if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { + vmx->nested.has_preemption_timer_deadline = true; + vmx->nested.preemption_timer_deadline = + kvm_state->hdr.vmx.preemption_timer_deadline; + } + + if (nested_vmx_check_controls(vcpu, vmcs12) || + nested_vmx_check_host_state(vcpu, vmcs12) || + nested_vmx_check_guest_state(vcpu, vmcs12, &ignored)) + goto error_guest_mode; + + vmx->nested.dirty_vmcs12 = true; + vmx->nested.force_msr_bitmap_recalc = true; + ret = nested_vmx_enter_non_root_mode(vcpu, false); + if (ret) + goto error_guest_mode; + + if (vmx->nested.mtf_pending) + kvm_make_request(KVM_REQ_EVENT, vcpu); + + return 0; + +error_guest_mode: + vmx->nested.nested_run_pending = 0; + return ret; +} + +void nested_vmx_set_vmcs_shadowing_bitmap(void) +{ + if (enable_shadow_vmcs) { + vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); + vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); + } +} + +/* + * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo + * that madness to get the encoding for comparison. + */ +#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10))) + +static u64 nested_vmx_calc_vmcs_enum_msr(void) +{ + /* + * Note these are the so called "index" of the VMCS field encoding, not + * the index into vmcs12. + */ + unsigned int max_idx, idx; + int i; + + /* + * For better or worse, KVM allows VMREAD/VMWRITE to all fields in + * vmcs12, regardless of whether or not the associated feature is + * exposed to L1. Simply find the field with the highest index. + */ + max_idx = 0; + for (i = 0; i < nr_vmcs12_fields; i++) { + /* The vmcs12 table is very, very sparsely populated. */ + if (!vmcs12_field_offsets[i]) + continue; + + idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); + if (idx > max_idx) + max_idx = idx; + } + + return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; +} + +/* + * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be + * returned for the various VMX controls MSRs when nested VMX is enabled. + * The same values should also be used to verify that vmcs12 control fields are + * valid during nested entry from L1 to L2. + * Each of these control msrs has a low and high 32-bit half: A low bit is on + * if the corresponding bit in the (32-bit) control field *must* be on, and a + * bit in the high half is on if the corresponding bit in the control field + * may be on. See also vmx_control_verify(). + */ +void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) +{ + struct nested_vmx_msrs *msrs = &vmcs_conf->nested; + + /* + * Note that as a general rule, the high half of the MSRs (bits in + * the control fields which may be 1) should be initialized by the + * intersection of the underlying hardware's MSR (i.e., features which + * can be supported) and the list of features we want to expose - + * because they are known to be properly supported in our code. + * Also, usually, the low half of the MSRs (bits which must be 1) can + * be set to 0, meaning that L1 may turn off any of these bits. The + * reason is that if one of these bits is necessary, it will appear + * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control + * fields of vmcs01 and vmcs02, will turn these bits off - and + * nested_vmx_l1_wants_exit() will not pass related exits to L1. + * These rules have exceptions below. + */ + + /* pin-based controls */ + msrs->pinbased_ctls_low = + PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + + msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl; + msrs->pinbased_ctls_high &= + PIN_BASED_EXT_INTR_MASK | + PIN_BASED_NMI_EXITING | + PIN_BASED_VIRTUAL_NMIS | + (enable_apicv ? PIN_BASED_POSTED_INTR : 0); + msrs->pinbased_ctls_high |= + PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | + PIN_BASED_VMX_PREEMPTION_TIMER; + + /* exit controls */ + msrs->exit_ctls_low = + VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; + + msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl; + msrs->exit_ctls_high &= +#ifdef CONFIG_X86_64 + VM_EXIT_HOST_ADDR_SPACE_SIZE | +#endif + VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | + VM_EXIT_CLEAR_BNDCFGS; + msrs->exit_ctls_high |= + VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | + VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT | + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + + /* We support free control of debug control saving. */ + msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; + + /* entry controls */ + msrs->entry_ctls_low = + VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; + + msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl; + msrs->entry_ctls_high &= +#ifdef CONFIG_X86_64 + VM_ENTRY_IA32E_MODE | +#endif + VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; + msrs->entry_ctls_high |= + (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER | + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); + + /* We support free control of debug control loading. */ + msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; + + /* cpu-based controls */ + msrs->procbased_ctls_low = + CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + + msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl; + msrs->procbased_ctls_high &= + CPU_BASED_INTR_WINDOW_EXITING | + CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | + CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | + CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING | +#ifdef CONFIG_X86_64 + CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | +#endif + CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | + CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | + CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | + CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | + CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + /* + * We can allow some features even when not supported by the + * hardware. For example, L1 can specify an MSR bitmap - and we + * can use it to avoid exits to L1 - even when L0 runs L2 + * without MSR bitmaps. + */ + msrs->procbased_ctls_high |= + CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | + CPU_BASED_USE_MSR_BITMAPS; + + /* We support free control of CR3 access interception. */ + msrs->procbased_ctls_low &= + ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); + + /* + * secondary cpu-based controls. Do not include those that + * depend on CPUID bits, they are added later by + * vmx_vcpu_after_set_cpuid. + */ + msrs->secondary_ctls_low = 0; + + msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl; + msrs->secondary_ctls_high &= + SECONDARY_EXEC_DESC | + SECONDARY_EXEC_ENABLE_RDTSCP | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_WBINVD_EXITING | + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | + SECONDARY_EXEC_RDRAND_EXITING | + SECONDARY_EXEC_ENABLE_INVPCID | + SECONDARY_EXEC_RDSEED_EXITING | + SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_TSC_SCALING | + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; + + /* + * We can emulate "VMCS shadowing," even if the hardware + * doesn't support it. + */ + msrs->secondary_ctls_high |= + SECONDARY_EXEC_SHADOW_VMCS; + + if (enable_ept) { + /* nested EPT: emulate EPT also to L1 */ + msrs->secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_EPT; + msrs->ept_caps = + VMX_EPT_PAGE_WALK_4_BIT | + VMX_EPT_PAGE_WALK_5_BIT | + VMX_EPTP_WB_BIT | + VMX_EPT_INVEPT_BIT | + VMX_EPT_EXECUTE_ONLY_BIT; + + msrs->ept_caps &= ept_caps; + msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | + VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | + VMX_EPT_1GB_PAGE_BIT; + if (enable_ept_ad_bits) { + msrs->secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_PML; + msrs->ept_caps |= VMX_EPT_AD_BIT; + } + } + + if (cpu_has_vmx_vmfunc()) { + msrs->secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_VMFUNC; + /* + * Advertise EPTP switching unconditionally + * since we emulate it + */ + if (enable_ept) + msrs->vmfunc_controls = + VMX_VMFUNC_EPTP_SWITCHING; + } + + /* + * Old versions of KVM use the single-context version without + * checking for support, so declare that it is supported even + * though it is treated as global context. The alternative is + * not failing the single-context invvpid, and it is worse. + */ + if (enable_vpid) { + msrs->secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_VPID; + msrs->vpid_caps = VMX_VPID_INVVPID_BIT | + VMX_VPID_EXTENT_SUPPORTED_MASK; + } + + if (enable_unrestricted_guest) + msrs->secondary_ctls_high |= + SECONDARY_EXEC_UNRESTRICTED_GUEST; + + if (flexpriority_enabled) + msrs->secondary_ctls_high |= + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + + if (enable_sgx) + msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; + + /* miscellaneous data */ + msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA; + msrs->misc_low |= + MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | + VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | + VMX_MISC_ACTIVITY_HLT | + VMX_MISC_ACTIVITY_WAIT_SIPI; + msrs->misc_high = 0; + + /* + * This MSR reports some information about VMX support. We + * should return information about the VMX we emulate for the + * guest, and the VMCS structure we give it - not about the + * VMX support of the underlying hardware. + */ + msrs->basic = + VMCS12_REVISION | + VMX_BASIC_TRUE_CTLS | + ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | + (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); + + if (cpu_has_vmx_basic_inout()) + msrs->basic |= VMX_BASIC_INOUT; + + /* + * These MSRs specify bits which the guest must keep fixed on + * while L1 is in VMXON mode (in L1's root mode, or running an L2). + * We picked the standard core2 setting. + */ +#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) +#define VMXON_CR4_ALWAYSON X86_CR4_VMXE + msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; + msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; + + /* These MSRs specify bits which the guest must keep fixed off. */ + rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); + rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); + + if (vmx_umip_emulated()) + msrs->cr4_fixed1 |= X86_CR4_UMIP; + + msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); +} + +void nested_vmx_hardware_unsetup(void) +{ + int i; + + if (enable_shadow_vmcs) { + for (i = 0; i < VMX_BITMAP_NR; i++) + free_page((unsigned long)vmx_bitmap[i]); + } +} + +__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) +{ + int i; + + if (!cpu_has_vmx_shadow_vmcs()) + enable_shadow_vmcs = 0; + if (enable_shadow_vmcs) { + for (i = 0; i < VMX_BITMAP_NR; i++) { + /* + * The vmx_bitmap is not tied to a VM and so should + * not be charged to a memcg. + */ + vmx_bitmap[i] = (unsigned long *) + __get_free_page(GFP_KERNEL); + if (!vmx_bitmap[i]) { + nested_vmx_hardware_unsetup(); + return -ENOMEM; + } + } + + init_vmcs_shadow_fields(); + } + + exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; + exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; + exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; + exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; + exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; + exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; + exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; + exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff; + exit_handlers[EXIT_REASON_VMON] = handle_vmxon; + exit_handlers[EXIT_REASON_INVEPT] = handle_invept; + exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; + exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; + + return 0; +} + +struct kvm_x86_nested_ops vmx_nested_ops = { + .leave_nested = vmx_leave_nested, + .is_exception_vmexit = nested_vmx_is_exception_vmexit, + .check_events = vmx_check_nested_events, + .has_events = vmx_has_nested_events, + .triple_fault = nested_vmx_triple_fault, + .get_state = vmx_get_nested_state, + .set_state = vmx_set_nested_state, + .get_nested_state_pages = vmx_get_nested_state_pages, + .write_log_dirty = nested_vmx_write_pml_buffer, + .enable_evmcs = nested_enable_evmcs, + .get_evmcs_version = nested_get_evmcs_version, +}; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h new file mode 100644 index 000000000..6312c9541 --- /dev/null +++ b/arch/x86/kvm/vmx/nested.h @@ -0,0 +1,292 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_NESTED_H +#define __KVM_X86_VMX_NESTED_H + +#include "kvm_cache_regs.h" +#include "vmcs12.h" +#include "vmx.h" + +/* + * Status returned by nested_vmx_enter_non_root_mode(): + */ +enum nvmx_vmentry_status { + NVMX_VMENTRY_SUCCESS, /* Entered VMX non-root mode */ + NVMX_VMENTRY_VMFAIL, /* Consistency check VMFail */ + NVMX_VMENTRY_VMEXIT, /* Consistency check VMExit */ + NVMX_VMENTRY_KVM_INTERNAL_ERROR,/* KVM internal error */ +}; + +void vmx_leave_nested(struct kvm_vcpu *vcpu); +void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps); +void nested_vmx_hardware_unsetup(void); +__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)); +void nested_vmx_set_vmcs_shadowing_bitmap(void); +void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu); +enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, + bool from_vmentry); +bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu); +void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, + u32 exit_intr_info, unsigned long exit_qualification); +void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu); +int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); +int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); +int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, + u32 vmx_instruction_info, bool wr, int len, gva_t *ret); +void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu); +bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, + int size); + +static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.cached_vmcs12; +} + +static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.cached_shadow_vmcs12; +} + +/* + * Note: the same condition is checked against the state provided by userspace + * in vmx_set_nested_state; if it is satisfied, the nested state must include + * the VMCS12. + */ +static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ + return vmx->nested.current_vmptr != -1ull || + vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID; +} + +static inline u16 nested_get_vpid02(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid; +} + +static inline unsigned long nested_ept_get_eptp(struct kvm_vcpu *vcpu) +{ + /* return the page table to be shadowed - in our case, EPT12 */ + return get_vmcs12(vcpu)->ept_pointer; +} + +static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) +{ + return nested_ept_get_eptp(vcpu) & VMX_EPTP_AD_ENABLE_BIT; +} + +/* + * Return the cr0 value that a nested guest would read. This is a combination + * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by + * its hypervisor (cr0_read_shadow). + */ +static inline unsigned long nested_read_cr0(struct vmcs12 *fields) +{ + return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) | + (fields->cr0_read_shadow & fields->cr0_guest_host_mask); +} +static inline unsigned long nested_read_cr4(struct vmcs12 *fields) +{ + return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) | + (fields->cr4_read_shadow & fields->cr4_guest_host_mask); +} + +static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu) +{ + return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low); +} + +/* + * Do the virtual VMX capability MSRs specify that L1 can use VMWRITE + * to modify any valid field of the VMCS, or are the VM-exit + * information fields read-only? + */ +static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.msrs.misc_low & + MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS; +} + +static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.msrs.misc_low & VMX_MISC_ZERO_LEN_INS; +} + +static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.msrs.procbased_ctls_high & + CPU_BASED_MONITOR_TRAP_FLAG; +} + +static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.msrs.secondary_ctls_high & + SECONDARY_EXEC_SHADOW_VMCS; +} + +static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) +{ + return vmcs12->cpu_based_vm_exec_control & bit; +} + +static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) +{ + return (vmcs12->cpu_based_vm_exec_control & + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && + (vmcs12->secondary_vm_exec_control & bit); +} + +static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & + PIN_BASED_VMX_PREEMPTION_TIMER; +} + +static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING; +} + +static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; +} + +static inline int nested_cpu_has_mtf(struct vmcs12 *vmcs12) +{ + return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG); +} + +static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); +} + +static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); +} + +static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML); +} + +static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); +} + +static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID); +} + +static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); +} + +static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); +} + +static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; +} + +static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC); +} + +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12) +{ + return nested_cpu_has_vmfunc(vmcs12) && + (vmcs12->vm_function_control & + VMX_VMFUNC_EPTP_SWITCHING); +} + +static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS); +} + +static inline bool nested_cpu_has_save_preemption_timer(struct vmcs12 *vmcs12) +{ + return vmcs12->vm_exit_controls & + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; +} + +static inline bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) +{ + return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu)); +} + +/* + * In nested virtualization, check if L1 asked to exit on external interrupts. + * For most existing hypervisors, this will always return true. + */ +static inline bool nested_exit_on_intr(struct kvm_vcpu *vcpu) +{ + return get_vmcs12(vcpu)->pin_based_vm_exec_control & + PIN_BASED_EXT_INTR_MASK; +} + +static inline bool nested_cpu_has_encls_exit(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING); +} + +/* + * if fixed0[i] == 1: val[i] must be 1 + * if fixed1[i] == 0: val[i] must be 0 + */ +static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1) +{ + return ((val & fixed1) | fixed0) == val; +} + +static inline bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high & + SECONDARY_EXEC_UNRESTRICTED_GUEST && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) + fixed0 &= ~(X86_CR0_PE | X86_CR0_PG); + + return fixed_bits_valid(val, fixed0, fixed1); +} + +static inline bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; + + return fixed_bits_valid(val, fixed0, fixed1); +} + +static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1; + + return fixed_bits_valid(val, fixed0, fixed1) && + __kvm_is_valid_cr4(vcpu, val); +} + +/* No difference in the restrictions on guest and host CR4 in VMX operation. */ +#define nested_guest_cr4_valid nested_cr4_valid +#define nested_host_cr4_valid nested_cr4_valid + +extern struct kvm_x86_nested_ops vmx_nested_ops; + +#endif /* __KVM_X86_VMX_NESTED_H */ diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c new file mode 100644 index 000000000..9a75a0d5d --- /dev/null +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -0,0 +1,814 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM PMU support for Intel CPUs + * + * Copyright 2011 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Avi Kivity <avi@redhat.com> + * Gleb Natapov <gleb@redhat.com> + */ +#include <linux/types.h> +#include <linux/kvm_host.h> +#include <linux/perf_event.h> +#include <asm/perf_event.h> +#include "x86.h" +#include "cpuid.h" +#include "lapic.h" +#include "nested.h" +#include "pmu.h" + +#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) + +static struct kvm_event_hw_type_mapping intel_arch_events[] = { + [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, + [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, + [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, + [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES }, + [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, + [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, + [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, + /* The above index must match CPUID 0x0A.EBX bit vector */ + [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES }, +}; + +/* mapping between fixed pmc index and intel_arch_events array */ +static int fixed_pmc_events[] = {1, 0, 7}; + +static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) +{ + struct kvm_pmc *pmc; + u8 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl; + int i; + + pmu->fixed_ctr_ctrl = data; + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + u8 new_ctrl = fixed_ctrl_field(data, i); + u8 old_ctrl = fixed_ctrl_field(old_fixed_ctr_ctrl, i); + + if (old_ctrl == new_ctrl) + continue; + + pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); + + __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); + reprogram_counter(pmc); + } +} + +static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) +{ + if (pmc_idx < INTEL_PMC_IDX_FIXED) { + return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, + MSR_P6_EVNTSEL0); + } else { + u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; + + return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); + } +} + +static void reprogram_counters(struct kvm_pmu *pmu, u64 diff) +{ + int bit; + struct kvm_pmc *pmc; + + for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) { + pmc = intel_pmc_idx_to_pmc(pmu, bit); + if (pmc) + reprogram_counter(pmc); + } +} + +static bool intel_hw_event_available(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; + u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; + int i; + + for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) { + if (intel_arch_events[i].eventsel != event_select || + intel_arch_events[i].unit_mask != unit_mask) + continue; + + /* disable event that reported as not present by cpuid */ + if ((i < 7) && !(pmu->available_event_types & (1 << i))) + return false; + + break; + } + + return true; +} + +/* check if a PMC is enabled by comparing it with globl_ctrl bits. */ +static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + if (!intel_pmu_has_perf_global_ctrl(pmu)) + return true; + + return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); +} + +static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + bool fixed = idx & (1u << 30); + + idx &= ~(3u << 30); + + return fixed ? idx < pmu->nr_arch_fixed_counters + : idx < pmu->nr_arch_gp_counters; +} + +static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, + unsigned int idx, u64 *mask) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + bool fixed = idx & (1u << 30); + struct kvm_pmc *counters; + unsigned int num_counters; + + idx &= ~(3u << 30); + if (fixed) { + counters = pmu->fixed_counters; + num_counters = pmu->nr_arch_fixed_counters; + } else { + counters = pmu->gp_counters; + num_counters = pmu->nr_arch_gp_counters; + } + if (idx >= num_counters) + return NULL; + *mask &= pmu->counter_bitmask[fixed ? KVM_PMC_FIXED : KVM_PMC_GP]; + return &counters[array_index_nospec(idx, num_counters)]; +} + +static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu) +{ + if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + return 0; + + return vcpu->arch.perf_capabilities; +} + +static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu) +{ + return (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_FW_WRITES) != 0; +} + +static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) +{ + if (!fw_writes_is_enabled(pmu_to_vcpu(pmu))) + return NULL; + + return get_gp_pmc(pmu, msr, MSR_IA32_PMC0); +} + +static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index) +{ + struct x86_pmu_lbr *records = vcpu_to_lbr_records(vcpu); + bool ret = false; + + if (!intel_pmu_lbr_is_enabled(vcpu)) + return ret; + + ret = (index == MSR_LBR_SELECT) || (index == MSR_LBR_TOS) || + (index >= records->from && index < records->from + records->nr) || + (index >= records->to && index < records->to + records->nr); + + if (!ret && records->info) + ret = (index >= records->info && index < records->info + records->nr); + + return ret; +} + +static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u64 perf_capabilities; + int ret; + + switch (msr) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_CORE_PERF_GLOBAL_CTRL: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + return intel_pmu_has_perf_global_ctrl(pmu); + break; + case MSR_IA32_PEBS_ENABLE: + ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT; + break; + case MSR_IA32_DS_AREA: + ret = guest_cpuid_has(vcpu, X86_FEATURE_DS); + break; + case MSR_PEBS_DATA_CFG: + perf_capabilities = vcpu_get_perf_capabilities(vcpu); + ret = (perf_capabilities & PERF_CAP_PEBS_BASELINE) && + ((perf_capabilities & PERF_CAP_PEBS_FORMAT) > 3); + break; + default: + ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || + get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || + get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr) || + intel_pmu_is_valid_lbr_msr(vcpu, msr); + break; + } + + return ret; +} + +static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + + pmc = get_fixed_pmc(pmu, msr); + pmc = pmc ? pmc : get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0); + pmc = pmc ? pmc : get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0); + + return pmc; +} + +static inline void intel_pmu_release_guest_lbr_event(struct kvm_vcpu *vcpu) +{ + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + + if (lbr_desc->event) { + perf_event_release_kernel(lbr_desc->event); + lbr_desc->event = NULL; + vcpu_to_pmu(vcpu)->event_count--; + } +} + +int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu) +{ + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct perf_event *event; + + /* + * The perf_event_attr is constructed in the minimum efficient way: + * - set 'pinned = true' to make it task pinned so that if another + * cpu pinned event reclaims LBR, the event->oncpu will be set to -1; + * - set '.exclude_host = true' to record guest branches behavior; + * + * - set '.config = INTEL_FIXED_VLBR_EVENT' to indicates host perf + * schedule the event without a real HW counter but a fake one; + * check is_guest_lbr_event() and __intel_get_event_constraints(); + * + * - set 'sample_type = PERF_SAMPLE_BRANCH_STACK' and + * 'branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK | + * PERF_SAMPLE_BRANCH_USER' to configure it as a LBR callstack + * event, which helps KVM to save/restore guest LBR records + * during host context switches and reduces quite a lot overhead, + * check branch_user_callstack() and intel_pmu_lbr_sched_task(); + */ + struct perf_event_attr attr = { + .type = PERF_TYPE_RAW, + .size = sizeof(attr), + .config = INTEL_FIXED_VLBR_EVENT, + .sample_type = PERF_SAMPLE_BRANCH_STACK, + .pinned = true, + .exclude_host = true, + .branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK | + PERF_SAMPLE_BRANCH_USER, + }; + + if (unlikely(lbr_desc->event)) { + __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use); + return 0; + } + + event = perf_event_create_kernel_counter(&attr, -1, + current, NULL, NULL); + if (IS_ERR(event)) { + pr_debug_ratelimited("%s: failed %ld\n", + __func__, PTR_ERR(event)); + return PTR_ERR(event); + } + lbr_desc->event = event; + pmu->event_count++; + __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use); + return 0; +} + +/* + * It's safe to access LBR msrs from guest when they have not + * been passthrough since the host would help restore or reset + * the LBR msrs records when the guest LBR event is scheduled in. + */ +static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu, + struct msr_data *msr_info, bool read) +{ + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + u32 index = msr_info->index; + + if (!intel_pmu_is_valid_lbr_msr(vcpu, index)) + return false; + + if (!lbr_desc->event && intel_pmu_create_guest_lbr_event(vcpu) < 0) + goto dummy; + + /* + * Disable irq to ensure the LBR feature doesn't get reclaimed by the + * host at the time the value is read from the msr, and this avoids the + * host LBR value to be leaked to the guest. If LBR has been reclaimed, + * return 0 on guest reads. + */ + local_irq_disable(); + if (lbr_desc->event->state == PERF_EVENT_STATE_ACTIVE) { + if (read) + rdmsrl(index, msr_info->data); + else + wrmsrl(index, msr_info->data); + __set_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use); + local_irq_enable(); + return true; + } + clear_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use); + local_irq_enable(); + +dummy: + if (read) + msr_info->data = 0; + return true; +} + +static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + u32 msr = msr_info->index; + + switch (msr) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + msr_info->data = pmu->fixed_ctr_ctrl; + return 0; + case MSR_CORE_PERF_GLOBAL_STATUS: + msr_info->data = pmu->global_status; + return 0; + case MSR_CORE_PERF_GLOBAL_CTRL: + msr_info->data = pmu->global_ctrl; + return 0; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + msr_info->data = 0; + return 0; + case MSR_IA32_PEBS_ENABLE: + msr_info->data = pmu->pebs_enable; + return 0; + case MSR_IA32_DS_AREA: + msr_info->data = pmu->ds_area; + return 0; + case MSR_PEBS_DATA_CFG: + msr_info->data = pmu->pebs_data_cfg; + return 0; + default: + if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || + (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { + u64 val = pmc_read_counter(pmc); + msr_info->data = + val & pmu->counter_bitmask[KVM_PMC_GP]; + return 0; + } else if ((pmc = get_fixed_pmc(pmu, msr))) { + u64 val = pmc_read_counter(pmc); + msr_info->data = + val & pmu->counter_bitmask[KVM_PMC_FIXED]; + return 0; + } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { + msr_info->data = pmc->eventsel; + return 0; + } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true)) + return 0; + } + + return 1; +} + +static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + u32 msr = msr_info->index; + u64 data = msr_info->data; + u64 reserved_bits, diff; + + switch (msr) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + if (pmu->fixed_ctr_ctrl == data) + return 0; + if (!(data & pmu->fixed_ctr_ctrl_mask)) { + reprogram_fixed_counters(pmu, data); + return 0; + } + break; + case MSR_CORE_PERF_GLOBAL_STATUS: + if (msr_info->host_initiated) { + pmu->global_status = data; + return 0; + } + break; /* RO MSR */ + case MSR_CORE_PERF_GLOBAL_CTRL: + if (pmu->global_ctrl == data) + return 0; + if (kvm_valid_perf_global_ctrl(pmu, data)) { + diff = pmu->global_ctrl ^ data; + pmu->global_ctrl = data; + reprogram_counters(pmu, diff); + return 0; + } + break; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + if (!(data & pmu->global_ovf_ctrl_mask)) { + if (!msr_info->host_initiated) + pmu->global_status &= ~data; + return 0; + } + break; + case MSR_IA32_PEBS_ENABLE: + if (pmu->pebs_enable == data) + return 0; + if (!(data & pmu->pebs_enable_mask)) { + diff = pmu->pebs_enable ^ data; + pmu->pebs_enable = data; + reprogram_counters(pmu, diff); + return 0; + } + break; + case MSR_IA32_DS_AREA: + if (msr_info->host_initiated && data && !guest_cpuid_has(vcpu, X86_FEATURE_DS)) + return 1; + if (is_noncanonical_address(data, vcpu)) + return 1; + pmu->ds_area = data; + return 0; + case MSR_PEBS_DATA_CFG: + if (pmu->pebs_data_cfg == data) + return 0; + if (!(data & pmu->pebs_data_cfg_mask)) { + pmu->pebs_data_cfg = data; + return 0; + } + break; + default: + if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || + (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { + if ((msr & MSR_PMC_FULL_WIDTH_BIT) && + (data & ~pmu->counter_bitmask[KVM_PMC_GP])) + return 1; + if (!msr_info->host_initiated && + !(msr & MSR_PMC_FULL_WIDTH_BIT)) + data = (s64)(s32)data; + pmc_write_counter(pmc, data); + pmc_update_sample_period(pmc); + return 0; + } else if ((pmc = get_fixed_pmc(pmu, msr))) { + pmc_write_counter(pmc, data); + pmc_update_sample_period(pmc); + return 0; + } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { + if (data == pmc->eventsel) + return 0; + reserved_bits = pmu->reserved_bits; + if ((pmc->idx == 2) && + (pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED)) + reserved_bits ^= HSW_IN_TX_CHECKPOINTED; + if (!(data & reserved_bits)) { + pmc->eventsel = data; + reprogram_counter(pmc); + return 0; + } + } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) + return 0; + } + + return 1; +} + +static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu) +{ + size_t size = ARRAY_SIZE(fixed_pmc_events); + struct kvm_pmc *pmc; + u32 event; + int i; + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmc = &pmu->fixed_counters[i]; + event = fixed_pmc_events[array_index_nospec(i, size)]; + pmc->eventsel = (intel_arch_events[event].unit_mask << 8) | + intel_arch_events[event].eventsel; + } +} + +static void intel_pmu_refresh(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + struct kvm_cpuid_entry2 *entry; + union cpuid10_eax eax; + union cpuid10_edx edx; + u64 perf_capabilities; + u64 counter_mask; + int i; + + pmu->nr_arch_gp_counters = 0; + pmu->nr_arch_fixed_counters = 0; + pmu->counter_bitmask[KVM_PMC_GP] = 0; + pmu->counter_bitmask[KVM_PMC_FIXED] = 0; + pmu->version = 0; + pmu->reserved_bits = 0xffffffff00200000ull; + pmu->raw_event_mask = X86_RAW_EVENT_MASK; + pmu->global_ctrl_mask = ~0ull; + pmu->global_ovf_ctrl_mask = ~0ull; + pmu->fixed_ctr_ctrl_mask = ~0ull; + pmu->pebs_enable_mask = ~0ull; + pmu->pebs_data_cfg_mask = ~0ull; + + entry = kvm_find_cpuid_entry(vcpu, 0xa); + if (!entry || !vcpu->kvm->arch.enable_pmu) + return; + eax.full = entry->eax; + edx.full = entry->edx; + + pmu->version = eax.split.version_id; + if (!pmu->version) + return; + + pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, + kvm_pmu_cap.num_counters_gp); + eax.split.bit_width = min_t(int, eax.split.bit_width, + kvm_pmu_cap.bit_width_gp); + pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; + eax.split.mask_length = min_t(int, eax.split.mask_length, + kvm_pmu_cap.events_mask_len); + pmu->available_event_types = ~entry->ebx & + ((1ull << eax.split.mask_length) - 1); + + if (pmu->version == 1) { + pmu->nr_arch_fixed_counters = 0; + } else { + pmu->nr_arch_fixed_counters = + min3(ARRAY_SIZE(fixed_pmc_events), + (size_t) edx.split.num_counters_fixed, + (size_t)kvm_pmu_cap.num_counters_fixed); + edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed, + kvm_pmu_cap.bit_width_fixed); + pmu->counter_bitmask[KVM_PMC_FIXED] = + ((u64)1 << edx.split.bit_width_fixed) - 1; + setup_fixed_pmc_eventsel(pmu); + } + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) + pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4)); + counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | + (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED)); + pmu->global_ctrl_mask = counter_mask; + pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask + & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | + MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); + if (vmx_pt_mode_is_host_guest()) + pmu->global_ovf_ctrl_mask &= + ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; + + entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); + if (entry && + (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && + (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) { + pmu->reserved_bits ^= HSW_IN_TX; + pmu->raw_event_mask |= (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED); + } + + bitmap_set(pmu->all_valid_pmc_idx, + 0, pmu->nr_arch_gp_counters); + bitmap_set(pmu->all_valid_pmc_idx, + INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); + + perf_capabilities = vcpu_get_perf_capabilities(vcpu); + if (cpuid_model_is_consistent(vcpu) && + (perf_capabilities & PMU_CAP_LBR_FMT)) + x86_perf_get_lbr(&lbr_desc->records); + else + lbr_desc->records.nr = 0; + + if (lbr_desc->records.nr) + bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); + + if (perf_capabilities & PERF_CAP_PEBS_FORMAT) { + if (perf_capabilities & PERF_CAP_PEBS_BASELINE) { + pmu->pebs_enable_mask = counter_mask; + pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmu->fixed_ctr_ctrl_mask &= + ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4)); + } + pmu->pebs_data_cfg_mask = ~0xff00000full; + } else { + pmu->pebs_enable_mask = + ~((1ull << pmu->nr_arch_gp_counters) - 1); + } + } +} + +static void intel_pmu_init(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + + for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) { + pmu->gp_counters[i].type = KVM_PMC_GP; + pmu->gp_counters[i].vcpu = vcpu; + pmu->gp_counters[i].idx = i; + pmu->gp_counters[i].current_config = 0; + } + + for (i = 0; i < KVM_PMC_MAX_FIXED; i++) { + pmu->fixed_counters[i].type = KVM_PMC_FIXED; + pmu->fixed_counters[i].vcpu = vcpu; + pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; + pmu->fixed_counters[i].current_config = 0; + } + + vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap; + lbr_desc->records.nr = 0; + lbr_desc->event = NULL; + lbr_desc->msr_passthrough = false; +} + +static void intel_pmu_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc = NULL; + int i; + + for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) { + pmc = &pmu->gp_counters[i]; + + pmc_stop_counter(pmc); + pmc->counter = pmc->eventsel = 0; + } + + for (i = 0; i < KVM_PMC_MAX_FIXED; i++) { + pmc = &pmu->fixed_counters[i]; + + pmc_stop_counter(pmc); + pmc->counter = 0; + } + + pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; + + intel_pmu_release_guest_lbr_event(vcpu); +} + +/* + * Emulate LBR_On_PMI behavior for 1 < pmu.version < 4. + * + * If Freeze_LBR_On_PMI = 1, the LBR is frozen on PMI and + * the KVM emulates to clear the LBR bit (bit 0) in IA32_DEBUGCTL. + * + * Guest needs to re-enable LBR to resume branches recording. + */ +static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu) +{ + u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL); + + if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) { + data &= ~DEBUGCTLMSR_LBR; + vmcs_write64(GUEST_IA32_DEBUGCTL, data); + } +} + +static void intel_pmu_deliver_pmi(struct kvm_vcpu *vcpu) +{ + u8 version = vcpu_to_pmu(vcpu)->version; + + if (!intel_pmu_lbr_is_enabled(vcpu)) + return; + + if (version > 1 && version < 4) + intel_pmu_legacy_freezing_lbrs_on_pmi(vcpu); +} + +static void vmx_update_intercept_for_lbr_msrs(struct kvm_vcpu *vcpu, bool set) +{ + struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu); + int i; + + for (i = 0; i < lbr->nr; i++) { + vmx_set_intercept_for_msr(vcpu, lbr->from + i, MSR_TYPE_RW, set); + vmx_set_intercept_for_msr(vcpu, lbr->to + i, MSR_TYPE_RW, set); + if (lbr->info) + vmx_set_intercept_for_msr(vcpu, lbr->info + i, MSR_TYPE_RW, set); + } + + vmx_set_intercept_for_msr(vcpu, MSR_LBR_SELECT, MSR_TYPE_RW, set); + vmx_set_intercept_for_msr(vcpu, MSR_LBR_TOS, MSR_TYPE_RW, set); +} + +static inline void vmx_disable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu) +{ + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + + if (!lbr_desc->msr_passthrough) + return; + + vmx_update_intercept_for_lbr_msrs(vcpu, true); + lbr_desc->msr_passthrough = false; +} + +static inline void vmx_enable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu) +{ + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + + if (lbr_desc->msr_passthrough) + return; + + vmx_update_intercept_for_lbr_msrs(vcpu, false); + lbr_desc->msr_passthrough = true; +} + +/* + * Higher priority host perf events (e.g. cpu pinned) could reclaim the + * pmu resources (e.g. LBR) that were assigned to the guest. This is + * usually done via ipi calls (more details in perf_install_in_context). + * + * Before entering the non-root mode (with irq disabled here), double + * confirm that the pmu features enabled to the guest are not reclaimed + * by higher priority host events. Otherwise, disallow vcpu's access to + * the reclaimed features. + */ +void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + + if (!lbr_desc->event) { + vmx_disable_lbr_msrs_passthrough(vcpu); + if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR) + goto warn; + if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use)) + goto warn; + return; + } + + if (lbr_desc->event->state < PERF_EVENT_STATE_ACTIVE) { + vmx_disable_lbr_msrs_passthrough(vcpu); + __clear_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use); + goto warn; + } else + vmx_enable_lbr_msrs_passthrough(vcpu); + + return; + +warn: + pr_warn_ratelimited("kvm: vcpu-%d: fail to passthrough LBR.\n", + vcpu->vcpu_id); +} + +static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) +{ + if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)) + intel_pmu_release_guest_lbr_event(vcpu); +} + +void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) +{ + struct kvm_pmc *pmc = NULL; + int bit, hw_idx; + + for_each_set_bit(bit, (unsigned long *)&pmu->global_ctrl, + X86_PMC_IDX_MAX) { + pmc = intel_pmc_idx_to_pmc(pmu, bit); + + if (!pmc || !pmc_speculative_in_use(pmc) || + !intel_pmc_is_enabled(pmc) || !pmc->perf_event) + continue; + + /* + * A negative index indicates the event isn't mapped to a + * physical counter in the host, e.g. due to contention. + */ + hw_idx = pmc->perf_event->hw.idx; + if (hw_idx != pmc->idx && hw_idx > -1) + pmu->host_cross_mapped_mask |= BIT_ULL(hw_idx); + } +} + +struct kvm_pmu_ops intel_pmu_ops __initdata = { + .hw_event_available = intel_hw_event_available, + .pmc_is_enabled = intel_pmc_is_enabled, + .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, + .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, + .msr_idx_to_pmc = intel_msr_idx_to_pmc, + .is_valid_rdpmc_ecx = intel_is_valid_rdpmc_ecx, + .is_valid_msr = intel_is_valid_msr, + .get_msr = intel_pmu_get_msr, + .set_msr = intel_pmu_set_msr, + .refresh = intel_pmu_refresh, + .init = intel_pmu_init, + .reset = intel_pmu_reset, + .deliver_pmi = intel_pmu_deliver_pmi, + .cleanup = intel_pmu_cleanup, +}; diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c new file mode 100644 index 000000000..1b56c5e5c --- /dev/null +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -0,0 +1,351 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kvm_host.h> + +#include <asm/irq_remapping.h> +#include <asm/cpu.h> + +#include "lapic.h" +#include "irq.h" +#include "posted_intr.h" +#include "trace.h" +#include "vmx.h" + +/* + * Maintain a per-CPU list of vCPUs that need to be awakened by wakeup_handler() + * when a WAKEUP_VECTOR interrupted is posted. vCPUs are added to the list when + * the vCPU is scheduled out and is blocking (e.g. in HLT) with IRQs enabled. + * The vCPUs posted interrupt descriptor is updated at the same time to set its + * notification vector to WAKEUP_VECTOR, so that posted interrupt from devices + * wake the target vCPUs. vCPUs are removed from the list and the notification + * vector is reset when the vCPU is scheduled in. + */ +static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu); +/* + * Protect the per-CPU list with a per-CPU spinlock to handle task migration. + * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the + * ->sched_in() path will need to take the vCPU off the list of the _previous_ + * CPU. IRQs must be disabled when taking this lock, otherwise deadlock will + * occur if a wakeup IRQ arrives and attempts to acquire the lock. + */ +static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock); + +static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) +{ + return &(to_vmx(vcpu)->pi_desc); +} + +static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new) +{ + /* + * PID.ON can be set at any time by a different vCPU or by hardware, + * e.g. a device. PID.control must be written atomically, and the + * update must be retried with a fresh snapshot an ON change causes + * the cmpxchg to fail. + */ + if (!try_cmpxchg64(&pi_desc->control, pold, new)) + return -EBUSY; + + return 0; +} + +void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct pi_desc old, new; + unsigned long flags; + unsigned int dest; + + /* + * To simplify hot-plug and dynamic toggling of APICv, keep PI.NDST and + * PI.SN up-to-date even if there is no assigned device or if APICv is + * deactivated due to a dynamic inhibit bit, e.g. for Hyper-V's SyncIC. + */ + if (!enable_apicv || !lapic_in_kernel(vcpu)) + return; + + /* + * If the vCPU wasn't on the wakeup list and wasn't migrated, then the + * full update can be skipped as neither the vector nor the destination + * needs to be changed. + */ + if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) { + /* + * Clear SN if it was set due to being preempted. Again, do + * this even if there is no assigned device for simplicity. + */ + if (pi_test_and_clear_sn(pi_desc)) + goto after_clear_sn; + return; + } + + local_irq_save(flags); + + /* + * If the vCPU was waiting for wakeup, remove the vCPU from the wakeup + * list of the _previous_ pCPU, which will not be the same as the + * current pCPU if the task was migrated. + */ + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) { + raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + list_del(&vmx->pi_wakeup_list); + raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + } + + dest = cpu_physical_id(cpu); + if (!x2apic_mode) + dest = (dest << 8) & 0xFF00; + + old.control = READ_ONCE(pi_desc->control); + do { + new.control = old.control; + + /* + * Clear SN (as above) and refresh the destination APIC ID to + * handle task migration (@cpu != vcpu->cpu). + */ + new.ndst = dest; + new.sn = 0; + + /* + * Restore the notification vector; in the blocking case, the + * descriptor was modified on "put" to use the wakeup vector. + */ + new.nv = POSTED_INTR_VECTOR; + } while (pi_try_set_control(pi_desc, &old.control, new.control)); + + local_irq_restore(flags); + +after_clear_sn: + + /* + * Clear SN before reading the bitmap. The VT-d firmware + * writes the bitmap and reads SN atomically (5.2.3 in the + * spec), so it doesn't really have a memory barrier that + * pairs with this, but we cannot do that and we need one. + */ + smp_mb__after_atomic(); + + if (!pi_is_pir_empty(pi_desc)) + pi_set_on(pi_desc); +} + +static bool vmx_can_use_vtd_pi(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm) && enable_apicv && + kvm_arch_has_assigned_device(kvm) && + irq_remapping_cap(IRQ_POSTING_CAP); +} + +/* + * Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set + * WAKEUP as the notification vector in the PI descriptor. + */ +static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct pi_desc old, new; + unsigned long flags; + + local_irq_save(flags); + + raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + list_add_tail(&vmx->pi_wakeup_list, + &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); + raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + + WARN(pi_desc->sn, "PI descriptor SN field set before blocking"); + + old.control = READ_ONCE(pi_desc->control); + do { + /* set 'NV' to 'wakeup vector' */ + new.control = old.control; + new.nv = POSTED_INTR_WAKEUP_VECTOR; + } while (pi_try_set_control(pi_desc, &old.control, new.control)); + + /* + * Send a wakeup IPI to this CPU if an interrupt may have been posted + * before the notification vector was updated, in which case the IRQ + * will arrive on the non-wakeup vector. An IPI is needed as calling + * try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not + * enabled until it is safe to call try_to_wake_up() on the task being + * scheduled out). + */ + if (pi_test_on(&new)) + apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR); + + local_irq_restore(flags); +} + +static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) +{ + /* + * The default posted interrupt vector does nothing when + * invoked outside guest mode. Return whether a blocked vCPU + * can be the target of posted interrupts, as is the case when + * using either IPI virtualization or VT-d PI, so that the + * notification vector is switched to the one that calls + * back to the pi_wakeup_handler() function. + */ + return vmx_can_use_ipiv(vcpu) || vmx_can_use_vtd_pi(vcpu->kvm); +} + +void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!vmx_needs_pi_wakeup(vcpu)) + return; + + if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu)) + pi_enable_wakeup_handler(vcpu); + + /* + * Set SN when the vCPU is preempted. Note, the vCPU can both be seen + * as blocking and preempted, e.g. if it's preempted between setting + * its wait state and manually scheduling out. + */ + if (vcpu->preempted) + pi_set_sn(pi_desc); +} + +/* + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. + */ +void pi_wakeup_handler(void) +{ + int cpu = smp_processor_id(); + struct list_head *wakeup_list = &per_cpu(wakeup_vcpus_on_cpu, cpu); + raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, cpu); + struct vcpu_vmx *vmx; + + raw_spin_lock(spinlock); + list_for_each_entry(vmx, wakeup_list, pi_wakeup_list) { + + if (pi_test_on(&vmx->pi_desc)) + kvm_vcpu_wake_up(&vmx->vcpu); + } + raw_spin_unlock(spinlock); +} + +void __init pi_init_cpu(int cpu) +{ + INIT_LIST_HEAD(&per_cpu(wakeup_vcpus_on_cpu, cpu)); + raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); +} + +bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + return pi_test_on(pi_desc) || + (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc)); +} + + +/* + * Bail out of the block loop if the VM has an assigned + * device, but the blocking vCPU didn't reconfigure the + * PI.NV to the wakeup vector, i.e. the assigned device + * came along after the initial check in vmx_vcpu_pi_put(). + */ +void vmx_pi_start_assignment(struct kvm *kvm) +{ + if (!irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK); +} + +/* + * vmx_pi_update_irte - set IRTE for Posted-Interrupts + * + * @kvm: kvm + * @host_irq: host irq of the interrupt + * @guest_irq: gsi of the interrupt + * @set: set or unset PI + * returns 0 on success, < 0 on failure + */ +int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) +{ + struct kvm_kernel_irq_routing_entry *e; + struct kvm_irq_routing_table *irq_rt; + struct kvm_lapic_irq irq; + struct kvm_vcpu *vcpu; + struct vcpu_data vcpu_info; + int idx, ret = 0; + + if (!vmx_can_use_vtd_pi(kvm)) + return 0; + + idx = srcu_read_lock(&kvm->irq_srcu); + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + if (guest_irq >= irq_rt->nr_rt_entries || + hlist_empty(&irq_rt->map[guest_irq])) { + pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", + guest_irq, irq_rt->nr_rt_entries); + goto out; + } + + hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { + if (e->type != KVM_IRQ_ROUTING_MSI) + continue; + /* + * VT-d PI cannot support posting multicast/broadcast + * interrupts to a vCPU, we still use interrupt remapping + * for these kind of interrupts. + * + * For lowest-priority interrupts, we only support + * those with single CPU as the destination, e.g. user + * configures the interrupts via /proc/irq or uses + * irqbalance to make the interrupts single-CPU. + * + * We will support full lowest-priority interrupt later. + * + * In addition, we can only inject generic interrupts using + * the PI mechanism, refuse to route others through it. + */ + + kvm_set_msi_irq(kvm, e, &irq); + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || + !kvm_irq_is_postable(&irq)) { + /* + * Make sure the IRTE is in remapped mode if + * we don't handle it in posted mode. + */ + ret = irq_set_vcpu_affinity(host_irq, NULL); + if (ret < 0) { + printk(KERN_INFO + "failed to back to remapped mode, irq: %u\n", + host_irq); + goto out; + } + + continue; + } + + vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); + vcpu_info.vector = irq.vector; + + trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, + vcpu_info.vector, vcpu_info.pi_desc_addr, set); + + if (set) + ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); + else + ret = irq_set_vcpu_affinity(host_irq, NULL); + + if (ret < 0) { + printk(KERN_INFO "%s: failed to update PI IRTE\n", + __func__); + goto out; + } + } + + ret = 0; +out: + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h new file mode 100644 index 000000000..269920765 --- /dev/null +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_POSTED_INTR_H +#define __KVM_X86_VMX_POSTED_INTR_H + +#define POSTED_INTR_ON 0 +#define POSTED_INTR_SN 1 + +#define PID_TABLE_ENTRY_VALID 1 + +/* Posted-Interrupt Descriptor */ +struct pi_desc { + u32 pir[8]; /* Posted interrupt requested */ + union { + struct { + /* bit 256 - Outstanding Notification */ + u16 on : 1, + /* bit 257 - Suppress Notification */ + sn : 1, + /* bit 271:258 - Reserved */ + rsvd_1 : 14; + /* bit 279:272 - Notification Vector */ + u8 nv; + /* bit 287:280 - Reserved */ + u8 rsvd_2; + /* bit 319:288 - Notification Destination */ + u32 ndst; + }; + u64 control; + }; + u32 rsvd[6]; +} __aligned(64); + +static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) +{ + return test_and_set_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) +{ + return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); +} + +static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) +{ + return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); +} + +static inline void pi_set_sn(struct pi_desc *pi_desc) +{ + set_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_set_on(struct pi_desc *pi_desc) +{ + set_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_clear_on(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_on(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_sn(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); +void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); +void pi_wakeup_handler(void); +void __init pi_init_cpu(int cpu); +bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); +int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set); +void vmx_pi_start_assignment(struct kvm *kvm); + +#endif /* __KVM_X86_VMX_POSTED_INTR_H */ diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h new file mode 100644 index 000000000..edc3f16cc --- /dev/null +++ b/arch/x86/kvm/vmx/run_flags.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_RUN_FLAGS_H +#define __KVM_X86_VMX_RUN_FLAGS_H + +#define VMX_RUN_VMRESUME (1 << 0) +#define VMX_RUN_SAVE_SPEC_CTRL (1 << 1) + +#endif /* __KVM_X86_VMX_RUN_FLAGS_H */ diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c new file mode 100644 index 000000000..b12da2a6d --- /dev/null +++ b/arch/x86/kvm/vmx/sgx.c @@ -0,0 +1,498 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2021 Intel Corporation. */ + +#include <asm/sgx.h> + +#include "cpuid.h" +#include "kvm_cache_regs.h" +#include "nested.h" +#include "sgx.h" +#include "vmx.h" +#include "x86.h" + +bool __read_mostly enable_sgx = 1; +module_param_named(sgx, enable_sgx, bool, 0444); + +/* Initial value of guest's virtual SGX_LEPUBKEYHASHn MSRs */ +static u64 sgx_pubkey_hash[4] __ro_after_init; + +/* + * ENCLS's memory operands use a fixed segment (DS) and a fixed + * address size based on the mode. Related prefixes are ignored. + */ +static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset, + int size, int alignment, gva_t *gva) +{ + struct kvm_segment s; + bool fault; + + /* Skip vmcs.GUEST_DS retrieval for 64-bit mode to avoid VMREADs. */ + *gva = offset; + if (!is_long_mode(vcpu)) { + vmx_get_segment(vcpu, &s, VCPU_SREG_DS); + *gva += s.base; + } + + if (!IS_ALIGNED(*gva, alignment)) { + fault = true; + } else if (likely(is_long_mode(vcpu))) { + fault = is_noncanonical_address(*gva, vcpu); + } else { + *gva &= 0xffffffff; + fault = (s.unusable) || + (s.type != 2 && s.type != 3) || + (*gva > s.limit) || + ((s.base != 0 || s.limit != 0xffffffff) && + (((u64)*gva + size - 1) > s.limit + 1)); + } + if (fault) + kvm_inject_gp(vcpu, 0); + return fault ? -EINVAL : 0; +} + +static void sgx_handle_emulation_failure(struct kvm_vcpu *vcpu, u64 addr, + unsigned int size) +{ + uint64_t data[2] = { addr, size }; + + __kvm_prepare_emulation_failure_exit(vcpu, data, ARRAY_SIZE(data)); +} + +static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data, + unsigned int size) +{ + if (__copy_from_user(data, (void __user *)hva, size)) { + sgx_handle_emulation_failure(vcpu, hva, size); + return -EFAULT; + } + + return 0; +} + +static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write, + gpa_t *gpa) +{ + struct x86_exception ex; + + if (write) + *gpa = kvm_mmu_gva_to_gpa_write(vcpu, gva, &ex); + else + *gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, &ex); + + if (*gpa == INVALID_GPA) { + kvm_inject_emulated_page_fault(vcpu, &ex); + return -EFAULT; + } + + return 0; +} + +static int sgx_gpa_to_hva(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long *hva) +{ + *hva = kvm_vcpu_gfn_to_hva(vcpu, PFN_DOWN(gpa)); + if (kvm_is_error_hva(*hva)) { + sgx_handle_emulation_failure(vcpu, gpa, 1); + return -EFAULT; + } + + *hva |= gpa & ~PAGE_MASK; + + return 0; +} + +static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr) +{ + struct x86_exception ex; + + /* + * A non-EPCM #PF indicates a bad userspace HVA. This *should* check + * for PFEC.SGX and not assume any #PF on SGX2 originated in the EPC, + * but the error code isn't (yet) plumbed through the ENCLS helpers. + */ + if (trapnr == PF_VECTOR && !boot_cpu_has(X86_FEATURE_SGX2)) { + kvm_prepare_emulation_failure_exit(vcpu); + return 0; + } + + /* + * If the guest thinks it's running on SGX2 hardware, inject an SGX + * #PF if the fault matches an EPCM fault signature (#GP on SGX1, + * #PF on SGX2). The assumption is that EPCM faults are much more + * likely than a bad userspace address. + */ + if ((trapnr == PF_VECTOR || !boot_cpu_has(X86_FEATURE_SGX2)) && + guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) { + memset(&ex, 0, sizeof(ex)); + ex.vector = PF_VECTOR; + ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK | + PFERR_SGX_MASK; + ex.address = gva; + ex.error_code_valid = true; + ex.nested_page_fault = false; + kvm_inject_emulated_page_fault(vcpu, &ex); + } else { + kvm_inject_gp(vcpu, 0); + } + return 1; +} + +static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, + struct sgx_pageinfo *pageinfo, + unsigned long secs_hva, + gva_t secs_gva) +{ + struct sgx_secs *contents = (struct sgx_secs *)pageinfo->contents; + struct kvm_cpuid_entry2 *sgx_12_0, *sgx_12_1; + u64 attributes, xfrm, size; + u32 miscselect; + u8 max_size_log2; + int trapnr, ret; + + sgx_12_0 = kvm_find_cpuid_entry_index(vcpu, 0x12, 0); + sgx_12_1 = kvm_find_cpuid_entry_index(vcpu, 0x12, 1); + if (!sgx_12_0 || !sgx_12_1) { + kvm_prepare_emulation_failure_exit(vcpu); + return 0; + } + + miscselect = contents->miscselect; + attributes = contents->attributes; + xfrm = contents->xfrm; + size = contents->size; + + /* Enforce restriction of access to the PROVISIONKEY. */ + if (!vcpu->kvm->arch.sgx_provisioning_allowed && + (attributes & SGX_ATTR_PROVISIONKEY)) { + if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY) + pr_warn_once("KVM: SGX PROVISIONKEY advertised but not allowed\n"); + kvm_inject_gp(vcpu, 0); + return 1; + } + + /* Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. */ + if ((u32)miscselect & ~sgx_12_0->ebx || + (u32)attributes & ~sgx_12_1->eax || + (u32)(attributes >> 32) & ~sgx_12_1->ebx || + (u32)xfrm & ~sgx_12_1->ecx || + (u32)(xfrm >> 32) & ~sgx_12_1->edx) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + /* Enforce CPUID restriction on max enclave size. */ + max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 : + sgx_12_0->edx; + if (size >= BIT_ULL(max_size_log2)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + /* + * sgx_virt_ecreate() returns: + * 1) 0: ECREATE was successful + * 2) -EFAULT: ECREATE was run but faulted, and trapnr was set to the + * exception number. + * 3) -EINVAL: access_ok() on @secs_hva failed. This should never + * happen as KVM checks host addresses at memslot creation. + * sgx_virt_ecreate() has already warned in this case. + */ + ret = sgx_virt_ecreate(pageinfo, (void __user *)secs_hva, &trapnr); + if (!ret) + return kvm_skip_emulated_instruction(vcpu); + if (ret == -EFAULT) + return sgx_inject_fault(vcpu, secs_gva, trapnr); + + return ret; +} + +static int handle_encls_ecreate(struct kvm_vcpu *vcpu) +{ + gva_t pageinfo_gva, secs_gva; + gva_t metadata_gva, contents_gva; + gpa_t metadata_gpa, contents_gpa, secs_gpa; + unsigned long metadata_hva, contents_hva, secs_hva; + struct sgx_pageinfo pageinfo; + struct sgx_secs *contents; + struct x86_exception ex; + int r; + + if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 32, 32, &pageinfo_gva) || + sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva)) + return 1; + + /* + * Copy the PAGEINFO to local memory, its pointers need to be + * translated, i.e. we need to do a deep copy/translate. + */ + r = kvm_read_guest_virt(vcpu, pageinfo_gva, &pageinfo, + sizeof(pageinfo), &ex); + if (r == X86EMUL_PROPAGATE_FAULT) { + kvm_inject_emulated_page_fault(vcpu, &ex); + return 1; + } else if (r != X86EMUL_CONTINUE) { + sgx_handle_emulation_failure(vcpu, pageinfo_gva, + sizeof(pageinfo)); + return 0; + } + + if (sgx_get_encls_gva(vcpu, pageinfo.metadata, 64, 64, &metadata_gva) || + sgx_get_encls_gva(vcpu, pageinfo.contents, 4096, 4096, + &contents_gva)) + return 1; + + /* + * Translate the SECINFO, SOURCE and SECS pointers from GVA to GPA. + * Resume the guest on failure to inject a #PF. + */ + if (sgx_gva_to_gpa(vcpu, metadata_gva, false, &metadata_gpa) || + sgx_gva_to_gpa(vcpu, contents_gva, false, &contents_gpa) || + sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa)) + return 1; + + /* + * ...and then to HVA. The order of accesses isn't architectural, i.e. + * KVM doesn't have to fully process one address at a time. Exit to + * userspace if a GPA is invalid. + */ + if (sgx_gpa_to_hva(vcpu, metadata_gpa, &metadata_hva) || + sgx_gpa_to_hva(vcpu, contents_gpa, &contents_hva) || + sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva)) + return 0; + + /* + * Copy contents into kernel memory to prevent TOCTOU attack. E.g. the + * guest could do ECREATE w/ SECS.SGX_ATTR_PROVISIONKEY=0, and + * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to + * enforce restriction of access to the PROVISIONKEY. + */ + contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT); + if (!contents) + return -ENOMEM; + + /* Exit to userspace if copying from a host userspace address fails. */ + if (sgx_read_hva(vcpu, contents_hva, (void *)contents, PAGE_SIZE)) { + free_page((unsigned long)contents); + return 0; + } + + pageinfo.metadata = metadata_hva; + pageinfo.contents = (u64)contents; + + r = __handle_encls_ecreate(vcpu, &pageinfo, secs_hva, secs_gva); + + free_page((unsigned long)contents); + + return r; +} + +static int handle_encls_einit(struct kvm_vcpu *vcpu) +{ + unsigned long sig_hva, secs_hva, token_hva, rflags; + struct vcpu_vmx *vmx = to_vmx(vcpu); + gva_t sig_gva, secs_gva, token_gva; + gpa_t sig_gpa, secs_gpa, token_gpa; + int ret, trapnr; + + if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 1808, 4096, &sig_gva) || + sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva) || + sgx_get_encls_gva(vcpu, kvm_rdx_read(vcpu), 304, 512, &token_gva)) + return 1; + + /* + * Translate the SIGSTRUCT, SECS and TOKEN pointers from GVA to GPA. + * Resume the guest on failure to inject a #PF. + */ + if (sgx_gva_to_gpa(vcpu, sig_gva, false, &sig_gpa) || + sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa) || + sgx_gva_to_gpa(vcpu, token_gva, false, &token_gpa)) + return 1; + + /* + * ...and then to HVA. The order of accesses isn't architectural, i.e. + * KVM doesn't have to fully process one address at a time. Exit to + * userspace if a GPA is invalid. Note, all structures are aligned and + * cannot split pages. + */ + if (sgx_gpa_to_hva(vcpu, sig_gpa, &sig_hva) || + sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva) || + sgx_gpa_to_hva(vcpu, token_gpa, &token_hva)) + return 0; + + ret = sgx_virt_einit((void __user *)sig_hva, (void __user *)token_hva, + (void __user *)secs_hva, + vmx->msr_ia32_sgxlepubkeyhash, &trapnr); + + if (ret == -EFAULT) + return sgx_inject_fault(vcpu, secs_gva, trapnr); + + /* + * sgx_virt_einit() returns -EINVAL when access_ok() fails on @sig_hva, + * @token_hva or @secs_hva. This should never happen as KVM checks host + * addresses at memslot creation. sgx_virt_einit() has already warned + * in this case, so just return. + */ + if (ret < 0) + return ret; + + rflags = vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | + X86_EFLAGS_AF | X86_EFLAGS_SF | + X86_EFLAGS_OF); + if (ret) + rflags |= X86_EFLAGS_ZF; + else + rflags &= ~X86_EFLAGS_ZF; + vmx_set_rflags(vcpu, rflags); + + kvm_rax_write(vcpu, ret); + return kvm_skip_emulated_instruction(vcpu); +} + +static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf) +{ + if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX)) + return false; + + if (leaf >= ECREATE && leaf <= ETRACK) + return guest_cpuid_has(vcpu, X86_FEATURE_SGX1); + + if (leaf >= EAUG && leaf <= EMODT) + return guest_cpuid_has(vcpu, X86_FEATURE_SGX2); + + return false; +} + +static inline bool sgx_enabled_in_guest_bios(struct kvm_vcpu *vcpu) +{ + const u64 bits = FEAT_CTL_SGX_ENABLED | FEAT_CTL_LOCKED; + + return (to_vmx(vcpu)->msr_ia32_feature_control & bits) == bits; +} + +int handle_encls(struct kvm_vcpu *vcpu) +{ + u32 leaf = (u32)kvm_rax_read(vcpu); + + if (!encls_leaf_enabled_in_guest(vcpu, leaf)) { + kvm_queue_exception(vcpu, UD_VECTOR); + } else if (!sgx_enabled_in_guest_bios(vcpu)) { + kvm_inject_gp(vcpu, 0); + } else { + if (leaf == ECREATE) + return handle_encls_ecreate(vcpu); + if (leaf == EINIT) + return handle_encls_einit(vcpu); + WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf); + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS; + return 0; + } + return 1; +} + +void setup_default_sgx_lepubkeyhash(void) +{ + /* + * Use Intel's default value for Skylake hardware if Launch Control is + * not supported, i.e. Intel's hash is hardcoded into silicon, or if + * Launch Control is supported and enabled, i.e. mimic the reset value + * and let the guest write the MSRs at will. If Launch Control is + * supported but disabled, then use the current MSR values as the hash + * MSRs exist but are read-only (locked and not writable). + */ + if (!enable_sgx || boot_cpu_has(X86_FEATURE_SGX_LC) || + rdmsrl_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) { + sgx_pubkey_hash[0] = 0xa6053e051270b7acULL; + sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL; + sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL; + sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL; + } else { + /* MSR_IA32_SGXLEPUBKEYHASH0 is read above */ + rdmsrl(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]); + rdmsrl(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]); + rdmsrl(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]); + } +} + +void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + memcpy(vmx->msr_ia32_sgxlepubkeyhash, sgx_pubkey_hash, + sizeof(sgx_pubkey_hash)); +} + +/* + * ECREATE must be intercepted to enforce MISCSELECT, ATTRIBUTES and XFRM + * restrictions if the guest's allowed-1 settings diverge from hardware. + */ +static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *guest_cpuid; + u32 eax, ebx, ecx, edx; + + if (!vcpu->kvm->arch.sgx_provisioning_allowed) + return true; + + guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 0); + if (!guest_cpuid) + return true; + + cpuid_count(0x12, 0, &eax, &ebx, &ecx, &edx); + if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx) + return true; + + guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 1); + if (!guest_cpuid) + return true; + + cpuid_count(0x12, 1, &eax, &ebx, &ecx, &edx); + if (guest_cpuid->eax != eax || guest_cpuid->ebx != ebx || + guest_cpuid->ecx != ecx || guest_cpuid->edx != edx) + return true; + + return false; +} + +void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + /* + * There is no software enable bit for SGX that is virtualized by + * hardware, e.g. there's no CR4.SGXE, so when SGX is disabled in the + * guest (either by the host or by the guest's BIOS) but enabled in the + * host, trap all ENCLS leafs and inject #UD/#GP as needed to emulate + * the expected system behavior for ENCLS. + */ + u64 bitmap = -1ull; + + /* Nothing to do if hardware doesn't support SGX */ + if (!cpu_has_vmx_encls_vmexit()) + return; + + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX) && + sgx_enabled_in_guest_bios(vcpu)) { + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) { + bitmap &= ~GENMASK_ULL(ETRACK, ECREATE); + if (sgx_intercept_encls_ecreate(vcpu)) + bitmap |= (1 << ECREATE); + } + + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) + bitmap &= ~GENMASK_ULL(EMODT, EAUG); + + /* + * Trap and execute EINIT if launch control is enabled in the + * host using the guest's values for launch control MSRs, even + * if the guest's values are fixed to hardware default values. + * The MSRs are not loaded/saved on VM-Enter/VM-Exit as writing + * the MSRs is extraordinarily expensive. + */ + if (boot_cpu_has(X86_FEATURE_SGX_LC)) + bitmap |= (1 << EINIT); + + if (!vmcs12 && is_guest_mode(vcpu)) + vmcs12 = get_vmcs12(vcpu); + if (vmcs12 && nested_cpu_has_encls_exit(vmcs12)) + bitmap |= vmcs12->encls_exiting_bitmap; + } + vmcs_write64(ENCLS_EXITING_BITMAP, bitmap); +} diff --git a/arch/x86/kvm/vmx/sgx.h b/arch/x86/kvm/vmx/sgx.h new file mode 100644 index 000000000..a400888b3 --- /dev/null +++ b/arch/x86/kvm/vmx/sgx.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_SGX_H +#define __KVM_X86_SGX_H + +#include <linux/kvm_host.h> + +#include "capabilities.h" +#include "vmx_ops.h" + +#ifdef CONFIG_X86_SGX_KVM +extern bool __read_mostly enable_sgx; + +int handle_encls(struct kvm_vcpu *vcpu); + +void setup_default_sgx_lepubkeyhash(void); +void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu); + +void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); +#else +#define enable_sgx 0 + +static inline void setup_default_sgx_lepubkeyhash(void) { } +static inline void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) { } + +static inline void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + /* Nothing to do if hardware doesn't support SGX */ + if (cpu_has_vmx_encls_vmexit()) + vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); +} +#endif + +#endif /* __KVM_X86_SGX_H */ diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h new file mode 100644 index 000000000..ac290a44a --- /dev/null +++ b/arch/x86/kvm/vmx/vmcs.h @@ -0,0 +1,193 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_VMCS_H +#define __KVM_X86_VMX_VMCS_H + +#include <linux/ktime.h> +#include <linux/list.h> +#include <linux/nospec.h> + +#include <asm/kvm.h> +#include <asm/vmx.h> + +#include "capabilities.h" + +#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) + +struct vmcs_hdr { + u32 revision_id:31; + u32 shadow_vmcs:1; +}; + +struct vmcs { + struct vmcs_hdr hdr; + u32 abort; + char data[]; +}; + +DECLARE_PER_CPU(struct vmcs *, current_vmcs); + +/* + * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT + * and whose values change infrequently, but are not constant. I.e. this is + * used as a write-through cache of the corresponding VMCS fields. + */ +struct vmcs_host_state { + unsigned long cr3; /* May not match real cr3 */ + unsigned long cr4; /* May not match real cr4 */ + unsigned long gs_base; + unsigned long fs_base; + unsigned long rsp; + + u16 fs_sel, gs_sel, ldt_sel; +#ifdef CONFIG_X86_64 + u16 ds_sel, es_sel; +#endif +}; + +struct vmcs_controls_shadow { + u32 vm_entry; + u32 vm_exit; + u32 pin; + u32 exec; + u32 secondary_exec; + u64 tertiary_exec; +}; + +/* + * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also + * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs + * loaded on this CPU (so we can clear them if the CPU goes down). + */ +struct loaded_vmcs { + struct vmcs *vmcs; + struct vmcs *shadow_vmcs; + int cpu; + bool launched; + bool nmi_known_unmasked; + bool hv_timer_soft_disabled; + /* Support for vnmi-less CPUs */ + int soft_vnmi_blocked; + ktime_t entry_time; + s64 vnmi_blocked_time; + unsigned long *msr_bitmap; + struct list_head loaded_vmcss_on_cpu_link; + struct vmcs_host_state host_state; + struct vmcs_controls_shadow controls_shadow; +}; + +static inline bool is_intr_type(u32 intr_info, u32 type) +{ + const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK; + + return (intr_info & mask) == (INTR_INFO_VALID_MASK | type); +} + +static inline bool is_intr_type_n(u32 intr_info, u32 type, u8 vector) +{ + const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK | + INTR_INFO_VECTOR_MASK; + + return (intr_info & mask) == (INTR_INFO_VALID_MASK | type | vector); +} + +static inline bool is_exception_n(u32 intr_info, u8 vector) +{ + return is_intr_type_n(intr_info, INTR_TYPE_HARD_EXCEPTION, vector); +} + +static inline bool is_debug(u32 intr_info) +{ + return is_exception_n(intr_info, DB_VECTOR); +} + +static inline bool is_breakpoint(u32 intr_info) +{ + return is_exception_n(intr_info, BP_VECTOR); +} + +static inline bool is_double_fault(u32 intr_info) +{ + return is_exception_n(intr_info, DF_VECTOR); +} + +static inline bool is_page_fault(u32 intr_info) +{ + return is_exception_n(intr_info, PF_VECTOR); +} + +static inline bool is_invalid_opcode(u32 intr_info) +{ + return is_exception_n(intr_info, UD_VECTOR); +} + +static inline bool is_gp_fault(u32 intr_info) +{ + return is_exception_n(intr_info, GP_VECTOR); +} + +static inline bool is_alignment_check(u32 intr_info) +{ + return is_exception_n(intr_info, AC_VECTOR); +} + +static inline bool is_machine_check(u32 intr_info) +{ + return is_exception_n(intr_info, MC_VECTOR); +} + +static inline bool is_nm_fault(u32 intr_info) +{ + return is_exception_n(intr_info, NM_VECTOR); +} + +/* Undocumented: icebp/int1 */ +static inline bool is_icebp(u32 intr_info) +{ + return is_intr_type(intr_info, INTR_TYPE_PRIV_SW_EXCEPTION); +} + +static inline bool is_nmi(u32 intr_info) +{ + return is_intr_type(intr_info, INTR_TYPE_NMI_INTR); +} + +static inline bool is_external_intr(u32 intr_info) +{ + return is_intr_type(intr_info, INTR_TYPE_EXT_INTR); +} + +static inline bool is_exception_with_error_code(u32 intr_info) +{ + const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK; + + return (intr_info & mask) == mask; +} + +enum vmcs_field_width { + VMCS_FIELD_WIDTH_U16 = 0, + VMCS_FIELD_WIDTH_U64 = 1, + VMCS_FIELD_WIDTH_U32 = 2, + VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3 +}; + +static inline int vmcs_field_width(unsigned long field) +{ + if (0x1 & field) /* the *_HIGH fields are all 32 bit */ + return VMCS_FIELD_WIDTH_U32; + return (field >> 13) & 0x3; +} + +static inline int vmcs_field_readonly(unsigned long field) +{ + return (((field >> 10) & 0x3) == 1); +} + +#define VMCS_FIELD_INDEX_SHIFT (1) +#define VMCS_FIELD_INDEX_MASK GENMASK(9, 1) + +static inline unsigned int vmcs_field_index(unsigned long field) +{ + return (field & VMCS_FIELD_INDEX_MASK) >> VMCS_FIELD_INDEX_SHIFT; +} + +#endif /* __KVM_X86_VMX_VMCS_H */ diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c new file mode 100644 index 000000000..2251b6092 --- /dev/null +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmcs12.h" + +#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) +#define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name) +#define FIELD64(number, name) \ + FIELD(number, name), \ + [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32) + +const unsigned short vmcs12_field_offsets[] = { + FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), + FIELD(POSTED_INTR_NV, posted_intr_nv), + FIELD(GUEST_ES_SELECTOR, guest_es_selector), + FIELD(GUEST_CS_SELECTOR, guest_cs_selector), + FIELD(GUEST_SS_SELECTOR, guest_ss_selector), + FIELD(GUEST_DS_SELECTOR, guest_ds_selector), + FIELD(GUEST_FS_SELECTOR, guest_fs_selector), + FIELD(GUEST_GS_SELECTOR, guest_gs_selector), + FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), + FIELD(GUEST_TR_SELECTOR, guest_tr_selector), + FIELD(GUEST_INTR_STATUS, guest_intr_status), + FIELD(GUEST_PML_INDEX, guest_pml_index), + FIELD(HOST_ES_SELECTOR, host_es_selector), + FIELD(HOST_CS_SELECTOR, host_cs_selector), + FIELD(HOST_SS_SELECTOR, host_ss_selector), + FIELD(HOST_DS_SELECTOR, host_ds_selector), + FIELD(HOST_FS_SELECTOR, host_fs_selector), + FIELD(HOST_GS_SELECTOR, host_gs_selector), + FIELD(HOST_TR_SELECTOR, host_tr_selector), + FIELD64(IO_BITMAP_A, io_bitmap_a), + FIELD64(IO_BITMAP_B, io_bitmap_b), + FIELD64(MSR_BITMAP, msr_bitmap), + FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr), + FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr), + FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr), + FIELD64(PML_ADDRESS, pml_address), + FIELD64(TSC_OFFSET, tsc_offset), + FIELD64(TSC_MULTIPLIER, tsc_multiplier), + FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), + FIELD64(APIC_ACCESS_ADDR, apic_access_addr), + FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr), + FIELD64(VM_FUNCTION_CONTROL, vm_function_control), + FIELD64(EPT_POINTER, ept_pointer), + FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0), + FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), + FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2), + FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3), + FIELD64(EPTP_LIST_ADDRESS, eptp_list_address), + FIELD64(VMREAD_BITMAP, vmread_bitmap), + FIELD64(VMWRITE_BITMAP, vmwrite_bitmap), + FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), + FIELD64(ENCLS_EXITING_BITMAP, encls_exiting_bitmap), + FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), + FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), + FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), + FIELD64(GUEST_IA32_PAT, guest_ia32_pat), + FIELD64(GUEST_IA32_EFER, guest_ia32_efer), + FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl), + FIELD64(GUEST_PDPTR0, guest_pdptr0), + FIELD64(GUEST_PDPTR1, guest_pdptr1), + FIELD64(GUEST_PDPTR2, guest_pdptr2), + FIELD64(GUEST_PDPTR3, guest_pdptr3), + FIELD64(GUEST_BNDCFGS, guest_bndcfgs), + FIELD64(HOST_IA32_PAT, host_ia32_pat), + FIELD64(HOST_IA32_EFER, host_ia32_efer), + FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), + FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control), + FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control), + FIELD(EXCEPTION_BITMAP, exception_bitmap), + FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask), + FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match), + FIELD(CR3_TARGET_COUNT, cr3_target_count), + FIELD(VM_EXIT_CONTROLS, vm_exit_controls), + FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count), + FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count), + FIELD(VM_ENTRY_CONTROLS, vm_entry_controls), + FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count), + FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field), + FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code), + FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len), + FIELD(TPR_THRESHOLD, tpr_threshold), + FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control), + FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error), + FIELD(VM_EXIT_REASON, vm_exit_reason), + FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info), + FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code), + FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field), + FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code), + FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len), + FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info), + FIELD(GUEST_ES_LIMIT, guest_es_limit), + FIELD(GUEST_CS_LIMIT, guest_cs_limit), + FIELD(GUEST_SS_LIMIT, guest_ss_limit), + FIELD(GUEST_DS_LIMIT, guest_ds_limit), + FIELD(GUEST_FS_LIMIT, guest_fs_limit), + FIELD(GUEST_GS_LIMIT, guest_gs_limit), + FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit), + FIELD(GUEST_TR_LIMIT, guest_tr_limit), + FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit), + FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit), + FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes), + FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes), + FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes), + FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes), + FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes), + FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes), + FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes), + FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes), + FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info), + FIELD(GUEST_ACTIVITY_STATE, guest_activity_state), + FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs), + FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs), + FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value), + FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask), + FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), + FIELD(CR0_READ_SHADOW, cr0_read_shadow), + FIELD(CR4_READ_SHADOW, cr4_read_shadow), + FIELD(EXIT_QUALIFICATION, exit_qualification), + FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address), + FIELD(GUEST_CR0, guest_cr0), + FIELD(GUEST_CR3, guest_cr3), + FIELD(GUEST_CR4, guest_cr4), + FIELD(GUEST_ES_BASE, guest_es_base), + FIELD(GUEST_CS_BASE, guest_cs_base), + FIELD(GUEST_SS_BASE, guest_ss_base), + FIELD(GUEST_DS_BASE, guest_ds_base), + FIELD(GUEST_FS_BASE, guest_fs_base), + FIELD(GUEST_GS_BASE, guest_gs_base), + FIELD(GUEST_LDTR_BASE, guest_ldtr_base), + FIELD(GUEST_TR_BASE, guest_tr_base), + FIELD(GUEST_GDTR_BASE, guest_gdtr_base), + FIELD(GUEST_IDTR_BASE, guest_idtr_base), + FIELD(GUEST_DR7, guest_dr7), + FIELD(GUEST_RSP, guest_rsp), + FIELD(GUEST_RIP, guest_rip), + FIELD(GUEST_RFLAGS, guest_rflags), + FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions), + FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp), + FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip), + FIELD(HOST_CR0, host_cr0), + FIELD(HOST_CR3, host_cr3), + FIELD(HOST_CR4, host_cr4), + FIELD(HOST_FS_BASE, host_fs_base), + FIELD(HOST_GS_BASE, host_gs_base), + FIELD(HOST_TR_BASE, host_tr_base), + FIELD(HOST_GDTR_BASE, host_gdtr_base), + FIELD(HOST_IDTR_BASE, host_idtr_base), + FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp), + FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip), + FIELD(HOST_RSP, host_rsp), + FIELD(HOST_RIP, host_rip), +}; +const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs12_field_offsets); diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h new file mode 100644 index 000000000..746129ddd --- /dev/null +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -0,0 +1,430 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_VMCS12_H +#define __KVM_X86_VMX_VMCS12_H + +#include <linux/build_bug.h> + +#include "vmcs.h" + +/* + * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a + * single nested guest (L2), hence the name vmcs12. Any VMX implementation has + * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is + * stored in guest memory specified by VMPTRLD, but is opaque to the guest, + * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. + * More than one of these structures may exist, if L1 runs multiple L2 guests. + * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the + * underlying hardware which will be used to run L2. + * This structure is packed to ensure that its layout is identical across + * machines (necessary for live migration). + * + * IMPORTANT: Changing the layout of existing fields in this structure + * will break save/restore compatibility with older kvm releases. When + * adding new fields, either use space in the reserved padding* arrays + * or add the new fields to the end of the structure. + */ +typedef u64 natural_width; +struct __packed vmcs12 { + /* According to the Intel spec, a VMCS region must start with the + * following two fields. Then follow implementation-specific data. + */ + struct vmcs_hdr hdr; + u32 abort; + + u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ + u32 padding[7]; /* room for future expansion */ + + u64 io_bitmap_a; + u64 io_bitmap_b; + u64 msr_bitmap; + u64 vm_exit_msr_store_addr; + u64 vm_exit_msr_load_addr; + u64 vm_entry_msr_load_addr; + u64 tsc_offset; + u64 virtual_apic_page_addr; + u64 apic_access_addr; + u64 posted_intr_desc_addr; + u64 ept_pointer; + u64 eoi_exit_bitmap0; + u64 eoi_exit_bitmap1; + u64 eoi_exit_bitmap2; + u64 eoi_exit_bitmap3; + u64 xss_exit_bitmap; + u64 guest_physical_address; + u64 vmcs_link_pointer; + u64 guest_ia32_debugctl; + u64 guest_ia32_pat; + u64 guest_ia32_efer; + u64 guest_ia32_perf_global_ctrl; + u64 guest_pdptr0; + u64 guest_pdptr1; + u64 guest_pdptr2; + u64 guest_pdptr3; + u64 guest_bndcfgs; + u64 host_ia32_pat; + u64 host_ia32_efer; + u64 host_ia32_perf_global_ctrl; + u64 vmread_bitmap; + u64 vmwrite_bitmap; + u64 vm_function_control; + u64 eptp_list_address; + u64 pml_address; + u64 encls_exiting_bitmap; + u64 tsc_multiplier; + u64 padding64[1]; /* room for future expansion */ + /* + * To allow migration of L1 (complete with its L2 guests) between + * machines of different natural widths (32 or 64 bit), we cannot have + * unsigned long fields with no explicit size. We use u64 (aliased + * natural_width) instead. Luckily, x86 is little-endian. + */ + natural_width cr0_guest_host_mask; + natural_width cr4_guest_host_mask; + natural_width cr0_read_shadow; + natural_width cr4_read_shadow; + natural_width dead_space[4]; /* Last remnants of cr3_target_value[0-3]. */ + natural_width exit_qualification; + natural_width guest_linear_address; + natural_width guest_cr0; + natural_width guest_cr3; + natural_width guest_cr4; + natural_width guest_es_base; + natural_width guest_cs_base; + natural_width guest_ss_base; + natural_width guest_ds_base; + natural_width guest_fs_base; + natural_width guest_gs_base; + natural_width guest_ldtr_base; + natural_width guest_tr_base; + natural_width guest_gdtr_base; + natural_width guest_idtr_base; + natural_width guest_dr7; + natural_width guest_rsp; + natural_width guest_rip; + natural_width guest_rflags; + natural_width guest_pending_dbg_exceptions; + natural_width guest_sysenter_esp; + natural_width guest_sysenter_eip; + natural_width host_cr0; + natural_width host_cr3; + natural_width host_cr4; + natural_width host_fs_base; + natural_width host_gs_base; + natural_width host_tr_base; + natural_width host_gdtr_base; + natural_width host_idtr_base; + natural_width host_ia32_sysenter_esp; + natural_width host_ia32_sysenter_eip; + natural_width host_rsp; + natural_width host_rip; + natural_width paddingl[8]; /* room for future expansion */ + u32 pin_based_vm_exec_control; + u32 cpu_based_vm_exec_control; + u32 exception_bitmap; + u32 page_fault_error_code_mask; + u32 page_fault_error_code_match; + u32 cr3_target_count; + u32 vm_exit_controls; + u32 vm_exit_msr_store_count; + u32 vm_exit_msr_load_count; + u32 vm_entry_controls; + u32 vm_entry_msr_load_count; + u32 vm_entry_intr_info_field; + u32 vm_entry_exception_error_code; + u32 vm_entry_instruction_len; + u32 tpr_threshold; + u32 secondary_vm_exec_control; + u32 vm_instruction_error; + u32 vm_exit_reason; + u32 vm_exit_intr_info; + u32 vm_exit_intr_error_code; + u32 idt_vectoring_info_field; + u32 idt_vectoring_error_code; + u32 vm_exit_instruction_len; + u32 vmx_instruction_info; + u32 guest_es_limit; + u32 guest_cs_limit; + u32 guest_ss_limit; + u32 guest_ds_limit; + u32 guest_fs_limit; + u32 guest_gs_limit; + u32 guest_ldtr_limit; + u32 guest_tr_limit; + u32 guest_gdtr_limit; + u32 guest_idtr_limit; + u32 guest_es_ar_bytes; + u32 guest_cs_ar_bytes; + u32 guest_ss_ar_bytes; + u32 guest_ds_ar_bytes; + u32 guest_fs_ar_bytes; + u32 guest_gs_ar_bytes; + u32 guest_ldtr_ar_bytes; + u32 guest_tr_ar_bytes; + u32 guest_interruptibility_info; + u32 guest_activity_state; + u32 guest_sysenter_cs; + u32 host_ia32_sysenter_cs; + u32 vmx_preemption_timer_value; + u32 padding32[7]; /* room for future expansion */ + u16 virtual_processor_id; + u16 posted_intr_nv; + u16 guest_es_selector; + u16 guest_cs_selector; + u16 guest_ss_selector; + u16 guest_ds_selector; + u16 guest_fs_selector; + u16 guest_gs_selector; + u16 guest_ldtr_selector; + u16 guest_tr_selector; + u16 guest_intr_status; + u16 host_es_selector; + u16 host_cs_selector; + u16 host_ss_selector; + u16 host_ds_selector; + u16 host_fs_selector; + u16 host_gs_selector; + u16 host_tr_selector; + u16 guest_pml_index; +}; + +/* + * VMCS12_REVISION is an arbitrary id that should be changed if the content or + * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and + * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. + * + * IMPORTANT: Changing this value will break save/restore compatibility with + * older kvm releases. + */ +#define VMCS12_REVISION 0x11e57ed0 + +/* + * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region + * and any VMCS region. Although only sizeof(struct vmcs12) are used by the + * current implementation, 4K are reserved to avoid future complications and + * to preserve userspace ABI. + */ +#define VMCS12_SIZE KVM_STATE_NESTED_VMX_VMCS_SIZE + +/* + * For save/restore compatibility, the vmcs12 field offsets must not change. + */ +#define CHECK_OFFSET(field, loc) \ + BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \ + "Offset of " #field " in struct vmcs12 has changed.") + +static inline void vmx_check_vmcs12_offsets(void) +{ + CHECK_OFFSET(hdr, 0); + CHECK_OFFSET(abort, 4); + CHECK_OFFSET(launch_state, 8); + CHECK_OFFSET(io_bitmap_a, 40); + CHECK_OFFSET(io_bitmap_b, 48); + CHECK_OFFSET(msr_bitmap, 56); + CHECK_OFFSET(vm_exit_msr_store_addr, 64); + CHECK_OFFSET(vm_exit_msr_load_addr, 72); + CHECK_OFFSET(vm_entry_msr_load_addr, 80); + CHECK_OFFSET(tsc_offset, 88); + CHECK_OFFSET(virtual_apic_page_addr, 96); + CHECK_OFFSET(apic_access_addr, 104); + CHECK_OFFSET(posted_intr_desc_addr, 112); + CHECK_OFFSET(ept_pointer, 120); + CHECK_OFFSET(eoi_exit_bitmap0, 128); + CHECK_OFFSET(eoi_exit_bitmap1, 136); + CHECK_OFFSET(eoi_exit_bitmap2, 144); + CHECK_OFFSET(eoi_exit_bitmap3, 152); + CHECK_OFFSET(xss_exit_bitmap, 160); + CHECK_OFFSET(guest_physical_address, 168); + CHECK_OFFSET(vmcs_link_pointer, 176); + CHECK_OFFSET(guest_ia32_debugctl, 184); + CHECK_OFFSET(guest_ia32_pat, 192); + CHECK_OFFSET(guest_ia32_efer, 200); + CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208); + CHECK_OFFSET(guest_pdptr0, 216); + CHECK_OFFSET(guest_pdptr1, 224); + CHECK_OFFSET(guest_pdptr2, 232); + CHECK_OFFSET(guest_pdptr3, 240); + CHECK_OFFSET(guest_bndcfgs, 248); + CHECK_OFFSET(host_ia32_pat, 256); + CHECK_OFFSET(host_ia32_efer, 264); + CHECK_OFFSET(host_ia32_perf_global_ctrl, 272); + CHECK_OFFSET(vmread_bitmap, 280); + CHECK_OFFSET(vmwrite_bitmap, 288); + CHECK_OFFSET(vm_function_control, 296); + CHECK_OFFSET(eptp_list_address, 304); + CHECK_OFFSET(pml_address, 312); + CHECK_OFFSET(encls_exiting_bitmap, 320); + CHECK_OFFSET(tsc_multiplier, 328); + CHECK_OFFSET(cr0_guest_host_mask, 344); + CHECK_OFFSET(cr4_guest_host_mask, 352); + CHECK_OFFSET(cr0_read_shadow, 360); + CHECK_OFFSET(cr4_read_shadow, 368); + CHECK_OFFSET(dead_space, 376); + CHECK_OFFSET(exit_qualification, 408); + CHECK_OFFSET(guest_linear_address, 416); + CHECK_OFFSET(guest_cr0, 424); + CHECK_OFFSET(guest_cr3, 432); + CHECK_OFFSET(guest_cr4, 440); + CHECK_OFFSET(guest_es_base, 448); + CHECK_OFFSET(guest_cs_base, 456); + CHECK_OFFSET(guest_ss_base, 464); + CHECK_OFFSET(guest_ds_base, 472); + CHECK_OFFSET(guest_fs_base, 480); + CHECK_OFFSET(guest_gs_base, 488); + CHECK_OFFSET(guest_ldtr_base, 496); + CHECK_OFFSET(guest_tr_base, 504); + CHECK_OFFSET(guest_gdtr_base, 512); + CHECK_OFFSET(guest_idtr_base, 520); + CHECK_OFFSET(guest_dr7, 528); + CHECK_OFFSET(guest_rsp, 536); + CHECK_OFFSET(guest_rip, 544); + CHECK_OFFSET(guest_rflags, 552); + CHECK_OFFSET(guest_pending_dbg_exceptions, 560); + CHECK_OFFSET(guest_sysenter_esp, 568); + CHECK_OFFSET(guest_sysenter_eip, 576); + CHECK_OFFSET(host_cr0, 584); + CHECK_OFFSET(host_cr3, 592); + CHECK_OFFSET(host_cr4, 600); + CHECK_OFFSET(host_fs_base, 608); + CHECK_OFFSET(host_gs_base, 616); + CHECK_OFFSET(host_tr_base, 624); + CHECK_OFFSET(host_gdtr_base, 632); + CHECK_OFFSET(host_idtr_base, 640); + CHECK_OFFSET(host_ia32_sysenter_esp, 648); + CHECK_OFFSET(host_ia32_sysenter_eip, 656); + CHECK_OFFSET(host_rsp, 664); + CHECK_OFFSET(host_rip, 672); + CHECK_OFFSET(pin_based_vm_exec_control, 744); + CHECK_OFFSET(cpu_based_vm_exec_control, 748); + CHECK_OFFSET(exception_bitmap, 752); + CHECK_OFFSET(page_fault_error_code_mask, 756); + CHECK_OFFSET(page_fault_error_code_match, 760); + CHECK_OFFSET(cr3_target_count, 764); + CHECK_OFFSET(vm_exit_controls, 768); + CHECK_OFFSET(vm_exit_msr_store_count, 772); + CHECK_OFFSET(vm_exit_msr_load_count, 776); + CHECK_OFFSET(vm_entry_controls, 780); + CHECK_OFFSET(vm_entry_msr_load_count, 784); + CHECK_OFFSET(vm_entry_intr_info_field, 788); + CHECK_OFFSET(vm_entry_exception_error_code, 792); + CHECK_OFFSET(vm_entry_instruction_len, 796); + CHECK_OFFSET(tpr_threshold, 800); + CHECK_OFFSET(secondary_vm_exec_control, 804); + CHECK_OFFSET(vm_instruction_error, 808); + CHECK_OFFSET(vm_exit_reason, 812); + CHECK_OFFSET(vm_exit_intr_info, 816); + CHECK_OFFSET(vm_exit_intr_error_code, 820); + CHECK_OFFSET(idt_vectoring_info_field, 824); + CHECK_OFFSET(idt_vectoring_error_code, 828); + CHECK_OFFSET(vm_exit_instruction_len, 832); + CHECK_OFFSET(vmx_instruction_info, 836); + CHECK_OFFSET(guest_es_limit, 840); + CHECK_OFFSET(guest_cs_limit, 844); + CHECK_OFFSET(guest_ss_limit, 848); + CHECK_OFFSET(guest_ds_limit, 852); + CHECK_OFFSET(guest_fs_limit, 856); + CHECK_OFFSET(guest_gs_limit, 860); + CHECK_OFFSET(guest_ldtr_limit, 864); + CHECK_OFFSET(guest_tr_limit, 868); + CHECK_OFFSET(guest_gdtr_limit, 872); + CHECK_OFFSET(guest_idtr_limit, 876); + CHECK_OFFSET(guest_es_ar_bytes, 880); + CHECK_OFFSET(guest_cs_ar_bytes, 884); + CHECK_OFFSET(guest_ss_ar_bytes, 888); + CHECK_OFFSET(guest_ds_ar_bytes, 892); + CHECK_OFFSET(guest_fs_ar_bytes, 896); + CHECK_OFFSET(guest_gs_ar_bytes, 900); + CHECK_OFFSET(guest_ldtr_ar_bytes, 904); + CHECK_OFFSET(guest_tr_ar_bytes, 908); + CHECK_OFFSET(guest_interruptibility_info, 912); + CHECK_OFFSET(guest_activity_state, 916); + CHECK_OFFSET(guest_sysenter_cs, 920); + CHECK_OFFSET(host_ia32_sysenter_cs, 924); + CHECK_OFFSET(vmx_preemption_timer_value, 928); + CHECK_OFFSET(virtual_processor_id, 960); + CHECK_OFFSET(posted_intr_nv, 962); + CHECK_OFFSET(guest_es_selector, 964); + CHECK_OFFSET(guest_cs_selector, 966); + CHECK_OFFSET(guest_ss_selector, 968); + CHECK_OFFSET(guest_ds_selector, 970); + CHECK_OFFSET(guest_fs_selector, 972); + CHECK_OFFSET(guest_gs_selector, 974); + CHECK_OFFSET(guest_ldtr_selector, 976); + CHECK_OFFSET(guest_tr_selector, 978); + CHECK_OFFSET(guest_intr_status, 980); + CHECK_OFFSET(host_es_selector, 982); + CHECK_OFFSET(host_cs_selector, 984); + CHECK_OFFSET(host_ss_selector, 986); + CHECK_OFFSET(host_ds_selector, 988); + CHECK_OFFSET(host_fs_selector, 990); + CHECK_OFFSET(host_gs_selector, 992); + CHECK_OFFSET(host_tr_selector, 994); + CHECK_OFFSET(guest_pml_index, 996); +} + +extern const unsigned short vmcs12_field_offsets[]; +extern const unsigned int nr_vmcs12_fields; + +static inline short get_vmcs12_field_offset(unsigned long field) +{ + unsigned short offset; + unsigned int index; + + if (field >> 15) + return -ENOENT; + + index = ROL16(field, 6); + if (index >= nr_vmcs12_fields) + return -ENOENT; + + index = array_index_nospec(index, nr_vmcs12_fields); + offset = vmcs12_field_offsets[index]; + if (offset == 0) + return -ENOENT; + return offset; +} + +static inline u64 vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field, + u16 offset) +{ + char *p = (char *)vmcs12 + offset; + + switch (vmcs_field_width(field)) { + case VMCS_FIELD_WIDTH_NATURAL_WIDTH: + return *((natural_width *)p); + case VMCS_FIELD_WIDTH_U16: + return *((u16 *)p); + case VMCS_FIELD_WIDTH_U32: + return *((u32 *)p); + case VMCS_FIELD_WIDTH_U64: + return *((u64 *)p); + default: + WARN_ON_ONCE(1); + return -1; + } +} + +static inline void vmcs12_write_any(struct vmcs12 *vmcs12, unsigned long field, + u16 offset, u64 field_value) +{ + char *p = (char *)vmcs12 + offset; + + switch (vmcs_field_width(field)) { + case VMCS_FIELD_WIDTH_U16: + *(u16 *)p = field_value; + break; + case VMCS_FIELD_WIDTH_U32: + *(u32 *)p = field_value; + break; + case VMCS_FIELD_WIDTH_U64: + *(u64 *)p = field_value; + break; + case VMCS_FIELD_WIDTH_NATURAL_WIDTH: + *(natural_width *)p = field_value; + break; + default: + WARN_ON_ONCE(1); + break; + } +} + +#endif /* __KVM_X86_VMX_VMCS12_H */ diff --git a/arch/x86/kvm/vmx/vmcs_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h new file mode 100644 index 000000000..cad128d16 --- /dev/null +++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h @@ -0,0 +1,79 @@ +#if !defined(SHADOW_FIELD_RO) && !defined(SHADOW_FIELD_RW) +BUILD_BUG_ON(1) +#endif + +#ifndef SHADOW_FIELD_RO +#define SHADOW_FIELD_RO(x, y) +#endif +#ifndef SHADOW_FIELD_RW +#define SHADOW_FIELD_RW(x, y) +#endif + +/* + * We do NOT shadow fields that are modified when L0 + * traps and emulates any vmx instruction (e.g. VMPTRLD, + * VMXON...) executed by L1. + * For example, VM_INSTRUCTION_ERROR is read + * by L1 if a vmx instruction fails (part of the error path). + * Note the code assumes this logic. If for some reason + * we start shadowing these fields then we need to + * force a shadow sync when L0 emulates vmx instructions + * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified + * by nested_vmx_failValid) + * + * When adding or removing fields here, note that shadowed + * fields must always be synced by prepare_vmcs02, not just + * prepare_vmcs02_rare. + */ + +/* + * Keeping the fields ordered by size is an attempt at improving + * branch prediction in vmcs12_read_any and vmcs12_write_any. + */ + +/* 16-bits */ +SHADOW_FIELD_RW(GUEST_INTR_STATUS, guest_intr_status) +SHADOW_FIELD_RW(GUEST_PML_INDEX, guest_pml_index) +SHADOW_FIELD_RW(HOST_FS_SELECTOR, host_fs_selector) +SHADOW_FIELD_RW(HOST_GS_SELECTOR, host_gs_selector) + +/* 32-bits */ +SHADOW_FIELD_RO(VM_EXIT_REASON, vm_exit_reason) +SHADOW_FIELD_RO(VM_EXIT_INTR_INFO, vm_exit_intr_info) +SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len) +SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field) +SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code) +SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code) +SHADOW_FIELD_RO(GUEST_CS_AR_BYTES, guest_cs_ar_bytes) +SHADOW_FIELD_RO(GUEST_SS_AR_BYTES, guest_ss_ar_bytes) +SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control) +SHADOW_FIELD_RW(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control) +SHADOW_FIELD_RW(EXCEPTION_BITMAP, exception_bitmap) +SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code) +SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field) +SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len) +SHADOW_FIELD_RW(TPR_THRESHOLD, tpr_threshold) +SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info) +SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value) + +/* Natural width */ +SHADOW_FIELD_RO(EXIT_QUALIFICATION, exit_qualification) +SHADOW_FIELD_RO(GUEST_LINEAR_ADDRESS, guest_linear_address) +SHADOW_FIELD_RW(GUEST_RIP, guest_rip) +SHADOW_FIELD_RW(GUEST_RSP, guest_rsp) +SHADOW_FIELD_RW(GUEST_CR0, guest_cr0) +SHADOW_FIELD_RW(GUEST_CR3, guest_cr3) +SHADOW_FIELD_RW(GUEST_CR4, guest_cr4) +SHADOW_FIELD_RW(GUEST_RFLAGS, guest_rflags) +SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK, cr0_guest_host_mask) +SHADOW_FIELD_RW(CR0_READ_SHADOW, cr0_read_shadow) +SHADOW_FIELD_RW(CR4_READ_SHADOW, cr4_read_shadow) +SHADOW_FIELD_RW(HOST_FS_BASE, host_fs_base) +SHADOW_FIELD_RW(HOST_GS_BASE, host_gs_base) + +/* 64-bit */ +SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS, guest_physical_address) +SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH, guest_physical_address) + +#undef SHADOW_FIELD_RO +#undef SHADOW_FIELD_RW diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S new file mode 100644 index 000000000..0b5db4de4 --- /dev/null +++ b/arch/x86/kvm/vmx/vmenter.S @@ -0,0 +1,352 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> +#include <asm/asm.h> +#include <asm/bitsperlong.h> +#include <asm/kvm_vcpu_regs.h> +#include <asm/nospec-branch.h> +#include <asm/percpu.h> +#include <asm/segment.h> +#include "kvm-asm-offsets.h" +#include "run_flags.h" + +#define WORD_SIZE (BITS_PER_LONG / 8) + +#define VCPU_RAX __VCPU_REGS_RAX * WORD_SIZE +#define VCPU_RCX __VCPU_REGS_RCX * WORD_SIZE +#define VCPU_RDX __VCPU_REGS_RDX * WORD_SIZE +#define VCPU_RBX __VCPU_REGS_RBX * WORD_SIZE +/* Intentionally omit RSP as it's context switched by hardware */ +#define VCPU_RBP __VCPU_REGS_RBP * WORD_SIZE +#define VCPU_RSI __VCPU_REGS_RSI * WORD_SIZE +#define VCPU_RDI __VCPU_REGS_RDI * WORD_SIZE + +#ifdef CONFIG_X86_64 +#define VCPU_R8 __VCPU_REGS_R8 * WORD_SIZE +#define VCPU_R9 __VCPU_REGS_R9 * WORD_SIZE +#define VCPU_R10 __VCPU_REGS_R10 * WORD_SIZE +#define VCPU_R11 __VCPU_REGS_R11 * WORD_SIZE +#define VCPU_R12 __VCPU_REGS_R12 * WORD_SIZE +#define VCPU_R13 __VCPU_REGS_R13 * WORD_SIZE +#define VCPU_R14 __VCPU_REGS_R14 * WORD_SIZE +#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE +#endif + +.section .noinstr.text, "ax" + +/** + * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode + * @vmx: struct vcpu_vmx * + * @regs: unsigned long * (to guest registers) + * @flags: VMX_RUN_VMRESUME: use VMRESUME instead of VMLAUNCH + * VMX_RUN_SAVE_SPEC_CTRL: save guest SPEC_CTRL into vmx->spec_ctrl + * + * Returns: + * 0 on VM-Exit, 1 on VM-Fail + */ +SYM_FUNC_START(__vmx_vcpu_run) + push %_ASM_BP + mov %_ASM_SP, %_ASM_BP +#ifdef CONFIG_X86_64 + push %r15 + push %r14 + push %r13 + push %r12 +#else + push %edi + push %esi +#endif + push %_ASM_BX + + /* Save @vmx for SPEC_CTRL handling */ + push %_ASM_ARG1 + + /* Save @flags for SPEC_CTRL handling */ + push %_ASM_ARG3 + + /* + * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and + * @regs is needed after VM-Exit to save the guest's register values. + */ + push %_ASM_ARG2 + + /* Copy @flags to BL, _ASM_ARG3 is volatile. */ + mov %_ASM_ARG3B, %bl + + lea (%_ASM_SP), %_ASM_ARG2 + call vmx_update_host_rsp + + ALTERNATIVE "jmp .Lspec_ctrl_done", "", X86_FEATURE_MSR_SPEC_CTRL + + /* + * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the + * host's, write the MSR. + * + * IMPORTANT: To avoid RSB underflow attacks and any other nastiness, + * there must not be any returns or indirect branches between this code + * and vmentry. + */ + mov 2*WORD_SIZE(%_ASM_SP), %_ASM_DI + movl VMX_spec_ctrl(%_ASM_DI), %edi + movl PER_CPU_VAR(x86_spec_ctrl_current), %esi + cmp %edi, %esi + je .Lspec_ctrl_done + mov $MSR_IA32_SPEC_CTRL, %ecx + xor %edx, %edx + mov %edi, %eax + wrmsr + +.Lspec_ctrl_done: + + /* + * Since vmentry is serializing on affected CPUs, there's no need for + * an LFENCE to stop speculation from skipping the wrmsr. + */ + + /* Load @regs to RAX. */ + mov (%_ASM_SP), %_ASM_AX + + /* Check if vmlaunch or vmresume is needed */ + testb $VMX_RUN_VMRESUME, %bl + + /* Load guest registers. Don't clobber flags. */ + mov VCPU_RCX(%_ASM_AX), %_ASM_CX + mov VCPU_RDX(%_ASM_AX), %_ASM_DX + mov VCPU_RBX(%_ASM_AX), %_ASM_BX + mov VCPU_RBP(%_ASM_AX), %_ASM_BP + mov VCPU_RSI(%_ASM_AX), %_ASM_SI + mov VCPU_RDI(%_ASM_AX), %_ASM_DI +#ifdef CONFIG_X86_64 + mov VCPU_R8 (%_ASM_AX), %r8 + mov VCPU_R9 (%_ASM_AX), %r9 + mov VCPU_R10(%_ASM_AX), %r10 + mov VCPU_R11(%_ASM_AX), %r11 + mov VCPU_R12(%_ASM_AX), %r12 + mov VCPU_R13(%_ASM_AX), %r13 + mov VCPU_R14(%_ASM_AX), %r14 + mov VCPU_R15(%_ASM_AX), %r15 +#endif + /* Load guest RAX. This kills the @regs pointer! */ + mov VCPU_RAX(%_ASM_AX), %_ASM_AX + + /* Check EFLAGS.ZF from 'testb' above */ + jz .Lvmlaunch + + /* + * After a successful VMRESUME/VMLAUNCH, control flow "magically" + * resumes below at 'vmx_vmexit' due to the VMCS HOST_RIP setting. + * So this isn't a typical function and objtool needs to be told to + * save the unwind state here and restore it below. + */ + UNWIND_HINT_SAVE + +/* + * If VMRESUME/VMLAUNCH and corresponding vmexit succeed, execution resumes at + * the 'vmx_vmexit' label below. + */ +.Lvmresume: + vmresume + jmp .Lvmfail + +.Lvmlaunch: + vmlaunch + jmp .Lvmfail + + _ASM_EXTABLE(.Lvmresume, .Lfixup) + _ASM_EXTABLE(.Lvmlaunch, .Lfixup) + +SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL) + + /* Restore unwind state from before the VMRESUME/VMLAUNCH. */ + UNWIND_HINT_RESTORE + ENDBR + + /* Temporarily save guest's RAX. */ + push %_ASM_AX + + /* Reload @regs to RAX. */ + mov WORD_SIZE(%_ASM_SP), %_ASM_AX + + /* Save all guest registers, including RAX from the stack */ + pop VCPU_RAX(%_ASM_AX) + mov %_ASM_CX, VCPU_RCX(%_ASM_AX) + mov %_ASM_DX, VCPU_RDX(%_ASM_AX) + mov %_ASM_BX, VCPU_RBX(%_ASM_AX) + mov %_ASM_BP, VCPU_RBP(%_ASM_AX) + mov %_ASM_SI, VCPU_RSI(%_ASM_AX) + mov %_ASM_DI, VCPU_RDI(%_ASM_AX) +#ifdef CONFIG_X86_64 + mov %r8, VCPU_R8 (%_ASM_AX) + mov %r9, VCPU_R9 (%_ASM_AX) + mov %r10, VCPU_R10(%_ASM_AX) + mov %r11, VCPU_R11(%_ASM_AX) + mov %r12, VCPU_R12(%_ASM_AX) + mov %r13, VCPU_R13(%_ASM_AX) + mov %r14, VCPU_R14(%_ASM_AX) + mov %r15, VCPU_R15(%_ASM_AX) +#endif + + /* Clear return value to indicate VM-Exit (as opposed to VM-Fail). */ + xor %ebx, %ebx + +.Lclear_regs: + /* Discard @regs. The register is irrelevant, it just can't be RBX. */ + pop %_ASM_AX + + /* + * Clear all general purpose registers except RSP and RBX to prevent + * speculative use of the guest's values, even those that are reloaded + * via the stack. In theory, an L1 cache miss when restoring registers + * could lead to speculative execution with the guest's values. + * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially + * free. RSP and RBX are exempt as RSP is restored by hardware during + * VM-Exit and RBX is explicitly loaded with 0 or 1 to hold the return + * value. + */ + xor %eax, %eax + xor %ecx, %ecx + xor %edx, %edx + xor %ebp, %ebp + xor %esi, %esi + xor %edi, %edi +#ifdef CONFIG_X86_64 + xor %r8d, %r8d + xor %r9d, %r9d + xor %r10d, %r10d + xor %r11d, %r11d + xor %r12d, %r12d + xor %r13d, %r13d + xor %r14d, %r14d + xor %r15d, %r15d +#endif + + /* + * IMPORTANT: RSB filling and SPEC_CTRL handling must be done before + * the first unbalanced RET after vmexit! + * + * For retpoline or IBRS, RSB filling is needed to prevent poisoned RSB + * entries and (in some cases) RSB underflow. + * + * eIBRS has its own protection against poisoned RSB, so it doesn't + * need the RSB filling sequence. But it does need to be enabled, and a + * single call to retire, before the first unbalanced RET. + */ + + FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT,\ + X86_FEATURE_RSB_VMEXIT_LITE + + pop %_ASM_ARG2 /* @flags */ + pop %_ASM_ARG1 /* @vmx */ + + call vmx_spec_ctrl_restore_host + + /* Put return value in AX */ + mov %_ASM_BX, %_ASM_AX + + pop %_ASM_BX +#ifdef CONFIG_X86_64 + pop %r12 + pop %r13 + pop %r14 + pop %r15 +#else + pop %esi + pop %edi +#endif + pop %_ASM_BP + RET + +.Lfixup: + cmpb $0, kvm_rebooting + jne .Lvmfail + ud2 +.Lvmfail: + /* VM-Fail: set return value to 1 */ + mov $1, %_ASM_BX + jmp .Lclear_regs + +SYM_FUNC_END(__vmx_vcpu_run) + + +.section .text, "ax" + +/** + * vmread_error_trampoline - Trampoline from inline asm to vmread_error() + * @field: VMCS field encoding that failed + * @fault: %true if the VMREAD faulted, %false if it failed + + * Save and restore volatile registers across a call to vmread_error(). Note, + * all parameters are passed on the stack. + */ +SYM_FUNC_START(vmread_error_trampoline) + push %_ASM_BP + mov %_ASM_SP, %_ASM_BP + + push %_ASM_AX + push %_ASM_CX + push %_ASM_DX +#ifdef CONFIG_X86_64 + push %rdi + push %rsi + push %r8 + push %r9 + push %r10 + push %r11 +#endif + + /* Load @field and @fault to arg1 and arg2 respectively. */ + mov 3*WORD_SIZE(%_ASM_BP), %_ASM_ARG2 + mov 2*WORD_SIZE(%_ASM_BP), %_ASM_ARG1 + + call vmread_error + + /* Zero out @fault, which will be popped into the result register. */ + _ASM_MOV $0, 3*WORD_SIZE(%_ASM_BP) + +#ifdef CONFIG_X86_64 + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rsi + pop %rdi +#endif + pop %_ASM_DX + pop %_ASM_CX + pop %_ASM_AX + pop %_ASM_BP + + RET +SYM_FUNC_END(vmread_error_trampoline) + +SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff) + /* + * Unconditionally create a stack frame, getting the correct RSP on the + * stack (for x86-64) would take two instructions anyways, and RBP can + * be used to restore RSP to make objtool happy (see below). + */ + push %_ASM_BP + mov %_ASM_SP, %_ASM_BP + +#ifdef CONFIG_X86_64 + /* + * Align RSP to a 16-byte boundary (to emulate CPU behavior) before + * creating the synthetic interrupt stack frame for the IRQ/NMI. + */ + and $-16, %rsp + push $__KERNEL_DS + push %rbp +#endif + pushf + push $__KERNEL_CS + CALL_NOSPEC _ASM_ARG1 + + /* + * "Restore" RSP from RBP, even though IRET has already unwound RSP to + * the correct value. objtool doesn't know the callee will IRET and, + * without the explicit restore, thinks the stack is getting walloped. + * Using an unwind hint is problematic due to x86-64's dynamic alignment. + */ + mov %_ASM_BP, %_ASM_SP + pop %_ASM_BP + RET +SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c new file mode 100644 index 000000000..98d732b94 --- /dev/null +++ b/arch/x86/kvm/vmx/vmx.c @@ -0,0 +1,8628 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + */ + +#include <linux/highmem.h> +#include <linux/hrtimer.h> +#include <linux/kernel.h> +#include <linux/kvm_host.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/mod_devicetable.h> +#include <linux/mm.h> +#include <linux/objtool.h> +#include <linux/sched.h> +#include <linux/sched/smt.h> +#include <linux/slab.h> +#include <linux/tboot.h> +#include <linux/trace_events.h> +#include <linux/entry-kvm.h> + +#include <asm/apic.h> +#include <asm/asm.h> +#include <asm/cpu.h> +#include <asm/cpu_device_id.h> +#include <asm/debugreg.h> +#include <asm/desc.h> +#include <asm/fpu/api.h> +#include <asm/fpu/xstate.h> +#include <asm/idtentry.h> +#include <asm/io.h> +#include <asm/irq_remapping.h> +#include <asm/reboot.h> +#include <asm/perf_event.h> +#include <asm/mmu_context.h> +#include <asm/mshyperv.h> +#include <asm/mwait.h> +#include <asm/spec-ctrl.h> +#include <asm/virtext.h> +#include <asm/vmx.h> + +#include "capabilities.h" +#include "cpuid.h" +#include "evmcs.h" +#include "hyperv.h" +#include "kvm_onhyperv.h" +#include "irq.h" +#include "kvm_cache_regs.h" +#include "lapic.h" +#include "mmu.h" +#include "nested.h" +#include "pmu.h" +#include "sgx.h" +#include "trace.h" +#include "vmcs.h" +#include "vmcs12.h" +#include "vmx.h" +#include "x86.h" + +MODULE_AUTHOR("Qumranet"); +MODULE_LICENSE("GPL"); + +#ifdef MODULE +static const struct x86_cpu_id vmx_cpu_id[] = { + X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); +#endif + +bool __read_mostly enable_vpid = 1; +module_param_named(vpid, enable_vpid, bool, 0444); + +static bool __read_mostly enable_vnmi = 1; +module_param_named(vnmi, enable_vnmi, bool, S_IRUGO); + +bool __read_mostly flexpriority_enabled = 1; +module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); + +bool __read_mostly enable_ept = 1; +module_param_named(ept, enable_ept, bool, S_IRUGO); + +bool __read_mostly enable_unrestricted_guest = 1; +module_param_named(unrestricted_guest, + enable_unrestricted_guest, bool, S_IRUGO); + +bool __read_mostly enable_ept_ad_bits = 1; +module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); + +static bool __read_mostly emulate_invalid_guest_state = true; +module_param(emulate_invalid_guest_state, bool, S_IRUGO); + +static bool __read_mostly fasteoi = 1; +module_param(fasteoi, bool, S_IRUGO); + +module_param(enable_apicv, bool, S_IRUGO); + +bool __read_mostly enable_ipiv = true; +module_param(enable_ipiv, bool, 0444); + +/* + * If nested=1, nested virtualization is supported, i.e., guests may use + * VMX and be a hypervisor for its own guests. If nested=0, guests may not + * use VMX instructions. + */ +static bool __read_mostly nested = 1; +module_param(nested, bool, S_IRUGO); + +bool __read_mostly enable_pml = 1; +module_param_named(pml, enable_pml, bool, S_IRUGO); + +static bool __read_mostly error_on_inconsistent_vmcs_config = true; +module_param(error_on_inconsistent_vmcs_config, bool, 0444); + +static bool __read_mostly dump_invalid_vmcs = 0; +module_param(dump_invalid_vmcs, bool, 0644); + +#define MSR_BITMAP_MODE_X2APIC 1 +#define MSR_BITMAP_MODE_X2APIC_APICV 2 + +#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL + +/* Guest_tsc -> host_tsc conversion requires 64-bit division. */ +static int __read_mostly cpu_preemption_timer_multi; +static bool __read_mostly enable_preemption_timer = 1; +#ifdef CONFIG_X86_64 +module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); +#endif + +extern bool __read_mostly allow_smaller_maxphyaddr; +module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); + +#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE +#define KVM_VM_CR0_ALWAYS_ON \ + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) + +#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE +#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) +#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) + +#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) + +#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \ + RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \ + RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ + RTIT_STATUS_BYTECNT)) + +/* + * List of MSRs that can be directly passed to the guest. + * In addition to these x2apic and PT MSRs are handled specially. + */ +static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { + MSR_IA32_SPEC_CTRL, + MSR_IA32_PRED_CMD, + MSR_IA32_TSC, +#ifdef CONFIG_X86_64 + MSR_FS_BASE, + MSR_GS_BASE, + MSR_KERNEL_GS_BASE, + MSR_IA32_XFD, + MSR_IA32_XFD_ERR, +#endif + MSR_IA32_SYSENTER_CS, + MSR_IA32_SYSENTER_ESP, + MSR_IA32_SYSENTER_EIP, + MSR_CORE_C1_RES, + MSR_CORE_C3_RESIDENCY, + MSR_CORE_C6_RESIDENCY, + MSR_CORE_C7_RESIDENCY, +}; + +/* + * These 2 parameters are used to config the controls for Pause-Loop Exiting: + * ple_gap: upper bound on the amount of time between two successive + * executions of PAUSE in a loop. Also indicate if ple enabled. + * According to test, this time is usually smaller than 128 cycles. + * ple_window: upper bound on the amount of time a guest is allowed to execute + * in a PAUSE loop. Tests indicate that most spinlocks are held for + * less than 2^12 cycles + * Time is measured based on a counter that runs at the same rate as the TSC, + * refer SDM volume 3b section 21.6.13 & 22.1.3. + */ +static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP; +module_param(ple_gap, uint, 0444); + +static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; +module_param(ple_window, uint, 0444); + +/* Default doubles per-vcpu window every exit. */ +static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW; +module_param(ple_window_grow, uint, 0444); + +/* Default resets per-vcpu window every exit to ple_window. */ +static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; +module_param(ple_window_shrink, uint, 0444); + +/* Default is to compute the maximum so we can never overflow. */ +static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; +module_param(ple_window_max, uint, 0444); + +/* Default is SYSTEM mode, 1 for host-guest mode */ +int __read_mostly pt_mode = PT_MODE_SYSTEM; +module_param(pt_mode, int, S_IRUGO); + +static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); +static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); +static DEFINE_MUTEX(vmx_l1d_flush_mutex); + +/* Storage for pre module init parameter parsing */ +static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; + +static const struct { + const char *option; + bool for_parse; +} vmentry_l1d_param[] = { + [VMENTER_L1D_FLUSH_AUTO] = {"auto", true}, + [VMENTER_L1D_FLUSH_NEVER] = {"never", true}, + [VMENTER_L1D_FLUSH_COND] = {"cond", true}, + [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true}, + [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, + [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, +}; + +#define L1D_CACHE_ORDER 4 +static void *vmx_l1d_flush_pages; + +/* Control for disabling CPU Fill buffer clear */ +static bool __read_mostly vmx_fb_clear_ctrl_available; + +static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) +{ + struct page *page; + unsigned int i; + + if (!boot_cpu_has_bug(X86_BUG_L1TF)) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; + return 0; + } + + if (!enable_ept) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; + return 0; + } + + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { + u64 msr; + + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); + if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; + return 0; + } + } + + /* If set to auto use the default l1tf mitigation method */ + if (l1tf == VMENTER_L1D_FLUSH_AUTO) { + switch (l1tf_mitigation) { + case L1TF_MITIGATION_OFF: + l1tf = VMENTER_L1D_FLUSH_NEVER; + break; + case L1TF_MITIGATION_FLUSH_NOWARN: + case L1TF_MITIGATION_FLUSH: + case L1TF_MITIGATION_FLUSH_NOSMT: + l1tf = VMENTER_L1D_FLUSH_COND; + break; + case L1TF_MITIGATION_FULL: + case L1TF_MITIGATION_FULL_FORCE: + l1tf = VMENTER_L1D_FLUSH_ALWAYS; + break; + } + } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { + l1tf = VMENTER_L1D_FLUSH_ALWAYS; + } + + if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && + !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { + /* + * This allocation for vmx_l1d_flush_pages is not tied to a VM + * lifetime and so should not be charged to a memcg. + */ + page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); + if (!page) + return -ENOMEM; + vmx_l1d_flush_pages = page_address(page); + + /* + * Initialize each page with a different pattern in + * order to protect against KSM in the nested + * virtualization case. + */ + for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { + memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, + PAGE_SIZE); + } + } + + l1tf_vmx_mitigation = l1tf; + + if (l1tf != VMENTER_L1D_FLUSH_NEVER) + static_branch_enable(&vmx_l1d_should_flush); + else + static_branch_disable(&vmx_l1d_should_flush); + + if (l1tf == VMENTER_L1D_FLUSH_COND) + static_branch_enable(&vmx_l1d_flush_cond); + else + static_branch_disable(&vmx_l1d_flush_cond); + return 0; +} + +static int vmentry_l1d_flush_parse(const char *s) +{ + unsigned int i; + + if (s) { + for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { + if (vmentry_l1d_param[i].for_parse && + sysfs_streq(s, vmentry_l1d_param[i].option)) + return i; + } + } + return -EINVAL; +} + +static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) +{ + int l1tf, ret; + + l1tf = vmentry_l1d_flush_parse(s); + if (l1tf < 0) + return l1tf; + + if (!boot_cpu_has(X86_BUG_L1TF)) + return 0; + + /* + * Has vmx_init() run already? If not then this is the pre init + * parameter parsing. In that case just store the value and let + * vmx_init() do the proper setup after enable_ept has been + * established. + */ + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) { + vmentry_l1d_flush_param = l1tf; + return 0; + } + + mutex_lock(&vmx_l1d_flush_mutex); + ret = vmx_setup_l1d_flush(l1tf); + mutex_unlock(&vmx_l1d_flush_mutex); + return ret; +} + +static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) +{ + if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) + return sprintf(s, "???\n"); + + return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); +} + +static void vmx_setup_fb_clear_ctrl(void) +{ + u64 msr; + + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) && + !boot_cpu_has_bug(X86_BUG_MDS) && + !boot_cpu_has_bug(X86_BUG_TAA)) { + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); + if (msr & ARCH_CAP_FB_CLEAR_CTRL) + vmx_fb_clear_ctrl_available = true; + } +} + +static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) +{ + u64 msr; + + if (!vmx->disable_fb_clear) + return; + + msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL); + msr |= FB_CLEAR_DIS; + native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); + /* Cache the MSR value to avoid reading it later */ + vmx->msr_ia32_mcu_opt_ctrl = msr; +} + +static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) +{ + if (!vmx->disable_fb_clear) + return; + + vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; + native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); +} + +static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) +{ + vmx->disable_fb_clear = vmx_fb_clear_ctrl_available; + + /* + * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS + * at VMEntry. Skip the MSR read/write when a guest has no use case to + * execute VERW. + */ + if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) || + ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) && + (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) && + (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) && + (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) && + (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO))) + vmx->disable_fb_clear = false; +} + +static const struct kernel_param_ops vmentry_l1d_flush_ops = { + .set = vmentry_l1d_flush_set, + .get = vmentry_l1d_flush_get, +}; +module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); + +static u32 vmx_segment_access_rights(struct kvm_segment *var); + +void vmx_vmexit(void); + +#define vmx_insn_failed(fmt...) \ +do { \ + WARN_ONCE(1, fmt); \ + pr_warn_ratelimited(fmt); \ +} while (0) + +void vmread_error(unsigned long field, bool fault) +{ + if (fault) + kvm_spurious_fault(); + else + vmx_insn_failed("kvm: vmread failed: field=%lx\n", field); +} + +noinline void vmwrite_error(unsigned long field, unsigned long value) +{ + vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%u\n", + field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); +} + +noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) +{ + vmx_insn_failed("kvm: vmclear failed: %p/%llx err=%u\n", + vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); +} + +noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) +{ + vmx_insn_failed("kvm: vmptrld failed: %p/%llx err=%u\n", + vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); +} + +noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) +{ + vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", + ext, vpid, gva); +} + +noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa) +{ + vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n", + ext, eptp, gpa); +} + +static DEFINE_PER_CPU(struct vmcs *, vmxarea); +DEFINE_PER_CPU(struct vmcs *, current_vmcs); +/* + * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed + * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. + */ +static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); + +static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); +static DEFINE_SPINLOCK(vmx_vpid_lock); + +struct vmcs_config vmcs_config; +struct vmx_capability vmx_capability; + +#define VMX_SEGMENT_FIELD(seg) \ + [VCPU_SREG_##seg] = { \ + .selector = GUEST_##seg##_SELECTOR, \ + .base = GUEST_##seg##_BASE, \ + .limit = GUEST_##seg##_LIMIT, \ + .ar_bytes = GUEST_##seg##_AR_BYTES, \ + } + +static const struct kvm_vmx_segment_field { + unsigned selector; + unsigned base; + unsigned limit; + unsigned ar_bytes; +} kvm_vmx_segment_fields[] = { + VMX_SEGMENT_FIELD(CS), + VMX_SEGMENT_FIELD(DS), + VMX_SEGMENT_FIELD(ES), + VMX_SEGMENT_FIELD(FS), + VMX_SEGMENT_FIELD(GS), + VMX_SEGMENT_FIELD(SS), + VMX_SEGMENT_FIELD(TR), + VMX_SEGMENT_FIELD(LDTR), +}; + +static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) +{ + vmx->segment_cache.bitmask = 0; +} + +static unsigned long host_idt_base; + +#if IS_ENABLED(CONFIG_HYPERV) +static bool __read_mostly enlightened_vmcs = true; +module_param(enlightened_vmcs, bool, 0444); + +static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) +{ + struct hv_enlightened_vmcs *evmcs; + struct hv_partition_assist_pg **p_hv_pa_pg = + &to_kvm_hv(vcpu->kvm)->hv_pa_pg; + /* + * Synthetic VM-Exit is not enabled in current code and so All + * evmcs in singe VM shares same assist page. + */ + if (!*p_hv_pa_pg) + *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT); + + if (!*p_hv_pa_pg) + return -ENOMEM; + + evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs; + + evmcs->partition_assist_page = + __pa(*p_hv_pa_pg); + evmcs->hv_vm_id = (unsigned long)vcpu->kvm; + evmcs->hv_enlightenments_control.nested_flush_hypercall = 1; + + return 0; +} + +static void hv_reset_evmcs(void) +{ + struct hv_vp_assist_page *vp_ap; + + if (!static_branch_unlikely(&enable_evmcs)) + return; + + /* + * KVM should enable eVMCS if and only if all CPUs have a VP assist + * page, and should reject CPU onlining if eVMCS is enabled the CPU + * doesn't have a VP assist page allocated. + */ + vp_ap = hv_get_vp_assist_page(smp_processor_id()); + if (WARN_ON_ONCE(!vp_ap)) + return; + + /* + * Reset everything to support using non-enlightened VMCS access later + * (e.g. when we reload the module with enlightened_vmcs=0) + */ + vp_ap->nested_control.features.directhypercall = 0; + vp_ap->current_nested_vmcs = 0; + vp_ap->enlighten_vmentry = 0; +} + +#else /* IS_ENABLED(CONFIG_HYPERV) */ +static void hv_reset_evmcs(void) {} +#endif /* IS_ENABLED(CONFIG_HYPERV) */ + +/* + * Comment's format: document - errata name - stepping - processor name. + * Refer from + * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp + */ +static u32 vmx_preemption_cpu_tfms[] = { +/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ +0x000206E6, +/* 323056.pdf - AAX65 - C2 - Xeon L3406 */ +/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ +/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ +0x00020652, +/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ +0x00020655, +/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ +/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ +/* + * 320767.pdf - AAP86 - B1 - + * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile + */ +0x000106E5, +/* 321333.pdf - AAM126 - C0 - Xeon 3500 */ +0x000106A0, +/* 321333.pdf - AAM126 - C1 - Xeon 3500 */ +0x000106A1, +/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ +0x000106A4, + /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ + /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ + /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ +0x000106A5, + /* Xeon E3-1220 V2 */ +0x000306A8, +}; + +static inline bool cpu_has_broken_vmx_preemption_timer(void) +{ + u32 eax = cpuid_eax(0x00000001), i; + + /* Clear the reserved bits */ + eax &= ~(0x3U << 14 | 0xfU << 28); + for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) + if (eax == vmx_preemption_cpu_tfms[i]) + return true; + + return false; +} + +static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) +{ + return flexpriority_enabled && lapic_in_kernel(vcpu); +} + +static int possible_passthrough_msr_slot(u32 msr) +{ + u32 i; + + for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) + if (vmx_possible_passthrough_msrs[i] == msr) + return i; + + return -ENOENT; +} + +static bool is_valid_passthrough_msr(u32 msr) +{ + bool r; + + switch (msr) { + case 0x800 ... 0x8ff: + /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */ + return true; + case MSR_IA32_RTIT_STATUS: + case MSR_IA32_RTIT_OUTPUT_BASE: + case MSR_IA32_RTIT_OUTPUT_MASK: + case MSR_IA32_RTIT_CR3_MATCH: + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: + /* PT MSRs. These are handled in pt_update_intercept_for_msr() */ + case MSR_LBR_SELECT: + case MSR_LBR_TOS: + case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31: + case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31: + case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31: + case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8: + case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8: + /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */ + return true; + } + + r = possible_passthrough_msr_slot(msr) != -ENOENT; + + WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr); + + return r; +} + +struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) +{ + int i; + + i = kvm_find_user_return_msr(msr); + if (i >= 0) + return &vmx->guest_uret_msrs[i]; + return NULL; +} + +static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, + struct vmx_uret_msr *msr, u64 data) +{ + unsigned int slot = msr - vmx->guest_uret_msrs; + int ret = 0; + + if (msr->load_into_hardware) { + preempt_disable(); + ret = kvm_set_user_return_msr(slot, data, msr->mask); + preempt_enable(); + } + if (!ret) + msr->data = data; + return ret; +} + +static void crash_vmclear_local_loaded_vmcss(void) +{ + int cpu = raw_smp_processor_id(); + struct loaded_vmcs *v; + + list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), + loaded_vmcss_on_cpu_link) + vmcs_clear(v->vmcs); +} + +static void __loaded_vmcs_clear(void *arg) +{ + struct loaded_vmcs *loaded_vmcs = arg; + int cpu = raw_smp_processor_id(); + + if (loaded_vmcs->cpu != cpu) + return; /* vcpu migration can race with cpu offline */ + if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) + per_cpu(current_vmcs, cpu) = NULL; + + vmcs_clear(loaded_vmcs->vmcs); + if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) + vmcs_clear(loaded_vmcs->shadow_vmcs); + + list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); + + /* + * Ensure all writes to loaded_vmcs, including deleting it from its + * current percpu list, complete before setting loaded_vmcs->cpu to + * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first + * and add loaded_vmcs to its percpu list before it's deleted from this + * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs(). + */ + smp_wmb(); + + loaded_vmcs->cpu = -1; + loaded_vmcs->launched = 0; +} + +void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) +{ + int cpu = loaded_vmcs->cpu; + + if (cpu != -1) + smp_call_function_single(cpu, + __loaded_vmcs_clear, loaded_vmcs, 1); +} + +static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, + unsigned field) +{ + bool ret; + u32 mask = 1 << (seg * SEG_FIELD_NR + field); + + if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) { + kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS); + vmx->segment_cache.bitmask = 0; + } + ret = vmx->segment_cache.bitmask & mask; + vmx->segment_cache.bitmask |= mask; + return ret; +} + +static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) +{ + u16 *p = &vmx->segment_cache.seg[seg].selector; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) + *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); + return *p; +} + +static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) +{ + ulong *p = &vmx->segment_cache.seg[seg].base; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) + *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); + return *p; +} + +static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) +{ + u32 *p = &vmx->segment_cache.seg[seg].limit; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) + *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); + return *p; +} + +static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) +{ + u32 *p = &vmx->segment_cache.seg[seg].ar; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) + *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); + return *p; +} + +void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) +{ + u32 eb; + + eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | + (1u << DB_VECTOR) | (1u << AC_VECTOR); + /* + * Guest access to VMware backdoor ports could legitimately + * trigger #GP because of TSS I/O permission bitmap. + * We intercept those #GP and allow access to them anyway + * as VMware does. + */ + if (enable_vmware_backdoor) + eb |= (1u << GP_VECTOR); + if ((vcpu->guest_debug & + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) + eb |= 1u << BP_VECTOR; + if (to_vmx(vcpu)->rmode.vm86_active) + eb = ~0; + if (!vmx_need_pf_intercept(vcpu)) + eb &= ~(1u << PF_VECTOR); + + /* When we are running a nested L2 guest and L1 specified for it a + * certain exception bitmap, we must trap the same exceptions and pass + * them to L1. When running L2, we will only handle the exceptions + * specified above if L1 did not want them. + */ + if (is_guest_mode(vcpu)) + eb |= get_vmcs12(vcpu)->exception_bitmap; + else { + int mask = 0, match = 0; + + if (enable_ept && (eb & (1u << PF_VECTOR))) { + /* + * If EPT is enabled, #PF is currently only intercepted + * if MAXPHYADDR is smaller on the guest than on the + * host. In that case we only care about present, + * non-reserved faults. For vmcs02, however, PFEC_MASK + * and PFEC_MATCH are set in prepare_vmcs02_rare. + */ + mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK; + match = PFERR_PRESENT_MASK; + } + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match); + } + + /* + * Disabling xfd interception indicates that dynamic xfeatures + * might be used in the guest. Always trap #NM in this case + * to save guest xfd_err timely. + */ + if (vcpu->arch.xfd_no_write_intercept) + eb |= (1u << NM_VECTOR); + + vmcs_write32(EXCEPTION_BITMAP, eb); +} + +/* + * Check if MSR is intercepted for currently loaded MSR bitmap. + */ +static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) +{ + if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)) + return true; + + return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr); +} + +unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) +{ + unsigned int flags = 0; + + if (vmx->loaded_vmcs->launched) + flags |= VMX_RUN_VMRESUME; + + /* + * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free + * to change it directly without causing a vmexit. In that case read + * it after vmexit and store it in vmx->spec_ctrl. + */ + if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))) + flags |= VMX_RUN_SAVE_SPEC_CTRL; + + return flags; +} + +static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, + unsigned long entry, unsigned long exit) +{ + vm_entry_controls_clearbit(vmx, entry); + vm_exit_controls_clearbit(vmx, exit); +} + +int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) +{ + unsigned int i; + + for (i = 0; i < m->nr; ++i) { + if (m->val[i].index == msr) + return i; + } + return -ENOENT; +} + +static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) +{ + int i; + struct msr_autoload *m = &vmx->msr_autoload; + + switch (msr) { + case MSR_EFER: + if (cpu_has_load_ia32_efer()) { + clear_atomic_switch_msr_special(vmx, + VM_ENTRY_LOAD_IA32_EFER, + VM_EXIT_LOAD_IA32_EFER); + return; + } + break; + case MSR_CORE_PERF_GLOBAL_CTRL: + if (cpu_has_load_perf_global_ctrl()) { + clear_atomic_switch_msr_special(vmx, + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); + return; + } + break; + } + i = vmx_find_loadstore_msr_slot(&m->guest, msr); + if (i < 0) + goto skip_guest; + --m->guest.nr; + m->guest.val[i] = m->guest.val[m->guest.nr]; + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); + +skip_guest: + i = vmx_find_loadstore_msr_slot(&m->host, msr); + if (i < 0) + return; + + --m->host.nr; + m->host.val[i] = m->host.val[m->host.nr]; + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); +} + +static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, + unsigned long entry, unsigned long exit, + unsigned long guest_val_vmcs, unsigned long host_val_vmcs, + u64 guest_val, u64 host_val) +{ + vmcs_write64(guest_val_vmcs, guest_val); + if (host_val_vmcs != HOST_IA32_EFER) + vmcs_write64(host_val_vmcs, host_val); + vm_entry_controls_setbit(vmx, entry); + vm_exit_controls_setbit(vmx, exit); +} + +static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, + u64 guest_val, u64 host_val, bool entry_only) +{ + int i, j = 0; + struct msr_autoload *m = &vmx->msr_autoload; + + switch (msr) { + case MSR_EFER: + if (cpu_has_load_ia32_efer()) { + add_atomic_switch_msr_special(vmx, + VM_ENTRY_LOAD_IA32_EFER, + VM_EXIT_LOAD_IA32_EFER, + GUEST_IA32_EFER, + HOST_IA32_EFER, + guest_val, host_val); + return; + } + break; + case MSR_CORE_PERF_GLOBAL_CTRL: + if (cpu_has_load_perf_global_ctrl()) { + add_atomic_switch_msr_special(vmx, + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, + GUEST_IA32_PERF_GLOBAL_CTRL, + HOST_IA32_PERF_GLOBAL_CTRL, + guest_val, host_val); + return; + } + break; + case MSR_IA32_PEBS_ENABLE: + /* PEBS needs a quiescent period after being disabled (to write + * a record). Disabling PEBS through VMX MSR swapping doesn't + * provide that period, so a CPU could write host's record into + * guest's memory. + */ + wrmsrl(MSR_IA32_PEBS_ENABLE, 0); + } + + i = vmx_find_loadstore_msr_slot(&m->guest, msr); + if (!entry_only) + j = vmx_find_loadstore_msr_slot(&m->host, msr); + + if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) || + (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) { + printk_once(KERN_WARNING "Not enough msr switch entries. " + "Can't add msr %x\n", msr); + return; + } + if (i < 0) { + i = m->guest.nr++; + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); + } + m->guest.val[i].index = msr; + m->guest.val[i].value = guest_val; + + if (entry_only) + return; + + if (j < 0) { + j = m->host.nr++; + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); + } + m->host.val[j].index = msr; + m->host.val[j].value = host_val; +} + +static bool update_transition_efer(struct vcpu_vmx *vmx) +{ + u64 guest_efer = vmx->vcpu.arch.efer; + u64 ignore_bits = 0; + int i; + + /* Shadow paging assumes NX to be available. */ + if (!enable_ept) + guest_efer |= EFER_NX; + + /* + * LMA and LME handled by hardware; SCE meaningless outside long mode. + */ + ignore_bits |= EFER_SCE; +#ifdef CONFIG_X86_64 + ignore_bits |= EFER_LMA | EFER_LME; + /* SCE is meaningful only in long mode on Intel */ + if (guest_efer & EFER_LMA) + ignore_bits &= ~(u64)EFER_SCE; +#endif + + /* + * On EPT, we can't emulate NX, so we must switch EFER atomically. + * On CPUs that support "load IA32_EFER", always switch EFER + * atomically, since it's faster than switching it manually. + */ + if (cpu_has_load_ia32_efer() || + (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { + if (!(guest_efer & EFER_LMA)) + guest_efer &= ~EFER_LME; + if (guest_efer != host_efer) + add_atomic_switch_msr(vmx, MSR_EFER, + guest_efer, host_efer, false); + else + clear_atomic_switch_msr(vmx, MSR_EFER); + return false; + } + + i = kvm_find_user_return_msr(MSR_EFER); + if (i < 0) + return false; + + clear_atomic_switch_msr(vmx, MSR_EFER); + + guest_efer &= ~ignore_bits; + guest_efer |= host_efer & ignore_bits; + + vmx->guest_uret_msrs[i].data = guest_efer; + vmx->guest_uret_msrs[i].mask = ~ignore_bits; + + return true; +} + +#ifdef CONFIG_X86_32 +/* + * On 32-bit kernels, VM exits still load the FS and GS bases from the + * VMCS rather than the segment table. KVM uses this helper to figure + * out the current bases to poke them into the VMCS before entry. + */ +static unsigned long segment_base(u16 selector) +{ + struct desc_struct *table; + unsigned long v; + + if (!(selector & ~SEGMENT_RPL_MASK)) + return 0; + + table = get_current_gdt_ro(); + + if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { + u16 ldt_selector = kvm_read_ldt(); + + if (!(ldt_selector & ~SEGMENT_RPL_MASK)) + return 0; + + table = (struct desc_struct *)segment_base(ldt_selector); + } + v = get_desc_base(&table[selector >> 3]); + return v; +} +#endif + +static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) +{ + return vmx_pt_mode_is_host_guest() && + !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); +} + +static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base) +{ + /* The base must be 128-byte aligned and a legal physical address. */ + return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128); +} + +static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) +{ + u32 i; + + wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status); + wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + for (i = 0; i < addr_range; i++) { + wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + } +} + +static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) +{ + u32 i; + + rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status); + rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + for (i = 0; i < addr_range; i++) { + rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + } +} + +static void pt_guest_enter(struct vcpu_vmx *vmx) +{ + if (vmx_pt_mode_is_system()) + return; + + /* + * GUEST_IA32_RTIT_CTL is already set in the VMCS. + * Save host state before VM entry. + */ + rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); + if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { + wrmsrl(MSR_IA32_RTIT_CTL, 0); + pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); + pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); + } +} + +static void pt_guest_exit(struct vcpu_vmx *vmx) +{ + if (vmx_pt_mode_is_system()) + return; + + if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { + pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); + pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); + } + + /* + * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest, + * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary. + */ + if (vmx->pt_desc.host.ctl) + wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); +} + +void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, + unsigned long fs_base, unsigned long gs_base) +{ + if (unlikely(fs_sel != host->fs_sel)) { + if (!(fs_sel & 7)) + vmcs_write16(HOST_FS_SELECTOR, fs_sel); + else + vmcs_write16(HOST_FS_SELECTOR, 0); + host->fs_sel = fs_sel; + } + if (unlikely(gs_sel != host->gs_sel)) { + if (!(gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, gs_sel); + else + vmcs_write16(HOST_GS_SELECTOR, 0); + host->gs_sel = gs_sel; + } + if (unlikely(fs_base != host->fs_base)) { + vmcs_writel(HOST_FS_BASE, fs_base); + host->fs_base = fs_base; + } + if (unlikely(gs_base != host->gs_base)) { + vmcs_writel(HOST_GS_BASE, gs_base); + host->gs_base = gs_base; + } +} + +void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs_host_state *host_state; +#ifdef CONFIG_X86_64 + int cpu = raw_smp_processor_id(); +#endif + unsigned long fs_base, gs_base; + u16 fs_sel, gs_sel; + int i; + + vmx->req_immediate_exit = false; + + /* + * Note that guest MSRs to be saved/restored can also be changed + * when guest state is loaded. This happens when guest transitions + * to/from long-mode by setting MSR_EFER.LMA. + */ + if (!vmx->guest_uret_msrs_loaded) { + vmx->guest_uret_msrs_loaded = true; + for (i = 0; i < kvm_nr_uret_msrs; ++i) { + if (!vmx->guest_uret_msrs[i].load_into_hardware) + continue; + + kvm_set_user_return_msr(i, + vmx->guest_uret_msrs[i].data, + vmx->guest_uret_msrs[i].mask); + } + } + + if (vmx->nested.need_vmcs12_to_shadow_sync) + nested_sync_vmcs12_to_shadow(vcpu); + + if (vmx->guest_state_loaded) + return; + + host_state = &vmx->loaded_vmcs->host_state; + + /* + * Set host fs and gs selectors. Unfortunately, 22.2.3 does not + * allow segment selectors with cpl > 0 or ti == 1. + */ + host_state->ldt_sel = kvm_read_ldt(); + +#ifdef CONFIG_X86_64 + savesegment(ds, host_state->ds_sel); + savesegment(es, host_state->es_sel); + + gs_base = cpu_kernelmode_gs_base(cpu); + if (likely(is_64bit_mm(current->mm))) { + current_save_fsgs(); + fs_sel = current->thread.fsindex; + gs_sel = current->thread.gsindex; + fs_base = current->thread.fsbase; + vmx->msr_host_kernel_gs_base = current->thread.gsbase; + } else { + savesegment(fs, fs_sel); + savesegment(gs, gs_sel); + fs_base = read_msr(MSR_FS_BASE); + vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); + } + + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); +#else + savesegment(fs, fs_sel); + savesegment(gs, gs_sel); + fs_base = segment_base(fs_sel); + gs_base = segment_base(gs_sel); +#endif + + vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); + vmx->guest_state_loaded = true; +} + +static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) +{ + struct vmcs_host_state *host_state; + + if (!vmx->guest_state_loaded) + return; + + host_state = &vmx->loaded_vmcs->host_state; + + ++vmx->vcpu.stat.host_state_reload; + +#ifdef CONFIG_X86_64 + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); +#endif + if (host_state->ldt_sel || (host_state->gs_sel & 7)) { + kvm_load_ldt(host_state->ldt_sel); +#ifdef CONFIG_X86_64 + load_gs_index(host_state->gs_sel); +#else + loadsegment(gs, host_state->gs_sel); +#endif + } + if (host_state->fs_sel & 7) + loadsegment(fs, host_state->fs_sel); +#ifdef CONFIG_X86_64 + if (unlikely(host_state->ds_sel | host_state->es_sel)) { + loadsegment(ds, host_state->ds_sel); + loadsegment(es, host_state->es_sel); + } +#endif + invalidate_tss_limit(); +#ifdef CONFIG_X86_64 + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); +#endif + load_fixmap_gdt(raw_smp_processor_id()); + vmx->guest_state_loaded = false; + vmx->guest_uret_msrs_loaded = false; +} + +#ifdef CONFIG_X86_64 +static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) +{ + preempt_disable(); + if (vmx->guest_state_loaded) + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + preempt_enable(); + return vmx->msr_guest_kernel_gs_base; +} + +static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) +{ + preempt_disable(); + if (vmx->guest_state_loaded) + wrmsrl(MSR_KERNEL_GS_BASE, data); + preempt_enable(); + vmx->msr_guest_kernel_gs_base = data; +} +#endif + +void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, + struct loaded_vmcs *buddy) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool already_loaded = vmx->loaded_vmcs->cpu == cpu; + struct vmcs *prev; + + if (!already_loaded) { + loaded_vmcs_clear(vmx->loaded_vmcs); + local_irq_disable(); + + /* + * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to + * this cpu's percpu list, otherwise it may not yet be deleted + * from its previous cpu's percpu list. Pairs with the + * smb_wmb() in __loaded_vmcs_clear(). + */ + smp_rmb(); + + list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, + &per_cpu(loaded_vmcss_on_cpu, cpu)); + local_irq_enable(); + } + + prev = per_cpu(current_vmcs, cpu); + if (prev != vmx->loaded_vmcs->vmcs) { + per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; + vmcs_load(vmx->loaded_vmcs->vmcs); + + /* + * No indirect branch prediction barrier needed when switching + * the active VMCS within a vCPU, unless IBRS is advertised to + * the vCPU. To minimize the number of IBPBs executed, KVM + * performs IBPB on nested VM-Exit (a single nested transition + * may switch the active VMCS multiple times). + */ + if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)) + indirect_branch_prediction_barrier(); + } + + if (!already_loaded) { + void *gdt = get_current_gdt_ro(); + + /* + * Flush all EPTP/VPID contexts, the new pCPU may have stale + * TLB entries from its previous association with the vCPU. + */ + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + + /* + * Linux uses per-cpu TSS and GDT, so set these when switching + * processors. See 22.2.4. + */ + vmcs_writel(HOST_TR_BASE, + (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); + vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ + + if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) { + /* 22.2.3 */ + vmcs_writel(HOST_IA32_SYSENTER_ESP, + (unsigned long)(cpu_entry_stack(cpu) + 1)); + } + + vmx->loaded_vmcs->cpu = cpu; + } +} + +/* + * Switches to specified vcpu, until a matching vcpu_put(), but assumes + * vcpu mutex is already taken. + */ +static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + vmx_vcpu_load_vmcs(vcpu, cpu, NULL); + + vmx_vcpu_pi_load(vcpu, cpu); + + vmx->host_debugctlmsr = get_debugctlmsr(); +} + +static void vmx_vcpu_put(struct kvm_vcpu *vcpu) +{ + vmx_vcpu_pi_put(vcpu); + + vmx_prepare_switch_to_host(to_vmx(vcpu)); +} + +bool vmx_emulation_required(struct kvm_vcpu *vcpu) +{ + return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu); +} + +unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long rflags, save_rflags; + + if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) { + kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); + rflags = vmcs_readl(GUEST_RFLAGS); + if (vmx->rmode.vm86_active) { + rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + save_rflags = vmx->rmode.save_rflags; + rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; + } + vmx->rflags = rflags; + } + return vmx->rflags; +} + +void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long old_rflags; + + /* + * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU + * is an unrestricted guest in order to mark L2 as needing emulation + * if L1 runs L2 as a restricted guest. + */ + if (is_unrestricted_guest(vcpu)) { + kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); + vmx->rflags = rflags; + vmcs_writel(GUEST_RFLAGS, rflags); + return; + } + + old_rflags = vmx_get_rflags(vcpu); + vmx->rflags = rflags; + if (vmx->rmode.vm86_active) { + vmx->rmode.save_rflags = rflags; + rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; + } + vmcs_writel(GUEST_RFLAGS, rflags); + + if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM) + vmx->emulation_required = vmx_emulation_required(vcpu); +} + +static bool vmx_get_if_flag(struct kvm_vcpu *vcpu) +{ + return vmx_get_rflags(vcpu) & X86_EFLAGS_IF; +} + +u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) +{ + u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + int ret = 0; + + if (interruptibility & GUEST_INTR_STATE_STI) + ret |= KVM_X86_SHADOW_INT_STI; + if (interruptibility & GUEST_INTR_STATE_MOV_SS) + ret |= KVM_X86_SHADOW_INT_MOV_SS; + + return ret; +} + +void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +{ + u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + u32 interruptibility = interruptibility_old; + + interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); + + if (mask & KVM_X86_SHADOW_INT_MOV_SS) + interruptibility |= GUEST_INTR_STATE_MOV_SS; + else if (mask & KVM_X86_SHADOW_INT_STI) + interruptibility |= GUEST_INTR_STATE_STI; + + if ((interruptibility != interruptibility_old)) + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); +} + +static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long value; + + /* + * Any MSR write that attempts to change bits marked reserved will + * case a #GP fault. + */ + if (data & vmx->pt_desc.ctl_bitmask) + return 1; + + /* + * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will + * result in a #GP unless the same write also clears TraceEn. + */ + if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) && + ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN)) + return 1; + + /* + * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit + * and FabricEn would cause #GP, if + * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0 + */ + if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) && + !(data & RTIT_CTL_FABRIC_EN) && + !intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_single_range_output)) + return 1; + + /* + * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that + * utilize encodings marked reserved will cause a #GP fault. + */ + value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods); + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) && + !test_bit((data & RTIT_CTL_MTC_RANGE) >> + RTIT_CTL_MTC_RANGE_OFFSET, &value)) + return 1; + value = intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_cycle_thresholds); + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && + !test_bit((data & RTIT_CTL_CYC_THRESH) >> + RTIT_CTL_CYC_THRESH_OFFSET, &value)) + return 1; + value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods); + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && + !test_bit((data & RTIT_CTL_PSB_FREQ) >> + RTIT_CTL_PSB_FREQ_OFFSET, &value)) + return 1; + + /* + * If ADDRx_CFG is reserved or the encodings is >2 will + * cause a #GP fault. + */ + value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; + if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; + if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; + if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2)) + return 1; + value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; + if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2)) + return 1; + + return 0; +} + +static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) +{ + /* + * Emulation of instructions in SGX enclaves is impossible as RIP does + * not point at the failing instruction, and even if it did, the code + * stream is inaccessible. Inject #UD instead of exiting to userspace + * so that guest userspace can't DoS the guest simply by triggering + * emulation (enclaves are CPL3 only). + */ + if (to_vmx(vcpu)->exit_reason.enclave_mode) { + kvm_queue_exception(vcpu, UD_VECTOR); + return false; + } + return true; +} + +static int skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason; + unsigned long rip, orig_rip; + u32 instr_len; + + /* + * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on + * undefined behavior: Intel's SDM doesn't mandate the VMCS field be + * set when EPT misconfig occurs. In practice, real hardware updates + * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors + * (namely Hyper-V) don't set it due to it being undefined behavior, + * i.e. we end up advancing IP with some random value. + */ + if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || + exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { + instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + + /* + * Emulating an enclave's instructions isn't supported as KVM + * cannot access the enclave's memory or its true RIP, e.g. the + * vmcs.GUEST_RIP points at the exit point of the enclave, not + * the RIP that actually triggered the VM-Exit. But, because + * most instructions that cause VM-Exit will #UD in an enclave, + * most instruction-based VM-Exits simply do not occur. + * + * There are a few exceptions, notably the debug instructions + * INT1ICEBRK and INT3, as they are allowed in debug enclaves + * and generate #DB/#BP as expected, which KVM might intercept. + * But again, the CPU does the dirty work and saves an instr + * length of zero so VMMs don't shoot themselves in the foot. + * WARN if KVM tries to skip a non-zero length instruction on + * a VM-Exit from an enclave. + */ + if (!instr_len) + goto rip_updated; + + WARN(exit_reason.enclave_mode, + "KVM: skipping instruction after SGX enclave VM-Exit"); + + orig_rip = kvm_rip_read(vcpu); + rip = orig_rip + instr_len; +#ifdef CONFIG_X86_64 + /* + * We need to mask out the high 32 bits of RIP if not in 64-bit + * mode, but just finding out that we are in 64-bit mode is + * quite expensive. Only do it if there was a carry. + */ + if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu)) + rip = (u32)rip; +#endif + kvm_rip_write(vcpu, rip); + } else { + if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) + return 0; + } + +rip_updated: + /* skipping an emulated instruction also counts */ + vmx_set_interrupt_shadow(vcpu, 0); + + return 1; +} + +/* + * Recognizes a pending MTF VM-exit and records the nested state for later + * delivery. + */ +static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!is_guest_mode(vcpu)) + return; + + /* + * Per the SDM, MTF takes priority over debug-trap exceptions besides + * TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps + * or ICEBP (in the emulator proper), and skipping of ICEBP after an + * intercepted #DB deliberately avoids single-step #DB and MTF updates + * as ICEBP is higher priority than both. As instruction emulation is + * completed at this point (i.e. KVM is at the instruction boundary), + * any #DB exception pending delivery must be a debug-trap of lower + * priority than MTF. Record the pending MTF state to be delivered in + * vmx_check_nested_events(). + */ + if (nested_cpu_has_mtf(vmcs12) && + (!vcpu->arch.exception.pending || + vcpu->arch.exception.vector == DB_VECTOR) && + (!vcpu->arch.exception_vmexit.pending || + vcpu->arch.exception_vmexit.vector == DB_VECTOR)) { + vmx->nested.mtf_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); + } else { + vmx->nested.mtf_pending = false; + } +} + +static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + vmx_update_emulated_instruction(vcpu); + return skip_emulated_instruction(vcpu); +} + +static void vmx_clear_hlt(struct kvm_vcpu *vcpu) +{ + /* + * Ensure that we clear the HLT state in the VMCS. We don't need to + * explicitly skip the instruction because if the HLT state is set, + * then the instruction is already executing and RIP has already been + * advanced. + */ + if (kvm_hlt_in_guest(vcpu->kvm) && + vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) + vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); +} + +static void vmx_inject_exception(struct kvm_vcpu *vcpu) +{ + struct kvm_queued_exception *ex = &vcpu->arch.exception; + u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; + struct vcpu_vmx *vmx = to_vmx(vcpu); + + kvm_deliver_exception_payload(vcpu, ex); + + if (ex->has_error_code) { + /* + * Despite the error code being architecturally defined as 32 + * bits, and the VMCS field being 32 bits, Intel CPUs and thus + * VMX don't actually supporting setting bits 31:16. Hardware + * will (should) never provide a bogus error code, but AMD CPUs + * do generate error codes with bits 31:16 set, and so KVM's + * ABI lets userspace shove in arbitrary 32-bit values. Drop + * the upper bits to avoid VM-Fail, losing information that + * does't really exist is preferable to killing the VM. + */ + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code); + intr_info |= INTR_INFO_DELIVER_CODE_MASK; + } + + if (vmx->rmode.vm86_active) { + int inc_eip = 0; + if (kvm_exception_is_soft(ex->vector)) + inc_eip = vcpu->arch.event_exit_inst_len; + kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip); + return; + } + + WARN_ON_ONCE(vmx->emulation_required); + + if (kvm_exception_is_soft(ex->vector)) { + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmx->vcpu.arch.event_exit_inst_len); + intr_info |= INTR_TYPE_SOFT_EXCEPTION; + } else + intr_info |= INTR_TYPE_HARD_EXCEPTION; + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); + + vmx_clear_hlt(vcpu); +} + +static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr, + bool load_into_hardware) +{ + struct vmx_uret_msr *uret_msr; + + uret_msr = vmx_find_uret_msr(vmx, msr); + if (!uret_msr) + return; + + uret_msr->load_into_hardware = load_into_hardware; +} + +/* + * Configuring user return MSRs to automatically save, load, and restore MSRs + * that need to be shoved into hardware when running the guest. Note, omitting + * an MSR here does _NOT_ mean it's not emulated, only that it will not be + * loaded into hardware when running the guest. + */ +static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx) +{ +#ifdef CONFIG_X86_64 + bool load_syscall_msrs; + + /* + * The SYSCALL MSRs are only needed on long mode guests, and only + * when EFER.SCE is set. + */ + load_syscall_msrs = is_long_mode(&vmx->vcpu) && + (vmx->vcpu.arch.efer & EFER_SCE); + + vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs); + vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs); + vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs); +#endif + vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx)); + + vmx_setup_uret_msr(vmx, MSR_TSC_AUX, + guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) || + guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID)); + + /* + * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new + * kernel and old userspace. If those guests run on a tsx=off host, do + * allow guests to use TSX_CTRL, but don't change the value in hardware + * so that TSX remains always disabled. + */ + vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM)); + + /* + * The set of MSRs to load may have changed, reload MSRs before the + * next VM-Enter. + */ + vmx->guest_uret_msrs_loaded = false; +} + +u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) + return vmcs12->tsc_offset; + + return 0; +} + +u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) + return vmcs12->tsc_multiplier; + + return kvm_caps.default_tsc_scaling_ratio; +} + +static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +{ + vmcs_write64(TSC_OFFSET, offset); +} + +static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) +{ + vmcs_write64(TSC_MULTIPLIER, multiplier); +} + +/* + * nested_vmx_allowed() checks whether a guest should be allowed to use VMX + * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for + * all guests if the "nested" module option is off, and can also be disabled + * for a single guest by disabling its VMX cpuid bit. + */ +bool nested_vmx_allowed(struct kvm_vcpu *vcpu) +{ + return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX); +} + +static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, + uint64_t val) +{ + uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits; + + return !(val & ~valid_bits); +} + +static int vmx_get_msr_feature(struct kvm_msr_entry *msr) +{ + switch (msr->index) { + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + if (!nested) + return 1; + return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); + case MSR_IA32_PERF_CAPABILITIES: + msr->data = kvm_caps.supported_perf_cap; + return 0; + default: + return KVM_MSR_RET_INVALID; + } +} + +/* + * Reads an msr value (of 'msr_info->index') into 'msr_info->data'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmx_uret_msr *msr; + u32 index; + + switch (msr_info->index) { +#ifdef CONFIG_X86_64 + case MSR_FS_BASE: + msr_info->data = vmcs_readl(GUEST_FS_BASE); + break; + case MSR_GS_BASE: + msr_info->data = vmcs_readl(GUEST_GS_BASE); + break; + case MSR_KERNEL_GS_BASE: + msr_info->data = vmx_read_guest_kernel_gs_base(vmx); + break; +#endif + case MSR_EFER: + return kvm_get_msr_common(vcpu, msr_info); + case MSR_IA32_TSX_CTRL: + if (!msr_info->host_initiated && + !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) + return 1; + goto find_uret_msr; + case MSR_IA32_UMWAIT_CONTROL: + if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) + return 1; + + msr_info->data = vmx->msr_ia32_umwait_control; + break; + case MSR_IA32_SPEC_CTRL: + if (!msr_info->host_initiated && + !guest_has_spec_ctrl_msr(vcpu)) + return 1; + + msr_info->data = to_vmx(vcpu)->spec_ctrl; + break; + case MSR_IA32_SYSENTER_CS: + msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); + break; + case MSR_IA32_SYSENTER_EIP: + msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); + break; + case MSR_IA32_SYSENTER_ESP: + msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); + break; + case MSR_IA32_BNDCFGS: + if (!kvm_mpx_supported() || + (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) + return 1; + msr_info->data = vmcs_read64(GUEST_BNDCFGS); + break; + case MSR_IA32_MCG_EXT_CTL: + if (!msr_info->host_initiated && + !(vmx->msr_ia32_feature_control & + FEAT_CTL_LMCE_ENABLED)) + return 1; + msr_info->data = vcpu->arch.mcg_ext_ctl; + break; + case MSR_IA32_FEAT_CTL: + msr_info->data = vmx->msr_ia32_feature_control; + break; + case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) + return 1; + msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash + [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; + break; + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + if (!nested_vmx_allowed(vcpu)) + return 1; + if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, + &msr_info->data)) + return 1; + /* + * Enlightened VMCS v1 doesn't have certain VMCS fields but + * instead of just ignoring the features, different Hyper-V + * versions are either trying to use them and fail or do some + * sanity checking and refuse to boot. Filter all unsupported + * features out. + */ + if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu)) + nested_evmcs_filter_control_msr(vcpu, msr_info->index, + &msr_info->data); + break; + case MSR_IA32_RTIT_CTL: + if (!vmx_pt_mode_is_host_guest()) + return 1; + msr_info->data = vmx->pt_desc.guest.ctl; + break; + case MSR_IA32_RTIT_STATUS: + if (!vmx_pt_mode_is_host_guest()) + return 1; + msr_info->data = vmx->pt_desc.guest.status; + break; + case MSR_IA32_RTIT_CR3_MATCH: + if (!vmx_pt_mode_is_host_guest() || + !intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_cr3_filtering)) + return 1; + msr_info->data = vmx->pt_desc.guest.cr3_match; + break; + case MSR_IA32_RTIT_OUTPUT_BASE: + if (!vmx_pt_mode_is_host_guest() || + (!intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_topa_output) && + !intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_single_range_output))) + return 1; + msr_info->data = vmx->pt_desc.guest.output_base; + break; + case MSR_IA32_RTIT_OUTPUT_MASK: + if (!vmx_pt_mode_is_host_guest() || + (!intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_topa_output) && + !intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_single_range_output))) + return 1; + msr_info->data = vmx->pt_desc.guest.output_mask; + break; + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: + index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; + if (!vmx_pt_mode_is_host_guest() || + (index >= 2 * vmx->pt_desc.num_address_ranges)) + return 1; + if (index % 2) + msr_info->data = vmx->pt_desc.guest.addr_b[index / 2]; + else + msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; + break; + case MSR_IA32_DEBUGCTLMSR: + msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); + break; + default: + find_uret_msr: + msr = vmx_find_uret_msr(vmx, msr_info->index); + if (msr) { + msr_info->data = msr->data; + break; + } + return kvm_get_msr_common(vcpu, msr_info); + } + + return 0; +} + +static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, + u64 data) +{ +#ifdef CONFIG_X86_64 + if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) + return (u32)data; +#endif + return (unsigned long)data; +} + +static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) +{ + u64 debugctl = 0; + + if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && + (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) + debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; + + if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) && + (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) + debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + + return debugctl; +} + +/* + * Writes msr value into the appropriate "register". + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmx_uret_msr *msr; + int ret = 0; + u32 msr_index = msr_info->index; + u64 data = msr_info->data; + u32 index; + + switch (msr_index) { + case MSR_EFER: + ret = kvm_set_msr_common(vcpu, msr_info); + break; +#ifdef CONFIG_X86_64 + case MSR_FS_BASE: + vmx_segment_cache_clear(vmx); + vmcs_writel(GUEST_FS_BASE, data); + break; + case MSR_GS_BASE: + vmx_segment_cache_clear(vmx); + vmcs_writel(GUEST_GS_BASE, data); + break; + case MSR_KERNEL_GS_BASE: + vmx_write_guest_kernel_gs_base(vmx, data); + break; + case MSR_IA32_XFD: + ret = kvm_set_msr_common(vcpu, msr_info); + /* + * Always intercepting WRMSR could incur non-negligible + * overhead given xfd might be changed frequently in + * guest context switch. Disable write interception + * upon the first write with a non-zero value (indicating + * potential usage on dynamic xfeatures). Also update + * exception bitmap to trap #NM for proper virtualization + * of guest xfd_err. + */ + if (!ret && data) { + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, + MSR_TYPE_RW); + vcpu->arch.xfd_no_write_intercept = true; + vmx_update_exception_bitmap(vcpu); + } + break; +#endif + case MSR_IA32_SYSENTER_CS: + if (is_guest_mode(vcpu)) + get_vmcs12(vcpu)->guest_sysenter_cs = data; + vmcs_write32(GUEST_SYSENTER_CS, data); + break; + case MSR_IA32_SYSENTER_EIP: + if (is_guest_mode(vcpu)) { + data = nested_vmx_truncate_sysenter_addr(vcpu, data); + get_vmcs12(vcpu)->guest_sysenter_eip = data; + } + vmcs_writel(GUEST_SYSENTER_EIP, data); + break; + case MSR_IA32_SYSENTER_ESP: + if (is_guest_mode(vcpu)) { + data = nested_vmx_truncate_sysenter_addr(vcpu, data); + get_vmcs12(vcpu)->guest_sysenter_esp = data; + } + vmcs_writel(GUEST_SYSENTER_ESP, data); + break; + case MSR_IA32_DEBUGCTLMSR: { + u64 invalid; + + invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); + if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { + if (report_ignored_msrs) + vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n", + __func__, data); + data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); + invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); + } + + if (invalid) + return 1; + + if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & + VM_EXIT_SAVE_DEBUG_CONTROLS) + get_vmcs12(vcpu)->guest_ia32_debugctl = data; + + vmcs_write64(GUEST_IA32_DEBUGCTL, data); + if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && + (data & DEBUGCTLMSR_LBR)) + intel_pmu_create_guest_lbr_event(vcpu); + return 0; + } + case MSR_IA32_BNDCFGS: + if (!kvm_mpx_supported() || + (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) + return 1; + if (is_noncanonical_address(data & PAGE_MASK, vcpu) || + (data & MSR_IA32_BNDCFGS_RSVD)) + return 1; + + if (is_guest_mode(vcpu) && + ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) || + (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS))) + get_vmcs12(vcpu)->guest_bndcfgs = data; + + vmcs_write64(GUEST_BNDCFGS, data); + break; + case MSR_IA32_UMWAIT_CONTROL: + if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) + return 1; + + /* The reserved bit 1 and non-32 bit [63:32] should be zero */ + if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32))) + return 1; + + vmx->msr_ia32_umwait_control = data; + break; + case MSR_IA32_SPEC_CTRL: + if (!msr_info->host_initiated && + !guest_has_spec_ctrl_msr(vcpu)) + return 1; + + if (kvm_spec_ctrl_test_value(data)) + return 1; + + vmx->spec_ctrl = data; + if (!data) + break; + + /* + * For non-nested: + * When it's written (to non-zero) for the first time, pass + * it through. + * + * For nested: + * The handling of the MSR bitmap for L2 guests is done in + * nested_vmx_prepare_msr_bitmap. We should not touch the + * vmcs02.msr_bitmap here since it gets completely overwritten + * in the merging. We update the vmcs01 here for L1 as well + * since it will end up touching the MSR anyway now. + */ + vmx_disable_intercept_for_msr(vcpu, + MSR_IA32_SPEC_CTRL, + MSR_TYPE_RW); + break; + case MSR_IA32_TSX_CTRL: + if (!msr_info->host_initiated && + !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) + return 1; + if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) + return 1; + goto find_uret_msr; + case MSR_IA32_PRED_CMD: + if (!msr_info->host_initiated && + !guest_has_pred_cmd_msr(vcpu)) + return 1; + + if (data & ~PRED_CMD_IBPB) + return 1; + if (!boot_cpu_has(X86_FEATURE_IBPB)) + return 1; + if (!data) + break; + + wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); + + /* + * For non-nested: + * When it's written (to non-zero) for the first time, pass + * it through. + * + * For nested: + * The handling of the MSR bitmap for L2 guests is done in + * nested_vmx_prepare_msr_bitmap. We should not touch the + * vmcs02.msr_bitmap here since it gets completely overwritten + * in the merging. + */ + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W); + break; + case MSR_IA32_CR_PAT: + if (!kvm_pat_valid(data)) + return 1; + + if (is_guest_mode(vcpu) && + get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) + get_vmcs12(vcpu)->guest_ia32_pat = data; + + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + vmcs_write64(GUEST_IA32_PAT, data); + vcpu->arch.pat = data; + break; + } + ret = kvm_set_msr_common(vcpu, msr_info); + break; + case MSR_IA32_MCG_EXT_CTL: + if ((!msr_info->host_initiated && + !(to_vmx(vcpu)->msr_ia32_feature_control & + FEAT_CTL_LMCE_ENABLED)) || + (data & ~MCG_EXT_CTL_LMCE_EN)) + return 1; + vcpu->arch.mcg_ext_ctl = data; + break; + case MSR_IA32_FEAT_CTL: + if (!vmx_feature_control_msr_valid(vcpu, data) || + (to_vmx(vcpu)->msr_ia32_feature_control & + FEAT_CTL_LOCKED && !msr_info->host_initiated)) + return 1; + vmx->msr_ia32_feature_control = data; + if (msr_info->host_initiated && data == 0) + vmx_leave_nested(vcpu); + + /* SGX may be enabled/disabled by guest's firmware */ + vmx_write_encls_bitmap(vcpu, NULL); + break; + case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: + /* + * On real hardware, the LE hash MSRs are writable before + * the firmware sets bit 0 in MSR 0x7a ("activating" SGX), + * at which point SGX related bits in IA32_FEATURE_CONTROL + * become writable. + * + * KVM does not emulate SGX activation for simplicity, so + * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL + * is unlocked. This is technically not architectural + * behavior, but it's close enough. + */ + if (!msr_info->host_initiated && + (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) || + ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) && + !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED)))) + return 1; + vmx->msr_ia32_sgxlepubkeyhash + [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data; + break; + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + if (!msr_info->host_initiated) + return 1; /* they are read-only */ + if (!nested_vmx_allowed(vcpu)) + return 1; + return vmx_set_vmx_msr(vcpu, msr_index, data); + case MSR_IA32_RTIT_CTL: + if (!vmx_pt_mode_is_host_guest() || + vmx_rtit_ctl_check(vcpu, data) || + vmx->nested.vmxon) + return 1; + vmcs_write64(GUEST_IA32_RTIT_CTL, data); + vmx->pt_desc.guest.ctl = data; + pt_update_intercept_for_msr(vcpu); + break; + case MSR_IA32_RTIT_STATUS: + if (!pt_can_write_msr(vmx)) + return 1; + if (data & MSR_IA32_RTIT_STATUS_MASK) + return 1; + vmx->pt_desc.guest.status = data; + break; + case MSR_IA32_RTIT_CR3_MATCH: + if (!pt_can_write_msr(vmx)) + return 1; + if (!intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_cr3_filtering)) + return 1; + vmx->pt_desc.guest.cr3_match = data; + break; + case MSR_IA32_RTIT_OUTPUT_BASE: + if (!pt_can_write_msr(vmx)) + return 1; + if (!intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_topa_output) && + !intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_single_range_output)) + return 1; + if (!pt_output_base_valid(vcpu, data)) + return 1; + vmx->pt_desc.guest.output_base = data; + break; + case MSR_IA32_RTIT_OUTPUT_MASK: + if (!pt_can_write_msr(vmx)) + return 1; + if (!intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_topa_output) && + !intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_single_range_output)) + return 1; + vmx->pt_desc.guest.output_mask = data; + break; + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: + if (!pt_can_write_msr(vmx)) + return 1; + index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; + if (index >= 2 * vmx->pt_desc.num_address_ranges) + return 1; + if (is_noncanonical_address(data, vcpu)) + return 1; + if (index % 2) + vmx->pt_desc.guest.addr_b[index / 2] = data; + else + vmx->pt_desc.guest.addr_a[index / 2] = data; + break; + case MSR_IA32_PERF_CAPABILITIES: + if (data && !vcpu_to_pmu(vcpu)->version) + return 1; + if (data & PMU_CAP_LBR_FMT) { + if ((data & PMU_CAP_LBR_FMT) != + (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT)) + return 1; + if (!cpuid_model_is_consistent(vcpu)) + return 1; + } + if (data & PERF_CAP_PEBS_FORMAT) { + if ((data & PERF_CAP_PEBS_MASK) != + (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK)) + return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_DS)) + return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64)) + return 1; + if (!cpuid_model_is_consistent(vcpu)) + return 1; + } + ret = kvm_set_msr_common(vcpu, msr_info); + break; + + default: + find_uret_msr: + msr = vmx_find_uret_msr(vmx, msr_index); + if (msr) + ret = vmx_set_guest_uret_msr(vmx, msr, data); + else + ret = kvm_set_msr_common(vcpu, msr_info); + } + + /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */ + if (msr_index == MSR_IA32_ARCH_CAPABILITIES) + vmx_update_fb_clear_dis(vcpu, vmx); + + return ret; +} + +static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) +{ + unsigned long guest_owned_bits; + + kvm_register_mark_available(vcpu, reg); + + switch (reg) { + case VCPU_REGS_RSP: + vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); + break; + case VCPU_REGS_RIP: + vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); + break; + case VCPU_EXREG_PDPTR: + if (enable_ept) + ept_save_pdptrs(vcpu); + break; + case VCPU_EXREG_CR0: + guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; + + vcpu->arch.cr0 &= ~guest_owned_bits; + vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; + break; + case VCPU_EXREG_CR3: + /* + * When intercepting CR3 loads, e.g. for shadowing paging, KVM's + * CR3 is loaded into hardware, not the guest's CR3. + */ + if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING)) + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + break; + case VCPU_EXREG_CR4: + guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; + + vcpu->arch.cr4 &= ~guest_owned_bits; + vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits; + break; + default: + KVM_BUG_ON(1, vcpu->kvm); + break; + } +} + +static __init int cpu_has_kvm_support(void) +{ + return cpu_has_vmx(); +} + +static __init int vmx_disabled_by_bios(void) +{ + return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || + !boot_cpu_has(X86_FEATURE_VMX); +} + +static int kvm_cpu_vmxon(u64 vmxon_pointer) +{ + u64 msr; + + cr4_set_bits(X86_CR4_VMXE); + + asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" + _ASM_EXTABLE(1b, %l[fault]) + : : [vmxon_pointer] "m"(vmxon_pointer) + : : fault); + return 0; + +fault: + WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", + rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); + cr4_clear_bits(X86_CR4_VMXE); + + return -EFAULT; +} + +static int vmx_hardware_enable(void) +{ + int cpu = raw_smp_processor_id(); + u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); + int r; + + if (cr4_read_shadow() & X86_CR4_VMXE) + return -EBUSY; + + /* + * This can happen if we hot-added a CPU but failed to allocate + * VP assist page for it. + */ + if (static_branch_unlikely(&enable_evmcs) && + !hv_get_vp_assist_page(cpu)) + return -EFAULT; + + intel_pt_handle_vmx(1); + + r = kvm_cpu_vmxon(phys_addr); + if (r) { + intel_pt_handle_vmx(0); + return r; + } + + if (enable_ept) + ept_sync_global(); + + return 0; +} + +static void vmclear_local_loaded_vmcss(void) +{ + int cpu = raw_smp_processor_id(); + struct loaded_vmcs *v, *n; + + list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), + loaded_vmcss_on_cpu_link) + __loaded_vmcs_clear(v); +} + +static void vmx_hardware_disable(void) +{ + vmclear_local_loaded_vmcss(); + + if (cpu_vmxoff()) + kvm_spurious_fault(); + + hv_reset_evmcs(); + + intel_pt_handle_vmx(0); +} + +/* + * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID + * directly instead of going through cpu_has(), to ensure KVM is trapping + * ENCLS whenever it's supported in hardware. It does not matter whether + * the host OS supports or has enabled SGX. + */ +static bool cpu_has_sgx(void) +{ + return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0)); +} + +/* + * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they + * can't be used due to errata where VM Exit may incorrectly clear + * IA32_PERF_GLOBAL_CTRL[34:32]. Work around the errata by using the + * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. + */ +static bool cpu_has_perf_global_ctrl_bug(void) +{ + if (boot_cpu_data.x86 == 0x6) { + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_NEHALEM_EP: /* AAK155 */ + case INTEL_FAM6_NEHALEM: /* AAP115 */ + case INTEL_FAM6_WESTMERE: /* AAT100 */ + case INTEL_FAM6_WESTMERE_EP: /* BC86,AAY89,BD102 */ + case INTEL_FAM6_NEHALEM_EX: /* BA97 */ + return true; + default: + break; + } + } + + return false; +} + +static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, + u32 msr, u32 *result) +{ + u32 vmx_msr_low, vmx_msr_high; + u32 ctl = ctl_min | ctl_opt; + + rdmsr(msr, vmx_msr_low, vmx_msr_high); + + ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ + ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ + + /* Ensure minimum (required) set of control bits are supported. */ + if (ctl_min & ~ctl) + return -EIO; + + *result = ctl; + return 0; +} + +static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) +{ + u64 allowed; + + rdmsrl(msr, allowed); + + return ctl_opt & allowed; +} + +static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, + struct vmx_capability *vmx_cap) +{ + u32 vmx_msr_low, vmx_msr_high; + u32 _pin_based_exec_control = 0; + u32 _cpu_based_exec_control = 0; + u32 _cpu_based_2nd_exec_control = 0; + u64 _cpu_based_3rd_exec_control = 0; + u32 _vmexit_control = 0; + u32 _vmentry_control = 0; + u64 misc_msr; + int i; + + /* + * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. + * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always + * intercepts writes to PAT and EFER, i.e. never enables those controls. + */ + struct { + u32 entry_control; + u32 exit_control; + } const vmcs_entry_exit_pairs[] = { + { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL }, + { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT }, + { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, + { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, + { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, + }; + + memset(vmcs_conf, 0, sizeof(*vmcs_conf)); + + if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL, + KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL, + MSR_IA32_VMX_PROCBASED_CTLS, + &_cpu_based_exec_control)) + return -EIO; + if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { + if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL, + KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL, + MSR_IA32_VMX_PROCBASED_CTLS2, + &_cpu_based_2nd_exec_control)) + return -EIO; + } +#ifndef CONFIG_X86_64 + if (!(_cpu_based_2nd_exec_control & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; +#endif + + if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) + _cpu_based_2nd_exec_control &= ~( + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + + rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, + &vmx_cap->ept, &vmx_cap->vpid); + + if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) && + vmx_cap->ept) { + pr_warn_once("EPT CAP should not exist if not support " + "1-setting enable EPT VM-execution control\n"); + + if (error_on_inconsistent_vmcs_config) + return -EIO; + + vmx_cap->ept = 0; + } + if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && + vmx_cap->vpid) { + pr_warn_once("VPID CAP should not exist if not support " + "1-setting enable VPID VM-execution control\n"); + + if (error_on_inconsistent_vmcs_config) + return -EIO; + + vmx_cap->vpid = 0; + } + + if (!cpu_has_sgx()) + _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING; + + if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) + _cpu_based_3rd_exec_control = + adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL, + MSR_IA32_VMX_PROCBASED_CTLS3); + + if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS, + KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS, + MSR_IA32_VMX_EXIT_CTLS, + &_vmexit_control)) + return -EIO; + + if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL, + KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL, + MSR_IA32_VMX_PINBASED_CTLS, + &_pin_based_exec_control)) + return -EIO; + + if (cpu_has_broken_vmx_preemption_timer()) + _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + if (!(_cpu_based_2nd_exec_control & + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) + _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; + + if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS, + KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS, + MSR_IA32_VMX_ENTRY_CTLS, + &_vmentry_control)) + return -EIO; + + for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) { + u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control; + u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control; + + if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl)) + continue; + + pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", + _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); + + if (error_on_inconsistent_vmcs_config) + return -EIO; + + _vmentry_control &= ~n_ctrl; + _vmexit_control &= ~x_ctrl; + } + + rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); + + /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ + if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) + return -EIO; + +#ifdef CONFIG_X86_64 + /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ + if (vmx_msr_high & (1u<<16)) + return -EIO; +#endif + + /* Require Write-Back (WB) memory type for VMCS accesses. */ + if (((vmx_msr_high >> 18) & 15) != 6) + return -EIO; + + rdmsrl(MSR_IA32_VMX_MISC, misc_msr); + + vmcs_conf->size = vmx_msr_high & 0x1fff; + vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; + + vmcs_conf->revision_id = vmx_msr_low; + + vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; + vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; + vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; + vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; + vmcs_conf->vmexit_ctrl = _vmexit_control; + vmcs_conf->vmentry_ctrl = _vmentry_control; + vmcs_conf->misc = misc_msr; + + return 0; +} + +struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) +{ + int node = cpu_to_node(cpu); + struct page *pages; + struct vmcs *vmcs; + + pages = __alloc_pages_node(node, flags, 0); + if (!pages) + return NULL; + vmcs = page_address(pages); + memset(vmcs, 0, vmcs_config.size); + + /* KVM supports Enlightened VMCS v1 only */ + if (static_branch_unlikely(&enable_evmcs)) + vmcs->hdr.revision_id = KVM_EVMCS_VERSION; + else + vmcs->hdr.revision_id = vmcs_config.revision_id; + + if (shadow) + vmcs->hdr.shadow_vmcs = 1; + return vmcs; +} + +void free_vmcs(struct vmcs *vmcs) +{ + free_page((unsigned long)vmcs); +} + +/* + * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded + */ +void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) +{ + if (!loaded_vmcs->vmcs) + return; + loaded_vmcs_clear(loaded_vmcs); + free_vmcs(loaded_vmcs->vmcs); + loaded_vmcs->vmcs = NULL; + if (loaded_vmcs->msr_bitmap) + free_page((unsigned long)loaded_vmcs->msr_bitmap); + WARN_ON(loaded_vmcs->shadow_vmcs != NULL); +} + +int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) +{ + loaded_vmcs->vmcs = alloc_vmcs(false); + if (!loaded_vmcs->vmcs) + return -ENOMEM; + + vmcs_clear(loaded_vmcs->vmcs); + + loaded_vmcs->shadow_vmcs = NULL; + loaded_vmcs->hv_timer_soft_disabled = false; + loaded_vmcs->cpu = -1; + loaded_vmcs->launched = 0; + + if (cpu_has_vmx_msr_bitmap()) { + loaded_vmcs->msr_bitmap = (unsigned long *) + __get_free_page(GFP_KERNEL_ACCOUNT); + if (!loaded_vmcs->msr_bitmap) + goto out_vmcs; + memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); + } + + memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); + memset(&loaded_vmcs->controls_shadow, 0, + sizeof(struct vmcs_controls_shadow)); + + return 0; + +out_vmcs: + free_loaded_vmcs(loaded_vmcs); + return -ENOMEM; +} + +static void free_kvm_area(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + free_vmcs(per_cpu(vmxarea, cpu)); + per_cpu(vmxarea, cpu) = NULL; + } +} + +static __init int alloc_kvm_area(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct vmcs *vmcs; + + vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL); + if (!vmcs) { + free_kvm_area(); + return -ENOMEM; + } + + /* + * When eVMCS is enabled, alloc_vmcs_cpu() sets + * vmcs->revision_id to KVM_EVMCS_VERSION instead of + * revision_id reported by MSR_IA32_VMX_BASIC. + * + * However, even though not explicitly documented by + * TLFS, VMXArea passed as VMXON argument should + * still be marked with revision_id reported by + * physical CPU. + */ + if (static_branch_unlikely(&enable_evmcs)) + vmcs->hdr.revision_id = vmcs_config.revision_id; + + per_cpu(vmxarea, cpu) = vmcs; + } + return 0; +} + +static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, + struct kvm_segment *save) +{ + if (!emulate_invalid_guest_state) { + /* + * CS and SS RPL should be equal during guest entry according + * to VMX spec, but in reality it is not always so. Since vcpu + * is in the middle of the transition from real mode to + * protected mode it is safe to assume that RPL 0 is a good + * default value. + */ + if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) + save->selector &= ~SEGMENT_RPL_MASK; + save->dpl = save->selector & SEGMENT_RPL_MASK; + save->s = 1; + } + __vmx_set_segment(vcpu, save, seg); +} + +static void enter_pmode(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * Update real mode segment cache. It may be not up-to-date if segment + * register was written while vcpu was in a guest mode. + */ + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); + + vmx->rmode.vm86_active = 0; + + __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); + + flags = vmcs_readl(GUEST_RFLAGS); + flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; + vmcs_writel(GUEST_RFLAGS, flags); + + vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | + (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); + + vmx_update_exception_bitmap(vcpu); + + fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); + fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); + fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); + fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); + fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); + fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); +} + +static void fix_rmode_seg(int seg, struct kvm_segment *save) +{ + const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + struct kvm_segment var = *save; + + var.dpl = 0x3; + if (seg == VCPU_SREG_CS) + var.type = 0x3; + + if (!emulate_invalid_guest_state) { + var.selector = var.base >> 4; + var.base = var.base & 0xffff0; + var.limit = 0xffff; + var.g = 0; + var.db = 0; + var.present = 1; + var.s = 1; + var.l = 0; + var.unusable = 0; + var.type = 0x3; + var.avl = 0; + if (save->base & 0xf) + printk_once(KERN_WARNING "kvm: segment base is not " + "paragraph aligned when entering " + "protected mode (seg=%d)", seg); + } + + vmcs_write16(sf->selector, var.selector); + vmcs_writel(sf->base, var.base); + vmcs_write32(sf->limit, var.limit); + vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); +} + +static void enter_rmode(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); + + /* + * KVM should never use VM86 to virtualize Real Mode when L2 is active, + * as using VM86 is unnecessary if unrestricted guest is enabled, and + * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0 + * should VM-Fail and KVM should reject userspace attempts to stuff + * CR0.PG=0 when L2 is active. + */ + WARN_ON_ONCE(is_guest_mode(vcpu)); + + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); + + vmx->rmode.vm86_active = 1; + + /* + * Very old userspace does not call KVM_SET_TSS_ADDR before entering + * vcpu. Warn the user that an update is overdue. + */ + if (!kvm_vmx->tss_addr) + printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " + "called before entering vcpu\n"); + + vmx_segment_cache_clear(vmx); + + vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); + vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); + vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); + + flags = vmcs_readl(GUEST_RFLAGS); + vmx->rmode.save_rflags = flags; + + flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; + + vmcs_writel(GUEST_RFLAGS, flags); + vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); + vmx_update_exception_bitmap(vcpu); + + fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); + fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); + fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); + fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); + fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); + fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); +} + +int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* Nothing to do if hardware doesn't support EFER. */ + if (!vmx_find_uret_msr(vmx, MSR_EFER)) + return 0; + + vcpu->arch.efer = efer; +#ifdef CONFIG_X86_64 + if (efer & EFER_LMA) + vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE); + else + vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE); +#else + if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm)) + return 1; +#endif + + vmx_setup_uret_msrs(vmx); + return 0; +} + +#ifdef CONFIG_X86_64 + +static void enter_lmode(struct kvm_vcpu *vcpu) +{ + u32 guest_tr_ar; + + vmx_segment_cache_clear(to_vmx(vcpu)); + + guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); + if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { + pr_debug_ratelimited("%s: tss fixup for long mode. \n", + __func__); + vmcs_write32(GUEST_TR_AR_BYTES, + (guest_tr_ar & ~VMX_AR_TYPE_MASK) + | VMX_AR_TYPE_BUSY_64_TSS); + } + vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); +} + +static void exit_lmode(struct kvm_vcpu *vcpu) +{ + vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); +} + +#endif + +static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * INVEPT must be issued when EPT is enabled, irrespective of VPID, as + * the CPU is not required to invalidate guest-physical mappings on + * VM-Entry, even if VPID is disabled. Guest-physical mappings are + * associated with the root EPT structure and not any particular VPID + * (INVVPID also isn't required to invalidate guest-physical mappings). + */ + if (enable_ept) { + ept_sync_global(); + } else if (enable_vpid) { + if (cpu_has_vmx_invvpid_global()) { + vpid_sync_vcpu_global(); + } else { + vpid_sync_vcpu_single(vmx->vpid); + vpid_sync_vcpu_single(vmx->nested.vpid02); + } + } +} + +static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu)) + return nested_get_vpid02(vcpu); + return to_vmx(vcpu)->vpid; +} + +static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *mmu = vcpu->arch.mmu; + u64 root_hpa = mmu->root.hpa; + + /* No flush required if the current context is invalid. */ + if (!VALID_PAGE(root_hpa)) + return; + + if (enable_ept) + ept_sync_context(construct_eptp(vcpu, root_hpa, + mmu->root_role.level)); + else + vpid_sync_context(vmx_get_current_vpid(vcpu)); +} + +static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) +{ + /* + * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in + * vmx_flush_tlb_guest() for an explanation of why this is ok. + */ + vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr); +} + +static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) +{ + /* + * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a + * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are + * required to flush GVA->{G,H}PA mappings from the TLB if vpid is + * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), + * i.e. no explicit INVVPID is necessary. + */ + vpid_sync_context(vmx_get_current_vpid(vcpu)); +} + +void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + + if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR)) + return; + + if (is_pae_paging(vcpu)) { + vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); + vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); + vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); + vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); + } +} + +void ept_save_pdptrs(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + + if (WARN_ON_ONCE(!is_pae_paging(vcpu))) + return; + + mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); + mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); + mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); + mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); + + kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR); +} + +#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ + CPU_BASED_CR3_STORE_EXITING) + +static bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (is_guest_mode(vcpu)) + return nested_guest_cr0_valid(vcpu, cr0); + + if (to_vmx(vcpu)->nested.vmxon) + return nested_host_cr0_valid(vcpu, cr0); + + return true; +} + +void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long hw_cr0, old_cr0_pg; + u32 tmp; + + old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG); + + hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); + if (enable_unrestricted_guest) + hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; + else { + hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; + if (!enable_ept) + hw_cr0 |= X86_CR0_WP; + + if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) + enter_pmode(vcpu); + + if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) + enter_rmode(vcpu); + } + + vmcs_writel(CR0_READ_SHADOW, cr0); + vmcs_writel(GUEST_CR0, hw_cr0); + vcpu->arch.cr0 = cr0; + kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); + +#ifdef CONFIG_X86_64 + if (vcpu->arch.efer & EFER_LME) { + if (!old_cr0_pg && (cr0 & X86_CR0_PG)) + enter_lmode(vcpu); + else if (old_cr0_pg && !(cr0 & X86_CR0_PG)) + exit_lmode(vcpu); + } +#endif + + if (enable_ept && !enable_unrestricted_guest) { + /* + * Ensure KVM has an up-to-date snapshot of the guest's CR3. If + * the below code _enables_ CR3 exiting, vmx_cache_reg() will + * (correctly) stop reading vmcs.GUEST_CR3 because it thinks + * KVM's CR3 is installed. + */ + if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) + vmx_cache_reg(vcpu, VCPU_EXREG_CR3); + + /* + * When running with EPT but not unrestricted guest, KVM must + * intercept CR3 accesses when paging is _disabled_. This is + * necessary because restricted guests can't actually run with + * paging disabled, and so KVM stuffs its own CR3 in order to + * run the guest when identity mapped page tables. + * + * Do _NOT_ check the old CR0.PG, e.g. to optimize away the + * update, it may be stale with respect to CR3 interception, + * e.g. after nested VM-Enter. + * + * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or + * stores to forward them to L1, even if KVM does not need to + * intercept them to preserve its identity mapped page tables. + */ + if (!(cr0 & X86_CR0_PG)) { + exec_controls_setbit(vmx, CR3_EXITING_BITS); + } else if (!is_guest_mode(vcpu)) { + exec_controls_clearbit(vmx, CR3_EXITING_BITS); + } else { + tmp = exec_controls_get(vmx); + tmp &= ~CR3_EXITING_BITS; + tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS; + exec_controls_set(vmx, tmp); + } + + /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */ + if ((old_cr0_pg ^ cr0) & X86_CR0_PG) + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); + + /* + * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but + * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG. + */ + if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG)) + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); + } + + /* depends on vcpu->arch.cr0 to be set to a new value */ + vmx->emulation_required = vmx_emulation_required(vcpu); +} + +static int vmx_get_max_tdp_level(void) +{ + if (cpu_has_vmx_ept_5levels()) + return 5; + return 4; +} + +u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) +{ + u64 eptp = VMX_EPTP_MT_WB; + + eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; + + if (enable_ept_ad_bits && + (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) + eptp |= VMX_EPTP_AD_ENABLE_BIT; + eptp |= root_hpa; + + return eptp; +} + +static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, + int root_level) +{ + struct kvm *kvm = vcpu->kvm; + bool update_guest_cr3 = true; + unsigned long guest_cr3; + u64 eptp; + + if (enable_ept) { + eptp = construct_eptp(vcpu, root_hpa, root_level); + vmcs_write64(EPT_POINTER, eptp); + + hv_track_root_tdp(vcpu, root_hpa); + + if (!enable_unrestricted_guest && !is_paging(vcpu)) + guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; + else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3)) + guest_cr3 = vcpu->arch.cr3; + else /* vmcs.GUEST_CR3 is already up-to-date. */ + update_guest_cr3 = false; + vmx_ept_load_pdptrs(vcpu); + } else { + guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu); + } + + if (update_guest_cr3) + vmcs_writel(GUEST_CR3, guest_cr3); +} + + +static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + /* + * We operate under the default treatment of SMM, so VMX cannot be + * enabled under SMM. Note, whether or not VMXE is allowed at all, + * i.e. is a reserved bit, is handled by common x86 code. + */ + if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu)) + return false; + + if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) + return false; + + return true; +} + +void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + unsigned long old_cr4 = vcpu->arch.cr4; + struct vcpu_vmx *vmx = to_vmx(vcpu); + /* + * Pass through host's Machine Check Enable value to hw_cr4, which + * is in force while we are in guest mode. Do not let guests control + * this bit, even if host CR4.MCE == 0. + */ + unsigned long hw_cr4; + + hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); + if (enable_unrestricted_guest) + hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; + else if (vmx->rmode.vm86_active) + hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; + else + hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; + + if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) { + if (cr4 & X86_CR4_UMIP) { + secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC); + hw_cr4 &= ~X86_CR4_UMIP; + } else if (!is_guest_mode(vcpu) || + !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) { + secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC); + } + } + + vcpu->arch.cr4 = cr4; + kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); + + if (!enable_unrestricted_guest) { + if (enable_ept) { + if (!is_paging(vcpu)) { + hw_cr4 &= ~X86_CR4_PAE; + hw_cr4 |= X86_CR4_PSE; + } else if (!(cr4 & X86_CR4_PAE)) { + hw_cr4 &= ~X86_CR4_PAE; + } + } + + /* + * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in + * hardware. To emulate this behavior, SMEP/SMAP/PKU needs + * to be manually disabled when guest switches to non-paging + * mode. + * + * If !enable_unrestricted_guest, the CPU is always running + * with CR0.PG=1 and CR4 needs to be modified. + * If enable_unrestricted_guest, the CPU automatically + * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. + */ + if (!is_paging(vcpu)) + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); + } + + vmcs_writel(CR4_READ_SHADOW, cr4); + vmcs_writel(GUEST_CR4, hw_cr4); + + if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) + kvm_update_cpuid_runtime(vcpu); +} + +void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 ar; + + if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { + *var = vmx->rmode.segs[seg]; + if (seg == VCPU_SREG_TR + || var->selector == vmx_read_guest_seg_selector(vmx, seg)) + return; + var->base = vmx_read_guest_seg_base(vmx, seg); + var->selector = vmx_read_guest_seg_selector(vmx, seg); + return; + } + var->base = vmx_read_guest_seg_base(vmx, seg); + var->limit = vmx_read_guest_seg_limit(vmx, seg); + var->selector = vmx_read_guest_seg_selector(vmx, seg); + ar = vmx_read_guest_seg_ar(vmx, seg); + var->unusable = (ar >> 16) & 1; + var->type = ar & 15; + var->s = (ar >> 4) & 1; + var->dpl = (ar >> 5) & 3; + /* + * Some userspaces do not preserve unusable property. Since usable + * segment has to be present according to VMX spec we can use present + * property to amend userspace bug by making unusable segment always + * nonpresent. vmx_segment_access_rights() already marks nonpresent + * segment as unusable. + */ + var->present = !var->unusable; + var->avl = (ar >> 12) & 1; + var->l = (ar >> 13) & 1; + var->db = (ar >> 14) & 1; + var->g = (ar >> 15) & 1; +} + +static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment s; + + if (to_vmx(vcpu)->rmode.vm86_active) { + vmx_get_segment(vcpu, &s, seg); + return s.base; + } + return vmx_read_guest_seg_base(to_vmx(vcpu), seg); +} + +int vmx_get_cpl(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (unlikely(vmx->rmode.vm86_active)) + return 0; + else { + int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); + return VMX_AR_DPL(ar); + } +} + +static u32 vmx_segment_access_rights(struct kvm_segment *var) +{ + u32 ar; + + ar = var->type & 15; + ar |= (var->s & 1) << 4; + ar |= (var->dpl & 3) << 5; + ar |= (var->present & 1) << 7; + ar |= (var->avl & 1) << 12; + ar |= (var->l & 1) << 13; + ar |= (var->db & 1) << 14; + ar |= (var->g & 1) << 15; + ar |= (var->unusable || !var->present) << 16; + + return ar; +} + +void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + + vmx_segment_cache_clear(vmx); + + if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { + vmx->rmode.segs[seg] = *var; + if (seg == VCPU_SREG_TR) + vmcs_write16(sf->selector, var->selector); + else if (var->s) + fix_rmode_seg(seg, &vmx->rmode.segs[seg]); + return; + } + + vmcs_writel(sf->base, var->base); + vmcs_write32(sf->limit, var->limit); + vmcs_write16(sf->selector, var->selector); + + /* + * Fix the "Accessed" bit in AR field of segment registers for older + * qemu binaries. + * IA32 arch specifies that at the time of processor reset the + * "Accessed" bit in the AR field of segment registers is 1. And qemu + * is setting it to 0 in the userland code. This causes invalid guest + * state vmexit when "unrestricted guest" mode is turned on. + * Fix for this setup issue in cpu_reset is being pushed in the qemu + * tree. Newer qemu binaries with that qemu fix would not need this + * kvm hack. + */ + if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR)) + var->type |= 0x1; /* Accessed */ + + vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); +} + +static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +{ + __vmx_set_segment(vcpu, var, seg); + + to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); +} + +static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); + + *db = (ar >> 14) & 1; + *l = (ar >> 13) & 1; +} + +static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + dt->size = vmcs_read32(GUEST_IDTR_LIMIT); + dt->address = vmcs_readl(GUEST_IDTR_BASE); +} + +static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + vmcs_write32(GUEST_IDTR_LIMIT, dt->size); + vmcs_writel(GUEST_IDTR_BASE, dt->address); +} + +static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + dt->size = vmcs_read32(GUEST_GDTR_LIMIT); + dt->address = vmcs_readl(GUEST_GDTR_BASE); +} + +static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + vmcs_write32(GUEST_GDTR_LIMIT, dt->size); + vmcs_writel(GUEST_GDTR_BASE, dt->address); +} + +static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment var; + u32 ar; + + vmx_get_segment(vcpu, &var, seg); + var.dpl = 0x3; + if (seg == VCPU_SREG_CS) + var.type = 0x3; + ar = vmx_segment_access_rights(&var); + + if (var.base != (var.selector << 4)) + return false; + if (var.limit != 0xffff) + return false; + if (ar != 0xf3) + return false; + + return true; +} + +static bool code_segment_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs; + unsigned int cs_rpl; + + vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); + cs_rpl = cs.selector & SEGMENT_RPL_MASK; + + if (cs.unusable) + return false; + if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) + return false; + if (!cs.s) + return false; + if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { + if (cs.dpl > cs_rpl) + return false; + } else { + if (cs.dpl != cs_rpl) + return false; + } + if (!cs.present) + return false; + + /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ + return true; +} + +static bool stack_segment_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment ss; + unsigned int ss_rpl; + + vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); + ss_rpl = ss.selector & SEGMENT_RPL_MASK; + + if (ss.unusable) + return true; + if (ss.type != 3 && ss.type != 7) + return false; + if (!ss.s) + return false; + if (ss.dpl != ss_rpl) /* DPL != RPL */ + return false; + if (!ss.present) + return false; + + return true; +} + +static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment var; + unsigned int rpl; + + vmx_get_segment(vcpu, &var, seg); + rpl = var.selector & SEGMENT_RPL_MASK; + + if (var.unusable) + return true; + if (!var.s) + return false; + if (!var.present) + return false; + if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { + if (var.dpl < rpl) /* DPL < RPL */ + return false; + } + + /* TODO: Add other members to kvm_segment_field to allow checking for other access + * rights flags + */ + return true; +} + +static bool tr_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment tr; + + vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); + + if (tr.unusable) + return false; + if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ + return false; + if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ + return false; + if (!tr.present) + return false; + + return true; +} + +static bool ldtr_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment ldtr; + + vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); + + if (ldtr.unusable) + return true; + if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ + return false; + if (ldtr.type != 2) + return false; + if (!ldtr.present) + return false; + + return true; +} + +static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs, ss; + + vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); + vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); + + return ((cs.selector & SEGMENT_RPL_MASK) == + (ss.selector & SEGMENT_RPL_MASK)); +} + +/* + * Check if guest state is valid. Returns true if valid, false if + * not. + * We assume that registers are always usable + */ +bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu) +{ + /* real mode guest state checks */ + if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { + if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) + return false; + } else { + /* protected mode guest state checks */ + if (!cs_ss_rpl_check(vcpu)) + return false; + if (!code_segment_valid(vcpu)) + return false; + if (!stack_segment_valid(vcpu)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_DS)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_ES)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_FS)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_GS)) + return false; + if (!tr_valid(vcpu)) + return false; + if (!ldtr_valid(vcpu)) + return false; + } + /* TODO: + * - Add checks on RIP + * - Add checks on RFLAGS + */ + + return true; +} + +static int init_rmode_tss(struct kvm *kvm, void __user *ua) +{ + const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); + u16 data; + int i; + + for (i = 0; i < 3; i++) { + if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE)) + return -EFAULT; + } + + data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; + if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16))) + return -EFAULT; + + data = ~0; + if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8))) + return -EFAULT; + + return 0; +} + +static int init_rmode_identity_map(struct kvm *kvm) +{ + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + int i, r = 0; + void __user *uaddr; + u32 tmp; + + /* Protect kvm_vmx->ept_identity_pagetable_done. */ + mutex_lock(&kvm->slots_lock); + + if (likely(kvm_vmx->ept_identity_pagetable_done)) + goto out; + + if (!kvm_vmx->ept_identity_map_addr) + kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; + + uaddr = __x86_set_memory_region(kvm, + IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, + kvm_vmx->ept_identity_map_addr, + PAGE_SIZE); + if (IS_ERR(uaddr)) { + r = PTR_ERR(uaddr); + goto out; + } + + /* Set up identity-mapping pagetable for EPT in real mode */ + for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) { + tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | + _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); + if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) { + r = -EFAULT; + goto out; + } + } + kvm_vmx->ept_identity_pagetable_done = true; + +out: + mutex_unlock(&kvm->slots_lock); + return r; +} + +static void seg_setup(int seg) +{ + const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + unsigned int ar; + + vmcs_write16(sf->selector, 0); + vmcs_writel(sf->base, 0); + vmcs_write32(sf->limit, 0xffff); + ar = 0x93; + if (seg == VCPU_SREG_CS) + ar |= 0x08; /* code segment */ + + vmcs_write32(sf->ar_bytes, ar); +} + +static int alloc_apic_access_page(struct kvm *kvm) +{ + struct page *page; + void __user *hva; + int ret = 0; + + mutex_lock(&kvm->slots_lock); + if (kvm->arch.apic_access_memslot_enabled) + goto out; + hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, + APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); + if (IS_ERR(hva)) { + ret = PTR_ERR(hva); + goto out; + } + + page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); + if (is_error_page(page)) { + ret = -EFAULT; + goto out; + } + + /* + * Do not pin the page in memory, so that memory hot-unplug + * is able to migrate it. + */ + put_page(page); + kvm->arch.apic_access_memslot_enabled = true; +out: + mutex_unlock(&kvm->slots_lock); + return ret; +} + +int allocate_vpid(void) +{ + int vpid; + + if (!enable_vpid) + return 0; + spin_lock(&vmx_vpid_lock); + vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); + if (vpid < VMX_NR_VPIDS) + __set_bit(vpid, vmx_vpid_bitmap); + else + vpid = 0; + spin_unlock(&vmx_vpid_lock); + return vpid; +} + +void free_vpid(int vpid) +{ + if (!enable_vpid || vpid == 0) + return; + spin_lock(&vmx_vpid_lock); + __clear_bit(vpid, vmx_vpid_bitmap); + spin_unlock(&vmx_vpid_lock); +} + +static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) +{ + /* + * When KVM is a nested hypervisor on top of Hyper-V and uses + * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR + * bitmap has changed. + */ + if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs)) { + struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; + + if (evmcs->hv_enlightenments_control.msr_bitmap) + evmcs->hv_clean_fields &= + ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; + } + + vmx->nested.force_msr_bitmap_recalc = true; +} + +void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + + if (!cpu_has_vmx_msr_bitmap()) + return; + + vmx_msr_bitmap_l01_changed(vmx); + + /* + * Mark the desired intercept state in shadow bitmap, this is needed + * for resync when the MSR filters change. + */ + if (is_valid_passthrough_msr(msr)) { + int idx = possible_passthrough_msr_slot(msr); + + if (idx != -ENOENT) { + if (type & MSR_TYPE_R) + clear_bit(idx, vmx->shadow_msr_intercept.read); + if (type & MSR_TYPE_W) + clear_bit(idx, vmx->shadow_msr_intercept.write); + } + } + + if ((type & MSR_TYPE_R) && + !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) { + vmx_set_msr_bitmap_read(msr_bitmap, msr); + type &= ~MSR_TYPE_R; + } + + if ((type & MSR_TYPE_W) && + !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) { + vmx_set_msr_bitmap_write(msr_bitmap, msr); + type &= ~MSR_TYPE_W; + } + + if (type & MSR_TYPE_R) + vmx_clear_msr_bitmap_read(msr_bitmap, msr); + + if (type & MSR_TYPE_W) + vmx_clear_msr_bitmap_write(msr_bitmap, msr); +} + +void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + + if (!cpu_has_vmx_msr_bitmap()) + return; + + vmx_msr_bitmap_l01_changed(vmx); + + /* + * Mark the desired intercept state in shadow bitmap, this is needed + * for resync when the MSR filter changes. + */ + if (is_valid_passthrough_msr(msr)) { + int idx = possible_passthrough_msr_slot(msr); + + if (idx != -ENOENT) { + if (type & MSR_TYPE_R) + set_bit(idx, vmx->shadow_msr_intercept.read); + if (type & MSR_TYPE_W) + set_bit(idx, vmx->shadow_msr_intercept.write); + } + } + + if (type & MSR_TYPE_R) + vmx_set_msr_bitmap_read(msr_bitmap, msr); + + if (type & MSR_TYPE_W) + vmx_set_msr_bitmap_write(msr_bitmap, msr); +} + +static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode) +{ + unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; + unsigned long read_intercept; + int msr; + + read_intercept = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; + + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { + unsigned int read_idx = msr / BITS_PER_LONG; + unsigned int write_idx = read_idx + (0x800 / sizeof(long)); + + msr_bitmap[read_idx] = read_intercept; + msr_bitmap[write_idx] = ~0ul; + } +} + +static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u8 mode; + + if (!cpu_has_vmx_msr_bitmap()) + return; + + if (cpu_has_secondary_exec_ctrls() && + (secondary_exec_controls_get(vmx) & + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { + mode = MSR_BITMAP_MODE_X2APIC; + if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) + mode |= MSR_BITMAP_MODE_X2APIC_APICV; + } else { + mode = 0; + } + + if (mode == vmx->x2apic_msr_bitmap_mode) + return; + + vmx->x2apic_msr_bitmap_mode = mode; + + vmx_reset_x2apic_msrs(vcpu, mode); + + /* + * TPR reads and writes can be virtualized even if virtual interrupt + * delivery is not in use. + */ + vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW, + !(mode & MSR_BITMAP_MODE_X2APIC)); + + if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { + vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); + if (enable_ipiv) + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW); + } +} + +void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); + u32 i; + + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag); + for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) { + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); + } +} + +static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + void *vapic_page; + u32 vppr; + int rvi; + + if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || + !nested_cpu_has_vid(get_vmcs12(vcpu)) || + WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn)) + return false; + + rvi = vmx_get_rvi(); + + vapic_page = vmx->nested.virtual_apic_map.hva; + vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); + + return ((rvi & 0xf0) > (vppr & 0xf0)); +} + +static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 i; + + /* + * Redo intercept permissions for MSRs that KVM is passing through to + * the guest. Disabling interception will check the new MSR filter and + * ensure that KVM enables interception if usersepace wants to filter + * the MSR. MSRs that KVM is already intercepting don't need to be + * refreshed since KVM is going to intercept them regardless of what + * userspace wants. + */ + for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { + u32 msr = vmx_possible_passthrough_msrs[i]; + + if (!test_bit(i, vmx->shadow_msr_intercept.read)) + vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R); + + if (!test_bit(i, vmx->shadow_msr_intercept.write)) + vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W); + } + + /* PT MSRs can be passed through iff PT is exposed to the guest. */ + if (vmx_pt_mode_is_host_guest()) + pt_update_intercept_for_msr(vcpu); +} + +static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, + int pi_vec) +{ +#ifdef CONFIG_SMP + if (vcpu->mode == IN_GUEST_MODE) { + /* + * The vector of the virtual has already been set in the PIR. + * Send a notification event to deliver the virtual interrupt + * unless the vCPU is the currently running vCPU, i.e. the + * event is being sent from a fastpath VM-Exit handler, in + * which case the PIR will be synced to the vIRR before + * re-entering the guest. + * + * When the target is not the running vCPU, the following + * possibilities emerge: + * + * Case 1: vCPU stays in non-root mode. Sending a notification + * event posts the interrupt to the vCPU. + * + * Case 2: vCPU exits to root mode and is still runnable. The + * PIR will be synced to the vIRR before re-entering the guest. + * Sending a notification event is ok as the host IRQ handler + * will ignore the spurious event. + * + * Case 3: vCPU exits to root mode and is blocked. vcpu_block() + * has already synced PIR to vIRR and never blocks the vCPU if + * the vIRR is not empty. Therefore, a blocked vCPU here does + * not wait for any requested interrupts in PIR, and sending a + * notification event also results in a benign, spurious event. + */ + + if (vcpu != kvm_get_running_vcpu()) + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); + return; + } +#endif + /* + * The vCPU isn't in the guest; wake the vCPU in case it is blocking, + * otherwise do nothing as KVM will grab the highest priority pending + * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). + */ + kvm_vcpu_wake_up(vcpu); +} + +static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, + int vector) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (is_guest_mode(vcpu) && + vector == vmx->nested.posted_intr_nv) { + /* + * If a posted intr is not recognized by hardware, + * we will accomplish it in the next vmentry. + */ + vmx->nested.pi_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); + + /* + * This pairs with the smp_mb_*() after setting vcpu->mode in + * vcpu_enter_guest() to guarantee the vCPU sees the event + * request if triggering a posted interrupt "fails" because + * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as + * the smb_wmb() in kvm_make_request() only ensures everything + * done before making the request is visible when the request + * is visible, it doesn't ensure ordering between the store to + * vcpu->requests and the load from vcpu->mode. + */ + smp_mb__after_atomic(); + + /* the PIR and ON have been set by L1. */ + kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR); + return 0; + } + return -1; +} +/* + * Send interrupt to vcpu via posted interrupt way. + * 1. If target vcpu is running(non-root mode), send posted interrupt + * notification to vcpu and hardware will sync PIR to vIRR atomically. + * 2. If target vcpu isn't running(root mode), kick it to pick up the + * interrupt from PIR in next vmentry. + */ +static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int r; + + r = vmx_deliver_nested_posted_interrupt(vcpu, vector); + if (!r) + return 0; + + /* Note, this is called iff the local APIC is in-kernel. */ + if (!vcpu->arch.apic->apicv_active) + return -1; + + if (pi_test_and_set_pir(vector, &vmx->pi_desc)) + return 0; + + /* If a previous notification has sent the IPI, nothing to do. */ + if (pi_test_and_set_on(&vmx->pi_desc)) + return 0; + + /* + * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*() + * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is + * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a + * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. + */ + kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); + return 0; +} + +static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) +{ + struct kvm_vcpu *vcpu = apic->vcpu; + + if (vmx_deliver_posted_interrupt(vcpu, vector)) { + kvm_lapic_set_irr(vector, apic); + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + } else { + trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, + trig_mode, vector); + } +} + +/* + * Set up the vmcs's constant host-state fields, i.e., host-state fields that + * will not change in the lifetime of the guest. + * Note that host-state that does change is set elsewhere. E.g., host-state + * that is set differently for each CPU is set in vmx_vcpu_load(), not here. + */ +void vmx_set_constant_host_state(struct vcpu_vmx *vmx) +{ + u32 low32, high32; + unsigned long tmpl; + unsigned long cr0, cr3, cr4; + + cr0 = read_cr0(); + WARN_ON(cr0 & X86_CR0_TS); + vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ + + /* + * Save the most likely value for this task's CR3 in the VMCS. + * We can't use __get_current_cr3_fast() because we're not atomic. + */ + cr3 = __read_cr3(); + vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ + vmx->loaded_vmcs->host_state.cr3 = cr3; + + /* Save the most likely value for this task's CR4 in the VMCS. */ + cr4 = cr4_read_shadow(); + vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ + vmx->loaded_vmcs->host_state.cr4 = cr4; + + vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ +#ifdef CONFIG_X86_64 + /* + * Load null selectors, so we can avoid reloading them in + * vmx_prepare_switch_to_host(), in case userspace uses + * the null selectors too (the expected case). + */ + vmcs_write16(HOST_DS_SELECTOR, 0); + vmcs_write16(HOST_ES_SELECTOR, 0); +#else + vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ + vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ +#endif + vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ + vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ + + vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */ + + vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */ + + rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); + vmcs_write32(HOST_IA32_SYSENTER_CS, low32); + + /* + * SYSENTER is used for 32-bit system calls on either 32-bit or + * 64-bit kernels. It is always zero If neither is allowed, otherwise + * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may + * have already done so!). + */ + if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32)) + vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); + + rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); + vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ + + if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { + rdmsr(MSR_IA32_CR_PAT, low32, high32); + vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); + } + + if (cpu_has_load_ia32_efer()) + vmcs_write64(HOST_IA32_EFER, host_efer); +} + +void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) +{ + struct kvm_vcpu *vcpu = &vmx->vcpu; + + vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & + ~vcpu->arch.cr4_guest_rsvd_bits; + if (!enable_ept) { + vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS; + vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS; + } + if (is_guest_mode(&vmx->vcpu)) + vcpu->arch.cr4_guest_owned_bits &= + ~get_vmcs12(vcpu)->cr4_guest_host_mask; + vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits); +} + +static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) +{ + u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; + + if (!kvm_vcpu_apicv_active(&vmx->vcpu)) + pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; + + if (!enable_vnmi) + pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; + + if (!enable_preemption_timer) + pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + + return pin_based_exec_ctrl; +} + +static u32 vmx_vmentry_ctrl(void) +{ + u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; + + if (vmx_pt_mode_is_system()) + vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | + VM_ENTRY_LOAD_IA32_RTIT_CTL); + /* + * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically. + */ + vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | + VM_ENTRY_LOAD_IA32_EFER | + VM_ENTRY_IA32E_MODE); + + if (cpu_has_perf_global_ctrl_bug()) + vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + + return vmentry_ctrl; +} + +static u32 vmx_vmexit_ctrl(void) +{ + u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; + + /* + * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for + * nested virtualization and thus allowed to be set in vmcs12. + */ + vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER | + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER); + + if (vmx_pt_mode_is_system()) + vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | + VM_EXIT_CLEAR_IA32_RTIT_CTL); + + if (cpu_has_perf_global_ctrl_bug()) + vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + + /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ + return vmexit_ctrl & + ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); +} + +static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (is_guest_mode(vcpu)) { + vmx->nested.update_vmcs01_apicv_status = true; + return; + } + + pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); + + if (kvm_vcpu_apicv_active(vcpu)) { + secondary_exec_controls_setbit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + if (enable_ipiv) + tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); + } else { + secondary_exec_controls_clearbit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + if (enable_ipiv) + tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); + } + + vmx_update_msr_bitmap_x2apic(vcpu); +} + +static u32 vmx_exec_control(struct vcpu_vmx *vmx) +{ + u32 exec_control = vmcs_config.cpu_based_exec_ctrl; + + /* + * Not used by KVM, but fully supported for nesting, i.e. are allowed in + * vmcs12 and propagated to vmcs02 when set in vmcs12. + */ + exec_control &= ~(CPU_BASED_RDTSC_EXITING | + CPU_BASED_USE_IO_BITMAPS | + CPU_BASED_MONITOR_TRAP_FLAG | + CPU_BASED_PAUSE_EXITING); + + /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */ + exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING | + CPU_BASED_NMI_WINDOW_EXITING); + + if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) + exec_control &= ~CPU_BASED_MOV_DR_EXITING; + + if (!cpu_need_tpr_shadow(&vmx->vcpu)) + exec_control &= ~CPU_BASED_TPR_SHADOW; + +#ifdef CONFIG_X86_64 + if (exec_control & CPU_BASED_TPR_SHADOW) + exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING); + else + exec_control |= CPU_BASED_CR8_STORE_EXITING | + CPU_BASED_CR8_LOAD_EXITING; +#endif + /* No need to intercept CR3 access or INVPLG when using EPT. */ + if (enable_ept) + exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_INVLPG_EXITING); + if (kvm_mwait_in_guest(vmx->vcpu.kvm)) + exec_control &= ~(CPU_BASED_MWAIT_EXITING | + CPU_BASED_MONITOR_EXITING); + if (kvm_hlt_in_guest(vmx->vcpu.kvm)) + exec_control &= ~CPU_BASED_HLT_EXITING; + return exec_control; +} + +static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) +{ + u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl; + + /* + * IPI virtualization relies on APICv. Disable IPI virtualization if + * APICv is inhibited. + */ + if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu)) + exec_control &= ~TERTIARY_EXEC_IPI_VIRT; + + return exec_control; +} + +/* + * Adjust a single secondary execution control bit to intercept/allow an + * instruction in the guest. This is usually done based on whether or not a + * feature has been exposed to the guest in order to correctly emulate faults. + */ +static inline void +vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, + u32 control, bool enabled, bool exiting) +{ + /* + * If the control is for an opt-in feature, clear the control if the + * feature is not exposed to the guest, i.e. not enabled. If the + * control is opt-out, i.e. an exiting control, clear the control if + * the feature _is_ exposed to the guest, i.e. exiting/interception is + * disabled for the associated instruction. Note, the caller is + * responsible presetting exec_control to set all supported bits. + */ + if (enabled == exiting) + *exec_control &= ~control; + + /* + * Update the nested MSR settings so that a nested VMM can/can't set + * controls for features that are/aren't exposed to the guest. + */ + if (nested) { + if (enabled) + vmx->nested.msrs.secondary_ctls_high |= control; + else + vmx->nested.msrs.secondary_ctls_high &= ~control; + } +} + +/* + * Wrapper macro for the common case of adjusting a secondary execution control + * based on a single guest CPUID bit, with a dedicated feature bit. This also + * verifies that the control is actually supported by KVM and hardware. + */ +#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \ +({ \ + bool __enabled; \ + \ + if (cpu_has_vmx_##name()) { \ + __enabled = guest_cpuid_has(&(vmx)->vcpu, \ + X86_FEATURE_##feat_name); \ + vmx_adjust_secondary_exec_control(vmx, exec_control, \ + SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \ + } \ +}) + +/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */ +#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \ + vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false) + +#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \ + vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true) + +static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) +{ + struct kvm_vcpu *vcpu = &vmx->vcpu; + + u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; + + if (vmx_pt_mode_is_system()) + exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX); + if (!cpu_need_virtualize_apic_accesses(vcpu)) + exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + if (vmx->vpid == 0) + exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; + if (!enable_ept) { + exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; + enable_unrestricted_guest = 0; + } + if (!enable_unrestricted_guest) + exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; + if (kvm_pause_in_guest(vmx->vcpu.kvm)) + exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; + if (!kvm_vcpu_apicv_active(vcpu)) + exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + + /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, + * in vmx_set_cr4. */ + exec_control &= ~SECONDARY_EXEC_DESC; + + /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD + (handle_vmptrld). + We can NOT enable shadow_vmcs here because we don't have yet + a current VMCS12 + */ + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + + /* + * PML is enabled/disabled when dirty logging of memsmlots changes, but + * it needs to be set here when dirty logging is already active, e.g. + * if this vCPU was created after dirty logging was enabled. + */ + if (!vcpu->kvm->arch.cpu_dirty_logging_count) + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + + if (cpu_has_vmx_xsaves()) { + /* Exposing XSAVES only when XSAVE is exposed */ + bool xsaves_enabled = + boot_cpu_has(X86_FEATURE_XSAVE) && + guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && + guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); + + vcpu->arch.xsaves_enabled = xsaves_enabled; + + vmx_adjust_secondary_exec_control(vmx, &exec_control, + SECONDARY_EXEC_XSAVES, + xsaves_enabled, false); + } + + /* + * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either + * feature is exposed to the guest. This creates a virtualization hole + * if both are supported in hardware but only one is exposed to the + * guest, but letting the guest execute RDTSCP or RDPID when either one + * is advertised is preferable to emulating the advertised instruction + * in KVM on #UD, and obviously better than incorrectly injecting #UD. + */ + if (cpu_has_vmx_rdtscp()) { + bool rdpid_or_rdtscp_enabled = + guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) || + guest_cpuid_has(vcpu, X86_FEATURE_RDPID); + + vmx_adjust_secondary_exec_control(vmx, &exec_control, + SECONDARY_EXEC_ENABLE_RDTSCP, + rdpid_or_rdtscp_enabled, false); + } + vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); + + vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); + vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED); + + vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG, + ENABLE_USR_WAIT_PAUSE, false); + + if (!vcpu->kvm->arch.bus_lock_detection_enabled) + exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; + + if (!kvm_notify_vmexit_enabled(vcpu->kvm)) + exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING; + + return exec_control; +} + +static inline int vmx_get_pid_table_order(struct kvm *kvm) +{ + return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); +} + +static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) +{ + struct page *pages; + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + + if (!irqchip_in_kernel(kvm) || !enable_ipiv) + return 0; + + if (kvm_vmx->pid_table) + return 0; + + pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm)); + if (!pages) + return -ENOMEM; + + kvm_vmx->pid_table = (void *)page_address(pages); + return 0; +} + +static int vmx_vcpu_precreate(struct kvm *kvm) +{ + return vmx_alloc_ipiv_pid_table(kvm); +} + +#define VMX_XSS_EXIT_BITMAP 0 + +static void init_vmcs(struct vcpu_vmx *vmx) +{ + struct kvm *kvm = vmx->vcpu.kvm; + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + + if (nested) + nested_vmx_set_vmcs_shadowing_bitmap(); + + if (cpu_has_vmx_msr_bitmap()) + vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); + + vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */ + + /* Control */ + pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); + + exec_controls_set(vmx, vmx_exec_control(vmx)); + + if (cpu_has_secondary_exec_ctrls()) + secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); + + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); + + if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { + vmcs_write64(EOI_EXIT_BITMAP0, 0); + vmcs_write64(EOI_EXIT_BITMAP1, 0); + vmcs_write64(EOI_EXIT_BITMAP2, 0); + vmcs_write64(EOI_EXIT_BITMAP3, 0); + + vmcs_write16(GUEST_INTR_STATUS, 0); + + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); + vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); + } + + if (vmx_can_use_ipiv(&vmx->vcpu)) { + vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table)); + vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1); + } + + if (!kvm_pause_in_guest(kvm)) { + vmcs_write32(PLE_GAP, ple_gap); + vmx->ple_window = ple_window; + vmx->ple_window_dirty = true; + } + + if (kvm_notify_vmexit_enabled(kvm)) + vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); + + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); + vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ + + vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ + vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ + vmx_set_constant_host_state(vmx); + vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ + vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ + + if (cpu_has_vmx_vmfunc()) + vmcs_write64(VM_FUNCTION_CONTROL, 0); + + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); + + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) + vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); + + vm_exit_controls_set(vmx, vmx_vmexit_ctrl()); + + /* 22.2.1, 20.8.1 */ + vm_entry_controls_set(vmx, vmx_vmentry_ctrl()); + + vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); + vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits); + + set_cr4_guest_host_mask(vmx); + + if (vmx->vpid != 0) + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + + if (cpu_has_vmx_xsaves()) + vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); + + if (enable_pml) { + vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + } + + vmx_write_encls_bitmap(&vmx->vcpu, NULL); + + if (vmx_pt_mode_is_host_guest()) { + memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); + /* Bit[6~0] are forced to 1, writes are ignored. */ + vmx->pt_desc.guest.output_mask = 0x7F; + vmcs_write64(GUEST_IA32_RTIT_CTL, 0); + } + + vmcs_write32(GUEST_SYSENTER_CS, 0); + vmcs_writel(GUEST_SYSENTER_ESP, 0); + vmcs_writel(GUEST_SYSENTER_EIP, 0); + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + if (cpu_has_vmx_tpr_shadow()) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); + if (cpu_need_tpr_shadow(&vmx->vcpu)) + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, + __pa(vmx->vcpu.arch.apic->regs)); + vmcs_write32(TPR_THRESHOLD, 0); + } + + vmx_setup_uret_msrs(vmx); +} + +static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + init_vmcs(vmx); + + if (nested) + memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); + + vcpu_setup_sgx_lepubkeyhash(vcpu); + + vmx->nested.posted_intr_nv = -1; + vmx->nested.vmxon_ptr = INVALID_GPA; + vmx->nested.current_vmptr = INVALID_GPA; + vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; + + vcpu->arch.microcode_version = 0x100000000ULL; + vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; + + /* + * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR + * or POSTED_INTR_WAKEUP_VECTOR. + */ + vmx->pi_desc.nv = POSTED_INTR_VECTOR; + vmx->pi_desc.sn = 1; +} + +static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!init_event) + __vmx_vcpu_reset(vcpu); + + vmx->rmode.vm86_active = 0; + vmx->spec_ctrl = 0; + + vmx->msr_ia32_umwait_control = 0; + + vmx->hv_deadline_tsc = -1; + kvm_set_cr8(vcpu, 0); + + vmx_segment_cache_clear(vmx); + kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); + + seg_setup(VCPU_SREG_CS); + vmcs_write16(GUEST_CS_SELECTOR, 0xf000); + vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); + + seg_setup(VCPU_SREG_DS); + seg_setup(VCPU_SREG_ES); + seg_setup(VCPU_SREG_FS); + seg_setup(VCPU_SREG_GS); + seg_setup(VCPU_SREG_SS); + + vmcs_write16(GUEST_TR_SELECTOR, 0); + vmcs_writel(GUEST_TR_BASE, 0); + vmcs_write32(GUEST_TR_LIMIT, 0xffff); + vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); + + vmcs_write16(GUEST_LDTR_SELECTOR, 0); + vmcs_writel(GUEST_LDTR_BASE, 0); + vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); + vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); + + vmcs_writel(GUEST_GDTR_BASE, 0); + vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); + + vmcs_writel(GUEST_IDTR_BASE, 0); + vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); + + vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); + if (kvm_mpx_supported()) + vmcs_write64(GUEST_BNDCFGS, 0); + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ + + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); + + vpid_sync_context(vmx->vpid); + + vmx_update_fb_clear_dis(vcpu, vmx); +} + +static void vmx_enable_irq_window(struct kvm_vcpu *vcpu) +{ + exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); +} + +static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) +{ + if (!enable_vnmi || + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { + vmx_enable_irq_window(vcpu); + return; + } + + exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); +} + +static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + uint32_t intr; + int irq = vcpu->arch.interrupt.nr; + + trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected); + + ++vcpu->stat.irq_injections; + if (vmx->rmode.vm86_active) { + int inc_eip = 0; + if (vcpu->arch.interrupt.soft) + inc_eip = vcpu->arch.event_exit_inst_len; + kvm_inject_realmode_interrupt(vcpu, irq, inc_eip); + return; + } + intr = irq | INTR_INFO_VALID_MASK; + if (vcpu->arch.interrupt.soft) { + intr |= INTR_TYPE_SOFT_INTR; + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmx->vcpu.arch.event_exit_inst_len); + } else + intr |= INTR_TYPE_EXT_INTR; + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); + + vmx_clear_hlt(vcpu); +} + +static void vmx_inject_nmi(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!enable_vnmi) { + /* + * Tracking the NMI-blocked state in software is built upon + * finding the next open IRQ window. This, in turn, depends on + * well-behaving guests: They have to keep IRQs disabled at + * least as long as the NMI handler runs. Otherwise we may + * cause NMI nesting, maybe breaking the guest. But as this is + * highly unlikely, we can live with the residual risk. + */ + vmx->loaded_vmcs->soft_vnmi_blocked = 1; + vmx->loaded_vmcs->vnmi_blocked_time = 0; + } + + ++vcpu->stat.nmi_injections; + vmx->loaded_vmcs->nmi_known_unmasked = false; + + if (vmx->rmode.vm86_active) { + kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0); + return; + } + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); + + vmx_clear_hlt(vcpu); +} + +bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool masked; + + if (!enable_vnmi) + return vmx->loaded_vmcs->soft_vnmi_blocked; + if (vmx->loaded_vmcs->nmi_known_unmasked) + return false; + masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; + vmx->loaded_vmcs->nmi_known_unmasked = !masked; + return masked; +} + +void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!enable_vnmi) { + if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) { + vmx->loaded_vmcs->soft_vnmi_blocked = masked; + vmx->loaded_vmcs->vnmi_blocked_time = 0; + } + } else { + vmx->loaded_vmcs->nmi_known_unmasked = !masked; + if (masked) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + } +} + +bool vmx_nmi_blocked(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) + return false; + + if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) + return true; + + return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | + GUEST_INTR_STATE_NMI)); +} + +static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + if (to_vmx(vcpu)->nested.nested_run_pending) + return -EBUSY; + + /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ + if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) + return -EBUSY; + + return !vmx_nmi_blocked(vcpu); +} + +bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) + return false; + + return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) || + (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); +} + +static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + if (to_vmx(vcpu)->nested.nested_run_pending) + return -EBUSY; + + /* + * An IRQ must not be injected into L2 if it's supposed to VM-Exit, + * e.g. if the IRQ arrived asynchronously after checking nested events. + */ + if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) + return -EBUSY; + + return !vmx_interrupt_blocked(vcpu); +} + +static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) +{ + void __user *ret; + + if (enable_unrestricted_guest) + return 0; + + mutex_lock(&kvm->slots_lock); + ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, + PAGE_SIZE * 3); + mutex_unlock(&kvm->slots_lock); + + if (IS_ERR(ret)) + return PTR_ERR(ret); + + to_kvm_vmx(kvm)->tss_addr = addr; + + return init_rmode_tss(kvm, ret); +} + +static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) +{ + to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; + return 0; +} + +static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) +{ + switch (vec) { + case BP_VECTOR: + /* + * Update instruction length as we may reinject the exception + * from user space while in guest debugging mode. + */ + to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + return false; + fallthrough; + case DB_VECTOR: + return !(vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)); + case DE_VECTOR: + case OF_VECTOR: + case BR_VECTOR: + case UD_VECTOR: + case DF_VECTOR: + case SS_VECTOR: + case GP_VECTOR: + case MF_VECTOR: + return true; + } + return false; +} + +static int handle_rmode_exception(struct kvm_vcpu *vcpu, + int vec, u32 err_code) +{ + /* + * Instruction with address size override prefix opcode 0x67 + * Cause the #SS fault with 0 error code in VM86 mode. + */ + if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { + if (kvm_emulate_instruction(vcpu, 0)) { + if (vcpu->arch.halt_request) { + vcpu->arch.halt_request = 0; + return kvm_emulate_halt_noskip(vcpu); + } + return 1; + } + return 0; + } + + /* + * Forward all other exceptions that are valid in real mode. + * FIXME: Breaks guest debugging in real mode, needs to be fixed with + * the required debugging infrastructure rework. + */ + kvm_queue_exception(vcpu, vec); + return 1; +} + +static int handle_machine_check(struct kvm_vcpu *vcpu) +{ + /* handled by vmx_vcpu_run() */ + return 1; +} + +/* + * If the host has split lock detection disabled, then #AC is + * unconditionally injected into the guest, which is the pre split lock + * detection behaviour. + * + * If the host has split lock detection enabled then #AC is + * only injected into the guest when: + * - Guest CPL == 3 (user mode) + * - Guest has #AC detection enabled in CR0 + * - Guest EFLAGS has AC bit set + */ +bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu) +{ + if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + return true; + + return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) && + (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); +} + +static int handle_exception_nmi(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_run *kvm_run = vcpu->run; + u32 intr_info, ex_no, error_code; + unsigned long cr2, dr6; + u32 vect_info; + + vect_info = vmx->idt_vectoring_info; + intr_info = vmx_get_intr_info(vcpu); + + if (is_machine_check(intr_info) || is_nmi(intr_info)) + return 1; /* handled by handle_exception_nmi_irqoff() */ + + /* + * Queue the exception here instead of in handle_nm_fault_irqoff(). + * This ensures the nested_vmx check is not skipped so vmexit can + * be reflected to L1 (when it intercepts #NM) before reaching this + * point. + */ + if (is_nm_fault(intr_info)) { + kvm_queue_exception(vcpu, NM_VECTOR); + return 1; + } + + if (is_invalid_opcode(intr_info)) + return handle_ud(vcpu); + + error_code = 0; + if (intr_info & INTR_INFO_DELIVER_CODE_MASK) + error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + + if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { + WARN_ON_ONCE(!enable_vmware_backdoor); + + /* + * VMware backdoor emulation on #GP interception only handles + * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero + * error code on #GP. + */ + if (error_code) { + kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); + return 1; + } + return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP); + } + + /* + * The #PF with PFEC.RSVD = 1 indicates the guest is accessing + * MMIO, it is better to report an internal error. + * See the comments in vmx_handle_exit. + */ + if ((vect_info & VECTORING_INFO_VALID_MASK) && + !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; + vcpu->run->internal.ndata = 4; + vcpu->run->internal.data[0] = vect_info; + vcpu->run->internal.data[1] = intr_info; + vcpu->run->internal.data[2] = error_code; + vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu; + return 0; + } + + if (is_page_fault(intr_info)) { + cr2 = vmx_get_exit_qual(vcpu); + if (enable_ept && !vcpu->arch.apf.host_apf_flags) { + /* + * EPT will cause page fault only if we need to + * detect illegal GPAs. + */ + WARN_ON_ONCE(!allow_smaller_maxphyaddr); + kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); + return 1; + } else + return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); + } + + ex_no = intr_info & INTR_INFO_VECTOR_MASK; + + if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) + return handle_rmode_exception(vcpu, ex_no, error_code); + + switch (ex_no) { + case DB_VECTOR: + dr6 = vmx_get_exit_qual(vcpu); + if (!(vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { + /* + * If the #DB was due to ICEBP, a.k.a. INT1, skip the + * instruction. ICEBP generates a trap-like #DB, but + * despite its interception control being tied to #DB, + * is an instruction intercept, i.e. the VM-Exit occurs + * on the ICEBP itself. Use the inner "skip" helper to + * avoid single-step #DB and MTF updates, as ICEBP is + * higher priority. Note, skipping ICEBP still clears + * STI and MOVSS blocking. + * + * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS + * if single-step is enabled in RFLAGS and STI or MOVSS + * blocking is active, as the CPU doesn't set the bit + * on VM-Exit due to #DB interception. VM-Entry has a + * consistency check that a single-step #DB is pending + * in this scenario as the previous instruction cannot + * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV + * don't modify RFLAGS), therefore the one instruction + * delay when activating single-step breakpoints must + * have already expired. Note, the CPU sets/clears BS + * as appropriate for all other VM-Exits types. + */ + if (is_icebp(intr_info)) + WARN_ON(!skip_emulated_instruction(vcpu)); + else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) && + (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS))) + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS); + + kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); + return 1; + } + kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW; + kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); + fallthrough; + case BP_VECTOR: + /* + * Update instruction length as we may reinject #BP from + * user space while in guest debugging mode. Reading it for + * #DB as well causes no harm, it is not used in that case. + */ + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); + kvm_run->debug.arch.exception = ex_no; + break; + case AC_VECTOR: + if (vmx_guest_inject_ac(vcpu)) { + kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); + return 1; + } + + /* + * Handle split lock. Depending on detection mode this will + * either warn and disable split lock detection for this + * task or force SIGBUS on it. + */ + if (handle_guest_split_lock(kvm_rip_read(vcpu))) + return 1; + fallthrough; + default: + kvm_run->exit_reason = KVM_EXIT_EXCEPTION; + kvm_run->ex.exception = ex_no; + kvm_run->ex.error_code = error_code; + break; + } + return 0; +} + +static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.irq_exits; + return 1; +} + +static int handle_triple_fault(struct kvm_vcpu *vcpu) +{ + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + vcpu->mmio_needed = 0; + return 0; +} + +static int handle_io(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + int size, in, string; + unsigned port; + + exit_qualification = vmx_get_exit_qual(vcpu); + string = (exit_qualification & 16) != 0; + + ++vcpu->stat.io_exits; + + if (string) + return kvm_emulate_instruction(vcpu, 0); + + port = exit_qualification >> 16; + size = (exit_qualification & 7) + 1; + in = (exit_qualification & 8) != 0; + + return kvm_fast_pio(vcpu, size, port, in); +} + +static void +vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) +{ + /* + * Patch in the VMCALL instruction: + */ + hypercall[0] = 0x0f; + hypercall[1] = 0x01; + hypercall[2] = 0xc1; +} + +/* called to set cr0 as appropriate for a mov-to-cr0 exit. */ +static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) +{ + if (is_guest_mode(vcpu)) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned long orig_val = val; + + /* + * We get here when L2 changed cr0 in a way that did not change + * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), + * but did change L0 shadowed bits. So we first calculate the + * effective cr0 value that L1 would like to write into the + * hardware. It consists of the L2-owned bits from the new + * value combined with the L1-owned bits from L1's guest_cr0. + */ + val = (val & ~vmcs12->cr0_guest_host_mask) | + (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); + + if (kvm_set_cr0(vcpu, val)) + return 1; + vmcs_writel(CR0_READ_SHADOW, orig_val); + return 0; + } else { + return kvm_set_cr0(vcpu, val); + } +} + +static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) +{ + if (is_guest_mode(vcpu)) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned long orig_val = val; + + /* analogously to handle_set_cr0 */ + val = (val & ~vmcs12->cr4_guest_host_mask) | + (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); + if (kvm_set_cr4(vcpu, val)) + return 1; + vmcs_writel(CR4_READ_SHADOW, orig_val); + return 0; + } else + return kvm_set_cr4(vcpu, val); +} + +static int handle_desc(struct kvm_vcpu *vcpu) +{ + WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP)); + return kvm_emulate_instruction(vcpu, 0); +} + +static int handle_cr(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification, val; + int cr; + int reg; + int err; + int ret; + + exit_qualification = vmx_get_exit_qual(vcpu); + cr = exit_qualification & 15; + reg = (exit_qualification >> 8) & 15; + switch ((exit_qualification >> 4) & 3) { + case 0: /* mov to cr */ + val = kvm_register_read(vcpu, reg); + trace_kvm_cr_write(cr, val); + switch (cr) { + case 0: + err = handle_set_cr0(vcpu, val); + return kvm_complete_insn_gp(vcpu, err); + case 3: + WARN_ON_ONCE(enable_unrestricted_guest); + + err = kvm_set_cr3(vcpu, val); + return kvm_complete_insn_gp(vcpu, err); + case 4: + err = handle_set_cr4(vcpu, val); + return kvm_complete_insn_gp(vcpu, err); + case 8: { + u8 cr8_prev = kvm_get_cr8(vcpu); + u8 cr8 = (u8)val; + err = kvm_set_cr8(vcpu, cr8); + ret = kvm_complete_insn_gp(vcpu, err); + if (lapic_in_kernel(vcpu)) + return ret; + if (cr8_prev <= cr8) + return ret; + /* + * TODO: we might be squashing a + * KVM_GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. + */ + vcpu->run->exit_reason = KVM_EXIT_SET_TPR; + return 0; + } + } + break; + case 2: /* clts */ + KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS"); + return -EIO; + case 1: /*mov from cr*/ + switch (cr) { + case 3: + WARN_ON_ONCE(enable_unrestricted_guest); + + val = kvm_read_cr3(vcpu); + kvm_register_write(vcpu, reg, val); + trace_kvm_cr_read(cr, val); + return kvm_skip_emulated_instruction(vcpu); + case 8: + val = kvm_get_cr8(vcpu); + kvm_register_write(vcpu, reg, val); + trace_kvm_cr_read(cr, val); + return kvm_skip_emulated_instruction(vcpu); + } + break; + case 3: /* lmsw */ + val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; + trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val)); + kvm_lmsw(vcpu, val); + + return kvm_skip_emulated_instruction(vcpu); + default: + break; + } + vcpu->run->exit_reason = 0; + vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", + (int)(exit_qualification >> 4) & 3, cr); + return 0; +} + +static int handle_dr(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + int dr, dr7, reg; + int err = 1; + + exit_qualification = vmx_get_exit_qual(vcpu); + dr = exit_qualification & DEBUG_REG_ACCESS_NUM; + + /* First, if DR does not exist, trigger UD */ + if (!kvm_require_dr(vcpu, dr)) + return 1; + + if (vmx_get_cpl(vcpu) > 0) + goto out; + + dr7 = vmcs_readl(GUEST_DR7); + if (dr7 & DR7_GD) { + /* + * As the vm-exit takes precedence over the debug trap, we + * need to emulate the latter, either for the host or the + * guest debugging itself. + */ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { + vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW; + vcpu->run->debug.arch.dr7 = dr7; + vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); + vcpu->run->debug.arch.exception = DB_VECTOR; + vcpu->run->exit_reason = KVM_EXIT_DEBUG; + return 0; + } else { + kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD); + return 1; + } + } + + if (vcpu->guest_debug == 0) { + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); + + /* + * No more DR vmexits; force a reload of the debug registers + * and reenter on this instruction. The next vmexit will + * retrieve the full state of the debug registers. + */ + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + return 1; + } + + reg = DEBUG_REG_ACCESS_REG(exit_qualification); + if (exit_qualification & TYPE_MOV_FROM_DR) { + unsigned long val; + + kvm_get_dr(vcpu, dr, &val); + kvm_register_write(vcpu, reg, val); + err = 0; + } else { + err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); + } + +out: + return kvm_complete_insn_gp(vcpu, err); +} + +static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +{ + get_debugreg(vcpu->arch.db[0], 0); + get_debugreg(vcpu->arch.db[1], 1); + get_debugreg(vcpu->arch.db[2], 2); + get_debugreg(vcpu->arch.db[3], 3); + get_debugreg(vcpu->arch.dr6, 6); + vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); + + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; + exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); + + /* + * exc_debug expects dr6 to be cleared after it runs, avoid that it sees + * a stale dr6 from the guest. + */ + set_debugreg(DR6_RESERVED, 6); +} + +static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) +{ + vmcs_writel(GUEST_DR7, val); +} + +static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) +{ + kvm_apic_update_ppr(vcpu); + return 1; +} + +static int handle_interrupt_window(struct kvm_vcpu *vcpu) +{ + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); + + kvm_make_request(KVM_REQ_EVENT, vcpu); + + ++vcpu->stat.irq_window_exits; + return 1; +} + +static int handle_invlpg(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); + + kvm_mmu_invlpg(vcpu, exit_qualification); + return kvm_skip_emulated_instruction(vcpu); +} + +static int handle_apic_access(struct kvm_vcpu *vcpu) +{ + if (likely(fasteoi)) { + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); + int access_type, offset; + + access_type = exit_qualification & APIC_ACCESS_TYPE; + offset = exit_qualification & APIC_ACCESS_OFFSET; + /* + * Sane guest uses MOV to write EOI, with written value + * not cared. So make a short-circuit here by avoiding + * heavy instruction emulation. + */ + if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && + (offset == APIC_EOI)) { + kvm_lapic_set_eoi(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + } + return kvm_emulate_instruction(vcpu, 0); +} + +static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); + int vector = exit_qualification & 0xff; + + /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ + kvm_apic_set_eoi_accelerated(vcpu, vector); + return 1; +} + +static int handle_apic_write(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); + + /* + * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and + * hardware has done any necessary aliasing, offset adjustments, etc... + * for the access. I.e. the correct value has already been written to + * the vAPIC page for the correct 16-byte chunk. KVM needs only to + * retrieve the register value and emulate the access. + */ + u32 offset = exit_qualification & 0xff0; + + kvm_apic_write_nodecode(vcpu, offset); + return 1; +} + +static int handle_task_switch(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long exit_qualification; + bool has_error_code = false; + u32 error_code = 0; + u16 tss_selector; + int reason, type, idt_v, idt_index; + + idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); + idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); + type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); + + exit_qualification = vmx_get_exit_qual(vcpu); + + reason = (u32)exit_qualification >> 30; + if (reason == TASK_SWITCH_GATE && idt_v) { + switch (type) { + case INTR_TYPE_NMI_INTR: + vcpu->arch.nmi_injected = false; + vmx_set_nmi_mask(vcpu, true); + break; + case INTR_TYPE_EXT_INTR: + case INTR_TYPE_SOFT_INTR: + kvm_clear_interrupt_queue(vcpu); + break; + case INTR_TYPE_HARD_EXCEPTION: + if (vmx->idt_vectoring_info & + VECTORING_INFO_DELIVER_CODE_MASK) { + has_error_code = true; + error_code = + vmcs_read32(IDT_VECTORING_ERROR_CODE); + } + fallthrough; + case INTR_TYPE_SOFT_EXCEPTION: + kvm_clear_exception_queue(vcpu); + break; + default: + break; + } + } + tss_selector = exit_qualification; + + if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && + type != INTR_TYPE_EXT_INTR && + type != INTR_TYPE_NMI_INTR)) + WARN_ON(!skip_emulated_instruction(vcpu)); + + /* + * TODO: What about debug traps on tss switch? + * Are we supposed to inject them and update dr6? + */ + return kvm_task_switch(vcpu, tss_selector, + type == INTR_TYPE_SOFT_INTR ? idt_index : -1, + reason, has_error_code, error_code); +} + +static int handle_ept_violation(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + gpa_t gpa; + u64 error_code; + + exit_qualification = vmx_get_exit_qual(vcpu); + + /* + * EPT violation happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + * There are errata that may cause this bit to not be set: + * AAK134, BY25. + */ + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + enable_vnmi && + (exit_qualification & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); + + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + trace_kvm_page_fault(vcpu, gpa, exit_qualification); + + /* Is it a read fault? */ + error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) + ? PFERR_USER_MASK : 0; + /* Is it a write fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) + ? PFERR_WRITE_MASK : 0; + /* Is it a fetch fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) + ? PFERR_FETCH_MASK : 0; + /* ept page table entry is present? */ + error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK) + ? PFERR_PRESENT_MASK : 0; + + error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ? + PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; + + vcpu->arch.exit_qualification = exit_qualification; + + /* + * Check that the GPA doesn't exceed physical memory limits, as that is + * a guest page fault. We have to emulate the instruction here, because + * if the illegal address is that of a paging structure, then + * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we + * would also use advanced VM-exit information for EPT violations to + * reconstruct the page fault error code. + */ + if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa))) + return kvm_emulate_instruction(vcpu, 0); + + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); +} + +static int handle_ept_misconfig(struct kvm_vcpu *vcpu) +{ + gpa_t gpa; + + if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0)) + return 1; + + /* + * A nested guest cannot optimize MMIO vmexits, because we have an + * nGPA here instead of the required GPA. + */ + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + if (!is_guest_mode(vcpu) && + !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { + trace_kvm_fast_mmio(gpa); + return kvm_skip_emulated_instruction(vcpu); + } + + return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); +} + +static int handle_nmi_window(struct kvm_vcpu *vcpu) +{ + if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm)) + return -EIO; + + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); + ++vcpu->stat.nmi_window_exits; + kvm_make_request(KVM_REQ_EVENT, vcpu); + + return 1; +} + +static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + return vmx->emulation_required && !vmx->rmode.vm86_active && + (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected); +} + +static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool intr_window_requested; + unsigned count = 130; + + intr_window_requested = exec_controls_get(vmx) & + CPU_BASED_INTR_WINDOW_EXITING; + + while (vmx->emulation_required && count-- != 0) { + if (intr_window_requested && !vmx_interrupt_blocked(vcpu)) + return handle_interrupt_window(&vmx->vcpu); + + if (kvm_test_request(KVM_REQ_EVENT, vcpu)) + return 1; + + if (!kvm_emulate_instruction(vcpu, 0)) + return 0; + + if (vmx_emulation_required_with_pending_exception(vcpu)) { + kvm_prepare_emulation_failure_exit(vcpu); + return 0; + } + + if (vcpu->arch.halt_request) { + vcpu->arch.halt_request = 0; + return kvm_emulate_halt_noskip(vcpu); + } + + /* + * Note, return 1 and not 0, vcpu_run() will invoke + * xfer_to_guest_mode() which will create a proper return + * code. + */ + if (__xfer_to_guest_mode_work_pending()) + return 1; + } + + return 1; +} + +static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + if (vmx_emulation_required_with_pending_exception(vcpu)) { + kvm_prepare_emulation_failure_exit(vcpu); + return 0; + } + + return 1; +} + +static void grow_ple_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned int old = vmx->ple_window; + + vmx->ple_window = __grow_ple_window(old, ple_window, + ple_window_grow, + ple_window_max); + + if (vmx->ple_window != old) { + vmx->ple_window_dirty = true; + trace_kvm_ple_window_update(vcpu->vcpu_id, + vmx->ple_window, old); + } +} + +static void shrink_ple_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned int old = vmx->ple_window; + + vmx->ple_window = __shrink_ple_window(old, ple_window, + ple_window_shrink, + ple_window); + + if (vmx->ple_window != old) { + vmx->ple_window_dirty = true; + trace_kvm_ple_window_update(vcpu->vcpu_id, + vmx->ple_window, old); + } +} + +/* + * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE + * exiting, so only get here on cpu with PAUSE-Loop-Exiting. + */ +static int handle_pause(struct kvm_vcpu *vcpu) +{ + if (!kvm_pause_in_guest(vcpu->kvm)) + grow_ple_window(vcpu); + + /* + * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting" + * VM-execution control is ignored if CPL > 0. OTOH, KVM + * never set PAUSE_EXITING and just set PLE if supported, + * so the vcpu must be CPL=0 if it gets a PAUSE exit. + */ + kvm_vcpu_on_spin(vcpu, true); + return kvm_skip_emulated_instruction(vcpu); +} + +static int handle_monitor_trap(struct kvm_vcpu *vcpu) +{ + return 1; +} + +static int handle_invpcid(struct kvm_vcpu *vcpu) +{ + u32 vmx_instruction_info; + unsigned long type; + gva_t gva; + struct { + u64 pcid; + u64 gla; + } operand; + int gpr_index; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); + type = kvm_register_read(vcpu, gpr_index); + + /* According to the Intel instruction reference, the memory operand + * is read even if it isn't needed (e.g., for type==all) + */ + if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), + vmx_instruction_info, false, + sizeof(operand), &gva)) + return 1; + + return kvm_handle_invpcid(vcpu, type, gva); +} + +static int handle_pml_full(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + + trace_kvm_pml_full(vcpu->vcpu_id); + + exit_qualification = vmx_get_exit_qual(vcpu); + + /* + * PML buffer FULL happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + */ + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + enable_vnmi && + (exit_qualification & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + + /* + * PML buffer already flushed at beginning of VMEXIT. Nothing to do + * here.., and there's no userspace involvement needed for PML. + */ + return 1; +} + +static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!vmx->req_immediate_exit && + !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) { + kvm_lapic_expired_hv_timer(vcpu); + return EXIT_FASTPATH_REENTER_GUEST; + } + + return EXIT_FASTPATH_NONE; +} + +static int handle_preemption_timer(struct kvm_vcpu *vcpu) +{ + handle_fastpath_preemption_timer(vcpu); + return 1; +} + +/* + * When nested=0, all VMX instruction VM Exits filter here. The handlers + * are overwritten by nested_vmx_setup() when nested=1. + */ +static int handle_vmx_instruction(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + +#ifndef CONFIG_X86_SGX_KVM +static int handle_encls(struct kvm_vcpu *vcpu) +{ + /* + * SGX virtualization is disabled. There is no software enable bit for + * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent + * the guest from executing ENCLS (when SGX is supported by hardware). + */ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} +#endif /* CONFIG_X86_SGX_KVM */ + +static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) +{ + /* + * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK + * VM-Exits. Unconditionally set the flag here and leave the handling to + * vmx_handle_exit(). + */ + to_vmx(vcpu)->exit_reason.bus_lock_detected = true; + return 1; +} + +static int handle_notify(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qual = vmx_get_exit_qual(vcpu); + bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID; + + ++vcpu->stat.notify_window_exits; + + /* + * Notify VM exit happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + */ + if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + + if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER || + context_invalid) { + vcpu->run->exit_reason = KVM_EXIT_NOTIFY; + vcpu->run->notify.flags = context_invalid ? + KVM_NOTIFY_CONTEXT_INVALID : 0; + return 0; + } + + return 1; +} + +/* + * The exit handlers return 1 if the exit was handled fully and guest execution + * may resume. Otherwise they set the kvm_run parameter to indicate what needs + * to be done to userspace and return 0. + */ +static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { + [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi, + [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, + [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, + [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, + [EXIT_REASON_IO_INSTRUCTION] = handle_io, + [EXIT_REASON_CR_ACCESS] = handle_cr, + [EXIT_REASON_DR_ACCESS] = handle_dr, + [EXIT_REASON_CPUID] = kvm_emulate_cpuid, + [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr, + [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, + [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window, + [EXIT_REASON_HLT] = kvm_emulate_halt, + [EXIT_REASON_INVD] = kvm_emulate_invd, + [EXIT_REASON_INVLPG] = handle_invlpg, + [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc, + [EXIT_REASON_VMCALL] = kvm_emulate_hypercall, + [EXIT_REASON_VMCLEAR] = handle_vmx_instruction, + [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction, + [EXIT_REASON_VMPTRLD] = handle_vmx_instruction, + [EXIT_REASON_VMPTRST] = handle_vmx_instruction, + [EXIT_REASON_VMREAD] = handle_vmx_instruction, + [EXIT_REASON_VMRESUME] = handle_vmx_instruction, + [EXIT_REASON_VMWRITE] = handle_vmx_instruction, + [EXIT_REASON_VMOFF] = handle_vmx_instruction, + [EXIT_REASON_VMON] = handle_vmx_instruction, + [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, + [EXIT_REASON_APIC_ACCESS] = handle_apic_access, + [EXIT_REASON_APIC_WRITE] = handle_apic_write, + [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, + [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd, + [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv, + [EXIT_REASON_TASK_SWITCH] = handle_task_switch, + [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, + [EXIT_REASON_GDTR_IDTR] = handle_desc, + [EXIT_REASON_LDTR_TR] = handle_desc, + [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, + [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, + [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, + [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait, + [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, + [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor, + [EXIT_REASON_INVEPT] = handle_vmx_instruction, + [EXIT_REASON_INVVPID] = handle_vmx_instruction, + [EXIT_REASON_RDRAND] = kvm_handle_invalid_op, + [EXIT_REASON_RDSEED] = kvm_handle_invalid_op, + [EXIT_REASON_PML_FULL] = handle_pml_full, + [EXIT_REASON_INVPCID] = handle_invpcid, + [EXIT_REASON_VMFUNC] = handle_vmx_instruction, + [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, + [EXIT_REASON_ENCLS] = handle_encls, + [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, + [EXIT_REASON_NOTIFY] = handle_notify, +}; + +static const int kvm_vmx_max_exit_handlers = + ARRAY_SIZE(kvm_vmx_exit_handlers); + +static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, + u32 *intr_info, u32 *error_code) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + *reason = vmx->exit_reason.full; + *info1 = vmx_get_exit_qual(vcpu); + if (!(vmx->exit_reason.failed_vmentry)) { + *info2 = vmx->idt_vectoring_info; + *intr_info = vmx_get_intr_info(vcpu); + if (is_exception_with_error_code(*intr_info)) + *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + else + *error_code = 0; + } else { + *info2 = 0; + *intr_info = 0; + *error_code = 0; + } +} + +static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) +{ + if (vmx->pml_pg) { + __free_page(vmx->pml_pg); + vmx->pml_pg = NULL; + } +} + +static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 *pml_buf; + u16 pml_idx; + + pml_idx = vmcs_read16(GUEST_PML_INDEX); + + /* Do nothing if PML buffer is empty */ + if (pml_idx == (PML_ENTITY_NUM - 1)) + return; + + /* PML index always points to next available PML buffer entity */ + if (pml_idx >= PML_ENTITY_NUM) + pml_idx = 0; + else + pml_idx++; + + pml_buf = page_address(vmx->pml_pg); + for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { + u64 gpa; + + gpa = pml_buf[pml_idx]; + WARN_ON(gpa & (PAGE_SIZE - 1)); + kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); + } + + /* reset PML index */ + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); +} + +static void vmx_dump_sel(char *name, uint32_t sel) +{ + pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", + name, vmcs_read16(sel), + vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), + vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), + vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); +} + +static void vmx_dump_dtsel(char *name, uint32_t limit) +{ + pr_err("%s limit=0x%08x, base=0x%016lx\n", + name, vmcs_read32(limit), + vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); +} + +static void vmx_dump_msrs(char *name, struct vmx_msrs *m) +{ + unsigned int i; + struct vmx_msr_entry *e; + + pr_err("MSR %s:\n", name); + for (i = 0, e = m->val; i < m->nr; ++i, ++e) + pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value); +} + +void dump_vmcs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 vmentry_ctl, vmexit_ctl; + u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; + u64 tertiary_exec_control; + unsigned long cr4; + int efer_slot; + + if (!dump_invalid_vmcs) { + pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n"); + return; + } + + vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); + vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); + cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); + cr4 = vmcs_readl(GUEST_CR4); + + if (cpu_has_secondary_exec_ctrls()) + secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + else + secondary_exec_control = 0; + + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL); + else + tertiary_exec_control = 0; + + pr_err("VMCS %p, last attempted VM-entry on CPU %d\n", + vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu); + pr_err("*** Guest State ***\n"); + pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", + vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), + vmcs_readl(CR0_GUEST_HOST_MASK)); + pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", + cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); + pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); + if (cpu_has_vmx_ept()) { + pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", + vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); + pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", + vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); + } + pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", + vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); + pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", + vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); + pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", + vmcs_readl(GUEST_SYSENTER_ESP), + vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); + vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); + vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); + vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); + vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); + vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); + vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); + vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); + vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); + vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); + vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); + efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER); + if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER) + pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER)); + else if (efer_slot >= 0) + pr_err("EFER= 0x%016llx (autoload)\n", + vmx->msr_autoload.guest.val[efer_slot].value); + else if (vmentry_ctl & VM_ENTRY_IA32E_MODE) + pr_err("EFER= 0x%016llx (effective)\n", + vcpu->arch.efer | (EFER_LMA | EFER_LME)); + else + pr_err("EFER= 0x%016llx (effective)\n", + vcpu->arch.efer & ~(EFER_LMA | EFER_LME)); + if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT) + pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT)); + pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", + vmcs_read64(GUEST_IA32_DEBUGCTL), + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); + if (cpu_has_load_perf_global_ctrl() && + vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) + pr_err("PerfGlobCtl = 0x%016llx\n", + vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); + if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) + pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); + pr_err("Interruptibility = %08x ActivityState = %08x\n", + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), + vmcs_read32(GUEST_ACTIVITY_STATE)); + if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) + pr_err("InterruptStatus = %04x\n", + vmcs_read16(GUEST_INTR_STATUS)); + if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0) + vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest); + if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) + vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest); + + pr_err("*** Host State ***\n"); + pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", + vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); + pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", + vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), + vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), + vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), + vmcs_read16(HOST_TR_SELECTOR)); + pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", + vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), + vmcs_readl(HOST_TR_BASE)); + pr_err("GDTBase=%016lx IDTBase=%016lx\n", + vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); + pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", + vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), + vmcs_readl(HOST_CR4)); + pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", + vmcs_readl(HOST_IA32_SYSENTER_ESP), + vmcs_read32(HOST_IA32_SYSENTER_CS), + vmcs_readl(HOST_IA32_SYSENTER_EIP)); + if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER) + pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER)); + if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT) + pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT)); + if (cpu_has_load_perf_global_ctrl() && + vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) + pr_err("PerfGlobCtl = 0x%016llx\n", + vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); + if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0) + vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); + + pr_err("*** Control State ***\n"); + pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", + cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control); + pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n", + pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl); + pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", + vmcs_read32(EXCEPTION_BITMAP), + vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), + vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); + pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", + vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), + vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), + vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); + pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", + vmcs_read32(VM_EXIT_INTR_INFO), + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); + pr_err(" reason=%08x qualification=%016lx\n", + vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); + pr_err("IDTVectoring: info=%08x errcode=%08x\n", + vmcs_read32(IDT_VECTORING_INFO_FIELD), + vmcs_read32(IDT_VECTORING_ERROR_CODE)); + pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); + if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) + pr_err("TSC Multiplier = 0x%016llx\n", + vmcs_read64(TSC_MULTIPLIER)); + if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) { + if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { + u16 status = vmcs_read16(GUEST_INTR_STATUS); + pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff); + } + pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); + if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) + pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR)); + pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR)); + } + if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) + pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); + if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) + pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); + if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) + pr_err("PLE Gap=%08x Window=%08x\n", + vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); + if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) + pr_err("Virtual processor ID = 0x%04x\n", + vmcs_read16(VIRTUAL_PROCESSOR_ID)); +} + +/* + * The guest has exited. See if we can fix it or if we need userspace + * assistance. + */ +static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + union vmx_exit_reason exit_reason = vmx->exit_reason; + u32 vectoring_info = vmx->idt_vectoring_info; + u16 exit_handler_index; + + /* + * Flush logged GPAs PML buffer, this will make dirty_bitmap more + * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before + * querying dirty_bitmap, we only need to kick all vcpus out of guest + * mode as if vcpus is in root mode, the PML buffer must has been + * flushed already. Note, PML is never enabled in hardware while + * running L2. + */ + if (enable_pml && !is_guest_mode(vcpu)) + vmx_flush_pml_buffer(vcpu); + + /* + * KVM should never reach this point with a pending nested VM-Enter. + * More specifically, short-circuiting VM-Entry to emulate L2 due to + * invalid guest state should never happen as that means KVM knowingly + * allowed a nested VM-Enter with an invalid vmcs12. More below. + */ + if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm)) + return -EIO; + + if (is_guest_mode(vcpu)) { + /* + * PML is never enabled when running L2, bail immediately if a + * PML full exit occurs as something is horribly wrong. + */ + if (exit_reason.basic == EXIT_REASON_PML_FULL) + goto unexpected_vmexit; + + /* + * The host physical addresses of some pages of guest memory + * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC + * Page). The CPU may write to these pages via their host + * physical address while L2 is running, bypassing any + * address-translation-based dirty tracking (e.g. EPT write + * protection). + * + * Mark them dirty on every exit from L2 to prevent them from + * getting out of sync with dirty tracking. + */ + nested_mark_vmcs12_pages_dirty(vcpu); + + /* + * Synthesize a triple fault if L2 state is invalid. In normal + * operation, nested VM-Enter rejects any attempt to enter L2 + * with invalid state. However, those checks are skipped if + * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If + * L2 state is invalid, it means either L1 modified SMRAM state + * or userspace provided bad state. Synthesize TRIPLE_FAULT as + * doing so is architecturally allowed in the RSM case, and is + * the least awful solution for the userspace case without + * risking false positives. + */ + if (vmx->emulation_required) { + nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); + return 1; + } + + if (nested_vmx_reflect_vmexit(vcpu)) + return 1; + } + + /* If guest state is invalid, start emulating. L2 is handled above. */ + if (vmx->emulation_required) + return handle_invalid_guest_state(vcpu); + + if (exit_reason.failed_vmentry) { + dump_vmcs(vcpu); + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason + = exit_reason.full; + vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; + return 0; + } + + if (unlikely(vmx->fail)) { + dump_vmcs(vcpu); + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason + = vmcs_read32(VM_INSTRUCTION_ERROR); + vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; + return 0; + } + + /* + * Note: + * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by + * delivery event since it indicates guest is accessing MMIO. + * The vm-exit can be triggered again after return to guest that + * will cause infinite loop. + */ + if ((vectoring_info & VECTORING_INFO_VALID_MASK) && + (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI && + exit_reason.basic != EXIT_REASON_EPT_VIOLATION && + exit_reason.basic != EXIT_REASON_PML_FULL && + exit_reason.basic != EXIT_REASON_APIC_ACCESS && + exit_reason.basic != EXIT_REASON_TASK_SWITCH && + exit_reason.basic != EXIT_REASON_NOTIFY)) { + int ndata = 3; + + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; + vcpu->run->internal.data[0] = vectoring_info; + vcpu->run->internal.data[1] = exit_reason.full; + vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; + if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) { + vcpu->run->internal.data[ndata++] = + vmcs_read64(GUEST_PHYSICAL_ADDRESS); + } + vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu; + vcpu->run->internal.ndata = ndata; + return 0; + } + + if (unlikely(!enable_vnmi && + vmx->loaded_vmcs->soft_vnmi_blocked)) { + if (!vmx_interrupt_blocked(vcpu)) { + vmx->loaded_vmcs->soft_vnmi_blocked = 0; + } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL && + vcpu->arch.nmi_pending) { + /* + * This CPU don't support us in finding the end of an + * NMI-blocked window if the guest runs with IRQs + * disabled. So we pull the trigger after 1 s of + * futile waiting, but inform the user about this. + */ + printk(KERN_WARNING "%s: Breaking out of NMI-blocked " + "state on VCPU %d after 1 s timeout\n", + __func__, vcpu->vcpu_id); + vmx->loaded_vmcs->soft_vnmi_blocked = 0; + } + } + + if (exit_fastpath != EXIT_FASTPATH_NONE) + return 1; + + if (exit_reason.basic >= kvm_vmx_max_exit_handlers) + goto unexpected_vmexit; +#ifdef CONFIG_RETPOLINE + if (exit_reason.basic == EXIT_REASON_MSR_WRITE) + return kvm_emulate_wrmsr(vcpu); + else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER) + return handle_preemption_timer(vcpu); + else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW) + return handle_interrupt_window(vcpu); + else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) + return handle_external_interrupt(vcpu); + else if (exit_reason.basic == EXIT_REASON_HLT) + return kvm_emulate_halt(vcpu); + else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) + return handle_ept_misconfig(vcpu); +#endif + + exit_handler_index = array_index_nospec((u16)exit_reason.basic, + kvm_vmx_max_exit_handlers); + if (!kvm_vmx_exit_handlers[exit_handler_index]) + goto unexpected_vmexit; + + return kvm_vmx_exit_handlers[exit_handler_index](vcpu); + +unexpected_vmexit: + vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", + exit_reason.full); + dump_vmcs(vcpu); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = exit_reason.full; + vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; + return 0; +} + +static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) +{ + int ret = __vmx_handle_exit(vcpu, exit_fastpath); + + /* + * Exit to user space when bus lock detected to inform that there is + * a bus lock in guest. + */ + if (to_vmx(vcpu)->exit_reason.bus_lock_detected) { + if (ret > 0) + vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; + + vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; + return 0; + } + return ret; +} + +/* + * Software based L1D cache flush which is used when microcode providing + * the cache control MSR is not loaded. + * + * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to + * flush it is required to read in 64 KiB because the replacement algorithm + * is not exactly LRU. This could be sized at runtime via topology + * information but as all relevant affected CPUs have 32KiB L1D cache size + * there is no point in doing so. + */ +static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) +{ + int size = PAGE_SIZE << L1D_CACHE_ORDER; + + /* + * This code is only executed when the flush mode is 'cond' or + * 'always' + */ + if (static_branch_likely(&vmx_l1d_flush_cond)) { + bool flush_l1d; + + /* + * Clear the per-vcpu flush bit, it gets set again + * either from vcpu_run() or from one of the unsafe + * VMEXIT handlers. + */ + flush_l1d = vcpu->arch.l1tf_flush_l1d; + vcpu->arch.l1tf_flush_l1d = false; + + /* + * Clear the per-cpu flush bit, it gets set again from + * the interrupt handlers. + */ + flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); + kvm_clear_cpu_l1tf_flush_l1d(); + + if (!flush_l1d) + return; + } + + vcpu->stat.l1d_flush++; + + if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { + native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + return; + } + + asm volatile( + /* First ensure the pages are in the TLB */ + "xorl %%eax, %%eax\n" + ".Lpopulate_tlb:\n\t" + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" + "addl $4096, %%eax\n\t" + "cmpl %%eax, %[size]\n\t" + "jne .Lpopulate_tlb\n\t" + "xorl %%eax, %%eax\n\t" + "cpuid\n\t" + /* Now fill the cache */ + "xorl %%eax, %%eax\n" + ".Lfill_cache:\n" + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" + "addl $64, %%eax\n\t" + "cmpl %%eax, %[size]\n\t" + "jne .Lfill_cache\n\t" + "lfence\n" + :: [flush_pages] "r" (vmx_l1d_flush_pages), + [size] "r" (size) + : "eax", "ebx", "ecx", "edx"); +} + +static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + int tpr_threshold; + + if (is_guest_mode(vcpu) && + nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) + return; + + tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr; + if (is_guest_mode(vcpu)) + to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold; + else + vmcs_write32(TPR_THRESHOLD, tpr_threshold); +} + +void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 sec_exec_control; + + if (!lapic_in_kernel(vcpu)) + return; + + if (!flexpriority_enabled && + !cpu_has_vmx_virtualize_x2apic_mode()) + return; + + /* Postpone execution until vmcs01 is the current VMCS. */ + if (is_guest_mode(vcpu)) { + vmx->nested.change_vmcs01_virtual_apic_mode = true; + return; + } + + sec_exec_control = secondary_exec_controls_get(vmx); + sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); + + switch (kvm_get_apic_mode(vcpu)) { + case LAPIC_MODE_INVALID: + WARN_ONCE(true, "Invalid local APIC state"); + break; + case LAPIC_MODE_DISABLED: + break; + case LAPIC_MODE_XAPIC: + if (flexpriority_enabled) { + sec_exec_control |= + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); + + /* + * Flush the TLB, reloading the APIC access page will + * only do so if its physical address has changed, but + * the guest may have inserted a non-APIC mapping into + * the TLB while the APIC access page was disabled. + */ + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } + break; + case LAPIC_MODE_X2APIC: + if (cpu_has_vmx_virtualize_x2apic_mode()) + sec_exec_control |= + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + break; + } + secondary_exec_controls_set(vmx, sec_exec_control); + + vmx_update_msr_bitmap_x2apic(vcpu); +} + +static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) +{ + struct page *page; + + /* Defer reload until vmcs01 is the current VMCS. */ + if (is_guest_mode(vcpu)) { + to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true; + return; + } + + if (!(secondary_exec_controls_get(to_vmx(vcpu)) & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + return; + + page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); + if (is_error_page(page)) + return; + + vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page)); + vmx_flush_tlb_current(vcpu); + + /* + * Do not pin apic access page in memory, the MMU notifier + * will call us again if it is migrated or swapped out. + */ + put_page(page); +} + +static void vmx_hwapic_isr_update(int max_isr) +{ + u16 status; + u8 old; + + if (max_isr == -1) + max_isr = 0; + + status = vmcs_read16(GUEST_INTR_STATUS); + old = status >> 8; + if (max_isr != old) { + status &= 0xff; + status |= max_isr << 8; + vmcs_write16(GUEST_INTR_STATUS, status); + } +} + +static void vmx_set_rvi(int vector) +{ + u16 status; + u8 old; + + if (vector == -1) + vector = 0; + + status = vmcs_read16(GUEST_INTR_STATUS); + old = (u8)status & 0xff; + if ((u8)vector != old) { + status &= ~0xff; + status |= (u8)vector; + vmcs_write16(GUEST_INTR_STATUS, status); + } +} + +static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) +{ + /* + * When running L2, updating RVI is only relevant when + * vmcs12 virtual-interrupt-delivery enabled. + * However, it can be enabled only when L1 also + * intercepts external-interrupts and in that case + * we should not update vmcs02 RVI but instead intercept + * interrupt. Therefore, do nothing when running L2. + */ + if (!is_guest_mode(vcpu)) + vmx_set_rvi(max_irr); +} + +static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int max_irr; + bool got_posted_interrupt; + + if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) + return -EIO; + + if (pi_test_on(&vmx->pi_desc)) { + pi_clear_on(&vmx->pi_desc); + /* + * IOMMU can write to PID.ON, so the barrier matters even on UP. + * But on x86 this is just a compiler barrier anyway. + */ + smp_mb__after_atomic(); + got_posted_interrupt = + kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); + } else { + max_irr = kvm_lapic_find_highest_irr(vcpu); + got_posted_interrupt = false; + } + + /* + * Newly recognized interrupts are injected via either virtual interrupt + * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is + * disabled in two cases: + * + * 1) If L2 is running and the vCPU has a new pending interrupt. If L1 + * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a + * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected + * into L2, but KVM doesn't use virtual interrupt delivery to inject + * interrupts into L2, and so KVM_REQ_EVENT is again needed. + * + * 2) If APICv is disabled for this vCPU, assigned devices may still + * attempt to post interrupts. The posted interrupt vector will cause + * a VM-Exit and the subsequent entry will call sync_pir_to_irr. + */ + if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu)) + vmx_set_rvi(max_irr); + else if (got_posted_interrupt) + kvm_make_request(KVM_REQ_EVENT, vcpu); + + return max_irr; +} + +static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + if (!kvm_vcpu_apicv_active(vcpu)) + return; + + vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); + vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); + vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); + vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); +} + +static void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + pi_clear_on(&vmx->pi_desc); + memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); +} + +void vmx_do_interrupt_nmi_irqoff(unsigned long entry); + +static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, + unsigned long entry) +{ + bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist; + + kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ); + vmx_do_interrupt_nmi_irqoff(entry); + kvm_after_interrupt(vcpu); +} + +static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) +{ + /* + * Save xfd_err to guest_fpu before interrupt is enabled, so the + * MSR value is not clobbered by the host activity before the guest + * has chance to consume it. + * + * Do not blindly read xfd_err here, since this exception might + * be caused by L1 interception on a platform which doesn't + * support xfd at all. + * + * Do it conditionally upon guest_fpu::xfd. xfd_err matters + * only when xfd contains a non-zero value. + * + * Queuing exception is done in vmx_handle_exit. See comment there. + */ + if (vcpu->arch.guest_fpu.fpstate->xfd) + rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); +} + +static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) +{ + const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist; + u32 intr_info = vmx_get_intr_info(&vmx->vcpu); + + /* if exit due to PF check for async PF */ + if (is_page_fault(intr_info)) + vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); + /* if exit due to NM, handle before interrupts are enabled */ + else if (is_nm_fault(intr_info)) + handle_nm_fault_irqoff(&vmx->vcpu); + /* Handle machine checks before interrupts are enabled */ + else if (is_machine_check(intr_info)) + kvm_machine_check(); + /* We need to handle NMIs before interrupts are enabled */ + else if (is_nmi(intr_info)) + handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry); +} + +static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) +{ + u32 intr_info = vmx_get_intr_info(vcpu); + unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; + gate_desc *desc = (gate_desc *)host_idt_base + vector; + + if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, + "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) + return; + + handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc)); + vcpu->arch.at_instruction_boundary = true; +} + +static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vmx->emulation_required) + return; + + if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) + handle_external_interrupt_irqoff(vcpu); + else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) + handle_exception_nmi_irqoff(vmx); +} + +/* + * The kvm parameter can be NULL (module initialization, or invocation before + * VM creation). Be sure to check the kvm parameter before using it. + */ +static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) +{ + switch (index) { + case MSR_IA32_SMBASE: + /* + * We cannot do SMM unless we can run the guest in big + * real mode. + */ + return enable_unrestricted_guest || emulate_invalid_guest_state; + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + return nested; + case MSR_AMD64_VIRT_SPEC_CTRL: + case MSR_AMD64_TSC_RATIO: + /* This is AMD only. */ + return false; + default: + return true; + } +} + +static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) +{ + u32 exit_intr_info; + bool unblock_nmi; + u8 vector; + bool idtv_info_valid; + + idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; + + if (enable_vnmi) { + if (vmx->loaded_vmcs->nmi_known_unmasked) + return; + + exit_intr_info = vmx_get_intr_info(&vmx->vcpu); + unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + /* + * SDM 3: 27.7.1.2 (September 2008) + * Re-set bit "block by NMI" before VM entry if vmexit caused by + * a guest IRET fault. + * SDM 3: 23.2.2 (September 2008) + * Bit 12 is undefined in any of the following cases: + * If the VM exit sets the valid bit in the IDT-vectoring + * information field. + * If the VM exit is due to a double fault. + */ + if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && + vector != DF_VECTOR && !idtv_info_valid) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmx->loaded_vmcs->nmi_known_unmasked = + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) + & GUEST_INTR_STATE_NMI); + } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked)) + vmx->loaded_vmcs->vnmi_blocked_time += + ktime_to_ns(ktime_sub(ktime_get(), + vmx->loaded_vmcs->entry_time)); +} + +static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, + u32 idt_vectoring_info, + int instr_len_field, + int error_code_field) +{ + u8 vector; + int type; + bool idtv_info_valid; + + idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; + + vcpu->arch.nmi_injected = false; + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); + + if (!idtv_info_valid) + return; + + kvm_make_request(KVM_REQ_EVENT, vcpu); + + vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; + type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; + + switch (type) { + case INTR_TYPE_NMI_INTR: + vcpu->arch.nmi_injected = true; + /* + * SDM 3: 27.7.1.2 (September 2008) + * Clear bit "block by NMI" before VM entry if a NMI + * delivery faulted. + */ + vmx_set_nmi_mask(vcpu, false); + break; + case INTR_TYPE_SOFT_EXCEPTION: + vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); + fallthrough; + case INTR_TYPE_HARD_EXCEPTION: + if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { + u32 err = vmcs_read32(error_code_field); + kvm_requeue_exception_e(vcpu, vector, err); + } else + kvm_requeue_exception(vcpu, vector); + break; + case INTR_TYPE_SOFT_INTR: + vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); + fallthrough; + case INTR_TYPE_EXT_INTR: + kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); + break; + default: + break; + } +} + +static void vmx_complete_interrupts(struct vcpu_vmx *vmx) +{ + __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, + VM_EXIT_INSTRUCTION_LEN, + IDT_VECTORING_ERROR_CODE); +} + +static void vmx_cancel_injection(struct kvm_vcpu *vcpu) +{ + __vmx_complete_interrupts(vcpu, + vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), + VM_ENTRY_INSTRUCTION_LEN, + VM_ENTRY_EXCEPTION_ERROR_CODE); + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); +} + +static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) +{ + int i, nr_msrs; + struct perf_guest_switch_msr *msrs; + struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); + + pmu->host_cross_mapped_mask = 0; + if (pmu->pebs_enable & pmu->global_ctrl) + intel_pmu_cross_mapped_check(pmu); + + /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */ + msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu); + if (!msrs) + return; + + for (i = 0; i < nr_msrs; i++) + if (msrs[i].host == msrs[i].guest) + clear_atomic_switch_msr(vmx, msrs[i].msr); + else + add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, + msrs[i].host, false); +} + +static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 tscl; + u32 delta_tsc; + + if (vmx->req_immediate_exit) { + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0); + vmx->loaded_vmcs->hv_timer_soft_disabled = false; + } else if (vmx->hv_deadline_tsc != -1) { + tscl = rdtsc(); + if (vmx->hv_deadline_tsc > tscl) + /* set_hv_timer ensures the delta fits in 32-bits */ + delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> + cpu_preemption_timer_multi); + else + delta_tsc = 0; + + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); + vmx->loaded_vmcs->hv_timer_soft_disabled = false; + } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) { + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1); + vmx->loaded_vmcs->hv_timer_soft_disabled = true; + } +} + +void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) +{ + if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) { + vmx->loaded_vmcs->host_state.rsp = host_rsp; + vmcs_writel(HOST_RSP, host_rsp); + } +} + +void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, + unsigned int flags) +{ + u64 hostval = this_cpu_read(x86_spec_ctrl_current); + + if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) + return; + + if (flags & VMX_RUN_SAVE_SPEC_CTRL) + vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL); + + /* + * If the guest/host SPEC_CTRL values differ, restore the host value. + * + * For legacy IBRS, the IBRS bit always needs to be written after + * transitioning from a less privileged predictor mode, regardless of + * whether the guest/host values differ. + */ + if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || + vmx->spec_ctrl != hostval) + native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval); + + barrier_nospec(); +} + +static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) +{ + switch (to_vmx(vcpu)->exit_reason.basic) { + case EXIT_REASON_MSR_WRITE: + return handle_fastpath_set_msr_irqoff(vcpu); + case EXIT_REASON_PREEMPTION_TIMER: + return handle_fastpath_preemption_timer(vcpu); + default: + return EXIT_FASTPATH_NONE; + } +} + +static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, + struct vcpu_vmx *vmx, + unsigned long flags) +{ + guest_state_enter_irqoff(); + + /* L1D Flush includes CPU buffer clear to mitigate MDS */ + if (static_branch_unlikely(&vmx_l1d_should_flush)) + vmx_l1d_flush(vcpu); + else if (static_branch_unlikely(&mds_user_clear)) + mds_clear_cpu_buffers(); + else if (static_branch_unlikely(&mmio_stale_data_clear) && + kvm_arch_has_assigned_device(vcpu->kvm)) + mds_clear_cpu_buffers(); + + vmx_disable_fb_clear(vmx); + + if (vcpu->arch.cr2 != native_read_cr2()) + native_write_cr2(vcpu->arch.cr2); + + vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, + flags); + + vcpu->arch.cr2 = native_read_cr2(); + + vmx_enable_fb_clear(vmx); + + guest_state_exit_irqoff(); +} + +static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long cr3, cr4; + + /* Record the guest's net vcpu time for enforced NMI injections. */ + if (unlikely(!enable_vnmi && + vmx->loaded_vmcs->soft_vnmi_blocked)) + vmx->loaded_vmcs->entry_time = ktime_get(); + + /* + * Don't enter VMX if guest state is invalid, let the exit handler + * start emulation until we arrive back to a valid state. Synthesize a + * consistency check VM-Exit due to invalid guest state and bail. + */ + if (unlikely(vmx->emulation_required)) { + vmx->fail = 0; + + vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; + vmx->exit_reason.failed_vmentry = 1; + kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); + vmx->exit_qualification = ENTRY_FAIL_DEFAULT; + kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); + vmx->exit_intr_info = 0; + return EXIT_FASTPATH_NONE; + } + + trace_kvm_entry(vcpu); + + if (vmx->ple_window_dirty) { + vmx->ple_window_dirty = false; + vmcs_write32(PLE_WINDOW, vmx->ple_window); + } + + /* + * We did this in prepare_switch_to_guest, because it needs to + * be within srcu_read_lock. + */ + WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync); + + if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP)) + vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); + if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP)) + vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + vcpu->arch.regs_dirty = 0; + + /* + * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately + * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time + * it switches back to the current->mm, which can occur in KVM context + * when switching to a temporary mm to patch kernel code, e.g. if KVM + * toggles a static key while handling a VM-Exit. + */ + cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { + vmcs_writel(HOST_CR3, cr3); + vmx->loaded_vmcs->host_state.cr3 = cr3; + } + + cr4 = cr4_read_shadow(); + if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { + vmcs_writel(HOST_CR4, cr4); + vmx->loaded_vmcs->host_state.cr4 = cr4; + } + + /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + set_debugreg(vcpu->arch.dr6, 6); + + /* When single-stepping over STI and MOV SS, we must clear the + * corresponding interruptibility bits in the guest state. Otherwise + * vmentry fails as it then expects bit 14 (BS) in pending debug + * exceptions being set, but that's not correct for the guest debugging + * case. */ + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + vmx_set_interrupt_shadow(vcpu, 0); + + kvm_load_guest_xsave_state(vcpu); + + pt_guest_enter(vmx); + + atomic_switch_perf_msrs(vmx); + if (intel_pmu_lbr_is_enabled(vcpu)) + vmx_passthrough_lbr_msrs(vcpu); + + if (enable_preemption_timer) + vmx_update_hv_timer(vcpu); + + kvm_wait_lapic_expire(vcpu); + + /* The actual VMENTER/EXIT is in the .noinstr.text section. */ + vmx_vcpu_enter_exit(vcpu, vmx, __vmx_vcpu_run_flags(vmx)); + + /* All fields are clean at this point */ + if (static_branch_unlikely(&enable_evmcs)) { + current_evmcs->hv_clean_fields |= + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + + current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu); + } + + /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ + if (vmx->host_debugctlmsr) + update_debugctlmsr(vmx->host_debugctlmsr); + +#ifndef CONFIG_X86_64 + /* + * The sysexit path does not restore ds/es, so we must set them to + * a reasonable value ourselves. + * + * We can't defer this to vmx_prepare_switch_to_host() since that + * function may be executed in interrupt context, which saves and + * restore segments around it, nullifying its effect. + */ + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); +#endif + + vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET; + + pt_guest_exit(vmx); + + kvm_load_host_xsave_state(vcpu); + + if (is_guest_mode(vcpu)) { + /* + * Track VMLAUNCH/VMRESUME that have made past guest state + * checking. + */ + if (vmx->nested.nested_run_pending && + !vmx->exit_reason.failed_vmentry) + ++vcpu->stat.nested_run; + + vmx->nested.nested_run_pending = 0; + } + + vmx->idt_vectoring_info = 0; + + if (unlikely(vmx->fail)) { + vmx->exit_reason.full = 0xdead; + return EXIT_FASTPATH_NONE; + } + + vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); + if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY)) + kvm_machine_check(); + + if (likely(!vmx->exit_reason.failed_vmentry)) + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); + + trace_kvm_exit(vcpu, KVM_ISA_VMX); + + if (unlikely(vmx->exit_reason.failed_vmentry)) + return EXIT_FASTPATH_NONE; + + vmx->loaded_vmcs->launched = 1; + + vmx_recover_nmi_blocking(vmx); + vmx_complete_interrupts(vmx); + + if (is_guest_mode(vcpu)) + return EXIT_FASTPATH_NONE; + + return vmx_exit_handlers_fastpath(vcpu); +} + +static void vmx_vcpu_free(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (enable_pml) + vmx_destroy_pml_buffer(vmx); + free_vpid(vmx->vpid); + nested_vmx_free_vcpu(vcpu); + free_loaded_vmcs(vmx->loaded_vmcs); +} + +static int vmx_vcpu_create(struct kvm_vcpu *vcpu) +{ + struct vmx_uret_msr *tsx_ctrl; + struct vcpu_vmx *vmx; + int i, err; + + BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); + vmx = to_vmx(vcpu); + + INIT_LIST_HEAD(&vmx->pi_wakeup_list); + + err = -ENOMEM; + + vmx->vpid = allocate_vpid(); + + /* + * If PML is turned on, failure on enabling PML just results in failure + * of creating the vcpu, therefore we can simplify PML logic (by + * avoiding dealing with cases, such as enabling PML partially on vcpus + * for the guest), etc. + */ + if (enable_pml) { + vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!vmx->pml_pg) + goto free_vpid; + } + + for (i = 0; i < kvm_nr_uret_msrs; ++i) + vmx->guest_uret_msrs[i].mask = -1ull; + if (boot_cpu_has(X86_FEATURE_RTM)) { + /* + * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception. + * Keep the host value unchanged to avoid changing CPUID bits + * under the host kernel's feet. + */ + tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); + if (tsx_ctrl) + tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR; + } + + err = alloc_loaded_vmcs(&vmx->vmcs01); + if (err < 0) + goto free_pml; + + /* + * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a + * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the + * feature only for vmcs01, KVM currently isn't equipped to realize any + * performance benefits from enabling it for vmcs02. + */ + if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) && + (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { + struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; + + evmcs->hv_enlightenments_control.msr_bitmap = 1; + } + + /* The MSR bitmap starts with all ones */ + bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS); + bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS); + + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); +#ifdef CONFIG_X86_64 + vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); +#endif + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); + if (kvm_cstate_in_guest(vcpu->kvm)) { + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); + } + + vmx->loaded_vmcs = &vmx->vmcs01; + + if (cpu_need_virtualize_apic_accesses(vcpu)) { + err = alloc_apic_access_page(vcpu->kvm); + if (err) + goto free_vmcs; + } + + if (enable_ept && !enable_unrestricted_guest) { + err = init_rmode_identity_map(vcpu->kvm); + if (err) + goto free_vmcs; + } + + if (vmx_can_use_ipiv(vcpu)) + WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], + __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); + + return 0; + +free_vmcs: + free_loaded_vmcs(vmx->loaded_vmcs); +free_pml: + vmx_destroy_pml_buffer(vmx); +free_vpid: + free_vpid(vmx->vpid); + return err; +} + +#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" +#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" + +static int vmx_vm_init(struct kvm *kvm) +{ + if (!ple_gap) + kvm->arch.pause_in_guest = true; + + if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { + switch (l1tf_mitigation) { + case L1TF_MITIGATION_OFF: + case L1TF_MITIGATION_FLUSH_NOWARN: + /* 'I explicitly don't care' is set */ + break; + case L1TF_MITIGATION_FLUSH: + case L1TF_MITIGATION_FLUSH_NOSMT: + case L1TF_MITIGATION_FULL: + /* + * Warn upon starting the first VM in a potentially + * insecure environment. + */ + if (sched_smt_active()) + pr_warn_once(L1TF_MSG_SMT); + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) + pr_warn_once(L1TF_MSG_L1D); + break; + case L1TF_MITIGATION_FULL_FORCE: + /* Flush is enforced */ + break; + } + } + return 0; +} + +static int __init vmx_check_processor_compat(void) +{ + struct vmcs_config vmcs_conf; + struct vmx_capability vmx_cap; + + if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || + !this_cpu_has(X86_FEATURE_VMX)) { + pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id()); + return -EIO; + } + + if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) + return -EIO; + if (nested) + nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); + if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { + printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", + smp_processor_id()); + return -EIO; + } + return 0; +} + +static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) +{ + u8 cache; + + /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in + * memory aliases with conflicting memory types and sometimes MCEs. + * We have to be careful as to what are honored and when. + * + * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to + * UC. The effective memory type is UC or WC depending on guest PAT. + * This was historically the source of MCEs and we want to be + * conservative. + * + * When there is no need to deal with noncoherent DMA (e.g., no VT-d + * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The + * EPT memory type is set to WB. The effective memory type is forced + * WB. + * + * Otherwise, we trust guest. Guest CD/MTRR/PAT are all honored. The + * EPT memory type is used to emulate guest CD/MTRR. + */ + + if (is_mmio) + return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; + + if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) + return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; + + if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) { + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) + cache = MTRR_TYPE_WRBACK; + else + cache = MTRR_TYPE_UNCACHABLE; + + return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; + } + + return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT; +} + +static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) +{ + /* + * These bits in the secondary execution controls field + * are dynamic, the others are mostly based on the hypervisor + * architecture and the guest's CPUID. Do not touch the + * dynamic bits. + */ + u32 mask = + SECONDARY_EXEC_SHADOW_VMCS | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_DESC; + + u32 cur_ctl = secondary_exec_controls_get(vmx); + + secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask)); +} + +/* + * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits + * (indicating "allowed-1") if they are supported in the guest's CPUID. + */ +static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_cpuid_entry2 *entry; + + vmx->nested.msrs.cr0_fixed1 = 0xffffffff; + vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE; + +#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ + if (entry && (entry->_reg & (_cpuid_mask))) \ + vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ +} while (0) + + entry = kvm_find_cpuid_entry(vcpu, 0x1); + cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); + cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); + cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); + cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE)); + cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE)); + cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE)); + cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE)); + cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE)); + cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR)); + cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM)); + cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX)); + cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX)); + cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); + cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); + + entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0); + cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE)); + cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); + cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); + cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU)); + cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP)); + cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57)); + +#undef cr4_fixed1_update +} + +static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_cpuid_entry2 *best = NULL; + int i; + + for (i = 0; i < PT_CPUID_LEAVES; i++) { + best = kvm_find_cpuid_entry_index(vcpu, 0x14, i); + if (!best) + return; + vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; + vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx; + vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx; + vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx; + } + + /* Get the number of configurable Address Ranges for filtering */ + vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_num_address_ranges); + + /* Initialize and clear the no dependency bits */ + vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS | + RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC | + RTIT_CTL_BRANCH_EN); + + /* + * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise + * will inject an #GP + */ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN; + + /* + * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and + * PSBFreq can be set + */ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc)) + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC | + RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ); + + /* + * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set + */ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc)) + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN | + RTIT_CTL_MTC_RANGE); + + /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite)) + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW | + RTIT_CTL_PTW_EN); + + /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN; + + /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA; + + /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */ + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys)) + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; + + /* unmask address range configure area */ + for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) + vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); +} + +static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */ + vcpu->arch.xsaves_enabled = false; + + vmx_setup_uret_msrs(vmx); + + if (cpu_has_secondary_exec_ctrls()) + vmcs_set_secondary_exec_control(vmx, + vmx_secondary_exec_control(vmx)); + + if (nested_vmx_allowed(vcpu)) + vmx->msr_ia32_feature_control_valid_bits |= + FEAT_CTL_VMX_ENABLED_INSIDE_SMX | + FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; + else + vmx->msr_ia32_feature_control_valid_bits &= + ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | + FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); + + if (nested_vmx_allowed(vcpu)) + nested_vmx_cr_fixed1_bits_update(vcpu); + + if (boot_cpu_has(X86_FEATURE_INTEL_PT) && + guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) + update_intel_pt_cfg(vcpu); + + if (boot_cpu_has(X86_FEATURE_RTM)) { + struct vmx_uret_msr *msr; + msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); + if (msr) { + bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM); + vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE); + } + } + + if (kvm_cpu_cap_has(X86_FEATURE_XFD)) + vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)); + + + set_cr4_guest_host_mask(vmx); + + vmx_write_encls_bitmap(vcpu, NULL); + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX)) + vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED; + else + vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED; + + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) + vmx->msr_ia32_feature_control_valid_bits |= + FEAT_CTL_SGX_LC_ENABLED; + else + vmx->msr_ia32_feature_control_valid_bits &= + ~FEAT_CTL_SGX_LC_ENABLED; + + /* Refresh #PF interception to account for MAXPHYADDR changes. */ + vmx_update_exception_bitmap(vcpu); +} + +static u64 vmx_get_perf_capabilities(void) +{ + u64 perf_cap = PMU_CAP_FW_WRITES; + struct x86_pmu_lbr lbr; + u64 host_perf_cap = 0; + + if (!enable_pmu) + return 0; + + if (boot_cpu_has(X86_FEATURE_PDCM)) + rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); + + if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { + x86_perf_get_lbr(&lbr); + if (lbr.nr) + perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; + } + + if (vmx_pebs_supported()) { + perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; + if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4) + perf_cap &= ~PERF_CAP_PEBS_BASELINE; + } + + return perf_cap; +} + +static __init void vmx_set_cpu_caps(void) +{ + kvm_set_cpu_caps(); + + /* CPUID 0x1 */ + if (nested) + kvm_cpu_cap_set(X86_FEATURE_VMX); + + /* CPUID 0x7 */ + if (kvm_mpx_supported()) + kvm_cpu_cap_check_and_set(X86_FEATURE_MPX); + if (!cpu_has_vmx_invpcid()) + kvm_cpu_cap_clear(X86_FEATURE_INVPCID); + if (vmx_pt_mode_is_host_guest()) + kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); + if (vmx_pebs_supported()) { + kvm_cpu_cap_check_and_set(X86_FEATURE_DS); + kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64); + } + + if (!enable_pmu) + kvm_cpu_cap_clear(X86_FEATURE_PDCM); + kvm_caps.supported_perf_cap = vmx_get_perf_capabilities(); + + if (!enable_sgx) { + kvm_cpu_cap_clear(X86_FEATURE_SGX); + kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); + kvm_cpu_cap_clear(X86_FEATURE_SGX1); + kvm_cpu_cap_clear(X86_FEATURE_SGX2); + } + + if (vmx_umip_emulated()) + kvm_cpu_cap_set(X86_FEATURE_UMIP); + + /* CPUID 0xD.1 */ + kvm_caps.supported_xss = 0; + if (!cpu_has_vmx_xsaves()) + kvm_cpu_cap_clear(X86_FEATURE_XSAVES); + + /* CPUID 0x80000001 and 0x7 (RDPID) */ + if (!cpu_has_vmx_rdtscp()) { + kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); + kvm_cpu_cap_clear(X86_FEATURE_RDPID); + } + + if (cpu_has_vmx_waitpkg()) + kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); +} + +static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) +{ + to_vmx(vcpu)->req_immediate_exit = true; +} + +static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned short port; + bool intercept; + int size; + + if (info->intercept == x86_intercept_in || + info->intercept == x86_intercept_ins) { + port = info->src_val; + size = info->dst_bytes; + } else { + port = info->dst_val; + size = info->src_bytes; + } + + /* + * If the 'use IO bitmaps' VM-execution control is 0, IO instruction + * VM-exits depend on the 'unconditional IO exiting' VM-execution + * control. + * + * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. + */ + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + intercept = nested_cpu_has(vmcs12, + CPU_BASED_UNCOND_IO_EXITING); + else + intercept = nested_vmx_check_io_bitmaps(vcpu, port, size); + + /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ + return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; +} + +static int vmx_check_intercept(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + enum x86_intercept_stage stage, + struct x86_exception *exception) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + switch (info->intercept) { + /* + * RDPID causes #UD if disabled through secondary execution controls. + * Because it is marked as EmulateOnUD, we need to intercept it here. + * Note, RDPID is hidden behind ENABLE_RDTSCP. + */ + case x86_intercept_rdpid: + if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { + exception->vector = UD_VECTOR; + exception->error_code_valid = false; + return X86EMUL_PROPAGATE_FAULT; + } + break; + + case x86_intercept_in: + case x86_intercept_ins: + case x86_intercept_out: + case x86_intercept_outs: + return vmx_check_intercept_io(vcpu, info); + + case x86_intercept_lgdt: + case x86_intercept_lidt: + case x86_intercept_lldt: + case x86_intercept_ltr: + case x86_intercept_sgdt: + case x86_intercept_sidt: + case x86_intercept_sldt: + case x86_intercept_str: + if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC)) + return X86EMUL_CONTINUE; + + /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ + break; + + case x86_intercept_pause: + /* + * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides + * with vanilla NOPs in the emulator. Apply the interception + * check only to actual PAUSE instructions. Don't check + * PAUSE-loop-exiting, software can't expect a given PAUSE to + * exit, i.e. KVM is within its rights to allow L2 to execute + * the PAUSE. + */ + if ((info->rep_prefix != REPE_PREFIX) || + !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING)) + return X86EMUL_CONTINUE; + + break; + + /* TODO: check more intercepts... */ + default: + break; + } + + return X86EMUL_UNHANDLEABLE; +} + +#ifdef CONFIG_X86_64 +/* (a << shift) / divisor, return 1 if overflow otherwise 0 */ +static inline int u64_shl_div_u64(u64 a, unsigned int shift, + u64 divisor, u64 *result) +{ + u64 low = a << shift, high = a >> (64 - shift); + + /* To avoid the overflow on divq */ + if (high >= divisor) + return 1; + + /* Low hold the result, high hold rem which is discarded */ + asm("divq %2\n\t" : "=a" (low), "=d" (high) : + "rm" (divisor), "0" (low), "1" (high)); + *result = low; + + return 0; +} + +static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, + bool *expired) +{ + struct vcpu_vmx *vmx; + u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; + struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer; + + vmx = to_vmx(vcpu); + tscl = rdtsc(); + guest_tscl = kvm_read_l1_tsc(vcpu, tscl); + delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; + lapic_timer_advance_cycles = nsec_to_cycles(vcpu, + ktimer->timer_advance_ns); + + if (delta_tsc > lapic_timer_advance_cycles) + delta_tsc -= lapic_timer_advance_cycles; + else + delta_tsc = 0; + + /* Convert to host delta tsc if tsc scaling is enabled */ + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio && + delta_tsc && u64_shl_div_u64(delta_tsc, + kvm_caps.tsc_scaling_ratio_frac_bits, + vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc)) + return -ERANGE; + + /* + * If the delta tsc can't fit in the 32 bit after the multi shift, + * we can't use the preemption timer. + * It's possible that it fits on later vmentries, but checking + * on every vmentry is costly so we just use an hrtimer. + */ + if (delta_tsc >> (cpu_preemption_timer_multi + 32)) + return -ERANGE; + + vmx->hv_deadline_tsc = tscl + delta_tsc; + *expired = !delta_tsc; + return 0; +} + +static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) +{ + to_vmx(vcpu)->hv_deadline_tsc = -1; +} +#endif + +static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) +{ + if (!kvm_pause_in_guest(vcpu->kvm)) + shrink_ple_window(vcpu); +} + +void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (is_guest_mode(vcpu)) { + vmx->nested.update_vmcs01_cpu_dirty_logging = true; + return; + } + + /* + * Note, cpu_dirty_logging_count can be changed concurrent with this + * code, but in that case another update request will be made and so + * the guest will never run with a stale PML value. + */ + if (vcpu->kvm->arch.cpu_dirty_logging_count) + secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML); + else + secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); +} + +static void vmx_setup_mce(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.mcg_cap & MCG_LMCE_P) + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= + FEAT_CTL_LMCE_ENABLED; + else + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= + ~FEAT_CTL_LMCE_ENABLED; +} + +static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + /* we need a nested vmexit to enter SMM, postpone if run is pending */ + if (to_vmx(vcpu)->nested.nested_run_pending) + return -EBUSY; + return !is_smm(vcpu); +} + +static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * TODO: Implement custom flows for forcing the vCPU out/in of L2 on + * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong + * SMI and RSM only modify state that is saved and restored via SMRAM. + * E.g. most MSRs are left untouched, but many are modified by VM-Exit + * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM. + */ + vmx->nested.smm.guest_mode = is_guest_mode(vcpu); + if (vmx->nested.smm.guest_mode) + nested_vmx_vmexit(vcpu, -1, 0, 0); + + vmx->nested.smm.vmxon = vmx->nested.vmxon; + vmx->nested.vmxon = false; + vmx_clear_hlt(vcpu); + return 0; +} + +static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int ret; + + if (vmx->nested.smm.vmxon) { + vmx->nested.vmxon = true; + vmx->nested.smm.vmxon = false; + } + + if (vmx->nested.smm.guest_mode) { + ret = nested_vmx_enter_non_root_mode(vcpu, false); + if (ret) + return ret; + + vmx->nested.nested_run_pending = 1; + vmx->nested.smm.guest_mode = false; + } + return 0; +} + +static void vmx_enable_smi_window(struct kvm_vcpu *vcpu) +{ + /* RSM will cause a vmexit anyway. */ +} + +static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu); +} + +static void vmx_migrate_timers(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu)) { + struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer; + + if (hrtimer_try_to_cancel(timer) == 1) + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); + } +} + +static void vmx_hardware_unsetup(void) +{ + kvm_set_posted_intr_wakeup_handler(NULL); + + if (nested) + nested_vmx_hardware_unsetup(); + + free_kvm_area(); +} + +static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) +{ + ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | + BIT(APICV_INHIBIT_REASON_ABSENT) | + BIT(APICV_INHIBIT_REASON_HYPERV) | + BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | + BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | + BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED); + + return supported & BIT(reason); +} + +static void vmx_vm_destroy(struct kvm *kvm) +{ + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + + free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); +} + +static struct kvm_x86_ops vmx_x86_ops __initdata = { + .name = "kvm_intel", + + .hardware_unsetup = vmx_hardware_unsetup, + + .hardware_enable = vmx_hardware_enable, + .hardware_disable = vmx_hardware_disable, + .has_emulated_msr = vmx_has_emulated_msr, + + .vm_size = sizeof(struct kvm_vmx), + .vm_init = vmx_vm_init, + .vm_destroy = vmx_vm_destroy, + + .vcpu_precreate = vmx_vcpu_precreate, + .vcpu_create = vmx_vcpu_create, + .vcpu_free = vmx_vcpu_free, + .vcpu_reset = vmx_vcpu_reset, + + .prepare_switch_to_guest = vmx_prepare_switch_to_guest, + .vcpu_load = vmx_vcpu_load, + .vcpu_put = vmx_vcpu_put, + + .update_exception_bitmap = vmx_update_exception_bitmap, + .get_msr_feature = vmx_get_msr_feature, + .get_msr = vmx_get_msr, + .set_msr = vmx_set_msr, + .get_segment_base = vmx_get_segment_base, + .get_segment = vmx_get_segment, + .set_segment = vmx_set_segment, + .get_cpl = vmx_get_cpl, + .get_cs_db_l_bits = vmx_get_cs_db_l_bits, + .is_valid_cr0 = vmx_is_valid_cr0, + .set_cr0 = vmx_set_cr0, + .is_valid_cr4 = vmx_is_valid_cr4, + .set_cr4 = vmx_set_cr4, + .set_efer = vmx_set_efer, + .get_idt = vmx_get_idt, + .set_idt = vmx_set_idt, + .get_gdt = vmx_get_gdt, + .set_gdt = vmx_set_gdt, + .set_dr7 = vmx_set_dr7, + .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, + .cache_reg = vmx_cache_reg, + .get_rflags = vmx_get_rflags, + .set_rflags = vmx_set_rflags, + .get_if_flag = vmx_get_if_flag, + + .flush_tlb_all = vmx_flush_tlb_all, + .flush_tlb_current = vmx_flush_tlb_current, + .flush_tlb_gva = vmx_flush_tlb_gva, + .flush_tlb_guest = vmx_flush_tlb_guest, + + .vcpu_pre_run = vmx_vcpu_pre_run, + .vcpu_run = vmx_vcpu_run, + .handle_exit = vmx_handle_exit, + .skip_emulated_instruction = vmx_skip_emulated_instruction, + .update_emulated_instruction = vmx_update_emulated_instruction, + .set_interrupt_shadow = vmx_set_interrupt_shadow, + .get_interrupt_shadow = vmx_get_interrupt_shadow, + .patch_hypercall = vmx_patch_hypercall, + .inject_irq = vmx_inject_irq, + .inject_nmi = vmx_inject_nmi, + .inject_exception = vmx_inject_exception, + .cancel_injection = vmx_cancel_injection, + .interrupt_allowed = vmx_interrupt_allowed, + .nmi_allowed = vmx_nmi_allowed, + .get_nmi_mask = vmx_get_nmi_mask, + .set_nmi_mask = vmx_set_nmi_mask, + .enable_nmi_window = vmx_enable_nmi_window, + .enable_irq_window = vmx_enable_irq_window, + .update_cr8_intercept = vmx_update_cr8_intercept, + .set_virtual_apic_mode = vmx_set_virtual_apic_mode, + .set_apic_access_page_addr = vmx_set_apic_access_page_addr, + .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, + .load_eoi_exitmap = vmx_load_eoi_exitmap, + .apicv_pre_state_restore = vmx_apicv_pre_state_restore, + .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons, + .hwapic_irr_update = vmx_hwapic_irr_update, + .hwapic_isr_update = vmx_hwapic_isr_update, + .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, + .sync_pir_to_irr = vmx_sync_pir_to_irr, + .deliver_interrupt = vmx_deliver_interrupt, + .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, + + .set_tss_addr = vmx_set_tss_addr, + .set_identity_map_addr = vmx_set_identity_map_addr, + .get_mt_mask = vmx_get_mt_mask, + + .get_exit_info = vmx_get_exit_info, + + .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid, + + .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, + + .get_l2_tsc_offset = vmx_get_l2_tsc_offset, + .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier, + .write_tsc_offset = vmx_write_tsc_offset, + .write_tsc_multiplier = vmx_write_tsc_multiplier, + + .load_mmu_pgd = vmx_load_mmu_pgd, + + .check_intercept = vmx_check_intercept, + .handle_exit_irqoff = vmx_handle_exit_irqoff, + + .request_immediate_exit = vmx_request_immediate_exit, + + .sched_in = vmx_sched_in, + + .cpu_dirty_log_size = PML_ENTITY_NUM, + .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, + + .nested_ops = &vmx_nested_ops, + + .pi_update_irte = vmx_pi_update_irte, + .pi_start_assignment = vmx_pi_start_assignment, + +#ifdef CONFIG_X86_64 + .set_hv_timer = vmx_set_hv_timer, + .cancel_hv_timer = vmx_cancel_hv_timer, +#endif + + .setup_mce = vmx_setup_mce, + + .smi_allowed = vmx_smi_allowed, + .enter_smm = vmx_enter_smm, + .leave_smm = vmx_leave_smm, + .enable_smi_window = vmx_enable_smi_window, + + .can_emulate_instruction = vmx_can_emulate_instruction, + .apic_init_signal_blocked = vmx_apic_init_signal_blocked, + .migrate_timers = vmx_migrate_timers, + + .msr_filter_changed = vmx_msr_filter_changed, + .complete_emulated_msr = kvm_complete_insn_gp, + + .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, +}; + +static unsigned int vmx_handle_intel_pt_intr(void) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + + /* '0' on failure so that the !PT case can use a RET0 static call. */ + if (!vcpu || !kvm_handling_nmi_from_guest(vcpu)) + return 0; + + kvm_make_request(KVM_REQ_PMI, vcpu); + __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, + (unsigned long *)&vcpu->arch.pmu.global_status); + return 1; +} + +static __init void vmx_setup_user_return_msrs(void) +{ + + /* + * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm + * will emulate SYSCALL in legacy mode if the vendor string in guest + * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To + * support this emulation, MSR_STAR is included in the list for i386, + * but is never loaded into hardware. MSR_CSTAR is also never loaded + * into hardware and is here purely for emulation purposes. + */ + const u32 vmx_uret_msrs_list[] = { + #ifdef CONFIG_X86_64 + MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, + #endif + MSR_EFER, MSR_TSC_AUX, MSR_STAR, + MSR_IA32_TSX_CTRL, + }; + int i; + + BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); + + for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) + kvm_add_user_return_msr(vmx_uret_msrs_list[i]); +} + +static void __init vmx_setup_me_spte_mask(void) +{ + u64 me_mask = 0; + + /* + * kvm_get_shadow_phys_bits() returns shadow_phys_bits. Use + * the former to avoid exposing shadow_phys_bits. + * + * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to + * shadow_phys_bits. On MKTME and/or TDX capable systems, + * boot_cpu_data.x86_phys_bits holds the actual physical address + * w/o the KeyID bits, and shadow_phys_bits equals to MAXPHYADDR + * reported by CPUID. Those bits between are KeyID bits. + */ + if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits()) + me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits, + kvm_get_shadow_phys_bits() - 1); + /* + * Unlike SME, host kernel doesn't support setting up any + * MKTME KeyID on Intel platforms. No memory encryption + * bits should be included into the SPTE. + */ + kvm_mmu_set_me_spte_mask(0, me_mask); +} + +static struct kvm_x86_init_ops vmx_init_ops __initdata; + +static __init int hardware_setup(void) +{ + unsigned long host_bndcfgs; + struct desc_ptr dt; + int r; + + store_idt(&dt); + host_idt_base = dt.address; + + vmx_setup_user_return_msrs(); + + if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) + return -EIO; + + if (cpu_has_perf_global_ctrl_bug()) + pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " + "does not work properly. Using workaround\n"); + + if (boot_cpu_has(X86_FEATURE_NX)) + kvm_enable_efer_bits(EFER_NX); + + if (boot_cpu_has(X86_FEATURE_MPX)) { + rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); + WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); + } + + if (!cpu_has_vmx_mpx()) + kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | + XFEATURE_MASK_BNDCSR); + + if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || + !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) + enable_vpid = 0; + + if (!cpu_has_vmx_ept() || + !cpu_has_vmx_ept_4levels() || + !cpu_has_vmx_ept_mt_wb() || + !cpu_has_vmx_invept_global()) + enable_ept = 0; + + /* NX support is required for shadow paging. */ + if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { + pr_err_ratelimited("kvm: NX (Execute Disable) not supported\n"); + return -EOPNOTSUPP; + } + + if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) + enable_ept_ad_bits = 0; + + if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) + enable_unrestricted_guest = 0; + + if (!cpu_has_vmx_flexpriority()) + flexpriority_enabled = 0; + + if (!cpu_has_virtual_nmis()) + enable_vnmi = 0; + +#ifdef CONFIG_X86_SGX_KVM + if (!cpu_has_vmx_encls_vmexit()) + enable_sgx = false; +#endif + + /* + * set_apic_access_page_addr() is used to reload apic access + * page upon invalidation. No need to do anything if not + * using the APIC_ACCESS_ADDR VMCS field. + */ + if (!flexpriority_enabled) + vmx_x86_ops.set_apic_access_page_addr = NULL; + + if (!cpu_has_vmx_tpr_shadow()) + vmx_x86_ops.update_cr8_intercept = NULL; + +#if IS_ENABLED(CONFIG_HYPERV) + if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH + && enable_ept) { + vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb; + vmx_x86_ops.tlb_remote_flush_with_range = + hv_remote_flush_tlb_with_range; + } +#endif + + if (!cpu_has_vmx_ple()) { + ple_gap = 0; + ple_window = 0; + ple_window_grow = 0; + ple_window_max = 0; + ple_window_shrink = 0; + } + + if (!cpu_has_vmx_apicv()) + enable_apicv = 0; + if (!enable_apicv) + vmx_x86_ops.sync_pir_to_irr = NULL; + + if (!enable_apicv || !cpu_has_vmx_ipiv()) + enable_ipiv = false; + + if (cpu_has_vmx_tsc_scaling()) + kvm_caps.has_tsc_control = true; + + kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; + kvm_caps.tsc_scaling_ratio_frac_bits = 48; + kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); + kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit(); + + set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ + + if (enable_ept) + kvm_mmu_set_ept_masks(enable_ept_ad_bits, + cpu_has_vmx_ept_execute_only()); + + /* + * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID + * bits to shadow_zero_check. + */ + vmx_setup_me_spte_mask(); + + kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(), + ept_caps_to_lpage_level(vmx_capability.ept)); + + /* + * Only enable PML when hardware supports PML feature, and both EPT + * and EPT A/D bit features are enabled -- PML depends on them to work. + */ + if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) + enable_pml = 0; + + if (!enable_pml) + vmx_x86_ops.cpu_dirty_log_size = 0; + + if (!cpu_has_vmx_preemption_timer()) + enable_preemption_timer = false; + + if (enable_preemption_timer) { + u64 use_timer_freq = 5000ULL * 1000 * 1000; + + cpu_preemption_timer_multi = + vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; + + if (tsc_khz) + use_timer_freq = (u64)tsc_khz * 1000; + use_timer_freq >>= cpu_preemption_timer_multi; + + /* + * KVM "disables" the preemption timer by setting it to its max + * value. Don't use the timer if it might cause spurious exits + * at a rate faster than 0.1 Hz (of uninterrupted guest time). + */ + if (use_timer_freq > 0xffffffffu / 10) + enable_preemption_timer = false; + } + + if (!enable_preemption_timer) { + vmx_x86_ops.set_hv_timer = NULL; + vmx_x86_ops.cancel_hv_timer = NULL; + vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; + } + + kvm_caps.supported_mce_cap |= MCG_LMCE_P; + kvm_caps.supported_mce_cap |= MCG_CMCI_P; + + if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) + return -EINVAL; + if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) + pt_mode = PT_MODE_SYSTEM; + if (pt_mode == PT_MODE_HOST_GUEST) + vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; + else + vmx_init_ops.handle_intel_pt_intr = NULL; + + setup_default_sgx_lepubkeyhash(); + + if (nested) { + nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); + + r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); + if (r) + return r; + } + + vmx_set_cpu_caps(); + + r = alloc_kvm_area(); + if (r && nested) + nested_vmx_hardware_unsetup(); + + kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); + + return r; +} + +static struct kvm_x86_init_ops vmx_init_ops __initdata = { + .cpu_has_kvm_support = cpu_has_kvm_support, + .disabled_by_bios = vmx_disabled_by_bios, + .check_processor_compatibility = vmx_check_processor_compat, + .hardware_setup = hardware_setup, + .handle_intel_pt_intr = NULL, + + .runtime_ops = &vmx_x86_ops, + .pmu_ops = &intel_pmu_ops, +}; + +static void vmx_cleanup_l1d_flush(void) +{ + if (vmx_l1d_flush_pages) { + free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); + vmx_l1d_flush_pages = NULL; + } + /* Restore state so sysfs ignores VMX */ + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; +} + +static void __vmx_exit(void) +{ + allow_smaller_maxphyaddr = false; + + RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); + synchronize_rcu(); + + vmx_cleanup_l1d_flush(); +} + +static void vmx_exit(void) +{ + kvm_exit(); + kvm_x86_vendor_exit(); + + __vmx_exit(); +} +module_exit(vmx_exit); + +static int __init vmx_init(void) +{ + int r, cpu; + +#if IS_ENABLED(CONFIG_HYPERV) + /* + * Enlightened VMCS usage should be recommended and the host needs + * to support eVMCS v1 or above. We can also disable eVMCS support + * with module parameter. + */ + if (enlightened_vmcs && + ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && + (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= + KVM_EVMCS_VERSION) { + + /* Check that we have assist pages on all online CPUs */ + for_each_online_cpu(cpu) { + if (!hv_get_vp_assist_page(cpu)) { + enlightened_vmcs = false; + break; + } + } + + if (enlightened_vmcs) { + pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n"); + static_branch_enable(&enable_evmcs); + } + + if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) + vmx_x86_ops.enable_direct_tlbflush + = hv_enable_direct_tlbflush; + + } else { + enlightened_vmcs = false; + } +#endif + + r = kvm_x86_vendor_init(&vmx_init_ops); + if (r) + return r; + + /* + * Must be called after common x86 init so enable_ept is properly set + * up. Hand the parameter mitigation value in which was stored in + * the pre module init parser. If no parameter was given, it will + * contain 'auto' which will be turned into the default 'cond' + * mitigation mode. + */ + r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); + if (r) + goto err_l1d_flush; + + vmx_setup_fb_clear_ctrl(); + + for_each_possible_cpu(cpu) { + INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); + + pi_init_cpu(cpu); + } + + rcu_assign_pointer(crash_vmclear_loaded_vmcss, + crash_vmclear_local_loaded_vmcss); + + vmx_check_vmcs12_offsets(); + + /* + * Shadow paging doesn't have a (further) performance penalty + * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it + * by default + */ + if (!enable_ept) + allow_smaller_maxphyaddr = true; + + /* + * Common KVM initialization _must_ come last, after this, /dev/kvm is + * exposed to userspace! + */ + r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx), + __alignof__(struct vcpu_vmx), THIS_MODULE); + if (r) + goto err_kvm_init; + + return 0; + +err_kvm_init: + __vmx_exit(); +err_l1d_flush: + kvm_x86_vendor_exit(); + return r; +} +module_init(vmx_init); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h new file mode 100644 index 000000000..e2b04f4c0 --- /dev/null +++ b/arch/x86/kvm/vmx/vmx.h @@ -0,0 +1,773 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_H +#define __KVM_X86_VMX_H + +#include <linux/kvm_host.h> + +#include <asm/kvm.h> +#include <asm/intel_pt.h> +#include <asm/perf_event.h> + +#include "capabilities.h" +#include "../kvm_cache_regs.h" +#include "posted_intr.h" +#include "vmcs.h" +#include "vmx_ops.h" +#include "../cpuid.h" +#include "run_flags.h" + +#define MSR_TYPE_R 1 +#define MSR_TYPE_W 2 +#define MSR_TYPE_RW 3 + +#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) + +#ifdef CONFIG_X86_64 +#define MAX_NR_USER_RETURN_MSRS 7 +#else +#define MAX_NR_USER_RETURN_MSRS 4 +#endif + +#define MAX_NR_LOADSTORE_MSRS 8 + +struct vmx_msrs { + unsigned int nr; + struct vmx_msr_entry val[MAX_NR_LOADSTORE_MSRS]; +}; + +struct vmx_uret_msr { + bool load_into_hardware; + u64 data; + u64 mask; +}; + +enum segment_cache_field { + SEG_FIELD_SEL = 0, + SEG_FIELD_BASE = 1, + SEG_FIELD_LIMIT = 2, + SEG_FIELD_AR = 3, + + SEG_FIELD_NR = 4 +}; + +#define RTIT_ADDR_RANGE 4 + +struct pt_ctx { + u64 ctl; + u64 status; + u64 output_base; + u64 output_mask; + u64 cr3_match; + u64 addr_a[RTIT_ADDR_RANGE]; + u64 addr_b[RTIT_ADDR_RANGE]; +}; + +struct pt_desc { + u64 ctl_bitmask; + u32 num_address_ranges; + u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; + struct pt_ctx host; + struct pt_ctx guest; +}; + +union vmx_exit_reason { + struct { + u32 basic : 16; + u32 reserved16 : 1; + u32 reserved17 : 1; + u32 reserved18 : 1; + u32 reserved19 : 1; + u32 reserved20 : 1; + u32 reserved21 : 1; + u32 reserved22 : 1; + u32 reserved23 : 1; + u32 reserved24 : 1; + u32 reserved25 : 1; + u32 bus_lock_detected : 1; + u32 enclave_mode : 1; + u32 smi_pending_mtf : 1; + u32 smi_from_vmx_root : 1; + u32 reserved30 : 1; + u32 failed_vmentry : 1; + }; + u32 full; +}; + +static inline bool intel_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) +{ + /* + * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is + * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is + * greater than zero. However, KVM only exposes and emulates the MSR + * to/for the guest if the guest PMU supports at least "Architectural + * Performance Monitoring Version 2". + */ + return pmu->version > 1; +} + +struct lbr_desc { + /* Basic info about guest LBR records. */ + struct x86_pmu_lbr records; + + /* + * Emulate LBR feature via passthrough LBR registers when the + * per-vcpu guest LBR event is scheduled on the current pcpu. + * + * The records may be inaccurate if the host reclaims the LBR. + */ + struct perf_event *event; + + /* True if LBRs are marked as not intercepted in the MSR bitmap */ + bool msr_passthrough; +}; + +/* + * The nested_vmx structure is part of vcpu_vmx, and holds information we need + * for correct emulation of VMX (i.e., nested VMX) on this vcpu. + */ +struct nested_vmx { + /* Has the level1 guest done vmxon? */ + bool vmxon; + gpa_t vmxon_ptr; + bool pml_full; + + /* The guest-physical address of the current VMCS L1 keeps for L2 */ + gpa_t current_vmptr; + /* + * Cache of the guest's VMCS, existing outside of guest memory. + * Loaded from guest memory during VMPTRLD. Flushed to guest + * memory during VMCLEAR and VMPTRLD. + */ + struct vmcs12 *cached_vmcs12; + /* + * Cache of the guest's shadow VMCS, existing outside of guest + * memory. Loaded from guest memory during VM entry. Flushed + * to guest memory during VM exit. + */ + struct vmcs12 *cached_shadow_vmcs12; + + /* + * GPA to HVA cache for accessing vmcs12->vmcs_link_pointer + */ + struct gfn_to_hva_cache shadow_vmcs12_cache; + + /* + * GPA to HVA cache for VMCS12 + */ + struct gfn_to_hva_cache vmcs12_cache; + + /* + * Indicates if the shadow vmcs or enlightened vmcs must be updated + * with the data held by struct vmcs12. + */ + bool need_vmcs12_to_shadow_sync; + bool dirty_vmcs12; + + /* + * Indicates whether MSR bitmap for L2 needs to be rebuilt due to + * changes in MSR bitmap for L1 or switching to a different L2. Note, + * this flag can only be used reliably in conjunction with a paravirt L1 + * which informs L0 whether any changes to MSR bitmap for L2 were done + * on its side. + */ + bool force_msr_bitmap_recalc; + + /* + * Indicates lazily loaded guest state has not yet been decached from + * vmcs02. + */ + bool need_sync_vmcs02_to_vmcs12_rare; + + /* + * vmcs02 has been initialized, i.e. state that is constant for + * vmcs02 has been written to the backing VMCS. Initialization + * is delayed until L1 actually attempts to run a nested VM. + */ + bool vmcs02_initialized; + + bool change_vmcs01_virtual_apic_mode; + bool reload_vmcs01_apic_access_page; + bool update_vmcs01_cpu_dirty_logging; + bool update_vmcs01_apicv_status; + + /* + * Enlightened VMCS has been enabled. It does not mean that L1 has to + * use it. However, VMX features available to L1 will be limited based + * on what the enlightened VMCS supports. + */ + bool enlightened_vmcs_enabled; + + /* L2 must run next, and mustn't decide to exit to L1. */ + bool nested_run_pending; + + /* Pending MTF VM-exit into L1. */ + bool mtf_pending; + + struct loaded_vmcs vmcs02; + + /* + * Guest pages referred to in the vmcs02 with host-physical + * pointers, so we must keep them pinned while L2 runs. + */ + struct kvm_host_map apic_access_page_map; + struct kvm_host_map virtual_apic_map; + struct kvm_host_map pi_desc_map; + + struct kvm_host_map msr_bitmap_map; + + struct pi_desc *pi_desc; + bool pi_pending; + u16 posted_intr_nv; + + struct hrtimer preemption_timer; + u64 preemption_timer_deadline; + bool has_preemption_timer_deadline; + bool preemption_timer_expired; + + /* + * Used to snapshot MSRs that are conditionally loaded on VM-Enter in + * order to propagate the guest's pre-VM-Enter value into vmcs02. For + * emulation of VMLAUNCH/VMRESUME, the snapshot will be of L1's value. + * For KVM_SET_NESTED_STATE, the snapshot is of L2's value, _if_ + * userspace restores MSRs before nested state. If userspace restores + * MSRs after nested state, the snapshot holds garbage, but KVM can't + * detect that, and the garbage value in vmcs02 will be overwritten by + * MSR restoration in any case. + */ + u64 pre_vmenter_debugctl; + u64 pre_vmenter_bndcfgs; + + /* to migrate it to L1 if L2 writes to L1's CR8 directly */ + int l1_tpr_threshold; + + u16 vpid02; + u16 last_vpid; + + struct nested_vmx_msrs msrs; + + /* SMM related state */ + struct { + /* in VMX operation on SMM entry? */ + bool vmxon; + /* in guest mode on SMM entry? */ + bool guest_mode; + } smm; + + gpa_t hv_evmcs_vmptr; + struct kvm_host_map hv_evmcs_map; + struct hv_enlightened_vmcs *hv_evmcs; +}; + +struct vcpu_vmx { + struct kvm_vcpu vcpu; + u8 fail; + u8 x2apic_msr_bitmap_mode; + + /* + * If true, host state has been stored in vmx->loaded_vmcs for + * the CPU registers that only need to be switched when transitioning + * to/from the kernel, and the registers have been loaded with guest + * values. If false, host state is loaded in the CPU registers + * and vmx->loaded_vmcs->host_state is invalid. + */ + bool guest_state_loaded; + + unsigned long exit_qualification; + u32 exit_intr_info; + u32 idt_vectoring_info; + ulong rflags; + + /* + * User return MSRs are always emulated when enabled in the guest, but + * only loaded into hardware when necessary, e.g. SYSCALL #UDs outside + * of 64-bit mode or if EFER.SCE=1, thus the SYSCALL MSRs don't need to + * be loaded into hardware if those conditions aren't met. + */ + struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS]; + bool guest_uret_msrs_loaded; +#ifdef CONFIG_X86_64 + u64 msr_host_kernel_gs_base; + u64 msr_guest_kernel_gs_base; +#endif + + u64 spec_ctrl; + u32 msr_ia32_umwait_control; + + /* + * loaded_vmcs points to the VMCS currently used in this vcpu. For a + * non-nested (L1) guest, it always points to vmcs01. For a nested + * guest (L2), it points to a different VMCS. + */ + struct loaded_vmcs vmcs01; + struct loaded_vmcs *loaded_vmcs; + + struct msr_autoload { + struct vmx_msrs guest; + struct vmx_msrs host; + } msr_autoload; + + struct msr_autostore { + struct vmx_msrs guest; + } msr_autostore; + + struct { + int vm86_active; + ulong save_rflags; + struct kvm_segment segs[8]; + } rmode; + struct { + u32 bitmask; /* 4 bits per segment (1 bit per field) */ + struct kvm_save_segment { + u16 selector; + unsigned long base; + u32 limit; + u32 ar; + } seg[8]; + } segment_cache; + int vpid; + bool emulation_required; + + union vmx_exit_reason exit_reason; + + /* Posted interrupt descriptor */ + struct pi_desc pi_desc; + + /* Used if this vCPU is waiting for PI notification wakeup. */ + struct list_head pi_wakeup_list; + + /* Support for a guest hypervisor (nested VMX) */ + struct nested_vmx nested; + + /* Dynamic PLE window. */ + unsigned int ple_window; + bool ple_window_dirty; + + bool req_immediate_exit; + + /* Support for PML */ +#define PML_ENTITY_NUM 512 + struct page *pml_pg; + + /* apic deadline value in host tsc */ + u64 hv_deadline_tsc; + + unsigned long host_debugctlmsr; + + /* + * Only bits masked by msr_ia32_feature_control_valid_bits can be set in + * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included + * in msr_ia32_feature_control_valid_bits. + */ + u64 msr_ia32_feature_control; + u64 msr_ia32_feature_control_valid_bits; + /* SGX Launch Control public key hash */ + u64 msr_ia32_sgxlepubkeyhash[4]; + u64 msr_ia32_mcu_opt_ctrl; + bool disable_fb_clear; + + struct pt_desc pt_desc; + struct lbr_desc lbr_desc; + + /* Save desired MSR intercept (read: pass-through) state */ +#define MAX_POSSIBLE_PASSTHROUGH_MSRS 15 + struct { + DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS); + DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS); + } shadow_msr_intercept; +}; + +struct kvm_vmx { + struct kvm kvm; + + unsigned int tss_addr; + bool ept_identity_pagetable_done; + gpa_t ept_identity_map_addr; + /* Posted Interrupt Descriptor (PID) table for IPI virtualization */ + u64 *pid_table; +}; + +bool nested_vmx_allowed(struct kvm_vcpu *vcpu); +void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, + struct loaded_vmcs *buddy); +int allocate_vpid(void); +void free_vpid(int vpid); +void vmx_set_constant_host_state(struct vcpu_vmx *vmx); +void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); +void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, + unsigned long fs_base, unsigned long gs_base); +int vmx_get_cpl(struct kvm_vcpu *vcpu); +bool vmx_emulation_required(struct kvm_vcpu *vcpu); +unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); +void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); +u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu); +void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask); +int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer); +void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); +void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); +void ept_save_pdptrs(struct kvm_vcpu *vcpu); +void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); + +bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu); +void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); +bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); +bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu); +bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); +void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); +void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); +struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr); +void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu); +void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); +void vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, unsigned int flags); +unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx); +bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, + unsigned int flags); +int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr); +void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); + +void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type); +void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type); + +u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu); +u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu); + +static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, + int type, bool value) +{ + if (value) + vmx_enable_intercept_for_msr(vcpu, msr, type); + else + vmx_disable_intercept_for_msr(vcpu, msr, type); +} + +void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); + +/* + * Note, early Intel manuals have the write-low and read-high bitmap offsets + * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and + * 0xc0000000-0xc0001fff. The former (low) uses bytes 0-0x3ff for reads and + * 0x800-0xbff for writes. The latter (high) uses 0x400-0x7ff for reads and + * 0xc00-0xfff for writes. MSRs not covered by either of the ranges always + * VM-Exit. + */ +#define __BUILD_VMX_MSR_BITMAP_HELPER(rtype, action, bitop, access, base) \ +static inline rtype vmx_##action##_msr_bitmap_##access(unsigned long *bitmap, \ + u32 msr) \ +{ \ + int f = sizeof(unsigned long); \ + \ + if (msr <= 0x1fff) \ + return bitop##_bit(msr, bitmap + base / f); \ + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) \ + return bitop##_bit(msr & 0x1fff, bitmap + (base + 0x400) / f); \ + return (rtype)true; \ +} +#define BUILD_VMX_MSR_BITMAP_HELPERS(ret_type, action, bitop) \ + __BUILD_VMX_MSR_BITMAP_HELPER(ret_type, action, bitop, read, 0x0) \ + __BUILD_VMX_MSR_BITMAP_HELPER(ret_type, action, bitop, write, 0x800) + +BUILD_VMX_MSR_BITMAP_HELPERS(bool, test, test) +BUILD_VMX_MSR_BITMAP_HELPERS(void, clear, __clear) +BUILD_VMX_MSR_BITMAP_HELPERS(void, set, __set) + +static inline u8 vmx_get_rvi(void) +{ + return vmcs_read16(GUEST_INTR_STATUS) & 0xff; +} + +#define __KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS \ + (VM_ENTRY_LOAD_DEBUG_CONTROLS) +#ifdef CONFIG_X86_64 + #define KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS \ + (__KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS | \ + VM_ENTRY_IA32E_MODE) +#else + #define KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS \ + __KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS +#endif +#define KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS \ + (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_ENTRY_LOAD_IA32_PAT | \ + VM_ENTRY_LOAD_IA32_EFER | \ + VM_ENTRY_LOAD_BNDCFGS | \ + VM_ENTRY_PT_CONCEAL_PIP | \ + VM_ENTRY_LOAD_IA32_RTIT_CTL) + +#define __KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \ + (VM_EXIT_SAVE_DEBUG_CONTROLS | \ + VM_EXIT_ACK_INTR_ON_EXIT) +#ifdef CONFIG_X86_64 + #define KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \ + (__KVM_REQUIRED_VMX_VM_EXIT_CONTROLS | \ + VM_EXIT_HOST_ADDR_SPACE_SIZE) +#else + #define KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \ + __KVM_REQUIRED_VMX_VM_EXIT_CONTROLS +#endif +#define KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS \ + (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_EXIT_SAVE_IA32_PAT | \ + VM_EXIT_LOAD_IA32_PAT | \ + VM_EXIT_SAVE_IA32_EFER | \ + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | \ + VM_EXIT_LOAD_IA32_EFER | \ + VM_EXIT_CLEAR_BNDCFGS | \ + VM_EXIT_PT_CONCEAL_PIP | \ + VM_EXIT_CLEAR_IA32_RTIT_CTL) + +#define KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL \ + (PIN_BASED_EXT_INTR_MASK | \ + PIN_BASED_NMI_EXITING) +#define KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL \ + (PIN_BASED_VIRTUAL_NMIS | \ + PIN_BASED_POSTED_INTR | \ + PIN_BASED_VMX_PREEMPTION_TIMER) + +#define __KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL \ + (CPU_BASED_HLT_EXITING | \ + CPU_BASED_CR3_LOAD_EXITING | \ + CPU_BASED_CR3_STORE_EXITING | \ + CPU_BASED_UNCOND_IO_EXITING | \ + CPU_BASED_MOV_DR_EXITING | \ + CPU_BASED_USE_TSC_OFFSETTING | \ + CPU_BASED_MWAIT_EXITING | \ + CPU_BASED_MONITOR_EXITING | \ + CPU_BASED_INVLPG_EXITING | \ + CPU_BASED_RDPMC_EXITING | \ + CPU_BASED_INTR_WINDOW_EXITING) + +#ifdef CONFIG_X86_64 + #define KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL \ + (__KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL | \ + CPU_BASED_CR8_LOAD_EXITING | \ + CPU_BASED_CR8_STORE_EXITING) +#else + #define KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL \ + __KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL +#endif + +#define KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL \ + (CPU_BASED_RDTSC_EXITING | \ + CPU_BASED_TPR_SHADOW | \ + CPU_BASED_USE_IO_BITMAPS | \ + CPU_BASED_MONITOR_TRAP_FLAG | \ + CPU_BASED_USE_MSR_BITMAPS | \ + CPU_BASED_NMI_WINDOW_EXITING | \ + CPU_BASED_PAUSE_EXITING | \ + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS | \ + CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) + +#define KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL 0 +#define KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL \ + (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \ + SECONDARY_EXEC_WBINVD_EXITING | \ + SECONDARY_EXEC_ENABLE_VPID | \ + SECONDARY_EXEC_ENABLE_EPT | \ + SECONDARY_EXEC_UNRESTRICTED_GUEST | \ + SECONDARY_EXEC_PAUSE_LOOP_EXITING | \ + SECONDARY_EXEC_DESC | \ + SECONDARY_EXEC_ENABLE_RDTSCP | \ + SECONDARY_EXEC_ENABLE_INVPCID | \ + SECONDARY_EXEC_APIC_REGISTER_VIRT | \ + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ + SECONDARY_EXEC_SHADOW_VMCS | \ + SECONDARY_EXEC_XSAVES | \ + SECONDARY_EXEC_RDSEED_EXITING | \ + SECONDARY_EXEC_RDRAND_EXITING | \ + SECONDARY_EXEC_ENABLE_PML | \ + SECONDARY_EXEC_TSC_SCALING | \ + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \ + SECONDARY_EXEC_PT_USE_GPA | \ + SECONDARY_EXEC_PT_CONCEAL_VMX | \ + SECONDARY_EXEC_ENABLE_VMFUNC | \ + SECONDARY_EXEC_BUS_LOCK_DETECTION | \ + SECONDARY_EXEC_NOTIFY_VM_EXITING | \ + SECONDARY_EXEC_ENCLS_EXITING) + +#define KVM_REQUIRED_VMX_TERTIARY_VM_EXEC_CONTROL 0 +#define KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL \ + (TERTIARY_EXEC_IPI_VIRT) + +#define BUILD_CONTROLS_SHADOW(lname, uname, bits) \ +static inline void lname##_controls_set(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + if (vmx->loaded_vmcs->controls_shadow.lname != val) { \ + vmcs_write##bits(uname, val); \ + vmx->loaded_vmcs->controls_shadow.lname = val; \ + } \ +} \ +static inline u##bits __##lname##_controls_get(struct loaded_vmcs *vmcs) \ +{ \ + return vmcs->controls_shadow.lname; \ +} \ +static inline u##bits lname##_controls_get(struct vcpu_vmx *vmx) \ +{ \ + return __##lname##_controls_get(vmx->loaded_vmcs); \ +} \ +static __always_inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + BUILD_BUG_ON(!(val & (KVM_REQUIRED_VMX_##uname | KVM_OPTIONAL_VMX_##uname))); \ + lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \ +} \ +static __always_inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + BUILD_BUG_ON(!(val & (KVM_REQUIRED_VMX_##uname | KVM_OPTIONAL_VMX_##uname))); \ + lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ +} +BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS, 32) +BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS, 32) +BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64) + +/* + * VMX_REGS_LAZY_LOAD_SET - The set of registers that will be updated in the + * cache on demand. Other registers not listed here are synced to + * the cache immediately after VM-Exit. + */ +#define VMX_REGS_LAZY_LOAD_SET ((1 << VCPU_REGS_RIP) | \ + (1 << VCPU_REGS_RSP) | \ + (1 << VCPU_EXREG_RFLAGS) | \ + (1 << VCPU_EXREG_PDPTR) | \ + (1 << VCPU_EXREG_SEGMENTS) | \ + (1 << VCPU_EXREG_CR0) | \ + (1 << VCPU_EXREG_CR3) | \ + (1 << VCPU_EXREG_CR4) | \ + (1 << VCPU_EXREG_EXIT_INFO_1) | \ + (1 << VCPU_EXREG_EXIT_INFO_2)) + +static inline unsigned long vmx_l1_guest_owned_cr0_bits(void) +{ + unsigned long bits = KVM_POSSIBLE_CR0_GUEST_BITS; + + /* + * CR0.WP needs to be intercepted when KVM is shadowing legacy paging + * in order to construct shadow PTEs with the correct protections. + * Note! CR0.WP technically can be passed through to the guest if + * paging is disabled, but checking CR0.PG would generate a cyclical + * dependency of sorts due to forcing the caller to ensure CR0 holds + * the correct value prior to determining which CR0 bits can be owned + * by L1. Keep it simple and limit the optimization to EPT. + */ + if (!enable_ept) + bits &= ~X86_CR0_WP; + return bits; +} + +static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) +{ + return container_of(kvm, struct kvm_vmx, kvm); +} + +static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct vcpu_vmx, vcpu); +} + +static inline struct lbr_desc *vcpu_to_lbr_desc(struct kvm_vcpu *vcpu) +{ + return &to_vmx(vcpu)->lbr_desc; +} + +static inline struct x86_pmu_lbr *vcpu_to_lbr_records(struct kvm_vcpu *vcpu) +{ + return &vcpu_to_lbr_desc(vcpu)->records; +} + +static inline bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu) +{ + return !!vcpu_to_lbr_records(vcpu)->nr; +} + +void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu); +int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); +void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu); + +static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_1)) { + kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); + vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + } + return vmx->exit_qualification; +} + +static inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_2)) { + kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); + vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + } + return vmx->exit_intr_info; +} + +struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags); +void free_vmcs(struct vmcs *vmcs); +int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); +void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); +void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs); + +static inline struct vmcs *alloc_vmcs(bool shadow) +{ + return alloc_vmcs_cpu(shadow, raw_smp_processor_id(), + GFP_KERNEL_ACCOUNT); +} + +static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx) +{ + return secondary_exec_controls_get(vmx) & + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; +} + +static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu) +{ + if (!enable_ept) + return true; + + return allow_smaller_maxphyaddr && cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits; +} + +static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu) +{ + return enable_unrestricted_guest && (!is_guest_mode(vcpu) || + (secondary_exec_controls_get(to_vmx(vcpu)) & + SECONDARY_EXEC_UNRESTRICTED_GUEST)); +} + +bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu); +static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu) +{ + return is_unrestricted_guest(vcpu) || __vmx_guest_state_valid(vcpu); +} + +void dump_vmcs(struct kvm_vcpu *vcpu); + +static inline int vmx_get_instr_info_reg2(u32 vmx_instr_info) +{ + return (vmx_instr_info >> 28) & 0xf; +} + +static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu) +{ + return lapic_in_kernel(vcpu) && enable_ipiv; +} + +static inline bool guest_cpuid_has_evmcs(struct kvm_vcpu *vcpu) +{ + /* + * eVMCS is exposed to the guest if Hyper-V is enabled in CPUID and + * eVMCS has been explicitly enabled by userspace. + */ + return vcpu->arch.hyperv_enabled && + to_vmx(vcpu)->nested.enlightened_vmcs_enabled; +} + +#endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h new file mode 100644 index 000000000..ec268df83 --- /dev/null +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -0,0 +1,347 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_INSN_H +#define __KVM_X86_VMX_INSN_H + +#include <linux/nospec.h> + +#include <asm/vmx.h> + +#include "evmcs.h" +#include "vmcs.h" +#include "../x86.h" + +void vmread_error(unsigned long field, bool fault); +__attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field, + bool fault); +void vmwrite_error(unsigned long field, unsigned long value); +void vmclear_error(struct vmcs *vmcs, u64 phys_addr); +void vmptrld_error(struct vmcs *vmcs, u64 phys_addr); +void invvpid_error(unsigned long ext, u16 vpid, gva_t gva); +void invept_error(unsigned long ext, u64 eptp, gpa_t gpa); + +static __always_inline void vmcs_check16(unsigned long field) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, + "16-bit accessor invalid for 64-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, + "16-bit accessor invalid for 64-bit high field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, + "16-bit accessor invalid for 32-bit high field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, + "16-bit accessor invalid for natural width field"); +} + +static __always_inline void vmcs_check32(unsigned long field) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, + "32-bit accessor invalid for 16-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, + "32-bit accessor invalid for 64-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, + "32-bit accessor invalid for 64-bit high field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, + "32-bit accessor invalid for natural width field"); +} + +static __always_inline void vmcs_check64(unsigned long field) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, + "64-bit accessor invalid for 16-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, + "64-bit accessor invalid for 64-bit high field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, + "64-bit accessor invalid for 32-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, + "64-bit accessor invalid for natural width field"); +} + +static __always_inline void vmcs_checkl(unsigned long field) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, + "Natural width accessor invalid for 16-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, + "Natural width accessor invalid for 64-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, + "Natural width accessor invalid for 64-bit high field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, + "Natural width accessor invalid for 32-bit field"); +} + +static __always_inline unsigned long __vmcs_readl(unsigned long field) +{ + unsigned long value; + +#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT + + asm_volatile_goto("1: vmread %[field], %[output]\n\t" + "jna %l[do_fail]\n\t" + + _ASM_EXTABLE(1b, %l[do_exception]) + + : [output] "=r" (value) + : [field] "r" (field) + : "cc" + : do_fail, do_exception); + + return value; + +do_fail: + WARN_ONCE(1, "kvm: vmread failed: field=%lx\n", field); + pr_warn_ratelimited("kvm: vmread failed: field=%lx\n", field); + return 0; + +do_exception: + kvm_spurious_fault(); + return 0; + +#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ + + asm volatile("1: vmread %2, %1\n\t" + ".byte 0x3e\n\t" /* branch taken hint */ + "ja 3f\n\t" + + /* + * VMREAD failed. Push '0' for @fault, push the failing + * @field, and bounce through the trampoline to preserve + * volatile registers. + */ + "xorl %k1, %k1\n\t" + "2:\n\t" + "push %1\n\t" + "push %2\n\t" + "call vmread_error_trampoline\n\t" + + /* + * Unwind the stack. Note, the trampoline zeros out the + * memory for @fault so that the result is '0' on error. + */ + "pop %2\n\t" + "pop %1\n\t" + "3:\n\t" + + /* VMREAD faulted. As above, except push '1' for @fault. */ + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %1) + + : ASM_CALL_CONSTRAINT, "=&r"(value) : "r"(field) : "cc"); + return value; + +#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ +} + +static __always_inline u16 vmcs_read16(unsigned long field) +{ + vmcs_check16(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_read16(field); + return __vmcs_readl(field); +} + +static __always_inline u32 vmcs_read32(unsigned long field) +{ + vmcs_check32(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_read32(field); + return __vmcs_readl(field); +} + +static __always_inline u64 vmcs_read64(unsigned long field) +{ + vmcs_check64(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_read64(field); +#ifdef CONFIG_X86_64 + return __vmcs_readl(field); +#else + return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32); +#endif +} + +static __always_inline unsigned long vmcs_readl(unsigned long field) +{ + vmcs_checkl(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_read64(field); + return __vmcs_readl(field); +} + +#define vmx_asm1(insn, op1, error_args...) \ +do { \ + asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \ + ".byte 0x2e\n\t" /* branch not taken hint */ \ + "jna %l[error]\n\t" \ + _ASM_EXTABLE(1b, %l[fault]) \ + : : op1 : "cc" : error, fault); \ + return; \ +error: \ + instrumentation_begin(); \ + insn##_error(error_args); \ + instrumentation_end(); \ + return; \ +fault: \ + kvm_spurious_fault(); \ +} while (0) + +#define vmx_asm2(insn, op1, op2, error_args...) \ +do { \ + asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \ + ".byte 0x2e\n\t" /* branch not taken hint */ \ + "jna %l[error]\n\t" \ + _ASM_EXTABLE(1b, %l[fault]) \ + : : op1, op2 : "cc" : error, fault); \ + return; \ +error: \ + instrumentation_begin(); \ + insn##_error(error_args); \ + instrumentation_end(); \ + return; \ +fault: \ + kvm_spurious_fault(); \ +} while (0) + +static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) +{ + vmx_asm2(vmwrite, "r"(field), "rm"(value), field, value); +} + +static __always_inline void vmcs_write16(unsigned long field, u16 value) +{ + vmcs_check16(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write16(field, value); + + __vmcs_writel(field, value); +} + +static __always_inline void vmcs_write32(unsigned long field, u32 value) +{ + vmcs_check32(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write32(field, value); + + __vmcs_writel(field, value); +} + +static __always_inline void vmcs_write64(unsigned long field, u64 value) +{ + vmcs_check64(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write64(field, value); + + __vmcs_writel(field, value); +#ifndef CONFIG_X86_64 + __vmcs_writel(field+1, value >> 32); +#endif +} + +static __always_inline void vmcs_writel(unsigned long field, unsigned long value) +{ + vmcs_checkl(field); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write64(field, value); + + __vmcs_writel(field, value); +} + +static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, + "vmcs_clear_bits does not support 64-bit fields"); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write32(field, evmcs_read32(field) & ~mask); + + __vmcs_writel(field, __vmcs_readl(field) & ~mask); +} + +static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, + "vmcs_set_bits does not support 64-bit fields"); + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_write32(field, evmcs_read32(field) | mask); + + __vmcs_writel(field, __vmcs_readl(field) | mask); +} + +static inline void vmcs_clear(struct vmcs *vmcs) +{ + u64 phys_addr = __pa(vmcs); + + vmx_asm1(vmclear, "m"(phys_addr), vmcs, phys_addr); +} + +static inline void vmcs_load(struct vmcs *vmcs) +{ + u64 phys_addr = __pa(vmcs); + + if (static_branch_unlikely(&enable_evmcs)) + return evmcs_load(phys_addr); + + vmx_asm1(vmptrld, "m"(phys_addr), vmcs, phys_addr); +} + +static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) +{ + struct { + u64 vpid : 16; + u64 rsvd : 48; + u64 gva; + } operand = { vpid, 0, gva }; + + vmx_asm2(invvpid, "r"(ext), "m"(operand), ext, vpid, gva); +} + +static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) +{ + struct { + u64 eptp, gpa; + } operand = {eptp, gpa}; + + vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp, gpa); +} + +static inline void vpid_sync_vcpu_single(int vpid) +{ + if (vpid == 0) + return; + + __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0); +} + +static inline void vpid_sync_vcpu_global(void) +{ + __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); +} + +static inline void vpid_sync_context(int vpid) +{ + if (cpu_has_vmx_invvpid_single()) + vpid_sync_vcpu_single(vpid); + else if (vpid != 0) + vpid_sync_vcpu_global(); +} + +static inline void vpid_sync_vcpu_addr(int vpid, gva_t addr) +{ + if (vpid == 0) + return; + + if (cpu_has_vmx_invvpid_individual_addr()) + __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr); + else + vpid_sync_context(vpid); +} + +static inline void ept_sync_global(void) +{ + __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); +} + +static inline void ept_sync_context(u64 eptp) +{ + if (cpu_has_vmx_invept_context()) + __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); + else + ept_sync_global(); +} + +#endif /* __KVM_X86_VMX_INSN_H */ |