diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-08-07 13:11:40 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-08-07 13:11:40 +0000 |
commit | 8b0a8165cdad0f4133837d753649ef4682e42c3b (patch) | |
tree | 5c58f869f31ddb1f7bd6e8bdea269b680b36c5b6 /arch/x86/kvm/vmx | |
parent | Releasing progress-linux version 6.8.12-1~progress7.99u1. (diff) | |
download | linux-8b0a8165cdad0f4133837d753649ef4682e42c3b.tar.xz linux-8b0a8165cdad0f4133837d753649ef4682e42c3b.zip |
Merging upstream version 6.9.7.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'arch/x86/kvm/vmx')
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 4 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/pmu_intel.c | 206 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 188 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.h | 8 |
4 files changed, 198 insertions, 208 deletions
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 6329a30685..d05ddf7514 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3606,7 +3606,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); + kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) return nested_vmx_failInvalid(vcpu); @@ -4433,7 +4433,7 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) - kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); + vmcs12->guest_dr7 = vcpu->arch.dr7; if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) vmcs12->guest_ia32_efer = vcpu->arch.efer; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 600a021ae9..be40474de6 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -20,53 +20,19 @@ #include "nested.h" #include "pmu.h" -#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) - -enum intel_pmu_architectural_events { - /* - * The order of the architectural events matters as support for each - * event is enumerated via CPUID using the index of the event. - */ - INTEL_ARCH_CPU_CYCLES, - INTEL_ARCH_INSTRUCTIONS_RETIRED, - INTEL_ARCH_REFERENCE_CYCLES, - INTEL_ARCH_LLC_REFERENCES, - INTEL_ARCH_LLC_MISSES, - INTEL_ARCH_BRANCHES_RETIRED, - INTEL_ARCH_BRANCHES_MISPREDICTED, - - NR_REAL_INTEL_ARCH_EVENTS, - - /* - * Pseudo-architectural event used to implement IA32_FIXED_CTR2, a.k.a. - * TSC reference cycles. The architectural reference cycles event may - * or may not actually use the TSC as the reference, e.g. might use the - * core crystal clock or the bus clock (yeah, "architectural"). - */ - PSEUDO_ARCH_REFERENCE_CYCLES = NR_REAL_INTEL_ARCH_EVENTS, - NR_INTEL_ARCH_EVENTS, -}; +/* + * Perf's "BASE" is wildly misleading, architectural PMUs use bits 31:16 of ECX + * to encode the "type" of counter to read, i.e. this is not a "base". And to + * further confuse things, non-architectural PMUs use bit 31 as a flag for + * "fast" reads, whereas the "type" is an explicit value. + */ +#define INTEL_RDPMC_GP 0 +#define INTEL_RDPMC_FIXED INTEL_PMC_FIXED_RDPMC_BASE -static struct { - u8 eventsel; - u8 unit_mask; -} const intel_arch_events[] = { - [INTEL_ARCH_CPU_CYCLES] = { 0x3c, 0x00 }, - [INTEL_ARCH_INSTRUCTIONS_RETIRED] = { 0xc0, 0x00 }, - [INTEL_ARCH_REFERENCE_CYCLES] = { 0x3c, 0x01 }, - [INTEL_ARCH_LLC_REFERENCES] = { 0x2e, 0x4f }, - [INTEL_ARCH_LLC_MISSES] = { 0x2e, 0x41 }, - [INTEL_ARCH_BRANCHES_RETIRED] = { 0xc4, 0x00 }, - [INTEL_ARCH_BRANCHES_MISPREDICTED] = { 0xc5, 0x00 }, - [PSEUDO_ARCH_REFERENCE_CYCLES] = { 0x00, 0x03 }, -}; +#define INTEL_RDPMC_TYPE_MASK GENMASK(31, 16) +#define INTEL_RDPMC_INDEX_MASK GENMASK(15, 0) -/* mapping between fixed pmc index and intel_arch_events array */ -static int fixed_pmc_events[] = { - [0] = INTEL_ARCH_INSTRUCTIONS_RETIRED, - [1] = INTEL_ARCH_CPU_CYCLES, - [2] = PSEUDO_ARCH_REFERENCE_CYCLES, -}; +#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) { @@ -84,77 +50,61 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); - __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); + __set_bit(KVM_FIXED_PMC_BASE_IDX + i, pmu->pmc_in_use); kvm_pmu_request_counter_reprogram(pmc); } } -static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) -{ - if (pmc_idx < INTEL_PMC_IDX_FIXED) { - return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, - MSR_P6_EVNTSEL0); - } else { - u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; - - return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); - } -} - -static bool intel_hw_event_available(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; - u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; - int i; - - BUILD_BUG_ON(ARRAY_SIZE(intel_arch_events) != NR_INTEL_ARCH_EVENTS); - - /* - * Disallow events reported as unavailable in guest CPUID. Note, this - * doesn't apply to pseudo-architectural events. - */ - for (i = 0; i < NR_REAL_INTEL_ARCH_EVENTS; i++) { - if (intel_arch_events[i].eventsel != event_select || - intel_arch_events[i].unit_mask != unit_mask) - continue; - - return pmu->available_event_types & BIT(i); - } - - return true; -} - -static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - bool fixed = idx & (1u << 30); - - idx &= ~(3u << 30); - - return fixed ? idx < pmu->nr_arch_fixed_counters - : idx < pmu->nr_arch_gp_counters; -} - static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, unsigned int idx, u64 *mask) { + unsigned int type = idx & INTEL_RDPMC_TYPE_MASK; struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - bool fixed = idx & (1u << 30); struct kvm_pmc *counters; unsigned int num_counters; + u64 bitmask; + + /* + * The encoding of ECX for RDPMC is different for architectural versus + * non-architecturals PMUs (PMUs with version '0'). For architectural + * PMUs, bits 31:16 specify the PMC type and bits 15:0 specify the PMC + * index. For non-architectural PMUs, bit 31 is a "fast" flag, and + * bits 30:0 specify the PMC index. + * + * Yell and reject attempts to read PMCs for a non-architectural PMU, + * as KVM doesn't support such PMUs. + */ + if (WARN_ON_ONCE(!pmu->version)) + return NULL; - idx &= ~(3u << 30); - if (fixed) { + /* + * General Purpose (GP) PMCs are supported on all PMUs, and fixed PMCs + * are supported on all architectural PMUs, i.e. on all virtual PMUs + * supported by KVM. Note, KVM only emulates fixed PMCs for PMU v2+, + * but the type itself is still valid, i.e. let RDPMC fail due to + * accessing a non-existent counter. Reject attempts to read all other + * types, which are unknown/unsupported. + */ + switch (type) { + case INTEL_RDPMC_FIXED: counters = pmu->fixed_counters; num_counters = pmu->nr_arch_fixed_counters; - } else { + bitmask = pmu->counter_bitmask[KVM_PMC_FIXED]; + break; + case INTEL_RDPMC_GP: counters = pmu->gp_counters; num_counters = pmu->nr_arch_gp_counters; + bitmask = pmu->counter_bitmask[KVM_PMC_GP]; + break; + default: + return NULL; } + + idx &= INTEL_RDPMC_INDEX_MASK; if (idx >= num_counters) return NULL; - *mask &= pmu->counter_bitmask[fixed ? KVM_PMC_FIXED : KVM_PMC_GP]; + + *mask &= bitmask; return &counters[array_index_nospec(idx, num_counters)]; } @@ -464,20 +414,38 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } -static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu) +/* + * Map fixed counter events to architectural general purpose event encodings. + * Perf doesn't provide APIs to allow KVM to directly program a fixed counter, + * and so KVM instead programs the architectural event to effectively request + * the fixed counter. Perf isn't guaranteed to use a fixed counter and may + * instead program the encoding into a general purpose counter, e.g. if a + * different perf_event is already utilizing the requested counter, but the end + * result is the same (ignoring the fact that using a general purpose counter + * will likely exacerbate counter contention). + * + * Forcibly inlined to allow asserting on @index at build time, and there should + * never be more than one user. + */ +static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index) { - int i; - - BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_events) != KVM_PMC_MAX_FIXED); + const enum perf_hw_id fixed_pmc_perf_ids[] = { + [0] = PERF_COUNT_HW_INSTRUCTIONS, + [1] = PERF_COUNT_HW_CPU_CYCLES, + [2] = PERF_COUNT_HW_REF_CPU_CYCLES, + }; + u64 eventsel; - for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { - int index = array_index_nospec(i, KVM_PMC_MAX_FIXED); - struct kvm_pmc *pmc = &pmu->fixed_counters[index]; - u32 event = fixed_pmc_events[index]; + BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_PMC_MAX_FIXED); + BUILD_BUG_ON(index >= KVM_PMC_MAX_FIXED); - pmc->eventsel = (intel_arch_events[event].unit_mask << 8) | - intel_arch_events[event].eventsel; - } + /* + * Yell if perf reports support for a fixed counter but perf doesn't + * have a known encoding for the associated general purpose event. + */ + eventsel = perf_get_hw_event_config(fixed_pmc_perf_ids[index]); + WARN_ON_ONCE(!eventsel && index < kvm_pmu_cap.num_counters_fixed); + return eventsel; } static void intel_pmu_refresh(struct kvm_vcpu *vcpu) @@ -531,13 +499,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) kvm_pmu_cap.bit_width_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << edx.split.bit_width_fixed) - 1; - setup_fixed_pmc_eventsel(pmu); } for (i = 0; i < pmu->nr_arch_fixed_counters; i++) pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4)); counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | - (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED)); + (((1ull << pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX)); pmu->global_ctrl_mask = counter_mask; /* @@ -568,7 +535,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) perf_capabilities = vcpu_get_perf_capabilities(vcpu); if (cpuid_model_is_consistent(vcpu) && (perf_capabilities & PMU_CAP_LBR_FMT)) - x86_perf_get_lbr(&lbr_desc->records); + memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps)); else lbr_desc->records.nr = 0; @@ -581,7 +548,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { pmu->fixed_ctr_ctrl_mask &= - ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4)); + ~(1ULL << (KVM_FIXED_PMC_BASE_IDX + i * 4)); } pmu->pebs_data_cfg_mask = ~0xff00000full; } else { @@ -607,8 +574,9 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) for (i = 0; i < KVM_PMC_MAX_FIXED; i++) { pmu->fixed_counters[i].type = KVM_PMC_FIXED; pmu->fixed_counters[i].vcpu = vcpu; - pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; + pmu->fixed_counters[i].idx = i + KVM_FIXED_PMC_BASE_IDX; pmu->fixed_counters[i].current_config = 0; + pmu->fixed_counters[i].eventsel = intel_get_fixed_pmc_eventsel(i); } lbr_desc->records.nr = 0; @@ -736,11 +704,8 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) struct kvm_pmc *pmc = NULL; int bit, hw_idx; - for_each_set_bit(bit, (unsigned long *)&pmu->global_ctrl, - X86_PMC_IDX_MAX) { - pmc = intel_pmc_idx_to_pmc(pmu, bit); - - if (!pmc || !pmc_speculative_in_use(pmc) || + kvm_for_each_pmc(pmu, pmc, bit, (unsigned long *)&pmu->global_ctrl) { + if (!pmc_speculative_in_use(pmc) || !pmc_is_globally_enabled(pmc) || !pmc->perf_event) continue; @@ -755,11 +720,8 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) } struct kvm_pmu_ops intel_pmu_ops __initdata = { - .hw_event_available = intel_hw_event_available, - .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = intel_msr_idx_to_pmc, - .is_valid_rdpmc_ecx = intel_is_valid_rdpmc_ecx, .is_valid_msr = intel_is_valid_msr, .get_msr = intel_pmu_get_msr, .set_msr = intel_pmu_set_msr, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 784f2ecca5..22411f4aff 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -38,6 +38,7 @@ #include <asm/desc.h> #include <asm/fpu/api.h> #include <asm/fpu/xstate.h> +#include <asm/fred.h> #include <asm/idtentry.h> #include <asm/io.h> #include <asm/irq_remapping.h> @@ -49,6 +50,8 @@ #include <asm/spec-ctrl.h> #include <asm/vmx.h> +#include <trace/events/ipi.h> + #include "capabilities.h" #include "cpuid.h" #include "hyperv.h" @@ -159,7 +162,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); /* * List of MSRs that can be directly passed to the guest. - * In addition to these x2apic and PT MSRs are handled specially. + * In addition to these x2apic, PT and LBR MSRs are handled specially. */ static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { MSR_IA32_SPEC_CTRL, @@ -215,6 +218,8 @@ module_param(ple_window_max, uint, 0444); int __read_mostly pt_mode = PT_MODE_SYSTEM; module_param(pt_mode, int, S_IRUGO); +struct x86_pmu_lbr __ro_after_init vmx_lbr_caps; + static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); static DEFINE_MUTEX(vmx_l1d_flush_mutex); @@ -667,25 +672,14 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) return flexpriority_enabled && lapic_in_kernel(vcpu); } -static int possible_passthrough_msr_slot(u32 msr) -{ - u32 i; - - for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) - if (vmx_possible_passthrough_msrs[i] == msr) - return i; - - return -ENOENT; -} - -static bool is_valid_passthrough_msr(u32 msr) +static int vmx_get_passthrough_msr_slot(u32 msr) { - bool r; + int i; switch (msr) { case 0x800 ... 0x8ff: /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */ - return true; + return -ENOENT; case MSR_IA32_RTIT_STATUS: case MSR_IA32_RTIT_OUTPUT_BASE: case MSR_IA32_RTIT_OUTPUT_MASK: @@ -700,14 +694,16 @@ static bool is_valid_passthrough_msr(u32 msr) case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8: case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8: /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */ - return true; + return -ENOENT; } - r = possible_passthrough_msr_slot(msr) != -ENOENT; - - WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr); + for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { + if (vmx_possible_passthrough_msrs[i] == msr) + return i; + } - return r; + WARN(1, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr); + return -ENOENT; } struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) @@ -1290,8 +1286,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) u16 fs_sel, gs_sel; int i; - vmx->req_immediate_exit = false; - /* * Note that guest MSRs to be saved/restored can also be changed * when guest state is loaded. This happens when guest transitions @@ -3963,6 +3957,7 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + int idx; if (!cpu_has_vmx_msr_bitmap()) return; @@ -3972,16 +3967,13 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) /* * Mark the desired intercept state in shadow bitmap, this is needed * for resync when the MSR filters change. - */ - if (is_valid_passthrough_msr(msr)) { - int idx = possible_passthrough_msr_slot(msr); - - if (idx != -ENOENT) { - if (type & MSR_TYPE_R) - clear_bit(idx, vmx->shadow_msr_intercept.read); - if (type & MSR_TYPE_W) - clear_bit(idx, vmx->shadow_msr_intercept.write); - } + */ + idx = vmx_get_passthrough_msr_slot(msr); + if (idx >= 0) { + if (type & MSR_TYPE_R) + clear_bit(idx, vmx->shadow_msr_intercept.read); + if (type & MSR_TYPE_W) + clear_bit(idx, vmx->shadow_msr_intercept.write); } if ((type & MSR_TYPE_R) && @@ -4007,6 +3999,7 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + int idx; if (!cpu_has_vmx_msr_bitmap()) return; @@ -4016,16 +4009,13 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) /* * Mark the desired intercept state in shadow bitmap, this is needed * for resync when the MSR filter changes. - */ - if (is_valid_passthrough_msr(msr)) { - int idx = possible_passthrough_msr_slot(msr); - - if (idx != -ENOENT) { - if (type & MSR_TYPE_R) - set_bit(idx, vmx->shadow_msr_intercept.read); - if (type & MSR_TYPE_W) - set_bit(idx, vmx->shadow_msr_intercept.write); - } + */ + idx = vmx_get_passthrough_msr_slot(msr); + if (idx >= 0) { + if (type & MSR_TYPE_R) + set_bit(idx, vmx->shadow_msr_intercept.read); + if (type & MSR_TYPE_W) + set_bit(idx, vmx->shadow_msr_intercept.write); } if (type & MSR_TYPE_R) @@ -4136,6 +4126,9 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 i; + if (!cpu_has_vmx_msr_bitmap()) + return; + /* * Redo intercept permissions for MSRs that KVM is passing through to * the guest. Disabling interception will check the new MSR filter and @@ -5575,10 +5568,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) reg = DEBUG_REG_ACCESS_REG(exit_qualification); if (exit_qualification & TYPE_MOV_FROM_DR) { - unsigned long val; - - kvm_get_dr(vcpu, dr, &val); - kvm_register_write(vcpu, reg, val); + kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr)); err = 0; } else { err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); @@ -6000,22 +5990,46 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) return 1; } -static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu) +static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu, + bool force_immediate_exit) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!vmx->req_immediate_exit && - !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) { - kvm_lapic_expired_hv_timer(vcpu); + /* + * In the *extremely* unlikely scenario that this is a spurious VM-Exit + * due to the timer expiring while it was "soft" disabled, just eat the + * exit and re-enter the guest. + */ + if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) return EXIT_FASTPATH_REENTER_GUEST; - } - return EXIT_FASTPATH_NONE; + /* + * If the timer expired because KVM used it to force an immediate exit, + * then mission accomplished. + */ + if (force_immediate_exit) + return EXIT_FASTPATH_EXIT_HANDLED; + + /* + * If L2 is active, go down the slow path as emulating the guest timer + * expiration likely requires synthesizing a nested VM-Exit. + */ + if (is_guest_mode(vcpu)) + return EXIT_FASTPATH_NONE; + + kvm_lapic_expired_hv_timer(vcpu); + return EXIT_FASTPATH_REENTER_GUEST; } static int handle_preemption_timer(struct kvm_vcpu *vcpu) { - handle_fastpath_preemption_timer(vcpu); + /* + * This non-fastpath handler is reached if and only if the preemption + * timer was being used to emulate a guest timer while L2 is active. + * All other scenarios are supposed to be handled in the fastpath. + */ + WARN_ON_ONCE(!is_guest_mode(vcpu)); + kvm_lapic_expired_hv_timer(vcpu); return 1; } @@ -6518,7 +6532,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; vcpu->run->internal.data[0] = vectoring_info; vcpu->run->internal.data[1] = exit_reason.full; - vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; + vcpu->run->internal.data[2] = vmx_get_exit_qual(vcpu); if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) { vcpu->run->internal.data[ndata++] = vmcs_read64(GUEST_PHYSICAL_ADDRESS); @@ -6552,7 +6566,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (exit_reason.basic >= kvm_vmx_max_exit_handlers) goto unexpected_vmexit; -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE if (exit_reason.basic == EXIT_REASON_MSR_WRITE) return kvm_emulate_wrmsr(vcpu); else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER) @@ -6969,14 +6983,16 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) { u32 intr_info = vmx_get_intr_info(vcpu); unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; - gate_desc *desc = (gate_desc *)host_idt_base + vector; if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, "unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); - vmx_do_interrupt_irqoff(gate_offset(desc)); + if (cpu_feature_enabled(X86_FEATURE_FRED)) + fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector); + else + vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector)); kvm_after_interrupt(vcpu); vcpu->arch.at_instruction_boundary = true; @@ -7155,13 +7171,13 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host, false); } -static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) +static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tscl; u32 delta_tsc; - if (vmx->req_immediate_exit) { + if (force_immediate_exit) { vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0); vmx->loaded_vmcs->hv_timer_soft_disabled = false; } else if (vmx->hv_deadline_tsc != -1) { @@ -7214,13 +7230,22 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, barrier_nospec(); } -static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) +static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, + bool force_immediate_exit) { + /* + * If L2 is active, some VMX preemption timer exits can be handled in + * the fastpath even, all other exits must use the slow path. + */ + if (is_guest_mode(vcpu) && + to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER) + return EXIT_FASTPATH_NONE; + switch (to_vmx(vcpu)->exit_reason.basic) { case EXIT_REASON_MSR_WRITE: return handle_fastpath_set_msr_irqoff(vcpu); case EXIT_REASON_PREEMPTION_TIMER: - return handle_fastpath_preemption_timer(vcpu); + return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); default: return EXIT_FASTPATH_NONE; } @@ -7272,7 +7297,10 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI && is_nmi(vmx_get_intr_info(vcpu))) { kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); - vmx_do_nmi_irqoff(); + if (cpu_feature_enabled(X86_FEATURE_FRED)) + fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); + else + vmx_do_nmi_irqoff(); kvm_after_interrupt(vcpu); } @@ -7280,7 +7308,7 @@ out: guest_state_exit_irqoff(); } -static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) +static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; @@ -7307,7 +7335,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) return EXIT_FASTPATH_NONE; } - trace_kvm_entry(vcpu); + trace_kvm_entry(vcpu, force_immediate_exit); if (vmx->ple_window_dirty) { vmx->ple_window_dirty = false; @@ -7366,7 +7394,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_passthrough_lbr_msrs(vcpu); if (enable_preemption_timer) - vmx_update_hv_timer(vcpu); + vmx_update_hv_timer(vcpu, force_immediate_exit); + else if (force_immediate_exit) + smp_send_reschedule(vcpu->cpu); kvm_wait_lapic_expire(vcpu); @@ -7430,10 +7460,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); - if (is_guest_mode(vcpu)) - return EXIT_FASTPATH_NONE; - - return vmx_exit_handlers_fastpath(vcpu); + return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit); } static void vmx_vcpu_free(struct kvm_vcpu *vcpu) @@ -7837,10 +7864,9 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmx_update_exception_bitmap(vcpu); } -static u64 vmx_get_perf_capabilities(void) +static __init u64 vmx_get_perf_capabilities(void) { u64 perf_cap = PMU_CAP_FW_WRITES; - struct x86_pmu_lbr lbr; u64 host_perf_cap = 0; if (!enable_pmu) @@ -7850,8 +7876,16 @@ static u64 vmx_get_perf_capabilities(void) rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { - x86_perf_get_lbr(&lbr); - if (lbr.nr) + x86_perf_get_lbr(&vmx_lbr_caps); + + /* + * KVM requires LBR callstack support, as the overhead due to + * context switching LBRs without said support is too high. + * See intel_pmu_create_guest_lbr_event() for more info. + */ + if (!vmx_lbr_caps.has_callstack) + memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps)); + else if (vmx_lbr_caps.nr) perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; } @@ -7933,11 +7967,6 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); } -static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) -{ - to_vmx(vcpu)->req_immediate_exit = true; -} - static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, struct x86_instruction_info *info) { @@ -8390,8 +8419,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .check_intercept = vmx_check_intercept, .handle_exit_irqoff = vmx_handle_exit_irqoff, - .request_immediate_exit = vmx_request_immediate_exit, - .sched_in = vmx_sched_in, .cpu_dirty_log_size = PML_ENTITY_NUM, @@ -8651,7 +8678,6 @@ static __init int hardware_setup(void) if (!enable_preemption_timer) { vmx_x86_ops.set_hv_timer = NULL; vmx_x86_ops.cancel_hv_timer = NULL; - vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; } kvm_caps.supported_mce_cap |= MCG_LMCE_P; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index e3b0985bb7..90f9e44346 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -15,6 +15,7 @@ #include "vmx_ops.h" #include "../cpuid.h" #include "run_flags.h" +#include "../mmu.h" #define MSR_TYPE_R 1 #define MSR_TYPE_W 2 @@ -109,6 +110,8 @@ struct lbr_desc { bool msr_passthrough; }; +extern struct x86_pmu_lbr vmx_lbr_caps; + /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. @@ -332,8 +335,6 @@ struct vcpu_vmx { unsigned int ple_window; bool ple_window_dirty; - bool req_immediate_exit; - /* Support for PML */ #define PML_ENTITY_NUM 512 struct page *pml_pg; @@ -721,7 +722,8 @@ static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu) if (!enable_ept) return true; - return allow_smaller_maxphyaddr && cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits; + return allow_smaller_maxphyaddr && + cpuid_maxphyaddr(vcpu) < kvm_get_shadow_phys_bits(); } static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu) |