diff options
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/cpuid.c | 1 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 104 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 11 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 72 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 6 |
5 files changed, 141 insertions, 53 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0489ffc3d..768a765c4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -680,6 +680,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, g_phys_as = phys_as; entry->eax = g_phys_as | (virt_as << 8); + entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8)); entry->edx = 0; /* * IBRS, IBPB and VIRT_SSBD aren't necessarily present in diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 25bdd0696..0cb75d29e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -759,8 +759,7 @@ static int linearize(struct x86_emulate_ctxt *ctxt, ctxt->mode, linear); } -static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst, - enum x86emul_mode mode) +static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst) { ulong linear; int rc; @@ -770,41 +769,71 @@ static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst, if (ctxt->op_bytes != sizeof(unsigned long)) addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1); - rc = __linearize(ctxt, addr, &max_size, 1, false, true, mode, &linear); + rc = __linearize(ctxt, addr, &max_size, 1, false, true, ctxt->mode, &linear); if (rc == X86EMUL_CONTINUE) ctxt->_eip = addr.ea; return rc; } +static inline int emulator_recalc_and_set_mode(struct x86_emulate_ctxt *ctxt) +{ + u64 efer; + struct desc_struct cs; + u16 selector; + u32 base3; + + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + + if (!(ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PE)) { + /* Real mode. cpu must not have long mode active */ + if (efer & EFER_LMA) + return X86EMUL_UNHANDLEABLE; + ctxt->mode = X86EMUL_MODE_REAL; + return X86EMUL_CONTINUE; + } + + if (ctxt->eflags & X86_EFLAGS_VM) { + /* Protected/VM86 mode. cpu must not have long mode active */ + if (efer & EFER_LMA) + return X86EMUL_UNHANDLEABLE; + ctxt->mode = X86EMUL_MODE_VM86; + return X86EMUL_CONTINUE; + } + + if (!ctxt->ops->get_segment(ctxt, &selector, &cs, &base3, VCPU_SREG_CS)) + return X86EMUL_UNHANDLEABLE; + + if (efer & EFER_LMA) { + if (cs.l) { + /* Proper long mode */ + ctxt->mode = X86EMUL_MODE_PROT64; + } else if (cs.d) { + /* 32 bit compatibility mode*/ + ctxt->mode = X86EMUL_MODE_PROT32; + } else { + ctxt->mode = X86EMUL_MODE_PROT16; + } + } else { + /* Legacy 32 bit / 16 bit mode */ + ctxt->mode = cs.d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + } + + return X86EMUL_CONTINUE; +} + static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst) { - return assign_eip(ctxt, dst, ctxt->mode); + return assign_eip(ctxt, dst); } -static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst, - const struct desc_struct *cs_desc) +static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst) { - enum x86emul_mode mode = ctxt->mode; - int rc; + int rc = emulator_recalc_and_set_mode(ctxt); -#ifdef CONFIG_X86_64 - if (ctxt->mode >= X86EMUL_MODE_PROT16) { - if (cs_desc->l) { - u64 efer = 0; + if (rc != X86EMUL_CONTINUE) + return rc; - ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); - if (efer & EFER_LMA) - mode = X86EMUL_MODE_PROT64; - } else - mode = X86EMUL_MODE_PROT32; /* temporary value */ - } -#endif - if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32) - mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - rc = assign_eip(ctxt, dst, mode); - if (rc == X86EMUL_CONTINUE) - ctxt->mode = mode; - return rc; + return assign_eip(ctxt, dst); } static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) @@ -1975,7 +2004,7 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - if (ctxt->modrm_reg == VCPU_SREG_SS) + if (seg == VCPU_SREG_SS) ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; if (ctxt->op_bytes > 2) rsp_increment(ctxt, ctxt->op_bytes - 2); @@ -2192,7 +2221,7 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc); + rc = assign_eip_far(ctxt, ctxt->src.val); /* Error handling is not implemented. */ if (rc != X86EMUL_CONTINUE) return X86EMUL_UNHANDLEABLE; @@ -2273,7 +2302,7 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) &new_desc); if (rc != X86EMUL_CONTINUE) return rc; - rc = assign_eip_far(ctxt, eip, &new_desc); + rc = assign_eip_far(ctxt, eip); /* Error handling is not implemented. */ if (rc != X86EMUL_CONTINUE) return X86EMUL_UNHANDLEABLE; @@ -2895,6 +2924,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); ctxt->_eip = rdx; + ctxt->mode = usermode; *reg_write(ctxt, VCPU_REGS_RSP) = rcx; return X86EMUL_CONTINUE; @@ -3491,7 +3521,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc); + rc = assign_eip_far(ctxt, ctxt->src.val); if (rc != X86EMUL_CONTINUE) goto fail; @@ -3638,11 +3668,25 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt) static int em_cr_write(struct x86_emulate_ctxt *ctxt) { - if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) + int cr_num = ctxt->modrm_reg; + int r; + + if (ctxt->ops->set_cr(ctxt, cr_num, ctxt->src.val)) return emulate_gp(ctxt, 0); /* Disable writeback. */ ctxt->dst.type = OP_NONE; + + if (cr_num == 0) { + /* + * CR0 write might have updated CR0.PE and/or CR0.PG + * which can affect the cpu's execution mode. + */ + r = emulator_recalc_and_set_mode(ctxt); + if (r != X86EMUL_CONTINUE) + return r; + } + return X86EMUL_CONTINUE; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e1492a67e..cd3432df0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -50,6 +50,7 @@ #include <asm/kvm_para.h> #include <asm/irq_remapping.h> #include <asm/spec-ctrl.h> +#include <asm/cpu_device_id.h> #include <asm/virtext.h> #include "trace.h" @@ -4154,9 +4155,9 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr) msr->data = 0; switch (msr->index) { - case MSR_F10H_DECFG: - if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) - msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE; + case MSR_AMD64_DE_CFG: + if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) + msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; break; default: return 1; @@ -4258,7 +4259,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0x1E; } break; - case MSR_F10H_DECFG: + case MSR_AMD64_DE_CFG: msr_info->data = svm->msr_decfg; break; default: @@ -4445,7 +4446,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_VM_IGNNE: vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; - case MSR_F10H_DECFG: { + case MSR_AMD64_DE_CFG: { struct kvm_msr_entry msr_entry; msr_entry.index = msr->index; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 20ce79458..9bb696d73 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -41,6 +41,7 @@ #include <asm/asm.h> #include <asm/cpu.h> +#include <asm/cpu_device_id.h> #include <asm/io.h> #include <asm/desc.h> #include <asm/vmx.h> @@ -2132,9 +2133,9 @@ static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) if (!vmx->disable_fb_clear) return; - rdmsrl(MSR_IA32_MCU_OPT_CTRL, msr); + msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL); msr |= FB_CLEAR_DIS; - wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); + native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); /* Cache the MSR value to avoid reading it later */ vmx->msr_ia32_mcu_opt_ctrl = msr; } @@ -2145,7 +2146,7 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) return; vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; - wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); + native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); } static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) @@ -10759,10 +10760,35 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->hv_timer_armed = false; } +u64 __always_inline vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx) +{ + u64 guestval, hostval = this_cpu_read(x86_spec_ctrl_current); + + if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) + return 0; + + guestval = __rdmsr(MSR_IA32_SPEC_CTRL); + + /* + * + * For legacy IBRS, the IBRS bit always needs to be written after + * transitioning from a less privileged predictor mode, regardless of + * whether the guest/host values differ. + */ + if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || + guestval != hostval) + native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval); + + barrier_nospec(); + + return guestval; +} + static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4, evmcs_rsp; + u64 spec_ctrl; /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!enable_vnmi && @@ -10988,9 +11014,24 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) #endif ); - /* Eliminate branch target predictions from guest mode */ + /* + * IMPORTANT: RSB filling and SPEC_CTRL handling must be done before + * the first unbalanced RET after vmexit! + * + * For retpoline or IBRS, RSB filling is needed to prevent poisoned RSB + * entries and (in some cases) RSB underflow. + * + * eIBRS has its own protection against poisoned RSB, so it doesn't + * need the RSB filling sequence. But it does need to be enabled, and a + * single call to retire, before the first unbalanced RET. + * + * So no RETs before vmx_spec_ctrl_restore_host() below. + */ vmexit_fill_RSB(); + /* Save this for below */ + spec_ctrl = vmx_spec_ctrl_restore_host(vmx); + vmx_enable_fb_clear(vmx); /* @@ -11009,9 +11050,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) * save it. */ if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) - vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); - - x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); + vmx->spec_ctrl = spec_ctrl; /* All fields are clean at this point */ if (static_branch_unlikely(&enable_evmcs)) @@ -13276,14 +13315,6 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, */ vmcs12_save_pending_event(vcpu, vmcs12); } - - /* - * Drop what we picked up for L2 via vmx_complete_interrupts. It is - * preserved above and would only end up incorrectly in L1. - */ - vcpu->arch.nmi_injected = false; - kvm_clear_exception_queue(vcpu); - kvm_clear_interrupt_queue(vcpu); } /* @@ -13617,6 +13648,17 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); } + /* + * Drop events/exceptions that were queued for re-injection to L2 + * (picked up via vmx_complete_interrupts()), as well as exceptions + * that were pending for L2. Note, this must NOT be hoisted above + * prepare_vmcs12(), events/exceptions queued for re-injection need to + * be captured in vmcs12 (see vmcs12_save_pending_event()). + */ + vcpu->arch.nmi_injected = false; + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); vm_entry_controls_reset_shadow(vmx); vm_exit_controls_reset_shadow(vmx); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index be4697d91..8fe615df8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1156,7 +1156,7 @@ static u32 msr_based_features[] = { MSR_IA32_VMX_EPT_VPID_CAP, MSR_IA32_VMX_VMFUNC, - MSR_F10H_DECFG, + MSR_AMD64_DE_CFG, MSR_IA32_UCODE_REV, MSR_IA32_ARCH_CAPABILITIES, }; @@ -9736,9 +9736,9 @@ void kvm_arch_end_assignment(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_end_assignment); -bool kvm_arch_has_assigned_device(struct kvm *kvm) +bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm) { - return atomic_read(&kvm->arch.assigned_device_count); + return arch_atomic_read(&kvm->arch.assigned_device_count); } EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device); |