summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-18 17:35:05 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-18 17:39:31 +0000
commit85c675d0d09a45a135bddd15d7b385f8758c32fb (patch)
tree76267dbc9b9a130337be3640948fe397b04ac629 /arch/x86/kvm
parentAdding upstream version 6.6.15. (diff)
downloadlinux-85c675d0d09a45a135bddd15d7b385f8758c32fb.tar.xz
linux-85c675d0d09a45a135bddd15d7b385f8758c32fb.zip
Adding upstream version 6.7.7.upstream/6.7.7
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/Kconfig11
-rw-r--r--arch/x86/kvm/cpuid.c10
-rw-r--r--arch/x86/kvm/cpuid.h3
-rw-r--r--arch/x86/kvm/debugfs.c1
-rw-r--r--arch/x86/kvm/mmu.h7
-rw-r--r--arch/x86/kvm/mmu/mmu.c55
-rw-r--r--arch/x86/kvm/mtrr.c2
-rw-r--r--arch/x86/kvm/smm.c1
-rw-r--r--arch/x86/kvm/svm/sev.c19
-rw-r--r--arch/x86/kvm/svm/svm.c61
-rw-r--r--arch/x86/kvm/svm/svm.h2
-rw-r--r--arch/x86/kvm/svm/svm_ops.h6
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c2
-rw-r--r--arch/x86/kvm/vmx/vmx.c45
-rw-r--r--arch/x86/kvm/vmx/vmx_ops.h6
-rw-r--r--arch/x86/kvm/x86.c258
-rw-r--r--arch/x86/kvm/x86.h1
-rw-r--r--arch/x86/kvm/xen.c59
18 files changed, 387 insertions, 162 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ed90f14814..950c12868d 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -154,4 +154,15 @@ config KVM_PROVE_MMU
config KVM_EXTERNAL_WRITE_TRACKING
bool
+config KVM_MAX_NR_VCPUS
+ int "Maximum number of vCPUs per KVM guest"
+ depends on KVM
+ range 1024 4096
+ default 4096 if MAXSMP
+ default 1024
+ help
+ Set the maximum number of vCPUs per KVM guest. Larger values will increase
+ the memory footprint of each KVM guest, regardless of how many vCPUs are
+ created for a given VM.
+
endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 773132c3bf..dda6fc4cfa 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -448,7 +448,9 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
vcpu->arch.cpuid_nent = nent;
vcpu->arch.kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE);
+#ifdef CONFIG_KVM_XEN
vcpu->arch.xen.cpuid = kvm_get_hypervisor_cpuid(vcpu, XEN_SIGNATURE);
+#endif
kvm_vcpu_after_set_cpuid(vcpu);
return 0;
@@ -753,11 +755,13 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_mask(CPUID_8000_0021_EAX,
F(NO_NESTED_DATA_BP) | F(LFENCE_RDTSC) | 0 /* SmmPgCfgLock */ |
- F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */
+ F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ |
+ F(WRMSR_XX_BASE_NS)
);
- if (cpu_feature_enabled(X86_FEATURE_SRSO_NO))
- kvm_cpu_cap_set(X86_FEATURE_SRSO_NO);
+ kvm_cpu_cap_check_and_set(X86_FEATURE_SBPB);
+ kvm_cpu_cap_check_and_set(X86_FEATURE_IBPB_BRTYPE);
+ kvm_cpu_cap_check_and_set(X86_FEATURE_SRSO_NO);
kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX,
F(PERFMON_V2)
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 284fa47045..0b90532b6e 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -174,7 +174,8 @@ static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu)
static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu)
{
return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
- guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB));
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_SBPB));
}
static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index ee8c4c3496..eea6ea7f14 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -182,6 +182,7 @@ static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file)
}
static const struct file_operations mmu_rmaps_stat_fops = {
+ .owner = THIS_MODULE,
.open = kvm_mmu_rmaps_stat_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 253fb2093d..bb8c86eefa 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -237,6 +237,13 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return -(u32)fault & errcode;
}
+bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma);
+
+static inline bool kvm_mmu_honors_guest_mtrrs(struct kvm *kvm)
+{
+ return __kvm_mmu_honors_guest_mtrrs(kvm_arch_has_noncoherent_dma(kvm));
+}
+
void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index f7901cb4d2..c57e181bba 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3425,8 +3425,8 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_mmu_page *sp;
int ret = RET_PF_INVALID;
- u64 spte = 0ull;
- u64 *sptep = NULL;
+ u64 spte;
+ u64 *sptep;
uint retry_count = 0;
if (!page_fault_can_be_fast(fault))
@@ -3442,6 +3442,14 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
else
sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
+ /*
+ * It's entirely possible for the mapping to have been zapped
+ * by a different task, but the root page should always be
+ * available as the vCPU holds a reference to its root(s).
+ */
+ if (WARN_ON_ONCE(!sptep))
+ spte = REMOVED_SPTE;
+
if (!is_shadow_present_pte(spte))
break;
@@ -4479,21 +4487,28 @@ out_unlock:
}
#endif
-int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma)
{
/*
- * If the guest's MTRRs may be used to compute the "real" memtype,
- * restrict the mapping level to ensure KVM uses a consistent memtype
- * across the entire mapping. If the host MTRRs are ignored by TDP
- * (shadow_memtype_mask is non-zero), and the VM has non-coherent DMA
- * (DMA doesn't snoop CPU caches), KVM's ABI is to honor the memtype
- * from the guest's MTRRs so that guest accesses to memory that is
- * DMA'd aren't cached against the guest's wishes.
+ * If host MTRRs are ignored (shadow_memtype_mask is non-zero), and the
+ * VM has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is
+ * to honor the memtype from the guest's MTRRs so that guest accesses
+ * to memory that is DMA'd aren't cached against the guest's wishes.
*
* Note, KVM may still ultimately ignore guest MTRRs for certain PFNs,
* e.g. KVM will force UC memtype for host MMIO.
*/
- if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
+ return vm_has_noncoherent_dma && shadow_memtype_mask;
+}
+
+int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ /*
+ * If the guest's MTRRs may be used to compute the "real" memtype,
+ * restrict the mapping level to ensure KVM uses a consistent memtype
+ * across the entire mapping.
+ */
+ if (kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) {
for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
gfn_t base = gfn_round_for_level(fault->gfn,
@@ -6785,11 +6800,7 @@ static unsigned long mmu_shrink_count(struct shrinker *shrink,
return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
}
-static struct shrinker mmu_shrinker = {
- .count_objects = mmu_shrink_count,
- .scan_objects = mmu_shrink_scan,
- .seeks = DEFAULT_SEEKS * 10,
-};
+static struct shrinker *mmu_shrinker;
static void mmu_destroy_caches(void)
{
@@ -6922,10 +6933,16 @@ int kvm_mmu_vendor_module_init(void)
if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
goto out;
- ret = register_shrinker(&mmu_shrinker, "x86-mmu");
- if (ret)
+ mmu_shrinker = shrinker_alloc(0, "x86-mmu");
+ if (!mmu_shrinker)
goto out_shrinker;
+ mmu_shrinker->count_objects = mmu_shrink_count;
+ mmu_shrinker->scan_objects = mmu_shrink_scan;
+ mmu_shrinker->seeks = DEFAULT_SEEKS * 10;
+
+ shrinker_register(mmu_shrinker);
+
return 0;
out_shrinker:
@@ -6947,7 +6964,7 @@ void kvm_mmu_vendor_module_exit(void)
{
mmu_destroy_caches();
percpu_counter_destroy(&kvm_total_used_mmu_pages);
- unregister_shrinker(&mmu_shrinker);
+ shrinker_free(mmu_shrinker);
}
/*
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index 3eb6e7f47e..a67c28a564 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -320,7 +320,7 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
gfn_t start, end;
- if (!tdp_enabled || !kvm_arch_has_noncoherent_dma(vcpu->kvm))
+ if (!kvm_mmu_honors_guest_mtrrs(vcpu->kvm))
return;
if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType)
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index b42111a24c..dc3d95fdca 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -324,7 +324,6 @@ void enter_smm(struct kvm_vcpu *vcpu)
cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
static_call(kvm_x86_set_cr0)(vcpu, cr0);
- vcpu->arch.cr0 = cr0;
static_call(kvm_x86_set_cr4)(vcpu, 0);
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 4900c07804..6ee925d666 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2972,6 +2972,25 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm)
set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux);
}
+
+ /*
+ * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if
+ * the host/guest supports its use.
+ *
+ * guest_can_use() checks a number of requirements on the host/guest to
+ * ensure that MSR_IA32_XSS is available, but it might report true even
+ * if X86_FEATURE_XSAVES isn't configured in the guest to ensure host
+ * MSR_IA32_XSS is always properly restored. For SEV-ES, it is better
+ * to further check that the guest CPUID actually supports
+ * X86_FEATURE_XSAVES so that accesses to MSR_IA32_XSS by misbehaved
+ * guests will still get intercepted and caught in the normal
+ * kvm_emulate_rdmsr()/kvm_emulated_wrmsr() paths.
+ */
+ if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 1, 1);
+ else
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 0, 0);
}
void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 77f1eeefcd..a8bd4e909a 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -103,6 +103,7 @@ static const struct svm_direct_access_msrs {
{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
{ .index = MSR_IA32_LASTINTFROMIP, .always = false },
{ .index = MSR_IA32_LASTINTTOIP, .always = false },
+ { .index = MSR_IA32_XSS, .always = false },
{ .index = MSR_EFER, .always = false },
{ .index = MSR_IA32_CR_PAT, .always = false },
{ .index = MSR_AMD64_SEV_ES_GHCB, .always = true },
@@ -199,7 +200,7 @@ module_param_named(npt, npt_enabled, bool, 0444);
/* allow nested virtualization in KVM/SVM */
static int nested = true;
-module_param(nested, int, S_IRUGO);
+module_param(nested, int, 0444);
/* enable/disable Next RIP Save */
int nrips = true;
@@ -364,8 +365,6 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
}
-static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
- void *insn, int insn_len);
static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
bool commit_side_effects)
@@ -386,14 +385,6 @@ static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
}
if (!svm->next_rip) {
- /*
- * FIXME: Drop this when kvm_emulate_instruction() does the
- * right thing and treats "can't emulate" as outright failure
- * for EMULTYPE_SKIP.
- */
- if (!svm_can_emulate_instruction(vcpu, EMULTYPE_SKIP, NULL, 0))
- return 0;
-
if (unlikely(!commit_side_effects))
old_rflags = svm->vmcb->save.rflags;
@@ -531,8 +522,6 @@ static bool __kvm_is_svm_supported(void)
int cpu = smp_processor_id();
struct cpuinfo_x86 *c = &cpu_data(cpu);
- u64 vm_cr;
-
if (c->x86_vendor != X86_VENDOR_AMD &&
c->x86_vendor != X86_VENDOR_HYGON) {
pr_err("CPU %d isn't AMD or Hygon\n", cpu);
@@ -549,12 +538,6 @@ static bool __kvm_is_svm_supported(void)
return false;
}
- rdmsrl(MSR_VM_CR, vm_cr);
- if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) {
- pr_err("SVM disabled (by BIOS) in MSR_VM_CR on CPU %d\n", cpu);
- return false;
- }
-
return true;
}
@@ -2204,12 +2187,6 @@ static int shutdown_interception(struct kvm_vcpu *vcpu)
struct kvm_run *kvm_run = vcpu->run;
struct vcpu_svm *svm = to_svm(vcpu);
- /*
- * The VM save area has already been encrypted so it
- * cannot be reinitialized - just terminate.
- */
- if (sev_es_guest(vcpu->kvm))
- return -EINVAL;
/*
* VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put
@@ -2218,9 +2195,14 @@ static int shutdown_interception(struct kvm_vcpu *vcpu)
* userspace. At a platform view, INIT is acceptable behavior as
* there exist bare metal platforms that automatically INIT the CPU
* in response to shutdown.
+ *
+ * The VM save area for SEV-ES guests has already been encrypted so it
+ * cannot be reinitialized, i.e. synthesizing INIT is futile.
*/
- clear_page(svm->vmcb);
- kvm_vcpu_reset(vcpu, true);
+ if (!sev_es_guest(vcpu->kvm)) {
+ clear_page(svm->vmcb);
+ kvm_vcpu_reset(vcpu, true);
+ }
kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
return 0;
@@ -4729,15 +4711,15 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
}
#endif
-static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
- void *insn, int insn_len)
+static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
{
bool smep, smap, is_user;
u64 error_code;
/* Emulation is always possible when KVM has access to all guest state. */
if (!sev_guest(vcpu->kvm))
- return true;
+ return X86EMUL_CONTINUE;
/* #UD and #GP should never be intercepted for SEV guests. */
WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
@@ -4749,14 +4731,14 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
* to guest register state.
*/
if (sev_es_guest(vcpu->kvm))
- return false;
+ return X86EMUL_RETRY_INSTR;
/*
* Emulation is possible if the instruction is already decoded, e.g.
* when completing I/O after returning from userspace.
*/
if (emul_type & EMULTYPE_NO_DECODE)
- return true;
+ return X86EMUL_CONTINUE;
/*
* Emulation is possible for SEV guests if and only if a prefilled
@@ -4782,9 +4764,11 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
* success (and in practice it will work the vast majority of the time).
*/
if (unlikely(!insn)) {
- if (!(emul_type & EMULTYPE_SKIP))
- kvm_queue_exception(vcpu, UD_VECTOR);
- return false;
+ if (emul_type & EMULTYPE_SKIP)
+ return X86EMUL_UNHANDLEABLE;
+
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return X86EMUL_PROPAGATE_FAULT;
}
/*
@@ -4795,7 +4779,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
* table used to translate CS:RIP resides in emulated MMIO.
*/
if (likely(insn_len))
- return true;
+ return X86EMUL_CONTINUE;
/*
* Detect and workaround Errata 1096 Fam_17h_00_0Fh.
@@ -4853,6 +4837,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
kvm_inject_gp(vcpu, 0);
else
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return X86EMUL_PROPAGATE_FAULT;
}
resume_guest:
@@ -4870,7 +4855,7 @@ resume_guest:
* doesn't explicitly define "ignored", i.e. doing nothing and letting
* the guest spin is technically "ignoring" the access.
*/
- return false;
+ return X86EMUL_RETRY_INSTR;
}
static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
@@ -5030,7 +5015,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
.vm_move_enc_context_from = sev_vm_move_enc_context_from,
- .can_emulate_instruction = svm_can_emulate_instruction,
+ .check_emulate_instruction = svm_check_emulate_instruction,
.apic_init_signal_blocked = svm_apic_init_signal_blocked,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index be67ab7fdd..c409f934c3 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -30,7 +30,7 @@
#define IOPM_SIZE PAGE_SIZE * 3
#define MSRPM_SIZE PAGE_SIZE * 2
-#define MAX_DIRECT_ACCESS_MSRS 46
+#define MAX_DIRECT_ACCESS_MSRS 47
#define MSRPM_OFFSETS 32
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h
index 36c8af87a7..4e725854c6 100644
--- a/arch/x86/kvm/svm/svm_ops.h
+++ b/arch/x86/kvm/svm/svm_ops.h
@@ -8,7 +8,7 @@
#define svm_asm(insn, clobber...) \
do { \
- asm_volatile_goto("1: " __stringify(insn) "\n\t" \
+ asm goto("1: " __stringify(insn) "\n\t" \
_ASM_EXTABLE(1b, %l[fault]) \
::: clobber : fault); \
return; \
@@ -18,7 +18,7 @@ fault: \
#define svm_asm1(insn, op1, clobber...) \
do { \
- asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \
+ asm goto("1: " __stringify(insn) " %0\n\t" \
_ASM_EXTABLE(1b, %l[fault]) \
:: op1 : clobber : fault); \
return; \
@@ -28,7 +28,7 @@ fault: \
#define svm_asm2(insn, op1, op2, clobber...) \
do { \
- asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \
+ asm goto("1: " __stringify(insn) " %1, %0\n\t" \
_ASM_EXTABLE(1b, %l[fault]) \
:: op1, op2 : clobber : fault); \
return; \
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 90c1f7f07e..1549461fa4 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -71,7 +71,7 @@ static int fixed_pmc_events[] = {
static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
{
struct kvm_pmc *pmc;
- u8 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl;
+ u64 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl;
int i;
pmu->fixed_ctr_ctrl = data;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 9bba535258..94082169bb 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -82,28 +82,28 @@ bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);
static bool __read_mostly enable_vnmi = 1;
-module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
+module_param_named(vnmi, enable_vnmi, bool, 0444);
bool __read_mostly flexpriority_enabled = 1;
-module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
+module_param_named(flexpriority, flexpriority_enabled, bool, 0444);
bool __read_mostly enable_ept = 1;
-module_param_named(ept, enable_ept, bool, S_IRUGO);
+module_param_named(ept, enable_ept, bool, 0444);
bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
- enable_unrestricted_guest, bool, S_IRUGO);
+ enable_unrestricted_guest, bool, 0444);
bool __read_mostly enable_ept_ad_bits = 1;
-module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
+module_param_named(eptad, enable_ept_ad_bits, bool, 0444);
static bool __read_mostly emulate_invalid_guest_state = true;
-module_param(emulate_invalid_guest_state, bool, S_IRUGO);
+module_param(emulate_invalid_guest_state, bool, 0444);
static bool __read_mostly fasteoi = 1;
-module_param(fasteoi, bool, S_IRUGO);
+module_param(fasteoi, bool, 0444);
-module_param(enable_apicv, bool, S_IRUGO);
+module_param(enable_apicv, bool, 0444);
bool __read_mostly enable_ipiv = true;
module_param(enable_ipiv, bool, 0444);
@@ -114,10 +114,10 @@ module_param(enable_ipiv, bool, 0444);
* use VMX instructions.
*/
static bool __read_mostly nested = 1;
-module_param(nested, bool, S_IRUGO);
+module_param(nested, bool, 0444);
bool __read_mostly enable_pml = 1;
-module_param_named(pml, enable_pml, bool, S_IRUGO);
+module_param_named(pml, enable_pml, bool, 0444);
static bool __read_mostly error_on_inconsistent_vmcs_config = true;
module_param(error_on_inconsistent_vmcs_config, bool, 0444);
@@ -745,7 +745,7 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
*/
static int kvm_cpu_vmxoff(void)
{
- asm_volatile_goto("1: vmxoff\n\t"
+ asm goto("1: vmxoff\n\t"
_ASM_EXTABLE(1b, %l[fault])
::: "cc", "memory" : fault);
@@ -1657,8 +1657,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
return 0;
}
-static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
- void *insn, int insn_len)
+static int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
{
/*
* Emulation of instructions in SGX enclaves is impossible as RIP does
@@ -1669,9 +1669,9 @@ static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
*/
if (to_vmx(vcpu)->exit_reason.enclave_mode) {
kvm_queue_exception(vcpu, UD_VECTOR);
- return false;
+ return X86EMUL_PROPAGATE_FAULT;
}
- return true;
+ return X86EMUL_CONTINUE;
}
static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
@@ -2789,7 +2789,7 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer)
cr4_set_bits(X86_CR4_VMXE);
- asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
+ asm goto("1: vmxon %[vmxon_pointer]\n\t"
_ASM_EXTABLE(1b, %l[fault])
: : [vmxon_pointer] "m"(vmxon_pointer)
: : fault);
@@ -5792,7 +5792,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
{
gpa_t gpa;
- if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
+ if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
return 1;
/*
@@ -7579,8 +7579,6 @@ static int vmx_vm_init(struct kvm *kvm)
static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
- u8 cache;
-
/* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
* memory aliases with conflicting memory types and sometimes MCEs.
* We have to be careful as to what are honored and when.
@@ -7607,11 +7605,10 @@ static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
- cache = MTRR_TYPE_WRBACK;
+ return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT;
else
- cache = MTRR_TYPE_UNCACHABLE;
-
- return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
+ return (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT) |
+ VMX_EPT_IPAT_BIT;
}
return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
@@ -8341,7 +8338,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.enable_smi_window = vmx_enable_smi_window,
#endif
- .can_emulate_instruction = vmx_can_emulate_instruction,
+ .check_emulate_instruction = vmx_check_emulate_instruction,
.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
.migrate_timers = vmx_migrate_timers,
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index 33af7b4c6e..6a0c6e81f7 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -94,7 +94,7 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
- asm_volatile_goto("1: vmread %[field], %[output]\n\t"
+ asm_goto_output("1: vmread %[field], %[output]\n\t"
"jna %l[do_fail]\n\t"
_ASM_EXTABLE(1b, %l[do_exception])
@@ -188,7 +188,7 @@ static __always_inline unsigned long vmcs_readl(unsigned long field)
#define vmx_asm1(insn, op1, error_args...) \
do { \
- asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \
+ asm goto("1: " __stringify(insn) " %0\n\t" \
".byte 0x2e\n\t" /* branch not taken hint */ \
"jna %l[error]\n\t" \
_ASM_EXTABLE(1b, %l[fault]) \
@@ -205,7 +205,7 @@ fault: \
#define vmx_asm2(insn, op1, op2, error_args...) \
do { \
- asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \
+ asm goto("1: " __stringify(insn) " %1, %0\n\t" \
".byte 0x2e\n\t" /* branch not taken hint */ \
"jna %l[error]\n\t" \
_ASM_EXTABLE(1b, %l[fault]) \
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e179db7c17..468870450b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -145,21 +145,21 @@ EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
static bool __read_mostly ignore_msrs = 0;
-module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
+module_param(ignore_msrs, bool, 0644);
bool __read_mostly report_ignored_msrs = true;
-module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
+module_param(report_ignored_msrs, bool, 0644);
EXPORT_SYMBOL_GPL(report_ignored_msrs);
unsigned int min_timer_period_us = 200;
-module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
+module_param(min_timer_period_us, uint, 0644);
static bool __read_mostly kvmclock_periodic_sync = true;
-module_param(kvmclock_periodic_sync, bool, S_IRUGO);
+module_param(kvmclock_periodic_sync, bool, 0444);
/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
-module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+module_param(tsc_tolerance_ppm, uint, 0644);
/*
* lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables
@@ -168,13 +168,13 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
* tuning, i.e. allows privileged userspace to set an exact advancement time.
*/
static int __read_mostly lapic_timer_advance_ns = -1;
-module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
+module_param(lapic_timer_advance_ns, int, 0644);
static bool __read_mostly vector_hashing = true;
-module_param(vector_hashing, bool, S_IRUGO);
+module_param(vector_hashing, bool, 0444);
bool __read_mostly enable_vmware_backdoor = false;
-module_param(enable_vmware_backdoor, bool, S_IRUGO);
+module_param(enable_vmware_backdoor, bool, 0444);
EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
/*
@@ -186,7 +186,7 @@ static int __read_mostly force_emulation_prefix;
module_param(force_emulation_prefix, int, 0644);
int __read_mostly pi_inject_timer = -1;
-module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
+module_param(pi_inject_timer, bint, 0644);
/* Enable/disable PMU virtualization */
bool __read_mostly enable_pmu = true;
@@ -962,7 +962,7 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon
kvm_mmu_reset_context(vcpu);
if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
- kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
+ kvm_mmu_honors_guest_mtrrs(vcpu->kvm) &&
!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
}
@@ -2331,14 +2331,9 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_o
if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
return;
- /*
- * The guest calculates current wall clock time by adding
- * system time (updated by kvm_guest_time_update below) to the
- * wall clock specified here. We do the reverse here.
- */
- wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+ wall_nsec = kvm_get_wall_clock_epoch(kvm);
- wc.nsec = do_div(wall_nsec, 1000000000);
+ wc.nsec = do_div(wall_nsec, NSEC_PER_SEC);
wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
wc.version = version;
@@ -2714,8 +2709,9 @@ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
kvm_track_tsc_matching(vcpu);
}
-static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
+static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value)
{
+ u64 data = user_value ? *user_value : 0;
struct kvm *kvm = vcpu->kvm;
u64 offset, ns, elapsed;
unsigned long flags;
@@ -2730,25 +2726,37 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
if (vcpu->arch.virtual_tsc_khz) {
if (data == 0) {
/*
- * detection of vcpu initialization -- need to sync
- * with other vCPUs. This particularly helps to keep
- * kvm_clock stable after CPU hotplug
+ * Force synchronization when creating a vCPU, or when
+ * userspace explicitly writes a zero value.
*/
synchronizing = true;
- } else {
+ } else if (kvm->arch.user_set_tsc) {
u64 tsc_exp = kvm->arch.last_tsc_write +
nsec_to_cycles(vcpu, elapsed);
u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
/*
- * Special case: TSC write with a small delta (1 second)
- * of virtual cycle time against real time is
- * interpreted as an attempt to synchronize the CPU.
+ * Here lies UAPI baggage: when a user-initiated TSC write has
+ * a small delta (1 second) of virtual cycle time against the
+ * previously set vCPU, we assume that they were intended to be
+ * in sync and the delta was only due to the racy nature of the
+ * legacy API.
+ *
+ * This trick falls down when restoring a guest which genuinely
+ * has been running for less time than the 1 second of imprecision
+ * which we allow for in the legacy API. In this case, the first
+ * value written by userspace (on any vCPU) should not be subject
+ * to this 'correction' to make it sync up with values that only
+ * come from the kernel's default vCPU creation. Make the 1-second
+ * slop hack only trigger if the user_set_tsc flag is already set.
*/
synchronizing = data < tsc_exp + tsc_hz &&
data + tsc_hz > tsc_exp;
}
}
+ if (user_value)
+ kvm->arch.user_set_tsc = true;
+
/*
* For a reliable TSC, we can match TSC offsets, and for an unstable
* TSC, we add elapsed time in this computation. We could let the
@@ -3232,16 +3240,94 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
if (vcpu->pv_time.active)
kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
+#ifdef CONFIG_KVM_XEN
if (vcpu->xen.vcpu_info_cache.active)
kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
offsetof(struct compat_vcpu_info, time));
if (vcpu->xen.vcpu_time_info_cache.active)
kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
+#endif
kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0;
}
/*
+ * The pvclock_wall_clock ABI tells the guest the wall clock time at
+ * which it started (i.e. its epoch, when its kvmclock was zero).
+ *
+ * In fact those clocks are subtly different; wall clock frequency is
+ * adjusted by NTP and has leap seconds, while the kvmclock is a
+ * simple function of the TSC without any such adjustment.
+ *
+ * Perhaps the ABI should have exposed CLOCK_TAI and a ratio between
+ * that and kvmclock, but even that would be subject to change over
+ * time.
+ *
+ * Attempt to calculate the epoch at a given moment using the *same*
+ * TSC reading via kvm_get_walltime_and_clockread() to obtain both
+ * wallclock and kvmclock times, and subtracting one from the other.
+ *
+ * Fall back to using their values at slightly different moments by
+ * calling ktime_get_real_ns() and get_kvmclock_ns() separately.
+ */
+uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+ struct pvclock_vcpu_time_info hv_clock;
+ struct kvm_arch *ka = &kvm->arch;
+ unsigned long seq, local_tsc_khz;
+ struct timespec64 ts;
+ uint64_t host_tsc;
+
+ do {
+ seq = read_seqcount_begin(&ka->pvclock_sc);
+
+ local_tsc_khz = 0;
+ if (!ka->use_master_clock)
+ break;
+
+ /*
+ * The TSC read and the call to get_cpu_tsc_khz() must happen
+ * on the same CPU.
+ */
+ get_cpu();
+
+ local_tsc_khz = get_cpu_tsc_khz();
+
+ if (local_tsc_khz &&
+ !kvm_get_walltime_and_clockread(&ts, &host_tsc))
+ local_tsc_khz = 0; /* Fall back to old method */
+
+ put_cpu();
+
+ /*
+ * These values must be snapshotted within the seqcount loop.
+ * After that, it's just mathematics which can happen on any
+ * CPU at any time.
+ */
+ hv_clock.tsc_timestamp = ka->master_cycle_now;
+ hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
+
+ } while (read_seqcount_retry(&ka->pvclock_sc, seq));
+
+ /*
+ * If the conditions were right, and obtaining the wallclock+TSC was
+ * successful, calculate the KVM clock at the corresponding time and
+ * subtract one from the other to get the guest's epoch in nanoseconds
+ * since 1970-01-01.
+ */
+ if (local_tsc_khz) {
+ kvm_get_time_scale(NSEC_PER_SEC, local_tsc_khz * NSEC_PER_USEC,
+ &hv_clock.tsc_shift,
+ &hv_clock.tsc_to_system_mul);
+ return ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec -
+ __pvclock_read_cycles(&hv_clock, host_tsc);
+ }
+#endif
+ return ktime_get_real_ns() - get_kvmclock_ns(kvm);
+}
+
+/*
* kvmclock updates which are isolated to a given vcpu, such as
* vcpu->cpu migration, should not allow system_timestamp from
* the rest of the vcpus to remain static. Otherwise ntp frequency
@@ -3290,9 +3376,6 @@ static void kvmclock_sync_fn(struct work_struct *work)
kvmclock_sync_work);
struct kvm *kvm = container_of(ka, struct kvm, arch);
- if (!kvmclock_periodic_sync)
- return;
-
schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
KVMCLOCK_SYNC_PERIOD);
@@ -3671,17 +3754,36 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.perf_capabilities = data;
kvm_pmu_refresh(vcpu);
break;
- case MSR_IA32_PRED_CMD:
- if (!msr_info->host_initiated && !guest_has_pred_cmd_msr(vcpu))
- return 1;
+ case MSR_IA32_PRED_CMD: {
+ u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB);
+
+ if (!msr_info->host_initiated) {
+ if ((!guest_has_pred_cmd_msr(vcpu)))
+ return 1;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
+ reserved_bits |= PRED_CMD_IBPB;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SBPB))
+ reserved_bits |= PRED_CMD_SBPB;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_IBPB))
+ reserved_bits |= PRED_CMD_IBPB;
- if (!boot_cpu_has(X86_FEATURE_IBPB) || (data & ~PRED_CMD_IBPB))
+ if (!boot_cpu_has(X86_FEATURE_SBPB))
+ reserved_bits |= PRED_CMD_SBPB;
+
+ if (data & reserved_bits)
return 1;
+
if (!data)
break;
- wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+ wrmsrl(MSR_IA32_PRED_CMD, data);
break;
+ }
case MSR_IA32_FLUSH_CMD:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
@@ -3701,13 +3803,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
data &= ~(u64)0x100; /* ignore ignne emulation enable */
data &= ~(u64)0x8; /* ignore TLB cache disable */
- /* Handle McStatusWrEn */
- if (data == BIT_ULL(18)) {
- vcpu->arch.msr_hwcr = data;
- } else if (data != 0) {
+ /*
+ * Allow McStatusWrEn and TscFreqSel. (Linux guests from v3.2
+ * through at least v6.6 whine if TscFreqSel is clear,
+ * depending on F/M/S.
+ */
+ if (data & ~(BIT_ULL(18) | BIT_ULL(24))) {
kvm_pr_unimpl_wrmsr(vcpu, msr, data);
return 1;
}
+ vcpu->arch.msr_hwcr = data;
break;
case MSR_FAM10H_MMIO_CONF_BASE:
if (data != 0) {
@@ -3778,7 +3883,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_TSC:
if (msr_info->host_initiated) {
- kvm_synchronize_tsc(vcpu, data);
+ kvm_synchronize_tsc(vcpu, &data);
} else {
u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
adjust_tsc_offset_guest(vcpu, adj);
@@ -5300,7 +5405,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
vcpu->arch.nmi_pending = 0;
atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending);
- kvm_make_request(KVM_REQ_NMI, vcpu);
+ if (events->nmi.pending)
+ kvm_make_request(KVM_REQ_NMI, vcpu);
}
static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
@@ -5413,8 +5519,8 @@ static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
- return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
- sizeof(guest_xsave->region));
+ kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
+ sizeof(guest_xsave->region));
}
static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
@@ -5549,6 +5655,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
ns = get_kvmclock_base_ns();
+ kvm->arch.user_set_tsc = true;
__kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
@@ -6261,6 +6368,9 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
struct kvm_vcpu *vcpu;
unsigned long i;
+ if (!kvm_x86_ops.cpu_dirty_log_size)
+ return;
+
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_vcpu_kick(vcpu);
}
@@ -7487,11 +7597,11 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
}
EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
-static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
- void *insn, int insn_len)
+static int kvm_check_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
{
- return static_call(kvm_x86_can_emulate_instruction)(vcpu, emul_type,
- insn, insn_len);
+ return static_call(kvm_x86_check_emulate_instruction)(vcpu, emul_type,
+ insn, insn_len);
}
int handle_ud(struct kvm_vcpu *vcpu)
@@ -7501,8 +7611,10 @@ int handle_ud(struct kvm_vcpu *vcpu)
int emul_type = EMULTYPE_TRAP_UD;
char sig[5]; /* ud2; .ascii "kvm" */
struct x86_exception e;
+ int r;
- if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0)))
+ r = kvm_check_emulate_insn(vcpu, emul_type, NULL, 0);
+ if (r != X86EMUL_CONTINUE)
return 1;
if (fep_flags &&
@@ -8884,8 +8996,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
bool writeback = true;
- if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len)))
- return 1;
+ r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len);
+ if (r != X86EMUL_CONTINUE) {
+ if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT)
+ return 1;
+
+ WARN_ON_ONCE(r != X86EMUL_UNHANDLEABLE);
+ return handle_emulation_failure(vcpu, emulation_type);
+ }
vcpu->arch.l1tf_flush_l1d = true;
@@ -10589,16 +10707,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
record_steal_time(vcpu);
+ if (kvm_check_request(KVM_REQ_PMU, vcpu))
+ kvm_pmu_handle_event(vcpu);
+ if (kvm_check_request(KVM_REQ_PMI, vcpu))
+ kvm_pmu_deliver_pmi(vcpu);
#ifdef CONFIG_KVM_SMM
if (kvm_check_request(KVM_REQ_SMI, vcpu))
process_smi(vcpu);
#endif
if (kvm_check_request(KVM_REQ_NMI, vcpu))
process_nmi(vcpu);
- if (kvm_check_request(KVM_REQ_PMU, vcpu))
- kvm_pmu_handle_event(vcpu);
- if (kvm_check_request(KVM_REQ_PMI, vcpu))
- kvm_pmu_deliver_pmi(vcpu);
if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
if (test_bit(vcpu->arch.pending_ioapic_eoi,
@@ -11534,7 +11652,6 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
*mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
- vcpu->arch.cr0 = sregs->cr0;
*mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
@@ -11578,8 +11695,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
if (ret)
return ret;
- if (mmu_reset_needed)
+ if (mmu_reset_needed) {
kvm_mmu_reset_context(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ }
max_bits = KVM_NR_INTERRUPTS;
pending_vec = find_first_bit(
@@ -11620,8 +11739,10 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
mmu_reset_needed = 1;
vcpu->arch.pdptrs_from_userspace = true;
}
- if (mmu_reset_needed)
+ if (mmu_reset_needed) {
kvm_mmu_reset_context(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ }
return 0;
}
@@ -11972,7 +12093,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
if (mutex_lock_killable(&vcpu->mutex))
return;
vcpu_load(vcpu);
- kvm_synchronize_tsc(vcpu, 0);
+ kvm_synchronize_tsc(vcpu, NULL);
vcpu_put(vcpu);
/* poll control enabled by default */
@@ -12328,7 +12449,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
goto out_uninit_mmu;
INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
- INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);
/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -12912,7 +13032,10 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_state_protected)
return true;
- return vcpu->arch.preempted_in_kernel;
+ if (vcpu != kvm_get_running_vcpu())
+ return vcpu->arch.preempted_in_kernel;
+
+ return static_call(kvm_x86_get_cpl)(vcpu) == 0;
}
unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
@@ -13204,15 +13327,30 @@ bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
+static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
+{
+ /*
+ * Non-coherent DMA assignment and de-assignment will affect
+ * whether KVM honors guest MTRRs and cause changes in memtypes
+ * in TDP.
+ * So, pass %true unconditionally to indicate non-coherent DMA was,
+ * or will be involved, and that zapping SPTEs might be necessary.
+ */
+ if (__kvm_mmu_honors_guest_mtrrs(true))
+ kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
+}
+
void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
{
- atomic_inc(&kvm->arch.noncoherent_dma_count);
+ if (atomic_inc_return(&kvm->arch.noncoherent_dma_count) == 1)
+ kvm_noncoherent_dma_assignment_start_or_stop(kvm);
}
EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
{
- atomic_dec(&kvm->arch.noncoherent_dma_count);
+ if (!atomic_dec_return(&kvm->arch.noncoherent_dma_count))
+ kvm_noncoherent_dma_assignment_start_or_stop(kvm);
}
EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 1e7be1f6ab..5184fde1dc 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -293,6 +293,7 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
u64 get_kvmclock_ns(struct kvm *kvm);
+uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm);
int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
gva_t addr, void *val, unsigned int bytes,
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 40edf4d197..e53fad915a 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -59,7 +59,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
* This code mirrors kvm_write_wall_clock() except that it writes
* directly through the pfn cache and doesn't mark the page dirty.
*/
- wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+ wall_nsec = kvm_get_wall_clock_epoch(kvm);
/* It could be invalid again already, so we need to check */
read_lock_irq(&gpc->lock);
@@ -98,7 +98,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
wc_version = wc->version = (wc->version + 1) | 1;
smp_wmb();
- wc->nsec = do_div(wall_nsec, 1000000000);
+ wc->nsec = do_div(wall_nsec, NSEC_PER_SEC);
wc->sec = (u32)wall_nsec;
*wc_sec_hi = wall_nsec >> 32;
smp_wmb();
@@ -134,9 +134,23 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
{
struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu,
arch.xen.timer);
+ struct kvm_xen_evtchn e;
+ int rc;
+
if (atomic_read(&vcpu->arch.xen.timer_pending))
return HRTIMER_NORESTART;
+ e.vcpu_id = vcpu->vcpu_id;
+ e.vcpu_idx = vcpu->vcpu_idx;
+ e.port = vcpu->arch.xen.timer_virq;
+ e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+ rc = kvm_xen_set_evtchn_fast(&e, vcpu->kvm);
+ if (rc != -EWOULDBLOCK) {
+ vcpu->arch.xen.timer_expires = 0;
+ return HRTIMER_NORESTART;
+ }
+
atomic_inc(&vcpu->arch.xen.timer_pending);
kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
kvm_vcpu_kick(vcpu);
@@ -146,6 +160,14 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns)
{
+ /*
+ * Avoid races with the old timer firing. Checking timer_expires
+ * to avoid calling hrtimer_cancel() will only have false positives
+ * so is fine.
+ */
+ if (vcpu->arch.xen.timer_expires)
+ hrtimer_cancel(&vcpu->arch.xen.timer);
+
atomic_set(&vcpu->arch.xen.timer_pending, 0);
vcpu->arch.xen.timer_expires = guest_abs;
@@ -1019,9 +1041,36 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
+ /*
+ * Ensure a consistent snapshot of state is captured, with a
+ * timer either being pending, or the event channel delivered
+ * to the corresponding bit in the shared_info. Not still
+ * lurking in the timer_pending flag for deferred delivery.
+ * Purely as an optimisation, if the timer_expires field is
+ * zero, that means the timer isn't active (or even in the
+ * timer_pending flag) and there is no need to cancel it.
+ */
+ if (vcpu->arch.xen.timer_expires) {
+ hrtimer_cancel(&vcpu->arch.xen.timer);
+ kvm_xen_inject_timer_irqs(vcpu);
+ }
+
data->u.timer.port = vcpu->arch.xen.timer_virq;
data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
data->u.timer.expires_ns = vcpu->arch.xen.timer_expires;
+
+ /*
+ * The hrtimer may trigger and raise the IRQ immediately,
+ * while the returned state causes it to be set up and
+ * raised again on the destination system after migration.
+ * That's fine, as the guest won't even have had a chance
+ * to run and handle the interrupt. Asserting an already
+ * pending event channel is idempotent.
+ */
+ if (vcpu->arch.xen.timer_expires)
+ hrtimer_start_expires(&vcpu->arch.xen.timer,
+ HRTIMER_MODE_ABS_HARD);
+
r = 0;
break;
@@ -1374,12 +1423,8 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
return true;
}
+ /* A delta <= 0 results in an immediate callback, which is what we want */
delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm);
- if ((oneshot.flags & VCPU_SSHOTTMR_future) && delta < 0) {
- *r = -ETIME;
- return true;
- }
-
kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta);
*r = 0;
return true;