summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm/x86.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-08-07 13:11:22 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-08-07 13:11:22 +0000
commitb20732900e4636a467c0183a47f7396700f5f743 (patch)
tree42f079ff82e701ebcb76829974b4caca3e5b6798 /arch/x86/kvm/x86.c
parentAdding upstream version 6.8.12. (diff)
downloadlinux-b20732900e4636a467c0183a47f7396700f5f743.tar.xz
linux-b20732900e4636a467c0183a47f7396700f5f743.zip
Adding upstream version 6.9.7.upstream/6.9.7
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r--arch/x86/kvm/x86.c237
1 files changed, 118 insertions, 119 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c84927216f..4dbd9d99f9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1399,22 +1399,19 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
}
EXPORT_SYMBOL_GPL(kvm_set_dr);
-void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr)
{
size_t size = ARRAY_SIZE(vcpu->arch.db);
switch (dr) {
case 0 ... 3:
- *val = vcpu->arch.db[array_index_nospec(dr, size)];
- break;
+ return vcpu->arch.db[array_index_nospec(dr, size)];
case 4:
case 6:
- *val = vcpu->arch.dr6;
- break;
+ return vcpu->arch.dr6;
case 5:
default: /* 7 */
- *val = vcpu->arch.dr7;
- break;
+ return vcpu->arch.dr7;
}
}
EXPORT_SYMBOL_GPL(kvm_get_dr);
@@ -2860,7 +2857,11 @@ static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
return v * clock->mult;
}
-static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
+/*
+ * As with get_kvmclock_base_ns(), this counts from boot time, at the
+ * frequency of CLOCK_MONOTONIC_RAW (hence adding gtod->offs_boot).
+ */
+static int do_kvmclock_base(s64 *t, u64 *tsc_timestamp)
{
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
unsigned long seq;
@@ -2879,6 +2880,29 @@ static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
return mode;
}
+/*
+ * This calculates CLOCK_MONOTONIC at the time of the TSC snapshot, with
+ * no boot time offset.
+ */
+static int do_monotonic(s64 *t, u64 *tsc_timestamp)
+{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ unsigned long seq;
+ int mode;
+ u64 ns;
+
+ do {
+ seq = read_seqcount_begin(&gtod->seq);
+ ns = gtod->clock.base_cycles;
+ ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
+ ns >>= gtod->clock.shift;
+ ns += ktime_to_ns(gtod->clock.offset);
+ } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+ *t = ns;
+
+ return mode;
+}
+
static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
{
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
@@ -2900,18 +2924,42 @@ static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
return mode;
}
-/* returns true if host is using TSC based clocksource */
+/*
+ * Calculates the kvmclock_base_ns (CLOCK_MONOTONIC_RAW + boot time) and
+ * reports the TSC value from which it did so. Returns true if host is
+ * using TSC based clocksource.
+ */
static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
{
/* checked again under seqlock below */
if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
return false;
- return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
- tsc_timestamp));
+ return gtod_is_based_on_tsc(do_kvmclock_base(kernel_ns,
+ tsc_timestamp));
+}
+
+/*
+ * Calculates CLOCK_MONOTONIC and reports the TSC value from which it did
+ * so. Returns true if host is using TSC based clocksource.
+ */
+bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
+{
+ /* checked again under seqlock below */
+ if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
+ return false;
+
+ return gtod_is_based_on_tsc(do_monotonic(kernel_ns,
+ tsc_timestamp));
}
-/* returns true if host is using TSC based clocksource */
+/*
+ * Calculates CLOCK_REALTIME and reports the TSC value from which it did
+ * so. Returns true if host is using TSC based clocksource.
+ *
+ * DO NOT USE this for anything related to migration. You want CLOCK_TAI
+ * for that.
+ */
static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
u64 *tsc_timestamp)
{
@@ -3158,7 +3206,7 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
guest_hv_clock->version = ++vcpu->hv_clock.version;
- mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+ kvm_gpc_mark_dirty_in_slot(gpc);
read_unlock_irqrestore(&gpc->lock, flags);
trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
@@ -4680,7 +4728,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
KVM_XEN_HVM_CONFIG_SHARED_INFO |
KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
- KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
+ KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE |
+ KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA;
if (sched_info_on())
r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
@@ -5064,8 +5113,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
int idx;
if (vcpu->preempted) {
- if (!vcpu->arch.guest_state_protected)
- vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
+ vcpu->arch.preempted_in_kernel = kvm_arch_vcpu_in_kernel(vcpu);
/*
* Take the srcu lock as memslots will be accessed to check the gfn
@@ -5512,18 +5560,23 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
struct kvm_debugregs *dbgregs)
{
- unsigned long val;
+ unsigned int i;
memset(dbgregs, 0, sizeof(*dbgregs));
- memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
- kvm_get_dr(vcpu, 6, &val);
- dbgregs->dr6 = val;
+
+ BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
+ dbgregs->db[i] = vcpu->arch.db[i];
+
+ dbgregs->dr6 = vcpu->arch.dr6;
dbgregs->dr7 = vcpu->arch.dr7;
}
static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
struct kvm_debugregs *dbgregs)
{
+ unsigned int i;
+
if (dbgregs->flags)
return -EINVAL;
@@ -5532,7 +5585,9 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
if (!kvm_dr7_valid(dbgregs->dr7))
return -EINVAL;
- memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
+ vcpu->arch.db[i] = dbgregs->db[i];
+
kvm_update_dr0123(vcpu);
vcpu->arch.dr6 = dbgregs->dr6;
vcpu->arch.dr7 = dbgregs->dr7;
@@ -8180,10 +8235,9 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
}
-static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
- unsigned long *dest)
+static unsigned long emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr)
{
- kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
+ return kvm_get_dr(emul_to_vcpu(ctxt), dr);
}
static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
@@ -8405,12 +8459,9 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
}
-static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
- u32 pmc)
+static int emulator_check_rdpmc_early(struct x86_emulate_ctxt *ctxt, u32 pmc)
{
- if (kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc))
- return 0;
- return -EINVAL;
+ return kvm_pmu_check_rdpmc_early(emul_to_vcpu(ctxt), pmc);
}
static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
@@ -8542,7 +8593,7 @@ static const struct x86_emulate_ops emulate_ops = {
.set_msr_with_filter = emulator_set_msr_with_filter,
.get_msr_with_filter = emulator_get_msr_with_filter,
.get_msr = emulator_get_msr,
- .check_pmc = emulator_check_pmc,
+ .check_rdpmc_early = emulator_check_rdpmc_early,
.read_pmc = emulator_read_pmc,
.halt = emulator_halt,
.wbinvd = emulator_wbinvd,
@@ -8803,31 +8854,24 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
kvm_release_pfn_clean(pfn);
- /* The instructions are well-emulated on direct mmu. */
- if (vcpu->arch.mmu->root_role.direct) {
- unsigned int indirect_shadow_pages;
-
- write_lock(&vcpu->kvm->mmu_lock);
- indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
- write_unlock(&vcpu->kvm->mmu_lock);
-
- if (indirect_shadow_pages)
- kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
-
- return true;
- }
-
/*
- * if emulation was due to access to shadowed page table
- * and it failed try to unshadow page and re-enter the
- * guest to let CPU execute the instruction.
+ * If emulation may have been triggered by a write to a shadowed page
+ * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
+ * guest to let the CPU re-execute the instruction in the hope that the
+ * CPU can cleanly execute the instruction that KVM failed to emulate.
*/
- kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+ if (vcpu->kvm->arch.indirect_shadow_pages)
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
/*
- * If the access faults on its page table, it can not
- * be fixed by unprotecting shadow page and it should
- * be reported to userspace.
+ * If the failed instruction faulted on an access to page tables that
+ * are used to translate any part of the instruction, KVM can't resolve
+ * the issue by unprotecting the gfn, as zapping the shadow page will
+ * result in the instruction taking a !PRESENT page fault and thus put
+ * the vCPU into an infinite loop of page faults. E.g. KVM will create
+ * a SPTE and write-protect the gfn to resolve the !PRESENT fault, and
+ * then zap the SPTE to unprotect the gfn, and then do it all over
+ * again. Report the error to userspace.
*/
return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
}
@@ -8922,7 +8966,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
if (unlikely(!r))
return 0;
- kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
+ kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED);
/*
* rflags is the old, "raw" value of the flags. The new value has
@@ -9235,9 +9279,9 @@ writeback:
*/
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
- kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
+ kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED);
if (ctxt->is_branch)
- kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+ kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED);
kvm_rip_write(vcpu, ctxt->eip);
if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
r = kvm_vcpu_do_singlestep(vcpu);
@@ -9648,11 +9692,13 @@ static void kvm_x86_check_cpu_compat(void *ret)
*(int *)ret = kvm_x86_check_processor_compatibility();
}
-static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
+int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
{
u64 host_pat;
int r, cpu;
+ guard(mutex)(&vendor_module_lock);
+
if (kvm_x86_ops.hardware_enable) {
pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
return -EEXIST;
@@ -9782,17 +9828,6 @@ out_free_x86_emulator_cache:
kmem_cache_destroy(x86_emulator_cache);
return r;
}
-
-int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
-{
- int r;
-
- mutex_lock(&vendor_module_lock);
- r = __kvm_x86_vendor_init(ops);
- mutex_unlock(&vendor_module_lock);
-
- return r;
-}
EXPORT_SYMBOL_GPL(kvm_x86_vendor_init);
void kvm_x86_vendor_exit(void)
@@ -10642,13 +10677,12 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
+
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
- else {
- static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
- if (ioapic_in_kernel(vcpu->kvm))
- kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
- }
+ else if (ioapic_in_kernel(vcpu->kvm))
+ kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
if (is_guest_mode(vcpu))
vcpu->arch.load_eoi_exitmap_pending = true;
@@ -10689,12 +10723,6 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
}
-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
-{
- smp_send_reschedule(vcpu->cpu);
-}
-EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
-
/*
* Called within kvm->srcu read side.
* Returns 1 to let vcpu_run() continue the guest execution loop without
@@ -10944,10 +10972,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto cancel_injection;
}
- if (req_immediate_exit) {
+ if (req_immediate_exit)
kvm_make_request(KVM_REQ_EVENT, vcpu);
- static_call(kvm_x86_request_immediate_exit)(vcpu);
- }
fpregs_assert_state_consistent();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
@@ -10978,7 +11004,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
(kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
- exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
+ exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
@@ -12065,7 +12091,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;
- kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN);
+ kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm);
if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -12076,27 +12102,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
if (r < 0)
return r;
- if (irqchip_in_kernel(vcpu->kvm)) {
- r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
- if (r < 0)
- goto fail_mmu_destroy;
-
- /*
- * Defer evaluating inhibits until the vCPU is first run, as
- * this vCPU will not get notified of any changes until this
- * vCPU is visible to other vCPUs (marked online and added to
- * the set of vCPUs). Opportunistically mark APICv active as
- * VMX in particularly is highly unlikely to have inhibits.
- * Ignore the current per-VM APICv state so that vCPU creation
- * is guaranteed to run with a deterministic value, the request
- * will ensure the vCPU gets the correct state before VM-Entry.
- */
- if (enable_apicv) {
- vcpu->arch.apic->apicv_active = true;
- kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
- }
- } else
- static_branch_inc(&kvm_has_noapic_vcpu);
+ r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
+ if (r < 0)
+ goto fail_mmu_destroy;
r = -ENOMEM;
@@ -12217,8 +12225,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
srcu_read_unlock(&vcpu->kvm->srcu, idx);
free_page((unsigned long)vcpu->arch.pio_data);
kvfree(vcpu->arch.cpuid_entries);
- if (!lapic_in_kernel(vcpu))
- static_branch_dec(&kvm_has_noapic_vcpu);
}
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -12495,9 +12501,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
}
-__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
-EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
-
void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -13100,11 +13103,13 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
- if (kvm_vcpu_apicv_active(vcpu) &&
- static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
- return true;
+ return kvm_vcpu_apicv_active(vcpu) &&
+ static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu);
+}
- return false;
+bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.preempted_in_kernel;
}
bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
@@ -13127,9 +13132,6 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_state_protected)
return true;
- if (vcpu != kvm_get_running_vcpu())
- return vcpu->arch.preempted_in_kernel;
-
return static_call(kvm_x86_get_cpl)(vcpu) == 0;
}
@@ -13924,9 +13926,6 @@ module_init(kvm_x86_init);
static void __exit kvm_x86_exit(void)
{
- /*
- * If module_init() is implemented, module_exit() must also be
- * implemented to allow module unload.
- */
+ WARN_ON_ONCE(static_branch_unlikely(&kvm_has_noapic_vcpu));
}
module_exit(kvm_x86_exit);