diff options
Diffstat (limited to '')
-rw-r--r-- | arch/x86/hyperv/Makefile | 7 | ||||
-rw-r--r-- | arch/x86/hyperv/hv_apic.c | 294 | ||||
-rw-r--r-- | arch/x86/hyperv/hv_init.c | 561 | ||||
-rw-r--r-- | arch/x86/hyperv/hv_spinlock.c | 88 | ||||
-rw-r--r-- | arch/x86/hyperv/mmu.c | 244 | ||||
-rw-r--r-- | arch/x86/hyperv/nested.c | 136 |
6 files changed, 1330 insertions, 0 deletions
diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile new file mode 100644 index 000000000..89b1f74d3 --- /dev/null +++ b/arch/x86/hyperv/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-y := hv_init.o mmu.o nested.o +obj-$(CONFIG_X86_64) += hv_apic.o + +ifdef CONFIG_X86_64 +obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o +endif diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c new file mode 100644 index 000000000..284e73661 --- /dev/null +++ b/arch/x86/hyperv/hv_apic.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Hyper-V specific APIC code. + * + * Copyright (C) 2018, Microsoft, Inc. + * + * Author : K. Y. Srinivasan <kys@microsoft.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + */ + +#include <linux/types.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> +#include <linux/clockchips.h> +#include <linux/hyperv.h> +#include <linux/slab.h> +#include <linux/cpuhotplug.h> +#include <asm/hypervisor.h> +#include <asm/mshyperv.h> +#include <asm/apic.h> + +#include <asm/trace/hyperv.h> + +static struct apic orig_apic; + +static u64 hv_apic_icr_read(void) +{ + u64 reg_val; + + rdmsrl(HV_X64_MSR_ICR, reg_val); + return reg_val; +} + +static void hv_apic_icr_write(u32 low, u32 id) +{ + u64 reg_val; + + reg_val = SET_APIC_DEST_FIELD(id); + reg_val = reg_val << 32; + reg_val |= low; + + wrmsrl(HV_X64_MSR_ICR, reg_val); +} + +static u32 hv_apic_read(u32 reg) +{ + u32 reg_val, hi; + + switch (reg) { + case APIC_EOI: + rdmsr(HV_X64_MSR_EOI, reg_val, hi); + return reg_val; + case APIC_TASKPRI: + rdmsr(HV_X64_MSR_TPR, reg_val, hi); + return reg_val; + + default: + return native_apic_mem_read(reg); + } +} + +static void hv_apic_write(u32 reg, u32 val) +{ + switch (reg) { + case APIC_EOI: + wrmsr(HV_X64_MSR_EOI, val, 0); + break; + case APIC_TASKPRI: + wrmsr(HV_X64_MSR_TPR, val, 0); + break; + default: + native_apic_mem_write(reg, val); + } +} + +static void hv_apic_eoi_write(u32 reg, u32 val) +{ + struct hv_vp_assist_page *hvp = hv_vp_assist_page[smp_processor_id()]; + + if (hvp && (xchg(&hvp->apic_assist, 0) & 0x1)) + return; + + wrmsr(HV_X64_MSR_EOI, val, 0); +} + +/* + * IPI implementation on Hyper-V. + */ +static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector) +{ + struct hv_send_ipi_ex **arg; + struct hv_send_ipi_ex *ipi_arg; + unsigned long flags; + int nr_bank = 0; + int ret = 1; + + if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) + return false; + + local_irq_save(flags); + arg = (struct hv_send_ipi_ex **)this_cpu_ptr(hyperv_pcpu_input_arg); + + ipi_arg = *arg; + if (unlikely(!ipi_arg)) + goto ipi_mask_ex_done; + + ipi_arg->vector = vector; + ipi_arg->reserved = 0; + ipi_arg->vp_set.valid_bank_mask = 0; + + if (!cpumask_equal(mask, cpu_present_mask)) { + ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K; + nr_bank = cpumask_to_vpset(&(ipi_arg->vp_set), mask); + } + if (nr_bank < 0) + goto ipi_mask_ex_done; + if (!nr_bank) + ipi_arg->vp_set.format = HV_GENERIC_SET_ALL; + + ret = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank, + ipi_arg, NULL); + +ipi_mask_ex_done: + local_irq_restore(flags); + return ((ret == 0) ? true : false); +} + +static bool __send_ipi_mask(const struct cpumask *mask, int vector) +{ + int cur_cpu, vcpu; + struct hv_send_ipi ipi_arg; + int ret = 1; + + trace_hyperv_send_ipi_mask(mask, vector); + + if (cpumask_empty(mask)) + return true; + + if (!hv_hypercall_pg) + return false; + + if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) + return false; + + /* + * From the supplied CPU set we need to figure out if we can get away + * with cheaper HVCALL_SEND_IPI hypercall. This is possible when the + * highest VP number in the set is < 64. As VP numbers are usually in + * ascending order and match Linux CPU ids, here is an optimization: + * we check the VP number for the highest bit in the supplied set first + * so we can quickly find out if using HVCALL_SEND_IPI_EX hypercall is + * a must. We will also check all VP numbers when walking the supplied + * CPU set to remain correct in all cases. + */ + if (hv_cpu_number_to_vp_number(cpumask_last(mask)) >= 64) + goto do_ex_hypercall; + + ipi_arg.vector = vector; + ipi_arg.cpu_mask = 0; + + for_each_cpu(cur_cpu, mask) { + vcpu = hv_cpu_number_to_vp_number(cur_cpu); + if (vcpu == VP_INVAL) + return false; + + /* + * This particular version of the IPI hypercall can + * only target upto 64 CPUs. + */ + if (vcpu >= 64) + goto do_ex_hypercall; + + __set_bit(vcpu, (unsigned long *)&ipi_arg.cpu_mask); + } + + ret = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector, + ipi_arg.cpu_mask); + return ((ret == 0) ? true : false); + +do_ex_hypercall: + return __send_ipi_mask_ex(mask, vector); +} + +static bool __send_ipi_one(int cpu, int vector) +{ + int vp = hv_cpu_number_to_vp_number(cpu); + + trace_hyperv_send_ipi_one(cpu, vector); + + if (!hv_hypercall_pg || (vp == VP_INVAL)) + return false; + + if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) + return false; + + if (vp >= 64) + return __send_ipi_mask_ex(cpumask_of(cpu), vector); + + return !hv_do_fast_hypercall16(HVCALL_SEND_IPI, vector, BIT_ULL(vp)); +} + +static void hv_send_ipi(int cpu, int vector) +{ + if (!__send_ipi_one(cpu, vector)) + orig_apic.send_IPI(cpu, vector); +} + +static void hv_send_ipi_mask(const struct cpumask *mask, int vector) +{ + if (!__send_ipi_mask(mask, vector)) + orig_apic.send_IPI_mask(mask, vector); +} + +static void hv_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) +{ + unsigned int this_cpu = smp_processor_id(); + struct cpumask new_mask; + const struct cpumask *local_mask; + + cpumask_copy(&new_mask, mask); + cpumask_clear_cpu(this_cpu, &new_mask); + local_mask = &new_mask; + if (!__send_ipi_mask(local_mask, vector)) + orig_apic.send_IPI_mask_allbutself(mask, vector); +} + +static void hv_send_ipi_allbutself(int vector) +{ + hv_send_ipi_mask_allbutself(cpu_online_mask, vector); +} + +static void hv_send_ipi_all(int vector) +{ + if (!__send_ipi_mask(cpu_online_mask, vector)) + orig_apic.send_IPI_all(vector); +} + +static void hv_send_ipi_self(int vector) +{ + if (!__send_ipi_one(smp_processor_id(), vector)) + orig_apic.send_IPI_self(vector); +} + +void __init hv_apic_init(void) +{ + if (ms_hyperv.hints & HV_X64_CLUSTER_IPI_RECOMMENDED) { + pr_info("Hyper-V: Using IPI hypercalls\n"); + /* + * Set the IPI entry points. + */ + orig_apic = *apic; + + apic->send_IPI = hv_send_ipi; + apic->send_IPI_mask = hv_send_ipi_mask; + apic->send_IPI_mask_allbutself = hv_send_ipi_mask_allbutself; + apic->send_IPI_allbutself = hv_send_ipi_allbutself; + apic->send_IPI_all = hv_send_ipi_all; + apic->send_IPI_self = hv_send_ipi_self; + } + + if (ms_hyperv.hints & HV_X64_APIC_ACCESS_RECOMMENDED) { + pr_info("Hyper-V: Using enlightened APIC (%s mode)", + x2apic_enabled() ? "x2apic" : "xapic"); + /* + * When in x2apic mode, don't use the Hyper-V specific APIC + * accessors since the field layout in the ICR register is + * different in x2apic mode. Furthermore, the architectural + * x2apic MSRs function just as well as the Hyper-V + * synthetic APIC MSRs, so there's no benefit in having + * separate Hyper-V accessors for x2apic mode. The only + * exception is hv_apic_eoi_write, because it benefits from + * lazy EOI when available, but the same accessor works for + * both xapic and x2apic because the field layout is the same. + */ + apic_set_eoi_write(hv_apic_eoi_write); + if (!x2apic_enabled()) { + apic->read = hv_apic_read; + apic->write = hv_apic_write; + apic->icr_write = hv_apic_icr_write; + apic->icr_read = hv_apic_icr_read; + } + } +} diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c new file mode 100644 index 000000000..70fd21ebb --- /dev/null +++ b/arch/x86/hyperv/hv_init.c @@ -0,0 +1,561 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * X86 specific Hyper-V initialization code. + * + * Copyright (C) 2016, Microsoft, Inc. + * + * Author : K. Y. Srinivasan <kys@microsoft.com> + */ + +#include <linux/acpi.h> +#include <linux/efi.h> +#include <linux/types.h> +#include <asm/apic.h> +#include <asm/desc.h> +#include <asm/hypervisor.h> +#include <asm/hyperv-tlfs.h> +#include <asm/mshyperv.h> +#include <asm/idtentry.h> +#include <linux/kexec.h> +#include <linux/version.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> +#include <linux/hyperv.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/cpuhotplug.h> +#include <linux/syscore_ops.h> +#include <clocksource/hyperv_timer.h> + +int hyperv_init_cpuhp; + +void *hv_hypercall_pg; +EXPORT_SYMBOL_GPL(hv_hypercall_pg); + +/* Storage to save the hypercall page temporarily for hibernation */ +static void *hv_hypercall_pg_saved; + +u32 *hv_vp_index; +EXPORT_SYMBOL_GPL(hv_vp_index); + +struct hv_vp_assist_page **hv_vp_assist_page; +EXPORT_SYMBOL_GPL(hv_vp_assist_page); + +void __percpu **hyperv_pcpu_input_arg; +EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg); + +u32 hv_max_vp_index; +EXPORT_SYMBOL_GPL(hv_max_vp_index); + +void *hv_alloc_hyperv_page(void) +{ + BUILD_BUG_ON(PAGE_SIZE != HV_HYP_PAGE_SIZE); + + return (void *)__get_free_page(GFP_KERNEL); +} +EXPORT_SYMBOL_GPL(hv_alloc_hyperv_page); + +void *hv_alloc_hyperv_zeroed_page(void) +{ + BUILD_BUG_ON(PAGE_SIZE != HV_HYP_PAGE_SIZE); + + return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); +} +EXPORT_SYMBOL_GPL(hv_alloc_hyperv_zeroed_page); + +void hv_free_hyperv_page(unsigned long addr) +{ + free_page(addr); +} +EXPORT_SYMBOL_GPL(hv_free_hyperv_page); + +static int hv_cpu_init(unsigned int cpu) +{ + u64 msr_vp_index; + struct hv_vp_assist_page **hvp = &hv_vp_assist_page[smp_processor_id()]; + void **input_arg; + struct page *pg; + + input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); + /* hv_cpu_init() can be called with IRQs disabled from hv_resume() */ + pg = alloc_page(irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL); + if (unlikely(!pg)) + return -ENOMEM; + *input_arg = page_address(pg); + + hv_get_vp_index(msr_vp_index); + + hv_vp_index[smp_processor_id()] = msr_vp_index; + + if (msr_vp_index > hv_max_vp_index) + hv_max_vp_index = msr_vp_index; + + if (!hv_vp_assist_page) + return 0; + + /* + * The VP ASSIST PAGE is an "overlay" page (see Hyper-V TLFS's Section + * 5.2.1 "GPA Overlay Pages"). Here it must be zeroed out to make sure + * we always write the EOI MSR in hv_apic_eoi_write() *after* the + * EOI optimization is disabled in hv_cpu_die(), otherwise a CPU may + * not be stopped in the case of CPU offlining and the VM will hang. + */ + if (!*hvp) { + *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO); + } + + if (*hvp) { + u64 val; + + val = vmalloc_to_pfn(*hvp); + val = (val << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) | + HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; + + wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, val); + } + + return 0; +} + +static void (*hv_reenlightenment_cb)(void); + +static void hv_reenlightenment_notify(struct work_struct *dummy) +{ + struct hv_tsc_emulation_status emu_status; + + rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + + /* Don't issue the callback if TSC accesses are not emulated */ + if (hv_reenlightenment_cb && emu_status.inprogress) + hv_reenlightenment_cb(); +} +static DECLARE_DELAYED_WORK(hv_reenlightenment_work, hv_reenlightenment_notify); + +void hyperv_stop_tsc_emulation(void) +{ + u64 freq; + struct hv_tsc_emulation_status emu_status; + + rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + emu_status.inprogress = 0; + wrmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + + rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); + tsc_khz = div64_u64(freq, 1000); +} +EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation); + +static inline bool hv_reenlightenment_available(void) +{ + /* + * Check for required features and priviliges to make TSC frequency + * change notifications work. + */ + return ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && + ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE && + ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT; +} + +DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_reenlightenment) +{ + ack_APIC_irq(); + inc_irq_stat(irq_hv_reenlightenment_count); + schedule_delayed_work(&hv_reenlightenment_work, HZ/10); +} + +void set_hv_tscchange_cb(void (*cb)(void)) +{ + struct hv_reenlightenment_control re_ctrl = { + .vector = HYPERV_REENLIGHTENMENT_VECTOR, + .enabled = 1, + }; + struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1}; + + if (!hv_reenlightenment_available()) { + pr_warn("Hyper-V: reenlightenment support is unavailable\n"); + return; + } + + if (!hv_vp_index) + return; + + hv_reenlightenment_cb = cb; + + /* Make sure callback is registered before we write to MSRs */ + wmb(); + + re_ctrl.target_vp = hv_vp_index[get_cpu()]; + + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl)); + + put_cpu(); +} +EXPORT_SYMBOL_GPL(set_hv_tscchange_cb); + +void clear_hv_tscchange_cb(void) +{ + struct hv_reenlightenment_control re_ctrl; + + if (!hv_reenlightenment_available()) + return; + + rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); + re_ctrl.enabled = 0; + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); + + hv_reenlightenment_cb = NULL; +} +EXPORT_SYMBOL_GPL(clear_hv_tscchange_cb); + +static int hv_cpu_die(unsigned int cpu) +{ + struct hv_reenlightenment_control re_ctrl; + unsigned int new_cpu; + unsigned long flags; + void **input_arg; + void *input_pg = NULL; + + local_irq_save(flags); + input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); + input_pg = *input_arg; + *input_arg = NULL; + local_irq_restore(flags); + free_page((unsigned long)input_pg); + + if (hv_vp_assist_page && hv_vp_assist_page[cpu]) + wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0); + + if (hv_reenlightenment_cb == NULL) + return 0; + + rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + if (re_ctrl.target_vp == hv_vp_index[cpu]) { + /* + * Reassign reenlightenment notifications to some other online + * CPU or just disable the feature if there are no online CPUs + * left (happens on hibernation). + */ + new_cpu = cpumask_any_but(cpu_online_mask, cpu); + + if (new_cpu < nr_cpu_ids) + re_ctrl.target_vp = hv_vp_index[new_cpu]; + else + re_ctrl.enabled = 0; + + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + } + + return 0; +} + +static int __init hv_pci_init(void) +{ + int gen2vm = efi_enabled(EFI_BOOT); + + /* + * For Generation-2 VM, we exit from pci_arch_init() by returning 0. + * The purpose is to suppress the harmless warning: + * "PCI: Fatal: No config space access function found" + */ + if (gen2vm) + return 0; + + /* For Generation-1 VM, we'll proceed in pci_arch_init(). */ + return 1; +} + +static int hv_suspend(void) +{ + union hv_x64_msr_hypercall_contents hypercall_msr; + int ret; + + /* + * Reset the hypercall page as it is going to be invalidated + * accross hibernation. Setting hv_hypercall_pg to NULL ensures + * that any subsequent hypercall operation fails safely instead of + * crashing due to an access of an invalid page. The hypercall page + * pointer is restored on resume. + */ + hv_hypercall_pg_saved = hv_hypercall_pg; + hv_hypercall_pg = NULL; + + /* Disable the hypercall page in the hypervisor */ + rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + hypercall_msr.enable = 0; + wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + + ret = hv_cpu_die(0); + return ret; +} + +static void hv_resume(void) +{ + union hv_x64_msr_hypercall_contents hypercall_msr; + int ret; + + ret = hv_cpu_init(0); + WARN_ON(ret); + + /* Re-enable the hypercall page */ + rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + hypercall_msr.enable = 1; + hypercall_msr.guest_physical_address = + vmalloc_to_pfn(hv_hypercall_pg_saved); + wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + + hv_hypercall_pg = hv_hypercall_pg_saved; + hv_hypercall_pg_saved = NULL; + + /* + * Reenlightenment notifications are disabled by hv_cpu_die(0), + * reenable them here if hv_reenlightenment_cb was previously set. + */ + if (hv_reenlightenment_cb) + set_hv_tscchange_cb(hv_reenlightenment_cb); +} + +/* Note: when the ops are called, only CPU0 is online and IRQs are disabled. */ +static struct syscore_ops hv_syscore_ops = { + .suspend = hv_suspend, + .resume = hv_resume, +}; + +static void (* __initdata old_setup_percpu_clockev)(void); + +static void __init hv_stimer_setup_percpu_clockev(void) +{ + /* + * Ignore any errors in setting up stimer clockevents + * as we can run with the LAPIC timer as a fallback. + */ + (void)hv_stimer_alloc(); + + /* + * Still register the LAPIC timer, because the direct-mode STIMER is + * not supported by old versions of Hyper-V. This also allows users + * to switch to LAPIC timer via /sys, if they want to. + */ + if (old_setup_percpu_clockev) + old_setup_percpu_clockev(); +} + +/* + * This function is to be invoked early in the boot sequence after the + * hypervisor has been detected. + * + * 1. Setup the hypercall page. + * 2. Register Hyper-V specific clocksource. + * 3. Setup Hyper-V specific APIC entry points. + */ +void __init hyperv_init(void) +{ + u64 guest_id, required_msrs; + union hv_x64_msr_hypercall_contents hypercall_msr; + int cpuhp, i; + + if (x86_hyper_type != X86_HYPER_MS_HYPERV) + return; + + /* Absolutely required MSRs */ + required_msrs = HV_MSR_HYPERCALL_AVAILABLE | + HV_MSR_VP_INDEX_AVAILABLE; + + if ((ms_hyperv.features & required_msrs) != required_msrs) + return; + + /* + * Allocate the per-CPU state for the hypercall input arg. + * If this allocation fails, we will not be able to setup + * (per-CPU) hypercall input page and thus this failure is + * fatal on Hyper-V. + */ + hyperv_pcpu_input_arg = alloc_percpu(void *); + + BUG_ON(hyperv_pcpu_input_arg == NULL); + + /* Allocate percpu VP index */ + hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index), + GFP_KERNEL); + if (!hv_vp_index) + return; + + for (i = 0; i < num_possible_cpus(); i++) + hv_vp_index[i] = VP_INVAL; + + hv_vp_assist_page = kcalloc(num_possible_cpus(), + sizeof(*hv_vp_assist_page), GFP_KERNEL); + if (!hv_vp_assist_page) { + ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; + goto free_vp_index; + } + + cpuhp = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online", + hv_cpu_init, hv_cpu_die); + if (cpuhp < 0) + goto free_vp_assist_page; + + /* + * Setup the hypercall page and enable hypercalls. + * 1. Register the guest ID + * 2. Enable the hypercall and register the hypercall page + */ + guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0); + wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); + + hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, + VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_ROX, + VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, + __builtin_return_address(0)); + if (hv_hypercall_pg == NULL) { + wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); + goto remove_cpuhp_state; + } + + rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + hypercall_msr.enable = 1; + hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); + wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + + /* + * hyperv_init() is called before LAPIC is initialized: see + * apic_intr_mode_init() -> x86_platform.apic_post_init() and + * apic_bsp_setup() -> setup_local_APIC(). The direct-mode STIMER + * depends on LAPIC, so hv_stimer_alloc() should be called from + * x86_init.timers.setup_percpu_clockev. + */ + old_setup_percpu_clockev = x86_init.timers.setup_percpu_clockev; + x86_init.timers.setup_percpu_clockev = hv_stimer_setup_percpu_clockev; + + hv_apic_init(); + + x86_init.pci.arch_init = hv_pci_init; + + register_syscore_ops(&hv_syscore_ops); + + hyperv_init_cpuhp = cpuhp; + return; + +remove_cpuhp_state: + cpuhp_remove_state(cpuhp); +free_vp_assist_page: + kfree(hv_vp_assist_page); + hv_vp_assist_page = NULL; +free_vp_index: + kfree(hv_vp_index); + hv_vp_index = NULL; +} + +/* + * This routine is called before kexec/kdump, it does the required cleanup. + */ +void hyperv_cleanup(void) +{ + union hv_x64_msr_hypercall_contents hypercall_msr; + + /* Reset our OS id */ + wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); + + /* + * Reset hypercall page reference before reset the page, + * let hypercall operations fail safely rather than + * panic the kernel for using invalid hypercall page + */ + hv_hypercall_pg = NULL; + + /* Reset the hypercall page */ + hypercall_msr.as_uint64 = 0; + wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + + /* Reset the TSC page */ + hypercall_msr.as_uint64 = 0; + wrmsrl(HV_X64_MSR_REFERENCE_TSC, hypercall_msr.as_uint64); +} +EXPORT_SYMBOL_GPL(hyperv_cleanup); + +void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die) +{ + static bool panic_reported; + u64 guest_id; + + if (in_die && !panic_on_oops) + return; + + /* + * We prefer to report panic on 'die' chain as we have proper + * registers to report, but if we miss it (e.g. on BUG()) we need + * to report it on 'panic'. + */ + if (panic_reported) + return; + panic_reported = true; + + rdmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); + + wrmsrl(HV_X64_MSR_CRASH_P0, err); + wrmsrl(HV_X64_MSR_CRASH_P1, guest_id); + wrmsrl(HV_X64_MSR_CRASH_P2, regs->ip); + wrmsrl(HV_X64_MSR_CRASH_P3, regs->ax); + wrmsrl(HV_X64_MSR_CRASH_P4, regs->sp); + + /* + * Let Hyper-V know there is crash data available + */ + wrmsrl(HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_CRASH_NOTIFY); +} +EXPORT_SYMBOL_GPL(hyperv_report_panic); + +/** + * hyperv_report_panic_msg - report panic message to Hyper-V + * @pa: physical address of the panic page containing the message + * @size: size of the message in the page + */ +void hyperv_report_panic_msg(phys_addr_t pa, size_t size) +{ + /* + * P3 to contain the physical address of the panic page & P4 to + * contain the size of the panic data in that page. Rest of the + * registers are no-op when the NOTIFY_MSG flag is set. + */ + wrmsrl(HV_X64_MSR_CRASH_P0, 0); + wrmsrl(HV_X64_MSR_CRASH_P1, 0); + wrmsrl(HV_X64_MSR_CRASH_P2, 0); + wrmsrl(HV_X64_MSR_CRASH_P3, pa); + wrmsrl(HV_X64_MSR_CRASH_P4, size); + + /* + * Let Hyper-V know there is crash data available along with + * the panic message. + */ + wrmsrl(HV_X64_MSR_CRASH_CTL, + (HV_CRASH_CTL_CRASH_NOTIFY | HV_CRASH_CTL_CRASH_NOTIFY_MSG)); +} +EXPORT_SYMBOL_GPL(hyperv_report_panic_msg); + +bool hv_is_hyperv_initialized(void) +{ + union hv_x64_msr_hypercall_contents hypercall_msr; + + /* + * Ensure that we're really on Hyper-V, and not a KVM or Xen + * emulation of Hyper-V + */ + if (x86_hyper_type != X86_HYPER_MS_HYPERV) + return false; + + /* + * Verify that earlier initialization succeeded by checking + * that the hypercall page is setup + */ + hypercall_msr.as_uint64 = 0; + rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + + return hypercall_msr.enable; +} +EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized); + +bool hv_is_hibernation_supported(void) +{ + return acpi_sleep_state_supported(ACPI_STATE_S4); +} +EXPORT_SYMBOL_GPL(hv_is_hibernation_supported); diff --git a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c new file mode 100644 index 000000000..f3270c1fc --- /dev/null +++ b/arch/x86/hyperv/hv_spinlock.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Hyper-V specific spinlock code. + * + * Copyright (C) 2018, Intel, Inc. + * + * Author : Yi Sun <yi.y.sun@intel.com> + */ + +#define pr_fmt(fmt) "Hyper-V: " fmt + +#include <linux/spinlock.h> + +#include <asm/mshyperv.h> +#include <asm/paravirt.h> +#include <asm/apic.h> + +static bool __initdata hv_pvspin = true; + +static void hv_qlock_kick(int cpu) +{ + apic->send_IPI(cpu, X86_PLATFORM_IPI_VECTOR); +} + +static void hv_qlock_wait(u8 *byte, u8 val) +{ + unsigned long msr_val; + unsigned long flags; + + if (in_nmi()) + return; + + /* + * Reading HV_X64_MSR_GUEST_IDLE MSR tells the hypervisor that the + * vCPU can be put into 'idle' state. This 'idle' state is + * terminated by an IPI, usually from hv_qlock_kick(), even if + * interrupts are disabled on the vCPU. + * + * To prevent a race against the unlock path it is required to + * disable interrupts before accessing the HV_X64_MSR_GUEST_IDLE + * MSR. Otherwise, if the IPI from hv_qlock_kick() arrives between + * the lock value check and the rdmsrl() then the vCPU might be put + * into 'idle' state by the hypervisor and kept in that state for + * an unspecified amount of time. + */ + local_irq_save(flags); + /* + * Only issue the rdmsrl() when the lock state has not changed. + */ + if (READ_ONCE(*byte) == val) + rdmsrl(HV_X64_MSR_GUEST_IDLE, msr_val); + local_irq_restore(flags); +} + +/* + * Hyper-V does not support this so far. + */ +__visible bool hv_vcpu_is_preempted(int vcpu) +{ + return false; +} +PV_CALLEE_SAVE_REGS_THUNK(hv_vcpu_is_preempted); + +void __init hv_init_spinlocks(void) +{ + if (!hv_pvspin || !apic || + !(ms_hyperv.hints & HV_X64_CLUSTER_IPI_RECOMMENDED) || + !(ms_hyperv.features & HV_MSR_GUEST_IDLE_AVAILABLE)) { + pr_info("PV spinlocks disabled\n"); + return; + } + pr_info("PV spinlocks enabled\n"); + + __pv_init_lock_hash(); + pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; + pv_ops.lock.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); + pv_ops.lock.wait = hv_qlock_wait; + pv_ops.lock.kick = hv_qlock_kick; + pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(hv_vcpu_is_preempted); +} + +static __init int hv_parse_nopvspin(char *arg) +{ + hv_pvspin = false; + return 0; +} +early_param("hv_nopvspin", hv_parse_nopvspin); diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c new file mode 100644 index 000000000..2c87350c1 --- /dev/null +++ b/arch/x86/hyperv/mmu.c @@ -0,0 +1,244 @@ +#define pr_fmt(fmt) "Hyper-V: " fmt + +#include <linux/hyperv.h> +#include <linux/log2.h> +#include <linux/slab.h> +#include <linux/types.h> + +#include <asm/fpu/api.h> +#include <asm/mshyperv.h> +#include <asm/msr.h> +#include <asm/tlbflush.h> +#include <asm/tlb.h> + +#define CREATE_TRACE_POINTS +#include <asm/trace/hyperv.h> + +/* Each gva in gva_list encodes up to 4096 pages to flush */ +#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE) + +static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus, + const struct flush_tlb_info *info); + +/* + * Fills in gva_list starting from offset. Returns the number of items added. + */ +static inline int fill_gva_list(u64 gva_list[], int offset, + unsigned long start, unsigned long end) +{ + int gva_n = offset; + unsigned long cur = start, diff; + + do { + diff = end > cur ? end - cur : 0; + + gva_list[gva_n] = cur & PAGE_MASK; + /* + * Lower 12 bits encode the number of additional + * pages to flush (in addition to the 'cur' page). + */ + if (diff >= HV_TLB_FLUSH_UNIT) { + gva_list[gva_n] |= ~PAGE_MASK; + cur += HV_TLB_FLUSH_UNIT; + } else if (diff) { + gva_list[gva_n] |= (diff - 1) >> PAGE_SHIFT; + cur = end; + } + + gva_n++; + + } while (cur < end); + + return gva_n - offset; +} + +static void hyperv_flush_tlb_others(const struct cpumask *cpus, + const struct flush_tlb_info *info) +{ + int cpu, vcpu, gva_n, max_gvas; + struct hv_tlb_flush **flush_pcpu; + struct hv_tlb_flush *flush; + u64 status = U64_MAX; + unsigned long flags; + + trace_hyperv_mmu_flush_tlb_others(cpus, info); + + if (!hv_hypercall_pg) + goto do_native; + + local_irq_save(flags); + + /* + * Only check the mask _after_ interrupt has been disabled to avoid the + * mask changing under our feet. + */ + if (cpumask_empty(cpus)) { + local_irq_restore(flags); + return; + } + + flush_pcpu = (struct hv_tlb_flush **) + this_cpu_ptr(hyperv_pcpu_input_arg); + + flush = *flush_pcpu; + + if (unlikely(!flush)) { + local_irq_restore(flags); + goto do_native; + } + + if (info->mm) { + /* + * AddressSpace argument must match the CR3 with PCID bits + * stripped out. + */ + flush->address_space = virt_to_phys(info->mm->pgd); + flush->address_space &= CR3_ADDR_MASK; + flush->flags = 0; + } else { + flush->address_space = 0; + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + } + + flush->processor_mask = 0; + if (cpumask_equal(cpus, cpu_present_mask)) { + flush->flags |= HV_FLUSH_ALL_PROCESSORS; + } else { + /* + * From the supplied CPU set we need to figure out if we can get + * away with cheaper HVCALL_FLUSH_VIRTUAL_ADDRESS_{LIST,SPACE} + * hypercalls. This is possible when the highest VP number in + * the set is < 64. As VP numbers are usually in ascending order + * and match Linux CPU ids, here is an optimization: we check + * the VP number for the highest bit in the supplied set first + * so we can quickly find out if using *_EX hypercalls is a + * must. We will also check all VP numbers when walking the + * supplied CPU set to remain correct in all cases. + */ + if (hv_cpu_number_to_vp_number(cpumask_last(cpus)) >= 64) + goto do_ex_hypercall; + + for_each_cpu(cpu, cpus) { + vcpu = hv_cpu_number_to_vp_number(cpu); + if (vcpu == VP_INVAL) { + local_irq_restore(flags); + goto do_native; + } + + if (vcpu >= 64) + goto do_ex_hypercall; + + __set_bit(vcpu, (unsigned long *) + &flush->processor_mask); + } + } + + /* + * We can flush not more than max_gvas with one hypercall. Flush the + * whole address space if we were asked to do more. + */ + max_gvas = (PAGE_SIZE - sizeof(*flush)) / sizeof(flush->gva_list[0]); + + if (info->end == TLB_FLUSH_ALL) { + flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY; + status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, + flush, NULL); + } else if (info->end && + ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) { + status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, + flush, NULL); + } else { + gva_n = fill_gva_list(flush->gva_list, 0, + info->start, info->end); + status = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST, + gva_n, 0, flush, NULL); + } + goto check_status; + +do_ex_hypercall: + status = hyperv_flush_tlb_others_ex(cpus, info); + +check_status: + local_irq_restore(flags); + + if (!(status & HV_HYPERCALL_RESULT_MASK)) + return; +do_native: + native_flush_tlb_others(cpus, info); +} + +static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus, + const struct flush_tlb_info *info) +{ + int nr_bank = 0, max_gvas, gva_n; + struct hv_tlb_flush_ex **flush_pcpu; + struct hv_tlb_flush_ex *flush; + u64 status; + + if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) + return U64_MAX; + + flush_pcpu = (struct hv_tlb_flush_ex **) + this_cpu_ptr(hyperv_pcpu_input_arg); + + flush = *flush_pcpu; + + if (info->mm) { + /* + * AddressSpace argument must match the CR3 with PCID bits + * stripped out. + */ + flush->address_space = virt_to_phys(info->mm->pgd); + flush->address_space &= CR3_ADDR_MASK; + flush->flags = 0; + } else { + flush->address_space = 0; + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + } + + flush->hv_vp_set.valid_bank_mask = 0; + + flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + nr_bank = cpumask_to_vpset(&(flush->hv_vp_set), cpus); + if (nr_bank < 0) + return U64_MAX; + + /* + * We can flush not more than max_gvas with one hypercall. Flush the + * whole address space if we were asked to do more. + */ + max_gvas = + (PAGE_SIZE - sizeof(*flush) - nr_bank * + sizeof(flush->hv_vp_set.bank_contents[0])) / + sizeof(flush->gva_list[0]); + + if (info->end == TLB_FLUSH_ALL) { + flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY; + status = hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, + 0, nr_bank, flush, NULL); + } else if (info->end && + ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) { + status = hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, + 0, nr_bank, flush, NULL); + } else { + gva_n = fill_gva_list(flush->gva_list, nr_bank, + info->start, info->end); + status = hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX, + gva_n, nr_bank, flush, NULL); + } + + return status; +} + +void hyperv_setup_mmu_ops(void) +{ + if (!(ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED)) + return; + + pr_info("Using hypercall for remote TLB flush\n"); + pv_ops.mmu.flush_tlb_others = hyperv_flush_tlb_others; + pv_ops.mmu.tlb_remove_table = tlb_remove_table; +} diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c new file mode 100644 index 000000000..dd0a843f7 --- /dev/null +++ b/arch/x86/hyperv/nested.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Hyper-V nested virtualization code. + * + * Copyright (C) 2018, Microsoft, Inc. + * + * Author : Lan Tianyu <Tianyu.Lan@microsoft.com> + */ +#define pr_fmt(fmt) "Hyper-V: " fmt + + +#include <linux/types.h> +#include <asm/hyperv-tlfs.h> +#include <asm/mshyperv.h> +#include <asm/tlbflush.h> + +#include <asm/trace/hyperv.h> + +int hyperv_flush_guest_mapping(u64 as) +{ + struct hv_guest_mapping_flush **flush_pcpu; + struct hv_guest_mapping_flush *flush; + u64 status; + unsigned long flags; + int ret = -ENOTSUPP; + + if (!hv_hypercall_pg) + goto fault; + + local_irq_save(flags); + + flush_pcpu = (struct hv_guest_mapping_flush **) + this_cpu_ptr(hyperv_pcpu_input_arg); + + flush = *flush_pcpu; + + if (unlikely(!flush)) { + local_irq_restore(flags); + goto fault; + } + + flush->address_space = as; + flush->flags = 0; + + status = hv_do_hypercall(HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE, + flush, NULL); + local_irq_restore(flags); + + if (!(status & HV_HYPERCALL_RESULT_MASK)) + ret = 0; + +fault: + trace_hyperv_nested_flush_guest_mapping(as, ret); + return ret; +} +EXPORT_SYMBOL_GPL(hyperv_flush_guest_mapping); + +int hyperv_fill_flush_guest_mapping_list( + struct hv_guest_mapping_flush_list *flush, + u64 start_gfn, u64 pages) +{ + u64 cur = start_gfn; + u64 additional_pages; + int gpa_n = 0; + + do { + /* + * If flush requests exceed max flush count, go back to + * flush tlbs without range. + */ + if (gpa_n >= HV_MAX_FLUSH_REP_COUNT) + return -ENOSPC; + + additional_pages = min_t(u64, pages, HV_MAX_FLUSH_PAGES) - 1; + + flush->gpa_list[gpa_n].page.additional_pages = additional_pages; + flush->gpa_list[gpa_n].page.largepage = false; + flush->gpa_list[gpa_n].page.basepfn = cur; + + pages -= additional_pages + 1; + cur += additional_pages + 1; + gpa_n++; + } while (pages > 0); + + return gpa_n; +} +EXPORT_SYMBOL_GPL(hyperv_fill_flush_guest_mapping_list); + +int hyperv_flush_guest_mapping_range(u64 as, + hyperv_fill_flush_list_func fill_flush_list_func, void *data) +{ + struct hv_guest_mapping_flush_list **flush_pcpu; + struct hv_guest_mapping_flush_list *flush; + u64 status = 0; + unsigned long flags; + int ret = -ENOTSUPP; + int gpa_n = 0; + + if (!hv_hypercall_pg || !fill_flush_list_func) + goto fault; + + local_irq_save(flags); + + flush_pcpu = (struct hv_guest_mapping_flush_list **) + this_cpu_ptr(hyperv_pcpu_input_arg); + + flush = *flush_pcpu; + if (unlikely(!flush)) { + local_irq_restore(flags); + goto fault; + } + + flush->address_space = as; + flush->flags = 0; + + gpa_n = fill_flush_list_func(flush, data); + if (gpa_n < 0) { + local_irq_restore(flags); + goto fault; + } + + status = hv_do_rep_hypercall(HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST, + gpa_n, 0, flush, NULL); + + local_irq_restore(flags); + + if (!(status & HV_HYPERCALL_RESULT_MASK)) + ret = 0; + else + ret = status; +fault: + trace_hyperv_nested_flush_guest_mapping_range(as, ret); + return ret; +} +EXPORT_SYMBOL_GPL(hyperv_flush_guest_mapping_range); |