diff options
Diffstat (limited to 'arch/x86/hyperv')
-rw-r--r-- | arch/x86/hyperv/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/hyperv/hv_apic.c | 266 | ||||
-rw-r--r-- | arch/x86/hyperv/hv_init.c | 516 | ||||
-rw-r--r-- | arch/x86/hyperv/mmu.c | 244 | ||||
-rw-r--r-- | arch/x86/hyperv/nested.c | 56 |
5 files changed, 1084 insertions, 0 deletions
diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile new file mode 100644 index 000000000..b21ee65c4 --- /dev/null +++ b/arch/x86/hyperv/Makefile @@ -0,0 +1,2 @@ +obj-y := hv_init.o mmu.o nested.o +obj-$(CONFIG_X86_64) += hv_apic.o diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c new file mode 100644 index 000000000..2c43e3055 --- /dev/null +++ b/arch/x86/hyperv/hv_apic.c @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Hyper-V specific APIC code. + * + * Copyright (C) 2018, Microsoft, Inc. + * + * Author : K. Y. Srinivasan <kys@microsoft.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + */ + +#include <linux/types.h> +#include <linux/version.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> +#include <linux/clockchips.h> +#include <linux/hyperv.h> +#include <linux/slab.h> +#include <linux/cpuhotplug.h> +#include <asm/hypervisor.h> +#include <asm/mshyperv.h> +#include <asm/apic.h> + +#include <asm/trace/hyperv.h> + +static struct apic orig_apic; + +static u64 hv_apic_icr_read(void) +{ + u64 reg_val; + + rdmsrl(HV_X64_MSR_ICR, reg_val); + return reg_val; +} + +static void hv_apic_icr_write(u32 low, u32 id) +{ + u64 reg_val; + + reg_val = SET_APIC_DEST_FIELD(id); + reg_val = reg_val << 32; + reg_val |= low; + + wrmsrl(HV_X64_MSR_ICR, reg_val); +} + +static u32 hv_apic_read(u32 reg) +{ + u32 reg_val, hi; + + switch (reg) { + case APIC_EOI: + rdmsr(HV_X64_MSR_EOI, reg_val, hi); + return reg_val; + case APIC_TASKPRI: + rdmsr(HV_X64_MSR_TPR, reg_val, hi); + return reg_val; + + default: + return native_apic_mem_read(reg); + } +} + +static void hv_apic_write(u32 reg, u32 val) +{ + switch (reg) { + case APIC_EOI: + wrmsr(HV_X64_MSR_EOI, val, 0); + break; + case APIC_TASKPRI: + wrmsr(HV_X64_MSR_TPR, val, 0); + break; + default: + native_apic_mem_write(reg, val); + } +} + +static void hv_apic_eoi_write(u32 reg, u32 val) +{ + wrmsr(HV_X64_MSR_EOI, val, 0); +} + +/* + * IPI implementation on Hyper-V. + */ +static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector) +{ + struct hv_send_ipi_ex **arg; + struct hv_send_ipi_ex *ipi_arg; + unsigned long flags; + int nr_bank = 0; + int ret = 1; + + if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) + return false; + + local_irq_save(flags); + arg = (struct hv_send_ipi_ex **)this_cpu_ptr(hyperv_pcpu_input_arg); + + ipi_arg = *arg; + if (unlikely(!ipi_arg)) + goto ipi_mask_ex_done; + + ipi_arg->vector = vector; + ipi_arg->reserved = 0; + ipi_arg->vp_set.valid_bank_mask = 0; + + if (!cpumask_equal(mask, cpu_present_mask)) { + ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K; + nr_bank = cpumask_to_vpset(&(ipi_arg->vp_set), mask); + } + if (nr_bank < 0) + goto ipi_mask_ex_done; + if (!nr_bank) + ipi_arg->vp_set.format = HV_GENERIC_SET_ALL; + + ret = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank, + ipi_arg, NULL); + +ipi_mask_ex_done: + local_irq_restore(flags); + return ((ret == 0) ? true : false); +} + +static bool __send_ipi_mask(const struct cpumask *mask, int vector) +{ + int cur_cpu, vcpu; + struct hv_send_ipi ipi_arg; + int ret = 1; + + trace_hyperv_send_ipi_mask(mask, vector); + + if (cpumask_empty(mask)) + return true; + + if (!hv_hypercall_pg) + return false; + + if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) + return false; + + /* + * From the supplied CPU set we need to figure out if we can get away + * with cheaper HVCALL_SEND_IPI hypercall. This is possible when the + * highest VP number in the set is < 64. As VP numbers are usually in + * ascending order and match Linux CPU ids, here is an optimization: + * we check the VP number for the highest bit in the supplied set first + * so we can quickly find out if using HVCALL_SEND_IPI_EX hypercall is + * a must. We will also check all VP numbers when walking the supplied + * CPU set to remain correct in all cases. + */ + if (hv_cpu_number_to_vp_number(cpumask_last(mask)) >= 64) + goto do_ex_hypercall; + + ipi_arg.vector = vector; + ipi_arg.cpu_mask = 0; + + for_each_cpu(cur_cpu, mask) { + vcpu = hv_cpu_number_to_vp_number(cur_cpu); + if (vcpu == VP_INVAL) + return false; + + /* + * This particular version of the IPI hypercall can + * only target upto 64 CPUs. + */ + if (vcpu >= 64) + goto do_ex_hypercall; + + __set_bit(vcpu, (unsigned long *)&ipi_arg.cpu_mask); + } + + ret = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector, + ipi_arg.cpu_mask); + return ((ret == 0) ? true : false); + +do_ex_hypercall: + return __send_ipi_mask_ex(mask, vector); +} + +static bool __send_ipi_one(int cpu, int vector) +{ + struct cpumask mask = CPU_MASK_NONE; + + cpumask_set_cpu(cpu, &mask); + return __send_ipi_mask(&mask, vector); +} + +static void hv_send_ipi(int cpu, int vector) +{ + if (!__send_ipi_one(cpu, vector)) + orig_apic.send_IPI(cpu, vector); +} + +static void hv_send_ipi_mask(const struct cpumask *mask, int vector) +{ + if (!__send_ipi_mask(mask, vector)) + orig_apic.send_IPI_mask(mask, vector); +} + +static void hv_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) +{ + unsigned int this_cpu = smp_processor_id(); + struct cpumask new_mask; + const struct cpumask *local_mask; + + cpumask_copy(&new_mask, mask); + cpumask_clear_cpu(this_cpu, &new_mask); + local_mask = &new_mask; + if (!__send_ipi_mask(local_mask, vector)) + orig_apic.send_IPI_mask_allbutself(mask, vector); +} + +static void hv_send_ipi_allbutself(int vector) +{ + hv_send_ipi_mask_allbutself(cpu_online_mask, vector); +} + +static void hv_send_ipi_all(int vector) +{ + if (!__send_ipi_mask(cpu_online_mask, vector)) + orig_apic.send_IPI_all(vector); +} + +static void hv_send_ipi_self(int vector) +{ + if (!__send_ipi_one(smp_processor_id(), vector)) + orig_apic.send_IPI_self(vector); +} + +void __init hv_apic_init(void) +{ + if (ms_hyperv.hints & HV_X64_CLUSTER_IPI_RECOMMENDED) { + pr_info("Hyper-V: Using IPI hypercalls\n"); + /* + * Set the IPI entry points. + */ + orig_apic = *apic; + + apic->send_IPI = hv_send_ipi; + apic->send_IPI_mask = hv_send_ipi_mask; + apic->send_IPI_mask_allbutself = hv_send_ipi_mask_allbutself; + apic->send_IPI_allbutself = hv_send_ipi_allbutself; + apic->send_IPI_all = hv_send_ipi_all; + apic->send_IPI_self = hv_send_ipi_self; + } + + if (ms_hyperv.hints & HV_X64_APIC_ACCESS_RECOMMENDED) { + pr_info("Hyper-V: Using MSR based APIC access\n"); + apic_set_eoi_write(hv_apic_eoi_write); + apic->read = hv_apic_read; + apic->write = hv_apic_write; + apic->icr_write = hv_apic_icr_write; + apic->icr_read = hv_apic_icr_read; + } +} diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c new file mode 100644 index 000000000..38c31e164 --- /dev/null +++ b/arch/x86/hyperv/hv_init.c @@ -0,0 +1,516 @@ +/* + * X86 specific Hyper-V initialization code. + * + * Copyright (C) 2016, Microsoft, Inc. + * + * Author : K. Y. Srinivasan <kys@microsoft.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + */ + +#include <linux/efi.h> +#include <linux/types.h> +#include <asm/apic.h> +#include <asm/desc.h> +#include <asm/hypervisor.h> +#include <asm/hyperv-tlfs.h> +#include <asm/mshyperv.h> +#include <linux/version.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> +#include <linux/clockchips.h> +#include <linux/hyperv.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/cpuhotplug.h> + +#ifdef CONFIG_HYPERV_TSCPAGE + +static struct ms_hyperv_tsc_page *tsc_pg; + +struct ms_hyperv_tsc_page *hv_get_tsc_page(void) +{ + return tsc_pg; +} +EXPORT_SYMBOL_GPL(hv_get_tsc_page); + +static u64 read_hv_clock_tsc(struct clocksource *arg) +{ + u64 current_tick = hv_read_tsc_page(tsc_pg); + + if (current_tick == U64_MAX) + rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); + + return current_tick; +} + +static struct clocksource hyperv_cs_tsc = { + .name = "hyperv_clocksource_tsc_page", + .rating = 400, + .read = read_hv_clock_tsc, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; +#endif + +static u64 read_hv_clock_msr(struct clocksource *arg) +{ + u64 current_tick; + /* + * Read the partition counter to get the current tick count. This count + * is set to 0 when the partition is created and is incremented in + * 100 nanosecond units. + */ + rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); + return current_tick; +} + +static struct clocksource hyperv_cs_msr = { + .name = "hyperv_clocksource_msr", + .rating = 400, + .read = read_hv_clock_msr, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +void *hv_hypercall_pg; +EXPORT_SYMBOL_GPL(hv_hypercall_pg); +struct clocksource *hyperv_cs; +EXPORT_SYMBOL_GPL(hyperv_cs); + +u32 *hv_vp_index; +EXPORT_SYMBOL_GPL(hv_vp_index); + +struct hv_vp_assist_page **hv_vp_assist_page; +EXPORT_SYMBOL_GPL(hv_vp_assist_page); + +void __percpu **hyperv_pcpu_input_arg; +EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg); + +u32 hv_max_vp_index; + +static int hv_cpu_init(unsigned int cpu) +{ + u64 msr_vp_index; + struct hv_vp_assist_page **hvp = &hv_vp_assist_page[smp_processor_id()]; + void **input_arg; + struct page *pg; + + input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); + pg = alloc_page(GFP_KERNEL); + if (unlikely(!pg)) + return -ENOMEM; + *input_arg = page_address(pg); + + hv_get_vp_index(msr_vp_index); + + hv_vp_index[smp_processor_id()] = msr_vp_index; + + if (msr_vp_index > hv_max_vp_index) + hv_max_vp_index = msr_vp_index; + + if (!hv_vp_assist_page) + return 0; + + if (!*hvp) + *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); + + if (*hvp) { + u64 val; + + val = vmalloc_to_pfn(*hvp); + val = (val << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) | + HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; + + wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, val); + } + + return 0; +} + +static void (*hv_reenlightenment_cb)(void); + +static void hv_reenlightenment_notify(struct work_struct *dummy) +{ + struct hv_tsc_emulation_status emu_status; + + rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + + /* Don't issue the callback if TSC accesses are not emulated */ + if (hv_reenlightenment_cb && emu_status.inprogress) + hv_reenlightenment_cb(); +} +static DECLARE_DELAYED_WORK(hv_reenlightenment_work, hv_reenlightenment_notify); + +void hyperv_stop_tsc_emulation(void) +{ + u64 freq; + struct hv_tsc_emulation_status emu_status; + + rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + emu_status.inprogress = 0; + wrmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + + rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); + tsc_khz = div64_u64(freq, 1000); +} +EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation); + +static inline bool hv_reenlightenment_available(void) +{ + /* + * Check for required features and priviliges to make TSC frequency + * change notifications work. + */ + return ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && + ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE && + ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT; +} + +__visible void __irq_entry hyperv_reenlightenment_intr(struct pt_regs *regs) +{ + entering_ack_irq(); + + inc_irq_stat(irq_hv_reenlightenment_count); + + schedule_delayed_work(&hv_reenlightenment_work, HZ/10); + + exiting_irq(); +} + +void set_hv_tscchange_cb(void (*cb)(void)) +{ + struct hv_reenlightenment_control re_ctrl = { + .vector = HYPERV_REENLIGHTENMENT_VECTOR, + .enabled = 1, + }; + struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1}; + + if (!hv_reenlightenment_available()) { + pr_warn("Hyper-V: reenlightenment support is unavailable\n"); + return; + } + + if (!hv_vp_index) + return; + + hv_reenlightenment_cb = cb; + + /* Make sure callback is registered before we write to MSRs */ + wmb(); + + re_ctrl.target_vp = hv_vp_index[get_cpu()]; + + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl)); + + put_cpu(); +} +EXPORT_SYMBOL_GPL(set_hv_tscchange_cb); + +void clear_hv_tscchange_cb(void) +{ + struct hv_reenlightenment_control re_ctrl; + + if (!hv_reenlightenment_available()) + return; + + rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); + re_ctrl.enabled = 0; + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); + + hv_reenlightenment_cb = NULL; +} +EXPORT_SYMBOL_GPL(clear_hv_tscchange_cb); + +static int hv_cpu_die(unsigned int cpu) +{ + struct hv_reenlightenment_control re_ctrl; + unsigned int new_cpu; + unsigned long flags; + void **input_arg; + void *input_pg = NULL; + + local_irq_save(flags); + input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); + input_pg = *input_arg; + *input_arg = NULL; + local_irq_restore(flags); + free_page((unsigned long)input_pg); + + if (hv_vp_assist_page && hv_vp_assist_page[cpu]) + wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0); + + if (hv_reenlightenment_cb == NULL) + return 0; + + rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + if (re_ctrl.target_vp == hv_vp_index[cpu]) { + /* Reassign to some other online CPU */ + new_cpu = cpumask_any_but(cpu_online_mask, cpu); + + re_ctrl.target_vp = hv_vp_index[new_cpu]; + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + } + + return 0; +} + +static int __init hv_pci_init(void) +{ + int gen2vm = efi_enabled(EFI_BOOT); + + /* + * For Generation-2 VM, we exit from pci_arch_init() by returning 0. + * The purpose is to suppress the harmless warning: + * "PCI: Fatal: No config space access function found" + */ + if (gen2vm) + return 0; + + /* For Generation-1 VM, we'll proceed in pci_arch_init(). */ + return 1; +} + +/* + * This function is to be invoked early in the boot sequence after the + * hypervisor has been detected. + * + * 1. Setup the hypercall page. + * 2. Register Hyper-V specific clocksource. + * 3. Setup Hyper-V specific APIC entry points. + */ +void __init hyperv_init(void) +{ + u64 guest_id, required_msrs; + union hv_x64_msr_hypercall_contents hypercall_msr; + int cpuhp, i; + + if (x86_hyper_type != X86_HYPER_MS_HYPERV) + return; + + /* Absolutely required MSRs */ + required_msrs = HV_X64_MSR_HYPERCALL_AVAILABLE | + HV_X64_MSR_VP_INDEX_AVAILABLE; + + if ((ms_hyperv.features & required_msrs) != required_msrs) + return; + + /* + * Allocate the per-CPU state for the hypercall input arg. + * If this allocation fails, we will not be able to setup + * (per-CPU) hypercall input page and thus this failure is + * fatal on Hyper-V. + */ + hyperv_pcpu_input_arg = alloc_percpu(void *); + + BUG_ON(hyperv_pcpu_input_arg == NULL); + + /* Allocate percpu VP index */ + hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index), + GFP_KERNEL); + if (!hv_vp_index) + return; + + for (i = 0; i < num_possible_cpus(); i++) + hv_vp_index[i] = VP_INVAL; + + hv_vp_assist_page = kcalloc(num_possible_cpus(), + sizeof(*hv_vp_assist_page), GFP_KERNEL); + if (!hv_vp_assist_page) { + ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; + goto free_vp_index; + } + + cpuhp = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online", + hv_cpu_init, hv_cpu_die); + if (cpuhp < 0) + goto free_vp_assist_page; + + /* + * Setup the hypercall page and enable hypercalls. + * 1. Register the guest ID + * 2. Enable the hypercall and register the hypercall page + */ + guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0); + wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); + + hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); + if (hv_hypercall_pg == NULL) { + wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); + goto remove_cpuhp_state; + } + + rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + hypercall_msr.enable = 1; + hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); + wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + + hv_apic_init(); + + x86_init.pci.arch_init = hv_pci_init; + + /* + * Register Hyper-V specific clocksource. + */ +#ifdef CONFIG_HYPERV_TSCPAGE + if (ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE) { + union hv_x64_msr_hypercall_contents tsc_msr; + + tsc_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); + if (!tsc_pg) + goto register_msr_cs; + + hyperv_cs = &hyperv_cs_tsc; + + rdmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); + + tsc_msr.enable = 1; + tsc_msr.guest_physical_address = vmalloc_to_pfn(tsc_pg); + + wrmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); + + hyperv_cs_tsc.archdata.vclock_mode = VCLOCK_HVCLOCK; + + clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100); + return; + } +register_msr_cs: +#endif + /* + * For 32 bit guests just use the MSR based mechanism for reading + * the partition counter. + */ + + hyperv_cs = &hyperv_cs_msr; + if (ms_hyperv.features & HV_MSR_TIME_REF_COUNT_AVAILABLE) + clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); + + return; + +remove_cpuhp_state: + cpuhp_remove_state(cpuhp); +free_vp_assist_page: + kfree(hv_vp_assist_page); + hv_vp_assist_page = NULL; +free_vp_index: + kfree(hv_vp_index); + hv_vp_index = NULL; +} + +/* + * This routine is called before kexec/kdump, it does the required cleanup. + */ +void hyperv_cleanup(void) +{ + union hv_x64_msr_hypercall_contents hypercall_msr; + + /* Reset our OS id */ + wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); + + /* + * Reset hypercall page reference before reset the page, + * let hypercall operations fail safely rather than + * panic the kernel for using invalid hypercall page + */ + hv_hypercall_pg = NULL; + + /* Reset the hypercall page */ + hypercall_msr.as_uint64 = 0; + wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + + /* Reset the TSC page */ + hypercall_msr.as_uint64 = 0; + wrmsrl(HV_X64_MSR_REFERENCE_TSC, hypercall_msr.as_uint64); +} +EXPORT_SYMBOL_GPL(hyperv_cleanup); + +void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die) +{ + static bool panic_reported; + u64 guest_id; + + if (in_die && !panic_on_oops) + return; + + /* + * We prefer to report panic on 'die' chain as we have proper + * registers to report, but if we miss it (e.g. on BUG()) we need + * to report it on 'panic'. + */ + if (panic_reported) + return; + panic_reported = true; + + rdmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); + + wrmsrl(HV_X64_MSR_CRASH_P0, err); + wrmsrl(HV_X64_MSR_CRASH_P1, guest_id); + wrmsrl(HV_X64_MSR_CRASH_P2, regs->ip); + wrmsrl(HV_X64_MSR_CRASH_P3, regs->ax); + wrmsrl(HV_X64_MSR_CRASH_P4, regs->sp); + + /* + * Let Hyper-V know there is crash data available + */ + wrmsrl(HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_CRASH_NOTIFY); +} +EXPORT_SYMBOL_GPL(hyperv_report_panic); + +/** + * hyperv_report_panic_msg - report panic message to Hyper-V + * @pa: physical address of the panic page containing the message + * @size: size of the message in the page + */ +void hyperv_report_panic_msg(phys_addr_t pa, size_t size) +{ + /* + * P3 to contain the physical address of the panic page & P4 to + * contain the size of the panic data in that page. Rest of the + * registers are no-op when the NOTIFY_MSG flag is set. + */ + wrmsrl(HV_X64_MSR_CRASH_P0, 0); + wrmsrl(HV_X64_MSR_CRASH_P1, 0); + wrmsrl(HV_X64_MSR_CRASH_P2, 0); + wrmsrl(HV_X64_MSR_CRASH_P3, pa); + wrmsrl(HV_X64_MSR_CRASH_P4, size); + + /* + * Let Hyper-V know there is crash data available along with + * the panic message. + */ + wrmsrl(HV_X64_MSR_CRASH_CTL, + (HV_CRASH_CTL_CRASH_NOTIFY | HV_CRASH_CTL_CRASH_NOTIFY_MSG)); +} +EXPORT_SYMBOL_GPL(hyperv_report_panic_msg); + +bool hv_is_hyperv_initialized(void) +{ + union hv_x64_msr_hypercall_contents hypercall_msr; + + /* + * Ensure that we're really on Hyper-V, and not a KVM or Xen + * emulation of Hyper-V + */ + if (x86_hyper_type != X86_HYPER_MS_HYPERV) + return false; + + /* + * Verify that earlier initialization succeeded by checking + * that the hypercall page is setup + */ + hypercall_msr.as_uint64 = 0; + rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + + return hypercall_msr.enable; +} +EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized); diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c new file mode 100644 index 000000000..e666f7eaf --- /dev/null +++ b/arch/x86/hyperv/mmu.c @@ -0,0 +1,244 @@ +#define pr_fmt(fmt) "Hyper-V: " fmt + +#include <linux/hyperv.h> +#include <linux/log2.h> +#include <linux/slab.h> +#include <linux/types.h> + +#include <asm/fpu/api.h> +#include <asm/mshyperv.h> +#include <asm/msr.h> +#include <asm/tlbflush.h> +#include <asm/tlb.h> + +#define CREATE_TRACE_POINTS +#include <asm/trace/hyperv.h> + +/* Each gva in gva_list encodes up to 4096 pages to flush */ +#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE) + +static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus, + const struct flush_tlb_info *info); + +/* + * Fills in gva_list starting from offset. Returns the number of items added. + */ +static inline int fill_gva_list(u64 gva_list[], int offset, + unsigned long start, unsigned long end) +{ + int gva_n = offset; + unsigned long cur = start, diff; + + do { + diff = end > cur ? end - cur : 0; + + gva_list[gva_n] = cur & PAGE_MASK; + /* + * Lower 12 bits encode the number of additional + * pages to flush (in addition to the 'cur' page). + */ + if (diff >= HV_TLB_FLUSH_UNIT) { + gva_list[gva_n] |= ~PAGE_MASK; + cur += HV_TLB_FLUSH_UNIT; + } else if (diff) { + gva_list[gva_n] |= (diff - 1) >> PAGE_SHIFT; + cur = end; + } + + gva_n++; + + } while (cur < end); + + return gva_n - offset; +} + +static void hyperv_flush_tlb_others(const struct cpumask *cpus, + const struct flush_tlb_info *info) +{ + int cpu, vcpu, gva_n, max_gvas; + struct hv_tlb_flush **flush_pcpu; + struct hv_tlb_flush *flush; + u64 status = U64_MAX; + unsigned long flags; + + trace_hyperv_mmu_flush_tlb_others(cpus, info); + + if (!hv_hypercall_pg) + goto do_native; + + local_irq_save(flags); + + /* + * Only check the mask _after_ interrupt has been disabled to avoid the + * mask changing under our feet. + */ + if (cpumask_empty(cpus)) { + local_irq_restore(flags); + return; + } + + flush_pcpu = (struct hv_tlb_flush **) + this_cpu_ptr(hyperv_pcpu_input_arg); + + flush = *flush_pcpu; + + if (unlikely(!flush)) { + local_irq_restore(flags); + goto do_native; + } + + if (info->mm) { + /* + * AddressSpace argument must match the CR3 with PCID bits + * stripped out. + */ + flush->address_space = virt_to_phys(info->mm->pgd); + flush->address_space &= CR3_ADDR_MASK; + flush->flags = 0; + } else { + flush->address_space = 0; + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + } + + flush->processor_mask = 0; + if (cpumask_equal(cpus, cpu_present_mask)) { + flush->flags |= HV_FLUSH_ALL_PROCESSORS; + } else { + /* + * From the supplied CPU set we need to figure out if we can get + * away with cheaper HVCALL_FLUSH_VIRTUAL_ADDRESS_{LIST,SPACE} + * hypercalls. This is possible when the highest VP number in + * the set is < 64. As VP numbers are usually in ascending order + * and match Linux CPU ids, here is an optimization: we check + * the VP number for the highest bit in the supplied set first + * so we can quickly find out if using *_EX hypercalls is a + * must. We will also check all VP numbers when walking the + * supplied CPU set to remain correct in all cases. + */ + if (hv_cpu_number_to_vp_number(cpumask_last(cpus)) >= 64) + goto do_ex_hypercall; + + for_each_cpu(cpu, cpus) { + vcpu = hv_cpu_number_to_vp_number(cpu); + if (vcpu == VP_INVAL) { + local_irq_restore(flags); + goto do_native; + } + + if (vcpu >= 64) + goto do_ex_hypercall; + + __set_bit(vcpu, (unsigned long *) + &flush->processor_mask); + } + } + + /* + * We can flush not more than max_gvas with one hypercall. Flush the + * whole address space if we were asked to do more. + */ + max_gvas = (PAGE_SIZE - sizeof(*flush)) / sizeof(flush->gva_list[0]); + + if (info->end == TLB_FLUSH_ALL) { + flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY; + status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, + flush, NULL); + } else if (info->end && + ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) { + status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, + flush, NULL); + } else { + gva_n = fill_gva_list(flush->gva_list, 0, + info->start, info->end); + status = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST, + gva_n, 0, flush, NULL); + } + goto check_status; + +do_ex_hypercall: + status = hyperv_flush_tlb_others_ex(cpus, info); + +check_status: + local_irq_restore(flags); + + if (!(status & HV_HYPERCALL_RESULT_MASK)) + return; +do_native: + native_flush_tlb_others(cpus, info); +} + +static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus, + const struct flush_tlb_info *info) +{ + int nr_bank = 0, max_gvas, gva_n; + struct hv_tlb_flush_ex **flush_pcpu; + struct hv_tlb_flush_ex *flush; + u64 status; + + if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) + return U64_MAX; + + flush_pcpu = (struct hv_tlb_flush_ex **) + this_cpu_ptr(hyperv_pcpu_input_arg); + + flush = *flush_pcpu; + + if (info->mm) { + /* + * AddressSpace argument must match the CR3 with PCID bits + * stripped out. + */ + flush->address_space = virt_to_phys(info->mm->pgd); + flush->address_space &= CR3_ADDR_MASK; + flush->flags = 0; + } else { + flush->address_space = 0; + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + } + + flush->hv_vp_set.valid_bank_mask = 0; + + flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + nr_bank = cpumask_to_vpset(&(flush->hv_vp_set), cpus); + if (nr_bank < 0) + return U64_MAX; + + /* + * We can flush not more than max_gvas with one hypercall. Flush the + * whole address space if we were asked to do more. + */ + max_gvas = + (PAGE_SIZE - sizeof(*flush) - nr_bank * + sizeof(flush->hv_vp_set.bank_contents[0])) / + sizeof(flush->gva_list[0]); + + if (info->end == TLB_FLUSH_ALL) { + flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY; + status = hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, + 0, nr_bank, flush, NULL); + } else if (info->end && + ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) { + status = hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, + 0, nr_bank, flush, NULL); + } else { + gva_n = fill_gva_list(flush->gva_list, nr_bank, + info->start, info->end); + status = hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX, + gva_n, nr_bank, flush, NULL); + } + + return status; +} + +void hyperv_setup_mmu_ops(void) +{ + if (!(ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED)) + return; + + pr_info("Using hypercall for remote TLB flush\n"); + pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others; + pv_mmu_ops.tlb_remove_table = tlb_remove_table; +} diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c new file mode 100644 index 000000000..b8e60cc50 --- /dev/null +++ b/arch/x86/hyperv/nested.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Hyper-V nested virtualization code. + * + * Copyright (C) 2018, Microsoft, Inc. + * + * Author : Lan Tianyu <Tianyu.Lan@microsoft.com> + */ + + +#include <linux/types.h> +#include <asm/hyperv-tlfs.h> +#include <asm/mshyperv.h> +#include <asm/tlbflush.h> + +#include <asm/trace/hyperv.h> + +int hyperv_flush_guest_mapping(u64 as) +{ + struct hv_guest_mapping_flush **flush_pcpu; + struct hv_guest_mapping_flush *flush; + u64 status; + unsigned long flags; + int ret = -ENOTSUPP; + + if (!hv_hypercall_pg) + goto fault; + + local_irq_save(flags); + + flush_pcpu = (struct hv_guest_mapping_flush **) + this_cpu_ptr(hyperv_pcpu_input_arg); + + flush = *flush_pcpu; + + if (unlikely(!flush)) { + local_irq_restore(flags); + goto fault; + } + + flush->address_space = as; + flush->flags = 0; + + status = hv_do_hypercall(HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE, + flush, NULL); + local_irq_restore(flags); + + if (!(status & HV_HYPERCALL_RESULT_MASK)) + ret = 0; + +fault: + trace_hyperv_nested_flush_guest_mapping(as, ret); + return ret; +} +EXPORT_SYMBOL_GPL(hyperv_flush_guest_mapping); |