summaryrefslogtreecommitdiffstats
path: root/drivers/hv/hv_common.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/hv/hv_common.c')
-rw-r--r--drivers/hv/hv_common.c586
1 files changed, 586 insertions, 0 deletions
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
new file mode 100644
index 000000000..ccad7bca3
--- /dev/null
+++ b/drivers/hv/hv_common.c
@@ -0,0 +1,586 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Architecture neutral utility routines for interacting with
+ * Hyper-V. This file is specifically for code that must be
+ * built-in to the kernel image when CONFIG_HYPERV is set
+ * (vs. being in a module) because it is called from architecture
+ * specific code under arch/.
+ *
+ * Copyright (C) 2021, Microsoft, Inc.
+ *
+ * Author : Michael Kelley <mikelley@microsoft.com>
+ */
+
+#include <linux/types.h>
+#include <linux/acpi.h>
+#include <linux/export.h>
+#include <linux/bitfield.h>
+#include <linux/cpumask.h>
+#include <linux/sched/task_stack.h>
+#include <linux/panic_notifier.h>
+#include <linux/ptrace.h>
+#include <linux/kdebug.h>
+#include <linux/kmsg_dump.h>
+#include <linux/slab.h>
+#include <linux/dma-map-ops.h>
+#include <linux/set_memory.h>
+#include <asm/hyperv-tlfs.h>
+#include <asm/mshyperv.h>
+
+/*
+ * hv_root_partition, ms_hyperv and hv_nested are defined here with other
+ * Hyper-V specific globals so they are shared across all architectures and are
+ * built only when CONFIG_HYPERV is defined. But on x86,
+ * ms_hyperv_init_platform() is built even when CONFIG_HYPERV is not
+ * defined, and it uses these three variables. So mark them as __weak
+ * here, allowing for an overriding definition in the module containing
+ * ms_hyperv_init_platform().
+ */
+bool __weak hv_root_partition;
+EXPORT_SYMBOL_GPL(hv_root_partition);
+
+bool __weak hv_nested;
+EXPORT_SYMBOL_GPL(hv_nested);
+
+struct ms_hyperv_info __weak ms_hyperv;
+EXPORT_SYMBOL_GPL(ms_hyperv);
+
+u32 *hv_vp_index;
+EXPORT_SYMBOL_GPL(hv_vp_index);
+
+u32 hv_max_vp_index;
+EXPORT_SYMBOL_GPL(hv_max_vp_index);
+
+void * __percpu *hyperv_pcpu_input_arg;
+EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg);
+
+void * __percpu *hyperv_pcpu_output_arg;
+EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg);
+
+static void hv_kmsg_dump_unregister(void);
+
+static struct ctl_table_header *hv_ctl_table_hdr;
+
+/*
+ * Hyper-V specific initialization and shutdown code that is
+ * common across all architectures. Called from architecture
+ * specific initialization functions.
+ */
+
+void __init hv_common_free(void)
+{
+ unregister_sysctl_table(hv_ctl_table_hdr);
+ hv_ctl_table_hdr = NULL;
+
+ if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE)
+ hv_kmsg_dump_unregister();
+
+ kfree(hv_vp_index);
+ hv_vp_index = NULL;
+
+ free_percpu(hyperv_pcpu_output_arg);
+ hyperv_pcpu_output_arg = NULL;
+
+ free_percpu(hyperv_pcpu_input_arg);
+ hyperv_pcpu_input_arg = NULL;
+}
+
+/*
+ * Functions for allocating and freeing memory with size and
+ * alignment HV_HYP_PAGE_SIZE. These functions are needed because
+ * the guest page size may not be the same as the Hyper-V page
+ * size. We depend upon kmalloc() aligning power-of-two size
+ * allocations to the allocation size boundary, so that the
+ * allocated memory appears to Hyper-V as a page of the size
+ * it expects.
+ */
+
+void *hv_alloc_hyperv_page(void)
+{
+ BUILD_BUG_ON(PAGE_SIZE < HV_HYP_PAGE_SIZE);
+
+ if (PAGE_SIZE == HV_HYP_PAGE_SIZE)
+ return (void *)__get_free_page(GFP_KERNEL);
+ else
+ return kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(hv_alloc_hyperv_page);
+
+void *hv_alloc_hyperv_zeroed_page(void)
+{
+ if (PAGE_SIZE == HV_HYP_PAGE_SIZE)
+ return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ else
+ return kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(hv_alloc_hyperv_zeroed_page);
+
+void hv_free_hyperv_page(void *addr)
+{
+ if (PAGE_SIZE == HV_HYP_PAGE_SIZE)
+ free_page((unsigned long)addr);
+ else
+ kfree(addr);
+}
+EXPORT_SYMBOL_GPL(hv_free_hyperv_page);
+
+static void *hv_panic_page;
+
+/*
+ * Boolean to control whether to report panic messages over Hyper-V.
+ *
+ * It can be set via /proc/sys/kernel/hyperv_record_panic_msg
+ */
+static int sysctl_record_panic_msg = 1;
+
+/*
+ * sysctl option to allow the user to control whether kmsg data should be
+ * reported to Hyper-V on panic.
+ */
+static struct ctl_table hv_ctl_table[] = {
+ {
+ .procname = "hyperv_record_panic_msg",
+ .data = &sysctl_record_panic_msg,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
+ },
+ {}
+};
+
+static int hv_die_panic_notify_crash(struct notifier_block *self,
+ unsigned long val, void *args);
+
+static struct notifier_block hyperv_die_report_block = {
+ .notifier_call = hv_die_panic_notify_crash,
+};
+
+static struct notifier_block hyperv_panic_report_block = {
+ .notifier_call = hv_die_panic_notify_crash,
+};
+
+/*
+ * The following callback works both as die and panic notifier; its
+ * goal is to provide panic information to the hypervisor unless the
+ * kmsg dumper is used [see hv_kmsg_dump()], which provides more
+ * information but isn't always available.
+ *
+ * Notice that both the panic/die report notifiers are registered only
+ * if we have the capability HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE set.
+ */
+static int hv_die_panic_notify_crash(struct notifier_block *self,
+ unsigned long val, void *args)
+{
+ struct pt_regs *regs;
+ bool is_die;
+
+ /* Don't notify Hyper-V unless we have a die oops event or panic. */
+ if (self == &hyperv_panic_report_block) {
+ is_die = false;
+ regs = current_pt_regs();
+ } else { /* die event */
+ if (val != DIE_OOPS)
+ return NOTIFY_DONE;
+
+ is_die = true;
+ regs = ((struct die_args *)args)->regs;
+ }
+
+ /*
+ * Hyper-V should be notified only once about a panic/die. If we will
+ * be calling hv_kmsg_dump() later with kmsg data, don't do the
+ * notification here.
+ */
+ if (!sysctl_record_panic_msg || !hv_panic_page)
+ hyperv_report_panic(regs, val, is_die);
+
+ return NOTIFY_DONE;
+}
+
+/*
+ * Callback from kmsg_dump. Grab as much as possible from the end of the kmsg
+ * buffer and call into Hyper-V to transfer the data.
+ */
+static void hv_kmsg_dump(struct kmsg_dumper *dumper,
+ enum kmsg_dump_reason reason)
+{
+ struct kmsg_dump_iter iter;
+ size_t bytes_written;
+
+ /* We are only interested in panics. */
+ if (reason != KMSG_DUMP_PANIC || !sysctl_record_panic_msg)
+ return;
+
+ /*
+ * Write dump contents to the page. No need to synchronize; panic should
+ * be single-threaded.
+ */
+ kmsg_dump_rewind(&iter);
+ kmsg_dump_get_buffer(&iter, false, hv_panic_page, HV_HYP_PAGE_SIZE,
+ &bytes_written);
+ if (!bytes_written)
+ return;
+ /*
+ * P3 to contain the physical address of the panic page & P4 to
+ * contain the size of the panic data in that page. Rest of the
+ * registers are no-op when the NOTIFY_MSG flag is set.
+ */
+ hv_set_register(HV_REGISTER_CRASH_P0, 0);
+ hv_set_register(HV_REGISTER_CRASH_P1, 0);
+ hv_set_register(HV_REGISTER_CRASH_P2, 0);
+ hv_set_register(HV_REGISTER_CRASH_P3, virt_to_phys(hv_panic_page));
+ hv_set_register(HV_REGISTER_CRASH_P4, bytes_written);
+
+ /*
+ * Let Hyper-V know there is crash data available along with
+ * the panic message.
+ */
+ hv_set_register(HV_REGISTER_CRASH_CTL,
+ (HV_CRASH_CTL_CRASH_NOTIFY |
+ HV_CRASH_CTL_CRASH_NOTIFY_MSG));
+}
+
+static struct kmsg_dumper hv_kmsg_dumper = {
+ .dump = hv_kmsg_dump,
+};
+
+static void hv_kmsg_dump_unregister(void)
+{
+ kmsg_dump_unregister(&hv_kmsg_dumper);
+ unregister_die_notifier(&hyperv_die_report_block);
+ atomic_notifier_chain_unregister(&panic_notifier_list,
+ &hyperv_panic_report_block);
+
+ hv_free_hyperv_page(hv_panic_page);
+ hv_panic_page = NULL;
+}
+
+static void hv_kmsg_dump_register(void)
+{
+ int ret;
+
+ hv_panic_page = hv_alloc_hyperv_zeroed_page();
+ if (!hv_panic_page) {
+ pr_err("Hyper-V: panic message page memory allocation failed\n");
+ return;
+ }
+
+ ret = kmsg_dump_register(&hv_kmsg_dumper);
+ if (ret) {
+ pr_err("Hyper-V: kmsg dump register error 0x%x\n", ret);
+ hv_free_hyperv_page(hv_panic_page);
+ hv_panic_page = NULL;
+ }
+}
+
+int __init hv_common_init(void)
+{
+ int i;
+
+ if (hv_is_isolation_supported())
+ sysctl_record_panic_msg = 0;
+
+ /*
+ * Hyper-V expects to get crash register data or kmsg when
+ * crash enlightment is available and system crashes. Set
+ * crash_kexec_post_notifiers to be true to make sure that
+ * calling crash enlightment interface before running kdump
+ * kernel.
+ */
+ if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
+ u64 hyperv_crash_ctl;
+
+ crash_kexec_post_notifiers = true;
+ pr_info("Hyper-V: enabling crash_kexec_post_notifiers\n");
+
+ /*
+ * Panic message recording (sysctl_record_panic_msg)
+ * is enabled by default in non-isolated guests and
+ * disabled by default in isolated guests; the panic
+ * message recording won't be available in isolated
+ * guests should the following registration fail.
+ */
+ hv_ctl_table_hdr = register_sysctl("kernel", hv_ctl_table);
+ if (!hv_ctl_table_hdr)
+ pr_err("Hyper-V: sysctl table register error");
+
+ /*
+ * Register for panic kmsg callback only if the right
+ * capability is supported by the hypervisor.
+ */
+ hyperv_crash_ctl = hv_get_register(HV_REGISTER_CRASH_CTL);
+ if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG)
+ hv_kmsg_dump_register();
+
+ register_die_notifier(&hyperv_die_report_block);
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &hyperv_panic_report_block);
+ }
+
+ /*
+ * Allocate the per-CPU state for the hypercall input arg.
+ * If this allocation fails, we will not be able to setup
+ * (per-CPU) hypercall input page and thus this failure is
+ * fatal on Hyper-V.
+ */
+ hyperv_pcpu_input_arg = alloc_percpu(void *);
+ BUG_ON(!hyperv_pcpu_input_arg);
+
+ /* Allocate the per-CPU state for output arg for root */
+ if (hv_root_partition) {
+ hyperv_pcpu_output_arg = alloc_percpu(void *);
+ BUG_ON(!hyperv_pcpu_output_arg);
+ }
+
+ hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index),
+ GFP_KERNEL);
+ if (!hv_vp_index) {
+ hv_common_free();
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < num_possible_cpus(); i++)
+ hv_vp_index[i] = VP_INVAL;
+
+ return 0;
+}
+
+/*
+ * Hyper-V specific initialization and die code for
+ * individual CPUs that is common across all architectures.
+ * Called by the CPU hotplug mechanism.
+ */
+
+int hv_common_cpu_init(unsigned int cpu)
+{
+ void **inputarg, **outputarg;
+ u64 msr_vp_index;
+ gfp_t flags;
+ int pgcount = hv_root_partition ? 2 : 1;
+ void *mem;
+ int ret;
+
+ /* hv_cpu_init() can be called with IRQs disabled from hv_resume() */
+ flags = irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL;
+
+ inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ /*
+ * hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory is already
+ * allocated if this CPU was previously online and then taken offline
+ */
+ if (!*inputarg) {
+ mem = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags);
+ if (!mem)
+ return -ENOMEM;
+
+ if (hv_root_partition) {
+ outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
+ *outputarg = (char *)mem + HV_HYP_PAGE_SIZE;
+ }
+
+ if (!ms_hyperv.paravisor_present &&
+ (hv_isolation_type_snp() || hv_isolation_type_tdx())) {
+ ret = set_memory_decrypted((unsigned long)mem, pgcount);
+ if (ret) {
+ /* It may be unsafe to free 'mem' */
+ return ret;
+ }
+
+ memset(mem, 0x00, pgcount * HV_HYP_PAGE_SIZE);
+ }
+
+ /*
+ * In a fully enlightened TDX/SNP VM with more than 64 VPs, if
+ * hyperv_pcpu_input_arg is not NULL, set_memory_decrypted() ->
+ * ... -> cpa_flush()-> ... -> __send_ipi_mask_ex() tries to
+ * use hyperv_pcpu_input_arg as the hypercall input page, which
+ * must be a decrypted page in such a VM, but the page is still
+ * encrypted before set_memory_decrypted() returns. Fix this by
+ * setting *inputarg after the above set_memory_decrypted(): if
+ * hyperv_pcpu_input_arg is NULL, __send_ipi_mask_ex() returns
+ * HV_STATUS_INVALID_PARAMETER immediately, and the function
+ * hv_send_ipi_mask() falls back to orig_apic.send_IPI_mask(),
+ * which may be slightly slower than the hypercall, but still
+ * works correctly in such a VM.
+ */
+ *inputarg = mem;
+ }
+
+ msr_vp_index = hv_get_register(HV_REGISTER_VP_INDEX);
+
+ hv_vp_index[cpu] = msr_vp_index;
+
+ if (msr_vp_index > hv_max_vp_index)
+ hv_max_vp_index = msr_vp_index;
+
+ return 0;
+}
+
+int hv_common_cpu_die(unsigned int cpu)
+{
+ /*
+ * The hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory
+ * is not freed when the CPU goes offline as the hyperv_pcpu_input_arg
+ * may be used by the Hyper-V vPCI driver in reassigning interrupts
+ * as part of the offlining process. The interrupt reassignment
+ * happens *after* the CPUHP_AP_HYPERV_ONLINE state has run and
+ * called this function.
+ *
+ * If a previously offlined CPU is brought back online again, the
+ * originally allocated memory is reused in hv_common_cpu_init().
+ */
+
+ return 0;
+}
+
+/* Bit mask of the extended capability to query: see HV_EXT_CAPABILITY_xxx */
+bool hv_query_ext_cap(u64 cap_query)
+{
+ /*
+ * The address of the 'hv_extended_cap' variable will be used as an
+ * output parameter to the hypercall below and so it should be
+ * compatible with 'virt_to_phys'. Which means, it's address should be
+ * directly mapped. Use 'static' to keep it compatible; stack variables
+ * can be virtually mapped, making them incompatible with
+ * 'virt_to_phys'.
+ * Hypercall input/output addresses should also be 8-byte aligned.
+ */
+ static u64 hv_extended_cap __aligned(8);
+ static bool hv_extended_cap_queried;
+ u64 status;
+
+ /*
+ * Querying extended capabilities is an extended hypercall. Check if the
+ * partition supports extended hypercall, first.
+ */
+ if (!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS))
+ return false;
+
+ /* Extended capabilities do not change at runtime. */
+ if (hv_extended_cap_queried)
+ return hv_extended_cap & cap_query;
+
+ status = hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL,
+ &hv_extended_cap);
+
+ /*
+ * The query extended capabilities hypercall should not fail under
+ * any normal circumstances. Avoid repeatedly making the hypercall, on
+ * error.
+ */
+ hv_extended_cap_queried = true;
+ if (!hv_result_success(status)) {
+ pr_err("Hyper-V: Extended query capabilities hypercall failed 0x%llx\n",
+ status);
+ return false;
+ }
+
+ return hv_extended_cap & cap_query;
+}
+EXPORT_SYMBOL_GPL(hv_query_ext_cap);
+
+void hv_setup_dma_ops(struct device *dev, bool coherent)
+{
+ /*
+ * Hyper-V does not offer a vIOMMU in the guest
+ * VM, so pass 0/NULL for the IOMMU settings
+ */
+ arch_setup_dma_ops(dev, 0, 0, NULL, coherent);
+}
+EXPORT_SYMBOL_GPL(hv_setup_dma_ops);
+
+bool hv_is_hibernation_supported(void)
+{
+ return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4);
+}
+EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
+
+/*
+ * Default function to read the Hyper-V reference counter, independent
+ * of whether Hyper-V enlightened clocks/timers are being used. But on
+ * architectures where it is used, Hyper-V enlightenment code in
+ * hyperv_timer.c may override this function.
+ */
+static u64 __hv_read_ref_counter(void)
+{
+ return hv_get_register(HV_REGISTER_TIME_REF_COUNT);
+}
+
+u64 (*hv_read_reference_counter)(void) = __hv_read_ref_counter;
+EXPORT_SYMBOL_GPL(hv_read_reference_counter);
+
+/* These __weak functions provide default "no-op" behavior and
+ * may be overridden by architecture specific versions. Architectures
+ * for which the default "no-op" behavior is sufficient can leave
+ * them unimplemented and not be cluttered with a bunch of stub
+ * functions in arch-specific code.
+ */
+
+bool __weak hv_is_isolation_supported(void)
+{
+ return false;
+}
+EXPORT_SYMBOL_GPL(hv_is_isolation_supported);
+
+bool __weak hv_isolation_type_snp(void)
+{
+ return false;
+}
+EXPORT_SYMBOL_GPL(hv_isolation_type_snp);
+
+bool __weak hv_isolation_type_tdx(void)
+{
+ return false;
+}
+EXPORT_SYMBOL_GPL(hv_isolation_type_tdx);
+
+void __weak hv_setup_vmbus_handler(void (*handler)(void))
+{
+}
+EXPORT_SYMBOL_GPL(hv_setup_vmbus_handler);
+
+void __weak hv_remove_vmbus_handler(void)
+{
+}
+EXPORT_SYMBOL_GPL(hv_remove_vmbus_handler);
+
+void __weak hv_setup_kexec_handler(void (*handler)(void))
+{
+}
+EXPORT_SYMBOL_GPL(hv_setup_kexec_handler);
+
+void __weak hv_remove_kexec_handler(void)
+{
+}
+EXPORT_SYMBOL_GPL(hv_remove_kexec_handler);
+
+void __weak hv_setup_crash_handler(void (*handler)(struct pt_regs *regs))
+{
+}
+EXPORT_SYMBOL_GPL(hv_setup_crash_handler);
+
+void __weak hv_remove_crash_handler(void)
+{
+}
+EXPORT_SYMBOL_GPL(hv_remove_crash_handler);
+
+void __weak hyperv_cleanup(void)
+{
+}
+EXPORT_SYMBOL_GPL(hyperv_cleanup);
+
+u64 __weak hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size)
+{
+ return HV_STATUS_INVALID_PARAMETER;
+}
+EXPORT_SYMBOL_GPL(hv_ghcb_hypercall);
+
+u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2)
+{
+ return HV_STATUS_INVALID_PARAMETER;
+}
+EXPORT_SYMBOL_GPL(hv_tdx_hypercall);