summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm/svm
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:49:45 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:49:45 +0000
commit2c3c1048746a4622d8c89a29670120dc8fab93c4 (patch)
tree848558de17fb3008cdf4d861b01ac7781903ce39 /arch/x86/kvm/svm
parentInitial commit. (diff)
downloadlinux-upstream.tar.xz
linux-upstream.zip
Adding upstream version 6.1.76.upstream/6.1.76upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'arch/x86/kvm/svm')
-rw-r--r--arch/x86/kvm/svm/avic.c1259
-rw-r--r--arch/x86/kvm/svm/hyperv.h13
-rw-r--r--arch/x86/kvm/svm/nested.c1716
-rw-r--r--arch/x86/kvm/svm/pmu.c232
-rw-r--r--arch/x86/kvm/svm/sev.c3076
-rw-r--r--arch/x86/kvm/svm/svm.c5172
-rw-r--r--arch/x86/kvm/svm/svm.h718
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.c40
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.h117
-rw-r--r--arch/x86/kvm/svm/svm_ops.h64
-rw-r--r--arch/x86/kvm/svm/vmenter.S392
11 files changed, 12799 insertions, 0 deletions
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
new file mode 100644
index 000000000..fb125b54e
--- /dev/null
+++ b/arch/x86/kvm/svm/avic.c
@@ -0,0 +1,1259 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ */
+
+#define pr_fmt(fmt) "SVM: " fmt
+
+#include <linux/kvm_types.h>
+#include <linux/hashtable.h>
+#include <linux/amd-iommu.h>
+#include <linux/kvm_host.h>
+
+#include <asm/irq_remapping.h>
+
+#include "trace.h"
+#include "lapic.h"
+#include "x86.h"
+#include "irq.h"
+#include "svm.h"
+
+/*
+ * Encode the arbitrary VM ID and the vCPU's default APIC ID, i.e the vCPU ID,
+ * into the GATag so that KVM can retrieve the correct vCPU from a GALog entry
+ * if an interrupt can't be delivered, e.g. because the vCPU isn't running.
+ *
+ * For the vCPU ID, use however many bits are currently allowed for the max
+ * guest physical APIC ID (limited by the size of the physical ID table), and
+ * use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the
+ * size of the GATag is defined by hardware (32 bits), but is an opaque value
+ * as far as hardware is concerned.
+ */
+#define AVIC_VCPU_ID_MASK AVIC_PHYSICAL_MAX_INDEX_MASK
+
+#define AVIC_VM_ID_SHIFT HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK)
+#define AVIC_VM_ID_MASK (GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT)
+
+#define AVIC_GATAG(x, y) (((x & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
+ (y & AVIC_VCPU_ID_MASK))
+#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK)
+#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK)
+
+static_assert(AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u);
+
+static bool force_avic;
+module_param_unsafe(force_avic, bool, 0444);
+
+/* Note:
+ * This hash table is used to map VM_ID to a struct kvm_svm,
+ * when handling AMD IOMMU GALOG notification to schedule in
+ * a particular vCPU.
+ */
+#define SVM_VM_DATA_HASH_BITS 8
+static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
+static u32 next_vm_id = 0;
+static bool next_vm_id_wrapped = 0;
+static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
+enum avic_modes avic_mode;
+
+/*
+ * This is a wrapper of struct amd_iommu_ir_data.
+ */
+struct amd_svm_iommu_ir {
+ struct list_head node; /* Used by SVM for per-vcpu ir_list */
+ void *data; /* Storing pointer to struct amd_ir_data */
+};
+
+static void avic_activate_vmcb(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
+ vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
+
+ vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+
+ /* Note:
+ * KVM can support hybrid-AVIC mode, where KVM emulates x2APIC
+ * MSR accesses, while interrupt injection to a running vCPU
+ * can be achieved using AVIC doorbell. The AVIC hardware still
+ * accelerate MMIO accesses, but this does not cause any harm
+ * as the guest is not supposed to access xAPIC mmio when uses x2APIC.
+ */
+ if (apic_x2apic_mode(svm->vcpu.arch.apic) &&
+ avic_mode == AVIC_MODE_X2) {
+ vmcb->control.int_ctl |= X2APIC_MODE_MASK;
+ vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID;
+ /* Disabling MSR intercept for x2APIC registers */
+ svm_set_x2apic_msr_interception(svm, false);
+ } else {
+ /*
+ * Flush the TLB, the guest may have inserted a non-APIC
+ * mapping into the TLB while AVIC was disabled.
+ */
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
+
+ /* For xAVIC and hybrid-xAVIC modes */
+ vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
+ /* Enabling MSR intercept for x2APIC registers */
+ svm_set_x2apic_msr_interception(svm, true);
+ }
+}
+
+static void avic_deactivate_vmcb(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
+ vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
+
+ /*
+ * If running nested and the guest uses its own MSR bitmap, there
+ * is no need to update L0's msr bitmap
+ */
+ if (is_guest_mode(&svm->vcpu) &&
+ vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))
+ return;
+
+ /* Enabling MSR intercept for x2APIC registers */
+ svm_set_x2apic_msr_interception(svm, true);
+}
+
+/* Note:
+ * This function is called from IOMMU driver to notify
+ * SVM to schedule in a particular vCPU of a particular VM.
+ */
+int avic_ga_log_notifier(u32 ga_tag)
+{
+ unsigned long flags;
+ struct kvm_svm *kvm_svm;
+ struct kvm_vcpu *vcpu = NULL;
+ u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
+ u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
+
+ pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
+ trace_kvm_avic_ga_log(vm_id, vcpu_id);
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
+ if (kvm_svm->avic_vm_id != vm_id)
+ continue;
+ vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
+ break;
+ }
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+ /* Note:
+ * At this point, the IOMMU should have already set the pending
+ * bit in the vAPIC backing page. So, we just need to schedule
+ * in the vcpu.
+ */
+ if (vcpu)
+ kvm_vcpu_wake_up(vcpu);
+
+ return 0;
+}
+
+void avic_vm_destroy(struct kvm *kvm)
+{
+ unsigned long flags;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+
+ if (!enable_apicv)
+ return;
+
+ if (kvm_svm->avic_logical_id_table_page)
+ __free_page(kvm_svm->avic_logical_id_table_page);
+ if (kvm_svm->avic_physical_id_table_page)
+ __free_page(kvm_svm->avic_physical_id_table_page);
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ hash_del(&kvm_svm->hnode);
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+}
+
+int avic_vm_init(struct kvm *kvm)
+{
+ unsigned long flags;
+ int err = -ENOMEM;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+ struct kvm_svm *k2;
+ struct page *p_page;
+ struct page *l_page;
+ u32 vm_id;
+
+ if (!enable_apicv)
+ return 0;
+
+ /* Allocating physical APIC ID table (4KB) */
+ p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!p_page)
+ goto free_avic;
+
+ kvm_svm->avic_physical_id_table_page = p_page;
+
+ /* Allocating logical APIC ID table (4KB) */
+ l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!l_page)
+ goto free_avic;
+
+ kvm_svm->avic_logical_id_table_page = l_page;
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ again:
+ vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
+ if (vm_id == 0) { /* id is 1-based, zero is not okay */
+ next_vm_id_wrapped = 1;
+ goto again;
+ }
+ /* Is it still in use? Only possible if wrapped at least once */
+ if (next_vm_id_wrapped) {
+ hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
+ if (k2->avic_vm_id == vm_id)
+ goto again;
+ }
+ }
+ kvm_svm->avic_vm_id = vm_id;
+ hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+ return 0;
+
+free_avic:
+ avic_vm_destroy(kvm);
+ return err;
+}
+
+void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
+ phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
+ phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
+ phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
+
+ vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
+ vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
+ vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
+ vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK;
+
+ if (kvm_apicv_activated(svm->vcpu.kvm))
+ avic_activate_vmcb(svm);
+ else
+ avic_deactivate_vmcb(svm);
+}
+
+static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
+ unsigned int index)
+{
+ u64 *avic_physical_id_table;
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+
+ if ((avic_mode == AVIC_MODE_X1 && index > AVIC_MAX_PHYSICAL_ID) ||
+ (avic_mode == AVIC_MODE_X2 && index > X2AVIC_MAX_PHYSICAL_ID))
+ return NULL;
+
+ avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
+
+ return &avic_physical_id_table[index];
+}
+
+/*
+ * Note:
+ * AVIC hardware walks the nested page table to check permissions,
+ * but does not use the SPA address specified in the leaf page
+ * table entry since it uses address in the AVIC_BACKING_PAGE pointer
+ * field of the VMCB. Therefore, we set up the
+ * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
+ */
+static int avic_alloc_access_page(struct kvm *kvm)
+{
+ void __user *ret;
+ int r = 0;
+
+ mutex_lock(&kvm->slots_lock);
+
+ if (kvm->arch.apic_access_memslot_enabled)
+ goto out;
+
+ ret = __x86_set_memory_region(kvm,
+ APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ APIC_DEFAULT_PHYS_BASE,
+ PAGE_SIZE);
+ if (IS_ERR(ret)) {
+ r = PTR_ERR(ret);
+ goto out;
+ }
+
+ kvm->arch.apic_access_memslot_enabled = true;
+out:
+ mutex_unlock(&kvm->slots_lock);
+ return r;
+}
+
+static int avic_init_backing_page(struct kvm_vcpu *vcpu)
+{
+ u64 *entry, new_entry;
+ int id = vcpu->vcpu_id;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if ((avic_mode == AVIC_MODE_X1 && id > AVIC_MAX_PHYSICAL_ID) ||
+ (avic_mode == AVIC_MODE_X2 && id > X2AVIC_MAX_PHYSICAL_ID))
+ return -EINVAL;
+
+ if (!vcpu->arch.apic->regs)
+ return -EINVAL;
+
+ if (kvm_apicv_activated(vcpu->kvm)) {
+ int ret;
+
+ ret = avic_alloc_access_page(vcpu->kvm);
+ if (ret)
+ return ret;
+ }
+
+ svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);
+
+ /* Setting AVIC backing page address in the phy APIC ID table */
+ entry = avic_get_physical_id_entry(vcpu, id);
+ if (!entry)
+ return -EINVAL;
+
+ new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
+ AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
+ AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
+ WRITE_ONCE(*entry, new_entry);
+
+ svm->avic_physical_id_cache = entry;
+
+ return 0;
+}
+
+void avic_ring_doorbell(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Note, the vCPU could get migrated to a different pCPU at any point,
+ * which could result in signalling the wrong/previous pCPU. But if
+ * that happens the vCPU is guaranteed to do a VMRUN (after being
+ * migrated) and thus will process pending interrupts, i.e. a doorbell
+ * is not needed (and the spurious one is harmless).
+ */
+ int cpu = READ_ONCE(vcpu->cpu);
+
+ if (cpu != get_cpu()) {
+ wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
+ trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu));
+ }
+ put_cpu();
+}
+
+/*
+ * A fast-path version of avic_kick_target_vcpus(), which attempts to match
+ * destination APIC ID to vCPU without looping through all vCPUs.
+ */
+static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
+ u32 icrl, u32 icrh, u32 index)
+{
+ u32 l1_physical_id, dest;
+ struct kvm_vcpu *target_vcpu;
+ int dest_mode = icrl & APIC_DEST_MASK;
+ int shorthand = icrl & APIC_SHORT_MASK;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+
+ if (shorthand != APIC_DEST_NOSHORT)
+ return -EINVAL;
+
+ if (apic_x2apic_mode(source))
+ dest = icrh;
+ else
+ dest = GET_XAPIC_DEST_FIELD(icrh);
+
+ if (dest_mode == APIC_DEST_PHYSICAL) {
+ /* broadcast destination, use slow path */
+ if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
+ return -EINVAL;
+ if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
+ return -EINVAL;
+
+ l1_physical_id = dest;
+
+ if (WARN_ON_ONCE(l1_physical_id != index))
+ return -EINVAL;
+
+ } else {
+ u32 bitmap, cluster;
+ int logid_index;
+
+ if (apic_x2apic_mode(source)) {
+ /* 16 bit dest mask, 16 bit cluster id */
+ bitmap = dest & 0xFFFF0000;
+ cluster = (dest >> 16) << 4;
+ } else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
+ /* 8 bit dest mask*/
+ bitmap = dest;
+ cluster = 0;
+ } else {
+ /* 4 bit desk mask, 4 bit cluster id */
+ bitmap = dest & 0xF;
+ cluster = (dest >> 4) << 2;
+ }
+
+ if (unlikely(!bitmap))
+ /* guest bug: nobody to send the logical interrupt to */
+ return 0;
+
+ if (!is_power_of_2(bitmap))
+ /* multiple logical destinations, use slow path */
+ return -EINVAL;
+
+ logid_index = cluster + __ffs(bitmap);
+
+ if (!apic_x2apic_mode(source)) {
+ u32 *avic_logical_id_table =
+ page_address(kvm_svm->avic_logical_id_table_page);
+
+ u32 logid_entry = avic_logical_id_table[logid_index];
+
+ if (WARN_ON_ONCE(index != logid_index))
+ return -EINVAL;
+
+ /* guest bug: non existing/reserved logical destination */
+ if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
+ return 0;
+
+ l1_physical_id = logid_entry &
+ AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+ } else {
+ /*
+ * For x2APIC logical mode, cannot leverage the index.
+ * Instead, calculate physical ID from logical ID in ICRH.
+ */
+ int cluster = (icrh & 0xffff0000) >> 16;
+ int apic = ffs(icrh & 0xffff) - 1;
+
+ /*
+ * If the x2APIC logical ID sub-field (i.e. icrh[15:0])
+ * contains anything but a single bit, we cannot use the
+ * fast path, because it is limited to a single vCPU.
+ */
+ if (apic < 0 || icrh != (1 << apic))
+ return -EINVAL;
+
+ l1_physical_id = (cluster << 4) + apic;
+ }
+ }
+
+ target_vcpu = kvm_get_vcpu_by_id(kvm, l1_physical_id);
+ if (unlikely(!target_vcpu))
+ /* guest bug: non existing vCPU is a target of this IPI*/
+ return 0;
+
+ target_vcpu->arch.apic->irr_pending = true;
+ svm_complete_interrupt_delivery(target_vcpu,
+ icrl & APIC_MODE_MASK,
+ icrl & APIC_INT_LEVELTRIG,
+ icrl & APIC_VECTOR_MASK);
+ return 0;
+}
+
+static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
+ u32 icrl, u32 icrh, u32 index)
+{
+ unsigned long i;
+ struct kvm_vcpu *vcpu;
+
+ if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index))
+ return;
+
+ trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index);
+
+ /*
+ * Wake any target vCPUs that are blocking, i.e. waiting for a wake
+ * event. There's no need to signal doorbells, as hardware has handled
+ * vCPUs that were in guest at the time of the IPI, and vCPUs that have
+ * since entered the guest will have processed pending IRQs at VMRUN.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ u32 dest;
+
+ if (apic_x2apic_mode(vcpu->arch.apic))
+ dest = icrh;
+ else
+ dest = GET_XAPIC_DEST_FIELD(icrh);
+
+ if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
+ dest, icrl & APIC_DEST_MASK)) {
+ vcpu->arch.apic->irr_pending = true;
+ svm_complete_interrupt_delivery(vcpu,
+ icrl & APIC_MODE_MASK,
+ icrl & APIC_INT_LEVELTRIG,
+ icrl & APIC_VECTOR_MASK);
+ }
+ }
+}
+
+int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
+ u32 icrl = svm->vmcb->control.exit_info_1;
+ u32 id = svm->vmcb->control.exit_info_2 >> 32;
+ u32 index = svm->vmcb->control.exit_info_2 & 0x1FF;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
+
+ switch (id) {
+ case AVIC_IPI_FAILURE_INVALID_TARGET:
+ case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
+ /*
+ * Emulate IPIs that are not handled by AVIC hardware, which
+ * only virtualizes Fixed, Edge-Triggered INTRs, and falls over
+ * if _any_ targets are invalid, e.g. if the logical mode mask
+ * is a superset of running vCPUs.
+ *
+ * The exit is a trap, e.g. ICR holds the correct value and RIP
+ * has been advanced, KVM is responsible only for emulating the
+ * IPI. Sadly, hardware may sometimes leave the BUSY flag set,
+ * in which case KVM needs to emulate the ICR write as well in
+ * order to clear the BUSY flag.
+ */
+ if (icrl & APIC_ICR_BUSY)
+ kvm_apic_write_nodecode(vcpu, APIC_ICR);
+ else
+ kvm_apic_send_ipi(apic, icrl, icrh);
+ break;
+ case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
+ /*
+ * At this point, we expect that the AVIC HW has already
+ * set the appropriate IRR bits on the valid target
+ * vcpus. So, we just need to kick the appropriate vcpu.
+ */
+ avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
+ break;
+ case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
+ WARN_ONCE(1, "Invalid backing page\n");
+ break;
+ case AVIC_IPI_FAILURE_INVALID_IPI_VECTOR:
+ /* Invalid IPI with vector < 16 */
+ break;
+ default:
+ vcpu_unimpl(vcpu, "Unknown avic incomplete IPI interception\n");
+ }
+
+ return 1;
+}
+
+unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu))
+ return APICV_INHIBIT_REASON_NESTED;
+ return 0;
+}
+
+static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+ int index;
+ u32 *logical_apic_id_table;
+ int dlid = GET_APIC_LOGICAL_ID(ldr);
+
+ if (!dlid)
+ return NULL;
+
+ if (flat) { /* flat */
+ index = ffs(dlid) - 1;
+ if (index > 7)
+ return NULL;
+ } else { /* cluster */
+ int cluster = (dlid & 0xf0) >> 4;
+ int apic = ffs(dlid & 0x0f) - 1;
+
+ if ((apic < 0) || (apic > 7) ||
+ (cluster >= 0xf))
+ return NULL;
+ index = (cluster << 2) + apic;
+ }
+
+ logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
+
+ return &logical_apic_id_table[index];
+}
+
+static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
+{
+ bool flat;
+ u32 *entry, new_entry;
+
+ flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
+ entry = avic_get_logical_id_entry(vcpu, ldr, flat);
+ if (!entry)
+ return -EINVAL;
+
+ new_entry = READ_ONCE(*entry);
+ new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+ new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
+ new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
+ WRITE_ONCE(*entry, new_entry);
+
+ return 0;
+}
+
+static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool flat = svm->dfr_reg == APIC_DFR_FLAT;
+ u32 *entry;
+
+ /* Note: x2AVIC does not use logical APIC ID table */
+ if (apic_x2apic_mode(vcpu->arch.apic))
+ return;
+
+ entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
+ if (entry)
+ clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
+}
+
+static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
+{
+ int ret = 0;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
+ u32 id = kvm_xapic_id(vcpu->arch.apic);
+
+ /* AVIC does not support LDR update for x2APIC */
+ if (apic_x2apic_mode(vcpu->arch.apic))
+ return 0;
+
+ if (ldr == svm->ldr_reg)
+ return 0;
+
+ avic_invalidate_logical_id_entry(vcpu);
+
+ if (ldr)
+ ret = avic_ldr_write(vcpu, id, ldr);
+
+ if (!ret)
+ svm->ldr_reg = ldr;
+
+ return ret;
+}
+
+static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
+
+ if (svm->dfr_reg == dfr)
+ return;
+
+ avic_invalidate_logical_id_entry(vcpu);
+ svm->dfr_reg = dfr;
+}
+
+static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
+{
+ u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 &
+ AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+
+ switch (offset) {
+ case APIC_LDR:
+ if (avic_handle_ldr_update(vcpu))
+ return 0;
+ break;
+ case APIC_DFR:
+ avic_handle_dfr_update(vcpu);
+ break;
+ default:
+ break;
+ }
+
+ kvm_apic_write_nodecode(vcpu, offset);
+ return 1;
+}
+
+static bool is_avic_unaccelerated_access_trap(u32 offset)
+{
+ bool ret = false;
+
+ switch (offset) {
+ case APIC_ID:
+ case APIC_EOI:
+ case APIC_RRR:
+ case APIC_LDR:
+ case APIC_DFR:
+ case APIC_SPIV:
+ case APIC_ESR:
+ case APIC_ICR:
+ case APIC_LVTT:
+ case APIC_LVTTHMR:
+ case APIC_LVTPC:
+ case APIC_LVT0:
+ case APIC_LVT1:
+ case APIC_LVTERR:
+ case APIC_TMICT:
+ case APIC_TDCR:
+ ret = true;
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret = 0;
+ u32 offset = svm->vmcb->control.exit_info_1 &
+ AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+ u32 vector = svm->vmcb->control.exit_info_2 &
+ AVIC_UNACCEL_ACCESS_VECTOR_MASK;
+ bool write = (svm->vmcb->control.exit_info_1 >> 32) &
+ AVIC_UNACCEL_ACCESS_WRITE_MASK;
+ bool trap = is_avic_unaccelerated_access_trap(offset);
+
+ trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
+ trap, write, vector);
+ if (trap) {
+ /* Handling Trap */
+ WARN_ONCE(!write, "svm: Handling trap read.\n");
+ ret = avic_unaccel_trap_write(vcpu);
+ } else {
+ /* Handling Fault */
+ ret = kvm_emulate_instruction(vcpu, 0);
+ }
+
+ return ret;
+}
+
+int avic_init_vcpu(struct vcpu_svm *svm)
+{
+ int ret;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
+ return 0;
+
+ ret = avic_init_backing_page(vcpu);
+ if (ret)
+ return ret;
+
+ INIT_LIST_HEAD(&svm->ir_list);
+ spin_lock_init(&svm->ir_list_lock);
+ svm->dfr_reg = APIC_DFR_FLAT;
+
+ return ret;
+}
+
+void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
+{
+ avic_handle_dfr_update(vcpu);
+ avic_handle_ldr_update(vcpu);
+}
+
+static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
+{
+ int ret = 0;
+ unsigned long flags;
+ struct amd_svm_iommu_ir *ir;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm))
+ return 0;
+
+ /*
+ * Here, we go through the per-vcpu ir_list to update all existing
+ * interrupt remapping table entry targeting this vcpu.
+ */
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+
+ if (list_empty(&svm->ir_list))
+ goto out;
+
+ list_for_each_entry(ir, &svm->ir_list, node) {
+ if (activate)
+ ret = amd_iommu_activate_guest_mode(ir->data);
+ else
+ ret = amd_iommu_deactivate_guest_mode(ir->data);
+ if (ret)
+ break;
+ }
+out:
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+ return ret;
+}
+
+static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+ unsigned long flags;
+ struct amd_svm_iommu_ir *cur;
+
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+ list_for_each_entry(cur, &svm->ir_list, node) {
+ if (cur->data != pi->ir_data)
+ continue;
+ list_del(&cur->node);
+ kfree(cur);
+ break;
+ }
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+ int ret = 0;
+ unsigned long flags;
+ struct amd_svm_iommu_ir *ir;
+ u64 entry;
+
+ /**
+ * In some cases, the existing irte is updated and re-set,
+ * so we need to check here if it's already been * added
+ * to the ir_list.
+ */
+ if (pi->ir_data && (pi->prev_ga_tag != 0)) {
+ struct kvm *kvm = svm->vcpu.kvm;
+ u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
+ struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+ struct vcpu_svm *prev_svm;
+
+ if (!prev_vcpu) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ prev_svm = to_svm(prev_vcpu);
+ svm_ir_list_del(prev_svm, pi);
+ }
+
+ /**
+ * Allocating new amd_iommu_pi_data, which will get
+ * add to the per-vcpu ir_list.
+ */
+ ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
+ if (!ir) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ir->data = pi->ir_data;
+
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+
+ /*
+ * Update the target pCPU for IOMMU doorbells if the vCPU is running.
+ * If the vCPU is NOT running, i.e. is blocking or scheduled out, KVM
+ * will update the pCPU info when the vCPU awkened and/or scheduled in.
+ * See also avic_vcpu_load().
+ */
+ entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
+ amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK,
+ true, pi->ir_data);
+
+ list_add(&ir->node, &svm->ir_list);
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+out:
+ return ret;
+}
+
+/*
+ * Note:
+ * The HW cannot support posting multicast/broadcast
+ * interrupts to a vCPU. So, we still use legacy interrupt
+ * remapping for these kind of interrupts.
+ *
+ * For lowest-priority interrupts, we only support
+ * those with single CPU as the destination, e.g. user
+ * configures the interrupts via /proc/irq or uses
+ * irqbalance to make the interrupts single-CPU.
+ */
+static int
+get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
+ struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
+{
+ struct kvm_lapic_irq irq;
+ struct kvm_vcpu *vcpu = NULL;
+
+ kvm_set_msi_irq(kvm, e, &irq);
+
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
+ !kvm_irq_is_postable(&irq)) {
+ pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
+ __func__, irq.vector);
+ return -1;
+ }
+
+ pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
+ irq.vector);
+ *svm = to_svm(vcpu);
+ vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
+ vcpu_info->vector = irq.vector;
+
+ return 0;
+}
+
+/*
+ * avic_pi_update_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ struct kvm_kernel_irq_routing_entry *e;
+ struct kvm_irq_routing_table *irq_rt;
+ int idx, ret = 0;
+
+ if (!kvm_arch_has_assigned_device(kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return 0;
+
+ pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
+ __func__, host_irq, guest_irq, set);
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+
+ if (guest_irq >= irq_rt->nr_rt_entries ||
+ hlist_empty(&irq_rt->map[guest_irq])) {
+ pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
+ guest_irq, irq_rt->nr_rt_entries);
+ goto out;
+ }
+
+ hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+ struct vcpu_data vcpu_info;
+ struct vcpu_svm *svm = NULL;
+
+ if (e->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+
+ /**
+ * Here, we setup with legacy mode in the following cases:
+ * 1. When cannot target interrupt to a specific vcpu.
+ * 2. Unsetting posted interrupt.
+ * 3. APIC virtualization is disabled for the vcpu.
+ * 4. IRQ has incompatible delivery mode (SMI, INIT, etc)
+ */
+ if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
+ kvm_vcpu_apicv_active(&svm->vcpu)) {
+ struct amd_iommu_pi_data pi;
+
+ /* Try to enable guest_mode in IRTE */
+ pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
+ AVIC_HPA_MASK);
+ pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
+ svm->vcpu.vcpu_id);
+ pi.is_guest_mode = true;
+ pi.vcpu_data = &vcpu_info;
+ ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+ /**
+ * Here, we successfully setting up vcpu affinity in
+ * IOMMU guest mode. Now, we need to store the posted
+ * interrupt information in a per-vcpu ir_list so that
+ * we can reference to them directly when we update vcpu
+ * scheduling information in IOMMU irte.
+ */
+ if (!ret && pi.is_guest_mode)
+ svm_ir_list_add(svm, &pi);
+ } else {
+ /* Use legacy mode in IRTE */
+ struct amd_iommu_pi_data pi;
+
+ /**
+ * Here, pi is used to:
+ * - Tell IOMMU to use legacy mode for this interrupt.
+ * - Retrieve ga_tag of prior interrupt remapping data.
+ */
+ pi.prev_ga_tag = 0;
+ pi.is_guest_mode = false;
+ ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+ /**
+ * Check if the posted interrupt was previously
+ * setup with the guest_mode by checking if the ga_tag
+ * was cached. If so, we need to clean up the per-vcpu
+ * ir_list.
+ */
+ if (!ret && pi.prev_ga_tag) {
+ int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
+ struct kvm_vcpu *vcpu;
+
+ vcpu = kvm_get_vcpu_by_id(kvm, id);
+ if (vcpu)
+ svm_ir_list_del(to_svm(vcpu), &pi);
+ }
+ }
+
+ if (!ret && svm) {
+ trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
+ e->gsi, vcpu_info.vector,
+ vcpu_info.pi_desc_addr, set);
+ }
+
+ if (ret < 0) {
+ pr_err("%s: failed to update PI IRTE\n", __func__);
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+ return ret;
+}
+
+bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
+{
+ ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
+ BIT(APICV_INHIBIT_REASON_ABSENT) |
+ BIT(APICV_INHIBIT_REASON_HYPERV) |
+ BIT(APICV_INHIBIT_REASON_NESTED) |
+ BIT(APICV_INHIBIT_REASON_IRQWIN) |
+ BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
+ BIT(APICV_INHIBIT_REASON_SEV) |
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
+
+ return supported & BIT(reason);
+}
+
+
+static inline int
+avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
+{
+ int ret = 0;
+ struct amd_svm_iommu_ir *ir;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ lockdep_assert_held(&svm->ir_list_lock);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm))
+ return 0;
+
+ /*
+ * Here, we go through the per-vcpu ir_list to update all existing
+ * interrupt remapping table entry targeting this vcpu.
+ */
+ if (list_empty(&svm->ir_list))
+ return 0;
+
+ list_for_each_entry(ir, &svm->ir_list, node) {
+ ret = amd_iommu_update_ga(cpu, r, ir->data);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ u64 entry;
+ int h_physical_id = kvm_cpu_get_apicid(cpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long flags;
+
+ lockdep_assert_preemption_disabled();
+
+ if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
+ return;
+
+ /*
+ * No need to update anything if the vCPU is blocking, i.e. if the vCPU
+ * is being scheduled in after being preempted. The CPU entries in the
+ * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
+ * If the vCPU was migrated, its new CPU value will be stuffed when the
+ * vCPU unblocks.
+ */
+ if (kvm_vcpu_is_blocking(vcpu))
+ return;
+
+ /*
+ * Grab the per-vCPU interrupt remapping lock even if the VM doesn't
+ * _currently_ have assigned devices, as that can change. Holding
+ * ir_list_lock ensures that either svm_ir_list_add() will consume
+ * up-to-date entry information, or that this task will wait until
+ * svm_ir_list_add() completes to set the new target pCPU.
+ */
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+
+ entry = READ_ONCE(*(svm->avic_physical_id_cache));
+
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+ entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
+ entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+ WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+ avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
+
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+void avic_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ u64 entry;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long flags;
+
+ lockdep_assert_preemption_disabled();
+
+ /*
+ * Note, reading the Physical ID entry outside of ir_list_lock is safe
+ * as only the pCPU that has loaded (or is loading) the vCPU is allowed
+ * to modify the entry, and preemption is disabled. I.e. the vCPU
+ * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
+ * recursively.
+ */
+ entry = READ_ONCE(*(svm->avic_physical_id_cache));
+
+ /* Nothing to do if IsRunning == '0' due to vCPU blocking. */
+ if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
+ return;
+
+ /*
+ * Take and hold the per-vCPU interrupt remapping lock while updating
+ * the Physical ID entry even though the lock doesn't protect against
+ * multiple writers (see above). Holding ir_list_lock ensures that
+ * either svm_ir_list_add() will consume up-to-date entry information,
+ * or that this task will wait until svm_ir_list_add() completes to
+ * mark the vCPU as not running.
+ */
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+
+ avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+
+}
+
+void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ if (!lapic_in_kernel(vcpu) || avic_mode == AVIC_MODE_NONE)
+ return;
+
+ if (!enable_apicv)
+ return;
+
+ if (kvm_vcpu_apicv_active(vcpu)) {
+ /**
+ * During AVIC temporary deactivation, guest could update
+ * APIC ID, DFR and LDR registers, which would not be trapped
+ * by avic_unaccelerated_access_interception(). In this case,
+ * we need to check and update the AVIC logical APIC ID table
+ * accordingly before re-activating.
+ */
+ avic_apicv_post_state_restore(vcpu);
+ avic_activate_vmcb(svm);
+ } else {
+ avic_deactivate_vmcb(svm);
+ }
+ vmcb_mark_dirty(vmcb, VMCB_AVIC);
+}
+
+void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+ bool activated = kvm_vcpu_apicv_active(vcpu);
+
+ if (!enable_apicv)
+ return;
+
+ avic_refresh_virtual_apic_mode(vcpu);
+
+ if (activated)
+ avic_vcpu_load(vcpu, vcpu->cpu);
+ else
+ avic_vcpu_put(vcpu);
+
+ avic_set_pi_irte_mode(vcpu, activated);
+}
+
+void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ /*
+ * Unload the AVIC when the vCPU is about to block, _before_
+ * the vCPU actually blocks.
+ *
+ * Any IRQs that arrive before IsRunning=0 will not cause an
+ * incomplete IPI vmexit on the source, therefore vIRR will also
+ * be checked by kvm_vcpu_check_block() before blocking. The
+ * memory barrier implicit in set_current_state orders writing
+ * IsRunning=0 before reading the vIRR. The processor needs a
+ * matching memory barrier on interrupt delivery between writing
+ * IRR and reading IsRunning; the lack of this barrier might be
+ * the cause of errata #1235).
+ */
+ avic_vcpu_put(vcpu);
+}
+
+void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ avic_vcpu_load(vcpu, vcpu->cpu);
+}
+
+/*
+ * Note:
+ * - The module param avic enable both xAPIC and x2APIC mode.
+ * - Hypervisor can support both xAVIC and x2AVIC in the same guest.
+ * - The mode can be switched at run-time.
+ */
+bool avic_hardware_setup(struct kvm_x86_ops *x86_ops)
+{
+ if (!npt_enabled)
+ return false;
+
+ if (boot_cpu_has(X86_FEATURE_AVIC)) {
+ avic_mode = AVIC_MODE_X1;
+ pr_info("AVIC enabled\n");
+ } else if (force_avic) {
+ /*
+ * Some older systems does not advertise AVIC support.
+ * See Revision Guide for specific AMD processor for more detail.
+ */
+ avic_mode = AVIC_MODE_X1;
+ pr_warn("AVIC is not supported in CPUID but force enabled");
+ pr_warn("Your system might crash and burn");
+ }
+
+ /* AVIC is a prerequisite for x2AVIC. */
+ if (boot_cpu_has(X86_FEATURE_X2AVIC)) {
+ if (avic_mode == AVIC_MODE_X1) {
+ avic_mode = AVIC_MODE_X2;
+ pr_info("x2AVIC enabled\n");
+ } else {
+ pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled");
+ pr_warn(FW_BUG "Try enable AVIC using force_avic option");
+ }
+ }
+
+ if (avic_mode != AVIC_MODE_NONE)
+ amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+
+ return !!avic_mode;
+}
diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h
new file mode 100644
index 000000000..c59544cdf
--- /dev/null
+++ b/arch/x86/kvm/svm/hyperv.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Common Hyper-V on KVM and KVM on Hyper-V definitions (SVM).
+ */
+
+#ifndef __ARCH_X86_KVM_SVM_HYPERV_H__
+#define __ARCH_X86_KVM_SVM_HYPERV_H__
+
+#include <asm/mshyperv.h>
+
+#include "../hyperv.h"
+
+#endif /* __ARCH_X86_KVM_SVM_HYPERV_H__ */
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
new file mode 100644
index 000000000..5d4d78c9a
--- /dev/null
+++ b/arch/x86/kvm/svm/nested.c
@@ -0,0 +1,1716 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ */
+
+#define pr_fmt(fmt) "SVM: " fmt
+
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+#include <linux/kernel.h>
+
+#include <asm/msr-index.h>
+#include <asm/debugreg.h>
+
+#include "kvm_emulate.h"
+#include "trace.h"
+#include "mmu.h"
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "svm.h"
+#include "hyperv.h"
+
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
+
+static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (vmcb->control.exit_code != SVM_EXIT_NPF) {
+ /*
+ * TODO: track the cause of the nested page fault, and
+ * correctly fill in the high bits of exit_info_1.
+ */
+ vmcb->control.exit_code = SVM_EXIT_NPF;
+ vmcb->control.exit_code_hi = 0;
+ vmcb->control.exit_info_1 = (1ULL << 32);
+ vmcb->control.exit_info_2 = fault->address;
+ }
+
+ vmcb->control.exit_info_1 &= ~0xffffffffULL;
+ vmcb->control.exit_info_1 |= fault->error_code;
+
+ nested_svm_vmexit(svm);
+}
+
+static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 cr3 = svm->nested.ctl.nested_cr3;
+ u64 pdpte;
+ int ret;
+
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
+ offset_in_page(cr3) + index * 8, 8);
+ if (ret)
+ return 0;
+ return pdpte;
+}
+
+static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return svm->nested.ctl.nested_cr3;
+}
+
+static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ WARN_ON(mmu_is_nested(vcpu));
+
+ vcpu->arch.mmu = &vcpu->arch.guest_mmu;
+
+ /*
+ * The NPT format depends on L1's CR4 and EFER, which is in vmcb01. Note,
+ * when called via KVM_SET_NESTED_STATE, that state may _not_ match current
+ * vCPU state. CR0.WP is explicitly ignored, while CR0.PG is required.
+ */
+ kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4,
+ svm->vmcb01.ptr->save.efer,
+ svm->nested.ctl.nested_cr3);
+ vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
+ vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
+ vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
+ vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
+}
+
+static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.mmu = &vcpu->arch.root_mmu;
+ vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
+}
+
+static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
+{
+ if (!svm->v_vmload_vmsave_enabled)
+ return true;
+
+ if (!nested_npt_enabled(svm))
+ return true;
+
+ if (!(svm->nested.ctl.virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK))
+ return true;
+
+ return false;
+}
+
+void recalc_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *c, *h;
+ struct vmcb_ctrl_area_cached *g;
+ unsigned int i;
+
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ if (!is_guest_mode(&svm->vcpu))
+ return;
+
+ c = &svm->vmcb->control;
+ h = &svm->vmcb01.ptr->control;
+ g = &svm->nested.ctl;
+
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ c->intercepts[i] = h->intercepts[i];
+
+ if (g->int_ctl & V_INTR_MASKING_MASK) {
+ /* We only want the cr8 intercept bits of L1 */
+ vmcb_clr_intercept(c, INTERCEPT_CR8_READ);
+ vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
+
+ /*
+ * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
+ * affect any interrupt we may want to inject; therefore,
+ * interrupt window vmexits are irrelevant to L0.
+ */
+ vmcb_clr_intercept(c, INTERCEPT_VINTR);
+ }
+
+ /* We don't want to see VMMCALLs from a nested guest */
+ vmcb_clr_intercept(c, INTERCEPT_VMMCALL);
+
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ c->intercepts[i] |= g->intercepts[i];
+
+ /* If SMI is not intercepted, ignore guest SMI intercept as well */
+ if (!intercept_smi)
+ vmcb_clr_intercept(c, INTERCEPT_SMI);
+
+ if (nested_vmcb_needs_vls_intercept(svm)) {
+ /*
+ * If the virtual VMLOAD/VMSAVE is not enabled for the L2,
+ * we must intercept these instructions to correctly
+ * emulate them in case L1 doesn't intercept them.
+ */
+ vmcb_set_intercept(c, INTERCEPT_VMLOAD);
+ vmcb_set_intercept(c, INTERCEPT_VMSAVE);
+ } else {
+ WARN_ON(!(c->virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK));
+ }
+}
+
+/*
+ * Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function
+ * is optimized in that it only merges the parts where KVM MSR permission bitmap
+ * may contain zero bits.
+ */
+static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
+{
+ struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
+ int i;
+
+ /*
+ * MSR bitmap update can be skipped when:
+ * - MSR bitmap for L1 hasn't changed.
+ * - Nested hypervisor (L1) is attempting to launch the same L2 as
+ * before.
+ * - Nested hypervisor (L1) is using Hyper-V emulation interface and
+ * tells KVM (L0) there were no changes in MSR bitmap for L2.
+ */
+ if (!svm->nested.force_msr_bitmap_recalc &&
+ kvm_hv_hypercall_enabled(&svm->vcpu) &&
+ hve->hv_enlightenments_control.msr_bitmap &&
+ (svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS)))
+ goto set_msrpm_base_pa;
+
+ if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
+ return true;
+
+ for (i = 0; i < MSRPM_OFFSETS; i++) {
+ u32 value, p;
+ u64 offset;
+
+ if (msrpm_offsets[i] == 0xffffffff)
+ break;
+
+ p = msrpm_offsets[i];
+
+ /* x2apic msrs are intercepted always for the nested guest */
+ if (is_x2apic_msrpm_offset(p))
+ continue;
+
+ offset = svm->nested.ctl.msrpm_base_pa + (p * 4);
+
+ if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
+ return false;
+
+ svm->nested.msrpm[p] = svm->msrpm[p] | value;
+ }
+
+ svm->nested.force_msr_bitmap_recalc = false;
+
+set_msrpm_base_pa:
+ svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
+
+ return true;
+}
+
+/*
+ * Bits 11:0 of bitmap address are ignored by hardware
+ */
+static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size)
+{
+ u64 addr = PAGE_ALIGN(pa);
+
+ return kvm_vcpu_is_legal_gpa(vcpu, addr) &&
+ kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1);
+}
+
+static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
+ struct vmcb_ctrl_area_cached *control)
+{
+ if (CC(!vmcb12_is_intercept(control, INTERCEPT_VMRUN)))
+ return false;
+
+ if (CC(control->asid == 0))
+ return false;
+
+ if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled))
+ return false;
+
+ if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa,
+ MSRPM_SIZE)))
+ return false;
+ if (CC(!nested_svm_check_bitmap_pa(vcpu, control->iopm_base_pa,
+ IOPM_SIZE)))
+ return false;
+
+
+ return true;
+}
+
+/* Common checks that apply to both L1 and L2 state. */
+static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu,
+ struct vmcb_save_area_cached *save)
+{
+ if (CC(!(save->efer & EFER_SVME)))
+ return false;
+
+ if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) ||
+ CC(save->cr0 & ~0xffffffffULL))
+ return false;
+
+ if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7)))
+ return false;
+
+ /*
+ * These checks are also performed by KVM_SET_SREGS,
+ * except that EFER.LMA is not checked by SVM against
+ * CR0.PG && EFER.LME.
+ */
+ if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) {
+ if (CC(!(save->cr4 & X86_CR4_PAE)) ||
+ CC(!(save->cr0 & X86_CR0_PE)) ||
+ CC(kvm_vcpu_is_illegal_gpa(vcpu, save->cr3)))
+ return false;
+ }
+
+ /* Note, SVM doesn't have any additional restrictions on CR4. */
+ if (CC(!__kvm_is_valid_cr4(vcpu, save->cr4)))
+ return false;
+
+ if (CC(!kvm_valid_efer(vcpu, save->efer)))
+ return false;
+
+ return true;
+}
+
+static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_save_area_cached *save = &svm->nested.save;
+
+ return __nested_vmcb_check_save(vcpu, save);
+}
+
+static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_ctrl_area_cached *ctl = &svm->nested.ctl;
+
+ return __nested_vmcb_check_controls(vcpu, ctl);
+}
+
+static
+void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
+ struct vmcb_ctrl_area_cached *to,
+ struct vmcb_control_area *from)
+{
+ unsigned int i;
+
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ to->intercepts[i] = from->intercepts[i];
+
+ to->iopm_base_pa = from->iopm_base_pa;
+ to->msrpm_base_pa = from->msrpm_base_pa;
+ to->tsc_offset = from->tsc_offset;
+ to->tlb_ctl = from->tlb_ctl;
+ to->int_ctl = from->int_ctl;
+ to->int_vector = from->int_vector;
+ to->int_state = from->int_state;
+ to->exit_code = from->exit_code;
+ to->exit_code_hi = from->exit_code_hi;
+ to->exit_info_1 = from->exit_info_1;
+ to->exit_info_2 = from->exit_info_2;
+ to->exit_int_info = from->exit_int_info;
+ to->exit_int_info_err = from->exit_int_info_err;
+ to->nested_ctl = from->nested_ctl;
+ to->event_inj = from->event_inj;
+ to->event_inj_err = from->event_inj_err;
+ to->next_rip = from->next_rip;
+ to->nested_cr3 = from->nested_cr3;
+ to->virt_ext = from->virt_ext;
+ to->pause_filter_count = from->pause_filter_count;
+ to->pause_filter_thresh = from->pause_filter_thresh;
+
+ /* Copy asid here because nested_vmcb_check_controls will check it. */
+ to->asid = from->asid;
+ to->msrpm_base_pa &= ~0x0fffULL;
+ to->iopm_base_pa &= ~0x0fffULL;
+
+ /* Hyper-V extensions (Enlightened VMCB) */
+ if (kvm_hv_hypercall_enabled(vcpu)) {
+ to->clean = from->clean;
+ memcpy(&to->hv_enlightenments, &from->hv_enlightenments,
+ sizeof(to->hv_enlightenments));
+ }
+}
+
+void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
+ struct vmcb_control_area *control)
+{
+ __nested_copy_vmcb_control_to_cache(&svm->vcpu, &svm->nested.ctl, control);
+}
+
+static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to,
+ struct vmcb_save_area *from)
+{
+ /*
+ * Copy only fields that are validated, as we need them
+ * to avoid TOC/TOU races.
+ */
+ to->efer = from->efer;
+ to->cr0 = from->cr0;
+ to->cr3 = from->cr3;
+ to->cr4 = from->cr4;
+
+ to->dr6 = from->dr6;
+ to->dr7 = from->dr7;
+}
+
+void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
+ struct vmcb_save_area *save)
+{
+ __nested_copy_vmcb_save_to_cache(&svm->nested.save, save);
+}
+
+/*
+ * Synchronize fields that are written by the processor, so that
+ * they can be copied back into the vmcb12.
+ */
+void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
+{
+ u32 mask;
+ svm->nested.ctl.event_inj = svm->vmcb->control.event_inj;
+ svm->nested.ctl.event_inj_err = svm->vmcb->control.event_inj_err;
+
+ /* Only a few fields of int_ctl are written by the processor. */
+ mask = V_IRQ_MASK | V_TPR_MASK;
+ if (!(svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) &&
+ svm_is_intercept(svm, INTERCEPT_VINTR)) {
+ /*
+ * In order to request an interrupt window, L0 is usurping
+ * svm->vmcb->control.int_ctl and possibly setting V_IRQ
+ * even if it was clear in L1's VMCB. Restoring it would be
+ * wrong. However, in this case V_IRQ will remain true until
+ * interrupt_window_interception calls svm_clear_vintr and
+ * restores int_ctl. We can just leave it aside.
+ */
+ mask &= ~V_IRQ_MASK;
+ }
+
+ if (nested_vgif_enabled(svm))
+ mask |= V_GIF_MASK;
+
+ svm->nested.ctl.int_ctl &= ~mask;
+ svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask;
+}
+
+/*
+ * Transfer any event that L0 or L1 wanted to inject into L2 to
+ * EXIT_INT_INFO.
+ */
+static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
+ struct vmcb *vmcb12)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u32 exit_int_info = 0;
+ unsigned int nr;
+
+ if (vcpu->arch.exception.injected) {
+ nr = vcpu->arch.exception.vector;
+ exit_int_info = nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT;
+
+ if (vcpu->arch.exception.has_error_code) {
+ exit_int_info |= SVM_EVTINJ_VALID_ERR;
+ vmcb12->control.exit_int_info_err =
+ vcpu->arch.exception.error_code;
+ }
+
+ } else if (vcpu->arch.nmi_injected) {
+ exit_int_info = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
+
+ } else if (vcpu->arch.interrupt.injected) {
+ nr = vcpu->arch.interrupt.nr;
+ exit_int_info = nr | SVM_EVTINJ_VALID;
+
+ if (vcpu->arch.interrupt.soft)
+ exit_int_info |= SVM_EVTINJ_TYPE_SOFT;
+ else
+ exit_int_info |= SVM_EVTINJ_TYPE_INTR;
+ }
+
+ vmcb12->control.exit_int_info = exit_int_info;
+}
+
+static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ /*
+ * TODO: optimize unconditional TLB flush/MMU sync. A partial list of
+ * things to fix before this can be conditional:
+ *
+ * - Flush TLBs for both L1 and L2 remote TLB flush
+ * - Honor L1's request to flush an ASID on nested VMRUN
+ * - Sync nested NPT MMU on VMRUN that flushes L2's ASID[*]
+ * - Don't crush a pending TLB flush in vmcb02 on nested VMRUN
+ * - Flush L1's ASID on KVM_REQ_TLB_FLUSH_GUEST
+ *
+ * [*] Unlike nested EPT, SVM's ASID management can invalidate nested
+ * NPT guest-physical mappings on VMRUN.
+ */
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+}
+
+/*
+ * Load guest's/host's cr3 on nested vmentry or vmexit. @nested_npt is true
+ * if we are emulating VM-Entry into a guest with NPT enabled.
+ */
+static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
+ bool nested_npt, bool reload_pdptrs)
+{
+ if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3)))
+ return -EINVAL;
+
+ if (reload_pdptrs && !nested_npt && is_pae_paging(vcpu) &&
+ CC(!load_pdptrs(vcpu, cr3)))
+ return -EINVAL;
+
+ vcpu->arch.cr3 = cr3;
+
+ /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
+ kvm_init_mmu(vcpu);
+
+ if (!nested_npt)
+ kvm_mmu_new_pgd(vcpu, cr3);
+
+ return 0;
+}
+
+void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
+{
+ if (!svm->nested.vmcb02.ptr)
+ return;
+
+ /* FIXME: merge g_pat from vmcb01 and vmcb12. */
+ svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat;
+}
+
+static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+{
+ bool new_vmcb12 = false;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+
+ nested_vmcb02_compute_g_pat(svm);
+
+ /* Load the nested guest state */
+ if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
+ new_vmcb12 = true;
+ svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa;
+ svm->nested.force_msr_bitmap_recalc = true;
+ }
+
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
+ vmcb02->save.es = vmcb12->save.es;
+ vmcb02->save.cs = vmcb12->save.cs;
+ vmcb02->save.ss = vmcb12->save.ss;
+ vmcb02->save.ds = vmcb12->save.ds;
+ vmcb02->save.cpl = vmcb12->save.cpl;
+ vmcb_mark_dirty(vmcb02, VMCB_SEG);
+ }
+
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
+ vmcb02->save.gdtr = vmcb12->save.gdtr;
+ vmcb02->save.idtr = vmcb12->save.idtr;
+ vmcb_mark_dirty(vmcb02, VMCB_DT);
+ }
+
+ kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
+
+ svm_set_efer(&svm->vcpu, svm->nested.save.efer);
+
+ svm_set_cr0(&svm->vcpu, svm->nested.save.cr0);
+ svm_set_cr4(&svm->vcpu, svm->nested.save.cr4);
+
+ svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+
+ kvm_rax_write(&svm->vcpu, vmcb12->save.rax);
+ kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp);
+ kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
+
+ /* In case we don't even reach vcpu_run, the fields are not updated */
+ vmcb02->save.rax = vmcb12->save.rax;
+ vmcb02->save.rsp = vmcb12->save.rsp;
+ vmcb02->save.rip = vmcb12->save.rip;
+
+ /* These bits will be set properly on the first execution when new_vmc12 is true */
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
+ vmcb02->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1;
+ svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW;
+ vmcb_mark_dirty(vmcb02, VMCB_DR);
+ }
+
+ if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+ /*
+ * Reserved bits of DEBUGCTL are ignored. Be consistent with
+ * svm_set_msr's definition of reserved bits.
+ */
+ svm_copy_lbrs(vmcb02, vmcb12);
+ vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS;
+ svm_update_lbrv(&svm->vcpu);
+
+ } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
+ svm_copy_lbrs(vmcb02, vmcb01);
+ }
+}
+
+static inline bool is_evtinj_soft(u32 evtinj)
+{
+ u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;
+ u8 vector = evtinj & SVM_EVTINJ_VEC_MASK;
+
+ if (!(evtinj & SVM_EVTINJ_VALID))
+ return false;
+
+ if (type == SVM_EVTINJ_TYPE_SOFT)
+ return true;
+
+ return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector);
+}
+
+static bool is_evtinj_nmi(u32 evtinj)
+{
+ u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;
+
+ if (!(evtinj & SVM_EVTINJ_VALID))
+ return false;
+
+ return type == SVM_EVTINJ_TYPE_NMI;
+}
+
+static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
+ unsigned long vmcb12_rip,
+ unsigned long vmcb12_csbase)
+{
+ u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK;
+ u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
+
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+ u32 pause_count12;
+ u32 pause_thresh12;
+
+ /*
+ * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
+ * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
+ */
+
+ if (svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
+ int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
+ else
+ int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
+
+ /* Copied from vmcb01. msrpm_base can be overwritten later. */
+ vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
+ vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
+ vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
+
+ /* Done at vmrun: asid. */
+
+ /* Also overwritten later if necessary. */
+ vmcb02->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
+ /* nested_cr3. */
+ if (nested_npt_enabled(svm))
+ nested_svm_init_mmu_context(vcpu);
+
+ vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
+ vcpu->arch.l1_tsc_offset,
+ svm->nested.ctl.tsc_offset,
+ svm->tsc_ratio_msr);
+
+ vmcb02->control.tsc_offset = vcpu->arch.tsc_offset;
+
+ if (svm->tsc_scaling_enabled &&
+ svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio)
+ nested_svm_update_tsc_ratio_msr(vcpu);
+
+ vmcb02->control.int_ctl =
+ (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) |
+ (vmcb01->control.int_ctl & int_ctl_vmcb01_bits);
+
+ vmcb02->control.int_vector = svm->nested.ctl.int_vector;
+ vmcb02->control.int_state = svm->nested.ctl.int_state;
+ vmcb02->control.event_inj = svm->nested.ctl.event_inj;
+ vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err;
+
+ /*
+ * next_rip is consumed on VMRUN as the return address pushed on the
+ * stack for injected soft exceptions/interrupts. If nrips is exposed
+ * to L1, take it verbatim from vmcb12. If nrips is supported in
+ * hardware but not exposed to L1, stuff the actual L2 RIP to emulate
+ * what a nrips=0 CPU would do (L1 is responsible for advancing RIP
+ * prior to injecting the event).
+ */
+ if (svm->nrips_enabled)
+ vmcb02->control.next_rip = svm->nested.ctl.next_rip;
+ else if (boot_cpu_has(X86_FEATURE_NRIPS))
+ vmcb02->control.next_rip = vmcb12_rip;
+
+ svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj);
+ if (is_evtinj_soft(vmcb02->control.event_inj)) {
+ svm->soft_int_injected = true;
+ svm->soft_int_csbase = vmcb12_csbase;
+ svm->soft_int_old_rip = vmcb12_rip;
+ if (svm->nrips_enabled)
+ svm->soft_int_next_rip = svm->nested.ctl.next_rip;
+ else
+ svm->soft_int_next_rip = vmcb12_rip;
+ }
+
+ vmcb02->control.virt_ext = vmcb01->control.virt_ext &
+ LBR_CTL_ENABLE_MASK;
+ if (svm->lbrv_enabled)
+ vmcb02->control.virt_ext |=
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK);
+
+ if (!nested_vmcb_needs_vls_intercept(svm))
+ vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+
+ pause_count12 = svm->pause_filter_enabled ? svm->nested.ctl.pause_filter_count : 0;
+ pause_thresh12 = svm->pause_threshold_enabled ? svm->nested.ctl.pause_filter_thresh : 0;
+ if (kvm_pause_in_guest(svm->vcpu.kvm)) {
+ /* use guest values since host doesn't intercept PAUSE */
+ vmcb02->control.pause_filter_count = pause_count12;
+ vmcb02->control.pause_filter_thresh = pause_thresh12;
+
+ } else {
+ /* start from host values otherwise */
+ vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count;
+ vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh;
+
+ /* ... but ensure filtering is disabled if so requested. */
+ if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
+ if (!pause_count12)
+ vmcb02->control.pause_filter_count = 0;
+ if (!pause_thresh12)
+ vmcb02->control.pause_filter_thresh = 0;
+ }
+ }
+
+ nested_svm_transition_tlb_flush(vcpu);
+
+ /* Enter Guest-Mode */
+ enter_guest_mode(vcpu);
+
+ /*
+ * Merge guest and host intercepts - must be called with vcpu in
+ * guest-mode to take effect.
+ */
+ recalc_intercepts(svm);
+}
+
+static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+{
+ /*
+ * Some VMCB state is shared between L1 and L2 and thus has to be
+ * moved at the time of nested vmrun and vmexit.
+ *
+ * VMLOAD/VMSAVE state would also belong in this category, but KVM
+ * always performs VMLOAD and VMSAVE from the VMCB01.
+ */
+ to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl;
+}
+
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
+ struct vmcb *vmcb12, bool from_vmrun)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ trace_kvm_nested_vmenter(svm->vmcb->save.rip,
+ vmcb12_gpa,
+ vmcb12->save.rip,
+ vmcb12->control.int_ctl,
+ vmcb12->control.event_inj,
+ vmcb12->control.nested_ctl,
+ vmcb12->control.nested_cr3,
+ vmcb12->save.cr3,
+ KVM_ISA_SVM);
+
+ trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
+ vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
+ vmcb12->control.intercepts[INTERCEPT_EXCEPTION],
+ vmcb12->control.intercepts[INTERCEPT_WORD3],
+ vmcb12->control.intercepts[INTERCEPT_WORD4],
+ vmcb12->control.intercepts[INTERCEPT_WORD5]);
+
+
+ svm->nested.vmcb12_gpa = vmcb12_gpa;
+
+ WARN_ON(svm->vmcb == svm->nested.vmcb02.ptr);
+
+ nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr);
+
+ svm_switch_vmcb(svm, &svm->nested.vmcb02);
+ nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base);
+ nested_vmcb02_prepare_save(svm, vmcb12);
+
+ ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3,
+ nested_npt_enabled(svm), from_vmrun);
+ if (ret)
+ return ret;
+
+ if (!from_vmrun)
+ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+
+ svm_set_gif(svm, true);
+
+ if (kvm_vcpu_apicv_active(vcpu))
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+
+ return 0;
+}
+
+int nested_svm_vmrun(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+ struct vmcb *vmcb12;
+ struct kvm_host_map map;
+ u64 vmcb12_gpa;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+
+ if (!svm->nested.hsave_msr) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ if (is_smm(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ vmcb12_gpa = svm->vmcb->save.rax;
+ ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map);
+ if (ret == -EINVAL) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ } else if (ret) {
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ ret = kvm_skip_emulated_instruction(vcpu);
+
+ vmcb12 = map.hva;
+
+ if (WARN_ON_ONCE(!svm->nested.initialized))
+ return -EINVAL;
+
+ nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
+ nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
+
+ if (!nested_vmcb_check_save(vcpu) ||
+ !nested_vmcb_check_controls(vcpu)) {
+ vmcb12->control.exit_code = SVM_EXIT_ERR;
+ vmcb12->control.exit_code_hi = 0;
+ vmcb12->control.exit_info_1 = 0;
+ vmcb12->control.exit_info_2 = 0;
+ goto out;
+ }
+
+ /*
+ * Since vmcb01 is not in use, we can use it to store some of the L1
+ * state.
+ */
+ vmcb01->save.efer = vcpu->arch.efer;
+ vmcb01->save.cr0 = kvm_read_cr0(vcpu);
+ vmcb01->save.cr4 = vcpu->arch.cr4;
+ vmcb01->save.rflags = kvm_get_rflags(vcpu);
+ vmcb01->save.rip = kvm_rip_read(vcpu);
+
+ if (!npt_enabled)
+ vmcb01->save.cr3 = kvm_read_cr3(vcpu);
+
+ svm->nested.nested_run_pending = 1;
+
+ if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true))
+ goto out_exit_err;
+
+ if (nested_svm_vmrun_msrpm(svm))
+ goto out;
+
+out_exit_err:
+ svm->nested.nested_run_pending = 0;
+ svm->nmi_l1_to_l2 = false;
+ svm->soft_int_injected = false;
+
+ svm->vmcb->control.exit_code = SVM_EXIT_ERR;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+
+ nested_svm_vmexit(svm);
+
+out:
+ kvm_vcpu_unmap(vcpu, &map, true);
+
+ return ret;
+}
+
+/* Copy state save area fields which are handled by VMRUN */
+void svm_copy_vmrun_state(struct vmcb_save_area *to_save,
+ struct vmcb_save_area *from_save)
+{
+ to_save->es = from_save->es;
+ to_save->cs = from_save->cs;
+ to_save->ss = from_save->ss;
+ to_save->ds = from_save->ds;
+ to_save->gdtr = from_save->gdtr;
+ to_save->idtr = from_save->idtr;
+ to_save->rflags = from_save->rflags | X86_EFLAGS_FIXED;
+ to_save->efer = from_save->efer;
+ to_save->cr0 = from_save->cr0;
+ to_save->cr3 = from_save->cr3;
+ to_save->cr4 = from_save->cr4;
+ to_save->rax = from_save->rax;
+ to_save->rsp = from_save->rsp;
+ to_save->rip = from_save->rip;
+ to_save->cpl = 0;
+}
+
+void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
+{
+ to_vmcb->save.fs = from_vmcb->save.fs;
+ to_vmcb->save.gs = from_vmcb->save.gs;
+ to_vmcb->save.tr = from_vmcb->save.tr;
+ to_vmcb->save.ldtr = from_vmcb->save.ldtr;
+ to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
+ to_vmcb->save.star = from_vmcb->save.star;
+ to_vmcb->save.lstar = from_vmcb->save.lstar;
+ to_vmcb->save.cstar = from_vmcb->save.cstar;
+ to_vmcb->save.sfmask = from_vmcb->save.sfmask;
+ to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
+ to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
+ to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
+}
+
+int nested_svm_vmexit(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+ struct vmcb *vmcb12;
+ struct kvm_host_map map;
+ int rc;
+
+ rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
+ if (rc) {
+ if (rc == -EINVAL)
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ vmcb12 = map.hva;
+
+ /* Exit Guest-Mode */
+ leave_guest_mode(vcpu);
+ svm->nested.vmcb12_gpa = 0;
+ WARN_ON_ONCE(svm->nested.nested_run_pending);
+
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+
+ /* in case we halted in L2 */
+ svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+ /* Give the current vmcb to the guest */
+
+ vmcb12->save.es = vmcb02->save.es;
+ vmcb12->save.cs = vmcb02->save.cs;
+ vmcb12->save.ss = vmcb02->save.ss;
+ vmcb12->save.ds = vmcb02->save.ds;
+ vmcb12->save.gdtr = vmcb02->save.gdtr;
+ vmcb12->save.idtr = vmcb02->save.idtr;
+ vmcb12->save.efer = svm->vcpu.arch.efer;
+ vmcb12->save.cr0 = kvm_read_cr0(vcpu);
+ vmcb12->save.cr3 = kvm_read_cr3(vcpu);
+ vmcb12->save.cr2 = vmcb02->save.cr2;
+ vmcb12->save.cr4 = svm->vcpu.arch.cr4;
+ vmcb12->save.rflags = kvm_get_rflags(vcpu);
+ vmcb12->save.rip = kvm_rip_read(vcpu);
+ vmcb12->save.rsp = kvm_rsp_read(vcpu);
+ vmcb12->save.rax = kvm_rax_read(vcpu);
+ vmcb12->save.dr7 = vmcb02->save.dr7;
+ vmcb12->save.dr6 = svm->vcpu.arch.dr6;
+ vmcb12->save.cpl = vmcb02->save.cpl;
+
+ vmcb12->control.int_state = vmcb02->control.int_state;
+ vmcb12->control.exit_code = vmcb02->control.exit_code;
+ vmcb12->control.exit_code_hi = vmcb02->control.exit_code_hi;
+ vmcb12->control.exit_info_1 = vmcb02->control.exit_info_1;
+ vmcb12->control.exit_info_2 = vmcb02->control.exit_info_2;
+
+ if (vmcb12->control.exit_code != SVM_EXIT_ERR)
+ nested_save_pending_event_to_vmcb12(svm, vmcb12);
+
+ if (svm->nrips_enabled)
+ vmcb12->control.next_rip = vmcb02->control.next_rip;
+
+ vmcb12->control.int_ctl = svm->nested.ctl.int_ctl;
+ vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl;
+ vmcb12->control.event_inj = svm->nested.ctl.event_inj;
+ vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;
+
+ if (!kvm_pause_in_guest(vcpu->kvm)) {
+ vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count;
+ vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
+
+ }
+
+ nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
+
+ svm_switch_vmcb(svm, &svm->vmcb01);
+
+ if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+ svm_copy_lbrs(vmcb12, vmcb02);
+ svm_update_lbrv(vcpu);
+ } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
+ svm_copy_lbrs(vmcb01, vmcb02);
+ svm_update_lbrv(vcpu);
+ }
+
+ /*
+ * On vmexit the GIF is set to false and
+ * no event can be injected in L1.
+ */
+ svm_set_gif(svm, false);
+ vmcb01->control.exit_int_info = 0;
+
+ svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset;
+ if (vmcb01->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
+ vmcb01->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+ vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
+ }
+
+ if (kvm_caps.has_tsc_control &&
+ vcpu->arch.tsc_scaling_ratio != vcpu->arch.l1_tsc_scaling_ratio) {
+ vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
+ __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
+ }
+
+ svm->nested.ctl.nested_cr3 = 0;
+
+ /*
+ * Restore processor state that had been saved in vmcb01
+ */
+ kvm_set_rflags(vcpu, vmcb01->save.rflags);
+ svm_set_efer(vcpu, vmcb01->save.efer);
+ svm_set_cr0(vcpu, vmcb01->save.cr0 | X86_CR0_PE);
+ svm_set_cr4(vcpu, vmcb01->save.cr4);
+ kvm_rax_write(vcpu, vmcb01->save.rax);
+ kvm_rsp_write(vcpu, vmcb01->save.rsp);
+ kvm_rip_write(vcpu, vmcb01->save.rip);
+
+ svm->vcpu.arch.dr7 = DR7_FIXED_1;
+ kvm_update_dr7(&svm->vcpu);
+
+ trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
+ vmcb12->control.exit_info_1,
+ vmcb12->control.exit_info_2,
+ vmcb12->control.exit_int_info,
+ vmcb12->control.exit_int_info_err,
+ KVM_ISA_SVM);
+
+ kvm_vcpu_unmap(vcpu, &map, true);
+
+ nested_svm_transition_tlb_flush(vcpu);
+
+ nested_svm_uninit_mmu_context(vcpu);
+
+ rc = nested_svm_load_cr3(vcpu, vmcb01->save.cr3, false, true);
+ if (rc)
+ return 1;
+
+ /*
+ * Drop what we picked up for L2 via svm_complete_interrupts() so it
+ * doesn't end up in L1.
+ */
+ svm->vcpu.arch.nmi_injected = false;
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
+
+ /*
+ * If we are here following the completion of a VMRUN that
+ * is being single-stepped, queue the pending #DB intercept
+ * right now so that it an be accounted for before we execute
+ * L1's next instruction.
+ */
+ if (unlikely(vmcb01->save.rflags & X86_EFLAGS_TF))
+ kvm_queue_exception(&(svm->vcpu), DB_VECTOR);
+
+ /*
+ * Un-inhibit the AVIC right away, so that other vCPUs can start
+ * to benefit from it right away.
+ */
+ if (kvm_apicv_activated(vcpu->kvm))
+ kvm_vcpu_update_apicv(vcpu);
+
+ return 0;
+}
+
+static void nested_svm_triple_fault(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SHUTDOWN))
+ return;
+
+ kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN);
+}
+
+int svm_allocate_nested(struct vcpu_svm *svm)
+{
+ struct page *vmcb02_page;
+
+ if (svm->nested.initialized)
+ return 0;
+
+ vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!vmcb02_page)
+ return -ENOMEM;
+ svm->nested.vmcb02.ptr = page_address(vmcb02_page);
+ svm->nested.vmcb02.pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);
+
+ svm->nested.msrpm = svm_vcpu_alloc_msrpm();
+ if (!svm->nested.msrpm)
+ goto err_free_vmcb02;
+ svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm);
+
+ svm->nested.initialized = true;
+ return 0;
+
+err_free_vmcb02:
+ __free_page(vmcb02_page);
+ return -ENOMEM;
+}
+
+void svm_free_nested(struct vcpu_svm *svm)
+{
+ if (!svm->nested.initialized)
+ return;
+
+ if (WARN_ON_ONCE(svm->vmcb != svm->vmcb01.ptr))
+ svm_switch_vmcb(svm, &svm->vmcb01);
+
+ svm_vcpu_free_msrpm(svm->nested.msrpm);
+ svm->nested.msrpm = NULL;
+
+ __free_page(virt_to_page(svm->nested.vmcb02.ptr));
+ svm->nested.vmcb02.ptr = NULL;
+
+ /*
+ * When last_vmcb12_gpa matches the current vmcb12 gpa,
+ * some vmcb12 fields are not loaded if they are marked clean
+ * in the vmcb12, since in this case they are up to date already.
+ *
+ * When the vmcb02 is freed, this optimization becomes invalid.
+ */
+ svm->nested.last_vmcb12_gpa = INVALID_GPA;
+
+ svm->nested.initialized = false;
+}
+
+void svm_leave_nested(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (is_guest_mode(vcpu)) {
+ svm->nested.nested_run_pending = 0;
+ svm->nested.vmcb12_gpa = INVALID_GPA;
+
+ leave_guest_mode(vcpu);
+
+ svm_switch_vmcb(svm, &svm->vmcb01);
+
+ nested_svm_uninit_mmu_context(vcpu);
+ vmcb_mark_all_dirty(svm->vmcb);
+
+ if (kvm_apicv_activated(vcpu->kvm))
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ }
+
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+}
+
+static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
+{
+ u32 offset, msr, value;
+ int write, mask;
+
+ if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
+ return NESTED_EXIT_HOST;
+
+ msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ offset = svm_msrpm_offset(msr);
+ write = svm->vmcb->control.exit_info_1 & 1;
+ mask = 1 << ((2 * (msr & 0xf)) + write);
+
+ if (offset == MSR_INVALID)
+ return NESTED_EXIT_DONE;
+
+ /* Offset is in 32 bit units but need in 8 bit units */
+ offset *= 4;
+
+ if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset, &value, 4))
+ return NESTED_EXIT_DONE;
+
+ return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+}
+
+static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
+{
+ unsigned port, size, iopm_len;
+ u16 val, mask;
+ u8 start_bit;
+ u64 gpa;
+
+ if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT)))
+ return NESTED_EXIT_HOST;
+
+ port = svm->vmcb->control.exit_info_1 >> 16;
+ size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
+ SVM_IOIO_SIZE_SHIFT;
+ gpa = svm->nested.ctl.iopm_base_pa + (port / 8);
+ start_bit = port % 8;
+ iopm_len = (start_bit + size > 8) ? 2 : 1;
+ mask = (0xf >> (4 - size)) << start_bit;
+ val = 0;
+
+ if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
+ return NESTED_EXIT_DONE;
+
+ return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+}
+
+static int nested_svm_intercept(struct vcpu_svm *svm)
+{
+ u32 exit_code = svm->vmcb->control.exit_code;
+ int vmexit = NESTED_EXIT_HOST;
+
+ switch (exit_code) {
+ case SVM_EXIT_MSR:
+ vmexit = nested_svm_exit_handled_msr(svm);
+ break;
+ case SVM_EXIT_IOIO:
+ vmexit = nested_svm_intercept_ioio(svm);
+ break;
+ case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
+ if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
+ if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
+ /*
+ * Host-intercepted exceptions have been checked already in
+ * nested_svm_exit_special. There is nothing to do here,
+ * the vmexit is injected by svm_check_nested_events.
+ */
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ case SVM_EXIT_ERR: {
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ default: {
+ if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
+ vmexit = NESTED_EXIT_DONE;
+ }
+ }
+
+ return vmexit;
+}
+
+int nested_svm_exit_handled(struct vcpu_svm *svm)
+{
+ int vmexit;
+
+ vmexit = nested_svm_intercept(svm);
+
+ if (vmexit == NESTED_EXIT_DONE)
+ nested_svm_vmexit(svm);
+
+ return vmexit;
+}
+
+int nested_svm_check_permissions(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->arch.efer & EFER_SVME) || !is_paging(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (to_svm(vcpu)->vmcb->save.cpl) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ return 0;
+}
+
+static bool nested_svm_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
+ u32 error_code)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(vector));
+}
+
+static void nested_svm_inject_exception_vmexit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+
+ vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + ex->vector;
+ vmcb->control.exit_code_hi = 0;
+
+ if (ex->has_error_code)
+ vmcb->control.exit_info_1 = ex->error_code;
+
+ /*
+ * EXITINFO2 is undefined for all exception intercepts other
+ * than #PF.
+ */
+ if (ex->vector == PF_VECTOR) {
+ if (ex->has_payload)
+ vmcb->control.exit_info_2 = ex->payload;
+ else
+ vmcb->control.exit_info_2 = vcpu->arch.cr2;
+ } else if (ex->vector == DB_VECTOR) {
+ /* See kvm_check_and_inject_events(). */
+ kvm_deliver_exception_payload(vcpu, ex);
+
+ if (vcpu->arch.dr7 & DR7_GD) {
+ vcpu->arch.dr7 &= ~DR7_GD;
+ kvm_update_dr7(vcpu);
+ }
+ } else {
+ WARN_ON(ex->has_payload);
+ }
+
+ nested_svm_vmexit(svm);
+}
+
+static inline bool nested_exit_on_init(struct vcpu_svm *svm)
+{
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
+}
+
+static int svm_check_nested_events(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ /*
+ * Only a pending nested run blocks a pending exception. If there is a
+ * previously injected event, the pending exception occurred while said
+ * event was being delivered and thus needs to be handled.
+ */
+ bool block_nested_exceptions = svm->nested.nested_run_pending;
+ /*
+ * New events (not exceptions) are only recognized at instruction
+ * boundaries. If an event needs reinjection, then KVM is handling a
+ * VM-Exit that occurred _during_ instruction execution; new events are
+ * blocked until the instruction completes.
+ */
+ bool block_nested_events = block_nested_exceptions ||
+ kvm_event_needs_reinjection(vcpu);
+
+ if (lapic_in_kernel(vcpu) &&
+ test_bit(KVM_APIC_INIT, &apic->pending_events)) {
+ if (block_nested_events)
+ return -EBUSY;
+ if (!nested_exit_on_init(svm))
+ return 0;
+ nested_svm_simple_vmexit(svm, SVM_EXIT_INIT);
+ return 0;
+ }
+
+ if (vcpu->arch.exception_vmexit.pending) {
+ if (block_nested_exceptions)
+ return -EBUSY;
+ nested_svm_inject_exception_vmexit(vcpu);
+ return 0;
+ }
+
+ if (vcpu->arch.exception.pending) {
+ if (block_nested_exceptions)
+ return -EBUSY;
+ return 0;
+ }
+
+ if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ if (!nested_exit_on_smi(svm))
+ return 0;
+ nested_svm_simple_vmexit(svm, SVM_EXIT_SMI);
+ return 0;
+ }
+
+ if (vcpu->arch.nmi_pending && !svm_nmi_blocked(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ if (!nested_exit_on_nmi(svm))
+ return 0;
+ nested_svm_simple_vmexit(svm, SVM_EXIT_NMI);
+ return 0;
+ }
+
+ if (kvm_cpu_has_interrupt(vcpu) && !svm_interrupt_blocked(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ if (!nested_exit_on_intr(svm))
+ return 0;
+ trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
+ nested_svm_simple_vmexit(svm, SVM_EXIT_INTR);
+ return 0;
+ }
+
+ return 0;
+}
+
+int nested_svm_exit_special(struct vcpu_svm *svm)
+{
+ u32 exit_code = svm->vmcb->control.exit_code;
+
+ switch (exit_code) {
+ case SVM_EXIT_INTR:
+ case SVM_EXIT_NMI:
+ case SVM_EXIT_NPF:
+ return NESTED_EXIT_HOST;
+ case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
+ u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
+
+ if (svm->vmcb01.ptr->control.intercepts[INTERCEPT_EXCEPTION] &
+ excp_bits)
+ return NESTED_EXIT_HOST;
+ else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
+ svm->vcpu.arch.apf.host_apf_flags)
+ /* Trap async PF even if not shadowing */
+ return NESTED_EXIT_HOST;
+ break;
+ }
+ default:
+ break;
+ }
+
+ return NESTED_EXIT_CONTINUE;
+}
+
+void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ vcpu->arch.tsc_scaling_ratio =
+ kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio,
+ svm->tsc_ratio_msr);
+ __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
+}
+
+/* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */
+static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
+ struct vmcb_ctrl_area_cached *from)
+{
+ unsigned int i;
+
+ memset(dst, 0, sizeof(struct vmcb_control_area));
+
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ dst->intercepts[i] = from->intercepts[i];
+
+ dst->iopm_base_pa = from->iopm_base_pa;
+ dst->msrpm_base_pa = from->msrpm_base_pa;
+ dst->tsc_offset = from->tsc_offset;
+ dst->asid = from->asid;
+ dst->tlb_ctl = from->tlb_ctl;
+ dst->int_ctl = from->int_ctl;
+ dst->int_vector = from->int_vector;
+ dst->int_state = from->int_state;
+ dst->exit_code = from->exit_code;
+ dst->exit_code_hi = from->exit_code_hi;
+ dst->exit_info_1 = from->exit_info_1;
+ dst->exit_info_2 = from->exit_info_2;
+ dst->exit_int_info = from->exit_int_info;
+ dst->exit_int_info_err = from->exit_int_info_err;
+ dst->nested_ctl = from->nested_ctl;
+ dst->event_inj = from->event_inj;
+ dst->event_inj_err = from->event_inj_err;
+ dst->next_rip = from->next_rip;
+ dst->nested_cr3 = from->nested_cr3;
+ dst->virt_ext = from->virt_ext;
+ dst->pause_filter_count = from->pause_filter_count;
+ dst->pause_filter_thresh = from->pause_filter_thresh;
+ /* 'clean' and 'hv_enlightenments' are not changed by KVM */
+}
+
+static int svm_get_nested_state(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ u32 user_data_size)
+{
+ struct vcpu_svm *svm;
+ struct vmcb_control_area *ctl;
+ unsigned long r;
+ struct kvm_nested_state kvm_state = {
+ .flags = 0,
+ .format = KVM_STATE_NESTED_FORMAT_SVM,
+ .size = sizeof(kvm_state),
+ };
+ struct vmcb __user *user_vmcb = (struct vmcb __user *)
+ &user_kvm_nested_state->data.svm[0];
+
+ if (!vcpu)
+ return kvm_state.size + KVM_STATE_NESTED_SVM_VMCB_SIZE;
+
+ svm = to_svm(vcpu);
+
+ if (user_data_size < kvm_state.size)
+ goto out;
+
+ /* First fill in the header and copy it out. */
+ if (is_guest_mode(vcpu)) {
+ kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb12_gpa;
+ kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE;
+ kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
+
+ if (svm->nested.nested_run_pending)
+ kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
+ }
+
+ if (gif_set(svm))
+ kvm_state.flags |= KVM_STATE_NESTED_GIF_SET;
+
+ if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
+ return -EFAULT;
+
+ if (!is_guest_mode(vcpu))
+ goto out;
+
+ /*
+ * Copy over the full size of the VMCB rather than just the size
+ * of the structs.
+ */
+ if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE))
+ return -EFAULT;
+
+ ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);
+ if (!ctl)
+ return -ENOMEM;
+
+ nested_copy_vmcb_cache_to_control(ctl, &svm->nested.ctl);
+ r = copy_to_user(&user_vmcb->control, ctl,
+ sizeof(user_vmcb->control));
+ kfree(ctl);
+ if (r)
+ return -EFAULT;
+
+ if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save,
+ sizeof(user_vmcb->save)))
+ return -EFAULT;
+out:
+ return kvm_state.size;
+}
+
+static int svm_set_nested_state(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ struct kvm_nested_state *kvm_state)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb __user *user_vmcb = (struct vmcb __user *)
+ &user_kvm_nested_state->data.svm[0];
+ struct vmcb_control_area *ctl;
+ struct vmcb_save_area *save;
+ struct vmcb_save_area_cached save_cached;
+ struct vmcb_ctrl_area_cached ctl_cached;
+ unsigned long cr0;
+ int ret;
+
+ BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) >
+ KVM_STATE_NESTED_SVM_VMCB_SIZE);
+
+ if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM)
+ return -EINVAL;
+
+ if (kvm_state->flags & ~(KVM_STATE_NESTED_GUEST_MODE |
+ KVM_STATE_NESTED_RUN_PENDING |
+ KVM_STATE_NESTED_GIF_SET))
+ return -EINVAL;
+
+ /*
+ * If in guest mode, vcpu->arch.efer actually refers to the L2 guest's
+ * EFER.SVME, but EFER.SVME still has to be 1 for VMRUN to succeed.
+ */
+ if (!(vcpu->arch.efer & EFER_SVME)) {
+ /* GIF=1 and no guest mode are required if SVME=0. */
+ if (kvm_state->flags != KVM_STATE_NESTED_GIF_SET)
+ return -EINVAL;
+ }
+
+ /* SMM temporarily disables SVM, so we cannot be in guest mode. */
+ if (is_smm(vcpu) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
+ return -EINVAL;
+
+ if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) {
+ svm_leave_nested(vcpu);
+ svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
+ return 0;
+ }
+
+ if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa))
+ return -EINVAL;
+ if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE)
+ return -EINVAL;
+
+ ret = -ENOMEM;
+ ctl = kzalloc(sizeof(*ctl), GFP_KERNEL_ACCOUNT);
+ save = kzalloc(sizeof(*save), GFP_KERNEL_ACCOUNT);
+ if (!ctl || !save)
+ goto out_free;
+
+ ret = -EFAULT;
+ if (copy_from_user(ctl, &user_vmcb->control, sizeof(*ctl)))
+ goto out_free;
+ if (copy_from_user(save, &user_vmcb->save, sizeof(*save)))
+ goto out_free;
+
+ ret = -EINVAL;
+ __nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl);
+ if (!__nested_vmcb_check_controls(vcpu, &ctl_cached))
+ goto out_free;
+
+ /*
+ * Processor state contains L2 state. Check that it is
+ * valid for guest mode (see nested_vmcb_check_save).
+ */
+ cr0 = kvm_read_cr0(vcpu);
+ if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
+ goto out_free;
+
+ /*
+ * Validate host state saved from before VMRUN (see
+ * nested_svm_check_permissions).
+ */
+ __nested_copy_vmcb_save_to_cache(&save_cached, save);
+ if (!(save->cr0 & X86_CR0_PG) ||
+ !(save->cr0 & X86_CR0_PE) ||
+ (save->rflags & X86_EFLAGS_VM) ||
+ !__nested_vmcb_check_save(vcpu, &save_cached))
+ goto out_free;
+
+
+ /*
+ * All checks done, we can enter guest mode. Userspace provides
+ * vmcb12.control, which will be combined with L1 and stored into
+ * vmcb02, and the L1 save state which we store in vmcb01.
+ * L2 registers if needed are moved from the current VMCB to VMCB02.
+ */
+
+ if (is_guest_mode(vcpu))
+ svm_leave_nested(vcpu);
+ else
+ svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
+
+ svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
+
+ svm->nested.nested_run_pending =
+ !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
+
+ svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
+
+ svm_copy_vmrun_state(&svm->vmcb01.ptr->save, save);
+ nested_copy_vmcb_control_to_cache(svm, ctl);
+
+ svm_switch_vmcb(svm, &svm->nested.vmcb02);
+ nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base);
+
+ /*
+ * While the nested guest CR3 is already checked and set by
+ * KVM_SET_SREGS, it was set when nested state was yet loaded,
+ * thus MMU might not be initialized correctly.
+ * Set it again to fix this.
+ */
+
+ ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
+ nested_npt_enabled(svm), false);
+ if (WARN_ON_ONCE(ret))
+ goto out_free;
+
+ svm->nested.force_msr_bitmap_recalc = true;
+
+ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+ ret = 0;
+out_free:
+ kfree(save);
+ kfree(ctl);
+
+ return ret;
+}
+
+static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (WARN_ON(!is_guest_mode(vcpu)))
+ return true;
+
+ if (!vcpu->arch.pdptrs_from_userspace &&
+ !nested_npt_enabled(svm) && is_pae_paging(vcpu))
+ /*
+ * Reload the guest's PDPTRs since after a migration
+ * the guest CR3 might be restored prior to setting the nested
+ * state which can lead to a load of wrong PDPTRs.
+ */
+ if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
+ return false;
+
+ if (!nested_svm_vmrun_msrpm(svm)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return false;
+ }
+
+ return true;
+}
+
+struct kvm_x86_nested_ops svm_nested_ops = {
+ .leave_nested = svm_leave_nested,
+ .is_exception_vmexit = nested_svm_is_exception_vmexit,
+ .check_events = svm_check_nested_events,
+ .triple_fault = nested_svm_triple_fault,
+ .get_nested_state_pages = svm_get_nested_state_pages,
+ .get_state = svm_get_nested_state,
+ .set_state = svm_set_nested_state,
+};
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
new file mode 100644
index 000000000..1cb2bf980
--- /dev/null
+++ b/arch/x86/kvm/svm/pmu.c
@@ -0,0 +1,232 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM PMU support for AMD
+ *
+ * Copyright 2015, Red Hat, Inc. and/or its affiliates.
+ *
+ * Author:
+ * Wei Huang <wei@redhat.com>
+ *
+ * Implementation is based on pmu_intel.c file
+ */
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "pmu.h"
+#include "svm.h"
+
+enum pmu_type {
+ PMU_TYPE_COUNTER = 0,
+ PMU_TYPE_EVNTSEL,
+};
+
+static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
+{
+ unsigned int num_counters = pmu->nr_arch_gp_counters;
+
+ if (pmc_idx >= num_counters)
+ return NULL;
+
+ return &pmu->gp_counters[array_index_nospec(pmc_idx, num_counters)];
+}
+
+static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
+ enum pmu_type type)
+{
+ struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
+ unsigned int idx;
+
+ if (!vcpu->kvm->arch.enable_pmu)
+ return NULL;
+
+ switch (msr) {
+ case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE))
+ return NULL;
+ /*
+ * Each PMU counter has a pair of CTL and CTR MSRs. CTLn
+ * MSRs (accessed via EVNTSEL) are even, CTRn MSRs are odd.
+ */
+ idx = (unsigned int)((msr - MSR_F15H_PERF_CTL0) / 2);
+ if (!(msr & 0x1) != (type == PMU_TYPE_EVNTSEL))
+ return NULL;
+ break;
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+ if (type != PMU_TYPE_EVNTSEL)
+ return NULL;
+ idx = msr - MSR_K7_EVNTSEL0;
+ break;
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+ if (type != PMU_TYPE_COUNTER)
+ return NULL;
+ idx = msr - MSR_K7_PERFCTR0;
+ break;
+ default:
+ return NULL;
+ }
+
+ return amd_pmc_idx_to_pmc(pmu, idx);
+}
+
+static bool amd_hw_event_available(struct kvm_pmc *pmc)
+{
+ return true;
+}
+
+/* check if a PMC is enabled by comparing it against global_ctrl bits. Because
+ * AMD CPU doesn't have global_ctrl MSR, all PMCs are enabled (return TRUE).
+ */
+static bool amd_pmc_is_enabled(struct kvm_pmc *pmc)
+{
+ return true;
+}
+
+static bool amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ idx &= ~(3u << 30);
+
+ return idx < pmu->nr_arch_gp_counters;
+}
+
+/* idx is the ECX register of RDPMC instruction */
+static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
+ unsigned int idx, u64 *mask)
+{
+ return amd_pmc_idx_to_pmc(vcpu_to_pmu(vcpu), idx & ~(3u << 30));
+}
+
+static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ /* All MSRs refer to exactly one PMC, so msr_idx_to_pmc is enough. */
+ return false;
+}
+
+static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
+ pmc = pmc ? pmc : get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+
+ return pmc;
+}
+
+static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u32 msr = msr_info->index;
+
+ /* MSR_PERFCTRn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
+ if (pmc) {
+ msr_info->data = pmc_read_counter(pmc);
+ return 0;
+ }
+ /* MSR_EVNTSELn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+ if (pmc) {
+ msr_info->data = pmc->eventsel;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+
+ /* MSR_PERFCTRn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
+ if (pmc) {
+ pmc_write_counter(pmc, data);
+ pmc_update_sample_period(pmc);
+ return 0;
+ }
+ /* MSR_EVNTSELn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+ if (pmc) {
+ data &= ~pmu->reserved_bits;
+ if (data != pmc->eventsel) {
+ pmc->eventsel = data;
+ reprogram_counter(pmc);
+ }
+ return 0;
+ }
+
+ return 1;
+}
+
+static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE))
+ pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;
+ else
+ pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
+
+ pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
+ pmu->reserved_bits = 0xfffffff000280000ull;
+ pmu->raw_event_mask = AMD64_RAW_EVENT_MASK;
+ pmu->version = 1;
+ /* not applicable to AMD; but clean them to prevent any fall out */
+ pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+ pmu->nr_arch_fixed_counters = 0;
+ pmu->global_status = 0;
+ bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters);
+}
+
+static void amd_pmu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int i;
+
+ BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > AMD64_NUM_COUNTERS_CORE);
+ BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > INTEL_PMC_MAX_GENERIC);
+
+ for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC ; i++) {
+ pmu->gp_counters[i].type = KVM_PMC_GP;
+ pmu->gp_counters[i].vcpu = vcpu;
+ pmu->gp_counters[i].idx = i;
+ pmu->gp_counters[i].current_config = 0;
+ }
+}
+
+static void amd_pmu_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int i;
+
+ for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC; i++) {
+ struct kvm_pmc *pmc = &pmu->gp_counters[i];
+
+ pmc_stop_counter(pmc);
+ pmc->counter = pmc->eventsel = 0;
+ }
+}
+
+struct kvm_pmu_ops amd_pmu_ops __initdata = {
+ .hw_event_available = amd_hw_event_available,
+ .pmc_is_enabled = amd_pmc_is_enabled,
+ .pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
+ .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc,
+ .msr_idx_to_pmc = amd_msr_idx_to_pmc,
+ .is_valid_rdpmc_ecx = amd_is_valid_rdpmc_ecx,
+ .is_valid_msr = amd_is_valid_msr,
+ .get_msr = amd_pmu_get_msr,
+ .set_msr = amd_pmu_set_msr,
+ .refresh = amd_pmu_refresh,
+ .init = amd_pmu_init,
+ .reset = amd_pmu_reset,
+};
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
new file mode 100644
index 000000000..3060fe4e9
--- /dev/null
+++ b/arch/x86/kvm/svm/sev.c
@@ -0,0 +1,3076 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM-SEV support
+ *
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ */
+
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+#include <linux/kernel.h>
+#include <linux/highmem.h>
+#include <linux/psp-sev.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/misc_cgroup.h>
+#include <linux/processor.h>
+#include <linux/trace_events.h>
+
+#include <asm/pkru.h>
+#include <asm/trapnr.h>
+#include <asm/fpu/xcr.h>
+
+#include "mmu.h"
+#include "x86.h"
+#include "svm.h"
+#include "svm_ops.h"
+#include "cpuid.h"
+#include "trace.h"
+
+#ifndef CONFIG_KVM_AMD_SEV
+/*
+ * When this config is not defined, SEV feature is not supported and APIs in
+ * this file are not used but this file still gets compiled into the KVM AMD
+ * module.
+ *
+ * We will not have MISC_CG_RES_SEV and MISC_CG_RES_SEV_ES entries in the enum
+ * misc_res_type {} defined in linux/misc_cgroup.h.
+ *
+ * Below macros allow compilation to succeed.
+ */
+#define MISC_CG_RES_SEV MISC_CG_RES_TYPES
+#define MISC_CG_RES_SEV_ES MISC_CG_RES_TYPES
+#endif
+
+#ifdef CONFIG_KVM_AMD_SEV
+/* enable/disable SEV support */
+static bool sev_enabled = true;
+module_param_named(sev, sev_enabled, bool, 0444);
+
+/* enable/disable SEV-ES support */
+static bool sev_es_enabled = true;
+module_param_named(sev_es, sev_es_enabled, bool, 0444);
+#else
+#define sev_enabled false
+#define sev_es_enabled false
+#endif /* CONFIG_KVM_AMD_SEV */
+
+static u8 sev_enc_bit;
+static DECLARE_RWSEM(sev_deactivate_lock);
+static DEFINE_MUTEX(sev_bitmap_lock);
+unsigned int max_sev_asid;
+static unsigned int min_sev_asid;
+static unsigned long sev_me_mask;
+static unsigned int nr_asids;
+static unsigned long *sev_asid_bitmap;
+static unsigned long *sev_reclaim_asid_bitmap;
+
+struct enc_region {
+ struct list_head list;
+ unsigned long npages;
+ struct page **pages;
+ unsigned long uaddr;
+ unsigned long size;
+};
+
+/* Called with the sev_bitmap_lock held, or on shutdown */
+static int sev_flush_asids(int min_asid, int max_asid)
+{
+ int ret, asid, error = 0;
+
+ /* Check if there are any ASIDs to reclaim before performing a flush */
+ asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid);
+ if (asid > max_asid)
+ return -EBUSY;
+
+ /*
+ * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail,
+ * so it must be guarded.
+ */
+ down_write(&sev_deactivate_lock);
+
+ wbinvd_on_all_cpus();
+ ret = sev_guest_df_flush(&error);
+
+ up_write(&sev_deactivate_lock);
+
+ if (ret)
+ pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error);
+
+ return ret;
+}
+
+static inline bool is_mirroring_enc_context(struct kvm *kvm)
+{
+ return !!to_kvm_svm(kvm)->sev_info.enc_context_owner;
+}
+
+/* Must be called with the sev_bitmap_lock held */
+static bool __sev_recycle_asids(int min_asid, int max_asid)
+{
+ if (sev_flush_asids(min_asid, max_asid))
+ return false;
+
+ /* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */
+ bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap,
+ nr_asids);
+ bitmap_zero(sev_reclaim_asid_bitmap, nr_asids);
+
+ return true;
+}
+
+static int sev_misc_cg_try_charge(struct kvm_sev_info *sev)
+{
+ enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+ return misc_cg_try_charge(type, sev->misc_cg, 1);
+}
+
+static void sev_misc_cg_uncharge(struct kvm_sev_info *sev)
+{
+ enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+ misc_cg_uncharge(type, sev->misc_cg, 1);
+}
+
+static int sev_asid_new(struct kvm_sev_info *sev)
+{
+ int asid, min_asid, max_asid, ret;
+ bool retry = true;
+
+ WARN_ON(sev->misc_cg);
+ sev->misc_cg = get_current_misc_cg();
+ ret = sev_misc_cg_try_charge(sev);
+ if (ret) {
+ put_misc_cg(sev->misc_cg);
+ sev->misc_cg = NULL;
+ return ret;
+ }
+
+ mutex_lock(&sev_bitmap_lock);
+
+ /*
+ * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid.
+ * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1.
+ */
+ min_asid = sev->es_active ? 1 : min_sev_asid;
+ max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid;
+again:
+ asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid);
+ if (asid > max_asid) {
+ if (retry && __sev_recycle_asids(min_asid, max_asid)) {
+ retry = false;
+ goto again;
+ }
+ mutex_unlock(&sev_bitmap_lock);
+ ret = -EBUSY;
+ goto e_uncharge;
+ }
+
+ __set_bit(asid, sev_asid_bitmap);
+
+ mutex_unlock(&sev_bitmap_lock);
+
+ return asid;
+e_uncharge:
+ sev_misc_cg_uncharge(sev);
+ put_misc_cg(sev->misc_cg);
+ sev->misc_cg = NULL;
+ return ret;
+}
+
+static int sev_get_asid(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return sev->asid;
+}
+
+static void sev_asid_free(struct kvm_sev_info *sev)
+{
+ struct svm_cpu_data *sd;
+ int cpu;
+
+ mutex_lock(&sev_bitmap_lock);
+
+ __set_bit(sev->asid, sev_reclaim_asid_bitmap);
+
+ for_each_possible_cpu(cpu) {
+ sd = per_cpu_ptr(&svm_data, cpu);
+ sd->sev_vmcbs[sev->asid] = NULL;
+ }
+
+ mutex_unlock(&sev_bitmap_lock);
+
+ sev_misc_cg_uncharge(sev);
+ put_misc_cg(sev->misc_cg);
+ sev->misc_cg = NULL;
+}
+
+static void sev_decommission(unsigned int handle)
+{
+ struct sev_data_decommission decommission;
+
+ if (!handle)
+ return;
+
+ decommission.handle = handle;
+ sev_guest_decommission(&decommission, NULL);
+}
+
+static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
+{
+ struct sev_data_deactivate deactivate;
+
+ if (!handle)
+ return;
+
+ deactivate.handle = handle;
+
+ /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
+ down_read(&sev_deactivate_lock);
+ sev_guest_deactivate(&deactivate, NULL);
+ up_read(&sev_deactivate_lock);
+
+ sev_decommission(handle);
+}
+
+static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ int asid, ret;
+
+ if (kvm->created_vcpus)
+ return -EINVAL;
+
+ ret = -EBUSY;
+ if (unlikely(sev->active))
+ return ret;
+
+ sev->active = true;
+ sev->es_active = argp->id == KVM_SEV_ES_INIT;
+ asid = sev_asid_new(sev);
+ if (asid < 0)
+ goto e_no_asid;
+ sev->asid = asid;
+
+ ret = sev_platform_init(&argp->error);
+ if (ret)
+ goto e_free;
+
+ INIT_LIST_HEAD(&sev->regions_list);
+ INIT_LIST_HEAD(&sev->mirror_vms);
+
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV);
+
+ return 0;
+
+e_free:
+ sev_asid_free(sev);
+ sev->asid = 0;
+e_no_asid:
+ sev->es_active = false;
+ sev->active = false;
+ return ret;
+}
+
+static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
+{
+ struct sev_data_activate activate;
+ int asid = sev_get_asid(kvm);
+ int ret;
+
+ /* activate ASID on the given handle */
+ activate.handle = handle;
+ activate.asid = asid;
+ ret = sev_guest_activate(&activate, error);
+
+ return ret;
+}
+
+static int __sev_issue_cmd(int fd, int id, void *data, int *error)
+{
+ struct fd f;
+ int ret;
+
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ ret = sev_issue_cmd_external_user(f.file, id, data, error);
+
+ fdput(f);
+ return ret;
+}
+
+static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return __sev_issue_cmd(sev->fd, id, data, error);
+}
+
+static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_launch_start start;
+ struct kvm_sev_launch_start params;
+ void *dh_blob, *session_blob;
+ int *error = &argp->error;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ memset(&start, 0, sizeof(start));
+
+ dh_blob = NULL;
+ if (params.dh_uaddr) {
+ dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
+ if (IS_ERR(dh_blob))
+ return PTR_ERR(dh_blob);
+
+ start.dh_cert_address = __sme_set(__pa(dh_blob));
+ start.dh_cert_len = params.dh_len;
+ }
+
+ session_blob = NULL;
+ if (params.session_uaddr) {
+ session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len);
+ if (IS_ERR(session_blob)) {
+ ret = PTR_ERR(session_blob);
+ goto e_free_dh;
+ }
+
+ start.session_address = __sme_set(__pa(session_blob));
+ start.session_len = params.session_len;
+ }
+
+ start.handle = params.handle;
+ start.policy = params.policy;
+
+ /* create memory encryption context */
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, &start, error);
+ if (ret)
+ goto e_free_session;
+
+ /* Bind ASID to this guest */
+ ret = sev_bind_asid(kvm, start.handle, error);
+ if (ret) {
+ sev_decommission(start.handle);
+ goto e_free_session;
+ }
+
+ /* return handle to userspace */
+ params.handle = start.handle;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
+ sev_unbind_asid(kvm, start.handle);
+ ret = -EFAULT;
+ goto e_free_session;
+ }
+
+ sev->handle = start.handle;
+ sev->fd = argp->sev_fd;
+
+e_free_session:
+ kfree(session_blob);
+e_free_dh:
+ kfree(dh_blob);
+ return ret;
+}
+
+static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
+ unsigned long ulen, unsigned long *n,
+ int write)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ unsigned long npages, size;
+ int npinned;
+ unsigned long locked, lock_limit;
+ struct page **pages;
+ unsigned long first, last;
+ int ret;
+
+ lockdep_assert_held(&kvm->lock);
+
+ if (ulen == 0 || uaddr + ulen < uaddr)
+ return ERR_PTR(-EINVAL);
+
+ /* Calculate number of pages. */
+ first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
+ last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
+ npages = (last - first + 1);
+
+ locked = sev->pages_locked + npages;
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+ pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ if (WARN_ON_ONCE(npages > INT_MAX))
+ return ERR_PTR(-EINVAL);
+
+ /* Avoid using vmalloc for smaller buffers. */
+ size = npages * sizeof(struct page *);
+ if (size > PAGE_SIZE)
+ pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ else
+ pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
+
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ /* Pin the user virtual address. */
+ npinned = pin_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
+ if (npinned != npages) {
+ pr_err("SEV: Failure locking %lu pages.\n", npages);
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ *n = npages;
+ sev->pages_locked = locked;
+
+ return pages;
+
+err:
+ if (npinned > 0)
+ unpin_user_pages(pages, npinned);
+
+ kvfree(pages);
+ return ERR_PTR(ret);
+}
+
+static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
+ unsigned long npages)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ unpin_user_pages(pages, npages);
+ kvfree(pages);
+ sev->pages_locked -= npages;
+}
+
+static void sev_clflush_pages(struct page *pages[], unsigned long npages)
+{
+ uint8_t *page_virtual;
+ unsigned long i;
+
+ if (this_cpu_has(X86_FEATURE_SME_COHERENT) || npages == 0 ||
+ pages == NULL)
+ return;
+
+ for (i = 0; i < npages; i++) {
+ page_virtual = kmap_atomic(pages[i]);
+ clflush_cache_range(page_virtual, PAGE_SIZE);
+ kunmap_atomic(page_virtual);
+ cond_resched();
+ }
+}
+
+static unsigned long get_num_contig_pages(unsigned long idx,
+ struct page **inpages, unsigned long npages)
+{
+ unsigned long paddr, next_paddr;
+ unsigned long i = idx + 1, pages = 1;
+
+ /* find the number of contiguous pages starting from idx */
+ paddr = __sme_page_pa(inpages[idx]);
+ while (i < npages) {
+ next_paddr = __sme_page_pa(inpages[i++]);
+ if ((paddr + PAGE_SIZE) == next_paddr) {
+ pages++;
+ paddr = next_paddr;
+ continue;
+ }
+ break;
+ }
+
+ return pages;
+}
+
+static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_launch_update_data params;
+ struct sev_data_launch_update_data data;
+ struct page **inpages;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ vaddr = params.uaddr;
+ size = params.len;
+ vaddr_end = vaddr + size;
+
+ /* Lock the user memory. */
+ inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
+ if (IS_ERR(inpages))
+ return PTR_ERR(inpages);
+
+ /*
+ * Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in
+ * place; the cache may contain the data that was written unencrypted.
+ */
+ sev_clflush_pages(inpages, npages);
+
+ data.reserved = 0;
+ data.handle = sev->handle;
+
+ for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
+ int offset, len;
+
+ /*
+ * If the user buffer is not page-aligned, calculate the offset
+ * within the page.
+ */
+ offset = vaddr & (PAGE_SIZE - 1);
+
+ /* Calculate the number of pages that can be encrypted in one go. */
+ pages = get_num_contig_pages(i, inpages, npages);
+
+ len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
+
+ data.len = len;
+ data.address = __sme_page_pa(inpages[i]) + offset;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, &data, &argp->error);
+ if (ret)
+ goto e_unpin;
+
+ size -= len;
+ next_vaddr = vaddr + len;
+ }
+
+e_unpin:
+ /* content of memory is updated, mark pages dirty */
+ for (i = 0; i < npages; i++) {
+ set_page_dirty_lock(inpages[i]);
+ mark_page_accessed(inpages[i]);
+ }
+ /* unlock the user pages */
+ sev_unpin_memory(kvm, inpages, npages);
+ return ret;
+}
+
+static int sev_es_sync_vmsa(struct vcpu_svm *svm)
+{
+ struct sev_es_save_area *save = svm->sev_es.vmsa;
+
+ /* Check some debug related fields before encrypting the VMSA */
+ if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1))
+ return -EINVAL;
+
+ /*
+ * SEV-ES will use a VMSA that is pointed to by the VMCB, not
+ * the traditional VMSA that is part of the VMCB. Copy the
+ * traditional VMSA as it has been built so far (in prep
+ * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state.
+ */
+ memcpy(save, &svm->vmcb->save, sizeof(svm->vmcb->save));
+
+ /* Sync registgers */
+ save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX];
+ save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX];
+ save->rcx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ save->rdx = svm->vcpu.arch.regs[VCPU_REGS_RDX];
+ save->rsp = svm->vcpu.arch.regs[VCPU_REGS_RSP];
+ save->rbp = svm->vcpu.arch.regs[VCPU_REGS_RBP];
+ save->rsi = svm->vcpu.arch.regs[VCPU_REGS_RSI];
+ save->rdi = svm->vcpu.arch.regs[VCPU_REGS_RDI];
+#ifdef CONFIG_X86_64
+ save->r8 = svm->vcpu.arch.regs[VCPU_REGS_R8];
+ save->r9 = svm->vcpu.arch.regs[VCPU_REGS_R9];
+ save->r10 = svm->vcpu.arch.regs[VCPU_REGS_R10];
+ save->r11 = svm->vcpu.arch.regs[VCPU_REGS_R11];
+ save->r12 = svm->vcpu.arch.regs[VCPU_REGS_R12];
+ save->r13 = svm->vcpu.arch.regs[VCPU_REGS_R13];
+ save->r14 = svm->vcpu.arch.regs[VCPU_REGS_R14];
+ save->r15 = svm->vcpu.arch.regs[VCPU_REGS_R15];
+#endif
+ save->rip = svm->vcpu.arch.regs[VCPU_REGS_RIP];
+
+ /* Sync some non-GPR registers before encrypting */
+ save->xcr0 = svm->vcpu.arch.xcr0;
+ save->pkru = svm->vcpu.arch.pkru;
+ save->xss = svm->vcpu.arch.ia32_xss;
+ save->dr6 = svm->vcpu.arch.dr6;
+
+ pr_debug("Virtual Machine Save Area (VMSA):\n");
+ print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false);
+
+ return 0;
+}
+
+static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu,
+ int *error)
+{
+ struct sev_data_launch_update_vmsa vmsa;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ /* Perform some pre-encryption checks against the VMSA */
+ ret = sev_es_sync_vmsa(svm);
+ if (ret)
+ return ret;
+
+ /*
+ * The LAUNCH_UPDATE_VMSA command will perform in-place encryption of
+ * the VMSA memory content (i.e it will write the same memory region
+ * with the guest's key), so invalidate it first.
+ */
+ clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE);
+
+ vmsa.reserved = 0;
+ vmsa.handle = to_kvm_svm(kvm)->sev_info.handle;
+ vmsa.address = __sme_pa(svm->sev_es.vmsa);
+ vmsa.len = PAGE_SIZE;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error);
+ if (ret)
+ return ret;
+
+ vcpu->arch.guest_state_protected = true;
+ return 0;
+}
+
+static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ int ret;
+
+ if (!sev_es_guest(kvm))
+ return -ENOTTY;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ ret = mutex_lock_killable(&vcpu->mutex);
+ if (ret)
+ return ret;
+
+ ret = __sev_launch_update_vmsa(kvm, vcpu, &argp->error);
+
+ mutex_unlock(&vcpu->mutex);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ void __user *measure = (void __user *)(uintptr_t)argp->data;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_launch_measure data;
+ struct kvm_sev_launch_measure params;
+ void __user *p = NULL;
+ void *blob = NULL;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, measure, sizeof(params)))
+ return -EFAULT;
+
+ memset(&data, 0, sizeof(data));
+
+ /* User wants to query the blob length */
+ if (!params.len)
+ goto cmd;
+
+ p = (void __user *)(uintptr_t)params.uaddr;
+ if (p) {
+ if (params.len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
+
+ blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT);
+ if (!blob)
+ return -ENOMEM;
+
+ data.address = __psp_pa(blob);
+ data.len = params.len;
+ }
+
+cmd:
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, &data, &argp->error);
+
+ /*
+ * If we query the session length, FW responded with expected data.
+ */
+ if (!params.len)
+ goto done;
+
+ if (ret)
+ goto e_free_blob;
+
+ if (blob) {
+ if (copy_to_user(p, blob, params.len))
+ ret = -EFAULT;
+ }
+
+done:
+ params.len = data.len;
+ if (copy_to_user(measure, &params, sizeof(params)))
+ ret = -EFAULT;
+e_free_blob:
+ kfree(blob);
+ return ret;
+}
+
+static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_launch_finish data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error);
+}
+
+static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_guest_status params;
+ struct sev_data_guest_status data;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ memset(&data, 0, sizeof(data));
+
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error);
+ if (ret)
+ return ret;
+
+ params.policy = data.policy;
+ params.state = data.state;
+ params.handle = data.handle;
+
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
+ unsigned long dst, int size,
+ int *error, bool enc)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_dbg data;
+
+ data.reserved = 0;
+ data.handle = sev->handle;
+ data.dst_addr = dst;
+ data.src_addr = src;
+ data.len = size;
+
+ return sev_issue_cmd(kvm,
+ enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
+ &data, error);
+}
+
+static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
+ unsigned long dst_paddr, int sz, int *err)
+{
+ int offset;
+
+ /*
+ * Its safe to read more than we are asked, caller should ensure that
+ * destination has enough space.
+ */
+ offset = src_paddr & 15;
+ src_paddr = round_down(src_paddr, 16);
+ sz = round_up(sz + offset, 16);
+
+ return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
+}
+
+static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
+ void __user *dst_uaddr,
+ unsigned long dst_paddr,
+ int size, int *err)
+{
+ struct page *tpage = NULL;
+ int ret, offset;
+
+ /* if inputs are not 16-byte then use intermediate buffer */
+ if (!IS_ALIGNED(dst_paddr, 16) ||
+ !IS_ALIGNED(paddr, 16) ||
+ !IS_ALIGNED(size, 16)) {
+ tpage = (void *)alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!tpage)
+ return -ENOMEM;
+
+ dst_paddr = __sme_page_pa(tpage);
+ }
+
+ ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err);
+ if (ret)
+ goto e_free;
+
+ if (tpage) {
+ offset = paddr & 15;
+ if (copy_to_user(dst_uaddr, page_address(tpage) + offset, size))
+ ret = -EFAULT;
+ }
+
+e_free:
+ if (tpage)
+ __free_page(tpage);
+
+ return ret;
+}
+
+static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
+ void __user *vaddr,
+ unsigned long dst_paddr,
+ void __user *dst_vaddr,
+ int size, int *error)
+{
+ struct page *src_tpage = NULL;
+ struct page *dst_tpage = NULL;
+ int ret, len = size;
+
+ /* If source buffer is not aligned then use an intermediate buffer */
+ if (!IS_ALIGNED((unsigned long)vaddr, 16)) {
+ src_tpage = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!src_tpage)
+ return -ENOMEM;
+
+ if (copy_from_user(page_address(src_tpage), vaddr, size)) {
+ __free_page(src_tpage);
+ return -EFAULT;
+ }
+
+ paddr = __sme_page_pa(src_tpage);
+ }
+
+ /*
+ * If destination buffer or length is not aligned then do read-modify-write:
+ * - decrypt destination in an intermediate buffer
+ * - copy the source buffer in an intermediate buffer
+ * - use the intermediate buffer as source buffer
+ */
+ if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
+ int dst_offset;
+
+ dst_tpage = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!dst_tpage) {
+ ret = -ENOMEM;
+ goto e_free;
+ }
+
+ ret = __sev_dbg_decrypt(kvm, dst_paddr,
+ __sme_page_pa(dst_tpage), size, error);
+ if (ret)
+ goto e_free;
+
+ /*
+ * If source is kernel buffer then use memcpy() otherwise
+ * copy_from_user().
+ */
+ dst_offset = dst_paddr & 15;
+
+ if (src_tpage)
+ memcpy(page_address(dst_tpage) + dst_offset,
+ page_address(src_tpage), size);
+ else {
+ if (copy_from_user(page_address(dst_tpage) + dst_offset,
+ vaddr, size)) {
+ ret = -EFAULT;
+ goto e_free;
+ }
+ }
+
+ paddr = __sme_page_pa(dst_tpage);
+ dst_paddr = round_down(dst_paddr, 16);
+ len = round_up(size, 16);
+ }
+
+ ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true);
+
+e_free:
+ if (src_tpage)
+ __free_page(src_tpage);
+ if (dst_tpage)
+ __free_page(dst_tpage);
+ return ret;
+}
+
+static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
+{
+ unsigned long vaddr, vaddr_end, next_vaddr;
+ unsigned long dst_vaddr;
+ struct page **src_p, **dst_p;
+ struct kvm_sev_dbg debug;
+ unsigned long n;
+ unsigned int size;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug)))
+ return -EFAULT;
+
+ if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
+ return -EINVAL;
+ if (!debug.dst_uaddr)
+ return -EINVAL;
+
+ vaddr = debug.src_uaddr;
+ size = debug.len;
+ vaddr_end = vaddr + size;
+ dst_vaddr = debug.dst_uaddr;
+
+ for (; vaddr < vaddr_end; vaddr = next_vaddr) {
+ int len, s_off, d_off;
+
+ /* lock userspace source and destination page */
+ src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0);
+ if (IS_ERR(src_p))
+ return PTR_ERR(src_p);
+
+ dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1);
+ if (IS_ERR(dst_p)) {
+ sev_unpin_memory(kvm, src_p, n);
+ return PTR_ERR(dst_p);
+ }
+
+ /*
+ * Flush (on non-coherent CPUs) before DBG_{DE,EN}CRYPT read or modify
+ * the pages; flush the destination too so that future accesses do not
+ * see stale data.
+ */
+ sev_clflush_pages(src_p, 1);
+ sev_clflush_pages(dst_p, 1);
+
+ /*
+ * Since user buffer may not be page aligned, calculate the
+ * offset within the page.
+ */
+ s_off = vaddr & ~PAGE_MASK;
+ d_off = dst_vaddr & ~PAGE_MASK;
+ len = min_t(size_t, (PAGE_SIZE - s_off), size);
+
+ if (dec)
+ ret = __sev_dbg_decrypt_user(kvm,
+ __sme_page_pa(src_p[0]) + s_off,
+ (void __user *)dst_vaddr,
+ __sme_page_pa(dst_p[0]) + d_off,
+ len, &argp->error);
+ else
+ ret = __sev_dbg_encrypt_user(kvm,
+ __sme_page_pa(src_p[0]) + s_off,
+ (void __user *)vaddr,
+ __sme_page_pa(dst_p[0]) + d_off,
+ (void __user *)dst_vaddr,
+ len, &argp->error);
+
+ sev_unpin_memory(kvm, src_p, n);
+ sev_unpin_memory(kvm, dst_p, n);
+
+ if (ret)
+ goto err;
+
+ next_vaddr = vaddr + len;
+ dst_vaddr = dst_vaddr + len;
+ size -= len;
+ }
+err:
+ return ret;
+}
+
+static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_launch_secret data;
+ struct kvm_sev_launch_secret params;
+ struct page **pages;
+ void *blob, *hdr;
+ unsigned long n, i;
+ int ret, offset;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ /*
+ * Flush (on non-coherent CPUs) before LAUNCH_SECRET encrypts pages in
+ * place; the cache may contain the data that was written unencrypted.
+ */
+ sev_clflush_pages(pages, n);
+
+ /*
+ * The secret must be copied into contiguous memory region, lets verify
+ * that userspace memory pages are contiguous before we issue command.
+ */
+ if (get_num_contig_pages(0, pages, n) != n) {
+ ret = -EINVAL;
+ goto e_unpin_memory;
+ }
+
+ memset(&data, 0, sizeof(data));
+
+ offset = params.guest_uaddr & (PAGE_SIZE - 1);
+ data.guest_address = __sme_page_pa(pages[0]) + offset;
+ data.guest_len = params.guest_len;
+
+ blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
+ if (IS_ERR(blob)) {
+ ret = PTR_ERR(blob);
+ goto e_unpin_memory;
+ }
+
+ data.trans_address = __psp_pa(blob);
+ data.trans_len = params.trans_len;
+
+ hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
+ if (IS_ERR(hdr)) {
+ ret = PTR_ERR(hdr);
+ goto e_free_blob;
+ }
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
+
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error);
+
+ kfree(hdr);
+
+e_free_blob:
+ kfree(blob);
+e_unpin_memory:
+ /* content of memory is updated, mark pages dirty */
+ for (i = 0; i < n; i++) {
+ set_page_dirty_lock(pages[i]);
+ mark_page_accessed(pages[i]);
+ }
+ sev_unpin_memory(kvm, pages, n);
+ return ret;
+}
+
+static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ void __user *report = (void __user *)(uintptr_t)argp->data;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_attestation_report data;
+ struct kvm_sev_attestation_report params;
+ void __user *p;
+ void *blob = NULL;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ memset(&data, 0, sizeof(data));
+
+ /* User wants to query the blob length */
+ if (!params.len)
+ goto cmd;
+
+ p = (void __user *)(uintptr_t)params.uaddr;
+ if (p) {
+ if (params.len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
+
+ blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT);
+ if (!blob)
+ return -ENOMEM;
+
+ data.address = __psp_pa(blob);
+ data.len = params.len;
+ memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce));
+ }
+cmd:
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error);
+ /*
+ * If we query the session length, FW responded with expected data.
+ */
+ if (!params.len)
+ goto done;
+
+ if (ret)
+ goto e_free_blob;
+
+ if (blob) {
+ if (copy_to_user(p, blob, params.len))
+ ret = -EFAULT;
+ }
+
+done:
+ params.len = data.len;
+ if (copy_to_user(report, &params, sizeof(params)))
+ ret = -EFAULT;
+e_free_blob:
+ kfree(blob);
+ return ret;
+}
+
+/* Userspace wants to query session length. */
+static int
+__sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
+ struct kvm_sev_send_start *params)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_start data;
+ int ret;
+
+ memset(&data, 0, sizeof(data));
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
+
+ params->session_len = data.session_len;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+ sizeof(struct kvm_sev_send_start)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_start data;
+ struct kvm_sev_send_start params;
+ void *amd_certs, *session_data;
+ void *pdh_cert, *plat_certs;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_send_start)))
+ return -EFAULT;
+
+ /* if session_len is zero, userspace wants to query the session length */
+ if (!params.session_len)
+ return __sev_send_start_query_session_length(kvm, argp,
+ &params);
+
+ /* some sanity checks */
+ if (!params.pdh_cert_uaddr || !params.pdh_cert_len ||
+ !params.session_uaddr || params.session_len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
+
+ /* allocate the memory to hold the session data blob */
+ session_data = kzalloc(params.session_len, GFP_KERNEL_ACCOUNT);
+ if (!session_data)
+ return -ENOMEM;
+
+ /* copy the certificate blobs from userspace */
+ pdh_cert = psp_copy_user_blob(params.pdh_cert_uaddr,
+ params.pdh_cert_len);
+ if (IS_ERR(pdh_cert)) {
+ ret = PTR_ERR(pdh_cert);
+ goto e_free_session;
+ }
+
+ plat_certs = psp_copy_user_blob(params.plat_certs_uaddr,
+ params.plat_certs_len);
+ if (IS_ERR(plat_certs)) {
+ ret = PTR_ERR(plat_certs);
+ goto e_free_pdh;
+ }
+
+ amd_certs = psp_copy_user_blob(params.amd_certs_uaddr,
+ params.amd_certs_len);
+ if (IS_ERR(amd_certs)) {
+ ret = PTR_ERR(amd_certs);
+ goto e_free_plat_cert;
+ }
+
+ /* populate the FW SEND_START field with system physical address */
+ memset(&data, 0, sizeof(data));
+ data.pdh_cert_address = __psp_pa(pdh_cert);
+ data.pdh_cert_len = params.pdh_cert_len;
+ data.plat_certs_address = __psp_pa(plat_certs);
+ data.plat_certs_len = params.plat_certs_len;
+ data.amd_certs_address = __psp_pa(amd_certs);
+ data.amd_certs_len = params.amd_certs_len;
+ data.session_address = __psp_pa(session_data);
+ data.session_len = params.session_len;
+ data.handle = sev->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
+
+ if (!ret && copy_to_user((void __user *)(uintptr_t)params.session_uaddr,
+ session_data, params.session_len)) {
+ ret = -EFAULT;
+ goto e_free_amd_cert;
+ }
+
+ params.policy = data.policy;
+ params.session_len = data.session_len;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, &params,
+ sizeof(struct kvm_sev_send_start)))
+ ret = -EFAULT;
+
+e_free_amd_cert:
+ kfree(amd_certs);
+e_free_plat_cert:
+ kfree(plat_certs);
+e_free_pdh:
+ kfree(pdh_cert);
+e_free_session:
+ kfree(session_data);
+ return ret;
+}
+
+/* Userspace wants to query either header or trans length. */
+static int
+__sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
+ struct kvm_sev_send_update_data *params)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_update_data data;
+ int ret;
+
+ memset(&data, 0, sizeof(data));
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
+
+ params->hdr_len = data.hdr_len;
+ params->trans_len = data.trans_len;
+
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+ sizeof(struct kvm_sev_send_update_data)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_update_data data;
+ struct kvm_sev_send_update_data params;
+ void *hdr, *trans_data;
+ struct page **guest_page;
+ unsigned long n;
+ int ret, offset;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_send_update_data)))
+ return -EFAULT;
+
+ /* userspace wants to query either header or trans length */
+ if (!params.trans_len || !params.hdr_len)
+ return __sev_send_update_data_query_lengths(kvm, argp, &params);
+
+ if (!params.trans_uaddr || !params.guest_uaddr ||
+ !params.guest_len || !params.hdr_uaddr)
+ return -EINVAL;
+
+ /* Check if we are crossing the page boundary */
+ offset = params.guest_uaddr & (PAGE_SIZE - 1);
+ if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE)
+ return -EINVAL;
+
+ /* Pin guest memory */
+ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+ PAGE_SIZE, &n, 0);
+ if (IS_ERR(guest_page))
+ return PTR_ERR(guest_page);
+
+ /* allocate memory for header and transport buffer */
+ ret = -ENOMEM;
+ hdr = kzalloc(params.hdr_len, GFP_KERNEL_ACCOUNT);
+ if (!hdr)
+ goto e_unpin;
+
+ trans_data = kzalloc(params.trans_len, GFP_KERNEL_ACCOUNT);
+ if (!trans_data)
+ goto e_free_hdr;
+
+ memset(&data, 0, sizeof(data));
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
+ data.trans_address = __psp_pa(trans_data);
+ data.trans_len = params.trans_len;
+
+ /* The SEND_UPDATE_DATA command requires C-bit to be always set. */
+ data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+ data.guest_address |= sev_me_mask;
+ data.guest_len = params.guest_len;
+ data.handle = sev->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
+
+ if (ret)
+ goto e_free_trans_data;
+
+ /* copy transport buffer to user space */
+ if (copy_to_user((void __user *)(uintptr_t)params.trans_uaddr,
+ trans_data, params.trans_len)) {
+ ret = -EFAULT;
+ goto e_free_trans_data;
+ }
+
+ /* Copy packet header to userspace. */
+ if (copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr,
+ params.hdr_len))
+ ret = -EFAULT;
+
+e_free_trans_data:
+ kfree(trans_data);
+e_free_hdr:
+ kfree(hdr);
+e_unpin:
+ sev_unpin_memory(kvm, guest_page, n);
+
+ return ret;
+}
+
+static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_finish data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error);
+}
+
+static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_cancel data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error);
+}
+
+static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_receive_start start;
+ struct kvm_sev_receive_start params;
+ int *error = &argp->error;
+ void *session_data;
+ void *pdh_data;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ /* Get parameter from the userspace */
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_receive_start)))
+ return -EFAULT;
+
+ /* some sanity checks */
+ if (!params.pdh_uaddr || !params.pdh_len ||
+ !params.session_uaddr || !params.session_len)
+ return -EINVAL;
+
+ pdh_data = psp_copy_user_blob(params.pdh_uaddr, params.pdh_len);
+ if (IS_ERR(pdh_data))
+ return PTR_ERR(pdh_data);
+
+ session_data = psp_copy_user_blob(params.session_uaddr,
+ params.session_len);
+ if (IS_ERR(session_data)) {
+ ret = PTR_ERR(session_data);
+ goto e_free_pdh;
+ }
+
+ memset(&start, 0, sizeof(start));
+ start.handle = params.handle;
+ start.policy = params.policy;
+ start.pdh_cert_address = __psp_pa(pdh_data);
+ start.pdh_cert_len = params.pdh_len;
+ start.session_address = __psp_pa(session_data);
+ start.session_len = params.session_len;
+
+ /* create memory encryption context */
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_RECEIVE_START, &start,
+ error);
+ if (ret)
+ goto e_free_session;
+
+ /* Bind ASID to this guest */
+ ret = sev_bind_asid(kvm, start.handle, error);
+ if (ret) {
+ sev_decommission(start.handle);
+ goto e_free_session;
+ }
+
+ params.handle = start.handle;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data,
+ &params, sizeof(struct kvm_sev_receive_start))) {
+ ret = -EFAULT;
+ sev_unbind_asid(kvm, start.handle);
+ goto e_free_session;
+ }
+
+ sev->handle = start.handle;
+ sev->fd = argp->sev_fd;
+
+e_free_session:
+ kfree(session_data);
+e_free_pdh:
+ kfree(pdh_data);
+
+ return ret;
+}
+
+static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_receive_update_data params;
+ struct sev_data_receive_update_data data;
+ void *hdr = NULL, *trans = NULL;
+ struct page **guest_page;
+ unsigned long n;
+ int ret, offset;
+
+ if (!sev_guest(kvm))
+ return -EINVAL;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_receive_update_data)))
+ return -EFAULT;
+
+ if (!params.hdr_uaddr || !params.hdr_len ||
+ !params.guest_uaddr || !params.guest_len ||
+ !params.trans_uaddr || !params.trans_len)
+ return -EINVAL;
+
+ /* Check if we are crossing the page boundary */
+ offset = params.guest_uaddr & (PAGE_SIZE - 1);
+ if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE)
+ return -EINVAL;
+
+ hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
+ if (IS_ERR(hdr))
+ return PTR_ERR(hdr);
+
+ trans = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto e_free_hdr;
+ }
+
+ memset(&data, 0, sizeof(data));
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
+ data.trans_address = __psp_pa(trans);
+ data.trans_len = params.trans_len;
+
+ /* Pin guest memory */
+ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+ PAGE_SIZE, &n, 1);
+ if (IS_ERR(guest_page)) {
+ ret = PTR_ERR(guest_page);
+ goto e_free_trans;
+ }
+
+ /*
+ * Flush (on non-coherent CPUs) before RECEIVE_UPDATE_DATA, the PSP
+ * encrypts the written data with the guest's key, and the cache may
+ * contain dirty, unencrypted data.
+ */
+ sev_clflush_pages(guest_page, n);
+
+ /* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */
+ data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+ data.guest_address |= sev_me_mask;
+ data.guest_len = params.guest_len;
+ data.handle = sev->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data,
+ &argp->error);
+
+ sev_unpin_memory(kvm, guest_page, n);
+
+e_free_trans:
+ kfree(trans);
+e_free_hdr:
+ kfree(hdr);
+
+ return ret;
+}
+
+static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_receive_finish data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error);
+}
+
+static bool is_cmd_allowed_from_mirror(u32 cmd_id)
+{
+ /*
+ * Allow mirrors VM to call KVM_SEV_LAUNCH_UPDATE_VMSA to enable SEV-ES
+ * active mirror VMs. Also allow the debugging and status commands.
+ */
+ if (cmd_id == KVM_SEV_LAUNCH_UPDATE_VMSA ||
+ cmd_id == KVM_SEV_GUEST_STATUS || cmd_id == KVM_SEV_DBG_DECRYPT ||
+ cmd_id == KVM_SEV_DBG_ENCRYPT)
+ return true;
+
+ return false;
+}
+
+static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
+{
+ struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
+ struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
+ int r = -EBUSY;
+
+ if (dst_kvm == src_kvm)
+ return -EINVAL;
+
+ /*
+ * Bail if these VMs are already involved in a migration to avoid
+ * deadlock between two VMs trying to migrate to/from each other.
+ */
+ if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1))
+ return -EBUSY;
+
+ if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1))
+ goto release_dst;
+
+ r = -EINTR;
+ if (mutex_lock_killable(&dst_kvm->lock))
+ goto release_src;
+ if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING))
+ goto unlock_dst;
+ return 0;
+
+unlock_dst:
+ mutex_unlock(&dst_kvm->lock);
+release_src:
+ atomic_set_release(&src_sev->migration_in_progress, 0);
+release_dst:
+ atomic_set_release(&dst_sev->migration_in_progress, 0);
+ return r;
+}
+
+static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
+{
+ struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
+ struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
+
+ mutex_unlock(&dst_kvm->lock);
+ mutex_unlock(&src_kvm->lock);
+ atomic_set_release(&dst_sev->migration_in_progress, 0);
+ atomic_set_release(&src_sev->migration_in_progress, 0);
+}
+
+/* vCPU mutex subclasses. */
+enum sev_migration_role {
+ SEV_MIGRATION_SOURCE = 0,
+ SEV_MIGRATION_TARGET,
+ SEV_NR_MIGRATION_ROLES,
+};
+
+static int sev_lock_vcpus_for_migration(struct kvm *kvm,
+ enum sev_migration_role role)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i, j;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (mutex_lock_killable_nested(&vcpu->mutex, role))
+ goto out_unlock;
+
+#ifdef CONFIG_PROVE_LOCKING
+ if (!i)
+ /*
+ * Reset the role to one that avoids colliding with
+ * the role used for the first vcpu mutex.
+ */
+ role = SEV_NR_MIGRATION_ROLES;
+ else
+ mutex_release(&vcpu->mutex.dep_map, _THIS_IP_);
+#endif
+ }
+
+ return 0;
+
+out_unlock:
+
+ kvm_for_each_vcpu(j, vcpu, kvm) {
+ if (i == j)
+ break;
+
+#ifdef CONFIG_PROVE_LOCKING
+ if (j)
+ mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_);
+#endif
+
+ mutex_unlock(&vcpu->mutex);
+ }
+ return -EINTR;
+}
+
+static void sev_unlock_vcpus_for_migration(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ bool first = true;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (first)
+ first = false;
+ else
+ mutex_acquire(&vcpu->mutex.dep_map,
+ SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_);
+
+ mutex_unlock(&vcpu->mutex);
+ }
+}
+
+static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
+{
+ struct kvm_sev_info *dst = &to_kvm_svm(dst_kvm)->sev_info;
+ struct kvm_sev_info *src = &to_kvm_svm(src_kvm)->sev_info;
+ struct kvm_vcpu *dst_vcpu, *src_vcpu;
+ struct vcpu_svm *dst_svm, *src_svm;
+ struct kvm_sev_info *mirror;
+ unsigned long i;
+
+ dst->active = true;
+ dst->asid = src->asid;
+ dst->handle = src->handle;
+ dst->pages_locked = src->pages_locked;
+ dst->enc_context_owner = src->enc_context_owner;
+ dst->es_active = src->es_active;
+
+ src->asid = 0;
+ src->active = false;
+ src->handle = 0;
+ src->pages_locked = 0;
+ src->enc_context_owner = NULL;
+ src->es_active = false;
+
+ list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
+
+ /*
+ * If this VM has mirrors, "transfer" each mirror's refcount of the
+ * source to the destination (this KVM). The caller holds a reference
+ * to the source, so there's no danger of use-after-free.
+ */
+ list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms);
+ list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) {
+ kvm_get_kvm(dst_kvm);
+ kvm_put_kvm(src_kvm);
+ mirror->enc_context_owner = dst_kvm;
+ }
+
+ /*
+ * If this VM is a mirror, remove the old mirror from the owners list
+ * and add the new mirror to the list.
+ */
+ if (is_mirroring_enc_context(dst_kvm)) {
+ struct kvm_sev_info *owner_sev_info =
+ &to_kvm_svm(dst->enc_context_owner)->sev_info;
+
+ list_del(&src->mirror_entry);
+ list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms);
+ }
+
+ kvm_for_each_vcpu(i, dst_vcpu, dst_kvm) {
+ dst_svm = to_svm(dst_vcpu);
+
+ sev_init_vmcb(dst_svm);
+
+ if (!dst->es_active)
+ continue;
+
+ /*
+ * Note, the source is not required to have the same number of
+ * vCPUs as the destination when migrating a vanilla SEV VM.
+ */
+ src_vcpu = kvm_get_vcpu(src_kvm, i);
+ src_svm = to_svm(src_vcpu);
+
+ /*
+ * Transfer VMSA and GHCB state to the destination. Nullify and
+ * clear source fields as appropriate, the state now belongs to
+ * the destination.
+ */
+ memcpy(&dst_svm->sev_es, &src_svm->sev_es, sizeof(src_svm->sev_es));
+ dst_svm->vmcb->control.ghcb_gpa = src_svm->vmcb->control.ghcb_gpa;
+ dst_svm->vmcb->control.vmsa_pa = src_svm->vmcb->control.vmsa_pa;
+ dst_vcpu->arch.guest_state_protected = true;
+
+ memset(&src_svm->sev_es, 0, sizeof(src_svm->sev_es));
+ src_svm->vmcb->control.ghcb_gpa = INVALID_PAGE;
+ src_svm->vmcb->control.vmsa_pa = INVALID_PAGE;
+ src_vcpu->arch.guest_state_protected = false;
+ }
+}
+
+static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src)
+{
+ struct kvm_vcpu *src_vcpu;
+ unsigned long i;
+
+ if (!sev_es_guest(src))
+ return 0;
+
+ if (atomic_read(&src->online_vcpus) != atomic_read(&dst->online_vcpus))
+ return -EINVAL;
+
+ kvm_for_each_vcpu(i, src_vcpu, src) {
+ if (!src_vcpu->arch.guest_state_protected)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
+{
+ struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *src_sev, *cg_cleanup_sev;
+ struct file *source_kvm_file;
+ struct kvm *source_kvm;
+ bool charged = false;
+ int ret;
+
+ source_kvm_file = fget(source_fd);
+ if (!file_is_kvm(source_kvm_file)) {
+ ret = -EBADF;
+ goto out_fput;
+ }
+
+ source_kvm = source_kvm_file->private_data;
+ ret = sev_lock_two_vms(kvm, source_kvm);
+ if (ret)
+ goto out_fput;
+
+ if (sev_guest(kvm) || !sev_guest(source_kvm)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ src_sev = &to_kvm_svm(source_kvm)->sev_info;
+
+ dst_sev->misc_cg = get_current_misc_cg();
+ cg_cleanup_sev = dst_sev;
+ if (dst_sev->misc_cg != src_sev->misc_cg) {
+ ret = sev_misc_cg_try_charge(dst_sev);
+ if (ret)
+ goto out_dst_cgroup;
+ charged = true;
+ }
+
+ ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE);
+ if (ret)
+ goto out_dst_cgroup;
+ ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET);
+ if (ret)
+ goto out_dst_vcpu;
+
+ ret = sev_check_source_vcpus(kvm, source_kvm);
+ if (ret)
+ goto out_source_vcpu;
+
+ sev_migrate_from(kvm, source_kvm);
+ kvm_vm_dead(source_kvm);
+ cg_cleanup_sev = src_sev;
+ ret = 0;
+
+out_source_vcpu:
+ sev_unlock_vcpus_for_migration(source_kvm);
+out_dst_vcpu:
+ sev_unlock_vcpus_for_migration(kvm);
+out_dst_cgroup:
+ /* Operates on the source on success, on the destination on failure. */
+ if (charged)
+ sev_misc_cg_uncharge(cg_cleanup_sev);
+ put_misc_cg(cg_cleanup_sev->misc_cg);
+ cg_cleanup_sev->misc_cg = NULL;
+out_unlock:
+ sev_unlock_two_vms(kvm, source_kvm);
+out_fput:
+ if (source_kvm_file)
+ fput(source_kvm_file);
+ return ret;
+}
+
+int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_sev_cmd sev_cmd;
+ int r;
+
+ if (!sev_enabled)
+ return -ENOTTY;
+
+ if (!argp)
+ return 0;
+
+ if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd)))
+ return -EFAULT;
+
+ mutex_lock(&kvm->lock);
+
+ /* Only the enc_context_owner handles some memory enc operations. */
+ if (is_mirroring_enc_context(kvm) &&
+ !is_cmd_allowed_from_mirror(sev_cmd.id)) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ switch (sev_cmd.id) {
+ case KVM_SEV_ES_INIT:
+ if (!sev_es_enabled) {
+ r = -ENOTTY;
+ goto out;
+ }
+ fallthrough;
+ case KVM_SEV_INIT:
+ r = sev_guest_init(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_START:
+ r = sev_launch_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_UPDATE_DATA:
+ r = sev_launch_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_UPDATE_VMSA:
+ r = sev_launch_update_vmsa(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_MEASURE:
+ r = sev_launch_measure(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_FINISH:
+ r = sev_launch_finish(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_GUEST_STATUS:
+ r = sev_guest_status(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_DBG_DECRYPT:
+ r = sev_dbg_crypt(kvm, &sev_cmd, true);
+ break;
+ case KVM_SEV_DBG_ENCRYPT:
+ r = sev_dbg_crypt(kvm, &sev_cmd, false);
+ break;
+ case KVM_SEV_LAUNCH_SECRET:
+ r = sev_launch_secret(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_GET_ATTESTATION_REPORT:
+ r = sev_get_attestation_report(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_START:
+ r = sev_send_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_UPDATE_DATA:
+ r = sev_send_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_FINISH:
+ r = sev_send_finish(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_CANCEL:
+ r = sev_send_cancel(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_START:
+ r = sev_receive_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_UPDATE_DATA:
+ r = sev_receive_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_FINISH:
+ r = sev_receive_finish(kvm, &sev_cmd);
+ break;
+ default:
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd)))
+ r = -EFAULT;
+
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+int sev_mem_enc_register_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct enc_region *region;
+ int ret = 0;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ /* If kvm is mirroring encryption context it isn't responsible for it */
+ if (is_mirroring_enc_context(kvm))
+ return -EINVAL;
+
+ if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
+ return -EINVAL;
+
+ region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
+ if (!region)
+ return -ENOMEM;
+
+ mutex_lock(&kvm->lock);
+ region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
+ if (IS_ERR(region->pages)) {
+ ret = PTR_ERR(region->pages);
+ mutex_unlock(&kvm->lock);
+ goto e_free;
+ }
+
+ region->uaddr = range->addr;
+ region->size = range->size;
+
+ list_add_tail(&region->list, &sev->regions_list);
+ mutex_unlock(&kvm->lock);
+
+ /*
+ * The guest may change the memory encryption attribute from C=0 -> C=1
+ * or vice versa for this memory range. Lets make sure caches are
+ * flushed to ensure that guest data gets written into memory with
+ * correct C-bit.
+ */
+ sev_clflush_pages(region->pages, region->npages);
+
+ return ret;
+
+e_free:
+ kfree(region);
+ return ret;
+}
+
+static struct enc_region *
+find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct list_head *head = &sev->regions_list;
+ struct enc_region *i;
+
+ list_for_each_entry(i, head, list) {
+ if (i->uaddr == range->addr &&
+ i->size == range->size)
+ return i;
+ }
+
+ return NULL;
+}
+
+static void __unregister_enc_region_locked(struct kvm *kvm,
+ struct enc_region *region)
+{
+ sev_unpin_memory(kvm, region->pages, region->npages);
+ list_del(&region->list);
+ kfree(region);
+}
+
+int sev_mem_enc_unregister_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
+{
+ struct enc_region *region;
+ int ret;
+
+ /* If kvm is mirroring encryption context it isn't responsible for it */
+ if (is_mirroring_enc_context(kvm))
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+
+ if (!sev_guest(kvm)) {
+ ret = -ENOTTY;
+ goto failed;
+ }
+
+ region = find_enc_region(kvm, range);
+ if (!region) {
+ ret = -EINVAL;
+ goto failed;
+ }
+
+ /*
+ * Ensure that all guest tagged cache entries are flushed before
+ * releasing the pages back to the system for use. CLFLUSH will
+ * not do this, so issue a WBINVD.
+ */
+ wbinvd_on_all_cpus();
+
+ __unregister_enc_region_locked(kvm, region);
+
+ mutex_unlock(&kvm->lock);
+ return 0;
+
+failed:
+ mutex_unlock(&kvm->lock);
+ return ret;
+}
+
+int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
+{
+ struct file *source_kvm_file;
+ struct kvm *source_kvm;
+ struct kvm_sev_info *source_sev, *mirror_sev;
+ int ret;
+
+ source_kvm_file = fget(source_fd);
+ if (!file_is_kvm(source_kvm_file)) {
+ ret = -EBADF;
+ goto e_source_fput;
+ }
+
+ source_kvm = source_kvm_file->private_data;
+ ret = sev_lock_two_vms(kvm, source_kvm);
+ if (ret)
+ goto e_source_fput;
+
+ /*
+ * Mirrors of mirrors should work, but let's not get silly. Also
+ * disallow out-of-band SEV/SEV-ES init if the target is already an
+ * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being
+ * created after SEV/SEV-ES initialization, e.g. to init intercepts.
+ */
+ if (sev_guest(kvm) || !sev_guest(source_kvm) ||
+ is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) {
+ ret = -EINVAL;
+ goto e_unlock;
+ }
+
+ /*
+ * The mirror kvm holds an enc_context_owner ref so its asid can't
+ * disappear until we're done with it
+ */
+ source_sev = &to_kvm_svm(source_kvm)->sev_info;
+ kvm_get_kvm(source_kvm);
+ mirror_sev = &to_kvm_svm(kvm)->sev_info;
+ list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
+
+ /* Set enc_context_owner and copy its encryption context over */
+ mirror_sev->enc_context_owner = source_kvm;
+ mirror_sev->active = true;
+ mirror_sev->asid = source_sev->asid;
+ mirror_sev->fd = source_sev->fd;
+ mirror_sev->es_active = source_sev->es_active;
+ mirror_sev->handle = source_sev->handle;
+ INIT_LIST_HEAD(&mirror_sev->regions_list);
+ INIT_LIST_HEAD(&mirror_sev->mirror_vms);
+ ret = 0;
+
+ /*
+ * Do not copy ap_jump_table. Since the mirror does not share the same
+ * KVM contexts as the original, and they may have different
+ * memory-views.
+ */
+
+e_unlock:
+ sev_unlock_two_vms(kvm, source_kvm);
+e_source_fput:
+ if (source_kvm_file)
+ fput(source_kvm_file);
+ return ret;
+}
+
+void sev_vm_destroy(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct list_head *head = &sev->regions_list;
+ struct list_head *pos, *q;
+
+ if (!sev_guest(kvm))
+ return;
+
+ WARN_ON(!list_empty(&sev->mirror_vms));
+
+ /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
+ if (is_mirroring_enc_context(kvm)) {
+ struct kvm *owner_kvm = sev->enc_context_owner;
+
+ mutex_lock(&owner_kvm->lock);
+ list_del(&sev->mirror_entry);
+ mutex_unlock(&owner_kvm->lock);
+ kvm_put_kvm(owner_kvm);
+ return;
+ }
+
+ /*
+ * Ensure that all guest tagged cache entries are flushed before
+ * releasing the pages back to the system for use. CLFLUSH will
+ * not do this, so issue a WBINVD.
+ */
+ wbinvd_on_all_cpus();
+
+ /*
+ * if userspace was terminated before unregistering the memory regions
+ * then lets unpin all the registered memory.
+ */
+ if (!list_empty(head)) {
+ list_for_each_safe(pos, q, head) {
+ __unregister_enc_region_locked(kvm,
+ list_entry(pos, struct enc_region, list));
+ cond_resched();
+ }
+ }
+
+ sev_unbind_asid(kvm, sev->handle);
+ sev_asid_free(sev);
+}
+
+void __init sev_set_cpu_caps(void)
+{
+ if (!sev_enabled)
+ kvm_cpu_cap_clear(X86_FEATURE_SEV);
+ if (!sev_es_enabled)
+ kvm_cpu_cap_clear(X86_FEATURE_SEV_ES);
+}
+
+void __init sev_hardware_setup(void)
+{
+#ifdef CONFIG_KVM_AMD_SEV
+ unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
+ bool sev_es_supported = false;
+ bool sev_supported = false;
+
+ if (!sev_enabled || !npt_enabled)
+ goto out;
+
+ /*
+ * SEV must obviously be supported in hardware. Sanity check that the
+ * CPU supports decode assists, which is mandatory for SEV guests to
+ * support instruction emulation.
+ */
+ if (!boot_cpu_has(X86_FEATURE_SEV) ||
+ WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)))
+ goto out;
+
+ /* Retrieve SEV CPUID information */
+ cpuid(0x8000001f, &eax, &ebx, &ecx, &edx);
+
+ /* Set encryption bit location for SEV-ES guests */
+ sev_enc_bit = ebx & 0x3f;
+
+ /* Maximum number of encrypted guests supported simultaneously */
+ max_sev_asid = ecx;
+ if (!max_sev_asid)
+ goto out;
+
+ /* Minimum ASID value that should be used for SEV guest */
+ min_sev_asid = edx;
+ sev_me_mask = 1UL << (ebx & 0x3f);
+
+ /*
+ * Initialize SEV ASID bitmaps. Allocate space for ASID 0 in the bitmap,
+ * even though it's never used, so that the bitmap is indexed by the
+ * actual ASID.
+ */
+ nr_asids = max_sev_asid + 1;
+ sev_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL);
+ if (!sev_asid_bitmap)
+ goto out;
+
+ sev_reclaim_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL);
+ if (!sev_reclaim_asid_bitmap) {
+ bitmap_free(sev_asid_bitmap);
+ sev_asid_bitmap = NULL;
+ goto out;
+ }
+
+ sev_asid_count = max_sev_asid - min_sev_asid + 1;
+ if (misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count))
+ goto out;
+
+ pr_info("SEV supported: %u ASIDs\n", sev_asid_count);
+ sev_supported = true;
+
+ /* SEV-ES support requested? */
+ if (!sev_es_enabled)
+ goto out;
+
+ /*
+ * SEV-ES requires MMIO caching as KVM doesn't have access to the guest
+ * instruction stream, i.e. can't emulate in response to a #NPF and
+ * instead relies on #NPF(RSVD) being reflected into the guest as #VC
+ * (the guest can then do a #VMGEXIT to request MMIO emulation).
+ */
+ if (!enable_mmio_caching)
+ goto out;
+
+ /* Does the CPU support SEV-ES? */
+ if (!boot_cpu_has(X86_FEATURE_SEV_ES))
+ goto out;
+
+ /* Has the system been allocated ASIDs for SEV-ES? */
+ if (min_sev_asid == 1)
+ goto out;
+
+ sev_es_asid_count = min_sev_asid - 1;
+ if (misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count))
+ goto out;
+
+ pr_info("SEV-ES supported: %u ASIDs\n", sev_es_asid_count);
+ sev_es_supported = true;
+
+out:
+ sev_enabled = sev_supported;
+ sev_es_enabled = sev_es_supported;
+#endif
+}
+
+void sev_hardware_unsetup(void)
+{
+ if (!sev_enabled)
+ return;
+
+ /* No need to take sev_bitmap_lock, all VMs have been destroyed. */
+ sev_flush_asids(1, max_sev_asid);
+
+ bitmap_free(sev_asid_bitmap);
+ bitmap_free(sev_reclaim_asid_bitmap);
+
+ misc_cg_set_capacity(MISC_CG_RES_SEV, 0);
+ misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0);
+}
+
+int sev_cpu_init(struct svm_cpu_data *sd)
+{
+ if (!sev_enabled)
+ return 0;
+
+ sd->sev_vmcbs = kcalloc(nr_asids, sizeof(void *), GFP_KERNEL);
+ if (!sd->sev_vmcbs)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/*
+ * Pages used by hardware to hold guest encrypted state must be flushed before
+ * returning them to the system.
+ */
+static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
+{
+ int asid = to_kvm_svm(vcpu->kvm)->sev_info.asid;
+
+ /*
+ * Note! The address must be a kernel address, as regular page walk
+ * checks are performed by VM_PAGE_FLUSH, i.e. operating on a user
+ * address is non-deterministic and unsafe. This function deliberately
+ * takes a pointer to deter passing in a user address.
+ */
+ unsigned long addr = (unsigned long)va;
+
+ /*
+ * If CPU enforced cache coherency for encrypted mappings of the
+ * same physical page is supported, use CLFLUSHOPT instead. NOTE: cache
+ * flush is still needed in order to work properly with DMA devices.
+ */
+ if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) {
+ clflush_cache_range(va, PAGE_SIZE);
+ return;
+ }
+
+ /*
+ * VM Page Flush takes a host virtual address and a guest ASID. Fall
+ * back to WBINVD if this faults so as not to make any problems worse
+ * by leaving stale encrypted data in the cache.
+ */
+ if (WARN_ON_ONCE(wrmsrl_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid)))
+ goto do_wbinvd;
+
+ return;
+
+do_wbinvd:
+ wbinvd_on_all_cpus();
+}
+
+void sev_guest_memory_reclaimed(struct kvm *kvm)
+{
+ if (!sev_guest(kvm))
+ return;
+
+ wbinvd_on_all_cpus();
+}
+
+void sev_free_vcpu(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm;
+
+ if (!sev_es_guest(vcpu->kvm))
+ return;
+
+ svm = to_svm(vcpu);
+
+ if (vcpu->arch.guest_state_protected)
+ sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);
+
+ __free_page(virt_to_page(svm->sev_es.vmsa));
+
+ if (svm->sev_es.ghcb_sa_free)
+ kvfree(svm->sev_es.ghcb_sa);
+}
+
+static void dump_ghcb(struct vcpu_svm *svm)
+{
+ struct ghcb *ghcb = svm->sev_es.ghcb;
+ unsigned int nbits;
+
+ /* Re-use the dump_invalid_vmcb module parameter */
+ if (!dump_invalid_vmcb) {
+ pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
+ return;
+ }
+
+ nbits = sizeof(ghcb->save.valid_bitmap) * 8;
+
+ pr_err("GHCB (GPA=%016llx):\n", svm->vmcb->control.ghcb_gpa);
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code",
+ ghcb->save.sw_exit_code, ghcb_sw_exit_code_is_valid(ghcb));
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1",
+ ghcb->save.sw_exit_info_1, ghcb_sw_exit_info_1_is_valid(ghcb));
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2",
+ ghcb->save.sw_exit_info_2, ghcb_sw_exit_info_2_is_valid(ghcb));
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch",
+ ghcb->save.sw_scratch, ghcb_sw_scratch_is_valid(ghcb));
+ pr_err("%-20s%*pb\n", "valid_bitmap", nbits, ghcb->save.valid_bitmap);
+}
+
+static void sev_es_sync_to_ghcb(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct ghcb *ghcb = svm->sev_es.ghcb;
+
+ /*
+ * The GHCB protocol so far allows for the following data
+ * to be returned:
+ * GPRs RAX, RBX, RCX, RDX
+ *
+ * Copy their values, even if they may not have been written during the
+ * VM-Exit. It's the guest's responsibility to not consume random data.
+ */
+ ghcb_set_rax(ghcb, vcpu->arch.regs[VCPU_REGS_RAX]);
+ ghcb_set_rbx(ghcb, vcpu->arch.regs[VCPU_REGS_RBX]);
+ ghcb_set_rcx(ghcb, vcpu->arch.regs[VCPU_REGS_RCX]);
+ ghcb_set_rdx(ghcb, vcpu->arch.regs[VCPU_REGS_RDX]);
+}
+
+static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct ghcb *ghcb = svm->sev_es.ghcb;
+ u64 exit_code;
+
+ /*
+ * The GHCB protocol so far allows for the following data
+ * to be supplied:
+ * GPRs RAX, RBX, RCX, RDX
+ * XCR0
+ * CPL
+ *
+ * VMMCALL allows the guest to provide extra registers. KVM also
+ * expects RSI for hypercalls, so include that, too.
+ *
+ * Copy their values to the appropriate location if supplied.
+ */
+ memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
+
+ BUILD_BUG_ON(sizeof(svm->sev_es.valid_bitmap) != sizeof(ghcb->save.valid_bitmap));
+ memcpy(&svm->sev_es.valid_bitmap, &ghcb->save.valid_bitmap, sizeof(ghcb->save.valid_bitmap));
+
+ vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm, ghcb);
+ vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm, ghcb);
+ vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm, ghcb);
+ vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm, ghcb);
+ vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm, ghcb);
+
+ svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm, ghcb);
+
+ if (kvm_ghcb_xcr0_is_valid(svm)) {
+ vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb);
+ kvm_update_cpuid_runtime(vcpu);
+ }
+
+ /* Copy the GHCB exit information into the VMCB fields */
+ exit_code = ghcb_get_sw_exit_code(ghcb);
+ control->exit_code = lower_32_bits(exit_code);
+ control->exit_code_hi = upper_32_bits(exit_code);
+ control->exit_info_1 = ghcb_get_sw_exit_info_1(ghcb);
+ control->exit_info_2 = ghcb_get_sw_exit_info_2(ghcb);
+ svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm, ghcb);
+
+ /* Clear the valid entries fields */
+ memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
+}
+
+static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control)
+{
+ return (((u64)control->exit_code_hi) << 32) | control->exit_code;
+}
+
+static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct ghcb *ghcb;
+ u64 exit_code;
+ u64 reason;
+
+ ghcb = svm->sev_es.ghcb;
+
+ /*
+ * Retrieve the exit code now even though it may not be marked valid
+ * as it could help with debugging.
+ */
+ exit_code = kvm_ghcb_get_sw_exit_code(control);
+
+ /* Only GHCB Usage code 0 is supported */
+ if (ghcb->ghcb_usage) {
+ reason = GHCB_ERR_INVALID_USAGE;
+ goto vmgexit_err;
+ }
+
+ reason = GHCB_ERR_MISSING_INPUT;
+
+ if (!kvm_ghcb_sw_exit_code_is_valid(svm) ||
+ !kvm_ghcb_sw_exit_info_1_is_valid(svm) ||
+ !kvm_ghcb_sw_exit_info_2_is_valid(svm))
+ goto vmgexit_err;
+
+ switch (exit_code) {
+ case SVM_EXIT_READ_DR7:
+ break;
+ case SVM_EXIT_WRITE_DR7:
+ if (!kvm_ghcb_rax_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_RDTSC:
+ break;
+ case SVM_EXIT_RDPMC:
+ if (!kvm_ghcb_rcx_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_CPUID:
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rcx_is_valid(svm))
+ goto vmgexit_err;
+ if (vcpu->arch.regs[VCPU_REGS_RAX] == 0xd)
+ if (!kvm_ghcb_xcr0_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_INVD:
+ break;
+ case SVM_EXIT_IOIO:
+ if (control->exit_info_1 & SVM_IOIO_STR_MASK) {
+ if (!kvm_ghcb_sw_scratch_is_valid(svm))
+ goto vmgexit_err;
+ } else {
+ if (!(control->exit_info_1 & SVM_IOIO_TYPE_MASK))
+ if (!kvm_ghcb_rax_is_valid(svm))
+ goto vmgexit_err;
+ }
+ break;
+ case SVM_EXIT_MSR:
+ if (!kvm_ghcb_rcx_is_valid(svm))
+ goto vmgexit_err;
+ if (control->exit_info_1) {
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rdx_is_valid(svm))
+ goto vmgexit_err;
+ }
+ break;
+ case SVM_EXIT_VMMCALL:
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_cpl_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_RDTSCP:
+ break;
+ case SVM_EXIT_WBINVD:
+ break;
+ case SVM_EXIT_MONITOR:
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rcx_is_valid(svm) ||
+ !kvm_ghcb_rdx_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_MWAIT:
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rcx_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_VMGEXIT_MMIO_READ:
+ case SVM_VMGEXIT_MMIO_WRITE:
+ if (!kvm_ghcb_sw_scratch_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_VMGEXIT_NMI_COMPLETE:
+ case SVM_VMGEXIT_AP_HLT_LOOP:
+ case SVM_VMGEXIT_AP_JUMP_TABLE:
+ case SVM_VMGEXIT_UNSUPPORTED_EVENT:
+ break;
+ default:
+ reason = GHCB_ERR_INVALID_EVENT;
+ goto vmgexit_err;
+ }
+
+ return 0;
+
+vmgexit_err:
+ if (reason == GHCB_ERR_INVALID_USAGE) {
+ vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n",
+ ghcb->ghcb_usage);
+ } else if (reason == GHCB_ERR_INVALID_EVENT) {
+ vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n",
+ exit_code);
+ } else {
+ vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n",
+ exit_code);
+ dump_ghcb(svm);
+ }
+
+ ghcb_set_sw_exit_info_1(ghcb, 2);
+ ghcb_set_sw_exit_info_2(ghcb, reason);
+
+ /* Resume the guest to "return" the error code. */
+ return 1;
+}
+
+void sev_es_unmap_ghcb(struct vcpu_svm *svm)
+{
+ if (!svm->sev_es.ghcb)
+ return;
+
+ if (svm->sev_es.ghcb_sa_free) {
+ /*
+ * The scratch area lives outside the GHCB, so there is a
+ * buffer that, depending on the operation performed, may
+ * need to be synced, then freed.
+ */
+ if (svm->sev_es.ghcb_sa_sync) {
+ kvm_write_guest(svm->vcpu.kvm,
+ svm->sev_es.sw_scratch,
+ svm->sev_es.ghcb_sa,
+ svm->sev_es.ghcb_sa_len);
+ svm->sev_es.ghcb_sa_sync = false;
+ }
+
+ kvfree(svm->sev_es.ghcb_sa);
+ svm->sev_es.ghcb_sa = NULL;
+ svm->sev_es.ghcb_sa_free = false;
+ }
+
+ trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->sev_es.ghcb);
+
+ sev_es_sync_to_ghcb(svm);
+
+ kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map, true);
+ svm->sev_es.ghcb = NULL;
+}
+
+void pre_sev_run(struct vcpu_svm *svm, int cpu)
+{
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
+ int asid = sev_get_asid(svm->vcpu.kvm);
+
+ /* Assign the asid allocated with this SEV guest */
+ svm->asid = asid;
+
+ /*
+ * Flush guest TLB:
+ *
+ * 1) when different VMCB for the same ASID is to be run on the same host CPU.
+ * 2) or this VMCB was executed on different host CPU in previous VMRUNs.
+ */
+ if (sd->sev_vmcbs[asid] == svm->vmcb &&
+ svm->vcpu.arch.last_vmentry_cpu == cpu)
+ return;
+
+ sd->sev_vmcbs[asid] = svm->vmcb;
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+ vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
+}
+
+#define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE)
+static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct ghcb *ghcb = svm->sev_es.ghcb;
+ u64 ghcb_scratch_beg, ghcb_scratch_end;
+ u64 scratch_gpa_beg, scratch_gpa_end;
+ void *scratch_va;
+
+ scratch_gpa_beg = svm->sev_es.sw_scratch;
+ if (!scratch_gpa_beg) {
+ pr_err("vmgexit: scratch gpa not provided\n");
+ goto e_scratch;
+ }
+
+ scratch_gpa_end = scratch_gpa_beg + len;
+ if (scratch_gpa_end < scratch_gpa_beg) {
+ pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n",
+ len, scratch_gpa_beg);
+ goto e_scratch;
+ }
+
+ if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) {
+ /* Scratch area begins within GHCB */
+ ghcb_scratch_beg = control->ghcb_gpa +
+ offsetof(struct ghcb, shared_buffer);
+ ghcb_scratch_end = control->ghcb_gpa +
+ offsetof(struct ghcb, reserved_1);
+
+ /*
+ * If the scratch area begins within the GHCB, it must be
+ * completely contained in the GHCB shared buffer area.
+ */
+ if (scratch_gpa_beg < ghcb_scratch_beg ||
+ scratch_gpa_end > ghcb_scratch_end) {
+ pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n",
+ scratch_gpa_beg, scratch_gpa_end);
+ goto e_scratch;
+ }
+
+ scratch_va = (void *)svm->sev_es.ghcb;
+ scratch_va += (scratch_gpa_beg - control->ghcb_gpa);
+ } else {
+ /*
+ * The guest memory must be read into a kernel buffer, so
+ * limit the size
+ */
+ if (len > GHCB_SCRATCH_AREA_LIMIT) {
+ pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n",
+ len, GHCB_SCRATCH_AREA_LIMIT);
+ goto e_scratch;
+ }
+ scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT);
+ if (!scratch_va)
+ return -ENOMEM;
+
+ if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) {
+ /* Unable to copy scratch area from guest */
+ pr_err("vmgexit: kvm_read_guest for scratch area failed\n");
+
+ kvfree(scratch_va);
+ return -EFAULT;
+ }
+
+ /*
+ * The scratch area is outside the GHCB. The operation will
+ * dictate whether the buffer needs to be synced before running
+ * the vCPU next time (i.e. a read was requested so the data
+ * must be written back to the guest memory).
+ */
+ svm->sev_es.ghcb_sa_sync = sync;
+ svm->sev_es.ghcb_sa_free = true;
+ }
+
+ svm->sev_es.ghcb_sa = scratch_va;
+ svm->sev_es.ghcb_sa_len = len;
+
+ return 0;
+
+e_scratch:
+ ghcb_set_sw_exit_info_1(ghcb, 2);
+ ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_SCRATCH_AREA);
+
+ return 1;
+}
+
+static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask,
+ unsigned int pos)
+{
+ svm->vmcb->control.ghcb_gpa &= ~(mask << pos);
+ svm->vmcb->control.ghcb_gpa |= (value & mask) << pos;
+}
+
+static u64 get_ghcb_msr_bits(struct vcpu_svm *svm, u64 mask, unsigned int pos)
+{
+ return (svm->vmcb->control.ghcb_gpa >> pos) & mask;
+}
+
+static void set_ghcb_msr(struct vcpu_svm *svm, u64 value)
+{
+ svm->vmcb->control.ghcb_gpa = value;
+}
+
+static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u64 ghcb_info;
+ int ret = 1;
+
+ ghcb_info = control->ghcb_gpa & GHCB_MSR_INFO_MASK;
+
+ trace_kvm_vmgexit_msr_protocol_enter(svm->vcpu.vcpu_id,
+ control->ghcb_gpa);
+
+ switch (ghcb_info) {
+ case GHCB_MSR_SEV_INFO_REQ:
+ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX,
+ GHCB_VERSION_MIN,
+ sev_enc_bit));
+ break;
+ case GHCB_MSR_CPUID_REQ: {
+ u64 cpuid_fn, cpuid_reg, cpuid_value;
+
+ cpuid_fn = get_ghcb_msr_bits(svm,
+ GHCB_MSR_CPUID_FUNC_MASK,
+ GHCB_MSR_CPUID_FUNC_POS);
+
+ /* Initialize the registers needed by the CPUID intercept */
+ vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn;
+ vcpu->arch.regs[VCPU_REGS_RCX] = 0;
+
+ ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID);
+ if (!ret) {
+ /* Error, keep GHCB MSR value as-is */
+ break;
+ }
+
+ cpuid_reg = get_ghcb_msr_bits(svm,
+ GHCB_MSR_CPUID_REG_MASK,
+ GHCB_MSR_CPUID_REG_POS);
+ if (cpuid_reg == 0)
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RAX];
+ else if (cpuid_reg == 1)
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RBX];
+ else if (cpuid_reg == 2)
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RCX];
+ else
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RDX];
+
+ set_ghcb_msr_bits(svm, cpuid_value,
+ GHCB_MSR_CPUID_VALUE_MASK,
+ GHCB_MSR_CPUID_VALUE_POS);
+
+ set_ghcb_msr_bits(svm, GHCB_MSR_CPUID_RESP,
+ GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ }
+ case GHCB_MSR_TERM_REQ: {
+ u64 reason_set, reason_code;
+
+ reason_set = get_ghcb_msr_bits(svm,
+ GHCB_MSR_TERM_REASON_SET_MASK,
+ GHCB_MSR_TERM_REASON_SET_POS);
+ reason_code = get_ghcb_msr_bits(svm,
+ GHCB_MSR_TERM_REASON_MASK,
+ GHCB_MSR_TERM_REASON_POS);
+ pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
+ reason_set, reason_code);
+
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
+ vcpu->run->system_event.ndata = 1;
+ vcpu->run->system_event.data[0] = control->ghcb_gpa;
+
+ return 0;
+ }
+ default:
+ /* Error, keep GHCB MSR value as-is */
+ break;
+ }
+
+ trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id,
+ control->ghcb_gpa, ret);
+
+ return ret;
+}
+
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ u64 ghcb_gpa, exit_code;
+ struct ghcb *ghcb;
+ int ret;
+
+ /* Validate the GHCB */
+ ghcb_gpa = control->ghcb_gpa;
+ if (ghcb_gpa & GHCB_MSR_INFO_MASK)
+ return sev_handle_vmgexit_msr_protocol(svm);
+
+ if (!ghcb_gpa) {
+ vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n");
+
+ /* Without a GHCB, just return right back to the guest */
+ return 1;
+ }
+
+ if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) {
+ /* Unable to map GHCB from guest */
+ vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
+ ghcb_gpa);
+
+ /* Without a GHCB, just return right back to the guest */
+ return 1;
+ }
+
+ svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva;
+ ghcb = svm->sev_es.ghcb_map.hva;
+
+ trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb);
+
+ sev_es_sync_from_ghcb(svm);
+ ret = sev_es_validate_vmgexit(svm);
+ if (ret)
+ return ret;
+
+ ghcb_set_sw_exit_info_1(ghcb, 0);
+ ghcb_set_sw_exit_info_2(ghcb, 0);
+
+ exit_code = kvm_ghcb_get_sw_exit_code(control);
+ switch (exit_code) {
+ case SVM_VMGEXIT_MMIO_READ:
+ ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
+ if (ret)
+ break;
+
+ ret = kvm_sev_es_mmio_read(vcpu,
+ control->exit_info_1,
+ control->exit_info_2,
+ svm->sev_es.ghcb_sa);
+ break;
+ case SVM_VMGEXIT_MMIO_WRITE:
+ ret = setup_vmgexit_scratch(svm, false, control->exit_info_2);
+ if (ret)
+ break;
+
+ ret = kvm_sev_es_mmio_write(vcpu,
+ control->exit_info_1,
+ control->exit_info_2,
+ svm->sev_es.ghcb_sa);
+ break;
+ case SVM_VMGEXIT_NMI_COMPLETE:
+ ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET);
+ break;
+ case SVM_VMGEXIT_AP_HLT_LOOP:
+ ret = kvm_emulate_ap_reset_hold(vcpu);
+ break;
+ case SVM_VMGEXIT_AP_JUMP_TABLE: {
+ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
+
+ switch (control->exit_info_1) {
+ case 0:
+ /* Set AP jump table address */
+ sev->ap_jump_table = control->exit_info_2;
+ break;
+ case 1:
+ /* Get AP jump table address */
+ ghcb_set_sw_exit_info_2(ghcb, sev->ap_jump_table);
+ break;
+ default:
+ pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n",
+ control->exit_info_1);
+ ghcb_set_sw_exit_info_1(ghcb, 2);
+ ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_INPUT);
+ }
+
+ ret = 1;
+ break;
+ }
+ case SVM_VMGEXIT_UNSUPPORTED_EVENT:
+ vcpu_unimpl(vcpu,
+ "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
+ control->exit_info_1, control->exit_info_2);
+ ret = -EINVAL;
+ break;
+ default:
+ ret = svm_invoke_exit_handler(vcpu, exit_code);
+ }
+
+ return ret;
+}
+
+int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
+{
+ int count;
+ int bytes;
+ int r;
+
+ if (svm->vmcb->control.exit_info_2 > INT_MAX)
+ return -EINVAL;
+
+ count = svm->vmcb->control.exit_info_2;
+ if (unlikely(check_mul_overflow(count, size, &bytes)))
+ return -EINVAL;
+
+ r = setup_vmgexit_scratch(svm, in, bytes);
+ if (r)
+ return r;
+
+ return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa,
+ count, in);
+}
+
+static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) {
+ bool v_tsc_aux = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
+
+ set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux);
+ }
+}
+
+void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_cpuid_entry2 *best;
+
+ /* For sev guests, the memory encryption bit is not reserved in CR3. */
+ best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
+ if (best)
+ vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
+
+ if (sev_es_guest(svm->vcpu.kvm))
+ sev_es_vcpu_after_set_cpuid(svm);
+}
+
+static void sev_es_init_vmcb(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE;
+ svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
+
+ /*
+ * An SEV-ES guest requires a VMSA area that is a separate from the
+ * VMCB page. Do not include the encryption mask on the VMSA physical
+ * address since hardware will access it using the guest key. Note,
+ * the VMSA will be NULL if this vCPU is the destination for intrahost
+ * migration, and will be copied later.
+ */
+ if (svm->sev_es.vmsa)
+ svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
+
+ /* Can't intercept CR register access, HV can't modify CR registers */
+ svm_clr_intercept(svm, INTERCEPT_CR0_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR4_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR8_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
+ svm_clr_intercept(svm, INTERCEPT_CR4_WRITE);
+ svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
+
+ svm_clr_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+
+ /* Track EFER/CR register changes */
+ svm_set_intercept(svm, TRAP_EFER_WRITE);
+ svm_set_intercept(svm, TRAP_CR0_WRITE);
+ svm_set_intercept(svm, TRAP_CR4_WRITE);
+ svm_set_intercept(svm, TRAP_CR8_WRITE);
+
+ /* No support for enable_vmware_backdoor */
+ clr_exception_intercept(svm, GP_VECTOR);
+
+ /* Can't intercept XSETBV, HV can't modify XCR0 directly */
+ svm_clr_intercept(svm, INTERCEPT_XSETBV);
+
+ /* Clear intercepts on selected MSRs */
+ set_msr_interception(vcpu, svm->msrpm, MSR_EFER, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_CR_PAT, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+}
+
+void sev_init_vmcb(struct vcpu_svm *svm)
+{
+ svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
+ clr_exception_intercept(svm, UD_VECTOR);
+
+ if (sev_es_guest(svm->vcpu.kvm))
+ sev_es_init_vmcb(svm);
+}
+
+void sev_es_vcpu_reset(struct vcpu_svm *svm)
+{
+ /*
+ * Set the GHCB MSR value as per the GHCB specification when emulating
+ * vCPU RESET for an SEV-ES guest.
+ */
+ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX,
+ GHCB_VERSION_MIN,
+ sev_enc_bit));
+}
+
+void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa)
+{
+ /*
+ * As an SEV-ES guest, hardware will restore the host state on VMEXIT,
+ * of which one step is to perform a VMLOAD. KVM performs the
+ * corresponding VMSAVE in svm_prepare_guest_switch for both
+ * traditional and SEV-ES guests.
+ */
+
+ /* XCR0 is restored on VMEXIT, save the current host value */
+ hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+
+ /* PKRU is restored on VMEXIT, save the current host value */
+ hostsa->pkru = read_pkru();
+
+ /* MSR_IA32_XSS is restored on VMEXIT, save the currnet host value */
+ hostsa->xss = host_xss;
+}
+
+void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /* First SIPI: Use the values as initially set by the VMM */
+ if (!svm->sev_es.received_first_sipi) {
+ svm->sev_es.received_first_sipi = true;
+ return;
+ }
+
+ /*
+ * Subsequent SIPI: Return from an AP Reset Hold VMGEXIT, where
+ * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a
+ * non-zero value.
+ */
+ if (!svm->sev_es.ghcb)
+ return;
+
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
+}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
new file mode 100644
index 000000000..4a6638125
--- /dev/null
+++ b/arch/x86/kvm/svm/svm.c
@@ -0,0 +1,5172 @@
+#define pr_fmt(fmt) "SVM: " fmt
+
+#include <linux/kvm_host.h>
+
+#include "irq.h"
+#include "mmu.h"
+#include "kvm_cache_regs.h"
+#include "x86.h"
+#include "cpuid.h"
+#include "pmu.h"
+
+#include <linux/module.h>
+#include <linux/mod_devicetable.h>
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/amd-iommu.h>
+#include <linux/sched.h>
+#include <linux/trace_events.h>
+#include <linux/slab.h>
+#include <linux/hashtable.h>
+#include <linux/objtool.h>
+#include <linux/psp-sev.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/rwsem.h>
+#include <linux/cc_platform.h>
+
+#include <asm/apic.h>
+#include <asm/perf_event.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+#include <asm/debugreg.h>
+#include <asm/kvm_para.h>
+#include <asm/irq_remapping.h>
+#include <asm/spec-ctrl.h>
+#include <asm/cpu_device_id.h>
+#include <asm/traps.h>
+#include <asm/fpu/api.h>
+
+#include <asm/virtext.h>
+#include "trace.h"
+
+#include "svm.h"
+#include "svm_ops.h"
+
+#include "kvm_onhyperv.h"
+#include "svm_onhyperv.h"
+
+MODULE_AUTHOR("Qumranet");
+MODULE_LICENSE("GPL");
+
+#ifdef MODULE
+static const struct x86_cpu_id svm_cpu_id[] = {
+ X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
+#endif
+
+#define SEG_TYPE_LDT 2
+#define SEG_TYPE_BUSY_TSS16 3
+
+static bool erratum_383_found __read_mostly;
+
+u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
+
+/*
+ * Set osvw_len to higher value when updated Revision Guides
+ * are published and we know what the new status bits are
+ */
+static uint64_t osvw_len = 4, osvw_status;
+
+static DEFINE_PER_CPU(u64, current_tsc_ratio);
+
+#define X2APIC_MSR(x) (APIC_BASE_MSR + (x >> 4))
+
+static const struct svm_direct_access_msrs {
+ u32 index; /* Index of the MSR */
+ bool always; /* True if intercept is initially cleared */
+} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
+ { .index = MSR_STAR, .always = true },
+ { .index = MSR_IA32_SYSENTER_CS, .always = true },
+ { .index = MSR_IA32_SYSENTER_EIP, .always = false },
+ { .index = MSR_IA32_SYSENTER_ESP, .always = false },
+#ifdef CONFIG_X86_64
+ { .index = MSR_GS_BASE, .always = true },
+ { .index = MSR_FS_BASE, .always = true },
+ { .index = MSR_KERNEL_GS_BASE, .always = true },
+ { .index = MSR_LSTAR, .always = true },
+ { .index = MSR_CSTAR, .always = true },
+ { .index = MSR_SYSCALL_MASK, .always = true },
+#endif
+ { .index = MSR_IA32_SPEC_CTRL, .always = false },
+ { .index = MSR_IA32_PRED_CMD, .always = false },
+ { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
+ { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
+ { .index = MSR_IA32_LASTINTFROMIP, .always = false },
+ { .index = MSR_IA32_LASTINTTOIP, .always = false },
+ { .index = MSR_EFER, .always = false },
+ { .index = MSR_IA32_CR_PAT, .always = false },
+ { .index = MSR_AMD64_SEV_ES_GHCB, .always = true },
+ { .index = MSR_TSC_AUX, .always = false },
+ { .index = X2APIC_MSR(APIC_ID), .always = false },
+ { .index = X2APIC_MSR(APIC_LVR), .always = false },
+ { .index = X2APIC_MSR(APIC_TASKPRI), .always = false },
+ { .index = X2APIC_MSR(APIC_ARBPRI), .always = false },
+ { .index = X2APIC_MSR(APIC_PROCPRI), .always = false },
+ { .index = X2APIC_MSR(APIC_EOI), .always = false },
+ { .index = X2APIC_MSR(APIC_RRR), .always = false },
+ { .index = X2APIC_MSR(APIC_LDR), .always = false },
+ { .index = X2APIC_MSR(APIC_DFR), .always = false },
+ { .index = X2APIC_MSR(APIC_SPIV), .always = false },
+ { .index = X2APIC_MSR(APIC_ISR), .always = false },
+ { .index = X2APIC_MSR(APIC_TMR), .always = false },
+ { .index = X2APIC_MSR(APIC_IRR), .always = false },
+ { .index = X2APIC_MSR(APIC_ESR), .always = false },
+ { .index = X2APIC_MSR(APIC_ICR), .always = false },
+ { .index = X2APIC_MSR(APIC_ICR2), .always = false },
+
+ /*
+ * Note:
+ * AMD does not virtualize APIC TSC-deadline timer mode, but it is
+ * emulated by KVM. When setting APIC LVTT (0x832) register bit 18,
+ * the AVIC hardware would generate GP fault. Therefore, always
+ * intercept the MSR 0x832, and do not setup direct_access_msr.
+ */
+ { .index = X2APIC_MSR(APIC_LVTTHMR), .always = false },
+ { .index = X2APIC_MSR(APIC_LVTPC), .always = false },
+ { .index = X2APIC_MSR(APIC_LVT0), .always = false },
+ { .index = X2APIC_MSR(APIC_LVT1), .always = false },
+ { .index = X2APIC_MSR(APIC_LVTERR), .always = false },
+ { .index = X2APIC_MSR(APIC_TMICT), .always = false },
+ { .index = X2APIC_MSR(APIC_TMCCT), .always = false },
+ { .index = X2APIC_MSR(APIC_TDCR), .always = false },
+ { .index = MSR_INVALID, .always = false },
+};
+
+/*
+ * These 2 parameters are used to config the controls for Pause-Loop Exiting:
+ * pause_filter_count: On processors that support Pause filtering(indicated
+ * by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
+ * count value. On VMRUN this value is loaded into an internal counter.
+ * Each time a pause instruction is executed, this counter is decremented
+ * until it reaches zero at which time a #VMEXIT is generated if pause
+ * intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
+ * Intercept Filtering for more details.
+ * This also indicate if ple logic enabled.
+ *
+ * pause_filter_thresh: In addition, some processor families support advanced
+ * pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound on
+ * the amount of time a guest is allowed to execute in a pause loop.
+ * In this mode, a 16-bit pause filter threshold field is added in the
+ * VMCB. The threshold value is a cycle count that is used to reset the
+ * pause counter. As with simple pause filtering, VMRUN loads the pause
+ * count value from VMCB into an internal counter. Then, on each pause
+ * instruction the hardware checks the elapsed number of cycles since
+ * the most recent pause instruction against the pause filter threshold.
+ * If the elapsed cycle count is greater than the pause filter threshold,
+ * then the internal pause count is reloaded from the VMCB and execution
+ * continues. If the elapsed cycle count is less than the pause filter
+ * threshold, then the internal pause count is decremented. If the count
+ * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
+ * triggered. If advanced pause filtering is supported and pause filter
+ * threshold field is set to zero, the filter will operate in the simpler,
+ * count only mode.
+ */
+
+static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
+module_param(pause_filter_thresh, ushort, 0444);
+
+static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
+module_param(pause_filter_count, ushort, 0444);
+
+/* Default doubles per-vcpu window every exit. */
+static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
+module_param(pause_filter_count_grow, ushort, 0444);
+
+/* Default resets per-vcpu window every exit to pause_filter_count. */
+static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
+module_param(pause_filter_count_shrink, ushort, 0444);
+
+/* Default is to compute the maximum so we can never overflow. */
+static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
+module_param(pause_filter_count_max, ushort, 0444);
+
+/*
+ * Use nested page tables by default. Note, NPT may get forced off by
+ * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
+ */
+bool npt_enabled = true;
+module_param_named(npt, npt_enabled, bool, 0444);
+
+/* allow nested virtualization in KVM/SVM */
+static int nested = true;
+module_param(nested, int, S_IRUGO);
+
+/* enable/disable Next RIP Save */
+static int nrips = true;
+module_param(nrips, int, 0444);
+
+/* enable/disable Virtual VMLOAD VMSAVE */
+static int vls = true;
+module_param(vls, int, 0444);
+
+/* enable/disable Virtual GIF */
+int vgif = true;
+module_param(vgif, int, 0444);
+
+/* enable/disable LBR virtualization */
+static int lbrv = true;
+module_param(lbrv, int, 0444);
+
+static int tsc_scaling = true;
+module_param(tsc_scaling, int, 0444);
+
+/*
+ * enable / disable AVIC. Because the defaults differ for APICv
+ * support between VMX and SVM we cannot use module_param_named.
+ */
+static bool avic;
+module_param(avic, bool, 0444);
+
+bool __read_mostly dump_invalid_vmcb;
+module_param(dump_invalid_vmcb, bool, 0644);
+
+
+bool intercept_smi = true;
+module_param(intercept_smi, bool, 0444);
+
+
+static bool svm_gp_erratum_intercept = true;
+
+static u8 rsm_ins_bytes[] = "\x0f\xaa";
+
+static unsigned long iopm_base;
+
+struct kvm_ldttss_desc {
+ u16 limit0;
+ u16 base0;
+ unsigned base1:8, type:5, dpl:2, p:1;
+ unsigned limit1:4, zero0:3, g:1, base2:8;
+ u32 base3;
+ u32 zero1;
+} __attribute__((packed));
+
+DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
+
+/*
+ * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via
+ * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
+ *
+ * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
+ * defer the restoration of TSC_AUX until the CPU returns to userspace.
+ */
+static int tsc_aux_uret_slot __read_mostly = -1;
+
+static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
+
+#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
+#define MSRS_RANGE_SIZE 2048
+#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
+
+u32 svm_msrpm_offset(u32 msr)
+{
+ u32 offset;
+ int i;
+
+ for (i = 0; i < NUM_MSR_MAPS; i++) {
+ if (msr < msrpm_ranges[i] ||
+ msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
+ continue;
+
+ offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
+ offset += (i * MSRS_RANGE_SIZE); /* add range offset */
+
+ /* Now we have the u8 offset - but need the u32 offset */
+ return offset / 4;
+ }
+
+ /* MSR not in any range */
+ return MSR_INVALID;
+}
+
+static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);
+
+static int get_npt_level(void)
+{
+#ifdef CONFIG_X86_64
+ return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
+#else
+ return PT32E_ROOT_LEVEL;
+#endif
+}
+
+int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 old_efer = vcpu->arch.efer;
+ vcpu->arch.efer = efer;
+
+ if (!npt_enabled) {
+ /* Shadow paging assumes NX to be available. */
+ efer |= EFER_NX;
+
+ if (!(efer & EFER_LMA))
+ efer &= ~EFER_LME;
+ }
+
+ if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
+ if (!(efer & EFER_SVME)) {
+ svm_leave_nested(vcpu);
+ svm_set_gif(svm, true);
+ /* #GP intercept is still needed for vmware backdoor */
+ if (!enable_vmware_backdoor)
+ clr_exception_intercept(svm, GP_VECTOR);
+
+ /*
+ * Free the nested guest state, unless we are in SMM.
+ * In this case we will return to the nested guest
+ * as soon as we leave SMM.
+ */
+ if (!is_smm(vcpu))
+ svm_free_nested(svm);
+
+ } else {
+ int ret = svm_allocate_nested(svm);
+
+ if (ret) {
+ vcpu->arch.efer = old_efer;
+ return ret;
+ }
+
+ /*
+ * Never intercept #GP for SEV guests, KVM can't
+ * decrypt guest memory to workaround the erratum.
+ */
+ if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
+ set_exception_intercept(svm, GP_VECTOR);
+ }
+ }
+
+ svm->vmcb->save.efer = efer | EFER_SVME;
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
+ return 0;
+}
+
+static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 ret = 0;
+
+ if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
+ ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
+ return ret;
+}
+
+static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (mask == 0)
+ svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
+ else
+ svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
+
+}
+static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len);
+
+static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
+ bool commit_side_effects)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long old_rflags;
+
+ /*
+ * SEV-ES does not expose the next RIP. The RIP update is controlled by
+ * the type of exit and the #VC handler in the guest.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ goto done;
+
+ if (nrips && svm->vmcb->control.next_rip != 0) {
+ WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
+ svm->next_rip = svm->vmcb->control.next_rip;
+ }
+
+ if (!svm->next_rip) {
+ /*
+ * FIXME: Drop this when kvm_emulate_instruction() does the
+ * right thing and treats "can't emulate" as outright failure
+ * for EMULTYPE_SKIP.
+ */
+ if (!svm_can_emulate_instruction(vcpu, EMULTYPE_SKIP, NULL, 0))
+ return 0;
+
+ if (unlikely(!commit_side_effects))
+ old_rflags = svm->vmcb->save.rflags;
+
+ if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
+ return 0;
+
+ if (unlikely(!commit_side_effects))
+ svm->vmcb->save.rflags = old_rflags;
+ } else {
+ kvm_rip_write(vcpu, svm->next_rip);
+ }
+
+done:
+ if (likely(commit_side_effects))
+ svm_set_interrupt_shadow(vcpu, 0);
+
+ return 1;
+}
+
+static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ return __svm_skip_emulated_instruction(vcpu, true);
+}
+
+static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
+{
+ unsigned long rip, old_rip = kvm_rip_read(vcpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * Due to architectural shortcomings, the CPU doesn't always provide
+ * NextRIP, e.g. if KVM intercepted an exception that occurred while
+ * the CPU was vectoring an INTO/INT3 in the guest. Temporarily skip
+ * the instruction even if NextRIP is supported to acquire the next
+ * RIP so that it can be shoved into the NextRIP field, otherwise
+ * hardware will fail to advance guest RIP during event injection.
+ * Drop the exception/interrupt if emulation fails and effectively
+ * retry the instruction, it's the least awful option. If NRIPS is
+ * in use, the skip must not commit any side effects such as clearing
+ * the interrupt shadow or RFLAGS.RF.
+ */
+ if (!__svm_skip_emulated_instruction(vcpu, !nrips))
+ return -EIO;
+
+ rip = kvm_rip_read(vcpu);
+
+ /*
+ * Save the injection information, even when using next_rip, as the
+ * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
+ * doesn't complete due to a VM-Exit occurring while the CPU is
+ * vectoring the event. Decoding the instruction isn't guaranteed to
+ * work as there may be no backing instruction, e.g. if the event is
+ * being injected by L1 for L2, or if the guest is patching INT3 into
+ * a different instruction.
+ */
+ svm->soft_int_injected = true;
+ svm->soft_int_csbase = svm->vmcb->save.cs.base;
+ svm->soft_int_old_rip = old_rip;
+ svm->soft_int_next_rip = rip;
+
+ if (nrips)
+ kvm_rip_write(vcpu, old_rip);
+
+ if (static_cpu_has(X86_FEATURE_NRIPS))
+ svm->vmcb->control.next_rip = rip;
+
+ return 0;
+}
+
+static void svm_inject_exception(struct kvm_vcpu *vcpu)
+{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ kvm_deliver_exception_payload(vcpu, ex);
+
+ if (kvm_exception_is_soft(ex->vector) &&
+ svm_update_soft_interrupt_rip(vcpu))
+ return;
+
+ svm->vmcb->control.event_inj = ex->vector
+ | SVM_EVTINJ_VALID
+ | (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
+ | SVM_EVTINJ_TYPE_EXEPT;
+ svm->vmcb->control.event_inj_err = ex->error_code;
+}
+
+static void svm_init_erratum_383(void)
+{
+ u32 low, high;
+ int err;
+ u64 val;
+
+ if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
+ return;
+
+ /* Use _safe variants to not break nested virtualization */
+ val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
+ if (err)
+ return;
+
+ val |= (1ULL << 47);
+
+ low = lower_32_bits(val);
+ high = upper_32_bits(val);
+
+ native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
+
+ erratum_383_found = true;
+}
+
+static void svm_init_osvw(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Guests should see errata 400 and 415 as fixed (assuming that
+ * HLT and IO instructions are intercepted).
+ */
+ vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
+ vcpu->arch.osvw.status = osvw_status & ~(6ULL);
+
+ /*
+ * By increasing VCPU's osvw.length to 3 we are telling the guest that
+ * all osvw.status bits inside that length, including bit 0 (which is
+ * reserved for erratum 298), are valid. However, if host processor's
+ * osvw_len is 0 then osvw_status[0] carries no information. We need to
+ * be conservative here and therefore we tell the guest that erratum 298
+ * is present (because we really don't know).
+ */
+ if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
+ vcpu->arch.osvw.status |= 1;
+}
+
+static int has_svm(void)
+{
+ const char *msg;
+
+ if (!cpu_has_svm(&msg)) {
+ printk(KERN_INFO "has_svm: %s\n", msg);
+ return 0;
+ }
+
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
+ pr_info("KVM is unsupported when running as an SEV guest\n");
+ return 0;
+ }
+
+ return 1;
+}
+
+void __svm_write_tsc_multiplier(u64 multiplier)
+{
+ preempt_disable();
+
+ if (multiplier == __this_cpu_read(current_tsc_ratio))
+ goto out;
+
+ wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
+ __this_cpu_write(current_tsc_ratio, multiplier);
+out:
+ preempt_enable();
+}
+
+static void svm_hardware_disable(void)
+{
+ /* Make sure we clean up behind us */
+ if (tsc_scaling)
+ __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
+
+ cpu_svm_disable();
+
+ amd_pmu_disable_virt();
+}
+
+static int svm_hardware_enable(void)
+{
+
+ struct svm_cpu_data *sd;
+ uint64_t efer;
+ struct desc_struct *gdt;
+ int me = raw_smp_processor_id();
+
+ rdmsrl(MSR_EFER, efer);
+ if (efer & EFER_SVME)
+ return -EBUSY;
+
+ if (!has_svm()) {
+ pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
+ return -EINVAL;
+ }
+ sd = per_cpu_ptr(&svm_data, me);
+ sd->asid_generation = 1;
+ sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
+ sd->next_asid = sd->max_asid + 1;
+ sd->min_asid = max_sev_asid + 1;
+
+ gdt = get_current_gdt_rw();
+ sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
+
+ wrmsrl(MSR_EFER, efer | EFER_SVME);
+
+ wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa);
+
+ if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ /*
+ * Set the default value, even if we don't use TSC scaling
+ * to avoid having stale value in the msr
+ */
+ __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
+ }
+
+
+ /*
+ * Get OSVW bits.
+ *
+ * Note that it is possible to have a system with mixed processor
+ * revisions and therefore different OSVW bits. If bits are not the same
+ * on different processors then choose the worst case (i.e. if erratum
+ * is present on one processor and not on another then assume that the
+ * erratum is present everywhere).
+ */
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
+ uint64_t len, status = 0;
+ int err;
+
+ len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
+ if (!err)
+ status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
+ &err);
+
+ if (err)
+ osvw_status = osvw_len = 0;
+ else {
+ if (len < osvw_len)
+ osvw_len = len;
+ osvw_status |= status;
+ osvw_status &= (1ULL << osvw_len) - 1;
+ }
+ } else
+ osvw_status = osvw_len = 0;
+
+ svm_init_erratum_383();
+
+ amd_pmu_enable_virt();
+
+ return 0;
+}
+
+static void svm_cpu_uninit(int cpu)
+{
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
+
+ if (!sd->save_area)
+ return;
+
+ kfree(sd->sev_vmcbs);
+ __free_page(sd->save_area);
+ sd->save_area_pa = 0;
+ sd->save_area = NULL;
+}
+
+static int svm_cpu_init(int cpu)
+{
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
+ int ret = -ENOMEM;
+
+ memset(sd, 0, sizeof(struct svm_cpu_data));
+ sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!sd->save_area)
+ return ret;
+
+ ret = sev_cpu_init(sd);
+ if (ret)
+ goto free_save_area;
+
+ sd->save_area_pa = __sme_page_pa(sd->save_area);
+ return 0;
+
+free_save_area:
+ __free_page(sd->save_area);
+ sd->save_area = NULL;
+ return ret;
+
+}
+
+static int direct_access_msr_slot(u32 msr)
+{
+ u32 i;
+
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
+ if (direct_access_msrs[i].index == msr)
+ return i;
+
+ return -ENOENT;
+}
+
+static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
+ int write)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int slot = direct_access_msr_slot(msr);
+
+ if (slot == -ENOENT)
+ return;
+
+ /* Set the shadow bitmaps to the desired intercept states */
+ if (read)
+ set_bit(slot, svm->shadow_msr_intercept.read);
+ else
+ clear_bit(slot, svm->shadow_msr_intercept.read);
+
+ if (write)
+ set_bit(slot, svm->shadow_msr_intercept.write);
+ else
+ clear_bit(slot, svm->shadow_msr_intercept.write);
+}
+
+static bool valid_msr_intercept(u32 index)
+{
+ return direct_access_msr_slot(index) != -ENOENT;
+}
+
+static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
+{
+ u8 bit_write;
+ unsigned long tmp;
+ u32 offset;
+ u32 *msrpm;
+
+ /*
+ * For non-nested case:
+ * If the L01 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ *
+ * For nested case:
+ * If the L02 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ */
+ msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
+ to_svm(vcpu)->msrpm;
+
+ offset = svm_msrpm_offset(msr);
+ bit_write = 2 * (msr & 0x0f) + 1;
+ tmp = msrpm[offset];
+
+ BUG_ON(offset == MSR_INVALID);
+
+ return !!test_bit(bit_write, &tmp);
+}
+
+static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
+ u32 msr, int read, int write)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u8 bit_read, bit_write;
+ unsigned long tmp;
+ u32 offset;
+
+ /*
+ * If this warning triggers extend the direct_access_msrs list at the
+ * beginning of the file
+ */
+ WARN_ON(!valid_msr_intercept(msr));
+
+ /* Enforce non allowed MSRs to trap */
+ if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
+ read = 0;
+
+ if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
+ write = 0;
+
+ offset = svm_msrpm_offset(msr);
+ bit_read = 2 * (msr & 0x0f);
+ bit_write = 2 * (msr & 0x0f) + 1;
+ tmp = msrpm[offset];
+
+ BUG_ON(offset == MSR_INVALID);
+
+ read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
+ write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
+
+ msrpm[offset] = tmp;
+
+ svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
+ svm->nested.force_msr_bitmap_recalc = true;
+}
+
+void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
+ int read, int write)
+{
+ set_shadow_msr_intercept(vcpu, msr, read, write);
+ set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
+}
+
+u32 *svm_vcpu_alloc_msrpm(void)
+{
+ unsigned int order = get_order(MSRPM_SIZE);
+ struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
+ u32 *msrpm;
+
+ if (!pages)
+ return NULL;
+
+ msrpm = page_address(pages);
+ memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
+
+ return msrpm;
+}
+
+void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
+{
+ int i;
+
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+ if (!direct_access_msrs[i].always)
+ continue;
+ set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
+ }
+}
+
+void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
+{
+ int i;
+
+ if (intercept == svm->x2avic_msrs_intercepted)
+ return;
+
+ if (avic_mode != AVIC_MODE_X2)
+ return;
+
+ for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
+ int index = direct_access_msrs[i].index;
+
+ if ((index < APIC_BASE_MSR) ||
+ (index > APIC_BASE_MSR + 0xff))
+ continue;
+ set_msr_interception(&svm->vcpu, svm->msrpm, index,
+ !intercept, !intercept);
+ }
+
+ svm->x2avic_msrs_intercepted = intercept;
+}
+
+void svm_vcpu_free_msrpm(u32 *msrpm)
+{
+ __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
+}
+
+static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 i;
+
+ /*
+ * Set intercept permissions for all direct access MSRs again. They
+ * will automatically get filtered through the MSR filter, so we are
+ * back in sync after this.
+ */
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+ u32 msr = direct_access_msrs[i].index;
+ u32 read = test_bit(i, svm->shadow_msr_intercept.read);
+ u32 write = test_bit(i, svm->shadow_msr_intercept.write);
+
+ set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
+ }
+}
+
+static void add_msr_offset(u32 offset)
+{
+ int i;
+
+ for (i = 0; i < MSRPM_OFFSETS; ++i) {
+
+ /* Offset already in list? */
+ if (msrpm_offsets[i] == offset)
+ return;
+
+ /* Slot used by another offset? */
+ if (msrpm_offsets[i] != MSR_INVALID)
+ continue;
+
+ /* Add offset to list */
+ msrpm_offsets[i] = offset;
+
+ return;
+ }
+
+ /*
+ * If this BUG triggers the msrpm_offsets table has an overflow. Just
+ * increase MSRPM_OFFSETS in this case.
+ */
+ BUG();
+}
+
+static void init_msrpm_offsets(void)
+{
+ int i;
+
+ memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
+
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+ u32 offset;
+
+ offset = svm_msrpm_offset(direct_access_msrs[i].index);
+ BUG_ON(offset == MSR_INVALID);
+
+ add_msr_offset(offset);
+ }
+}
+
+void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
+{
+ to_vmcb->save.dbgctl = from_vmcb->save.dbgctl;
+ to_vmcb->save.br_from = from_vmcb->save.br_from;
+ to_vmcb->save.br_to = from_vmcb->save.br_to;
+ to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from;
+ to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to;
+
+ vmcb_mark_dirty(to_vmcb, VMCB_LBR);
+}
+
+static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+
+ /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
+ if (is_guest_mode(vcpu))
+ svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
+}
+
+static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
+
+ /*
+ * Move the LBR msrs back to the vmcb01 to avoid copying them
+ * on nested guest entries.
+ */
+ if (is_guest_mode(vcpu))
+ svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
+}
+
+static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index)
+{
+ /*
+ * If the LBR virtualization is disabled, the LBR msrs are always
+ * kept in the vmcb01 to avoid copying them on nested guest entries.
+ *
+ * If nested, and the LBR virtualization is enabled/disabled, the msrs
+ * are moved between the vmcb01 and vmcb02 as needed.
+ */
+ struct vmcb *vmcb =
+ (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ?
+ svm->vmcb : svm->vmcb01.ptr;
+
+ switch (index) {
+ case MSR_IA32_DEBUGCTLMSR:
+ return vmcb->save.dbgctl;
+ case MSR_IA32_LASTBRANCHFROMIP:
+ return vmcb->save.br_from;
+ case MSR_IA32_LASTBRANCHTOIP:
+ return vmcb->save.br_to;
+ case MSR_IA32_LASTINTFROMIP:
+ return vmcb->save.last_excp_from;
+ case MSR_IA32_LASTINTTOIP:
+ return vmcb->save.last_excp_to;
+ default:
+ KVM_BUG(false, svm->vcpu.kvm,
+ "%s: Unknown MSR 0x%x", __func__, index);
+ return 0;
+ }
+}
+
+void svm_update_lbrv(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) &
+ DEBUGCTLMSR_LBR;
+
+ bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
+ LBR_CTL_ENABLE_MASK);
+
+ if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled))
+ if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
+ enable_lbrv = true;
+
+ if (enable_lbrv == current_enable_lbrv)
+ return;
+
+ if (enable_lbrv)
+ svm_enable_lbrv(vcpu);
+ else
+ svm_disable_lbrv(vcpu);
+}
+
+void disable_nmi_singlestep(struct vcpu_svm *svm)
+{
+ svm->nmi_singlestep = false;
+
+ if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
+ /* Clear our flags if they were not set by the guest */
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
+ svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
+ svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
+ }
+}
+
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ int old = control->pause_filter_count;
+
+ if (kvm_pause_in_guest(vcpu->kvm))
+ return;
+
+ control->pause_filter_count = __grow_ple_window(old,
+ pause_filter_count,
+ pause_filter_count_grow,
+ pause_filter_count_max);
+
+ if (control->pause_filter_count != old) {
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+ }
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ int old = control->pause_filter_count;
+
+ if (kvm_pause_in_guest(vcpu->kvm))
+ return;
+
+ control->pause_filter_count =
+ __shrink_ple_window(old,
+ pause_filter_count,
+ pause_filter_count_shrink,
+ pause_filter_count);
+ if (control->pause_filter_count != old) {
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+ }
+}
+
+static void svm_hardware_unsetup(void)
+{
+ int cpu;
+
+ sev_hardware_unsetup();
+
+ for_each_possible_cpu(cpu)
+ svm_cpu_uninit(cpu);
+
+ __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
+ get_order(IOPM_SIZE));
+ iopm_base = 0;
+}
+
+static void init_seg(struct vmcb_seg *seg)
+{
+ seg->selector = 0;
+ seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
+ SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
+ seg->limit = 0xffff;
+ seg->base = 0;
+}
+
+static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
+{
+ seg->selector = 0;
+ seg->attrib = SVM_SELECTOR_P_MASK | type;
+ seg->limit = 0xffff;
+ seg->base = 0;
+}
+
+static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return svm->nested.ctl.tsc_offset;
+}
+
+static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return svm->tsc_ratio_msr;
+}
+
+static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
+ svm->vmcb->control.tsc_offset = offset;
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+}
+
+static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
+{
+ __svm_write_tsc_multiplier(multiplier);
+}
+
+
+/* Evaluate instruction intercepts that depend on guest CPUID features. */
+static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
+ struct vcpu_svm *svm)
+{
+ /*
+ * Intercept INVPCID if shadow paging is enabled to sync/free shadow
+ * roots, or if INVPCID is disabled in the guest to inject #UD.
+ */
+ if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
+ if (!npt_enabled ||
+ !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
+ svm_set_intercept(svm, INTERCEPT_INVPCID);
+ else
+ svm_clr_intercept(svm, INTERCEPT_INVPCID);
+ }
+
+ if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
+ if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
+ svm_clr_intercept(svm, INTERCEPT_RDTSCP);
+ else
+ svm_set_intercept(svm, INTERCEPT_RDTSCP);
+ }
+}
+
+static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (guest_cpuid_is_intel(vcpu)) {
+ /*
+ * We must intercept SYSENTER_EIP and SYSENTER_ESP
+ * accesses because the processor only stores 32 bits.
+ * For the same reason we cannot use virtual VMLOAD/VMSAVE.
+ */
+ svm_set_intercept(svm, INTERCEPT_VMLOAD);
+ svm_set_intercept(svm, INTERCEPT_VMSAVE);
+ svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
+
+ svm->v_vmload_vmsave_enabled = false;
+ } else {
+ /*
+ * If hardware supports Virtual VMLOAD VMSAVE then enable it
+ * in VMCB and clear intercepts to avoid #VMEXIT.
+ */
+ if (vls) {
+ svm_clr_intercept(svm, INTERCEPT_VMLOAD);
+ svm_clr_intercept(svm, INTERCEPT_VMSAVE);
+ svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+ }
+ /* No need to intercept these MSRs */
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
+ }
+}
+
+static void init_vmcb(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+ struct vmcb_control_area *control = &vmcb->control;
+ struct vmcb_save_area *save = &vmcb->save;
+
+ svm_set_intercept(svm, INTERCEPT_CR0_READ);
+ svm_set_intercept(svm, INTERCEPT_CR3_READ);
+ svm_set_intercept(svm, INTERCEPT_CR4_READ);
+ svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
+ svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
+ svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
+ if (!kvm_vcpu_apicv_active(vcpu))
+ svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
+
+ set_dr_intercepts(svm);
+
+ set_exception_intercept(svm, PF_VECTOR);
+ set_exception_intercept(svm, UD_VECTOR);
+ set_exception_intercept(svm, MC_VECTOR);
+ set_exception_intercept(svm, AC_VECTOR);
+ set_exception_intercept(svm, DB_VECTOR);
+ /*
+ * Guest access to VMware backdoor ports could legitimately
+ * trigger #GP because of TSS I/O permission bitmap.
+ * We intercept those #GP and allow access to them anyway
+ * as VMware does. Don't intercept #GP for SEV guests as KVM can't
+ * decrypt guest memory to decode the faulting instruction.
+ */
+ if (enable_vmware_backdoor && !sev_guest(vcpu->kvm))
+ set_exception_intercept(svm, GP_VECTOR);
+
+ svm_set_intercept(svm, INTERCEPT_INTR);
+ svm_set_intercept(svm, INTERCEPT_NMI);
+
+ if (intercept_smi)
+ svm_set_intercept(svm, INTERCEPT_SMI);
+
+ svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+ svm_set_intercept(svm, INTERCEPT_RDPMC);
+ svm_set_intercept(svm, INTERCEPT_CPUID);
+ svm_set_intercept(svm, INTERCEPT_INVD);
+ svm_set_intercept(svm, INTERCEPT_INVLPG);
+ svm_set_intercept(svm, INTERCEPT_INVLPGA);
+ svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
+ svm_set_intercept(svm, INTERCEPT_MSR_PROT);
+ svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
+ svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
+ svm_set_intercept(svm, INTERCEPT_VMRUN);
+ svm_set_intercept(svm, INTERCEPT_VMMCALL);
+ svm_set_intercept(svm, INTERCEPT_VMLOAD);
+ svm_set_intercept(svm, INTERCEPT_VMSAVE);
+ svm_set_intercept(svm, INTERCEPT_STGI);
+ svm_set_intercept(svm, INTERCEPT_CLGI);
+ svm_set_intercept(svm, INTERCEPT_SKINIT);
+ svm_set_intercept(svm, INTERCEPT_WBINVD);
+ svm_set_intercept(svm, INTERCEPT_XSETBV);
+ svm_set_intercept(svm, INTERCEPT_RDPRU);
+ svm_set_intercept(svm, INTERCEPT_RSM);
+
+ if (!kvm_mwait_in_guest(vcpu->kvm)) {
+ svm_set_intercept(svm, INTERCEPT_MONITOR);
+ svm_set_intercept(svm, INTERCEPT_MWAIT);
+ }
+
+ if (!kvm_hlt_in_guest(vcpu->kvm))
+ svm_set_intercept(svm, INTERCEPT_HLT);
+
+ control->iopm_base_pa = __sme_set(iopm_base);
+ control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
+ control->int_ctl = V_INTR_MASKING_MASK;
+
+ init_seg(&save->es);
+ init_seg(&save->ss);
+ init_seg(&save->ds);
+ init_seg(&save->fs);
+ init_seg(&save->gs);
+
+ save->cs.selector = 0xf000;
+ save->cs.base = 0xffff0000;
+ /* Executable/Readable Code Segment */
+ save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
+ SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
+ save->cs.limit = 0xffff;
+
+ save->gdtr.base = 0;
+ save->gdtr.limit = 0xffff;
+ save->idtr.base = 0;
+ save->idtr.limit = 0xffff;
+
+ init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
+ init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
+
+ if (npt_enabled) {
+ /* Setup VMCB for Nested Paging */
+ control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
+ svm_clr_intercept(svm, INTERCEPT_INVLPG);
+ clr_exception_intercept(svm, PF_VECTOR);
+ svm_clr_intercept(svm, INTERCEPT_CR3_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
+ save->g_pat = vcpu->arch.pat;
+ save->cr3 = 0;
+ }
+ svm->current_vmcb->asid_generation = 0;
+ svm->asid = 0;
+
+ svm->nested.vmcb12_gpa = INVALID_GPA;
+ svm->nested.last_vmcb12_gpa = INVALID_GPA;
+
+ if (!kvm_pause_in_guest(vcpu->kvm)) {
+ control->pause_filter_count = pause_filter_count;
+ if (pause_filter_thresh)
+ control->pause_filter_thresh = pause_filter_thresh;
+ svm_set_intercept(svm, INTERCEPT_PAUSE);
+ } else {
+ svm_clr_intercept(svm, INTERCEPT_PAUSE);
+ }
+
+ svm_recalc_instruction_intercepts(vcpu, svm);
+
+ /*
+ * If the host supports V_SPEC_CTRL then disable the interception
+ * of MSR_IA32_SPEC_CTRL.
+ */
+ if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_init_vmcb(svm, vmcb);
+
+ if (vgif) {
+ svm_clr_intercept(svm, INTERCEPT_STGI);
+ svm_clr_intercept(svm, INTERCEPT_CLGI);
+ svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
+ }
+
+ if (sev_guest(vcpu->kvm))
+ sev_init_vmcb(svm);
+
+ svm_hv_init_vmcb(vmcb);
+ init_vmcb_after_set_cpuid(vcpu);
+
+ vmcb_mark_all_dirty(vmcb);
+
+ enable_gif(svm);
+}
+
+static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm_vcpu_init_msrpm(vcpu, svm->msrpm);
+
+ svm_init_osvw(vcpu);
+ vcpu->arch.microcode_version = 0x01000065;
+ svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;
+
+ if (sev_es_guest(vcpu->kvm))
+ sev_es_vcpu_reset(svm);
+}
+
+static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->spec_ctrl = 0;
+ svm->virt_spec_ctrl = 0;
+
+ init_vmcb(vcpu);
+
+ if (!init_event)
+ __svm_vcpu_reset(vcpu);
+}
+
+void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
+{
+ svm->current_vmcb = target_vmcb;
+ svm->vmcb = target_vmcb->ptr;
+}
+
+static int svm_vcpu_create(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm;
+ struct page *vmcb01_page;
+ struct page *vmsa_page = NULL;
+ int err;
+
+ BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
+ svm = to_svm(vcpu);
+
+ err = -ENOMEM;
+ vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!vmcb01_page)
+ goto out;
+
+ if (sev_es_guest(vcpu->kvm)) {
+ /*
+ * SEV-ES guests require a separate VMSA page used to contain
+ * the encrypted register state of the guest.
+ */
+ vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!vmsa_page)
+ goto error_free_vmcb_page;
+
+ /*
+ * SEV-ES guests maintain an encrypted version of their FPU
+ * state which is restored and saved on VMRUN and VMEXIT.
+ * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
+ * do xsave/xrstor on it.
+ */
+ fpstate_set_confidential(&vcpu->arch.guest_fpu);
+ }
+
+ err = avic_init_vcpu(svm);
+ if (err)
+ goto error_free_vmsa_page;
+
+ svm->msrpm = svm_vcpu_alloc_msrpm();
+ if (!svm->msrpm) {
+ err = -ENOMEM;
+ goto error_free_vmsa_page;
+ }
+
+ svm->x2avic_msrs_intercepted = true;
+
+ svm->vmcb01.ptr = page_address(vmcb01_page);
+ svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
+ svm_switch_vmcb(svm, &svm->vmcb01);
+
+ if (vmsa_page)
+ svm->sev_es.vmsa = page_address(vmsa_page);
+
+ svm->guest_state_loaded = false;
+
+ return 0;
+
+error_free_vmsa_page:
+ if (vmsa_page)
+ __free_page(vmsa_page);
+error_free_vmcb_page:
+ __free_page(vmcb01_page);
+out:
+ return err;
+}
+
+static void svm_clear_current_vmcb(struct vmcb *vmcb)
+{
+ int i;
+
+ for_each_online_cpu(i)
+ cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL);
+}
+
+static void svm_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * The vmcb page can be recycled, causing a false negative in
+ * svm_vcpu_load(). So, ensure that no logical CPU has this
+ * vmcb page recorded as its current vmcb.
+ */
+ svm_clear_current_vmcb(svm->vmcb);
+
+ svm_leave_nested(vcpu);
+ svm_free_nested(svm);
+
+ sev_free_vcpu(vcpu);
+
+ __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
+ __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
+}
+
+static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
+
+ if (sev_es_guest(vcpu->kvm))
+ sev_es_unmap_ghcb(svm);
+
+ if (svm->guest_state_loaded)
+ return;
+
+ /*
+ * Save additional host state that will be restored on VMEXIT (sev-es)
+ * or subsequent vmload of host save area.
+ */
+ vmsave(sd->save_area_pa);
+ if (sev_es_guest(vcpu->kvm)) {
+ struct sev_es_save_area *hostsa;
+ hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
+
+ sev_es_prepare_switch_to_guest(hostsa);
+ }
+
+ if (tsc_scaling)
+ __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
+
+ if (likely(tsc_aux_uret_slot >= 0))
+ kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
+
+ svm->guest_state_loaded = true;
+}
+
+static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
+{
+ to_svm(vcpu)->guest_state_loaded = false;
+}
+
+static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
+
+ if (sd->current_vmcb != svm->vmcb) {
+ sd->current_vmcb = svm->vmcb;
+
+ if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT))
+ indirect_branch_prediction_barrier();
+ }
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_vcpu_load(vcpu, cpu);
+}
+
+static void svm_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_vcpu_put(vcpu);
+
+ svm_prepare_host_switch(vcpu);
+
+ ++vcpu->stat.host_state_reload;
+}
+
+static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long rflags = svm->vmcb->save.rflags;
+
+ if (svm->nmi_singlestep) {
+ /* Hide our flags if they were not set by the guest */
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
+ rflags &= ~X86_EFLAGS_TF;
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
+ rflags &= ~X86_EFLAGS_RF;
+ }
+ return rflags;
+}
+
+static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ if (to_svm(vcpu)->nmi_singlestep)
+ rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+
+ /*
+ * Any change of EFLAGS.VM is accompanied by a reload of SS
+ * (caused by either a task switch or an inter-privilege IRET),
+ * so we do not need to update the CPL here.
+ */
+ to_svm(vcpu)->vmcb->save.rflags = rflags;
+}
+
+static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
+{
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
+
+ return sev_es_guest(vcpu->kvm)
+ ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
+ : kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
+}
+
+static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+{
+ kvm_register_mark_available(vcpu, reg);
+
+ switch (reg) {
+ case VCPU_EXREG_PDPTR:
+ /*
+ * When !npt_enabled, mmu->pdptrs[] is already available since
+ * it is always updated per SDM when moving to CRs.
+ */
+ if (npt_enabled)
+ load_pdptrs(vcpu, kvm_read_cr3(vcpu));
+ break;
+ default:
+ KVM_BUG_ON(1, vcpu->kvm);
+ }
+}
+
+static void svm_set_vintr(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control;
+
+ /*
+ * The following fields are ignored when AVIC is enabled
+ */
+ WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
+
+ svm_set_intercept(svm, INTERCEPT_VINTR);
+
+ /*
+ * This is just a dummy VINTR to actually cause a vmexit to happen.
+ * Actual injection of virtual interrupts happens through EVENTINJ.
+ */
+ control = &svm->vmcb->control;
+ control->int_vector = 0x0;
+ control->int_ctl &= ~V_INTR_PRIO_MASK;
+ control->int_ctl |= V_IRQ_MASK |
+ ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
+}
+
+static void svm_clear_vintr(struct vcpu_svm *svm)
+{
+ svm_clr_intercept(svm, INTERCEPT_VINTR);
+
+ /* Drop int_ctl fields related to VINTR injection. */
+ svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
+ if (is_guest_mode(&svm->vcpu)) {
+ svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
+
+ WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
+ (svm->nested.ctl.int_ctl & V_TPR_MASK));
+
+ svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
+ V_IRQ_INJECTION_BITS_MASK;
+
+ svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
+ }
+
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
+}
+
+static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
+{
+ struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+ struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
+
+ switch (seg) {
+ case VCPU_SREG_CS: return &save->cs;
+ case VCPU_SREG_DS: return &save->ds;
+ case VCPU_SREG_ES: return &save->es;
+ case VCPU_SREG_FS: return &save01->fs;
+ case VCPU_SREG_GS: return &save01->gs;
+ case VCPU_SREG_SS: return &save->ss;
+ case VCPU_SREG_TR: return &save01->tr;
+ case VCPU_SREG_LDTR: return &save01->ldtr;
+ }
+ BUG();
+ return NULL;
+}
+
+static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+ return s->base;
+}
+
+static void svm_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+ var->base = s->base;
+ var->limit = s->limit;
+ var->selector = s->selector;
+ var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
+ var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
+ var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
+ var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
+ var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
+ var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
+ var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
+
+ /*
+ * AMD CPUs circa 2014 track the G bit for all segments except CS.
+ * However, the SVM spec states that the G bit is not observed by the
+ * CPU, and some VMware virtual CPUs drop the G bit for all segments.
+ * So let's synthesize a legal G bit for all segments, this helps
+ * running KVM nested. It also helps cross-vendor migration, because
+ * Intel's vmentry has a check on the 'G' bit.
+ */
+ var->g = s->limit > 0xfffff;
+
+ /*
+ * AMD's VMCB does not have an explicit unusable field, so emulate it
+ * for cross vendor migration purposes by "not present"
+ */
+ var->unusable = !var->present;
+
+ switch (seg) {
+ case VCPU_SREG_TR:
+ /*
+ * Work around a bug where the busy flag in the tr selector
+ * isn't exposed
+ */
+ var->type |= 0x2;
+ break;
+ case VCPU_SREG_DS:
+ case VCPU_SREG_ES:
+ case VCPU_SREG_FS:
+ case VCPU_SREG_GS:
+ /*
+ * The accessed bit must always be set in the segment
+ * descriptor cache, although it can be cleared in the
+ * descriptor, the cached bit always remains at 1. Since
+ * Intel has a check on this, set it here to support
+ * cross-vendor migration.
+ */
+ if (!var->unusable)
+ var->type |= 0x1;
+ break;
+ case VCPU_SREG_SS:
+ /*
+ * On AMD CPUs sometimes the DB bit in the segment
+ * descriptor is left as 1, although the whole segment has
+ * been made unusable. Clear it here to pass an Intel VMX
+ * entry check when cross vendor migrating.
+ */
+ if (var->unusable)
+ var->db = 0;
+ /* This is symmetric with svm_set_segment() */
+ var->dpl = to_svm(vcpu)->vmcb->save.cpl;
+ break;
+ }
+}
+
+static int svm_get_cpl(struct kvm_vcpu *vcpu)
+{
+ struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+
+ return save->cpl;
+}
+
+static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+ struct kvm_segment cs;
+
+ svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ *db = cs.db;
+ *l = cs.l;
+}
+
+static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ dt->size = svm->vmcb->save.idtr.limit;
+ dt->address = svm->vmcb->save.idtr.base;
+}
+
+static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.idtr.limit = dt->size;
+ svm->vmcb->save.idtr.base = dt->address ;
+ vmcb_mark_dirty(svm->vmcb, VMCB_DT);
+}
+
+static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ dt->size = svm->vmcb->save.gdtr.limit;
+ dt->address = svm->vmcb->save.gdtr.base;
+}
+
+static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.gdtr.limit = dt->size;
+ svm->vmcb->save.gdtr.base = dt->address ;
+ vmcb_mark_dirty(svm->vmcb, VMCB_DT);
+}
+
+static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * For guests that don't set guest_state_protected, the cr3 update is
+ * handled via kvm_mmu_load() while entering the guest. For guests
+ * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
+ * VMCB save area now, since the save area will become the initial
+ * contents of the VMSA, and future VMCB save area updates won't be
+ * seen.
+ */
+ if (sev_es_guest(vcpu->kvm)) {
+ svm->vmcb->save.cr3 = cr3;
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
+ }
+}
+
+static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ return true;
+}
+
+void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 hcr0 = cr0;
+ bool old_paging = is_paging(vcpu);
+
+#ifdef CONFIG_X86_64
+ if (vcpu->arch.efer & EFER_LME) {
+ if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
+ vcpu->arch.efer |= EFER_LMA;
+ if (!vcpu->arch.guest_state_protected)
+ svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
+ }
+
+ if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
+ vcpu->arch.efer &= ~EFER_LMA;
+ if (!vcpu->arch.guest_state_protected)
+ svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
+ }
+ }
+#endif
+ vcpu->arch.cr0 = cr0;
+
+ if (!npt_enabled) {
+ hcr0 |= X86_CR0_PG | X86_CR0_WP;
+ if (old_paging != is_paging(vcpu))
+ svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
+ }
+
+ /*
+ * re-enable caching here because the QEMU bios
+ * does not do it - this results in some delay at
+ * reboot
+ */
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
+ hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+
+ svm->vmcb->save.cr0 = hcr0;
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
+
+ /*
+ * SEV-ES guests must always keep the CR intercepts cleared. CR
+ * tracking is done using the CR write traps.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return;
+
+ if (hcr0 == cr0) {
+ /* Selective CR0 write remains on. */
+ svm_clr_intercept(svm, INTERCEPT_CR0_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
+ } else {
+ svm_set_intercept(svm, INTERCEPT_CR0_READ);
+ svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
+ }
+}
+
+static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ return true;
+}
+
+void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
+ unsigned long old_cr4 = vcpu->arch.cr4;
+
+ if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
+ svm_flush_tlb_current(vcpu);
+
+ vcpu->arch.cr4 = cr4;
+ if (!npt_enabled) {
+ cr4 |= X86_CR4_PAE;
+
+ if (!is_paging(vcpu))
+ cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
+ }
+ cr4 |= host_cr4_mce;
+ to_svm(vcpu)->vmcb->save.cr4 = cr4;
+ vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
+
+ if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
+ kvm_update_cpuid_runtime(vcpu);
+}
+
+static void svm_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+ s->base = var->base;
+ s->limit = var->limit;
+ s->selector = var->selector;
+ s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
+ s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
+ s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
+ s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
+ s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
+ s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
+ s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
+ s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
+
+ /*
+ * This is always accurate, except if SYSRET returned to a segment
+ * with SS.DPL != 3. Intel does not have this quirk, and always
+ * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
+ * would entail passing the CPL to userspace and back.
+ */
+ if (seg == VCPU_SREG_SS)
+ /* This is symmetric with svm_get_segment() */
+ svm->vmcb->save.cpl = (var->dpl & 3);
+
+ vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
+}
+
+static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ clr_exception_intercept(svm, BP_VECTOR);
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+ set_exception_intercept(svm, BP_VECTOR);
+ }
+}
+
+static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
+{
+ if (sd->next_asid > sd->max_asid) {
+ ++sd->asid_generation;
+ sd->next_asid = sd->min_asid;
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
+ vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
+ }
+
+ svm->current_vmcb->asid_generation = sd->asid_generation;
+ svm->asid = sd->next_asid++;
+}
+
+static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
+{
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (svm->vcpu.arch.guest_state_protected)
+ return;
+
+ if (unlikely(value != vmcb->save.dr6)) {
+ vmcb->save.dr6 = value;
+ vmcb_mark_dirty(vmcb, VMCB_DR);
+ }
+}
+
+static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (vcpu->arch.guest_state_protected)
+ return;
+
+ get_debugreg(vcpu->arch.db[0], 0);
+ get_debugreg(vcpu->arch.db[1], 1);
+ get_debugreg(vcpu->arch.db[2], 2);
+ get_debugreg(vcpu->arch.db[3], 3);
+ /*
+ * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
+ * because db_interception might need it. We can do it before vmentry.
+ */
+ vcpu->arch.dr6 = svm->vmcb->save.dr6;
+ vcpu->arch.dr7 = svm->vmcb->save.dr7;
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
+ set_dr_intercepts(svm);
+}
+
+static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (vcpu->arch.guest_state_protected)
+ return;
+
+ svm->vmcb->save.dr7 = value;
+ vmcb_mark_dirty(svm->vmcb, VMCB_DR);
+}
+
+static int pf_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ u64 fault_address = svm->vmcb->control.exit_info_2;
+ u64 error_code = svm->vmcb->control.exit_info_1;
+
+ return kvm_handle_page_fault(vcpu, error_code, fault_address,
+ static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
+ svm->vmcb->control.insn_bytes : NULL,
+ svm->vmcb->control.insn_len);
+}
+
+static int npf_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ u64 fault_address = svm->vmcb->control.exit_info_2;
+ u64 error_code = svm->vmcb->control.exit_info_1;
+
+ trace_kvm_page_fault(vcpu, fault_address, error_code);
+ return kvm_mmu_page_fault(vcpu, fault_address, error_code,
+ static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
+ svm->vmcb->control.insn_bytes : NULL,
+ svm->vmcb->control.insn_len);
+}
+
+static int db_interception(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!(vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
+ !svm->nmi_singlestep) {
+ u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
+ kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
+ return 1;
+ }
+
+ if (svm->nmi_singlestep) {
+ disable_nmi_singlestep(svm);
+ /* Make sure we check for pending NMIs upon entry */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
+
+ if (vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
+ kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
+ kvm_run->debug.arch.pc =
+ svm->vmcb->save.cs.base + svm->vmcb->save.rip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int bp_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_run *kvm_run = vcpu->run;
+
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
+ kvm_run->debug.arch.exception = BP_VECTOR;
+ return 0;
+}
+
+static int ud_interception(struct kvm_vcpu *vcpu)
+{
+ return handle_ud(vcpu);
+}
+
+static int ac_interception(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
+ return 1;
+}
+
+static bool is_erratum_383(void)
+{
+ int err, i;
+ u64 value;
+
+ if (!erratum_383_found)
+ return false;
+
+ value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
+ if (err)
+ return false;
+
+ /* Bit 62 may or may not be set for this mce */
+ value &= ~(1ULL << 62);
+
+ if (value != 0xb600000000010015ULL)
+ return false;
+
+ /* Clear MCi_STATUS registers */
+ for (i = 0; i < 6; ++i)
+ native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
+
+ value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
+ if (!err) {
+ u32 low, high;
+
+ value &= ~(1ULL << 2);
+ low = lower_32_bits(value);
+ high = upper_32_bits(value);
+
+ native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
+ }
+
+ /* Flush tlb to evict multi-match entries */
+ __flush_tlb_all();
+
+ return true;
+}
+
+static void svm_handle_mce(struct kvm_vcpu *vcpu)
+{
+ if (is_erratum_383()) {
+ /*
+ * Erratum 383 triggered. Guest state is corrupt so kill the
+ * guest.
+ */
+ pr_err("KVM: Guest triggered AMD Erratum 383\n");
+
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+
+ return;
+ }
+
+ /*
+ * On an #MC intercept the MCE handler is not called automatically in
+ * the host. So do it by hand here.
+ */
+ kvm_machine_check();
+}
+
+static int mc_interception(struct kvm_vcpu *vcpu)
+{
+ return 1;
+}
+
+static int shutdown_interception(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * The VM save area has already been encrypted so it
+ * cannot be reinitialized - just terminate.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return -EINVAL;
+
+ /*
+ * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put
+ * the VMCB in a known good state. Unfortuately, KVM doesn't have
+ * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
+ * userspace. At a platform view, INIT is acceptable behavior as
+ * there exist bare metal platforms that automatically INIT the CPU
+ * in response to shutdown.
+ */
+ clear_page(svm->vmcb);
+ kvm_vcpu_reset(vcpu, true);
+
+ kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+ return 0;
+}
+
+static int io_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
+ int size, in, string;
+ unsigned port;
+
+ ++vcpu->stat.io_exits;
+ string = (io_info & SVM_IOIO_STR_MASK) != 0;
+ in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
+ port = io_info >> 16;
+ size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
+
+ if (string) {
+ if (sev_es_guest(vcpu->kvm))
+ return sev_es_string_io(svm, size, port, in);
+ else
+ return kvm_emulate_instruction(vcpu, 0);
+ }
+
+ svm->next_rip = svm->vmcb->control.exit_info_2;
+
+ return kvm_fast_pio(vcpu, size, port, in);
+}
+
+static int nmi_interception(struct kvm_vcpu *vcpu)
+{
+ return 1;
+}
+
+static int smi_interception(struct kvm_vcpu *vcpu)
+{
+ return 1;
+}
+
+static int intr_interception(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.irq_exits;
+ return 1;
+}
+
+static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb12;
+ struct kvm_host_map map;
+ int ret;
+
+ if (nested_svm_check_permissions(vcpu))
+ return 1;
+
+ ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
+ if (ret) {
+ if (ret == -EINVAL)
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ vmcb12 = map.hva;
+
+ ret = kvm_skip_emulated_instruction(vcpu);
+
+ if (vmload) {
+ svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
+ svm->sysenter_eip_hi = 0;
+ svm->sysenter_esp_hi = 0;
+ } else {
+ svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
+ }
+
+ kvm_vcpu_unmap(vcpu, &map, true);
+
+ return ret;
+}
+
+static int vmload_interception(struct kvm_vcpu *vcpu)
+{
+ return vmload_vmsave_interception(vcpu, true);
+}
+
+static int vmsave_interception(struct kvm_vcpu *vcpu)
+{
+ return vmload_vmsave_interception(vcpu, false);
+}
+
+static int vmrun_interception(struct kvm_vcpu *vcpu)
+{
+ if (nested_svm_check_permissions(vcpu))
+ return 1;
+
+ return nested_svm_vmrun(vcpu);
+}
+
+enum {
+ NONE_SVM_INSTR,
+ SVM_INSTR_VMRUN,
+ SVM_INSTR_VMLOAD,
+ SVM_INSTR_VMSAVE,
+};
+
+/* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
+static int svm_instr_opcode(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+
+ if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
+ return NONE_SVM_INSTR;
+
+ switch (ctxt->modrm) {
+ case 0xd8: /* VMRUN */
+ return SVM_INSTR_VMRUN;
+ case 0xda: /* VMLOAD */
+ return SVM_INSTR_VMLOAD;
+ case 0xdb: /* VMSAVE */
+ return SVM_INSTR_VMSAVE;
+ default:
+ break;
+ }
+
+ return NONE_SVM_INSTR;
+}
+
+static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
+{
+ const int guest_mode_exit_codes[] = {
+ [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
+ [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
+ [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
+ };
+ int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
+ [SVM_INSTR_VMRUN] = vmrun_interception,
+ [SVM_INSTR_VMLOAD] = vmload_interception,
+ [SVM_INSTR_VMSAVE] = vmsave_interception,
+ };
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ if (is_guest_mode(vcpu)) {
+ /* Returns '1' or -errno on failure, '0' on success. */
+ ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
+ if (ret)
+ return ret;
+ return 1;
+ }
+ return svm_instr_handlers[opcode](vcpu);
+}
+
+/*
+ * #GP handling code. Note that #GP can be triggered under the following two
+ * cases:
+ * 1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
+ * some AMD CPUs when EAX of these instructions are in the reserved memory
+ * regions (e.g. SMM memory on host).
+ * 2) VMware backdoor
+ */
+static int gp_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 error_code = svm->vmcb->control.exit_info_1;
+ int opcode;
+
+ /* Both #GP cases have zero error_code */
+ if (error_code)
+ goto reinject;
+
+ /* Decode the instruction for usage later */
+ if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
+ goto reinject;
+
+ opcode = svm_instr_opcode(vcpu);
+
+ if (opcode == NONE_SVM_INSTR) {
+ if (!enable_vmware_backdoor)
+ goto reinject;
+
+ /*
+ * VMware backdoor emulation on #GP interception only handles
+ * IN{S}, OUT{S}, and RDPMC.
+ */
+ if (!is_guest_mode(vcpu))
+ return kvm_emulate_instruction(vcpu,
+ EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
+ } else {
+ /* All SVM instructions expect page aligned RAX */
+ if (svm->vmcb->save.rax & ~PAGE_MASK)
+ goto reinject;
+
+ return emulate_svm_instr(vcpu, opcode);
+ }
+
+reinject:
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+ return 1;
+}
+
+void svm_set_gif(struct vcpu_svm *svm, bool value)
+{
+ if (value) {
+ /*
+ * If VGIF is enabled, the STGI intercept is only added to
+ * detect the opening of the SMI/NMI window; remove it now.
+ * Likewise, clear the VINTR intercept, we will set it
+ * again while processing KVM_REQ_EVENT if needed.
+ */
+ if (vgif)
+ svm_clr_intercept(svm, INTERCEPT_STGI);
+ if (svm_is_intercept(svm, INTERCEPT_VINTR))
+ svm_clear_vintr(svm);
+
+ enable_gif(svm);
+ if (svm->vcpu.arch.smi_pending ||
+ svm->vcpu.arch.nmi_pending ||
+ kvm_cpu_has_injectable_intr(&svm->vcpu) ||
+ kvm_apic_has_pending_init_or_sipi(&svm->vcpu))
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ } else {
+ disable_gif(svm);
+
+ /*
+ * After a CLGI no interrupts should come. But if vGIF is
+ * in use, we still rely on the VINTR intercept (rather than
+ * STGI) to detect an open interrupt window.
+ */
+ if (!vgif)
+ svm_clear_vintr(svm);
+ }
+}
+
+static int stgi_interception(struct kvm_vcpu *vcpu)
+{
+ int ret;
+
+ if (nested_svm_check_permissions(vcpu))
+ return 1;
+
+ ret = kvm_skip_emulated_instruction(vcpu);
+ svm_set_gif(to_svm(vcpu), true);
+ return ret;
+}
+
+static int clgi_interception(struct kvm_vcpu *vcpu)
+{
+ int ret;
+
+ if (nested_svm_check_permissions(vcpu))
+ return 1;
+
+ ret = kvm_skip_emulated_instruction(vcpu);
+ svm_set_gif(to_svm(vcpu), false);
+ return ret;
+}
+
+static int invlpga_interception(struct kvm_vcpu *vcpu)
+{
+ gva_t gva = kvm_rax_read(vcpu);
+ u32 asid = kvm_rcx_read(vcpu);
+
+ /* FIXME: Handle an address size prefix. */
+ if (!is_long_mode(vcpu))
+ gva = (u32)gva;
+
+ trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
+
+ /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
+ kvm_mmu_invlpg(vcpu, gva);
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int skinit_interception(struct kvm_vcpu *vcpu)
+{
+ trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
+
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
+static int task_switch_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u16 tss_selector;
+ int reason;
+ int int_type = svm->vmcb->control.exit_int_info &
+ SVM_EXITINTINFO_TYPE_MASK;
+ int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
+ uint32_t type =
+ svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
+ uint32_t idt_v =
+ svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
+ bool has_error_code = false;
+ u32 error_code = 0;
+
+ tss_selector = (u16)svm->vmcb->control.exit_info_1;
+
+ if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
+ reason = TASK_SWITCH_IRET;
+ else if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
+ reason = TASK_SWITCH_JMP;
+ else if (idt_v)
+ reason = TASK_SWITCH_GATE;
+ else
+ reason = TASK_SWITCH_CALL;
+
+ if (reason == TASK_SWITCH_GATE) {
+ switch (type) {
+ case SVM_EXITINTINFO_TYPE_NMI:
+ vcpu->arch.nmi_injected = false;
+ break;
+ case SVM_EXITINTINFO_TYPE_EXEPT:
+ if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
+ has_error_code = true;
+ error_code =
+ (u32)svm->vmcb->control.exit_info_2;
+ }
+ kvm_clear_exception_queue(vcpu);
+ break;
+ case SVM_EXITINTINFO_TYPE_INTR:
+ case SVM_EXITINTINFO_TYPE_SOFT:
+ kvm_clear_interrupt_queue(vcpu);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (reason != TASK_SWITCH_GATE ||
+ int_type == SVM_EXITINTINFO_TYPE_SOFT ||
+ (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
+ (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
+ if (!svm_skip_emulated_instruction(vcpu))
+ return 0;
+ }
+
+ if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
+ int_vec = -1;
+
+ return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
+ has_error_code, error_code);
+}
+
+static int iret_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ ++vcpu->stat.nmi_window_exits;
+ vcpu->arch.hflags |= HF_IRET_MASK;
+ if (!sev_es_guest(vcpu->kvm)) {
+ svm_clr_intercept(svm, INTERCEPT_IRET);
+ svm->nmi_iret_rip = kvm_rip_read(vcpu);
+ }
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return 1;
+}
+
+static int invlpg_interception(struct kvm_vcpu *vcpu)
+{
+ if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return kvm_emulate_instruction(vcpu, 0);
+
+ kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int emulate_on_interception(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_instruction(vcpu, 0);
+}
+
+static int rsm_interception(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
+}
+
+static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
+ unsigned long val)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long cr0 = vcpu->arch.cr0;
+ bool ret = false;
+
+ if (!is_guest_mode(vcpu) ||
+ (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
+ return false;
+
+ cr0 &= ~SVM_CR0_SELECTIVE_MASK;
+ val &= ~SVM_CR0_SELECTIVE_MASK;
+
+ if (cr0 ^ val) {
+ svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+ ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
+ }
+
+ return ret;
+}
+
+#define CR_VALID (1ULL << 63)
+
+static int cr_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int reg, cr;
+ unsigned long val;
+ int err;
+
+ if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_on_interception(vcpu);
+
+ if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
+ return emulate_on_interception(vcpu);
+
+ reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+ if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
+ cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
+ else
+ cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
+
+ err = 0;
+ if (cr >= 16) { /* mov to cr */
+ cr -= 16;
+ val = kvm_register_read(vcpu, reg);
+ trace_kvm_cr_write(cr, val);
+ switch (cr) {
+ case 0:
+ if (!check_selective_cr0_intercepted(vcpu, val))
+ err = kvm_set_cr0(vcpu, val);
+ else
+ return 1;
+
+ break;
+ case 3:
+ err = kvm_set_cr3(vcpu, val);
+ break;
+ case 4:
+ err = kvm_set_cr4(vcpu, val);
+ break;
+ case 8:
+ err = kvm_set_cr8(vcpu, val);
+ break;
+ default:
+ WARN(1, "unhandled write to CR%d", cr);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ } else { /* mov from cr */
+ switch (cr) {
+ case 0:
+ val = kvm_read_cr0(vcpu);
+ break;
+ case 2:
+ val = vcpu->arch.cr2;
+ break;
+ case 3:
+ val = kvm_read_cr3(vcpu);
+ break;
+ case 4:
+ val = kvm_read_cr4(vcpu);
+ break;
+ case 8:
+ val = kvm_get_cr8(vcpu);
+ break;
+ default:
+ WARN(1, "unhandled read from CR%d", cr);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ kvm_register_write(vcpu, reg, val);
+ trace_kvm_cr_read(cr, val);
+ }
+ return kvm_complete_insn_gp(vcpu, err);
+}
+
+static int cr_trap(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long old_value, new_value;
+ unsigned int cr;
+ int ret = 0;
+
+ new_value = (unsigned long)svm->vmcb->control.exit_info_1;
+
+ cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
+ switch (cr) {
+ case 0:
+ old_value = kvm_read_cr0(vcpu);
+ svm_set_cr0(vcpu, new_value);
+
+ kvm_post_set_cr0(vcpu, old_value, new_value);
+ break;
+ case 4:
+ old_value = kvm_read_cr4(vcpu);
+ svm_set_cr4(vcpu, new_value);
+
+ kvm_post_set_cr4(vcpu, old_value, new_value);
+ break;
+ case 8:
+ ret = kvm_set_cr8(vcpu, new_value);
+ break;
+ default:
+ WARN(1, "unhandled CR%d write trap", cr);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ return kvm_complete_insn_gp(vcpu, ret);
+}
+
+static int dr_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int reg, dr;
+ unsigned long val;
+ int err = 0;
+
+ if (vcpu->guest_debug == 0) {
+ /*
+ * No more DR vmexits; force a reload of the debug registers
+ * and reenter on this instruction. The next vmexit will
+ * retrieve the full state of the debug registers.
+ */
+ clr_dr_intercepts(svm);
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+ return 1;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_on_interception(vcpu);
+
+ reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+ dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
+ if (dr >= 16) { /* mov to DRn */
+ dr -= 16;
+ val = kvm_register_read(vcpu, reg);
+ err = kvm_set_dr(vcpu, dr, val);
+ } else {
+ kvm_get_dr(vcpu, dr, &val);
+ kvm_register_write(vcpu, reg, val);
+ }
+
+ return kvm_complete_insn_gp(vcpu, err);
+}
+
+static int cr8_write_interception(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ u8 cr8_prev = kvm_get_cr8(vcpu);
+ /* instruction emulation calls kvm_set_cr8() */
+ r = cr_interception(vcpu);
+ if (lapic_in_kernel(vcpu))
+ return r;
+ if (cr8_prev <= kvm_get_cr8(vcpu))
+ return r;
+ vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
+ return 0;
+}
+
+static int efer_trap(struct kvm_vcpu *vcpu)
+{
+ struct msr_data msr_info;
+ int ret;
+
+ /*
+ * Clear the EFER_SVME bit from EFER. The SVM code always sets this
+ * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
+ * whether the guest has X86_FEATURE_SVM - this avoids a failure if
+ * the guest doesn't have X86_FEATURE_SVM.
+ */
+ msr_info.host_initiated = false;
+ msr_info.index = MSR_EFER;
+ msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
+ ret = kvm_set_msr_common(vcpu, &msr_info);
+
+ return kvm_complete_insn_gp(vcpu, ret);
+}
+
+static int svm_get_msr_feature(struct kvm_msr_entry *msr)
+{
+ msr->data = 0;
+
+ switch (msr->index) {
+ case MSR_AMD64_DE_CFG:
+ if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
+ msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
+ break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ msr->data = kvm_caps.supported_perf_cap;
+ return 0;
+ default:
+ return KVM_MSR_RET_INVALID;
+ }
+
+ return 0;
+}
+
+static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ switch (msr_info->index) {
+ case MSR_AMD64_TSC_RATIO:
+ if (!msr_info->host_initiated && !svm->tsc_scaling_enabled)
+ return 1;
+ msr_info->data = svm->tsc_ratio_msr;
+ break;
+ case MSR_STAR:
+ msr_info->data = svm->vmcb01.ptr->save.star;
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_LSTAR:
+ msr_info->data = svm->vmcb01.ptr->save.lstar;
+ break;
+ case MSR_CSTAR:
+ msr_info->data = svm->vmcb01.ptr->save.cstar;
+ break;
+ case MSR_KERNEL_GS_BASE:
+ msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
+ break;
+ case MSR_SYSCALL_MASK:
+ msr_info->data = svm->vmcb01.ptr->save.sfmask;
+ break;
+#endif
+ case MSR_IA32_SYSENTER_CS:
+ msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
+ if (guest_cpuid_is_intel(vcpu))
+ msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
+ if (guest_cpuid_is_intel(vcpu))
+ msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
+ break;
+ case MSR_TSC_AUX:
+ msr_info->data = svm->tsc_aux;
+ break;
+ case MSR_IA32_DEBUGCTLMSR:
+ case MSR_IA32_LASTBRANCHFROMIP:
+ case MSR_IA32_LASTBRANCHTOIP:
+ case MSR_IA32_LASTINTFROMIP:
+ case MSR_IA32_LASTINTTOIP:
+ msr_info->data = svm_get_lbr_msr(svm, msr_info->index);
+ break;
+ case MSR_VM_HSAVE_PA:
+ msr_info->data = svm->nested.hsave_msr;
+ break;
+ case MSR_VM_CR:
+ msr_info->data = svm->nested.vm_cr_msr;
+ break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_has_spec_ctrl_msr(vcpu))
+ return 1;
+
+ if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ msr_info->data = svm->vmcb->save.spec_ctrl;
+ else
+ msr_info->data = svm->spec_ctrl;
+ break;
+ case MSR_AMD64_VIRT_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
+ return 1;
+
+ msr_info->data = svm->virt_spec_ctrl;
+ break;
+ case MSR_F15H_IC_CFG: {
+
+ int family, model;
+
+ family = guest_cpuid_family(vcpu);
+ model = guest_cpuid_model(vcpu);
+
+ if (family < 0 || model < 0)
+ return kvm_get_msr_common(vcpu, msr_info);
+
+ msr_info->data = 0;
+
+ if (family == 0x15 &&
+ (model >= 0x2 && model < 0x20))
+ msr_info->data = 0x1E;
+ }
+ break;
+ case MSR_AMD64_DE_CFG:
+ msr_info->data = svm->msr_decfg;
+ break;
+ default:
+ return kvm_get_msr_common(vcpu, msr_info);
+ }
+ return 0;
+}
+
+static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
+ return kvm_complete_insn_gp(vcpu, err);
+
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
+ X86_TRAP_GP |
+ SVM_EVTINJ_TYPE_EXEPT |
+ SVM_EVTINJ_VALID);
+ return 1;
+}
+
+static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int svm_dis, chg_mask;
+
+ if (data & ~SVM_VM_CR_VALID_MASK)
+ return 1;
+
+ chg_mask = SVM_VM_CR_VALID_MASK;
+
+ if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
+ chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
+
+ svm->nested.vm_cr_msr &= ~chg_mask;
+ svm->nested.vm_cr_msr |= (data & chg_mask);
+
+ svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
+
+ /* check for svm_disable while efer.svme is set */
+ if (svm_dis && (vcpu->arch.efer & EFER_SVME))
+ return 1;
+
+ return 0;
+}
+
+static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int r;
+
+ u32 ecx = msr->index;
+ u64 data = msr->data;
+ switch (ecx) {
+ case MSR_AMD64_TSC_RATIO:
+
+ if (!svm->tsc_scaling_enabled) {
+
+ if (!msr->host_initiated)
+ return 1;
+ /*
+ * In case TSC scaling is not enabled, always
+ * leave this MSR at the default value.
+ *
+ * Due to bug in qemu 6.2.0, it would try to set
+ * this msr to 0 if tsc scaling is not enabled.
+ * Ignore this value as well.
+ */
+ if (data != 0 && data != svm->tsc_ratio_msr)
+ return 1;
+ break;
+ }
+
+ if (data & SVM_TSC_RATIO_RSVD)
+ return 1;
+
+ svm->tsc_ratio_msr = data;
+
+ if (svm->tsc_scaling_enabled && is_guest_mode(vcpu))
+ nested_svm_update_tsc_ratio_msr(vcpu);
+
+ break;
+ case MSR_IA32_CR_PAT:
+ if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
+ return 1;
+ vcpu->arch.pat = data;
+ svm->vmcb01.ptr->save.g_pat = data;
+ if (is_guest_mode(vcpu))
+ nested_vmcb02_compute_g_pat(svm);
+ vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
+ break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr->host_initiated &&
+ !guest_has_spec_ctrl_msr(vcpu))
+ return 1;
+
+ if (kvm_spec_ctrl_test_value(data))
+ return 1;
+
+ if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ svm->vmcb->save.spec_ctrl = data;
+ else
+ svm->spec_ctrl = data;
+ if (!data)
+ break;
+
+ /*
+ * For non-nested:
+ * When it's written (to non-zero) for the first time, pass
+ * it through.
+ *
+ * For nested:
+ * The handling of the MSR bitmap for L2 guests is done in
+ * nested_svm_vmrun_msrpm.
+ * We update the L1 MSR bit as well since it will end up
+ * touching the MSR anyway now.
+ */
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+ break;
+ case MSR_IA32_PRED_CMD:
+ if (!msr->host_initiated &&
+ !guest_has_pred_cmd_msr(vcpu))
+ return 1;
+
+ if (data & ~PRED_CMD_IBPB)
+ return 1;
+ if (!boot_cpu_has(X86_FEATURE_IBPB))
+ return 1;
+ if (!data)
+ break;
+
+ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
+ break;
+ case MSR_AMD64_VIRT_SPEC_CTRL:
+ if (!msr->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
+ return 1;
+
+ if (data & ~SPEC_CTRL_SSBD)
+ return 1;
+
+ svm->virt_spec_ctrl = data;
+ break;
+ case MSR_STAR:
+ svm->vmcb01.ptr->save.star = data;
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_LSTAR:
+ svm->vmcb01.ptr->save.lstar = data;
+ break;
+ case MSR_CSTAR:
+ svm->vmcb01.ptr->save.cstar = data;
+ break;
+ case MSR_KERNEL_GS_BASE:
+ svm->vmcb01.ptr->save.kernel_gs_base = data;
+ break;
+ case MSR_SYSCALL_MASK:
+ svm->vmcb01.ptr->save.sfmask = data;
+ break;
+#endif
+ case MSR_IA32_SYSENTER_CS:
+ svm->vmcb01.ptr->save.sysenter_cs = data;
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
+ /*
+ * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs
+ * when we spoof an Intel vendor ID (for cross vendor migration).
+ * In this case we use this intercept to track the high
+ * 32 bit part of these msrs to support Intel's
+ * implementation of SYSENTER/SYSEXIT.
+ */
+ svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
+ svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
+ break;
+ case MSR_TSC_AUX:
+ /*
+ * TSC_AUX is usually changed only during boot and never read
+ * directly. Intercept TSC_AUX instead of exposing it to the
+ * guest via direct_access_msrs, and switch it via user return.
+ */
+ preempt_disable();
+ r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
+ preempt_enable();
+ if (r)
+ return 1;
+
+ svm->tsc_aux = data;
+ break;
+ case MSR_IA32_DEBUGCTLMSR:
+ if (!lbrv) {
+ vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
+ __func__, data);
+ break;
+ }
+ if (data & DEBUGCTL_RESERVED_BITS)
+ return 1;
+
+ if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)
+ svm->vmcb->save.dbgctl = data;
+ else
+ svm->vmcb01.ptr->save.dbgctl = data;
+
+ svm_update_lbrv(vcpu);
+
+ break;
+ case MSR_VM_HSAVE_PA:
+ /*
+ * Old kernels did not validate the value written to
+ * MSR_VM_HSAVE_PA. Allow KVM_SET_MSR to set an invalid
+ * value to allow live migrating buggy or malicious guests
+ * originating from those kernels.
+ */
+ if (!msr->host_initiated && !page_address_valid(vcpu, data))
+ return 1;
+
+ svm->nested.hsave_msr = data & PAGE_MASK;
+ break;
+ case MSR_VM_CR:
+ return svm_set_vm_cr(vcpu, data);
+ case MSR_VM_IGNNE:
+ vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
+ break;
+ case MSR_AMD64_DE_CFG: {
+ struct kvm_msr_entry msr_entry;
+
+ msr_entry.index = msr->index;
+ if (svm_get_msr_feature(&msr_entry))
+ return 1;
+
+ /* Check the supported bits */
+ if (data & ~msr_entry.data)
+ return 1;
+
+ /* Don't allow the guest to change a bit, #GP */
+ if (!msr->host_initiated && (data ^ msr_entry.data))
+ return 1;
+
+ svm->msr_decfg = data;
+ break;
+ }
+ default:
+ return kvm_set_msr_common(vcpu, msr);
+ }
+ return 0;
+}
+
+static int msr_interception(struct kvm_vcpu *vcpu)
+{
+ if (to_svm(vcpu)->vmcb->control.exit_info_1)
+ return kvm_emulate_wrmsr(vcpu);
+ else
+ return kvm_emulate_rdmsr(vcpu);
+}
+
+static int interrupt_window_interception(struct kvm_vcpu *vcpu)
+{
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ svm_clear_vintr(to_svm(vcpu));
+
+ /*
+ * If not running nested, for AVIC, the only reason to end up here is ExtINTs.
+ * In this case AVIC was temporarily disabled for
+ * requesting the IRQ window and we have to re-enable it.
+ *
+ * If running nested, still remove the VM wide AVIC inhibit to
+ * support case in which the interrupt window was requested when the
+ * vCPU was not running nested.
+
+ * All vCPUs which run still run nested, will remain to have their
+ * AVIC still inhibited due to per-cpu AVIC inhibition.
+ */
+ kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
+
+ ++vcpu->stat.irq_window_exits;
+ return 1;
+}
+
+static int pause_interception(struct kvm_vcpu *vcpu)
+{
+ bool in_kernel;
+ /*
+ * CPL is not made available for an SEV-ES guest, therefore
+ * vcpu->arch.preempted_in_kernel can never be true. Just
+ * set in_kernel to false as well.
+ */
+ in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
+
+ grow_ple_window(vcpu);
+
+ kvm_vcpu_on_spin(vcpu, in_kernel);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int invpcid_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long type;
+ gva_t gva;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ /*
+ * For an INVPCID intercept:
+ * EXITINFO1 provides the linear address of the memory operand.
+ * EXITINFO2 provides the contents of the register operand.
+ */
+ type = svm->vmcb->control.exit_info_2;
+ gva = svm->vmcb->control.exit_info_1;
+
+ return kvm_handle_invpcid(vcpu, type, gva);
+}
+
+static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
+ [SVM_EXIT_READ_CR0] = cr_interception,
+ [SVM_EXIT_READ_CR3] = cr_interception,
+ [SVM_EXIT_READ_CR4] = cr_interception,
+ [SVM_EXIT_READ_CR8] = cr_interception,
+ [SVM_EXIT_CR0_SEL_WRITE] = cr_interception,
+ [SVM_EXIT_WRITE_CR0] = cr_interception,
+ [SVM_EXIT_WRITE_CR3] = cr_interception,
+ [SVM_EXIT_WRITE_CR4] = cr_interception,
+ [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
+ [SVM_EXIT_READ_DR0] = dr_interception,
+ [SVM_EXIT_READ_DR1] = dr_interception,
+ [SVM_EXIT_READ_DR2] = dr_interception,
+ [SVM_EXIT_READ_DR3] = dr_interception,
+ [SVM_EXIT_READ_DR4] = dr_interception,
+ [SVM_EXIT_READ_DR5] = dr_interception,
+ [SVM_EXIT_READ_DR6] = dr_interception,
+ [SVM_EXIT_READ_DR7] = dr_interception,
+ [SVM_EXIT_WRITE_DR0] = dr_interception,
+ [SVM_EXIT_WRITE_DR1] = dr_interception,
+ [SVM_EXIT_WRITE_DR2] = dr_interception,
+ [SVM_EXIT_WRITE_DR3] = dr_interception,
+ [SVM_EXIT_WRITE_DR4] = dr_interception,
+ [SVM_EXIT_WRITE_DR5] = dr_interception,
+ [SVM_EXIT_WRITE_DR6] = dr_interception,
+ [SVM_EXIT_WRITE_DR7] = dr_interception,
+ [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
+ [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
+ [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
+ [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
+ [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
+ [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
+ [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception,
+ [SVM_EXIT_INTR] = intr_interception,
+ [SVM_EXIT_NMI] = nmi_interception,
+ [SVM_EXIT_SMI] = smi_interception,
+ [SVM_EXIT_VINTR] = interrupt_window_interception,
+ [SVM_EXIT_RDPMC] = kvm_emulate_rdpmc,
+ [SVM_EXIT_CPUID] = kvm_emulate_cpuid,
+ [SVM_EXIT_IRET] = iret_interception,
+ [SVM_EXIT_INVD] = kvm_emulate_invd,
+ [SVM_EXIT_PAUSE] = pause_interception,
+ [SVM_EXIT_HLT] = kvm_emulate_halt,
+ [SVM_EXIT_INVLPG] = invlpg_interception,
+ [SVM_EXIT_INVLPGA] = invlpga_interception,
+ [SVM_EXIT_IOIO] = io_interception,
+ [SVM_EXIT_MSR] = msr_interception,
+ [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
+ [SVM_EXIT_SHUTDOWN] = shutdown_interception,
+ [SVM_EXIT_VMRUN] = vmrun_interception,
+ [SVM_EXIT_VMMCALL] = kvm_emulate_hypercall,
+ [SVM_EXIT_VMLOAD] = vmload_interception,
+ [SVM_EXIT_VMSAVE] = vmsave_interception,
+ [SVM_EXIT_STGI] = stgi_interception,
+ [SVM_EXIT_CLGI] = clgi_interception,
+ [SVM_EXIT_SKINIT] = skinit_interception,
+ [SVM_EXIT_RDTSCP] = kvm_handle_invalid_op,
+ [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd,
+ [SVM_EXIT_MONITOR] = kvm_emulate_monitor,
+ [SVM_EXIT_MWAIT] = kvm_emulate_mwait,
+ [SVM_EXIT_XSETBV] = kvm_emulate_xsetbv,
+ [SVM_EXIT_RDPRU] = kvm_handle_invalid_op,
+ [SVM_EXIT_EFER_WRITE_TRAP] = efer_trap,
+ [SVM_EXIT_CR0_WRITE_TRAP] = cr_trap,
+ [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap,
+ [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap,
+ [SVM_EXIT_INVPCID] = invpcid_interception,
+ [SVM_EXIT_NPF] = npf_interception,
+ [SVM_EXIT_RSM] = rsm_interception,
+ [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
+ [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
+ [SVM_EXIT_VMGEXIT] = sev_handle_vmgexit,
+};
+
+static void dump_vmcb(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct vmcb_save_area *save = &svm->vmcb->save;
+ struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
+
+ if (!dump_invalid_vmcb) {
+ pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
+ return;
+ }
+
+ pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
+ svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
+ pr_err("VMCB Control Area:\n");
+ pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
+ pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
+ pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
+ pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
+ pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
+ pr_err("%-20s%08x %08x\n", "intercepts:",
+ control->intercepts[INTERCEPT_WORD3],
+ control->intercepts[INTERCEPT_WORD4]);
+ pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
+ pr_err("%-20s%d\n", "pause filter threshold:",
+ control->pause_filter_thresh);
+ pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
+ pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
+ pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
+ pr_err("%-20s%d\n", "asid:", control->asid);
+ pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
+ pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
+ pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
+ pr_err("%-20s%08x\n", "int_state:", control->int_state);
+ pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
+ pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
+ pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
+ pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
+ pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
+ pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
+ pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
+ pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
+ pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
+ pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
+ pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
+ pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
+ pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
+ pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
+ pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
+ pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
+ pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
+ pr_err("VMCB State Save Area:\n");
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "es:",
+ save->es.selector, save->es.attrib,
+ save->es.limit, save->es.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "cs:",
+ save->cs.selector, save->cs.attrib,
+ save->cs.limit, save->cs.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "ss:",
+ save->ss.selector, save->ss.attrib,
+ save->ss.limit, save->ss.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "ds:",
+ save->ds.selector, save->ds.attrib,
+ save->ds.limit, save->ds.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "fs:",
+ save01->fs.selector, save01->fs.attrib,
+ save01->fs.limit, save01->fs.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "gs:",
+ save01->gs.selector, save01->gs.attrib,
+ save01->gs.limit, save01->gs.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "gdtr:",
+ save->gdtr.selector, save->gdtr.attrib,
+ save->gdtr.limit, save->gdtr.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "ldtr:",
+ save01->ldtr.selector, save01->ldtr.attrib,
+ save01->ldtr.limit, save01->ldtr.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "idtr:",
+ save->idtr.selector, save->idtr.attrib,
+ save->idtr.limit, save->idtr.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "tr:",
+ save01->tr.selector, save01->tr.attrib,
+ save01->tr.limit, save01->tr.base);
+ pr_err("vmpl: %d cpl: %d efer: %016llx\n",
+ save->vmpl, save->cpl, save->efer);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "cr0:", save->cr0, "cr2:", save->cr2);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "cr3:", save->cr3, "cr4:", save->cr4);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "dr6:", save->dr6, "dr7:", save->dr7);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rip:", save->rip, "rflags:", save->rflags);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rsp:", save->rsp, "rax:", save->rax);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "star:", save01->star, "lstar:", save01->lstar);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "cstar:", save01->cstar, "sfmask:", save01->sfmask);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "kernel_gs_base:", save01->kernel_gs_base,
+ "sysenter_cs:", save01->sysenter_cs);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "sysenter_esp:", save01->sysenter_esp,
+ "sysenter_eip:", save01->sysenter_eip);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "br_from:", save->br_from, "br_to:", save->br_to);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "excp_from:", save->last_excp_from,
+ "excp_to:", save->last_excp_to);
+}
+
+static bool svm_check_exit_valid(u64 exit_code)
+{
+ return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
+ svm_exit_handlers[exit_code]);
+}
+
+static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
+{
+ vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
+ dump_vmcb(vcpu);
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
+ vcpu->run->internal.ndata = 2;
+ vcpu->run->internal.data[0] = exit_code;
+ vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+ return 0;
+}
+
+int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
+{
+ if (!svm_check_exit_valid(exit_code))
+ return svm_handle_invalid_exit(vcpu, exit_code);
+
+#ifdef CONFIG_RETPOLINE
+ if (exit_code == SVM_EXIT_MSR)
+ return msr_interception(vcpu);
+ else if (exit_code == SVM_EXIT_VINTR)
+ return interrupt_window_interception(vcpu);
+ else if (exit_code == SVM_EXIT_INTR)
+ return intr_interception(vcpu);
+ else if (exit_code == SVM_EXIT_HLT)
+ return kvm_emulate_halt(vcpu);
+ else if (exit_code == SVM_EXIT_NPF)
+ return npf_interception(vcpu);
+#endif
+ return svm_exit_handlers[exit_code](vcpu);
+}
+
+static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2,
+ u32 *intr_info, u32 *error_code)
+{
+ struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+
+ *reason = control->exit_code;
+ *info1 = control->exit_info_1;
+ *info2 = control->exit_info_2;
+ *intr_info = control->exit_int_info;
+ if ((*intr_info & SVM_EXITINTINFO_VALID) &&
+ (*intr_info & SVM_EXITINTINFO_VALID_ERR))
+ *error_code = control->exit_int_info_err;
+ else
+ *error_code = 0;
+}
+
+static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_run *kvm_run = vcpu->run;
+ u32 exit_code = svm->vmcb->control.exit_code;
+
+ trace_kvm_exit(vcpu, KVM_ISA_SVM);
+
+ /* SEV-ES guests must use the CR write traps to track CR registers. */
+ if (!sev_es_guest(vcpu->kvm)) {
+ if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
+ vcpu->arch.cr0 = svm->vmcb->save.cr0;
+ if (npt_enabled)
+ vcpu->arch.cr3 = svm->vmcb->save.cr3;
+ }
+
+ if (is_guest_mode(vcpu)) {
+ int vmexit;
+
+ trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
+
+ vmexit = nested_svm_exit_special(svm);
+
+ if (vmexit == NESTED_EXIT_CONTINUE)
+ vmexit = nested_svm_exit_handled(svm);
+
+ if (vmexit == NESTED_EXIT_DONE)
+ return 1;
+ }
+
+ if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
+ kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ kvm_run->fail_entry.hardware_entry_failure_reason
+ = svm->vmcb->control.exit_code;
+ kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
+ dump_vmcb(vcpu);
+ return 0;
+ }
+
+ if (exit_fastpath != EXIT_FASTPATH_NONE)
+ return 1;
+
+ return svm_invoke_exit_handler(vcpu, exit_code);
+}
+
+static void reload_tss(struct kvm_vcpu *vcpu)
+{
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
+
+ sd->tss_desc->type = 9; /* available 32/64-bit TSS */
+ load_TR_desc();
+}
+
+static void pre_svm_run(struct kvm_vcpu *vcpu)
+{
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * If the previous vmrun of the vmcb occurred on a different physical
+ * cpu, then mark the vmcb dirty and assign a new asid. Hardware's
+ * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
+ */
+ if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
+ svm->current_vmcb->asid_generation = 0;
+ vmcb_mark_all_dirty(svm->vmcb);
+ svm->current_vmcb->cpu = vcpu->cpu;
+ }
+
+ if (sev_guest(vcpu->kvm))
+ return pre_sev_run(svm, vcpu->cpu);
+
+ /* FIXME: handle wraparound of asid_generation */
+ if (svm->current_vmcb->asid_generation != sd->asid_generation)
+ new_asid(svm, sd);
+}
+
+static void svm_inject_nmi(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
+
+ if (svm->nmi_l1_to_l2)
+ return;
+
+ vcpu->arch.hflags |= HF_NMI_MASK;
+ if (!sev_es_guest(vcpu->kvm))
+ svm_set_intercept(svm, INTERCEPT_IRET);
+ ++vcpu->stat.nmi_injections;
+}
+
+static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 type;
+
+ if (vcpu->arch.interrupt.soft) {
+ if (svm_update_soft_interrupt_rip(vcpu))
+ return;
+
+ type = SVM_EVTINJ_TYPE_SOFT;
+ } else {
+ type = SVM_EVTINJ_TYPE_INTR;
+ }
+
+ trace_kvm_inj_virq(vcpu->arch.interrupt.nr,
+ vcpu->arch.interrupt.soft, reinjected);
+ ++vcpu->stat.irq_injections;
+
+ svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
+ SVM_EVTINJ_VALID | type;
+}
+
+void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
+ int trig_mode, int vector)
+{
+ /*
+ * apic->apicv_active must be read after vcpu->mode.
+ * Pairs with smp_store_release in vcpu_enter_guest.
+ */
+ bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
+
+ /* Note, this is called iff the local APIC is in-kernel. */
+ if (!READ_ONCE(vcpu->arch.apic->apicv_active)) {
+ /* Process the interrupt via kvm_check_and_inject_events(). */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ return;
+ }
+
+ trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
+ if (in_guest_mode) {
+ /*
+ * Signal the doorbell to tell hardware to inject the IRQ. If
+ * the vCPU exits the guest before the doorbell chimes, hardware
+ * will automatically process AVIC interrupts at the next VMRUN.
+ */
+ avic_ring_doorbell(vcpu);
+ } else {
+ /*
+ * Wake the vCPU if it was blocking. KVM will then detect the
+ * pending IRQ when checking if the vCPU has a wake event.
+ */
+ kvm_vcpu_wake_up(vcpu);
+ }
+}
+
+static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
+{
+ kvm_lapic_set_irr(vector, apic);
+
+ /*
+ * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
+ * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
+ * the read of guest_mode. This guarantees that either VMRUN will see
+ * and process the new vIRR entry, or that svm_complete_interrupt_delivery
+ * will signal the doorbell if the CPU has already entered the guest.
+ */
+ smp_mb__after_atomic();
+ svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
+}
+
+static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * SEV-ES guests must always keep the CR intercepts cleared. CR
+ * tracking is done using the CR write traps.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return;
+
+ if (nested_svm_virtualize_tpr(vcpu))
+ return;
+
+ svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
+
+ if (irr == -1)
+ return;
+
+ if (tpr >= irr)
+ svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
+}
+
+bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+ bool ret;
+
+ if (!gif_set(svm))
+ return true;
+
+ if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
+ return false;
+
+ ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
+ (vcpu->arch.hflags & HF_NMI_MASK);
+
+ return ret;
+}
+
+static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ if (svm->nested.nested_run_pending)
+ return -EBUSY;
+
+ if (svm_nmi_blocked(vcpu))
+ return 0;
+
+ /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
+ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
+ return -EBUSY;
+ return 1;
+}
+
+static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+ return !!(vcpu->arch.hflags & HF_NMI_MASK);
+}
+
+static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (masked) {
+ vcpu->arch.hflags |= HF_NMI_MASK;
+ if (!sev_es_guest(vcpu->kvm))
+ svm_set_intercept(svm, INTERCEPT_IRET);
+ } else {
+ vcpu->arch.hflags &= ~HF_NMI_MASK;
+ if (!sev_es_guest(vcpu->kvm))
+ svm_clr_intercept(svm, INTERCEPT_IRET);
+ }
+}
+
+bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (!gif_set(svm))
+ return true;
+
+ if (is_guest_mode(vcpu)) {
+ /* As long as interrupts are being delivered... */
+ if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
+ ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
+ : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
+ return true;
+
+ /* ... vmexits aren't blocked by the interrupt shadow */
+ if (nested_exit_on_intr(svm))
+ return false;
+ } else {
+ if (!svm_get_if_flag(vcpu))
+ return true;
+ }
+
+ return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
+}
+
+static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (svm->nested.nested_run_pending)
+ return -EBUSY;
+
+ if (svm_interrupt_blocked(vcpu))
+ return 0;
+
+ /*
+ * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
+ * e.g. if the IRQ arrived asynchronously after checking nested events.
+ */
+ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
+ return -EBUSY;
+
+ return 1;
+}
+
+static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
+ * 1, because that's a separate STGI/VMRUN intercept. The next time we
+ * get that intercept, this function will be called again though and
+ * we'll get the vintr intercept. However, if the vGIF feature is
+ * enabled, the STGI interception will not occur. Enable the irq
+ * window under the assumption that the hardware will set the GIF.
+ */
+ if (vgif || gif_set(svm)) {
+ /*
+ * IRQ window is not needed when AVIC is enabled,
+ * unless we have pending ExtINT since it cannot be injected
+ * via AVIC. In such case, KVM needs to temporarily disable AVIC,
+ * and fallback to injecting IRQ via V_IRQ.
+ *
+ * If running nested, AVIC is already locally inhibited
+ * on this vCPU, therefore there is no need to request
+ * the VM wide AVIC inhibition.
+ */
+ if (!is_guest_mode(vcpu))
+ kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
+
+ svm_set_vintr(svm);
+ }
+}
+
+static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK)
+ return; /* IRET will cause a vm exit */
+
+ if (!gif_set(svm)) {
+ if (vgif)
+ svm_set_intercept(svm, INTERCEPT_STGI);
+ return; /* STGI will cause a vm exit */
+ }
+
+ /*
+ * Something prevents NMI from been injected. Single step over possible
+ * problem (IRET or exception injection or interrupt shadow)
+ */
+ svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
+ svm->nmi_singlestep = true;
+ svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+}
+
+static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * Flush only the current ASID even if the TLB flush was invoked via
+ * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all
+ * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
+ * unconditionally does a TLB flush on both nested VM-Enter and nested
+ * VM-Exit (via kvm_mmu_reset_context()).
+ */
+ if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+ else
+ svm->current_vmcb->asid_generation--;
+}
+
+static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+ hpa_t root_tdp = vcpu->arch.mmu->root.hpa;
+
+ /*
+ * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly
+ * flush the NPT mappings via hypercall as flushing the ASID only
+ * affects virtual to physical mappings, it does not invalidate guest
+ * physical to host physical mappings.
+ */
+ if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp))
+ hyperv_flush_guest_mapping(root_tdp);
+
+ svm_flush_tlb_asid(vcpu);
+}
+
+static void svm_flush_tlb_all(struct kvm_vcpu *vcpu)
+{
+ /*
+ * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB
+ * flushes should be routed to hv_remote_flush_tlb() without requesting
+ * a "regular" remote flush. Reaching this point means either there's
+ * a KVM bug or a prior hv_remote_flush_tlb() call failed, both of
+ * which might be fatal to the guest. Yell, but try to recover.
+ */
+ if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu)))
+ hv_remote_flush_tlb(vcpu->kvm);
+
+ svm_flush_tlb_asid(vcpu);
+}
+
+static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ invlpga(gva, svm->vmcb->control.asid);
+}
+
+static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (nested_svm_virtualize_tpr(vcpu))
+ return;
+
+ if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
+ int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
+ kvm_set_cr8(vcpu, cr8);
+ }
+}
+
+static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 cr8;
+
+ if (nested_svm_virtualize_tpr(vcpu) ||
+ kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ cr8 = kvm_get_cr8(vcpu);
+ svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
+ svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
+}
+
+static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
+ int type)
+{
+ bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT);
+ bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
+ * associated with the original soft exception/interrupt. next_rip is
+ * cleared on all exits that can occur while vectoring an event, so KVM
+ * needs to manually set next_rip for re-injection. Unlike the !nrips
+ * case below, this needs to be done if and only if KVM is re-injecting
+ * the same event, i.e. if the event is a soft exception/interrupt,
+ * otherwise next_rip is unused on VMRUN.
+ */
+ if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) &&
+ kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase))
+ svm->vmcb->control.next_rip = svm->soft_int_next_rip;
+ /*
+ * If NRIPS isn't enabled, KVM must manually advance RIP prior to
+ * injecting the soft exception/interrupt. That advancement needs to
+ * be unwound if vectoring didn't complete. Note, the new event may
+ * not be the injected event, e.g. if KVM injected an INTn, the INTn
+ * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
+ * be the reported vectored event, but RIP still needs to be unwound.
+ */
+ else if (!nrips && (is_soft || is_exception) &&
+ kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase))
+ kvm_rip_write(vcpu, svm->soft_int_old_rip);
+}
+
+static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u8 vector;
+ int type;
+ u32 exitintinfo = svm->vmcb->control.exit_int_info;
+ bool nmi_l1_to_l2 = svm->nmi_l1_to_l2;
+ bool soft_int_injected = svm->soft_int_injected;
+
+ svm->nmi_l1_to_l2 = false;
+ svm->soft_int_injected = false;
+
+ /*
+ * If we've made progress since setting HF_IRET_MASK, we've
+ * executed an IRET and can allow NMI injection.
+ */
+ if ((vcpu->arch.hflags & HF_IRET_MASK) &&
+ (sev_es_guest(vcpu->kvm) ||
+ kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
+ vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
+
+ vcpu->arch.nmi_injected = false;
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
+
+ if (!(exitintinfo & SVM_EXITINTINFO_VALID))
+ return;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
+ type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
+
+ if (soft_int_injected)
+ svm_complete_soft_interrupt(vcpu, vector, type);
+
+ switch (type) {
+ case SVM_EXITINTINFO_TYPE_NMI:
+ vcpu->arch.nmi_injected = true;
+ svm->nmi_l1_to_l2 = nmi_l1_to_l2;
+ break;
+ case SVM_EXITINTINFO_TYPE_EXEPT:
+ /*
+ * Never re-inject a #VC exception.
+ */
+ if (vector == X86_TRAP_VC)
+ break;
+
+ if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
+ u32 err = svm->vmcb->control.exit_int_info_err;
+ kvm_requeue_exception_e(vcpu, vector, err);
+
+ } else
+ kvm_requeue_exception(vcpu, vector);
+ break;
+ case SVM_EXITINTINFO_TYPE_INTR:
+ kvm_queue_interrupt(vcpu, vector, false);
+ break;
+ case SVM_EXITINTINFO_TYPE_SOFT:
+ kvm_queue_interrupt(vcpu, vector, true);
+ break;
+ default:
+ break;
+ }
+
+}
+
+static void svm_cancel_injection(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+
+ control->exit_int_info = control->event_inj;
+ control->exit_int_info_err = control->event_inj_err;
+ control->event_inj = 0;
+ svm_complete_interrupts(vcpu);
+}
+
+static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ return 1;
+}
+
+static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+{
+ struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+
+ /*
+ * Note, the next RIP must be provided as SRCU isn't held, i.e. KVM
+ * can't read guest memory (dereference memslots) to decode the WRMSR.
+ */
+ if (control->exit_code == SVM_EXIT_MSR && control->exit_info_1 &&
+ nrips && control->next_rip)
+ return handle_fastpath_set_msr_irqoff(vcpu);
+
+ return EXIT_FASTPATH_NONE;
+}
+
+static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ guest_state_enter_irqoff();
+
+ amd_clear_divider();
+
+ if (sev_es_guest(vcpu->kvm))
+ __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
+ else
+ __svm_vcpu_run(svm, spec_ctrl_intercepted);
+
+ guest_state_exit_irqoff();
+}
+
+static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
+
+ trace_kvm_entry(vcpu);
+
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
+ /*
+ * Disable singlestep if we're injecting an interrupt/exception.
+ * We don't want our modified rflags to be pushed on the stack where
+ * we might not be able to easily reset them if we disabled NMI
+ * singlestep later.
+ */
+ if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
+ /*
+ * Event injection happens before external interrupts cause a
+ * vmexit and interrupts are disabled here, so smp_send_reschedule
+ * is enough to force an immediate vmexit.
+ */
+ disable_nmi_singlestep(svm);
+ smp_send_reschedule(vcpu->cpu);
+ }
+
+ pre_svm_run(vcpu);
+
+ sync_lapic_to_cr8(vcpu);
+
+ if (unlikely(svm->asid != svm->vmcb->control.asid)) {
+ svm->vmcb->control.asid = svm->asid;
+ vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
+ }
+ svm->vmcb->save.cr2 = vcpu->arch.cr2;
+
+ svm_hv_update_vp_id(svm->vmcb, vcpu);
+
+ /*
+ * Run with all-zero DR6 unless needed, so that we can get the exact cause
+ * of a #DB.
+ */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+ svm_set_dr6(svm, vcpu->arch.dr6);
+ else
+ svm_set_dr6(svm, DR6_ACTIVE_LOW);
+
+ clgi();
+ kvm_load_guest_xsave_state(vcpu);
+
+ kvm_wait_lapic_expire(vcpu);
+
+ /*
+ * If this vCPU has touched SPEC_CTRL, restore the guest's value if
+ * it's non-zero. Since vmentry is serialising on affected CPUs, there
+ * is no need to worry about the conditional branch over the wrmsr
+ * being speculatively taken.
+ */
+ if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ x86_spec_ctrl_set_guest(svm->virt_spec_ctrl);
+
+ svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
+
+ if (!sev_es_guest(vcpu->kvm))
+ reload_tss(vcpu);
+
+ if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ x86_spec_ctrl_restore_host(svm->virt_spec_ctrl);
+
+ if (!sev_es_guest(vcpu->kvm)) {
+ vcpu->arch.cr2 = svm->vmcb->save.cr2;
+ vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+ vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+ vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
+ }
+ vcpu->arch.regs_dirty = 0;
+
+ if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
+ kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
+
+ kvm_load_host_xsave_state(vcpu);
+ stgi();
+
+ /* Any pending NMI will happen here */
+
+ if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
+ kvm_after_interrupt(vcpu);
+
+ sync_cr8_to_lapic(vcpu);
+
+ svm->next_rip = 0;
+ if (is_guest_mode(vcpu)) {
+ nested_sync_control_from_vmcb02(svm);
+
+ /* Track VMRUNs that have made past consistency checking */
+ if (svm->nested.nested_run_pending &&
+ svm->vmcb->control.exit_code != SVM_EXIT_ERR)
+ ++vcpu->stat.nested_run;
+
+ svm->nested.nested_run_pending = 0;
+ }
+
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+ vmcb_mark_all_clean(svm->vmcb);
+
+ /* if exit due to PF check for async PF */
+ if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
+ vcpu->arch.apf.host_apf_flags =
+ kvm_read_and_reset_apf_flags();
+
+ vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
+
+ /*
+ * We need to handle MC intercepts here before the vcpu has a chance to
+ * change the physical cpu
+ */
+ if (unlikely(svm->vmcb->control.exit_code ==
+ SVM_EXIT_EXCP_BASE + MC_VECTOR))
+ svm_handle_mce(vcpu);
+
+ svm_complete_interrupts(vcpu);
+
+ if (is_guest_mode(vcpu))
+ return EXIT_FASTPATH_NONE;
+
+ return svm_exit_handlers_fastpath(vcpu);
+}
+
+static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+ int root_level)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long cr3;
+
+ if (npt_enabled) {
+ svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
+ vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
+
+ hv_track_root_tdp(vcpu, root_hpa);
+
+ cr3 = vcpu->arch.cr3;
+ } else if (root_level >= PT64_ROOT_4LEVEL) {
+ cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
+ } else {
+ /* PCID in the guest should be impossible with a 32-bit MMU. */
+ WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
+ cr3 = root_hpa;
+ }
+
+ svm->vmcb->save.cr3 = cr3;
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
+}
+
+static int is_disabled(void)
+{
+ u64 vm_cr;
+
+ rdmsrl(MSR_VM_CR, vm_cr);
+ if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
+ return 1;
+
+ return 0;
+}
+
+static void
+svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+{
+ /*
+ * Patch in the VMMCALL instruction:
+ */
+ hypercall[0] = 0x0f;
+ hypercall[1] = 0x01;
+ hypercall[2] = 0xd9;
+}
+
+static int __init svm_check_processor_compat(void)
+{
+ return 0;
+}
+
+/*
+ * The kvm parameter can be NULL (module initialization, or invocation before
+ * VM creation). Be sure to check the kvm parameter before using it.
+ */
+static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
+{
+ switch (index) {
+ case MSR_IA32_MCG_EXT_CTL:
+ case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ return false;
+ case MSR_IA32_SMBASE:
+ /* SEV-ES guests do not support SMM, so report false */
+ if (kvm && sev_es_guest(kvm))
+ return false;
+ break;
+ default:
+ break;
+ }
+
+ return true;
+}
+
+static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
+ boot_cpu_has(X86_FEATURE_XSAVE) &&
+ boot_cpu_has(X86_FEATURE_XSAVES);
+
+ /* Update nrips enabled cache */
+ svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
+
+ svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
+ svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
+
+ svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
+
+ svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
+
+ svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
+
+ svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
+
+ svm_recalc_instruction_intercepts(vcpu, svm);
+
+ if (sev_guest(vcpu->kvm))
+ sev_vcpu_after_set_cpuid(svm);
+
+ init_vmcb_after_set_cpuid(vcpu);
+}
+
+static bool svm_has_wbinvd_exit(void)
+{
+ return true;
+}
+
+#define PRE_EX(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_PRE_EXCEPT, }
+#define POST_EX(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_POST_EXCEPT, }
+#define POST_MEM(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_POST_MEMACCESS, }
+
+static const struct __x86_intercept {
+ u32 exit_code;
+ enum x86_intercept_stage stage;
+} x86_intercept_map[] = {
+ [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0),
+ [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0),
+ [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0),
+ [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0),
+ [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ),
+ [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ),
+ [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE),
+ [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE),
+ [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ),
+ [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ),
+ [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE),
+ [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE),
+ [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN),
+ [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL),
+ [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD),
+ [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE),
+ [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI),
+ [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI),
+ [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT),
+ [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA),
+ [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP),
+ [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR),
+ [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT),
+ [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG),
+ [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD),
+ [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD),
+ [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR),
+ [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC),
+ [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR),
+ [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC),
+ [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID),
+ [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM),
+ [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE),
+ [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF),
+ [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF),
+ [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT),
+ [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET),
+ [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP),
+ [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT),
+ [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_xsetbv] = PRE_EX(SVM_EXIT_XSETBV),
+};
+
+#undef PRE_EX
+#undef POST_EX
+#undef POST_MEM
+
+static int svm_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage,
+ struct x86_exception *exception)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int vmexit, ret = X86EMUL_CONTINUE;
+ struct __x86_intercept icpt_info;
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
+ goto out;
+
+ icpt_info = x86_intercept_map[info->intercept];
+
+ if (stage != icpt_info.stage)
+ goto out;
+
+ switch (icpt_info.exit_code) {
+ case SVM_EXIT_READ_CR0:
+ if (info->intercept == x86_intercept_cr_read)
+ icpt_info.exit_code += info->modrm_reg;
+ break;
+ case SVM_EXIT_WRITE_CR0: {
+ unsigned long cr0, val;
+
+ if (info->intercept == x86_intercept_cr_write)
+ icpt_info.exit_code += info->modrm_reg;
+
+ if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
+ info->intercept == x86_intercept_clts)
+ break;
+
+ if (!(vmcb12_is_intercept(&svm->nested.ctl,
+ INTERCEPT_SELECTIVE_CR0)))
+ break;
+
+ cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
+ val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
+
+ if (info->intercept == x86_intercept_lmsw) {
+ cr0 &= 0xfUL;
+ val &= 0xfUL;
+ /* lmsw can't clear PE - catch this here */
+ if (cr0 & X86_CR0_PE)
+ val |= X86_CR0_PE;
+ }
+
+ if (cr0 ^ val)
+ icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+
+ break;
+ }
+ case SVM_EXIT_READ_DR0:
+ case SVM_EXIT_WRITE_DR0:
+ icpt_info.exit_code += info->modrm_reg;
+ break;
+ case SVM_EXIT_MSR:
+ if (info->intercept == x86_intercept_wrmsr)
+ vmcb->control.exit_info_1 = 1;
+ else
+ vmcb->control.exit_info_1 = 0;
+ break;
+ case SVM_EXIT_PAUSE:
+ /*
+ * We get this for NOP only, but pause
+ * is rep not, check this here
+ */
+ if (info->rep_prefix != REPE_PREFIX)
+ goto out;
+ break;
+ case SVM_EXIT_IOIO: {
+ u64 exit_info;
+ u32 bytes;
+
+ if (info->intercept == x86_intercept_in ||
+ info->intercept == x86_intercept_ins) {
+ exit_info = ((info->src_val & 0xffff) << 16) |
+ SVM_IOIO_TYPE_MASK;
+ bytes = info->dst_bytes;
+ } else {
+ exit_info = (info->dst_val & 0xffff) << 16;
+ bytes = info->src_bytes;
+ }
+
+ if (info->intercept == x86_intercept_outs ||
+ info->intercept == x86_intercept_ins)
+ exit_info |= SVM_IOIO_STR_MASK;
+
+ if (info->rep_prefix)
+ exit_info |= SVM_IOIO_REP_MASK;
+
+ bytes = min(bytes, 4u);
+
+ exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
+
+ exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
+
+ vmcb->control.exit_info_1 = exit_info;
+ vmcb->control.exit_info_2 = info->next_rip;
+
+ break;
+ }
+ default:
+ break;
+ }
+
+ /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
+ if (static_cpu_has(X86_FEATURE_NRIPS))
+ vmcb->control.next_rip = info->next_rip;
+ vmcb->control.exit_code = icpt_info.exit_code;
+ vmexit = nested_svm_exit_handled(svm);
+
+ ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
+ : X86EMUL_CONTINUE;
+
+out:
+ return ret;
+}
+
+static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+{
+ if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
+ vcpu->arch.at_instruction_boundary = true;
+}
+
+static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+ if (!kvm_pause_in_guest(vcpu->kvm))
+ shrink_ple_window(vcpu);
+}
+
+static void svm_setup_mce(struct kvm_vcpu *vcpu)
+{
+ /* [63:9] are reserved. */
+ vcpu->arch.mcg_cap &= 0x1ff;
+}
+
+bool svm_smi_blocked(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /* Per APM Vol.2 15.22.2 "Response to SMI" */
+ if (!gif_set(svm))
+ return true;
+
+ return is_smm(vcpu);
+}
+
+static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ if (svm->nested.nested_run_pending)
+ return -EBUSY;
+
+ if (svm_smi_blocked(vcpu))
+ return 0;
+
+ /* An SMI must not be injected into L2 if it's supposed to VM-Exit. */
+ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
+ return -EBUSY;
+
+ return 1;
+}
+
+static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_host_map map_save;
+ int ret;
+
+ if (!is_guest_mode(vcpu))
+ return 0;
+
+ /* FED8h - SVM Guest */
+ put_smstate(u64, smstate, 0x7ed8, 1);
+ /* FEE0h - SVM Guest VMCB Physical Address */
+ put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);
+
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
+ ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
+ if (ret)
+ return ret;
+
+ /*
+ * KVM uses VMCB01 to store L1 host state while L2 runs but
+ * VMCB01 is going to be used during SMM and thus the state will
+ * be lost. Temporary save non-VMLOAD/VMSAVE state to the host save
+ * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the
+ * format of the area is identical to guest save area offsetted
+ * by 0x400 (matches the offset of 'struct vmcb_save_area'
+ * within 'struct vmcb'). Note: HSAVE area may also be used by
+ * L1 hypervisor to save additional host context (e.g. KVM does
+ * that, see svm_prepare_switch_to_guest()) which must be
+ * preserved.
+ */
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
+ &map_save) == -EINVAL)
+ return 1;
+
+ BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
+
+ svm_copy_vmrun_state(map_save.hva + 0x400,
+ &svm->vmcb01.ptr->save);
+
+ kvm_vcpu_unmap(vcpu, &map_save, true);
+ return 0;
+}
+
+static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_host_map map, map_save;
+ u64 saved_efer, vmcb12_gpa;
+ struct vmcb *vmcb12;
+ int ret;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ return 0;
+
+ /* Non-zero if SMI arrived while vCPU was in guest mode. */
+ if (!GET_SMSTATE(u64, smstate, 0x7ed8))
+ return 0;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+ return 1;
+
+ saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
+ if (!(saved_efer & EFER_SVME))
+ return 1;
+
+ vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
+ return 1;
+
+ ret = 1;
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL)
+ goto unmap_map;
+
+ if (svm_allocate_nested(svm))
+ goto unmap_save;
+
+ /*
+ * Restore L1 host state from L1 HSAVE area as VMCB01 was
+ * used during SMM (see svm_enter_smm())
+ */
+
+ svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
+
+ /*
+ * Enter the nested guest now
+ */
+
+ vmcb_mark_all_dirty(svm->vmcb01.ptr);
+
+ vmcb12 = map.hva;
+ nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
+ nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
+ ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false);
+
+ if (ret)
+ goto unmap_save;
+
+ svm->nested.nested_run_pending = 1;
+
+unmap_save:
+ kvm_vcpu_unmap(vcpu, &map_save, true);
+unmap_map:
+ kvm_vcpu_unmap(vcpu, &map, true);
+ return ret;
+}
+
+static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!gif_set(svm)) {
+ if (vgif)
+ svm_set_intercept(svm, INTERCEPT_STGI);
+ /* STGI will cause a vm exit */
+ } else {
+ /* We must be in SMM; RSM will cause a vmexit anyway. */
+ }
+}
+
+static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
+{
+ bool smep, smap, is_user;
+ unsigned long cr4;
+ u64 error_code;
+
+ /* Emulation is always possible when KVM has access to all guest state. */
+ if (!sev_guest(vcpu->kvm))
+ return true;
+
+ /* #UD and #GP should never be intercepted for SEV guests. */
+ WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
+ EMULTYPE_TRAP_UD_FORCED |
+ EMULTYPE_VMWARE_GP));
+
+ /*
+ * Emulation is impossible for SEV-ES guests as KVM doesn't have access
+ * to guest register state.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return false;
+
+ /*
+ * Emulation is possible if the instruction is already decoded, e.g.
+ * when completing I/O after returning from userspace.
+ */
+ if (emul_type & EMULTYPE_NO_DECODE)
+ return true;
+
+ /*
+ * Emulation is possible for SEV guests if and only if a prefilled
+ * buffer containing the bytes of the intercepted instruction is
+ * available. SEV guest memory is encrypted with a guest specific key
+ * and cannot be decrypted by KVM, i.e. KVM would read cyphertext and
+ * decode garbage.
+ *
+ * If KVM is NOT trying to simply skip an instruction, inject #UD if
+ * KVM reached this point without an instruction buffer. In practice,
+ * this path should never be hit by a well-behaved guest, e.g. KVM
+ * doesn't intercept #UD or #GP for SEV guests, but this path is still
+ * theoretically reachable, e.g. via unaccelerated fault-like AVIC
+ * access, and needs to be handled by KVM to avoid putting the guest
+ * into an infinite loop. Injecting #UD is somewhat arbitrary, but
+ * its the least awful option given lack of insight into the guest.
+ *
+ * If KVM is trying to skip an instruction, simply resume the guest.
+ * If a #NPF occurs while the guest is vectoring an INT3/INTO, then KVM
+ * will attempt to re-inject the INT3/INTO and skip the instruction.
+ * In that scenario, retrying the INT3/INTO and hoping the guest will
+ * make forward progress is the only option that has a chance of
+ * success (and in practice it will work the vast majority of the time).
+ */
+ if (unlikely(!insn)) {
+ if (!(emul_type & EMULTYPE_SKIP))
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return false;
+ }
+
+ /*
+ * Emulate for SEV guests if the insn buffer is not empty. The buffer
+ * will be empty if the DecodeAssist microcode cannot fetch bytes for
+ * the faulting instruction because the code fetch itself faulted, e.g.
+ * the guest attempted to fetch from emulated MMIO or a guest page
+ * table used to translate CS:RIP resides in emulated MMIO.
+ */
+ if (likely(insn_len))
+ return true;
+
+ /*
+ * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
+ *
+ * Errata:
+ * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
+ * possible that CPU microcode implementing DecodeAssist will fail to
+ * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
+ * be '0'. This happens because microcode reads CS:RIP using a _data_
+ * loap uop with CPL=0 privileges. If the load hits a SMAP #PF, ucode
+ * gives up and does not fill the instruction bytes buffer.
+ *
+ * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
+ * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
+ * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
+ * GuestIntrBytes field of the VMCB.
+ *
+ * This does _not_ mean that the erratum has been encountered, as the
+ * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
+ * #PF, e.g. if the guest attempt to execute from emulated MMIO and
+ * encountered a reserved/not-present #PF.
+ *
+ * To hit the erratum, the following conditions must be true:
+ * 1. CR4.SMAP=1 (obviously).
+ * 2. CR4.SMEP=0 || CPL=3. If SMEP=1 and CPL<3, the erratum cannot
+ * have been hit as the guest would have encountered a SMEP
+ * violation #PF, not a #NPF.
+ * 3. The #NPF is not due to a code fetch, in which case failure to
+ * retrieve the instruction bytes is legitimate (see abvoe).
+ *
+ * In addition, don't apply the erratum workaround if the #NPF occurred
+ * while translating guest page tables (see below).
+ */
+ error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
+ if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
+ goto resume_guest;
+
+ cr4 = kvm_read_cr4(vcpu);
+ smep = cr4 & X86_CR4_SMEP;
+ smap = cr4 & X86_CR4_SMAP;
+ is_user = svm_get_cpl(vcpu) == 3;
+ if (smap && (!smep || is_user)) {
+ pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
+
+ /*
+ * If the fault occurred in userspace, arbitrarily inject #GP
+ * to avoid killing the guest and to hopefully avoid confusing
+ * the guest kernel too much, e.g. injecting #PF would not be
+ * coherent with respect to the guest's page tables. Request
+ * triple fault if the fault occurred in the kernel as there's
+ * no fault that KVM can inject without confusing the guest.
+ * In practice, the triple fault is moot as no sane SEV kernel
+ * will execute from user memory while also running with SMAP=1.
+ */
+ if (is_user)
+ kvm_inject_gp(vcpu, 0);
+ else
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ }
+
+resume_guest:
+ /*
+ * If the erratum was not hit, simply resume the guest and let it fault
+ * again. While awful, e.g. the vCPU may get stuck in an infinite loop
+ * if the fault is at CPL=0, it's the lesser of all evils. Exiting to
+ * userspace will kill the guest, and letting the emulator read garbage
+ * will yield random behavior and potentially corrupt the guest.
+ *
+ * Simply resuming the guest is technically not a violation of the SEV
+ * architecture. AMD's APM states that all code fetches and page table
+ * accesses for SEV guest are encrypted, regardless of the C-Bit. The
+ * APM also states that encrypted accesses to MMIO are "ignored", but
+ * doesn't explicitly define "ignored", i.e. doing nothing and letting
+ * the guest spin is technically "ignoring" the access.
+ */
+ return false;
+}
+
+static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return !gif_set(svm);
+}
+
+static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
+{
+ if (!sev_es_guest(vcpu->kvm))
+ return kvm_vcpu_deliver_sipi_vector(vcpu, vector);
+
+ sev_vcpu_deliver_sipi_vector(vcpu, vector);
+}
+
+static void svm_vm_destroy(struct kvm *kvm)
+{
+ avic_vm_destroy(kvm);
+ sev_vm_destroy(kvm);
+}
+
+static int svm_vm_init(struct kvm *kvm)
+{
+ if (!pause_filter_count || !pause_filter_thresh)
+ kvm->arch.pause_in_guest = true;
+
+ if (enable_apicv) {
+ int ret = avic_vm_init(kvm);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static struct kvm_x86_ops svm_x86_ops __initdata = {
+ .name = "kvm_amd",
+
+ .hardware_unsetup = svm_hardware_unsetup,
+ .hardware_enable = svm_hardware_enable,
+ .hardware_disable = svm_hardware_disable,
+ .has_emulated_msr = svm_has_emulated_msr,
+
+ .vcpu_create = svm_vcpu_create,
+ .vcpu_free = svm_vcpu_free,
+ .vcpu_reset = svm_vcpu_reset,
+
+ .vm_size = sizeof(struct kvm_svm),
+ .vm_init = svm_vm_init,
+ .vm_destroy = svm_vm_destroy,
+
+ .prepare_switch_to_guest = svm_prepare_switch_to_guest,
+ .vcpu_load = svm_vcpu_load,
+ .vcpu_put = svm_vcpu_put,
+ .vcpu_blocking = avic_vcpu_blocking,
+ .vcpu_unblocking = avic_vcpu_unblocking,
+
+ .update_exception_bitmap = svm_update_exception_bitmap,
+ .get_msr_feature = svm_get_msr_feature,
+ .get_msr = svm_get_msr,
+ .set_msr = svm_set_msr,
+ .get_segment_base = svm_get_segment_base,
+ .get_segment = svm_get_segment,
+ .set_segment = svm_set_segment,
+ .get_cpl = svm_get_cpl,
+ .get_cs_db_l_bits = svm_get_cs_db_l_bits,
+ .is_valid_cr0 = svm_is_valid_cr0,
+ .set_cr0 = svm_set_cr0,
+ .post_set_cr3 = sev_post_set_cr3,
+ .is_valid_cr4 = svm_is_valid_cr4,
+ .set_cr4 = svm_set_cr4,
+ .set_efer = svm_set_efer,
+ .get_idt = svm_get_idt,
+ .set_idt = svm_set_idt,
+ .get_gdt = svm_get_gdt,
+ .set_gdt = svm_set_gdt,
+ .set_dr7 = svm_set_dr7,
+ .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
+ .cache_reg = svm_cache_reg,
+ .get_rflags = svm_get_rflags,
+ .set_rflags = svm_set_rflags,
+ .get_if_flag = svm_get_if_flag,
+
+ .flush_tlb_all = svm_flush_tlb_all,
+ .flush_tlb_current = svm_flush_tlb_current,
+ .flush_tlb_gva = svm_flush_tlb_gva,
+ .flush_tlb_guest = svm_flush_tlb_asid,
+
+ .vcpu_pre_run = svm_vcpu_pre_run,
+ .vcpu_run = svm_vcpu_run,
+ .handle_exit = svm_handle_exit,
+ .skip_emulated_instruction = svm_skip_emulated_instruction,
+ .update_emulated_instruction = NULL,
+ .set_interrupt_shadow = svm_set_interrupt_shadow,
+ .get_interrupt_shadow = svm_get_interrupt_shadow,
+ .patch_hypercall = svm_patch_hypercall,
+ .inject_irq = svm_inject_irq,
+ .inject_nmi = svm_inject_nmi,
+ .inject_exception = svm_inject_exception,
+ .cancel_injection = svm_cancel_injection,
+ .interrupt_allowed = svm_interrupt_allowed,
+ .nmi_allowed = svm_nmi_allowed,
+ .get_nmi_mask = svm_get_nmi_mask,
+ .set_nmi_mask = svm_set_nmi_mask,
+ .enable_nmi_window = svm_enable_nmi_window,
+ .enable_irq_window = svm_enable_irq_window,
+ .update_cr8_intercept = svm_update_cr8_intercept,
+ .set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
+ .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
+ .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
+ .apicv_post_state_restore = avic_apicv_post_state_restore,
+
+ .get_exit_info = svm_get_exit_info,
+
+ .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
+
+ .has_wbinvd_exit = svm_has_wbinvd_exit,
+
+ .get_l2_tsc_offset = svm_get_l2_tsc_offset,
+ .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
+ .write_tsc_offset = svm_write_tsc_offset,
+ .write_tsc_multiplier = svm_write_tsc_multiplier,
+
+ .load_mmu_pgd = svm_load_mmu_pgd,
+
+ .check_intercept = svm_check_intercept,
+ .handle_exit_irqoff = svm_handle_exit_irqoff,
+
+ .request_immediate_exit = __kvm_request_immediate_exit,
+
+ .sched_in = svm_sched_in,
+
+ .nested_ops = &svm_nested_ops,
+
+ .deliver_interrupt = svm_deliver_interrupt,
+ .pi_update_irte = avic_pi_update_irte,
+ .setup_mce = svm_setup_mce,
+
+ .smi_allowed = svm_smi_allowed,
+ .enter_smm = svm_enter_smm,
+ .leave_smm = svm_leave_smm,
+ .enable_smi_window = svm_enable_smi_window,
+
+ .mem_enc_ioctl = sev_mem_enc_ioctl,
+ .mem_enc_register_region = sev_mem_enc_register_region,
+ .mem_enc_unregister_region = sev_mem_enc_unregister_region,
+ .guest_memory_reclaimed = sev_guest_memory_reclaimed,
+
+ .vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
+ .vm_move_enc_context_from = sev_vm_move_enc_context_from,
+
+ .can_emulate_instruction = svm_can_emulate_instruction,
+
+ .apic_init_signal_blocked = svm_apic_init_signal_blocked,
+
+ .msr_filter_changed = svm_msr_filter_changed,
+ .complete_emulated_msr = svm_complete_emulated_msr,
+
+ .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
+ .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
+};
+
+/*
+ * The default MMIO mask is a single bit (excluding the present bit),
+ * which could conflict with the memory encryption bit. Check for
+ * memory encryption support and override the default MMIO mask if
+ * memory encryption is enabled.
+ */
+static __init void svm_adjust_mmio_mask(void)
+{
+ unsigned int enc_bit, mask_bit;
+ u64 msr, mask;
+
+ /* If there is no memory encryption support, use existing mask */
+ if (cpuid_eax(0x80000000) < 0x8000001f)
+ return;
+
+ /* If memory encryption is not enabled, use existing mask */
+ rdmsrl(MSR_AMD64_SYSCFG, msr);
+ if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
+ return;
+
+ enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
+ mask_bit = boot_cpu_data.x86_phys_bits;
+
+ /* Increment the mask bit if it is the same as the encryption bit */
+ if (enc_bit == mask_bit)
+ mask_bit++;
+
+ /*
+ * If the mask bit location is below 52, then some bits above the
+ * physical addressing limit will always be reserved, so use the
+ * rsvd_bits() function to generate the mask. This mask, along with
+ * the present bit, will be used to generate a page fault with
+ * PFER.RSV = 1.
+ *
+ * If the mask bit location is 52 (or above), then clear the mask.
+ */
+ mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
+
+ kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
+}
+
+static __init void svm_set_cpu_caps(void)
+{
+ kvm_set_cpu_caps();
+
+ kvm_caps.supported_perf_cap = 0;
+ kvm_caps.supported_xss = 0;
+
+ /* CPUID 0x80000001 and 0x8000000A (SVM features) */
+ if (nested) {
+ kvm_cpu_cap_set(X86_FEATURE_SVM);
+ kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
+
+ if (nrips)
+ kvm_cpu_cap_set(X86_FEATURE_NRIPS);
+
+ if (npt_enabled)
+ kvm_cpu_cap_set(X86_FEATURE_NPT);
+
+ if (tsc_scaling)
+ kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
+
+ if (vls)
+ kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
+ if (lbrv)
+ kvm_cpu_cap_set(X86_FEATURE_LBRV);
+
+ if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
+ kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
+
+ if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
+ kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
+
+ if (vgif)
+ kvm_cpu_cap_set(X86_FEATURE_VGIF);
+
+ /* Nested VM can receive #VMEXIT instead of triggering #GP */
+ kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
+ }
+
+ /* CPUID 0x80000008 */
+ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
+ boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
+
+ /* AMD PMU PERFCTR_CORE CPUID */
+ if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
+ kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
+
+ /* CPUID 0x8000001F (SME/SEV features) */
+ sev_set_cpu_caps();
+}
+
+static __init int svm_hardware_setup(void)
+{
+ int cpu;
+ struct page *iopm_pages;
+ void *iopm_va;
+ int r;
+ unsigned int order = get_order(IOPM_SIZE);
+
+ /*
+ * NX is required for shadow paging and for NPT if the NX huge pages
+ * mitigation is enabled.
+ */
+ if (!boot_cpu_has(X86_FEATURE_NX)) {
+ pr_err_ratelimited("NX (Execute Disable) not supported\n");
+ return -EOPNOTSUPP;
+ }
+ kvm_enable_efer_bits(EFER_NX);
+
+ iopm_pages = alloc_pages(GFP_KERNEL, order);
+
+ if (!iopm_pages)
+ return -ENOMEM;
+
+ iopm_va = page_address(iopm_pages);
+ memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
+ iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
+
+ init_msrpm_offsets();
+
+ kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
+ XFEATURE_MASK_BNDCSR);
+
+ if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
+ kvm_enable_efer_bits(EFER_FFXSR);
+
+ if (tsc_scaling) {
+ if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ tsc_scaling = false;
+ } else {
+ pr_info("TSC scaling supported\n");
+ kvm_caps.has_tsc_control = true;
+ }
+ }
+ kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
+ kvm_caps.tsc_scaling_ratio_frac_bits = 32;
+
+ tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
+
+ /* Check for pause filtering support */
+ if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
+ pause_filter_count = 0;
+ pause_filter_thresh = 0;
+ } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
+ pause_filter_thresh = 0;
+ }
+
+ if (nested) {
+ printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
+ kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
+ }
+
+ /*
+ * KVM's MMU doesn't support using 2-level paging for itself, and thus
+ * NPT isn't supported if the host is using 2-level paging since host
+ * CR4 is unchanged on VMRUN.
+ */
+ if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
+ npt_enabled = false;
+
+ if (!boot_cpu_has(X86_FEATURE_NPT))
+ npt_enabled = false;
+
+ /* Force VM NPT level equal to the host's paging level */
+ kvm_configure_mmu(npt_enabled, get_npt_level(),
+ get_npt_level(), PG_LEVEL_1G);
+ pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
+
+ /* Setup shadow_me_value and shadow_me_mask */
+ kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
+
+ svm_adjust_mmio_mask();
+
+ /*
+ * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
+ * may be modified by svm_adjust_mmio_mask()).
+ */
+ sev_hardware_setup();
+
+ svm_hv_hardware_setup();
+
+ for_each_possible_cpu(cpu) {
+ r = svm_cpu_init(cpu);
+ if (r)
+ goto err;
+ }
+
+ if (nrips) {
+ if (!boot_cpu_has(X86_FEATURE_NRIPS))
+ nrips = false;
+ }
+
+ enable_apicv = avic = avic && avic_hardware_setup(&svm_x86_ops);
+
+ if (!enable_apicv) {
+ svm_x86_ops.vcpu_blocking = NULL;
+ svm_x86_ops.vcpu_unblocking = NULL;
+ svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
+ }
+
+ if (vls) {
+ if (!npt_enabled ||
+ !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
+ !IS_ENABLED(CONFIG_X86_64)) {
+ vls = false;
+ } else {
+ pr_info("Virtual VMLOAD VMSAVE supported\n");
+ }
+ }
+
+ if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
+ svm_gp_erratum_intercept = false;
+
+ if (vgif) {
+ if (!boot_cpu_has(X86_FEATURE_VGIF))
+ vgif = false;
+ else
+ pr_info("Virtual GIF supported\n");
+ }
+
+ if (lbrv) {
+ if (!boot_cpu_has(X86_FEATURE_LBRV))
+ lbrv = false;
+ else
+ pr_info("LBR virtualization supported\n");
+ }
+
+ if (!enable_pmu)
+ pr_info("PMU virtualization is disabled\n");
+
+ svm_set_cpu_caps();
+
+ /*
+ * It seems that on AMD processors PTE's accessed bit is
+ * being set by the CPU hardware before the NPF vmexit.
+ * This is not expected behaviour and our tests fail because
+ * of it.
+ * A workaround here is to disable support for
+ * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
+ * In this case userspace can know if there is support using
+ * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
+ * it
+ * If future AMD CPU models change the behaviour described above,
+ * this variable can be changed accordingly
+ */
+ allow_smaller_maxphyaddr = !npt_enabled;
+
+ return 0;
+
+err:
+ svm_hardware_unsetup();
+ return r;
+}
+
+
+static struct kvm_x86_init_ops svm_init_ops __initdata = {
+ .cpu_has_kvm_support = has_svm,
+ .disabled_by_bios = is_disabled,
+ .hardware_setup = svm_hardware_setup,
+ .check_processor_compatibility = svm_check_processor_compat,
+
+ .runtime_ops = &svm_x86_ops,
+ .pmu_ops = &amd_pmu_ops,
+};
+
+static int __init svm_init(void)
+{
+ int r;
+
+ __unused_size_checks();
+
+ r = kvm_x86_vendor_init(&svm_init_ops);
+ if (r)
+ return r;
+
+ /*
+ * Common KVM initialization _must_ come last, after this, /dev/kvm is
+ * exposed to userspace!
+ */
+ r = kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
+ __alignof__(struct vcpu_svm), THIS_MODULE);
+ if (r)
+ goto err_kvm_init;
+
+ return 0;
+
+err_kvm_init:
+ kvm_x86_vendor_exit();
+ return r;
+}
+
+static void __exit svm_exit(void)
+{
+ kvm_exit();
+ kvm_x86_vendor_exit();
+}
+
+module_init(svm_init)
+module_exit(svm_exit)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
new file mode 100644
index 000000000..4cb142590
--- /dev/null
+++ b/arch/x86/kvm/svm/svm.h
@@ -0,0 +1,718 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ */
+
+#ifndef __SVM_SVM_H
+#define __SVM_SVM_H
+
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+#include <linux/bits.h>
+
+#include <asm/svm.h>
+#include <asm/sev-common.h>
+
+#include "kvm_cache_regs.h"
+
+#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
+
+#define IOPM_SIZE PAGE_SIZE * 3
+#define MSRPM_SIZE PAGE_SIZE * 2
+
+#define MAX_DIRECT_ACCESS_MSRS 46
+#define MSRPM_OFFSETS 32
+extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
+extern bool npt_enabled;
+extern int vgif;
+extern bool intercept_smi;
+
+enum avic_modes {
+ AVIC_MODE_NONE = 0,
+ AVIC_MODE_X1,
+ AVIC_MODE_X2,
+};
+
+extern enum avic_modes avic_mode;
+
+/*
+ * Clean bits in VMCB.
+ * VMCB_ALL_CLEAN_MASK might also need to
+ * be updated if this enum is modified.
+ */
+enum {
+ VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
+ pause filter count */
+ VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */
+ VMCB_ASID, /* ASID */
+ VMCB_INTR, /* int_ctl, int_vector */
+ VMCB_NPT, /* npt_en, nCR3, gPAT */
+ VMCB_CR, /* CR0, CR3, CR4, EFER */
+ VMCB_DR, /* DR6, DR7 */
+ VMCB_DT, /* GDT, IDT */
+ VMCB_SEG, /* CS, DS, SS, ES, CPL */
+ VMCB_CR2, /* CR2 only */
+ VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
+ VMCB_AVIC, /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE,
+ * AVIC PHYSICAL_TABLE pointer,
+ * AVIC LOGICAL_TABLE pointer
+ */
+ VMCB_SW = 31, /* Reserved for hypervisor/software use */
+};
+
+#define VMCB_ALL_CLEAN_MASK ( \
+ (1U << VMCB_INTERCEPTS) | (1U << VMCB_PERM_MAP) | \
+ (1U << VMCB_ASID) | (1U << VMCB_INTR) | \
+ (1U << VMCB_NPT) | (1U << VMCB_CR) | (1U << VMCB_DR) | \
+ (1U << VMCB_DT) | (1U << VMCB_SEG) | (1U << VMCB_CR2) | \
+ (1U << VMCB_LBR) | (1U << VMCB_AVIC) | \
+ (1U << VMCB_SW))
+
+/* TPR and CR2 are always written before VMRUN */
+#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
+
+struct kvm_sev_info {
+ bool active; /* SEV enabled guest */
+ bool es_active; /* SEV-ES enabled guest */
+ unsigned int asid; /* ASID used for this guest */
+ unsigned int handle; /* SEV firmware handle */
+ int fd; /* SEV device fd */
+ unsigned long pages_locked; /* Number of pages locked */
+ struct list_head regions_list; /* List of registered regions */
+ u64 ap_jump_table; /* SEV-ES AP Jump Table address */
+ struct kvm *enc_context_owner; /* Owner of copied encryption context */
+ struct list_head mirror_vms; /* List of VMs mirroring */
+ struct list_head mirror_entry; /* Use as a list entry of mirrors */
+ struct misc_cg *misc_cg; /* For misc cgroup accounting */
+ atomic_t migration_in_progress;
+};
+
+struct kvm_svm {
+ struct kvm kvm;
+
+ /* Struct members for AVIC */
+ u32 avic_vm_id;
+ struct page *avic_logical_id_table_page;
+ struct page *avic_physical_id_table_page;
+ struct hlist_node hnode;
+
+ struct kvm_sev_info sev_info;
+};
+
+struct kvm_vcpu;
+
+struct kvm_vmcb_info {
+ struct vmcb *ptr;
+ unsigned long pa;
+ int cpu;
+ uint64_t asid_generation;
+};
+
+struct vmcb_save_area_cached {
+ u64 efer;
+ u64 cr4;
+ u64 cr3;
+ u64 cr0;
+ u64 dr7;
+ u64 dr6;
+};
+
+struct vmcb_ctrl_area_cached {
+ u32 intercepts[MAX_INTERCEPT];
+ u16 pause_filter_thresh;
+ u16 pause_filter_count;
+ u64 iopm_base_pa;
+ u64 msrpm_base_pa;
+ u64 tsc_offset;
+ u32 asid;
+ u8 tlb_ctl;
+ u32 int_ctl;
+ u32 int_vector;
+ u32 int_state;
+ u32 exit_code;
+ u32 exit_code_hi;
+ u64 exit_info_1;
+ u64 exit_info_2;
+ u32 exit_int_info;
+ u32 exit_int_info_err;
+ u64 nested_ctl;
+ u32 event_inj;
+ u32 event_inj_err;
+ u64 next_rip;
+ u64 nested_cr3;
+ u64 virt_ext;
+ u32 clean;
+ union {
+ struct hv_vmcb_enlightenments hv_enlightenments;
+ u8 reserved_sw[32];
+ };
+};
+
+struct svm_nested_state {
+ struct kvm_vmcb_info vmcb02;
+ u64 hsave_msr;
+ u64 vm_cr_msr;
+ u64 vmcb12_gpa;
+ u64 last_vmcb12_gpa;
+
+ /* These are the merged vectors */
+ u32 *msrpm;
+
+ /* A VMRUN has started but has not yet been performed, so
+ * we cannot inject a nested vmexit yet. */
+ bool nested_run_pending;
+
+ /* cache for control fields of the guest */
+ struct vmcb_ctrl_area_cached ctl;
+
+ /*
+ * Note: this struct is not kept up-to-date while L2 runs; it is only
+ * valid within nested_svm_vmrun.
+ */
+ struct vmcb_save_area_cached save;
+
+ bool initialized;
+
+ /*
+ * Indicates whether MSR bitmap for L2 needs to be rebuilt due to
+ * changes in MSR bitmap for L1 or switching to a different L2. Note,
+ * this flag can only be used reliably in conjunction with a paravirt L1
+ * which informs L0 whether any changes to MSR bitmap for L2 were done
+ * on its side.
+ */
+ bool force_msr_bitmap_recalc;
+};
+
+struct vcpu_sev_es_state {
+ /* SEV-ES support */
+ struct sev_es_save_area *vmsa;
+ struct ghcb *ghcb;
+ u8 valid_bitmap[16];
+ struct kvm_host_map ghcb_map;
+ bool received_first_sipi;
+
+ /* SEV-ES scratch area support */
+ u64 sw_scratch;
+ void *ghcb_sa;
+ u32 ghcb_sa_len;
+ bool ghcb_sa_sync;
+ bool ghcb_sa_free;
+};
+
+struct vcpu_svm {
+ struct kvm_vcpu vcpu;
+ /* vmcb always points at current_vmcb->ptr, it's purely a shorthand. */
+ struct vmcb *vmcb;
+ struct kvm_vmcb_info vmcb01;
+ struct kvm_vmcb_info *current_vmcb;
+ u32 asid;
+ u32 sysenter_esp_hi;
+ u32 sysenter_eip_hi;
+ uint64_t tsc_aux;
+
+ u64 msr_decfg;
+
+ u64 next_rip;
+
+ u64 spec_ctrl;
+
+ u64 tsc_ratio_msr;
+ /*
+ * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be
+ * translated into the appropriate L2_CFG bits on the host to
+ * perform speculative control.
+ */
+ u64 virt_spec_ctrl;
+
+ u32 *msrpm;
+
+ ulong nmi_iret_rip;
+
+ struct svm_nested_state nested;
+
+ bool nmi_singlestep;
+ u64 nmi_singlestep_guest_rflags;
+ bool nmi_l1_to_l2;
+
+ unsigned long soft_int_csbase;
+ unsigned long soft_int_old_rip;
+ unsigned long soft_int_next_rip;
+ bool soft_int_injected;
+
+ /* optional nested SVM features that are enabled for this guest */
+ bool nrips_enabled : 1;
+ bool tsc_scaling_enabled : 1;
+ bool v_vmload_vmsave_enabled : 1;
+ bool lbrv_enabled : 1;
+ bool pause_filter_enabled : 1;
+ bool pause_threshold_enabled : 1;
+ bool vgif_enabled : 1;
+
+ u32 ldr_reg;
+ u32 dfr_reg;
+ struct page *avic_backing_page;
+ u64 *avic_physical_id_cache;
+
+ /*
+ * Per-vcpu list of struct amd_svm_iommu_ir:
+ * This is used mainly to store interrupt remapping information used
+ * when update the vcpu affinity. This avoids the need to scan for
+ * IRTE and try to match ga_tag in the IOMMU driver.
+ */
+ struct list_head ir_list;
+ spinlock_t ir_list_lock;
+
+ /* Save desired MSR intercept (read: pass-through) state */
+ struct {
+ DECLARE_BITMAP(read, MAX_DIRECT_ACCESS_MSRS);
+ DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS);
+ } shadow_msr_intercept;
+
+ struct vcpu_sev_es_state sev_es;
+
+ bool guest_state_loaded;
+
+ bool x2avic_msrs_intercepted;
+};
+
+struct svm_cpu_data {
+ u64 asid_generation;
+ u32 max_asid;
+ u32 next_asid;
+ u32 min_asid;
+ struct kvm_ldttss_desc *tss_desc;
+
+ struct page *save_area;
+ unsigned long save_area_pa;
+
+ struct vmcb *current_vmcb;
+
+ /* index = sev_asid, value = vmcb pointer */
+ struct vmcb **sev_vmcbs;
+};
+
+DECLARE_PER_CPU(struct svm_cpu_data, svm_data);
+
+void recalc_intercepts(struct vcpu_svm *svm);
+
+static __always_inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
+{
+ return container_of(kvm, struct kvm_svm, kvm);
+}
+
+static __always_inline bool sev_guest(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_AMD_SEV
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return sev->active;
+#else
+ return false;
+#endif
+}
+
+static __always_inline bool sev_es_guest(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_AMD_SEV
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return sev->es_active && !WARN_ON_ONCE(!sev->active);
+#else
+ return false;
+#endif
+}
+
+static inline void vmcb_mark_all_dirty(struct vmcb *vmcb)
+{
+ vmcb->control.clean = 0;
+}
+
+static inline void vmcb_mark_all_clean(struct vmcb *vmcb)
+{
+ vmcb->control.clean = VMCB_ALL_CLEAN_MASK
+ & ~VMCB_ALWAYS_DIRTY_MASK;
+}
+
+static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit)
+{
+ vmcb->control.clean &= ~(1 << bit);
+}
+
+static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit)
+{
+ return !test_bit(bit, (unsigned long *)&vmcb->control.clean);
+}
+
+static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
+{
+ return container_of(vcpu, struct vcpu_svm, vcpu);
+}
+
+/*
+ * Only the PDPTRs are loaded on demand into the shadow MMU. All other
+ * fields are synchronized on VM-Exit, because accessing the VMCB is cheap.
+ *
+ * CR3 might be out of date in the VMCB but it is not marked dirty; instead,
+ * KVM_REQ_LOAD_MMU_PGD is always requested when the cached vcpu->arch.cr3
+ * is changed. svm_load_mmu_pgd() then syncs the new CR3 value into the VMCB.
+ */
+#define SVM_REGS_LAZY_LOAD_SET (1 << VCPU_EXREG_PDPTR)
+
+static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit)
+{
+ WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+ __set_bit(bit, (unsigned long *)&control->intercepts);
+}
+
+static inline void vmcb_clr_intercept(struct vmcb_control_area *control, u32 bit)
+{
+ WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+ __clear_bit(bit, (unsigned long *)&control->intercepts);
+}
+
+static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit)
+{
+ WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+ return test_bit(bit, (unsigned long *)&control->intercepts);
+}
+
+static inline bool vmcb12_is_intercept(struct vmcb_ctrl_area_cached *control, u32 bit)
+{
+ WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+ return test_bit(bit, (unsigned long *)&control->intercepts);
+}
+
+static inline void set_dr_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ if (!sev_es_guest(svm->vcpu.kvm)) {
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
+ }
+
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_dr_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb->control.intercepts[INTERCEPT_DR] = 0;
+
+ /* DR7 access must remain intercepted for an SEV-ES guest */
+ if (sev_es_guest(svm->vcpu.kvm)) {
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
+ }
+
+ recalc_intercepts(svm);
+}
+
+static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ WARN_ON_ONCE(bit >= 32);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ WARN_ON_ONCE(bit >= 32);
+ vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb_set_intercept(&vmcb->control, bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb_clr_intercept(&vmcb->control, bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
+{
+ return vmcb_is_intercept(&svm->vmcb->control, bit);
+}
+
+static inline bool nested_vgif_enabled(struct vcpu_svm *svm)
+{
+ return svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK);
+}
+
+static inline struct vmcb *get_vgif_vmcb(struct vcpu_svm *svm)
+{
+ if (!vgif)
+ return NULL;
+
+ if (is_guest_mode(&svm->vcpu) && !nested_vgif_enabled(svm))
+ return svm->nested.vmcb02.ptr;
+ else
+ return svm->vmcb01.ptr;
+}
+
+static inline void enable_gif(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = get_vgif_vmcb(svm);
+
+ if (vmcb)
+ vmcb->control.int_ctl |= V_GIF_MASK;
+ else
+ svm->vcpu.arch.hflags |= HF_GIF_MASK;
+}
+
+static inline void disable_gif(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = get_vgif_vmcb(svm);
+
+ if (vmcb)
+ vmcb->control.int_ctl &= ~V_GIF_MASK;
+ else
+ svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
+}
+
+static inline bool gif_set(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = get_vgif_vmcb(svm);
+
+ if (vmcb)
+ return !!(vmcb->control.int_ctl & V_GIF_MASK);
+ else
+ return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
+}
+
+static inline bool nested_npt_enabled(struct vcpu_svm *svm)
+{
+ return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
+}
+
+static inline bool is_x2apic_msrpm_offset(u32 offset)
+{
+ /* 4 msrs per u8, and 4 u8 in u32 */
+ u32 msr = offset * 16;
+
+ return (msr >= APIC_BASE_MSR) &&
+ (msr < (APIC_BASE_MSR + 0x100));
+}
+
+/* svm.c */
+#define MSR_INVALID 0xffffffffU
+
+#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
+
+extern bool dump_invalid_vmcb;
+
+u32 svm_msrpm_offset(u32 msr);
+u32 *svm_vcpu_alloc_msrpm(void);
+void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm);
+void svm_vcpu_free_msrpm(u32 *msrpm);
+void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
+void svm_update_lbrv(struct kvm_vcpu *vcpu);
+
+int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
+void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+void disable_nmi_singlestep(struct vcpu_svm *svm);
+bool svm_smi_blocked(struct kvm_vcpu *vcpu);
+bool svm_nmi_blocked(struct kvm_vcpu *vcpu);
+bool svm_interrupt_blocked(struct kvm_vcpu *vcpu);
+void svm_set_gif(struct vcpu_svm *svm, bool value);
+int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code);
+void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
+ int read, int write);
+void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool disable);
+void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
+ int trig_mode, int vec);
+
+/* nested.c */
+
+#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
+#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
+#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */
+
+static inline bool nested_svm_virtualize_tpr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return is_guest_mode(vcpu) && (svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK);
+}
+
+static inline bool nested_exit_on_smi(struct vcpu_svm *svm)
+{
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SMI);
+}
+
+static inline bool nested_exit_on_intr(struct vcpu_svm *svm)
+{
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INTR);
+}
+
+static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
+{
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
+}
+
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu,
+ u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun);
+void svm_leave_nested(struct kvm_vcpu *vcpu);
+void svm_free_nested(struct vcpu_svm *svm);
+int svm_allocate_nested(struct vcpu_svm *svm);
+int nested_svm_vmrun(struct kvm_vcpu *vcpu);
+void svm_copy_vmrun_state(struct vmcb_save_area *to_save,
+ struct vmcb_save_area *from_save);
+void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
+int nested_svm_vmexit(struct vcpu_svm *svm);
+
+static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code)
+{
+ svm->vmcb->control.exit_code = exit_code;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+ return nested_svm_vmexit(svm);
+}
+
+int nested_svm_exit_handled(struct vcpu_svm *svm);
+int nested_svm_check_permissions(struct kvm_vcpu *vcpu);
+int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
+ bool has_error_code, u32 error_code);
+int nested_svm_exit_special(struct vcpu_svm *svm);
+void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu);
+void __svm_write_tsc_multiplier(u64 multiplier);
+void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
+ struct vmcb_control_area *control);
+void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
+ struct vmcb_save_area *save);
+void nested_sync_control_from_vmcb02(struct vcpu_svm *svm);
+void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm);
+void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb);
+
+extern struct kvm_x86_nested_ops svm_nested_ops;
+
+/* avic.c */
+
+bool avic_hardware_setup(struct kvm_x86_ops *ops);
+int avic_ga_log_notifier(u32 ga_tag);
+void avic_vm_destroy(struct kvm *kvm);
+int avic_vm_init(struct kvm *kvm);
+void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);
+int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
+int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
+int avic_init_vcpu(struct vcpu_svm *svm);
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void avic_vcpu_put(struct kvm_vcpu *vcpu);
+void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
+void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
+bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason);
+int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set);
+void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
+void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
+void avic_ring_doorbell(struct kvm_vcpu *vcpu);
+unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu);
+void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu);
+
+
+/* sev.c */
+
+#define GHCB_VERSION_MAX 1ULL
+#define GHCB_VERSION_MIN 1ULL
+
+
+extern unsigned int max_sev_asid;
+
+void sev_vm_destroy(struct kvm *kvm);
+int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp);
+int sev_mem_enc_register_region(struct kvm *kvm,
+ struct kvm_enc_region *range);
+int sev_mem_enc_unregister_region(struct kvm *kvm,
+ struct kvm_enc_region *range);
+int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd);
+int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
+void sev_guest_memory_reclaimed(struct kvm *kvm);
+
+void pre_sev_run(struct vcpu_svm *svm, int cpu);
+void __init sev_set_cpu_caps(void);
+void __init sev_hardware_setup(void);
+void sev_hardware_unsetup(void);
+int sev_cpu_init(struct svm_cpu_data *sd);
+void sev_init_vmcb(struct vcpu_svm *svm);
+void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm);
+void sev_free_vcpu(struct kvm_vcpu *vcpu);
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
+int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
+void sev_es_vcpu_reset(struct vcpu_svm *svm);
+void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
+void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa);
+void sev_es_unmap_ghcb(struct vcpu_svm *svm);
+
+/* vmenter.S */
+
+void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
+void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
+
+#define DEFINE_KVM_GHCB_ACCESSORS(field) \
+ static __always_inline bool kvm_ghcb_##field##_is_valid(const struct vcpu_svm *svm) \
+ { \
+ return test_bit(GHCB_BITMAP_IDX(field), \
+ (unsigned long *)&svm->sev_es.valid_bitmap); \
+ } \
+ \
+ static __always_inline u64 kvm_ghcb_get_##field##_if_valid(struct vcpu_svm *svm, struct ghcb *ghcb) \
+ { \
+ return kvm_ghcb_##field##_is_valid(svm) ? ghcb->save.field : 0; \
+ } \
+
+DEFINE_KVM_GHCB_ACCESSORS(cpl)
+DEFINE_KVM_GHCB_ACCESSORS(rax)
+DEFINE_KVM_GHCB_ACCESSORS(rcx)
+DEFINE_KVM_GHCB_ACCESSORS(rdx)
+DEFINE_KVM_GHCB_ACCESSORS(rbx)
+DEFINE_KVM_GHCB_ACCESSORS(rsi)
+DEFINE_KVM_GHCB_ACCESSORS(sw_exit_code)
+DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_1)
+DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_2)
+DEFINE_KVM_GHCB_ACCESSORS(sw_scratch)
+DEFINE_KVM_GHCB_ACCESSORS(xcr0)
+
+#endif
diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c
new file mode 100644
index 000000000..52c73a8be
--- /dev/null
+++ b/arch/x86/kvm/svm/svm_onhyperv.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM L1 hypervisor optimizations on Hyper-V for SVM.
+ */
+
+#include <linux/kvm_host.h>
+
+#include <asm/mshyperv.h>
+
+#include "svm.h"
+#include "svm_ops.h"
+
+#include "hyperv.h"
+#include "kvm_onhyperv.h"
+#include "svm_onhyperv.h"
+
+int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
+{
+ struct hv_vmcb_enlightenments *hve;
+ struct hv_partition_assist_pg **p_hv_pa_pg =
+ &to_kvm_hv(vcpu->kvm)->hv_pa_pg;
+
+ if (!*p_hv_pa_pg)
+ *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
+
+ if (!*p_hv_pa_pg)
+ return -ENOMEM;
+
+ hve = &to_svm(vcpu)->vmcb->control.hv_enlightenments;
+
+ hve->partition_assist_page = __pa(*p_hv_pa_pg);
+ hve->hv_vm_id = (unsigned long)vcpu->kvm;
+ if (!hve->hv_enlightenments_control.nested_flush_hypercall) {
+ hve->hv_enlightenments_control.nested_flush_hypercall = 1;
+ vmcb_mark_dirty(to_svm(vcpu)->vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS);
+ }
+
+ return 0;
+}
+
diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
new file mode 100644
index 000000000..9a6a34149
--- /dev/null
+++ b/arch/x86/kvm/svm/svm_onhyperv.h
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * KVM L1 hypervisor optimizations on Hyper-V for SVM.
+ */
+
+#ifndef __ARCH_X86_KVM_SVM_ONHYPERV_H__
+#define __ARCH_X86_KVM_SVM_ONHYPERV_H__
+
+#include <asm/mshyperv.h>
+
+#if IS_ENABLED(CONFIG_HYPERV)
+
+#include "kvm_onhyperv.h"
+#include "svm/hyperv.h"
+
+static struct kvm_x86_ops svm_x86_ops;
+
+int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu);
+
+static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu)
+{
+ struct hv_vmcb_enlightenments *hve = &to_svm(vcpu)->vmcb->control.hv_enlightenments;
+
+ return ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB &&
+ !!hve->hv_enlightenments_control.enlightened_npt_tlb;
+}
+
+static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
+{
+ struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
+
+ BUILD_BUG_ON(sizeof(vmcb->control.hv_enlightenments) !=
+ sizeof(vmcb->control.reserved_sw));
+
+ if (npt_enabled &&
+ ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB)
+ hve->hv_enlightenments_control.enlightened_npt_tlb = 1;
+
+ if (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)
+ hve->hv_enlightenments_control.msr_bitmap = 1;
+}
+
+static inline __init void svm_hv_hardware_setup(void)
+{
+ if (npt_enabled &&
+ ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) {
+ pr_info("kvm: Hyper-V enlightened NPT TLB flush enabled\n");
+ svm_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
+ svm_x86_ops.tlb_remote_flush_with_range =
+ hv_remote_flush_tlb_with_range;
+ }
+
+ if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) {
+ int cpu;
+
+ pr_info("kvm: Hyper-V Direct TLB Flush enabled\n");
+ for_each_online_cpu(cpu) {
+ struct hv_vp_assist_page *vp_ap =
+ hv_get_vp_assist_page(cpu);
+
+ if (!vp_ap)
+ continue;
+
+ vp_ap->nested_control.features.directhypercall = 1;
+ }
+ svm_x86_ops.enable_direct_tlbflush =
+ svm_hv_enable_direct_tlbflush;
+ }
+}
+
+static inline void svm_hv_vmcb_dirty_nested_enlightenments(
+ struct kvm_vcpu *vcpu)
+{
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
+ struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
+
+ if (hve->hv_enlightenments_control.msr_bitmap)
+ vmcb_mark_dirty(vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS);
+}
+
+static inline void svm_hv_update_vp_id(struct vmcb *vmcb, struct kvm_vcpu *vcpu)
+{
+ struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
+ u32 vp_index = kvm_hv_get_vpindex(vcpu);
+
+ if (hve->hv_vp_id != vp_index) {
+ hve->hv_vp_id = vp_index;
+ vmcb_mark_dirty(vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS);
+ }
+}
+#else
+
+static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+
+static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
+{
+}
+
+static inline __init void svm_hv_hardware_setup(void)
+{
+}
+
+static inline void svm_hv_vmcb_dirty_nested_enlightenments(
+ struct kvm_vcpu *vcpu)
+{
+}
+
+static inline void svm_hv_update_vp_id(struct vmcb *vmcb,
+ struct kvm_vcpu *vcpu)
+{
+}
+#endif /* CONFIG_HYPERV */
+
+#endif /* __ARCH_X86_KVM_SVM_ONHYPERV_H__ */
diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h
new file mode 100644
index 000000000..36c8af87a
--- /dev/null
+++ b/arch/x86/kvm/svm/svm_ops.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_SVM_OPS_H
+#define __KVM_X86_SVM_OPS_H
+
+#include <linux/compiler_types.h>
+
+#include "x86.h"
+
+#define svm_asm(insn, clobber...) \
+do { \
+ asm_volatile_goto("1: " __stringify(insn) "\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ ::: clobber : fault); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
+
+#define svm_asm1(insn, op1, clobber...) \
+do { \
+ asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ :: op1 : clobber : fault); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
+
+#define svm_asm2(insn, op1, op2, clobber...) \
+do { \
+ asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ :: op1, op2 : clobber : fault); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
+
+static inline void clgi(void)
+{
+ svm_asm(clgi);
+}
+
+static inline void stgi(void)
+{
+ svm_asm(stgi);
+}
+
+static inline void invlpga(unsigned long addr, u32 asid)
+{
+ svm_asm2(invlpga, "c"(asid), "a"(addr));
+}
+
+/*
+ * Despite being a physical address, the portion of rAX that is consumed by
+ * VMSAVE, VMLOAD, etc... is still controlled by the effective address size,
+ * hence 'unsigned long' instead of 'hpa_t'.
+ */
+static __always_inline void vmsave(unsigned long pa)
+{
+ svm_asm1(vmsave, "a" (pa), "memory");
+}
+
+#endif /* __KVM_X86_SVM_OPS_H */
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
new file mode 100644
index 000000000..5be9a63f0
--- /dev/null
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -0,0 +1,392 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm/bitsperlong.h>
+#include <asm/kvm_vcpu_regs.h>
+#include <asm/nospec-branch.h>
+#include "kvm-asm-offsets.h"
+
+#define WORD_SIZE (BITS_PER_LONG / 8)
+
+/* Intentionally omit RAX as it's context switched by hardware */
+#define VCPU_RCX (SVM_vcpu_arch_regs + __VCPU_REGS_RCX * WORD_SIZE)
+#define VCPU_RDX (SVM_vcpu_arch_regs + __VCPU_REGS_RDX * WORD_SIZE)
+#define VCPU_RBX (SVM_vcpu_arch_regs + __VCPU_REGS_RBX * WORD_SIZE)
+/* Intentionally omit RSP as it's context switched by hardware */
+#define VCPU_RBP (SVM_vcpu_arch_regs + __VCPU_REGS_RBP * WORD_SIZE)
+#define VCPU_RSI (SVM_vcpu_arch_regs + __VCPU_REGS_RSI * WORD_SIZE)
+#define VCPU_RDI (SVM_vcpu_arch_regs + __VCPU_REGS_RDI * WORD_SIZE)
+
+#ifdef CONFIG_X86_64
+#define VCPU_R8 (SVM_vcpu_arch_regs + __VCPU_REGS_R8 * WORD_SIZE)
+#define VCPU_R9 (SVM_vcpu_arch_regs + __VCPU_REGS_R9 * WORD_SIZE)
+#define VCPU_R10 (SVM_vcpu_arch_regs + __VCPU_REGS_R10 * WORD_SIZE)
+#define VCPU_R11 (SVM_vcpu_arch_regs + __VCPU_REGS_R11 * WORD_SIZE)
+#define VCPU_R12 (SVM_vcpu_arch_regs + __VCPU_REGS_R12 * WORD_SIZE)
+#define VCPU_R13 (SVM_vcpu_arch_regs + __VCPU_REGS_R13 * WORD_SIZE)
+#define VCPU_R14 (SVM_vcpu_arch_regs + __VCPU_REGS_R14 * WORD_SIZE)
+#define VCPU_R15 (SVM_vcpu_arch_regs + __VCPU_REGS_R15 * WORD_SIZE)
+#endif
+
+#define SVM_vmcb01_pa (SVM_vmcb01 + KVM_VMCB_pa)
+
+.section .noinstr.text, "ax"
+
+.macro RESTORE_GUEST_SPEC_CTRL
+ /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
+ ALTERNATIVE_2 "", \
+ "jmp 800f", X86_FEATURE_MSR_SPEC_CTRL, \
+ "", X86_FEATURE_V_SPEC_CTRL
+801:
+.endm
+.macro RESTORE_GUEST_SPEC_CTRL_BODY
+800:
+ /*
+ * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the
+ * host's, write the MSR. This is kept out-of-line so that the common
+ * case does not have to jump.
+ *
+ * IMPORTANT: To avoid RSB underflow attacks and any other nastiness,
+ * there must not be any returns or indirect branches between this code
+ * and vmentry.
+ */
+ movl SVM_spec_ctrl(%_ASM_DI), %eax
+ cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax
+ je 801b
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+ xor %edx, %edx
+ wrmsr
+ jmp 801b
+.endm
+
+.macro RESTORE_HOST_SPEC_CTRL
+ /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
+ ALTERNATIVE_2 "", \
+ "jmp 900f", X86_FEATURE_MSR_SPEC_CTRL, \
+ "", X86_FEATURE_V_SPEC_CTRL
+901:
+.endm
+.macro RESTORE_HOST_SPEC_CTRL_BODY
+900:
+ /* Same for after vmexit. */
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+
+ /*
+ * Load the value that the guest had written into MSR_IA32_SPEC_CTRL,
+ * if it was not intercepted during guest execution.
+ */
+ cmpb $0, (%_ASM_SP)
+ jnz 998f
+ rdmsr
+ movl %eax, SVM_spec_ctrl(%_ASM_DI)
+998:
+
+ /* Now restore the host value of the MSR if different from the guest's. */
+ movl PER_CPU_VAR(x86_spec_ctrl_current), %eax
+ cmp SVM_spec_ctrl(%_ASM_DI), %eax
+ je 901b
+ xor %edx, %edx
+ wrmsr
+ jmp 901b
+.endm
+
+
+/**
+ * __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode
+ * @svm: struct vcpu_svm *
+ * @spec_ctrl_intercepted: bool
+ */
+SYM_FUNC_START(__svm_vcpu_run)
+ push %_ASM_BP
+#ifdef CONFIG_X86_64
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+#else
+ push %edi
+ push %esi
+#endif
+ push %_ASM_BX
+
+ /*
+ * Save variables needed after vmexit on the stack, in inverse
+ * order compared to when they are needed.
+ */
+
+ /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */
+ push %_ASM_ARG2
+
+ /* Needed to restore access to percpu variables. */
+ __ASM_SIZE(push) PER_CPU_VAR(svm_data + SD_save_area_pa)
+
+ /* Finally save @svm. */
+ push %_ASM_ARG1
+
+.ifnc _ASM_ARG1, _ASM_DI
+ /*
+ * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX
+ * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL.
+ */
+ mov %_ASM_ARG1, %_ASM_DI
+.endif
+
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_GUEST_SPEC_CTRL
+
+ /*
+ * Use a single vmcb (vmcb01 because it's always valid) for
+ * context switching guest state via VMLOAD/VMSAVE, that way
+ * the state doesn't need to be copied between vmcb01 and
+ * vmcb02 when switching vmcbs for nested virtualization.
+ */
+ mov SVM_vmcb01_pa(%_ASM_DI), %_ASM_AX
+1: vmload %_ASM_AX
+2:
+
+ /* Get svm->current_vmcb->pa into RAX. */
+ mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX
+ mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX
+
+ /* Load guest registers. */
+ mov VCPU_RCX(%_ASM_DI), %_ASM_CX
+ mov VCPU_RDX(%_ASM_DI), %_ASM_DX
+ mov VCPU_RBX(%_ASM_DI), %_ASM_BX
+ mov VCPU_RBP(%_ASM_DI), %_ASM_BP
+ mov VCPU_RSI(%_ASM_DI), %_ASM_SI
+#ifdef CONFIG_X86_64
+ mov VCPU_R8 (%_ASM_DI), %r8
+ mov VCPU_R9 (%_ASM_DI), %r9
+ mov VCPU_R10(%_ASM_DI), %r10
+ mov VCPU_R11(%_ASM_DI), %r11
+ mov VCPU_R12(%_ASM_DI), %r12
+ mov VCPU_R13(%_ASM_DI), %r13
+ mov VCPU_R14(%_ASM_DI), %r14
+ mov VCPU_R15(%_ASM_DI), %r15
+#endif
+ mov VCPU_RDI(%_ASM_DI), %_ASM_DI
+
+ /* Enter guest mode */
+ sti
+
+3: vmrun %_ASM_AX
+4:
+ cli
+
+ /* Pop @svm to RAX while it's the only available register. */
+ pop %_ASM_AX
+
+ /* Save all guest registers. */
+ mov %_ASM_CX, VCPU_RCX(%_ASM_AX)
+ mov %_ASM_DX, VCPU_RDX(%_ASM_AX)
+ mov %_ASM_BX, VCPU_RBX(%_ASM_AX)
+ mov %_ASM_BP, VCPU_RBP(%_ASM_AX)
+ mov %_ASM_SI, VCPU_RSI(%_ASM_AX)
+ mov %_ASM_DI, VCPU_RDI(%_ASM_AX)
+#ifdef CONFIG_X86_64
+ mov %r8, VCPU_R8 (%_ASM_AX)
+ mov %r9, VCPU_R9 (%_ASM_AX)
+ mov %r10, VCPU_R10(%_ASM_AX)
+ mov %r11, VCPU_R11(%_ASM_AX)
+ mov %r12, VCPU_R12(%_ASM_AX)
+ mov %r13, VCPU_R13(%_ASM_AX)
+ mov %r14, VCPU_R14(%_ASM_AX)
+ mov %r15, VCPU_R15(%_ASM_AX)
+#endif
+
+ /* @svm can stay in RDI from now on. */
+ mov %_ASM_AX, %_ASM_DI
+
+ mov SVM_vmcb01_pa(%_ASM_DI), %_ASM_AX
+5: vmsave %_ASM_AX
+6:
+
+ /* Restores GSBASE among other things, allowing access to percpu data. */
+ pop %_ASM_AX
+7: vmload %_ASM_AX
+8:
+
+#ifdef CONFIG_RETPOLINE
+ /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
+ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
+#endif
+
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_HOST_SPEC_CTRL
+
+ /*
+ * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
+ * untrained as soon as we exit the VM and are back to the
+ * kernel. This should be done before re-enabling interrupts
+ * because interrupt handlers won't sanitize 'ret' if the return is
+ * from the kernel.
+ */
+ UNTRAIN_RET
+
+ /* SRSO */
+ ALTERNATIVE "", "call entry_ibpb", X86_FEATURE_IBPB_ON_VMEXIT
+
+ /*
+ * Clear all general purpose registers except RSP and RAX to prevent
+ * speculative use of the guest's values, even those that are reloaded
+ * via the stack. In theory, an L1 cache miss when restoring registers
+ * could lead to speculative execution with the guest's values.
+ * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially
+ * free. RSP and RAX are exempt as they are restored by hardware
+ * during VM-Exit.
+ */
+ xor %ecx, %ecx
+ xor %edx, %edx
+ xor %ebx, %ebx
+ xor %ebp, %ebp
+ xor %esi, %esi
+ xor %edi, %edi
+#ifdef CONFIG_X86_64
+ xor %r8d, %r8d
+ xor %r9d, %r9d
+ xor %r10d, %r10d
+ xor %r11d, %r11d
+ xor %r12d, %r12d
+ xor %r13d, %r13d
+ xor %r14d, %r14d
+ xor %r15d, %r15d
+#endif
+
+ /* "Pop" @spec_ctrl_intercepted. */
+ pop %_ASM_BX
+
+ pop %_ASM_BX
+
+#ifdef CONFIG_X86_64
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+#else
+ pop %esi
+ pop %edi
+#endif
+ pop %_ASM_BP
+ RET
+
+ RESTORE_GUEST_SPEC_CTRL_BODY
+ RESTORE_HOST_SPEC_CTRL_BODY
+
+10: cmpb $0, kvm_rebooting
+ jne 2b
+ ud2
+30: cmpb $0, kvm_rebooting
+ jne 4b
+ ud2
+50: cmpb $0, kvm_rebooting
+ jne 6b
+ ud2
+70: cmpb $0, kvm_rebooting
+ jne 8b
+ ud2
+
+ _ASM_EXTABLE(1b, 10b)
+ _ASM_EXTABLE(3b, 30b)
+ _ASM_EXTABLE(5b, 50b)
+ _ASM_EXTABLE(7b, 70b)
+
+SYM_FUNC_END(__svm_vcpu_run)
+
+/**
+ * __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode
+ * @svm: struct vcpu_svm *
+ * @spec_ctrl_intercepted: bool
+ */
+SYM_FUNC_START(__svm_sev_es_vcpu_run)
+ push %_ASM_BP
+#ifdef CONFIG_X86_64
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+#else
+ push %edi
+ push %esi
+#endif
+ push %_ASM_BX
+
+ /*
+ * Save variables needed after vmexit on the stack, in inverse
+ * order compared to when they are needed.
+ */
+
+ /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */
+ push %_ASM_ARG2
+
+ /* Save @svm. */
+ push %_ASM_ARG1
+
+.ifnc _ASM_ARG1, _ASM_DI
+ /*
+ * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX
+ * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL.
+ */
+ mov %_ASM_ARG1, %_ASM_DI
+.endif
+
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_GUEST_SPEC_CTRL
+
+ /* Get svm->current_vmcb->pa into RAX. */
+ mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX
+ mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX
+
+ /* Enter guest mode */
+ sti
+
+1: vmrun %_ASM_AX
+
+2: cli
+
+ /* Pop @svm to RDI, guest registers have been saved already. */
+ pop %_ASM_DI
+
+#ifdef CONFIG_RETPOLINE
+ /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
+ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
+#endif
+
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_HOST_SPEC_CTRL
+
+ /*
+ * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
+ * untrained as soon as we exit the VM and are back to the
+ * kernel. This should be done before re-enabling interrupts
+ * because interrupt handlers won't sanitize RET if the return is
+ * from the kernel.
+ */
+ UNTRAIN_RET
+
+ /* "Pop" @spec_ctrl_intercepted. */
+ pop %_ASM_BX
+
+ pop %_ASM_BX
+
+#ifdef CONFIG_X86_64
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+#else
+ pop %esi
+ pop %edi
+#endif
+ pop %_ASM_BP
+ RET
+
+ RESTORE_GUEST_SPEC_CTRL_BODY
+ RESTORE_HOST_SPEC_CTRL_BODY
+
+3: cmpb $0, kvm_rebooting
+ jne 2b
+ ud2
+
+ _ASM_EXTABLE(1b, 3b)
+
+SYM_FUNC_END(__svm_sev_es_vcpu_run)