summaryrefslogtreecommitdiffstats
path: root/arch/riscv/kvm
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-11 08:27:49 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-11 08:27:49 +0000
commitace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch)
treeb2d64bc10158fdd5497876388cd68142ca374ed3 /arch/riscv/kvm
parentInitial commit. (diff)
downloadlinux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz
linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip
Adding upstream version 6.6.15.upstream/6.6.15
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'arch/riscv/kvm')
-rw-r--r--arch/riscv/kvm/Kconfig40
-rw-r--r--arch/riscv/kvm/Makefile34
-rw-r--r--arch/riscv/kvm/aia.c658
-rw-r--r--arch/riscv/kvm/aia_aplic.c619
-rw-r--r--arch/riscv/kvm/aia_device.c673
-rw-r--r--arch/riscv/kvm/aia_imsic.c1097
-rw-r--r--arch/riscv/kvm/main.c138
-rw-r--r--arch/riscv/kvm/mmu.c793
-rw-r--r--arch/riscv/kvm/tlb.c405
-rw-r--r--arch/riscv/kvm/vcpu.c781
-rw-r--r--arch/riscv/kvm/vcpu_exit.c223
-rw-r--r--arch/riscv/kvm/vcpu_fp.c165
-rw-r--r--arch/riscv/kvm/vcpu_insn.c754
-rw-r--r--arch/riscv/kvm/vcpu_onereg.c1054
-rw-r--r--arch/riscv/kvm/vcpu_pmu.c633
-rw-r--r--arch/riscv/kvm/vcpu_sbi.c421
-rw-r--r--arch/riscv/kvm/vcpu_sbi_base.c98
-rw-r--r--arch/riscv/kvm/vcpu_sbi_hsm.c118
-rw-r--r--arch/riscv/kvm/vcpu_sbi_pmu.c86
-rw-r--r--arch/riscv/kvm/vcpu_sbi_replace.c177
-rw-r--r--arch/riscv/kvm/vcpu_sbi_v01.c114
-rw-r--r--arch/riscv/kvm/vcpu_switch.S408
-rw-r--r--arch/riscv/kvm/vcpu_timer.c363
-rw-r--r--arch/riscv/kvm/vcpu_vector.c184
-rw-r--r--arch/riscv/kvm/vm.c215
-rw-r--r--arch/riscv/kvm/vmid.c124
26 files changed, 10375 insertions, 0 deletions
diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig
new file mode 100644
index 000000000..dfc237d78
--- /dev/null
+++ b/arch/riscv/kvm/Kconfig
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# KVM configuration
+#
+
+source "virt/kvm/Kconfig"
+
+menuconfig VIRTUALIZATION
+ bool "Virtualization"
+ help
+ Say Y here to get to see options for using your Linux host to run
+ other operating systems inside virtual machines (guests).
+ This option alone does not add any kernel code.
+
+ If you say N, all options in this submenu will be skipped and
+ disabled.
+
+if VIRTUALIZATION
+
+config KVM
+ tristate "Kernel-based Virtual Machine (KVM) support (EXPERIMENTAL)"
+ depends on RISCV_SBI && MMU
+ select HAVE_KVM_EVENTFD
+ select HAVE_KVM_IRQCHIP
+ select HAVE_KVM_IRQFD
+ select HAVE_KVM_IRQ_ROUTING
+ select HAVE_KVM_MSI
+ select HAVE_KVM_VCPU_ASYNC_IOCTL
+ select KVM_GENERIC_DIRTYLOG_READ_PROTECT
+ select KVM_GENERIC_HARDWARE_ENABLING
+ select KVM_MMIO
+ select KVM_XFER_TO_GUEST_WORK
+ select MMU_NOTIFIER
+ select PREEMPT_NOTIFIERS
+ help
+ Support hosting virtualized guest machines.
+
+ If unsure, say N.
+
+endif # VIRTUALIZATION
diff --git a/arch/riscv/kvm/Makefile b/arch/riscv/kvm/Makefile
new file mode 100644
index 000000000..4c2067fc5
--- /dev/null
+++ b/arch/riscv/kvm/Makefile
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for RISC-V KVM support
+#
+
+ccflags-y += -I $(srctree)/$(src)
+
+include $(srctree)/virt/kvm/Makefile.kvm
+
+obj-$(CONFIG_KVM) += kvm.o
+
+kvm-y += main.o
+kvm-y += vm.o
+kvm-y += vmid.o
+kvm-y += tlb.o
+kvm-y += mmu.o
+kvm-y += vcpu.o
+kvm-y += vcpu_exit.o
+kvm-y += vcpu_fp.o
+kvm-y += vcpu_vector.o
+kvm-y += vcpu_insn.o
+kvm-y += vcpu_onereg.o
+kvm-y += vcpu_switch.o
+kvm-y += vcpu_sbi.o
+kvm-$(CONFIG_RISCV_SBI_V01) += vcpu_sbi_v01.o
+kvm-y += vcpu_sbi_base.o
+kvm-y += vcpu_sbi_replace.o
+kvm-y += vcpu_sbi_hsm.o
+kvm-y += vcpu_timer.o
+kvm-$(CONFIG_RISCV_PMU_SBI) += vcpu_pmu.o vcpu_sbi_pmu.o
+kvm-y += aia.o
+kvm-y += aia_device.o
+kvm-y += aia_aplic.o
+kvm-y += aia_imsic.o
diff --git a/arch/riscv/kvm/aia.c b/arch/riscv/kvm/aia.c
new file mode 100644
index 000000000..74bb27440
--- /dev/null
+++ b/arch/riscv/kvm/aia.c
@@ -0,0 +1,658 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ *
+ * Authors:
+ * Anup Patel <apatel@ventanamicro.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+#include <linux/kvm_host.h>
+#include <linux/percpu.h>
+#include <linux/spinlock.h>
+#include <asm/hwcap.h>
+#include <asm/kvm_aia_imsic.h>
+
+struct aia_hgei_control {
+ raw_spinlock_t lock;
+ unsigned long free_bitmap;
+ struct kvm_vcpu *owners[BITS_PER_LONG];
+};
+static DEFINE_PER_CPU(struct aia_hgei_control, aia_hgei);
+static int hgei_parent_irq;
+
+unsigned int kvm_riscv_aia_nr_hgei;
+unsigned int kvm_riscv_aia_max_ids;
+DEFINE_STATIC_KEY_FALSE(kvm_riscv_aia_available);
+
+static int aia_find_hgei(struct kvm_vcpu *owner)
+{
+ int i, hgei;
+ unsigned long flags;
+ struct aia_hgei_control *hgctrl = get_cpu_ptr(&aia_hgei);
+
+ raw_spin_lock_irqsave(&hgctrl->lock, flags);
+
+ hgei = -1;
+ for (i = 1; i <= kvm_riscv_aia_nr_hgei; i++) {
+ if (hgctrl->owners[i] == owner) {
+ hgei = i;
+ break;
+ }
+ }
+
+ raw_spin_unlock_irqrestore(&hgctrl->lock, flags);
+
+ put_cpu_ptr(&aia_hgei);
+ return hgei;
+}
+
+static void aia_set_hvictl(bool ext_irq_pending)
+{
+ unsigned long hvictl;
+
+ /*
+ * HVICTL.IID == 9 and HVICTL.IPRIO == 0 represents
+ * no interrupt in HVICTL.
+ */
+
+ hvictl = (IRQ_S_EXT << HVICTL_IID_SHIFT) & HVICTL_IID;
+ hvictl |= ext_irq_pending;
+ csr_write(CSR_HVICTL, hvictl);
+}
+
+#ifdef CONFIG_32BIT
+void kvm_riscv_vcpu_aia_flush_interrupts(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_aia_csr *csr = &vcpu->arch.aia_context.guest_csr;
+ unsigned long mask, val;
+
+ if (!kvm_riscv_aia_available())
+ return;
+
+ if (READ_ONCE(vcpu->arch.irqs_pending_mask[1])) {
+ mask = xchg_acquire(&vcpu->arch.irqs_pending_mask[1], 0);
+ val = READ_ONCE(vcpu->arch.irqs_pending[1]) & mask;
+
+ csr->hviph &= ~mask;
+ csr->hviph |= val;
+ }
+}
+
+void kvm_riscv_vcpu_aia_sync_interrupts(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_aia_csr *csr = &vcpu->arch.aia_context.guest_csr;
+
+ if (kvm_riscv_aia_available())
+ csr->vsieh = csr_read(CSR_VSIEH);
+}
+#endif
+
+bool kvm_riscv_vcpu_aia_has_interrupts(struct kvm_vcpu *vcpu, u64 mask)
+{
+ int hgei;
+ unsigned long seip;
+
+ if (!kvm_riscv_aia_available())
+ return false;
+
+#ifdef CONFIG_32BIT
+ if (READ_ONCE(vcpu->arch.irqs_pending[1]) &
+ (vcpu->arch.aia_context.guest_csr.vsieh & upper_32_bits(mask)))
+ return true;
+#endif
+
+ seip = vcpu->arch.guest_csr.vsie;
+ seip &= (unsigned long)mask;
+ seip &= BIT(IRQ_S_EXT);
+
+ if (!kvm_riscv_aia_initialized(vcpu->kvm) || !seip)
+ return false;
+
+ hgei = aia_find_hgei(vcpu);
+ if (hgei > 0)
+ return !!(csr_read(CSR_HGEIP) & BIT(hgei));
+
+ return false;
+}
+
+void kvm_riscv_vcpu_aia_update_hvip(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr;
+
+ if (!kvm_riscv_aia_available())
+ return;
+
+#ifdef CONFIG_32BIT
+ csr_write(CSR_HVIPH, vcpu->arch.aia_context.guest_csr.hviph);
+#endif
+ aia_set_hvictl(!!(csr->hvip & BIT(IRQ_VS_EXT)));
+}
+
+void kvm_riscv_vcpu_aia_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct kvm_vcpu_aia_csr *csr = &vcpu->arch.aia_context.guest_csr;
+
+ if (!kvm_riscv_aia_available())
+ return;
+
+ csr_write(CSR_VSISELECT, csr->vsiselect);
+ csr_write(CSR_HVIPRIO1, csr->hviprio1);
+ csr_write(CSR_HVIPRIO2, csr->hviprio2);
+#ifdef CONFIG_32BIT
+ csr_write(CSR_VSIEH, csr->vsieh);
+ csr_write(CSR_HVIPH, csr->hviph);
+ csr_write(CSR_HVIPRIO1H, csr->hviprio1h);
+ csr_write(CSR_HVIPRIO2H, csr->hviprio2h);
+#endif
+}
+
+void kvm_riscv_vcpu_aia_put(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_aia_csr *csr = &vcpu->arch.aia_context.guest_csr;
+
+ if (!kvm_riscv_aia_available())
+ return;
+
+ csr->vsiselect = csr_read(CSR_VSISELECT);
+ csr->hviprio1 = csr_read(CSR_HVIPRIO1);
+ csr->hviprio2 = csr_read(CSR_HVIPRIO2);
+#ifdef CONFIG_32BIT
+ csr->vsieh = csr_read(CSR_VSIEH);
+ csr->hviph = csr_read(CSR_HVIPH);
+ csr->hviprio1h = csr_read(CSR_HVIPRIO1H);
+ csr->hviprio2h = csr_read(CSR_HVIPRIO2H);
+#endif
+}
+
+int kvm_riscv_vcpu_aia_get_csr(struct kvm_vcpu *vcpu,
+ unsigned long reg_num,
+ unsigned long *out_val)
+{
+ struct kvm_vcpu_aia_csr *csr = &vcpu->arch.aia_context.guest_csr;
+
+ if (reg_num >= sizeof(struct kvm_riscv_aia_csr) / sizeof(unsigned long))
+ return -ENOENT;
+
+ *out_val = 0;
+ if (kvm_riscv_aia_available())
+ *out_val = ((unsigned long *)csr)[reg_num];
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_aia_set_csr(struct kvm_vcpu *vcpu,
+ unsigned long reg_num,
+ unsigned long val)
+{
+ struct kvm_vcpu_aia_csr *csr = &vcpu->arch.aia_context.guest_csr;
+
+ if (reg_num >= sizeof(struct kvm_riscv_aia_csr) / sizeof(unsigned long))
+ return -ENOENT;
+
+ if (kvm_riscv_aia_available()) {
+ ((unsigned long *)csr)[reg_num] = val;
+
+#ifdef CONFIG_32BIT
+ if (reg_num == KVM_REG_RISCV_CSR_AIA_REG(siph))
+ WRITE_ONCE(vcpu->arch.irqs_pending_mask[1], 0);
+#endif
+ }
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_aia_rmw_topei(struct kvm_vcpu *vcpu,
+ unsigned int csr_num,
+ unsigned long *val,
+ unsigned long new_val,
+ unsigned long wr_mask)
+{
+ /* If AIA not available then redirect trap */
+ if (!kvm_riscv_aia_available())
+ return KVM_INSN_ILLEGAL_TRAP;
+
+ /* If AIA not initialized then forward to user space */
+ if (!kvm_riscv_aia_initialized(vcpu->kvm))
+ return KVM_INSN_EXIT_TO_USER_SPACE;
+
+ return kvm_riscv_vcpu_aia_imsic_rmw(vcpu, KVM_RISCV_AIA_IMSIC_TOPEI,
+ val, new_val, wr_mask);
+}
+
+/*
+ * External IRQ priority always read-only zero. This means default
+ * priority order is always preferred for external IRQs unless
+ * HVICTL.IID == 9 and HVICTL.IPRIO != 0
+ */
+static int aia_irq2bitpos[] = {
+0, 8, -1, -1, 16, 24, -1, -1, /* 0 - 7 */
+32, -1, -1, -1, -1, 40, 48, 56, /* 8 - 15 */
+64, 72, 80, 88, 96, 104, 112, 120, /* 16 - 23 */
+-1, -1, -1, -1, -1, -1, -1, -1, /* 24 - 31 */
+-1, -1, -1, -1, -1, -1, -1, -1, /* 32 - 39 */
+-1, -1, -1, -1, -1, -1, -1, -1, /* 40 - 47 */
+-1, -1, -1, -1, -1, -1, -1, -1, /* 48 - 55 */
+-1, -1, -1, -1, -1, -1, -1, -1, /* 56 - 63 */
+};
+
+static u8 aia_get_iprio8(struct kvm_vcpu *vcpu, unsigned int irq)
+{
+ unsigned long hviprio;
+ int bitpos = aia_irq2bitpos[irq];
+
+ if (bitpos < 0)
+ return 0;
+
+ switch (bitpos / BITS_PER_LONG) {
+ case 0:
+ hviprio = csr_read(CSR_HVIPRIO1);
+ break;
+ case 1:
+#ifndef CONFIG_32BIT
+ hviprio = csr_read(CSR_HVIPRIO2);
+ break;
+#else
+ hviprio = csr_read(CSR_HVIPRIO1H);
+ break;
+ case 2:
+ hviprio = csr_read(CSR_HVIPRIO2);
+ break;
+ case 3:
+ hviprio = csr_read(CSR_HVIPRIO2H);
+ break;
+#endif
+ default:
+ return 0;
+ }
+
+ return (hviprio >> (bitpos % BITS_PER_LONG)) & TOPI_IPRIO_MASK;
+}
+
+static void aia_set_iprio8(struct kvm_vcpu *vcpu, unsigned int irq, u8 prio)
+{
+ unsigned long hviprio;
+ int bitpos = aia_irq2bitpos[irq];
+
+ if (bitpos < 0)
+ return;
+
+ switch (bitpos / BITS_PER_LONG) {
+ case 0:
+ hviprio = csr_read(CSR_HVIPRIO1);
+ break;
+ case 1:
+#ifndef CONFIG_32BIT
+ hviprio = csr_read(CSR_HVIPRIO2);
+ break;
+#else
+ hviprio = csr_read(CSR_HVIPRIO1H);
+ break;
+ case 2:
+ hviprio = csr_read(CSR_HVIPRIO2);
+ break;
+ case 3:
+ hviprio = csr_read(CSR_HVIPRIO2H);
+ break;
+#endif
+ default:
+ return;
+ }
+
+ hviprio &= ~(TOPI_IPRIO_MASK << (bitpos % BITS_PER_LONG));
+ hviprio |= (unsigned long)prio << (bitpos % BITS_PER_LONG);
+
+ switch (bitpos / BITS_PER_LONG) {
+ case 0:
+ csr_write(CSR_HVIPRIO1, hviprio);
+ break;
+ case 1:
+#ifndef CONFIG_32BIT
+ csr_write(CSR_HVIPRIO2, hviprio);
+ break;
+#else
+ csr_write(CSR_HVIPRIO1H, hviprio);
+ break;
+ case 2:
+ csr_write(CSR_HVIPRIO2, hviprio);
+ break;
+ case 3:
+ csr_write(CSR_HVIPRIO2H, hviprio);
+ break;
+#endif
+ default:
+ return;
+ }
+}
+
+static int aia_rmw_iprio(struct kvm_vcpu *vcpu, unsigned int isel,
+ unsigned long *val, unsigned long new_val,
+ unsigned long wr_mask)
+{
+ int i, first_irq, nirqs;
+ unsigned long old_val;
+ u8 prio;
+
+#ifndef CONFIG_32BIT
+ if (isel & 0x1)
+ return KVM_INSN_ILLEGAL_TRAP;
+#endif
+
+ nirqs = 4 * (BITS_PER_LONG / 32);
+ first_irq = (isel - ISELECT_IPRIO0) * 4;
+
+ old_val = 0;
+ for (i = 0; i < nirqs; i++) {
+ prio = aia_get_iprio8(vcpu, first_irq + i);
+ old_val |= (unsigned long)prio << (TOPI_IPRIO_BITS * i);
+ }
+
+ if (val)
+ *val = old_val;
+
+ if (wr_mask) {
+ new_val = (old_val & ~wr_mask) | (new_val & wr_mask);
+ for (i = 0; i < nirqs; i++) {
+ prio = (new_val >> (TOPI_IPRIO_BITS * i)) &
+ TOPI_IPRIO_MASK;
+ aia_set_iprio8(vcpu, first_irq + i, prio);
+ }
+ }
+
+ return KVM_INSN_CONTINUE_NEXT_SEPC;
+}
+
+int kvm_riscv_vcpu_aia_rmw_ireg(struct kvm_vcpu *vcpu, unsigned int csr_num,
+ unsigned long *val, unsigned long new_val,
+ unsigned long wr_mask)
+{
+ unsigned int isel;
+
+ /* If AIA not available then redirect trap */
+ if (!kvm_riscv_aia_available())
+ return KVM_INSN_ILLEGAL_TRAP;
+
+ /* First try to emulate in kernel space */
+ isel = csr_read(CSR_VSISELECT) & ISELECT_MASK;
+ if (isel >= ISELECT_IPRIO0 && isel <= ISELECT_IPRIO15)
+ return aia_rmw_iprio(vcpu, isel, val, new_val, wr_mask);
+ else if (isel >= IMSIC_FIRST && isel <= IMSIC_LAST &&
+ kvm_riscv_aia_initialized(vcpu->kvm))
+ return kvm_riscv_vcpu_aia_imsic_rmw(vcpu, isel, val, new_val,
+ wr_mask);
+
+ /* We can't handle it here so redirect to user space */
+ return KVM_INSN_EXIT_TO_USER_SPACE;
+}
+
+int kvm_riscv_aia_alloc_hgei(int cpu, struct kvm_vcpu *owner,
+ void __iomem **hgei_va, phys_addr_t *hgei_pa)
+{
+ int ret = -ENOENT;
+ unsigned long flags;
+ struct aia_hgei_control *hgctrl = per_cpu_ptr(&aia_hgei, cpu);
+
+ if (!kvm_riscv_aia_available() || !hgctrl)
+ return -ENODEV;
+
+ raw_spin_lock_irqsave(&hgctrl->lock, flags);
+
+ if (hgctrl->free_bitmap) {
+ ret = __ffs(hgctrl->free_bitmap);
+ hgctrl->free_bitmap &= ~BIT(ret);
+ hgctrl->owners[ret] = owner;
+ }
+
+ raw_spin_unlock_irqrestore(&hgctrl->lock, flags);
+
+ /* TODO: To be updated later by AIA IMSIC HW guest file support */
+ if (hgei_va)
+ *hgei_va = NULL;
+ if (hgei_pa)
+ *hgei_pa = 0;
+
+ return ret;
+}
+
+void kvm_riscv_aia_free_hgei(int cpu, int hgei)
+{
+ unsigned long flags;
+ struct aia_hgei_control *hgctrl = per_cpu_ptr(&aia_hgei, cpu);
+
+ if (!kvm_riscv_aia_available() || !hgctrl)
+ return;
+
+ raw_spin_lock_irqsave(&hgctrl->lock, flags);
+
+ if (hgei > 0 && hgei <= kvm_riscv_aia_nr_hgei) {
+ if (!(hgctrl->free_bitmap & BIT(hgei))) {
+ hgctrl->free_bitmap |= BIT(hgei);
+ hgctrl->owners[hgei] = NULL;
+ }
+ }
+
+ raw_spin_unlock_irqrestore(&hgctrl->lock, flags);
+}
+
+void kvm_riscv_aia_wakeon_hgei(struct kvm_vcpu *owner, bool enable)
+{
+ int hgei;
+
+ if (!kvm_riscv_aia_available())
+ return;
+
+ hgei = aia_find_hgei(owner);
+ if (hgei > 0) {
+ if (enable)
+ csr_set(CSR_HGEIE, BIT(hgei));
+ else
+ csr_clear(CSR_HGEIE, BIT(hgei));
+ }
+}
+
+static irqreturn_t hgei_interrupt(int irq, void *dev_id)
+{
+ int i;
+ unsigned long hgei_mask, flags;
+ struct aia_hgei_control *hgctrl = get_cpu_ptr(&aia_hgei);
+
+ hgei_mask = csr_read(CSR_HGEIP) & csr_read(CSR_HGEIE);
+ csr_clear(CSR_HGEIE, hgei_mask);
+
+ raw_spin_lock_irqsave(&hgctrl->lock, flags);
+
+ for_each_set_bit(i, &hgei_mask, BITS_PER_LONG) {
+ if (hgctrl->owners[i])
+ kvm_vcpu_kick(hgctrl->owners[i]);
+ }
+
+ raw_spin_unlock_irqrestore(&hgctrl->lock, flags);
+
+ put_cpu_ptr(&aia_hgei);
+ return IRQ_HANDLED;
+}
+
+static int aia_hgei_init(void)
+{
+ int cpu, rc;
+ struct irq_domain *domain;
+ struct aia_hgei_control *hgctrl;
+
+ /* Initialize per-CPU guest external interrupt line management */
+ for_each_possible_cpu(cpu) {
+ hgctrl = per_cpu_ptr(&aia_hgei, cpu);
+ raw_spin_lock_init(&hgctrl->lock);
+ if (kvm_riscv_aia_nr_hgei) {
+ hgctrl->free_bitmap =
+ BIT(kvm_riscv_aia_nr_hgei + 1) - 1;
+ hgctrl->free_bitmap &= ~BIT(0);
+ } else
+ hgctrl->free_bitmap = 0;
+ }
+
+ /* Find INTC irq domain */
+ domain = irq_find_matching_fwnode(riscv_get_intc_hwnode(),
+ DOMAIN_BUS_ANY);
+ if (!domain) {
+ kvm_err("unable to find INTC domain\n");
+ return -ENOENT;
+ }
+
+ /* Map per-CPU SGEI interrupt from INTC domain */
+ hgei_parent_irq = irq_create_mapping(domain, IRQ_S_GEXT);
+ if (!hgei_parent_irq) {
+ kvm_err("unable to map SGEI IRQ\n");
+ return -ENOMEM;
+ }
+
+ /* Request per-CPU SGEI interrupt */
+ rc = request_percpu_irq(hgei_parent_irq, hgei_interrupt,
+ "riscv-kvm", &aia_hgei);
+ if (rc) {
+ kvm_err("failed to request SGEI IRQ\n");
+ return rc;
+ }
+
+ return 0;
+}
+
+static void aia_hgei_exit(void)
+{
+ /* Free per-CPU SGEI interrupt */
+ free_percpu_irq(hgei_parent_irq, &aia_hgei);
+}
+
+void kvm_riscv_aia_enable(void)
+{
+ if (!kvm_riscv_aia_available())
+ return;
+
+ aia_set_hvictl(false);
+ csr_write(CSR_HVIPRIO1, 0x0);
+ csr_write(CSR_HVIPRIO2, 0x0);
+#ifdef CONFIG_32BIT
+ csr_write(CSR_HVIPH, 0x0);
+ csr_write(CSR_HIDELEGH, 0x0);
+ csr_write(CSR_HVIPRIO1H, 0x0);
+ csr_write(CSR_HVIPRIO2H, 0x0);
+#endif
+
+ /* Enable per-CPU SGEI interrupt */
+ enable_percpu_irq(hgei_parent_irq,
+ irq_get_trigger_type(hgei_parent_irq));
+ csr_set(CSR_HIE, BIT(IRQ_S_GEXT));
+}
+
+void kvm_riscv_aia_disable(void)
+{
+ int i;
+ unsigned long flags;
+ struct kvm_vcpu *vcpu;
+ struct aia_hgei_control *hgctrl;
+
+ if (!kvm_riscv_aia_available())
+ return;
+ hgctrl = get_cpu_ptr(&aia_hgei);
+
+ /* Disable per-CPU SGEI interrupt */
+ csr_clear(CSR_HIE, BIT(IRQ_S_GEXT));
+ disable_percpu_irq(hgei_parent_irq);
+
+ aia_set_hvictl(false);
+
+ raw_spin_lock_irqsave(&hgctrl->lock, flags);
+
+ for (i = 0; i <= kvm_riscv_aia_nr_hgei; i++) {
+ vcpu = hgctrl->owners[i];
+ if (!vcpu)
+ continue;
+
+ /*
+ * We release hgctrl->lock before notifying IMSIC
+ * so that we don't have lock ordering issues.
+ */
+ raw_spin_unlock_irqrestore(&hgctrl->lock, flags);
+
+ /* Notify IMSIC */
+ kvm_riscv_vcpu_aia_imsic_release(vcpu);
+
+ /*
+ * Wakeup VCPU if it was blocked so that it can
+ * run on other HARTs
+ */
+ if (csr_read(CSR_HGEIE) & BIT(i)) {
+ csr_clear(CSR_HGEIE, BIT(i));
+ kvm_vcpu_kick(vcpu);
+ }
+
+ raw_spin_lock_irqsave(&hgctrl->lock, flags);
+ }
+
+ raw_spin_unlock_irqrestore(&hgctrl->lock, flags);
+
+ put_cpu_ptr(&aia_hgei);
+}
+
+int kvm_riscv_aia_init(void)
+{
+ int rc;
+
+ if (!riscv_isa_extension_available(NULL, SxAIA))
+ return -ENODEV;
+
+ /* Figure-out number of bits in HGEIE */
+ csr_write(CSR_HGEIE, -1UL);
+ kvm_riscv_aia_nr_hgei = fls_long(csr_read(CSR_HGEIE));
+ csr_write(CSR_HGEIE, 0);
+ if (kvm_riscv_aia_nr_hgei)
+ kvm_riscv_aia_nr_hgei--;
+
+ /*
+ * Number of usable HGEI lines should be minimum of per-HART
+ * IMSIC guest files and number of bits in HGEIE
+ *
+ * TODO: To be updated later by AIA IMSIC HW guest file support
+ */
+ kvm_riscv_aia_nr_hgei = 0;
+
+ /*
+ * Find number of guest MSI IDs
+ *
+ * TODO: To be updated later by AIA IMSIC HW guest file support
+ */
+ kvm_riscv_aia_max_ids = IMSIC_MAX_ID;
+
+ /* Initialize guest external interrupt line management */
+ rc = aia_hgei_init();
+ if (rc)
+ return rc;
+
+ /* Register device operations */
+ rc = kvm_register_device_ops(&kvm_riscv_aia_device_ops,
+ KVM_DEV_TYPE_RISCV_AIA);
+ if (rc) {
+ aia_hgei_exit();
+ return rc;
+ }
+
+ /* Enable KVM AIA support */
+ static_branch_enable(&kvm_riscv_aia_available);
+
+ return 0;
+}
+
+void kvm_riscv_aia_exit(void)
+{
+ if (!kvm_riscv_aia_available())
+ return;
+
+ /* Unregister device operations */
+ kvm_unregister_device_ops(KVM_DEV_TYPE_RISCV_AIA);
+
+ /* Cleanup the HGEI state */
+ aia_hgei_exit();
+}
diff --git a/arch/riscv/kvm/aia_aplic.c b/arch/riscv/kvm/aia_aplic.c
new file mode 100644
index 000000000..39e72aa01
--- /dev/null
+++ b/arch/riscv/kvm/aia_aplic.c
@@ -0,0 +1,619 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ *
+ * Authors:
+ * Anup Patel <apatel@ventanamicro.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/math.h>
+#include <linux/spinlock.h>
+#include <linux/swab.h>
+#include <kvm/iodev.h>
+#include <asm/kvm_aia_aplic.h>
+
+struct aplic_irq {
+ raw_spinlock_t lock;
+ u32 sourcecfg;
+ u32 state;
+#define APLIC_IRQ_STATE_PENDING BIT(0)
+#define APLIC_IRQ_STATE_ENABLED BIT(1)
+#define APLIC_IRQ_STATE_ENPEND (APLIC_IRQ_STATE_PENDING | \
+ APLIC_IRQ_STATE_ENABLED)
+#define APLIC_IRQ_STATE_INPUT BIT(8)
+ u32 target;
+};
+
+struct aplic {
+ struct kvm_io_device iodev;
+
+ u32 domaincfg;
+ u32 genmsi;
+
+ u32 nr_irqs;
+ u32 nr_words;
+ struct aplic_irq *irqs;
+};
+
+static u32 aplic_read_sourcecfg(struct aplic *aplic, u32 irq)
+{
+ u32 ret;
+ unsigned long flags;
+ struct aplic_irq *irqd;
+
+ if (!irq || aplic->nr_irqs <= irq)
+ return 0;
+ irqd = &aplic->irqs[irq];
+
+ raw_spin_lock_irqsave(&irqd->lock, flags);
+ ret = irqd->sourcecfg;
+ raw_spin_unlock_irqrestore(&irqd->lock, flags);
+
+ return ret;
+}
+
+static void aplic_write_sourcecfg(struct aplic *aplic, u32 irq, u32 val)
+{
+ unsigned long flags;
+ struct aplic_irq *irqd;
+
+ if (!irq || aplic->nr_irqs <= irq)
+ return;
+ irqd = &aplic->irqs[irq];
+
+ if (val & APLIC_SOURCECFG_D)
+ val = 0;
+ else
+ val &= APLIC_SOURCECFG_SM_MASK;
+
+ raw_spin_lock_irqsave(&irqd->lock, flags);
+ irqd->sourcecfg = val;
+ raw_spin_unlock_irqrestore(&irqd->lock, flags);
+}
+
+static u32 aplic_read_target(struct aplic *aplic, u32 irq)
+{
+ u32 ret;
+ unsigned long flags;
+ struct aplic_irq *irqd;
+
+ if (!irq || aplic->nr_irqs <= irq)
+ return 0;
+ irqd = &aplic->irqs[irq];
+
+ raw_spin_lock_irqsave(&irqd->lock, flags);
+ ret = irqd->target;
+ raw_spin_unlock_irqrestore(&irqd->lock, flags);
+
+ return ret;
+}
+
+static void aplic_write_target(struct aplic *aplic, u32 irq, u32 val)
+{
+ unsigned long flags;
+ struct aplic_irq *irqd;
+
+ if (!irq || aplic->nr_irqs <= irq)
+ return;
+ irqd = &aplic->irqs[irq];
+
+ val &= APLIC_TARGET_EIID_MASK |
+ (APLIC_TARGET_HART_IDX_MASK << APLIC_TARGET_HART_IDX_SHIFT) |
+ (APLIC_TARGET_GUEST_IDX_MASK << APLIC_TARGET_GUEST_IDX_SHIFT);
+
+ raw_spin_lock_irqsave(&irqd->lock, flags);
+ irqd->target = val;
+ raw_spin_unlock_irqrestore(&irqd->lock, flags);
+}
+
+static bool aplic_read_pending(struct aplic *aplic, u32 irq)
+{
+ bool ret;
+ unsigned long flags;
+ struct aplic_irq *irqd;
+
+ if (!irq || aplic->nr_irqs <= irq)
+ return false;
+ irqd = &aplic->irqs[irq];
+
+ raw_spin_lock_irqsave(&irqd->lock, flags);
+ ret = (irqd->state & APLIC_IRQ_STATE_PENDING) ? true : false;
+ raw_spin_unlock_irqrestore(&irqd->lock, flags);
+
+ return ret;
+}
+
+static void aplic_write_pending(struct aplic *aplic, u32 irq, bool pending)
+{
+ unsigned long flags, sm;
+ struct aplic_irq *irqd;
+
+ if (!irq || aplic->nr_irqs <= irq)
+ return;
+ irqd = &aplic->irqs[irq];
+
+ raw_spin_lock_irqsave(&irqd->lock, flags);
+
+ sm = irqd->sourcecfg & APLIC_SOURCECFG_SM_MASK;
+ if (!pending &&
+ ((sm == APLIC_SOURCECFG_SM_LEVEL_HIGH) ||
+ (sm == APLIC_SOURCECFG_SM_LEVEL_LOW)))
+ goto skip_write_pending;
+
+ if (pending)
+ irqd->state |= APLIC_IRQ_STATE_PENDING;
+ else
+ irqd->state &= ~APLIC_IRQ_STATE_PENDING;
+
+skip_write_pending:
+ raw_spin_unlock_irqrestore(&irqd->lock, flags);
+}
+
+static bool aplic_read_enabled(struct aplic *aplic, u32 irq)
+{
+ bool ret;
+ unsigned long flags;
+ struct aplic_irq *irqd;
+
+ if (!irq || aplic->nr_irqs <= irq)
+ return false;
+ irqd = &aplic->irqs[irq];
+
+ raw_spin_lock_irqsave(&irqd->lock, flags);
+ ret = (irqd->state & APLIC_IRQ_STATE_ENABLED) ? true : false;
+ raw_spin_unlock_irqrestore(&irqd->lock, flags);
+
+ return ret;
+}
+
+static void aplic_write_enabled(struct aplic *aplic, u32 irq, bool enabled)
+{
+ unsigned long flags;
+ struct aplic_irq *irqd;
+
+ if (!irq || aplic->nr_irqs <= irq)
+ return;
+ irqd = &aplic->irqs[irq];
+
+ raw_spin_lock_irqsave(&irqd->lock, flags);
+ if (enabled)
+ irqd->state |= APLIC_IRQ_STATE_ENABLED;
+ else
+ irqd->state &= ~APLIC_IRQ_STATE_ENABLED;
+ raw_spin_unlock_irqrestore(&irqd->lock, flags);
+}
+
+static bool aplic_read_input(struct aplic *aplic, u32 irq)
+{
+ bool ret;
+ unsigned long flags;
+ struct aplic_irq *irqd;
+
+ if (!irq || aplic->nr_irqs <= irq)
+ return false;
+ irqd = &aplic->irqs[irq];
+
+ raw_spin_lock_irqsave(&irqd->lock, flags);
+ ret = (irqd->state & APLIC_IRQ_STATE_INPUT) ? true : false;
+ raw_spin_unlock_irqrestore(&irqd->lock, flags);
+
+ return ret;
+}
+
+static void aplic_inject_msi(struct kvm *kvm, u32 irq, u32 target)
+{
+ u32 hart_idx, guest_idx, eiid;
+
+ hart_idx = target >> APLIC_TARGET_HART_IDX_SHIFT;
+ hart_idx &= APLIC_TARGET_HART_IDX_MASK;
+ guest_idx = target >> APLIC_TARGET_GUEST_IDX_SHIFT;
+ guest_idx &= APLIC_TARGET_GUEST_IDX_MASK;
+ eiid = target & APLIC_TARGET_EIID_MASK;
+ kvm_riscv_aia_inject_msi_by_id(kvm, hart_idx, guest_idx, eiid);
+}
+
+static void aplic_update_irq_range(struct kvm *kvm, u32 first, u32 last)
+{
+ bool inject;
+ u32 irq, target;
+ unsigned long flags;
+ struct aplic_irq *irqd;
+ struct aplic *aplic = kvm->arch.aia.aplic_state;
+
+ if (!(aplic->domaincfg & APLIC_DOMAINCFG_IE))
+ return;
+
+ for (irq = first; irq <= last; irq++) {
+ if (!irq || aplic->nr_irqs <= irq)
+ continue;
+ irqd = &aplic->irqs[irq];
+
+ raw_spin_lock_irqsave(&irqd->lock, flags);
+
+ inject = false;
+ target = irqd->target;
+ if ((irqd->state & APLIC_IRQ_STATE_ENPEND) ==
+ APLIC_IRQ_STATE_ENPEND) {
+ irqd->state &= ~APLIC_IRQ_STATE_PENDING;
+ inject = true;
+ }
+
+ raw_spin_unlock_irqrestore(&irqd->lock, flags);
+
+ if (inject)
+ aplic_inject_msi(kvm, irq, target);
+ }
+}
+
+int kvm_riscv_aia_aplic_inject(struct kvm *kvm, u32 source, bool level)
+{
+ u32 target;
+ bool inject = false, ie;
+ unsigned long flags;
+ struct aplic_irq *irqd;
+ struct aplic *aplic = kvm->arch.aia.aplic_state;
+
+ if (!aplic || !source || (aplic->nr_irqs <= source))
+ return -ENODEV;
+ irqd = &aplic->irqs[source];
+ ie = (aplic->domaincfg & APLIC_DOMAINCFG_IE) ? true : false;
+
+ raw_spin_lock_irqsave(&irqd->lock, flags);
+
+ if (irqd->sourcecfg & APLIC_SOURCECFG_D)
+ goto skip_unlock;
+
+ switch (irqd->sourcecfg & APLIC_SOURCECFG_SM_MASK) {
+ case APLIC_SOURCECFG_SM_EDGE_RISE:
+ if (level && !(irqd->state & APLIC_IRQ_STATE_INPUT) &&
+ !(irqd->state & APLIC_IRQ_STATE_PENDING))
+ irqd->state |= APLIC_IRQ_STATE_PENDING;
+ break;
+ case APLIC_SOURCECFG_SM_EDGE_FALL:
+ if (!level && (irqd->state & APLIC_IRQ_STATE_INPUT) &&
+ !(irqd->state & APLIC_IRQ_STATE_PENDING))
+ irqd->state |= APLIC_IRQ_STATE_PENDING;
+ break;
+ case APLIC_SOURCECFG_SM_LEVEL_HIGH:
+ if (level && !(irqd->state & APLIC_IRQ_STATE_PENDING))
+ irqd->state |= APLIC_IRQ_STATE_PENDING;
+ break;
+ case APLIC_SOURCECFG_SM_LEVEL_LOW:
+ if (!level && !(irqd->state & APLIC_IRQ_STATE_PENDING))
+ irqd->state |= APLIC_IRQ_STATE_PENDING;
+ break;
+ }
+
+ if (level)
+ irqd->state |= APLIC_IRQ_STATE_INPUT;
+ else
+ irqd->state &= ~APLIC_IRQ_STATE_INPUT;
+
+ target = irqd->target;
+ if (ie && ((irqd->state & APLIC_IRQ_STATE_ENPEND) ==
+ APLIC_IRQ_STATE_ENPEND)) {
+ irqd->state &= ~APLIC_IRQ_STATE_PENDING;
+ inject = true;
+ }
+
+skip_unlock:
+ raw_spin_unlock_irqrestore(&irqd->lock, flags);
+
+ if (inject)
+ aplic_inject_msi(kvm, source, target);
+
+ return 0;
+}
+
+static u32 aplic_read_input_word(struct aplic *aplic, u32 word)
+{
+ u32 i, ret = 0;
+
+ for (i = 0; i < 32; i++)
+ ret |= aplic_read_input(aplic, word * 32 + i) ? BIT(i) : 0;
+
+ return ret;
+}
+
+static u32 aplic_read_pending_word(struct aplic *aplic, u32 word)
+{
+ u32 i, ret = 0;
+
+ for (i = 0; i < 32; i++)
+ ret |= aplic_read_pending(aplic, word * 32 + i) ? BIT(i) : 0;
+
+ return ret;
+}
+
+static void aplic_write_pending_word(struct aplic *aplic, u32 word,
+ u32 val, bool pending)
+{
+ u32 i;
+
+ for (i = 0; i < 32; i++) {
+ if (val & BIT(i))
+ aplic_write_pending(aplic, word * 32 + i, pending);
+ }
+}
+
+static u32 aplic_read_enabled_word(struct aplic *aplic, u32 word)
+{
+ u32 i, ret = 0;
+
+ for (i = 0; i < 32; i++)
+ ret |= aplic_read_enabled(aplic, word * 32 + i) ? BIT(i) : 0;
+
+ return ret;
+}
+
+static void aplic_write_enabled_word(struct aplic *aplic, u32 word,
+ u32 val, bool enabled)
+{
+ u32 i;
+
+ for (i = 0; i < 32; i++) {
+ if (val & BIT(i))
+ aplic_write_enabled(aplic, word * 32 + i, enabled);
+ }
+}
+
+static int aplic_mmio_read_offset(struct kvm *kvm, gpa_t off, u32 *val32)
+{
+ u32 i;
+ struct aplic *aplic = kvm->arch.aia.aplic_state;
+
+ if ((off & 0x3) != 0)
+ return -EOPNOTSUPP;
+
+ if (off == APLIC_DOMAINCFG) {
+ *val32 = APLIC_DOMAINCFG_RDONLY |
+ aplic->domaincfg | APLIC_DOMAINCFG_DM;
+ } else if ((off >= APLIC_SOURCECFG_BASE) &&
+ (off < (APLIC_SOURCECFG_BASE + (aplic->nr_irqs - 1) * 4))) {
+ i = ((off - APLIC_SOURCECFG_BASE) >> 2) + 1;
+ *val32 = aplic_read_sourcecfg(aplic, i);
+ } else if ((off >= APLIC_SETIP_BASE) &&
+ (off < (APLIC_SETIP_BASE + aplic->nr_words * 4))) {
+ i = (off - APLIC_SETIP_BASE) >> 2;
+ *val32 = aplic_read_pending_word(aplic, i);
+ } else if (off == APLIC_SETIPNUM) {
+ *val32 = 0;
+ } else if ((off >= APLIC_CLRIP_BASE) &&
+ (off < (APLIC_CLRIP_BASE + aplic->nr_words * 4))) {
+ i = (off - APLIC_CLRIP_BASE) >> 2;
+ *val32 = aplic_read_input_word(aplic, i);
+ } else if (off == APLIC_CLRIPNUM) {
+ *val32 = 0;
+ } else if ((off >= APLIC_SETIE_BASE) &&
+ (off < (APLIC_SETIE_BASE + aplic->nr_words * 4))) {
+ i = (off - APLIC_SETIE_BASE) >> 2;
+ *val32 = aplic_read_enabled_word(aplic, i);
+ } else if (off == APLIC_SETIENUM) {
+ *val32 = 0;
+ } else if ((off >= APLIC_CLRIE_BASE) &&
+ (off < (APLIC_CLRIE_BASE + aplic->nr_words * 4))) {
+ *val32 = 0;
+ } else if (off == APLIC_CLRIENUM) {
+ *val32 = 0;
+ } else if (off == APLIC_SETIPNUM_LE) {
+ *val32 = 0;
+ } else if (off == APLIC_SETIPNUM_BE) {
+ *val32 = 0;
+ } else if (off == APLIC_GENMSI) {
+ *val32 = aplic->genmsi;
+ } else if ((off >= APLIC_TARGET_BASE) &&
+ (off < (APLIC_TARGET_BASE + (aplic->nr_irqs - 1) * 4))) {
+ i = ((off - APLIC_TARGET_BASE) >> 2) + 1;
+ *val32 = aplic_read_target(aplic, i);
+ } else
+ return -ENODEV;
+
+ return 0;
+}
+
+static int aplic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, void *val)
+{
+ if (len != 4)
+ return -EOPNOTSUPP;
+
+ return aplic_mmio_read_offset(vcpu->kvm,
+ addr - vcpu->kvm->arch.aia.aplic_addr,
+ val);
+}
+
+static int aplic_mmio_write_offset(struct kvm *kvm, gpa_t off, u32 val32)
+{
+ u32 i;
+ struct aplic *aplic = kvm->arch.aia.aplic_state;
+
+ if ((off & 0x3) != 0)
+ return -EOPNOTSUPP;
+
+ if (off == APLIC_DOMAINCFG) {
+ /* Only IE bit writeable */
+ aplic->domaincfg = val32 & APLIC_DOMAINCFG_IE;
+ } else if ((off >= APLIC_SOURCECFG_BASE) &&
+ (off < (APLIC_SOURCECFG_BASE + (aplic->nr_irqs - 1) * 4))) {
+ i = ((off - APLIC_SOURCECFG_BASE) >> 2) + 1;
+ aplic_write_sourcecfg(aplic, i, val32);
+ } else if ((off >= APLIC_SETIP_BASE) &&
+ (off < (APLIC_SETIP_BASE + aplic->nr_words * 4))) {
+ i = (off - APLIC_SETIP_BASE) >> 2;
+ aplic_write_pending_word(aplic, i, val32, true);
+ } else if (off == APLIC_SETIPNUM) {
+ aplic_write_pending(aplic, val32, true);
+ } else if ((off >= APLIC_CLRIP_BASE) &&
+ (off < (APLIC_CLRIP_BASE + aplic->nr_words * 4))) {
+ i = (off - APLIC_CLRIP_BASE) >> 2;
+ aplic_write_pending_word(aplic, i, val32, false);
+ } else if (off == APLIC_CLRIPNUM) {
+ aplic_write_pending(aplic, val32, false);
+ } else if ((off >= APLIC_SETIE_BASE) &&
+ (off < (APLIC_SETIE_BASE + aplic->nr_words * 4))) {
+ i = (off - APLIC_SETIE_BASE) >> 2;
+ aplic_write_enabled_word(aplic, i, val32, true);
+ } else if (off == APLIC_SETIENUM) {
+ aplic_write_enabled(aplic, val32, true);
+ } else if ((off >= APLIC_CLRIE_BASE) &&
+ (off < (APLIC_CLRIE_BASE + aplic->nr_words * 4))) {
+ i = (off - APLIC_CLRIE_BASE) >> 2;
+ aplic_write_enabled_word(aplic, i, val32, false);
+ } else if (off == APLIC_CLRIENUM) {
+ aplic_write_enabled(aplic, val32, false);
+ } else if (off == APLIC_SETIPNUM_LE) {
+ aplic_write_pending(aplic, val32, true);
+ } else if (off == APLIC_SETIPNUM_BE) {
+ aplic_write_pending(aplic, __swab32(val32), true);
+ } else if (off == APLIC_GENMSI) {
+ aplic->genmsi = val32 & ~(APLIC_TARGET_GUEST_IDX_MASK <<
+ APLIC_TARGET_GUEST_IDX_SHIFT);
+ kvm_riscv_aia_inject_msi_by_id(kvm,
+ val32 >> APLIC_TARGET_HART_IDX_SHIFT, 0,
+ val32 & APLIC_TARGET_EIID_MASK);
+ } else if ((off >= APLIC_TARGET_BASE) &&
+ (off < (APLIC_TARGET_BASE + (aplic->nr_irqs - 1) * 4))) {
+ i = ((off - APLIC_TARGET_BASE) >> 2) + 1;
+ aplic_write_target(aplic, i, val32);
+ } else
+ return -ENODEV;
+
+ aplic_update_irq_range(kvm, 1, aplic->nr_irqs - 1);
+
+ return 0;
+}
+
+static int aplic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, const void *val)
+{
+ if (len != 4)
+ return -EOPNOTSUPP;
+
+ return aplic_mmio_write_offset(vcpu->kvm,
+ addr - vcpu->kvm->arch.aia.aplic_addr,
+ *((const u32 *)val));
+}
+
+static struct kvm_io_device_ops aplic_iodoev_ops = {
+ .read = aplic_mmio_read,
+ .write = aplic_mmio_write,
+};
+
+int kvm_riscv_aia_aplic_set_attr(struct kvm *kvm, unsigned long type, u32 v)
+{
+ int rc;
+
+ if (!kvm->arch.aia.aplic_state)
+ return -ENODEV;
+
+ rc = aplic_mmio_write_offset(kvm, type, v);
+ if (rc)
+ return rc;
+
+ return 0;
+}
+
+int kvm_riscv_aia_aplic_get_attr(struct kvm *kvm, unsigned long type, u32 *v)
+{
+ int rc;
+
+ if (!kvm->arch.aia.aplic_state)
+ return -ENODEV;
+
+ rc = aplic_mmio_read_offset(kvm, type, v);
+ if (rc)
+ return rc;
+
+ return 0;
+}
+
+int kvm_riscv_aia_aplic_has_attr(struct kvm *kvm, unsigned long type)
+{
+ int rc;
+ u32 val;
+
+ if (!kvm->arch.aia.aplic_state)
+ return -ENODEV;
+
+ rc = aplic_mmio_read_offset(kvm, type, &val);
+ if (rc)
+ return rc;
+
+ return 0;
+}
+
+int kvm_riscv_aia_aplic_init(struct kvm *kvm)
+{
+ int i, ret = 0;
+ struct aplic *aplic;
+
+ /* Do nothing if we have zero sources */
+ if (!kvm->arch.aia.nr_sources)
+ return 0;
+
+ /* Allocate APLIC global state */
+ aplic = kzalloc(sizeof(*aplic), GFP_KERNEL);
+ if (!aplic)
+ return -ENOMEM;
+ kvm->arch.aia.aplic_state = aplic;
+
+ /* Setup APLIC IRQs */
+ aplic->nr_irqs = kvm->arch.aia.nr_sources + 1;
+ aplic->nr_words = DIV_ROUND_UP(aplic->nr_irqs, 32);
+ aplic->irqs = kcalloc(aplic->nr_irqs,
+ sizeof(*aplic->irqs), GFP_KERNEL);
+ if (!aplic->irqs) {
+ ret = -ENOMEM;
+ goto fail_free_aplic;
+ }
+ for (i = 0; i < aplic->nr_irqs; i++)
+ raw_spin_lock_init(&aplic->irqs[i].lock);
+
+ /* Setup IO device */
+ kvm_iodevice_init(&aplic->iodev, &aplic_iodoev_ops);
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
+ kvm->arch.aia.aplic_addr,
+ KVM_DEV_RISCV_APLIC_SIZE,
+ &aplic->iodev);
+ mutex_unlock(&kvm->slots_lock);
+ if (ret)
+ goto fail_free_aplic_irqs;
+
+ /* Setup default IRQ routing */
+ ret = kvm_riscv_setup_default_irq_routing(kvm, aplic->nr_irqs);
+ if (ret)
+ goto fail_unreg_iodev;
+
+ return 0;
+
+fail_unreg_iodev:
+ mutex_lock(&kvm->slots_lock);
+ kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &aplic->iodev);
+ mutex_unlock(&kvm->slots_lock);
+fail_free_aplic_irqs:
+ kfree(aplic->irqs);
+fail_free_aplic:
+ kvm->arch.aia.aplic_state = NULL;
+ kfree(aplic);
+ return ret;
+}
+
+void kvm_riscv_aia_aplic_cleanup(struct kvm *kvm)
+{
+ struct aplic *aplic = kvm->arch.aia.aplic_state;
+
+ if (!aplic)
+ return;
+
+ mutex_lock(&kvm->slots_lock);
+ kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &aplic->iodev);
+ mutex_unlock(&kvm->slots_lock);
+
+ kfree(aplic->irqs);
+
+ kvm->arch.aia.aplic_state = NULL;
+ kfree(aplic);
+}
diff --git a/arch/riscv/kvm/aia_device.c b/arch/riscv/kvm/aia_device.c
new file mode 100644
index 000000000..0eb689351
--- /dev/null
+++ b/arch/riscv/kvm/aia_device.c
@@ -0,0 +1,673 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ *
+ * Authors:
+ * Anup Patel <apatel@ventanamicro.com>
+ */
+
+#include <linux/bits.h>
+#include <linux/kvm_host.h>
+#include <linux/uaccess.h>
+#include <asm/kvm_aia_imsic.h>
+
+static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
+{
+ struct kvm_vcpu *tmp_vcpu;
+
+ for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
+ tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
+ mutex_unlock(&tmp_vcpu->mutex);
+ }
+}
+
+static void unlock_all_vcpus(struct kvm *kvm)
+{
+ unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1);
+}
+
+static bool lock_all_vcpus(struct kvm *kvm)
+{
+ struct kvm_vcpu *tmp_vcpu;
+ unsigned long c;
+
+ kvm_for_each_vcpu(c, tmp_vcpu, kvm) {
+ if (!mutex_trylock(&tmp_vcpu->mutex)) {
+ unlock_vcpus(kvm, c - 1);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static int aia_create(struct kvm_device *dev, u32 type)
+{
+ int ret;
+ unsigned long i;
+ struct kvm *kvm = dev->kvm;
+ struct kvm_vcpu *vcpu;
+
+ if (irqchip_in_kernel(kvm))
+ return -EEXIST;
+
+ ret = -EBUSY;
+ if (!lock_all_vcpus(kvm))
+ return ret;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (vcpu->arch.ran_atleast_once)
+ goto out_unlock;
+ }
+ ret = 0;
+
+ kvm->arch.aia.in_kernel = true;
+
+out_unlock:
+ unlock_all_vcpus(kvm);
+ return ret;
+}
+
+static void aia_destroy(struct kvm_device *dev)
+{
+ kfree(dev);
+}
+
+static int aia_config(struct kvm *kvm, unsigned long type,
+ u32 *nr, bool write)
+{
+ struct kvm_aia *aia = &kvm->arch.aia;
+
+ /* Writes can only be done before irqchip is initialized */
+ if (write && kvm_riscv_aia_initialized(kvm))
+ return -EBUSY;
+
+ switch (type) {
+ case KVM_DEV_RISCV_AIA_CONFIG_MODE:
+ if (write) {
+ switch (*nr) {
+ case KVM_DEV_RISCV_AIA_MODE_EMUL:
+ break;
+ case KVM_DEV_RISCV_AIA_MODE_HWACCEL:
+ case KVM_DEV_RISCV_AIA_MODE_AUTO:
+ /*
+ * HW Acceleration and Auto modes only
+ * supported on host with non-zero guest
+ * external interrupts (i.e. non-zero
+ * VS-level IMSIC pages).
+ */
+ if (!kvm_riscv_aia_nr_hgei)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ aia->mode = *nr;
+ } else
+ *nr = aia->mode;
+ break;
+ case KVM_DEV_RISCV_AIA_CONFIG_IDS:
+ if (write) {
+ if ((*nr < KVM_DEV_RISCV_AIA_IDS_MIN) ||
+ (*nr >= KVM_DEV_RISCV_AIA_IDS_MAX) ||
+ ((*nr & KVM_DEV_RISCV_AIA_IDS_MIN) !=
+ KVM_DEV_RISCV_AIA_IDS_MIN) ||
+ (kvm_riscv_aia_max_ids <= *nr))
+ return -EINVAL;
+ aia->nr_ids = *nr;
+ } else
+ *nr = aia->nr_ids;
+ break;
+ case KVM_DEV_RISCV_AIA_CONFIG_SRCS:
+ if (write) {
+ if ((*nr >= KVM_DEV_RISCV_AIA_SRCS_MAX) ||
+ (*nr >= kvm_riscv_aia_max_ids))
+ return -EINVAL;
+ aia->nr_sources = *nr;
+ } else
+ *nr = aia->nr_sources;
+ break;
+ case KVM_DEV_RISCV_AIA_CONFIG_GROUP_BITS:
+ if (write) {
+ if (*nr >= KVM_DEV_RISCV_AIA_GROUP_BITS_MAX)
+ return -EINVAL;
+ aia->nr_group_bits = *nr;
+ } else
+ *nr = aia->nr_group_bits;
+ break;
+ case KVM_DEV_RISCV_AIA_CONFIG_GROUP_SHIFT:
+ if (write) {
+ if ((*nr < KVM_DEV_RISCV_AIA_GROUP_SHIFT_MIN) ||
+ (*nr >= KVM_DEV_RISCV_AIA_GROUP_SHIFT_MAX))
+ return -EINVAL;
+ aia->nr_group_shift = *nr;
+ } else
+ *nr = aia->nr_group_shift;
+ break;
+ case KVM_DEV_RISCV_AIA_CONFIG_HART_BITS:
+ if (write) {
+ if (*nr >= KVM_DEV_RISCV_AIA_HART_BITS_MAX)
+ return -EINVAL;
+ aia->nr_hart_bits = *nr;
+ } else
+ *nr = aia->nr_hart_bits;
+ break;
+ case KVM_DEV_RISCV_AIA_CONFIG_GUEST_BITS:
+ if (write) {
+ if (*nr >= KVM_DEV_RISCV_AIA_GUEST_BITS_MAX)
+ return -EINVAL;
+ aia->nr_guest_bits = *nr;
+ } else
+ *nr = aia->nr_guest_bits;
+ break;
+ default:
+ return -ENXIO;
+ }
+
+ return 0;
+}
+
+static int aia_aplic_addr(struct kvm *kvm, u64 *addr, bool write)
+{
+ struct kvm_aia *aia = &kvm->arch.aia;
+
+ if (write) {
+ /* Writes can only be done before irqchip is initialized */
+ if (kvm_riscv_aia_initialized(kvm))
+ return -EBUSY;
+
+ if (*addr & (KVM_DEV_RISCV_APLIC_ALIGN - 1))
+ return -EINVAL;
+
+ aia->aplic_addr = *addr;
+ } else
+ *addr = aia->aplic_addr;
+
+ return 0;
+}
+
+static int aia_imsic_addr(struct kvm *kvm, u64 *addr,
+ unsigned long vcpu_idx, bool write)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vcpu_aia *vcpu_aia;
+
+ vcpu = kvm_get_vcpu(kvm, vcpu_idx);
+ if (!vcpu)
+ return -EINVAL;
+ vcpu_aia = &vcpu->arch.aia_context;
+
+ if (write) {
+ /* Writes can only be done before irqchip is initialized */
+ if (kvm_riscv_aia_initialized(kvm))
+ return -EBUSY;
+
+ if (*addr & (KVM_DEV_RISCV_IMSIC_ALIGN - 1))
+ return -EINVAL;
+ }
+
+ mutex_lock(&vcpu->mutex);
+ if (write)
+ vcpu_aia->imsic_addr = *addr;
+ else
+ *addr = vcpu_aia->imsic_addr;
+ mutex_unlock(&vcpu->mutex);
+
+ return 0;
+}
+
+static gpa_t aia_imsic_ppn(struct kvm_aia *aia, gpa_t addr)
+{
+ u32 h, l;
+ gpa_t mask = 0;
+
+ h = aia->nr_hart_bits + aia->nr_guest_bits +
+ IMSIC_MMIO_PAGE_SHIFT - 1;
+ mask = GENMASK_ULL(h, 0);
+
+ if (aia->nr_group_bits) {
+ h = aia->nr_group_bits + aia->nr_group_shift - 1;
+ l = aia->nr_group_shift;
+ mask |= GENMASK_ULL(h, l);
+ }
+
+ return (addr & ~mask) >> IMSIC_MMIO_PAGE_SHIFT;
+}
+
+static u32 aia_imsic_hart_index(struct kvm_aia *aia, gpa_t addr)
+{
+ u32 hart, group = 0;
+
+ hart = (addr >> (aia->nr_guest_bits + IMSIC_MMIO_PAGE_SHIFT)) &
+ GENMASK_ULL(aia->nr_hart_bits - 1, 0);
+ if (aia->nr_group_bits)
+ group = (addr >> aia->nr_group_shift) &
+ GENMASK_ULL(aia->nr_group_bits - 1, 0);
+
+ return (group << aia->nr_hart_bits) | hart;
+}
+
+static int aia_init(struct kvm *kvm)
+{
+ int ret, i;
+ unsigned long idx;
+ struct kvm_vcpu *vcpu;
+ struct kvm_vcpu_aia *vaia;
+ struct kvm_aia *aia = &kvm->arch.aia;
+ gpa_t base_ppn = KVM_RISCV_AIA_UNDEF_ADDR;
+
+ /* Irqchip can be initialized only once */
+ if (kvm_riscv_aia_initialized(kvm))
+ return -EBUSY;
+
+ /* We might be in the middle of creating a VCPU? */
+ if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus))
+ return -EBUSY;
+
+ /* Number of sources should be less than or equals number of IDs */
+ if (aia->nr_ids < aia->nr_sources)
+ return -EINVAL;
+
+ /* APLIC base is required for non-zero number of sources */
+ if (aia->nr_sources && aia->aplic_addr == KVM_RISCV_AIA_UNDEF_ADDR)
+ return -EINVAL;
+
+ /* Initialize APLIC */
+ ret = kvm_riscv_aia_aplic_init(kvm);
+ if (ret)
+ return ret;
+
+ /* Iterate over each VCPU */
+ kvm_for_each_vcpu(idx, vcpu, kvm) {
+ vaia = &vcpu->arch.aia_context;
+
+ /* IMSIC base is required */
+ if (vaia->imsic_addr == KVM_RISCV_AIA_UNDEF_ADDR) {
+ ret = -EINVAL;
+ goto fail_cleanup_imsics;
+ }
+
+ /* All IMSICs should have matching base PPN */
+ if (base_ppn == KVM_RISCV_AIA_UNDEF_ADDR)
+ base_ppn = aia_imsic_ppn(aia, vaia->imsic_addr);
+ if (base_ppn != aia_imsic_ppn(aia, vaia->imsic_addr)) {
+ ret = -EINVAL;
+ goto fail_cleanup_imsics;
+ }
+
+ /* Update HART index of the IMSIC based on IMSIC base */
+ vaia->hart_index = aia_imsic_hart_index(aia,
+ vaia->imsic_addr);
+
+ /* Initialize IMSIC for this VCPU */
+ ret = kvm_riscv_vcpu_aia_imsic_init(vcpu);
+ if (ret)
+ goto fail_cleanup_imsics;
+ }
+
+ /* Set the initialized flag */
+ kvm->arch.aia.initialized = true;
+
+ return 0;
+
+fail_cleanup_imsics:
+ for (i = idx - 1; i >= 0; i--) {
+ vcpu = kvm_get_vcpu(kvm, i);
+ if (!vcpu)
+ continue;
+ kvm_riscv_vcpu_aia_imsic_cleanup(vcpu);
+ }
+ kvm_riscv_aia_aplic_cleanup(kvm);
+ return ret;
+}
+
+static int aia_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+ u32 nr;
+ u64 addr;
+ int nr_vcpus, r = -ENXIO;
+ unsigned long v, type = (unsigned long)attr->attr;
+ void __user *uaddr = (void __user *)(long)attr->addr;
+
+ switch (attr->group) {
+ case KVM_DEV_RISCV_AIA_GRP_CONFIG:
+ if (copy_from_user(&nr, uaddr, sizeof(nr)))
+ return -EFAULT;
+
+ mutex_lock(&dev->kvm->lock);
+ r = aia_config(dev->kvm, type, &nr, true);
+ mutex_unlock(&dev->kvm->lock);
+
+ break;
+
+ case KVM_DEV_RISCV_AIA_GRP_ADDR:
+ if (copy_from_user(&addr, uaddr, sizeof(addr)))
+ return -EFAULT;
+
+ nr_vcpus = atomic_read(&dev->kvm->online_vcpus);
+ mutex_lock(&dev->kvm->lock);
+ if (type == KVM_DEV_RISCV_AIA_ADDR_APLIC)
+ r = aia_aplic_addr(dev->kvm, &addr, true);
+ else if (type < KVM_DEV_RISCV_AIA_ADDR_IMSIC(nr_vcpus))
+ r = aia_imsic_addr(dev->kvm, &addr,
+ type - KVM_DEV_RISCV_AIA_ADDR_IMSIC(0), true);
+ mutex_unlock(&dev->kvm->lock);
+
+ break;
+
+ case KVM_DEV_RISCV_AIA_GRP_CTRL:
+ switch (type) {
+ case KVM_DEV_RISCV_AIA_CTRL_INIT:
+ mutex_lock(&dev->kvm->lock);
+ r = aia_init(dev->kvm);
+ mutex_unlock(&dev->kvm->lock);
+ break;
+ }
+
+ break;
+ case KVM_DEV_RISCV_AIA_GRP_APLIC:
+ if (copy_from_user(&nr, uaddr, sizeof(nr)))
+ return -EFAULT;
+
+ mutex_lock(&dev->kvm->lock);
+ r = kvm_riscv_aia_aplic_set_attr(dev->kvm, type, nr);
+ mutex_unlock(&dev->kvm->lock);
+
+ break;
+ case KVM_DEV_RISCV_AIA_GRP_IMSIC:
+ if (copy_from_user(&v, uaddr, sizeof(v)))
+ return -EFAULT;
+
+ mutex_lock(&dev->kvm->lock);
+ r = kvm_riscv_aia_imsic_rw_attr(dev->kvm, type, true, &v);
+ mutex_unlock(&dev->kvm->lock);
+
+ break;
+ }
+
+ return r;
+}
+
+static int aia_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+ u32 nr;
+ u64 addr;
+ int nr_vcpus, r = -ENXIO;
+ void __user *uaddr = (void __user *)(long)attr->addr;
+ unsigned long v, type = (unsigned long)attr->attr;
+
+ switch (attr->group) {
+ case KVM_DEV_RISCV_AIA_GRP_CONFIG:
+ if (copy_from_user(&nr, uaddr, sizeof(nr)))
+ return -EFAULT;
+
+ mutex_lock(&dev->kvm->lock);
+ r = aia_config(dev->kvm, type, &nr, false);
+ mutex_unlock(&dev->kvm->lock);
+ if (r)
+ return r;
+
+ if (copy_to_user(uaddr, &nr, sizeof(nr)))
+ return -EFAULT;
+
+ break;
+ case KVM_DEV_RISCV_AIA_GRP_ADDR:
+ if (copy_from_user(&addr, uaddr, sizeof(addr)))
+ return -EFAULT;
+
+ nr_vcpus = atomic_read(&dev->kvm->online_vcpus);
+ mutex_lock(&dev->kvm->lock);
+ if (type == KVM_DEV_RISCV_AIA_ADDR_APLIC)
+ r = aia_aplic_addr(dev->kvm, &addr, false);
+ else if (type < KVM_DEV_RISCV_AIA_ADDR_IMSIC(nr_vcpus))
+ r = aia_imsic_addr(dev->kvm, &addr,
+ type - KVM_DEV_RISCV_AIA_ADDR_IMSIC(0), false);
+ mutex_unlock(&dev->kvm->lock);
+ if (r)
+ return r;
+
+ if (copy_to_user(uaddr, &addr, sizeof(addr)))
+ return -EFAULT;
+
+ break;
+ case KVM_DEV_RISCV_AIA_GRP_APLIC:
+ if (copy_from_user(&nr, uaddr, sizeof(nr)))
+ return -EFAULT;
+
+ mutex_lock(&dev->kvm->lock);
+ r = kvm_riscv_aia_aplic_get_attr(dev->kvm, type, &nr);
+ mutex_unlock(&dev->kvm->lock);
+ if (r)
+ return r;
+
+ if (copy_to_user(uaddr, &nr, sizeof(nr)))
+ return -EFAULT;
+
+ break;
+ case KVM_DEV_RISCV_AIA_GRP_IMSIC:
+ if (copy_from_user(&v, uaddr, sizeof(v)))
+ return -EFAULT;
+
+ mutex_lock(&dev->kvm->lock);
+ r = kvm_riscv_aia_imsic_rw_attr(dev->kvm, type, false, &v);
+ mutex_unlock(&dev->kvm->lock);
+ if (r)
+ return r;
+
+ if (copy_to_user(uaddr, &v, sizeof(v)))
+ return -EFAULT;
+
+ break;
+ }
+
+ return r;
+}
+
+static int aia_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+ int nr_vcpus;
+
+ switch (attr->group) {
+ case KVM_DEV_RISCV_AIA_GRP_CONFIG:
+ switch (attr->attr) {
+ case KVM_DEV_RISCV_AIA_CONFIG_MODE:
+ case KVM_DEV_RISCV_AIA_CONFIG_IDS:
+ case KVM_DEV_RISCV_AIA_CONFIG_SRCS:
+ case KVM_DEV_RISCV_AIA_CONFIG_GROUP_BITS:
+ case KVM_DEV_RISCV_AIA_CONFIG_GROUP_SHIFT:
+ case KVM_DEV_RISCV_AIA_CONFIG_HART_BITS:
+ case KVM_DEV_RISCV_AIA_CONFIG_GUEST_BITS:
+ return 0;
+ }
+ break;
+ case KVM_DEV_RISCV_AIA_GRP_ADDR:
+ nr_vcpus = atomic_read(&dev->kvm->online_vcpus);
+ if (attr->attr == KVM_DEV_RISCV_AIA_ADDR_APLIC)
+ return 0;
+ else if (attr->attr < KVM_DEV_RISCV_AIA_ADDR_IMSIC(nr_vcpus))
+ return 0;
+ break;
+ case KVM_DEV_RISCV_AIA_GRP_CTRL:
+ switch (attr->attr) {
+ case KVM_DEV_RISCV_AIA_CTRL_INIT:
+ return 0;
+ }
+ break;
+ case KVM_DEV_RISCV_AIA_GRP_APLIC:
+ return kvm_riscv_aia_aplic_has_attr(dev->kvm, attr->attr);
+ case KVM_DEV_RISCV_AIA_GRP_IMSIC:
+ return kvm_riscv_aia_imsic_has_attr(dev->kvm, attr->attr);
+ }
+
+ return -ENXIO;
+}
+
+struct kvm_device_ops kvm_riscv_aia_device_ops = {
+ .name = "kvm-riscv-aia",
+ .create = aia_create,
+ .destroy = aia_destroy,
+ .set_attr = aia_set_attr,
+ .get_attr = aia_get_attr,
+ .has_attr = aia_has_attr,
+};
+
+int kvm_riscv_vcpu_aia_update(struct kvm_vcpu *vcpu)
+{
+ /* Proceed only if AIA was initialized successfully */
+ if (!kvm_riscv_aia_initialized(vcpu->kvm))
+ return 1;
+
+ /* Update the IMSIC HW state before entering guest mode */
+ return kvm_riscv_vcpu_aia_imsic_update(vcpu);
+}
+
+void kvm_riscv_vcpu_aia_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_aia_csr *csr = &vcpu->arch.aia_context.guest_csr;
+ struct kvm_vcpu_aia_csr *reset_csr =
+ &vcpu->arch.aia_context.guest_reset_csr;
+
+ if (!kvm_riscv_aia_available())
+ return;
+ memcpy(csr, reset_csr, sizeof(*csr));
+
+ /* Proceed only if AIA was initialized successfully */
+ if (!kvm_riscv_aia_initialized(vcpu->kvm))
+ return;
+
+ /* Reset the IMSIC context */
+ kvm_riscv_vcpu_aia_imsic_reset(vcpu);
+}
+
+int kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_aia *vaia = &vcpu->arch.aia_context;
+
+ if (!kvm_riscv_aia_available())
+ return 0;
+
+ /*
+ * We don't do any memory allocations over here because these
+ * will be done after AIA device is initialized by the user-space.
+ *
+ * Refer, aia_init() implementation for more details.
+ */
+
+ /* Initialize default values in AIA vcpu context */
+ vaia->imsic_addr = KVM_RISCV_AIA_UNDEF_ADDR;
+ vaia->hart_index = vcpu->vcpu_idx;
+
+ return 0;
+}
+
+void kvm_riscv_vcpu_aia_deinit(struct kvm_vcpu *vcpu)
+{
+ /* Proceed only if AIA was initialized successfully */
+ if (!kvm_riscv_aia_initialized(vcpu->kvm))
+ return;
+
+ /* Cleanup IMSIC context */
+ kvm_riscv_vcpu_aia_imsic_cleanup(vcpu);
+}
+
+int kvm_riscv_aia_inject_msi_by_id(struct kvm *kvm, u32 hart_index,
+ u32 guest_index, u32 iid)
+{
+ unsigned long idx;
+ struct kvm_vcpu *vcpu;
+
+ /* Proceed only if AIA was initialized successfully */
+ if (!kvm_riscv_aia_initialized(kvm))
+ return -EBUSY;
+
+ /* Inject MSI to matching VCPU */
+ kvm_for_each_vcpu(idx, vcpu, kvm) {
+ if (vcpu->arch.aia_context.hart_index == hart_index)
+ return kvm_riscv_vcpu_aia_imsic_inject(vcpu,
+ guest_index,
+ 0, iid);
+ }
+
+ return 0;
+}
+
+int kvm_riscv_aia_inject_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+ gpa_t tppn, ippn;
+ unsigned long idx;
+ struct kvm_vcpu *vcpu;
+ u32 g, toff, iid = msi->data;
+ struct kvm_aia *aia = &kvm->arch.aia;
+ gpa_t target = (((gpa_t)msi->address_hi) << 32) | msi->address_lo;
+
+ /* Proceed only if AIA was initialized successfully */
+ if (!kvm_riscv_aia_initialized(kvm))
+ return -EBUSY;
+
+ /* Convert target address to target PPN */
+ tppn = target >> IMSIC_MMIO_PAGE_SHIFT;
+
+ /* Extract and clear Guest ID from target PPN */
+ g = tppn & (BIT(aia->nr_guest_bits) - 1);
+ tppn &= ~((gpa_t)(BIT(aia->nr_guest_bits) - 1));
+
+ /* Inject MSI to matching VCPU */
+ kvm_for_each_vcpu(idx, vcpu, kvm) {
+ ippn = vcpu->arch.aia_context.imsic_addr >>
+ IMSIC_MMIO_PAGE_SHIFT;
+ if (ippn == tppn) {
+ toff = target & (IMSIC_MMIO_PAGE_SZ - 1);
+ return kvm_riscv_vcpu_aia_imsic_inject(vcpu, g,
+ toff, iid);
+ }
+ }
+
+ return 0;
+}
+
+int kvm_riscv_aia_inject_irq(struct kvm *kvm, unsigned int irq, bool level)
+{
+ /* Proceed only if AIA was initialized successfully */
+ if (!kvm_riscv_aia_initialized(kvm))
+ return -EBUSY;
+
+ /* Inject interrupt level change in APLIC */
+ return kvm_riscv_aia_aplic_inject(kvm, irq, level);
+}
+
+void kvm_riscv_aia_init_vm(struct kvm *kvm)
+{
+ struct kvm_aia *aia = &kvm->arch.aia;
+
+ if (!kvm_riscv_aia_available())
+ return;
+
+ /*
+ * We don't do any memory allocations over here because these
+ * will be done after AIA device is initialized by the user-space.
+ *
+ * Refer, aia_init() implementation for more details.
+ */
+
+ /* Initialize default values in AIA global context */
+ aia->mode = (kvm_riscv_aia_nr_hgei) ?
+ KVM_DEV_RISCV_AIA_MODE_AUTO : KVM_DEV_RISCV_AIA_MODE_EMUL;
+ aia->nr_ids = kvm_riscv_aia_max_ids - 1;
+ aia->nr_sources = 0;
+ aia->nr_group_bits = 0;
+ aia->nr_group_shift = KVM_DEV_RISCV_AIA_GROUP_SHIFT_MIN;
+ aia->nr_hart_bits = 0;
+ aia->nr_guest_bits = 0;
+ aia->aplic_addr = KVM_RISCV_AIA_UNDEF_ADDR;
+}
+
+void kvm_riscv_aia_destroy_vm(struct kvm *kvm)
+{
+ /* Proceed only if AIA was initialized successfully */
+ if (!kvm_riscv_aia_initialized(kvm))
+ return;
+
+ /* Cleanup APLIC context */
+ kvm_riscv_aia_aplic_cleanup(kvm);
+}
diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c
new file mode 100644
index 000000000..e808723a8
--- /dev/null
+++ b/arch/riscv/kvm/aia_imsic.c
@@ -0,0 +1,1097 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ *
+ * Authors:
+ * Anup Patel <apatel@ventanamicro.com>
+ */
+
+#include <linux/atomic.h>
+#include <linux/bitmap.h>
+#include <linux/kvm_host.h>
+#include <linux/math.h>
+#include <linux/spinlock.h>
+#include <linux/swab.h>
+#include <kvm/iodev.h>
+#include <asm/csr.h>
+#include <asm/kvm_aia_imsic.h>
+
+#define IMSIC_MAX_EIX (IMSIC_MAX_ID / BITS_PER_TYPE(u64))
+
+struct imsic_mrif_eix {
+ unsigned long eip[BITS_PER_TYPE(u64) / BITS_PER_LONG];
+ unsigned long eie[BITS_PER_TYPE(u64) / BITS_PER_LONG];
+};
+
+struct imsic_mrif {
+ struct imsic_mrif_eix eix[IMSIC_MAX_EIX];
+ unsigned long eithreshold;
+ unsigned long eidelivery;
+};
+
+struct imsic {
+ struct kvm_io_device iodev;
+
+ u32 nr_msis;
+ u32 nr_eix;
+ u32 nr_hw_eix;
+
+ /*
+ * At any point in time, the register state is in
+ * one of the following places:
+ *
+ * 1) Hardware: IMSIC VS-file (vsfile_cpu >= 0)
+ * 2) Software: IMSIC SW-file (vsfile_cpu < 0)
+ */
+
+ /* IMSIC VS-file */
+ rwlock_t vsfile_lock;
+ int vsfile_cpu;
+ int vsfile_hgei;
+ void __iomem *vsfile_va;
+ phys_addr_t vsfile_pa;
+
+ /* IMSIC SW-file */
+ struct imsic_mrif *swfile;
+ phys_addr_t swfile_pa;
+ spinlock_t swfile_extirq_lock;
+};
+
+#define imsic_vs_csr_read(__c) \
+({ \
+ unsigned long __r; \
+ csr_write(CSR_VSISELECT, __c); \
+ __r = csr_read(CSR_VSIREG); \
+ __r; \
+})
+
+#define imsic_read_switchcase(__ireg) \
+ case __ireg: \
+ return imsic_vs_csr_read(__ireg);
+#define imsic_read_switchcase_2(__ireg) \
+ imsic_read_switchcase(__ireg + 0) \
+ imsic_read_switchcase(__ireg + 1)
+#define imsic_read_switchcase_4(__ireg) \
+ imsic_read_switchcase_2(__ireg + 0) \
+ imsic_read_switchcase_2(__ireg + 2)
+#define imsic_read_switchcase_8(__ireg) \
+ imsic_read_switchcase_4(__ireg + 0) \
+ imsic_read_switchcase_4(__ireg + 4)
+#define imsic_read_switchcase_16(__ireg) \
+ imsic_read_switchcase_8(__ireg + 0) \
+ imsic_read_switchcase_8(__ireg + 8)
+#define imsic_read_switchcase_32(__ireg) \
+ imsic_read_switchcase_16(__ireg + 0) \
+ imsic_read_switchcase_16(__ireg + 16)
+#define imsic_read_switchcase_64(__ireg) \
+ imsic_read_switchcase_32(__ireg + 0) \
+ imsic_read_switchcase_32(__ireg + 32)
+
+static unsigned long imsic_eix_read(int ireg)
+{
+ switch (ireg) {
+ imsic_read_switchcase_64(IMSIC_EIP0)
+ imsic_read_switchcase_64(IMSIC_EIE0)
+ }
+
+ return 0;
+}
+
+#define imsic_vs_csr_swap(__c, __v) \
+({ \
+ unsigned long __r; \
+ csr_write(CSR_VSISELECT, __c); \
+ __r = csr_swap(CSR_VSIREG, __v); \
+ __r; \
+})
+
+#define imsic_swap_switchcase(__ireg, __v) \
+ case __ireg: \
+ return imsic_vs_csr_swap(__ireg, __v);
+#define imsic_swap_switchcase_2(__ireg, __v) \
+ imsic_swap_switchcase(__ireg + 0, __v) \
+ imsic_swap_switchcase(__ireg + 1, __v)
+#define imsic_swap_switchcase_4(__ireg, __v) \
+ imsic_swap_switchcase_2(__ireg + 0, __v) \
+ imsic_swap_switchcase_2(__ireg + 2, __v)
+#define imsic_swap_switchcase_8(__ireg, __v) \
+ imsic_swap_switchcase_4(__ireg + 0, __v) \
+ imsic_swap_switchcase_4(__ireg + 4, __v)
+#define imsic_swap_switchcase_16(__ireg, __v) \
+ imsic_swap_switchcase_8(__ireg + 0, __v) \
+ imsic_swap_switchcase_8(__ireg + 8, __v)
+#define imsic_swap_switchcase_32(__ireg, __v) \
+ imsic_swap_switchcase_16(__ireg + 0, __v) \
+ imsic_swap_switchcase_16(__ireg + 16, __v)
+#define imsic_swap_switchcase_64(__ireg, __v) \
+ imsic_swap_switchcase_32(__ireg + 0, __v) \
+ imsic_swap_switchcase_32(__ireg + 32, __v)
+
+static unsigned long imsic_eix_swap(int ireg, unsigned long val)
+{
+ switch (ireg) {
+ imsic_swap_switchcase_64(IMSIC_EIP0, val)
+ imsic_swap_switchcase_64(IMSIC_EIE0, val)
+ }
+
+ return 0;
+}
+
+#define imsic_vs_csr_write(__c, __v) \
+do { \
+ csr_write(CSR_VSISELECT, __c); \
+ csr_write(CSR_VSIREG, __v); \
+} while (0)
+
+#define imsic_write_switchcase(__ireg, __v) \
+ case __ireg: \
+ imsic_vs_csr_write(__ireg, __v); \
+ break;
+#define imsic_write_switchcase_2(__ireg, __v) \
+ imsic_write_switchcase(__ireg + 0, __v) \
+ imsic_write_switchcase(__ireg + 1, __v)
+#define imsic_write_switchcase_4(__ireg, __v) \
+ imsic_write_switchcase_2(__ireg + 0, __v) \
+ imsic_write_switchcase_2(__ireg + 2, __v)
+#define imsic_write_switchcase_8(__ireg, __v) \
+ imsic_write_switchcase_4(__ireg + 0, __v) \
+ imsic_write_switchcase_4(__ireg + 4, __v)
+#define imsic_write_switchcase_16(__ireg, __v) \
+ imsic_write_switchcase_8(__ireg + 0, __v) \
+ imsic_write_switchcase_8(__ireg + 8, __v)
+#define imsic_write_switchcase_32(__ireg, __v) \
+ imsic_write_switchcase_16(__ireg + 0, __v) \
+ imsic_write_switchcase_16(__ireg + 16, __v)
+#define imsic_write_switchcase_64(__ireg, __v) \
+ imsic_write_switchcase_32(__ireg + 0, __v) \
+ imsic_write_switchcase_32(__ireg + 32, __v)
+
+static void imsic_eix_write(int ireg, unsigned long val)
+{
+ switch (ireg) {
+ imsic_write_switchcase_64(IMSIC_EIP0, val)
+ imsic_write_switchcase_64(IMSIC_EIE0, val)
+ }
+}
+
+#define imsic_vs_csr_set(__c, __v) \
+do { \
+ csr_write(CSR_VSISELECT, __c); \
+ csr_set(CSR_VSIREG, __v); \
+} while (0)
+
+#define imsic_set_switchcase(__ireg, __v) \
+ case __ireg: \
+ imsic_vs_csr_set(__ireg, __v); \
+ break;
+#define imsic_set_switchcase_2(__ireg, __v) \
+ imsic_set_switchcase(__ireg + 0, __v) \
+ imsic_set_switchcase(__ireg + 1, __v)
+#define imsic_set_switchcase_4(__ireg, __v) \
+ imsic_set_switchcase_2(__ireg + 0, __v) \
+ imsic_set_switchcase_2(__ireg + 2, __v)
+#define imsic_set_switchcase_8(__ireg, __v) \
+ imsic_set_switchcase_4(__ireg + 0, __v) \
+ imsic_set_switchcase_4(__ireg + 4, __v)
+#define imsic_set_switchcase_16(__ireg, __v) \
+ imsic_set_switchcase_8(__ireg + 0, __v) \
+ imsic_set_switchcase_8(__ireg + 8, __v)
+#define imsic_set_switchcase_32(__ireg, __v) \
+ imsic_set_switchcase_16(__ireg + 0, __v) \
+ imsic_set_switchcase_16(__ireg + 16, __v)
+#define imsic_set_switchcase_64(__ireg, __v) \
+ imsic_set_switchcase_32(__ireg + 0, __v) \
+ imsic_set_switchcase_32(__ireg + 32, __v)
+
+static void imsic_eix_set(int ireg, unsigned long val)
+{
+ switch (ireg) {
+ imsic_set_switchcase_64(IMSIC_EIP0, val)
+ imsic_set_switchcase_64(IMSIC_EIE0, val)
+ }
+}
+
+static unsigned long imsic_mrif_atomic_rmw(struct imsic_mrif *mrif,
+ unsigned long *ptr,
+ unsigned long new_val,
+ unsigned long wr_mask)
+{
+ unsigned long old_val = 0, tmp = 0;
+
+ __asm__ __volatile__ (
+ "0: lr.w.aq %1, %0\n"
+ " and %2, %1, %3\n"
+ " or %2, %2, %4\n"
+ " sc.w.rl %2, %2, %0\n"
+ " bnez %2, 0b"
+ : "+A" (*ptr), "+r" (old_val), "+r" (tmp)
+ : "r" (~wr_mask), "r" (new_val & wr_mask)
+ : "memory");
+
+ return old_val;
+}
+
+static unsigned long imsic_mrif_atomic_or(struct imsic_mrif *mrif,
+ unsigned long *ptr,
+ unsigned long val)
+{
+ return atomic_long_fetch_or(val, (atomic_long_t *)ptr);
+}
+
+#define imsic_mrif_atomic_write(__mrif, __ptr, __new_val) \
+ imsic_mrif_atomic_rmw(__mrif, __ptr, __new_val, -1UL)
+#define imsic_mrif_atomic_read(__mrif, __ptr) \
+ imsic_mrif_atomic_or(__mrif, __ptr, 0)
+
+static u32 imsic_mrif_topei(struct imsic_mrif *mrif, u32 nr_eix, u32 nr_msis)
+{
+ struct imsic_mrif_eix *eix;
+ u32 i, imin, imax, ei, max_msi;
+ unsigned long eipend[BITS_PER_TYPE(u64) / BITS_PER_LONG];
+ unsigned long eithreshold = imsic_mrif_atomic_read(mrif,
+ &mrif->eithreshold);
+
+ max_msi = (eithreshold && (eithreshold <= nr_msis)) ?
+ eithreshold : nr_msis;
+ for (ei = 0; ei < nr_eix; ei++) {
+ eix = &mrif->eix[ei];
+ eipend[0] = imsic_mrif_atomic_read(mrif, &eix->eie[0]) &
+ imsic_mrif_atomic_read(mrif, &eix->eip[0]);
+#ifdef CONFIG_32BIT
+ eipend[1] = imsic_mrif_atomic_read(mrif, &eix->eie[1]) &
+ imsic_mrif_atomic_read(mrif, &eix->eip[1]);
+ if (!eipend[0] && !eipend[1])
+#else
+ if (!eipend[0])
+#endif
+ continue;
+
+ imin = ei * BITS_PER_TYPE(u64);
+ imax = ((imin + BITS_PER_TYPE(u64)) < max_msi) ?
+ imin + BITS_PER_TYPE(u64) : max_msi;
+ for (i = (!imin) ? 1 : imin; i < imax; i++) {
+ if (test_bit(i - imin, eipend))
+ return (i << TOPEI_ID_SHIFT) | i;
+ }
+ }
+
+ return 0;
+}
+
+static int imsic_mrif_isel_check(u32 nr_eix, unsigned long isel)
+{
+ u32 num = 0;
+
+ switch (isel) {
+ case IMSIC_EIDELIVERY:
+ case IMSIC_EITHRESHOLD:
+ break;
+ case IMSIC_EIP0 ... IMSIC_EIP63:
+ num = isel - IMSIC_EIP0;
+ break;
+ case IMSIC_EIE0 ... IMSIC_EIE63:
+ num = isel - IMSIC_EIE0;
+ break;
+ default:
+ return -ENOENT;
+ }
+#ifndef CONFIG_32BIT
+ if (num & 0x1)
+ return -EINVAL;
+#endif
+ if ((num / 2) >= nr_eix)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int imsic_mrif_rmw(struct imsic_mrif *mrif, u32 nr_eix,
+ unsigned long isel, unsigned long *val,
+ unsigned long new_val, unsigned long wr_mask)
+{
+ bool pend;
+ struct imsic_mrif_eix *eix;
+ unsigned long *ei, num, old_val = 0;
+
+ switch (isel) {
+ case IMSIC_EIDELIVERY:
+ old_val = imsic_mrif_atomic_rmw(mrif, &mrif->eidelivery,
+ new_val, wr_mask & 0x1);
+ break;
+ case IMSIC_EITHRESHOLD:
+ old_val = imsic_mrif_atomic_rmw(mrif, &mrif->eithreshold,
+ new_val, wr_mask & (IMSIC_MAX_ID - 1));
+ break;
+ case IMSIC_EIP0 ... IMSIC_EIP63:
+ case IMSIC_EIE0 ... IMSIC_EIE63:
+ if (isel >= IMSIC_EIP0 && isel <= IMSIC_EIP63) {
+ pend = true;
+ num = isel - IMSIC_EIP0;
+ } else {
+ pend = false;
+ num = isel - IMSIC_EIE0;
+ }
+
+ if ((num / 2) >= nr_eix)
+ return -EINVAL;
+ eix = &mrif->eix[num / 2];
+
+#ifndef CONFIG_32BIT
+ if (num & 0x1)
+ return -EINVAL;
+ ei = (pend) ? &eix->eip[0] : &eix->eie[0];
+#else
+ ei = (pend) ? &eix->eip[num & 0x1] : &eix->eie[num & 0x1];
+#endif
+
+ /* Bit0 of EIP0 or EIE0 is read-only */
+ if (!num)
+ wr_mask &= ~BIT(0);
+
+ old_val = imsic_mrif_atomic_rmw(mrif, ei, new_val, wr_mask);
+ break;
+ default:
+ return -ENOENT;
+ }
+
+ if (val)
+ *val = old_val;
+
+ return 0;
+}
+
+struct imsic_vsfile_read_data {
+ int hgei;
+ u32 nr_eix;
+ bool clear;
+ struct imsic_mrif *mrif;
+};
+
+static void imsic_vsfile_local_read(void *data)
+{
+ u32 i;
+ struct imsic_mrif_eix *eix;
+ struct imsic_vsfile_read_data *idata = data;
+ struct imsic_mrif *mrif = idata->mrif;
+ unsigned long new_hstatus, old_hstatus, old_vsiselect;
+
+ old_vsiselect = csr_read(CSR_VSISELECT);
+ old_hstatus = csr_read(CSR_HSTATUS);
+ new_hstatus = old_hstatus & ~HSTATUS_VGEIN;
+ new_hstatus |= ((unsigned long)idata->hgei) << HSTATUS_VGEIN_SHIFT;
+ csr_write(CSR_HSTATUS, new_hstatus);
+
+ /*
+ * We don't use imsic_mrif_atomic_xyz() functions to store
+ * values in MRIF because imsic_vsfile_read() is always called
+ * with pointer to temporary MRIF on stack.
+ */
+
+ if (idata->clear) {
+ mrif->eidelivery = imsic_vs_csr_swap(IMSIC_EIDELIVERY, 0);
+ mrif->eithreshold = imsic_vs_csr_swap(IMSIC_EITHRESHOLD, 0);
+ for (i = 0; i < idata->nr_eix; i++) {
+ eix = &mrif->eix[i];
+ eix->eip[0] = imsic_eix_swap(IMSIC_EIP0 + i * 2, 0);
+ eix->eie[0] = imsic_eix_swap(IMSIC_EIE0 + i * 2, 0);
+#ifdef CONFIG_32BIT
+ eix->eip[1] = imsic_eix_swap(IMSIC_EIP0 + i * 2 + 1, 0);
+ eix->eie[1] = imsic_eix_swap(IMSIC_EIE0 + i * 2 + 1, 0);
+#endif
+ }
+ } else {
+ mrif->eidelivery = imsic_vs_csr_read(IMSIC_EIDELIVERY);
+ mrif->eithreshold = imsic_vs_csr_read(IMSIC_EITHRESHOLD);
+ for (i = 0; i < idata->nr_eix; i++) {
+ eix = &mrif->eix[i];
+ eix->eip[0] = imsic_eix_read(IMSIC_EIP0 + i * 2);
+ eix->eie[0] = imsic_eix_read(IMSIC_EIE0 + i * 2);
+#ifdef CONFIG_32BIT
+ eix->eip[1] = imsic_eix_read(IMSIC_EIP0 + i * 2 + 1);
+ eix->eie[1] = imsic_eix_read(IMSIC_EIE0 + i * 2 + 1);
+#endif
+ }
+ }
+
+ csr_write(CSR_HSTATUS, old_hstatus);
+ csr_write(CSR_VSISELECT, old_vsiselect);
+}
+
+static void imsic_vsfile_read(int vsfile_hgei, int vsfile_cpu, u32 nr_eix,
+ bool clear, struct imsic_mrif *mrif)
+{
+ struct imsic_vsfile_read_data idata;
+
+ /* We can only read clear if we have a IMSIC VS-file */
+ if (vsfile_cpu < 0 || vsfile_hgei <= 0)
+ return;
+
+ /* We can only read clear on local CPU */
+ idata.hgei = vsfile_hgei;
+ idata.nr_eix = nr_eix;
+ idata.clear = clear;
+ idata.mrif = mrif;
+ on_each_cpu_mask(cpumask_of(vsfile_cpu),
+ imsic_vsfile_local_read, &idata, 1);
+}
+
+struct imsic_vsfile_rw_data {
+ int hgei;
+ int isel;
+ bool write;
+ unsigned long val;
+};
+
+static void imsic_vsfile_local_rw(void *data)
+{
+ struct imsic_vsfile_rw_data *idata = data;
+ unsigned long new_hstatus, old_hstatus, old_vsiselect;
+
+ old_vsiselect = csr_read(CSR_VSISELECT);
+ old_hstatus = csr_read(CSR_HSTATUS);
+ new_hstatus = old_hstatus & ~HSTATUS_VGEIN;
+ new_hstatus |= ((unsigned long)idata->hgei) << HSTATUS_VGEIN_SHIFT;
+ csr_write(CSR_HSTATUS, new_hstatus);
+
+ switch (idata->isel) {
+ case IMSIC_EIDELIVERY:
+ if (idata->write)
+ imsic_vs_csr_write(IMSIC_EIDELIVERY, idata->val);
+ else
+ idata->val = imsic_vs_csr_read(IMSIC_EIDELIVERY);
+ break;
+ case IMSIC_EITHRESHOLD:
+ if (idata->write)
+ imsic_vs_csr_write(IMSIC_EITHRESHOLD, idata->val);
+ else
+ idata->val = imsic_vs_csr_read(IMSIC_EITHRESHOLD);
+ break;
+ case IMSIC_EIP0 ... IMSIC_EIP63:
+ case IMSIC_EIE0 ... IMSIC_EIE63:
+#ifndef CONFIG_32BIT
+ if (idata->isel & 0x1)
+ break;
+#endif
+ if (idata->write)
+ imsic_eix_write(idata->isel, idata->val);
+ else
+ idata->val = imsic_eix_read(idata->isel);
+ break;
+ default:
+ break;
+ }
+
+ csr_write(CSR_HSTATUS, old_hstatus);
+ csr_write(CSR_VSISELECT, old_vsiselect);
+}
+
+static int imsic_vsfile_rw(int vsfile_hgei, int vsfile_cpu, u32 nr_eix,
+ unsigned long isel, bool write,
+ unsigned long *val)
+{
+ int rc;
+ struct imsic_vsfile_rw_data rdata;
+
+ /* We can only access register if we have a IMSIC VS-file */
+ if (vsfile_cpu < 0 || vsfile_hgei <= 0)
+ return -EINVAL;
+
+ /* Check IMSIC register iselect */
+ rc = imsic_mrif_isel_check(nr_eix, isel);
+ if (rc)
+ return rc;
+
+ /* We can only access register on local CPU */
+ rdata.hgei = vsfile_hgei;
+ rdata.isel = isel;
+ rdata.write = write;
+ rdata.val = (write) ? *val : 0;
+ on_each_cpu_mask(cpumask_of(vsfile_cpu),
+ imsic_vsfile_local_rw, &rdata, 1);
+
+ if (!write)
+ *val = rdata.val;
+
+ return 0;
+}
+
+static void imsic_vsfile_local_clear(int vsfile_hgei, u32 nr_eix)
+{
+ u32 i;
+ unsigned long new_hstatus, old_hstatus, old_vsiselect;
+
+ /* We can only zero-out if we have a IMSIC VS-file */
+ if (vsfile_hgei <= 0)
+ return;
+
+ old_vsiselect = csr_read(CSR_VSISELECT);
+ old_hstatus = csr_read(CSR_HSTATUS);
+ new_hstatus = old_hstatus & ~HSTATUS_VGEIN;
+ new_hstatus |= ((unsigned long)vsfile_hgei) << HSTATUS_VGEIN_SHIFT;
+ csr_write(CSR_HSTATUS, new_hstatus);
+
+ imsic_vs_csr_write(IMSIC_EIDELIVERY, 0);
+ imsic_vs_csr_write(IMSIC_EITHRESHOLD, 0);
+ for (i = 0; i < nr_eix; i++) {
+ imsic_eix_write(IMSIC_EIP0 + i * 2, 0);
+ imsic_eix_write(IMSIC_EIE0 + i * 2, 0);
+#ifdef CONFIG_32BIT
+ imsic_eix_write(IMSIC_EIP0 + i * 2 + 1, 0);
+ imsic_eix_write(IMSIC_EIE0 + i * 2 + 1, 0);
+#endif
+ }
+
+ csr_write(CSR_HSTATUS, old_hstatus);
+ csr_write(CSR_VSISELECT, old_vsiselect);
+}
+
+static void imsic_vsfile_local_update(int vsfile_hgei, u32 nr_eix,
+ struct imsic_mrif *mrif)
+{
+ u32 i;
+ struct imsic_mrif_eix *eix;
+ unsigned long new_hstatus, old_hstatus, old_vsiselect;
+
+ /* We can only update if we have a HW IMSIC context */
+ if (vsfile_hgei <= 0)
+ return;
+
+ /*
+ * We don't use imsic_mrif_atomic_xyz() functions to read values
+ * from MRIF in this function because it is always called with
+ * pointer to temporary MRIF on stack.
+ */
+
+ old_vsiselect = csr_read(CSR_VSISELECT);
+ old_hstatus = csr_read(CSR_HSTATUS);
+ new_hstatus = old_hstatus & ~HSTATUS_VGEIN;
+ new_hstatus |= ((unsigned long)vsfile_hgei) << HSTATUS_VGEIN_SHIFT;
+ csr_write(CSR_HSTATUS, new_hstatus);
+
+ for (i = 0; i < nr_eix; i++) {
+ eix = &mrif->eix[i];
+ imsic_eix_set(IMSIC_EIP0 + i * 2, eix->eip[0]);
+ imsic_eix_set(IMSIC_EIE0 + i * 2, eix->eie[0]);
+#ifdef CONFIG_32BIT
+ imsic_eix_set(IMSIC_EIP0 + i * 2 + 1, eix->eip[1]);
+ imsic_eix_set(IMSIC_EIE0 + i * 2 + 1, eix->eie[1]);
+#endif
+ }
+ imsic_vs_csr_write(IMSIC_EITHRESHOLD, mrif->eithreshold);
+ imsic_vs_csr_write(IMSIC_EIDELIVERY, mrif->eidelivery);
+
+ csr_write(CSR_HSTATUS, old_hstatus);
+ csr_write(CSR_VSISELECT, old_vsiselect);
+}
+
+static void imsic_vsfile_cleanup(struct imsic *imsic)
+{
+ int old_vsfile_hgei, old_vsfile_cpu;
+ unsigned long flags;
+
+ /*
+ * We don't use imsic_mrif_atomic_xyz() functions to clear the
+ * SW-file in this function because it is always called when the
+ * VCPU is being destroyed.
+ */
+
+ write_lock_irqsave(&imsic->vsfile_lock, flags);
+ old_vsfile_hgei = imsic->vsfile_hgei;
+ old_vsfile_cpu = imsic->vsfile_cpu;
+ imsic->vsfile_cpu = imsic->vsfile_hgei = -1;
+ imsic->vsfile_va = NULL;
+ imsic->vsfile_pa = 0;
+ write_unlock_irqrestore(&imsic->vsfile_lock, flags);
+
+ memset(imsic->swfile, 0, sizeof(*imsic->swfile));
+
+ if (old_vsfile_cpu >= 0)
+ kvm_riscv_aia_free_hgei(old_vsfile_cpu, old_vsfile_hgei);
+}
+
+static void imsic_swfile_extirq_update(struct kvm_vcpu *vcpu)
+{
+ struct imsic *imsic = vcpu->arch.aia_context.imsic_state;
+ struct imsic_mrif *mrif = imsic->swfile;
+ unsigned long flags;
+
+ /*
+ * The critical section is necessary during external interrupt
+ * updates to avoid the risk of losing interrupts due to potential
+ * interruptions between reading topei and updating pending status.
+ */
+
+ spin_lock_irqsave(&imsic->swfile_extirq_lock, flags);
+
+ if (imsic_mrif_atomic_read(mrif, &mrif->eidelivery) &&
+ imsic_mrif_topei(mrif, imsic->nr_eix, imsic->nr_msis))
+ kvm_riscv_vcpu_set_interrupt(vcpu, IRQ_VS_EXT);
+ else
+ kvm_riscv_vcpu_unset_interrupt(vcpu, IRQ_VS_EXT);
+
+ spin_unlock_irqrestore(&imsic->swfile_extirq_lock, flags);
+}
+
+static void imsic_swfile_read(struct kvm_vcpu *vcpu, bool clear,
+ struct imsic_mrif *mrif)
+{
+ struct imsic *imsic = vcpu->arch.aia_context.imsic_state;
+
+ /*
+ * We don't use imsic_mrif_atomic_xyz() functions to read and
+ * write SW-file and MRIF in this function because it is always
+ * called when VCPU is not using SW-file and the MRIF points to
+ * a temporary MRIF on stack.
+ */
+
+ memcpy(mrif, imsic->swfile, sizeof(*mrif));
+ if (clear) {
+ memset(imsic->swfile, 0, sizeof(*imsic->swfile));
+ kvm_riscv_vcpu_unset_interrupt(vcpu, IRQ_VS_EXT);
+ }
+}
+
+static void imsic_swfile_update(struct kvm_vcpu *vcpu,
+ struct imsic_mrif *mrif)
+{
+ u32 i;
+ struct imsic_mrif_eix *seix, *eix;
+ struct imsic *imsic = vcpu->arch.aia_context.imsic_state;
+ struct imsic_mrif *smrif = imsic->swfile;
+
+ imsic_mrif_atomic_write(smrif, &smrif->eidelivery, mrif->eidelivery);
+ imsic_mrif_atomic_write(smrif, &smrif->eithreshold, mrif->eithreshold);
+ for (i = 0; i < imsic->nr_eix; i++) {
+ seix = &smrif->eix[i];
+ eix = &mrif->eix[i];
+ imsic_mrif_atomic_or(smrif, &seix->eip[0], eix->eip[0]);
+ imsic_mrif_atomic_or(smrif, &seix->eie[0], eix->eie[0]);
+#ifdef CONFIG_32BIT
+ imsic_mrif_atomic_or(smrif, &seix->eip[1], eix->eip[1]);
+ imsic_mrif_atomic_or(smrif, &seix->eie[1], eix->eie[1]);
+#endif
+ }
+
+ imsic_swfile_extirq_update(vcpu);
+}
+
+void kvm_riscv_vcpu_aia_imsic_release(struct kvm_vcpu *vcpu)
+{
+ unsigned long flags;
+ struct imsic_mrif tmrif;
+ int old_vsfile_hgei, old_vsfile_cpu;
+ struct imsic *imsic = vcpu->arch.aia_context.imsic_state;
+
+ /* Read and clear IMSIC VS-file details */
+ write_lock_irqsave(&imsic->vsfile_lock, flags);
+ old_vsfile_hgei = imsic->vsfile_hgei;
+ old_vsfile_cpu = imsic->vsfile_cpu;
+ imsic->vsfile_cpu = imsic->vsfile_hgei = -1;
+ imsic->vsfile_va = NULL;
+ imsic->vsfile_pa = 0;
+ write_unlock_irqrestore(&imsic->vsfile_lock, flags);
+
+ /* Do nothing, if no IMSIC VS-file to release */
+ if (old_vsfile_cpu < 0)
+ return;
+
+ /*
+ * At this point, all interrupt producers are still using
+ * the old IMSIC VS-file so we first re-direct all interrupt
+ * producers.
+ */
+
+ /* Purge the G-stage mapping */
+ kvm_riscv_gstage_iounmap(vcpu->kvm,
+ vcpu->arch.aia_context.imsic_addr,
+ IMSIC_MMIO_PAGE_SZ);
+
+ /* TODO: Purge the IOMMU mapping ??? */
+
+ /*
+ * At this point, all interrupt producers have been re-directed
+ * to somewhere else so we move register state from the old IMSIC
+ * VS-file to the IMSIC SW-file.
+ */
+
+ /* Read and clear register state from old IMSIC VS-file */
+ memset(&tmrif, 0, sizeof(tmrif));
+ imsic_vsfile_read(old_vsfile_hgei, old_vsfile_cpu, imsic->nr_hw_eix,
+ true, &tmrif);
+
+ /* Update register state in IMSIC SW-file */
+ imsic_swfile_update(vcpu, &tmrif);
+
+ /* Free-up old IMSIC VS-file */
+ kvm_riscv_aia_free_hgei(old_vsfile_cpu, old_vsfile_hgei);
+}
+
+int kvm_riscv_vcpu_aia_imsic_update(struct kvm_vcpu *vcpu)
+{
+ unsigned long flags;
+ phys_addr_t new_vsfile_pa;
+ struct imsic_mrif tmrif;
+ void __iomem *new_vsfile_va;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_run *run = vcpu->run;
+ struct kvm_vcpu_aia *vaia = &vcpu->arch.aia_context;
+ struct imsic *imsic = vaia->imsic_state;
+ int ret = 0, new_vsfile_hgei = -1, old_vsfile_hgei, old_vsfile_cpu;
+
+ /* Do nothing for emulation mode */
+ if (kvm->arch.aia.mode == KVM_DEV_RISCV_AIA_MODE_EMUL)
+ return 1;
+
+ /* Read old IMSIC VS-file details */
+ read_lock_irqsave(&imsic->vsfile_lock, flags);
+ old_vsfile_hgei = imsic->vsfile_hgei;
+ old_vsfile_cpu = imsic->vsfile_cpu;
+ read_unlock_irqrestore(&imsic->vsfile_lock, flags);
+
+ /* Do nothing if we are continuing on same CPU */
+ if (old_vsfile_cpu == vcpu->cpu)
+ return 1;
+
+ /* Allocate new IMSIC VS-file */
+ ret = kvm_riscv_aia_alloc_hgei(vcpu->cpu, vcpu,
+ &new_vsfile_va, &new_vsfile_pa);
+ if (ret <= 0) {
+ /* For HW acceleration mode, we can't continue */
+ if (kvm->arch.aia.mode == KVM_DEV_RISCV_AIA_MODE_HWACCEL) {
+ run->fail_entry.hardware_entry_failure_reason =
+ CSR_HSTATUS;
+ run->fail_entry.cpu = vcpu->cpu;
+ run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ return 0;
+ }
+
+ /* Release old IMSIC VS-file */
+ if (old_vsfile_cpu >= 0)
+ kvm_riscv_vcpu_aia_imsic_release(vcpu);
+
+ /* For automatic mode, we continue */
+ goto done;
+ }
+ new_vsfile_hgei = ret;
+
+ /*
+ * At this point, all interrupt producers are still using
+ * to the old IMSIC VS-file so we first move all interrupt
+ * producers to the new IMSIC VS-file.
+ */
+
+ /* Zero-out new IMSIC VS-file */
+ imsic_vsfile_local_clear(new_vsfile_hgei, imsic->nr_hw_eix);
+
+ /* Update G-stage mapping for the new IMSIC VS-file */
+ ret = kvm_riscv_gstage_ioremap(kvm, vcpu->arch.aia_context.imsic_addr,
+ new_vsfile_pa, IMSIC_MMIO_PAGE_SZ,
+ true, true);
+ if (ret)
+ goto fail_free_vsfile_hgei;
+
+ /* TODO: Update the IOMMU mapping ??? */
+
+ /* Update new IMSIC VS-file details in IMSIC context */
+ write_lock_irqsave(&imsic->vsfile_lock, flags);
+ imsic->vsfile_hgei = new_vsfile_hgei;
+ imsic->vsfile_cpu = vcpu->cpu;
+ imsic->vsfile_va = new_vsfile_va;
+ imsic->vsfile_pa = new_vsfile_pa;
+ write_unlock_irqrestore(&imsic->vsfile_lock, flags);
+
+ /*
+ * At this point, all interrupt producers have been moved
+ * to the new IMSIC VS-file so we move register state from
+ * the old IMSIC VS/SW-file to the new IMSIC VS-file.
+ */
+
+ memset(&tmrif, 0, sizeof(tmrif));
+ if (old_vsfile_cpu >= 0) {
+ /* Read and clear register state from old IMSIC VS-file */
+ imsic_vsfile_read(old_vsfile_hgei, old_vsfile_cpu,
+ imsic->nr_hw_eix, true, &tmrif);
+
+ /* Free-up old IMSIC VS-file */
+ kvm_riscv_aia_free_hgei(old_vsfile_cpu, old_vsfile_hgei);
+ } else {
+ /* Read and clear register state from IMSIC SW-file */
+ imsic_swfile_read(vcpu, true, &tmrif);
+ }
+
+ /* Restore register state in the new IMSIC VS-file */
+ imsic_vsfile_local_update(new_vsfile_hgei, imsic->nr_hw_eix, &tmrif);
+
+done:
+ /* Set VCPU HSTATUS.VGEIN to new IMSIC VS-file */
+ vcpu->arch.guest_context.hstatus &= ~HSTATUS_VGEIN;
+ if (new_vsfile_hgei > 0)
+ vcpu->arch.guest_context.hstatus |=
+ ((unsigned long)new_vsfile_hgei) << HSTATUS_VGEIN_SHIFT;
+
+ /* Continue run-loop */
+ return 1;
+
+fail_free_vsfile_hgei:
+ kvm_riscv_aia_free_hgei(vcpu->cpu, new_vsfile_hgei);
+ return ret;
+}
+
+int kvm_riscv_vcpu_aia_imsic_rmw(struct kvm_vcpu *vcpu, unsigned long isel,
+ unsigned long *val, unsigned long new_val,
+ unsigned long wr_mask)
+{
+ u32 topei;
+ struct imsic_mrif_eix *eix;
+ int r, rc = KVM_INSN_CONTINUE_NEXT_SEPC;
+ struct imsic *imsic = vcpu->arch.aia_context.imsic_state;
+
+ if (isel == KVM_RISCV_AIA_IMSIC_TOPEI) {
+ /* Read pending and enabled interrupt with highest priority */
+ topei = imsic_mrif_topei(imsic->swfile, imsic->nr_eix,
+ imsic->nr_msis);
+ if (val)
+ *val = topei;
+
+ /* Writes ignore value and clear top pending interrupt */
+ if (topei && wr_mask) {
+ topei >>= TOPEI_ID_SHIFT;
+ if (topei) {
+ eix = &imsic->swfile->eix[topei /
+ BITS_PER_TYPE(u64)];
+ clear_bit(topei & (BITS_PER_TYPE(u64) - 1),
+ eix->eip);
+ }
+ }
+ } else {
+ r = imsic_mrif_rmw(imsic->swfile, imsic->nr_eix, isel,
+ val, new_val, wr_mask);
+ /* Forward unknown IMSIC register to user-space */
+ if (r)
+ rc = (r == -ENOENT) ? 0 : KVM_INSN_ILLEGAL_TRAP;
+ }
+
+ if (wr_mask)
+ imsic_swfile_extirq_update(vcpu);
+
+ return rc;
+}
+
+int kvm_riscv_aia_imsic_rw_attr(struct kvm *kvm, unsigned long type,
+ bool write, unsigned long *val)
+{
+ u32 isel, vcpu_id;
+ unsigned long flags;
+ struct imsic *imsic;
+ struct kvm_vcpu *vcpu;
+ int rc, vsfile_hgei, vsfile_cpu;
+
+ if (!kvm_riscv_aia_initialized(kvm))
+ return -ENODEV;
+
+ vcpu_id = KVM_DEV_RISCV_AIA_IMSIC_GET_VCPU(type);
+ vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+ if (!vcpu)
+ return -ENODEV;
+
+ isel = KVM_DEV_RISCV_AIA_IMSIC_GET_ISEL(type);
+ imsic = vcpu->arch.aia_context.imsic_state;
+
+ read_lock_irqsave(&imsic->vsfile_lock, flags);
+
+ rc = 0;
+ vsfile_hgei = imsic->vsfile_hgei;
+ vsfile_cpu = imsic->vsfile_cpu;
+ if (vsfile_cpu < 0) {
+ if (write) {
+ rc = imsic_mrif_rmw(imsic->swfile, imsic->nr_eix,
+ isel, NULL, *val, -1UL);
+ imsic_swfile_extirq_update(vcpu);
+ } else
+ rc = imsic_mrif_rmw(imsic->swfile, imsic->nr_eix,
+ isel, val, 0, 0);
+ }
+
+ read_unlock_irqrestore(&imsic->vsfile_lock, flags);
+
+ if (!rc && vsfile_cpu >= 0)
+ rc = imsic_vsfile_rw(vsfile_hgei, vsfile_cpu, imsic->nr_eix,
+ isel, write, val);
+
+ return rc;
+}
+
+int kvm_riscv_aia_imsic_has_attr(struct kvm *kvm, unsigned long type)
+{
+ u32 isel, vcpu_id;
+ struct imsic *imsic;
+ struct kvm_vcpu *vcpu;
+
+ if (!kvm_riscv_aia_initialized(kvm))
+ return -ENODEV;
+
+ vcpu_id = KVM_DEV_RISCV_AIA_IMSIC_GET_VCPU(type);
+ vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+ if (!vcpu)
+ return -ENODEV;
+
+ isel = KVM_DEV_RISCV_AIA_IMSIC_GET_ISEL(type);
+ imsic = vcpu->arch.aia_context.imsic_state;
+ return imsic_mrif_isel_check(imsic->nr_eix, isel);
+}
+
+void kvm_riscv_vcpu_aia_imsic_reset(struct kvm_vcpu *vcpu)
+{
+ struct imsic *imsic = vcpu->arch.aia_context.imsic_state;
+
+ if (!imsic)
+ return;
+
+ kvm_riscv_vcpu_aia_imsic_release(vcpu);
+
+ memset(imsic->swfile, 0, sizeof(*imsic->swfile));
+}
+
+int kvm_riscv_vcpu_aia_imsic_inject(struct kvm_vcpu *vcpu,
+ u32 guest_index, u32 offset, u32 iid)
+{
+ unsigned long flags;
+ struct imsic_mrif_eix *eix;
+ struct imsic *imsic = vcpu->arch.aia_context.imsic_state;
+
+ /* We only emulate one IMSIC MMIO page for each Guest VCPU */
+ if (!imsic || !iid || guest_index ||
+ (offset != IMSIC_MMIO_SETIPNUM_LE &&
+ offset != IMSIC_MMIO_SETIPNUM_BE))
+ return -ENODEV;
+
+ iid = (offset == IMSIC_MMIO_SETIPNUM_BE) ? __swab32(iid) : iid;
+ if (imsic->nr_msis <= iid)
+ return -EINVAL;
+
+ read_lock_irqsave(&imsic->vsfile_lock, flags);
+
+ if (imsic->vsfile_cpu >= 0) {
+ writel(iid, imsic->vsfile_va + IMSIC_MMIO_SETIPNUM_LE);
+ kvm_vcpu_kick(vcpu);
+ } else {
+ eix = &imsic->swfile->eix[iid / BITS_PER_TYPE(u64)];
+ set_bit(iid & (BITS_PER_TYPE(u64) - 1), eix->eip);
+ imsic_swfile_extirq_update(vcpu);
+ }
+
+ read_unlock_irqrestore(&imsic->vsfile_lock, flags);
+
+ return 0;
+}
+
+static int imsic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, void *val)
+{
+ if (len != 4 || (addr & 0x3) != 0)
+ return -EOPNOTSUPP;
+
+ *((u32 *)val) = 0;
+
+ return 0;
+}
+
+static int imsic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, const void *val)
+{
+ struct kvm_msi msi = { 0 };
+
+ if (len != 4 || (addr & 0x3) != 0)
+ return -EOPNOTSUPP;
+
+ msi.address_hi = addr >> 32;
+ msi.address_lo = (u32)addr;
+ msi.data = *((const u32 *)val);
+ kvm_riscv_aia_inject_msi(vcpu->kvm, &msi);
+
+ return 0;
+};
+
+static struct kvm_io_device_ops imsic_iodoev_ops = {
+ .read = imsic_mmio_read,
+ .write = imsic_mmio_write,
+};
+
+int kvm_riscv_vcpu_aia_imsic_init(struct kvm_vcpu *vcpu)
+{
+ int ret = 0;
+ struct imsic *imsic;
+ struct page *swfile_page;
+ struct kvm *kvm = vcpu->kvm;
+
+ /* Fail if we have zero IDs */
+ if (!kvm->arch.aia.nr_ids)
+ return -EINVAL;
+
+ /* Allocate IMSIC context */
+ imsic = kzalloc(sizeof(*imsic), GFP_KERNEL);
+ if (!imsic)
+ return -ENOMEM;
+ vcpu->arch.aia_context.imsic_state = imsic;
+
+ /* Setup IMSIC context */
+ imsic->nr_msis = kvm->arch.aia.nr_ids + 1;
+ rwlock_init(&imsic->vsfile_lock);
+ imsic->nr_eix = BITS_TO_U64(imsic->nr_msis);
+ imsic->nr_hw_eix = BITS_TO_U64(kvm_riscv_aia_max_ids);
+ imsic->vsfile_hgei = imsic->vsfile_cpu = -1;
+
+ /* Setup IMSIC SW-file */
+ swfile_page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(sizeof(*imsic->swfile)));
+ if (!swfile_page) {
+ ret = -ENOMEM;
+ goto fail_free_imsic;
+ }
+ imsic->swfile = page_to_virt(swfile_page);
+ imsic->swfile_pa = page_to_phys(swfile_page);
+ spin_lock_init(&imsic->swfile_extirq_lock);
+
+ /* Setup IO device */
+ kvm_iodevice_init(&imsic->iodev, &imsic_iodoev_ops);
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
+ vcpu->arch.aia_context.imsic_addr,
+ KVM_DEV_RISCV_IMSIC_SIZE,
+ &imsic->iodev);
+ mutex_unlock(&kvm->slots_lock);
+ if (ret)
+ goto fail_free_swfile;
+
+ return 0;
+
+fail_free_swfile:
+ free_pages((unsigned long)imsic->swfile,
+ get_order(sizeof(*imsic->swfile)));
+fail_free_imsic:
+ vcpu->arch.aia_context.imsic_state = NULL;
+ kfree(imsic);
+ return ret;
+}
+
+void kvm_riscv_vcpu_aia_imsic_cleanup(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct imsic *imsic = vcpu->arch.aia_context.imsic_state;
+
+ if (!imsic)
+ return;
+
+ imsic_vsfile_cleanup(imsic);
+
+ mutex_lock(&kvm->slots_lock);
+ kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &imsic->iodev);
+ mutex_unlock(&kvm->slots_lock);
+
+ free_pages((unsigned long)imsic->swfile,
+ get_order(sizeof(*imsic->swfile)));
+
+ vcpu->arch.aia_context.imsic_state = NULL;
+ kfree(imsic);
+}
diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c
new file mode 100644
index 000000000..48ae0d4b3
--- /dev/null
+++ b/arch/riscv/kvm/main.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Western Digital Corporation or its affiliates.
+ *
+ * Authors:
+ * Anup Patel <anup.patel@wdc.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/kvm_host.h>
+#include <asm/csr.h>
+#include <asm/hwcap.h>
+#include <asm/sbi.h>
+
+long kvm_arch_dev_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ return -EINVAL;
+}
+
+int kvm_arch_hardware_enable(void)
+{
+ unsigned long hideleg, hedeleg;
+
+ hedeleg = 0;
+ hedeleg |= (1UL << EXC_INST_MISALIGNED);
+ hedeleg |= (1UL << EXC_BREAKPOINT);
+ hedeleg |= (1UL << EXC_SYSCALL);
+ hedeleg |= (1UL << EXC_INST_PAGE_FAULT);
+ hedeleg |= (1UL << EXC_LOAD_PAGE_FAULT);
+ hedeleg |= (1UL << EXC_STORE_PAGE_FAULT);
+ csr_write(CSR_HEDELEG, hedeleg);
+
+ hideleg = 0;
+ hideleg |= (1UL << IRQ_VS_SOFT);
+ hideleg |= (1UL << IRQ_VS_TIMER);
+ hideleg |= (1UL << IRQ_VS_EXT);
+ csr_write(CSR_HIDELEG, hideleg);
+
+ /* VS should access only the time counter directly. Everything else should trap */
+ csr_write(CSR_HCOUNTEREN, 0x02);
+
+ csr_write(CSR_HVIP, 0);
+
+ kvm_riscv_aia_enable();
+
+ return 0;
+}
+
+void kvm_arch_hardware_disable(void)
+{
+ kvm_riscv_aia_disable();
+
+ /*
+ * After clearing the hideleg CSR, the host kernel will receive
+ * spurious interrupts if hvip CSR has pending interrupts and the
+ * corresponding enable bits in vsie CSR are asserted. To avoid it,
+ * hvip CSR and vsie CSR must be cleared before clearing hideleg CSR.
+ */
+ csr_write(CSR_VSIE, 0);
+ csr_write(CSR_HVIP, 0);
+ csr_write(CSR_HEDELEG, 0);
+ csr_write(CSR_HIDELEG, 0);
+}
+
+static int __init riscv_kvm_init(void)
+{
+ int rc;
+ const char *str;
+
+ if (!riscv_isa_extension_available(NULL, h)) {
+ kvm_info("hypervisor extension not available\n");
+ return -ENODEV;
+ }
+
+ if (sbi_spec_is_0_1()) {
+ kvm_info("require SBI v0.2 or higher\n");
+ return -ENODEV;
+ }
+
+ if (!sbi_probe_extension(SBI_EXT_RFENCE)) {
+ kvm_info("require SBI RFENCE extension\n");
+ return -ENODEV;
+ }
+
+ kvm_riscv_gstage_mode_detect();
+
+ kvm_riscv_gstage_vmid_detect();
+
+ rc = kvm_riscv_aia_init();
+ if (rc && rc != -ENODEV)
+ return rc;
+
+ kvm_info("hypervisor extension available\n");
+
+ switch (kvm_riscv_gstage_mode()) {
+ case HGATP_MODE_SV32X4:
+ str = "Sv32x4";
+ break;
+ case HGATP_MODE_SV39X4:
+ str = "Sv39x4";
+ break;
+ case HGATP_MODE_SV48X4:
+ str = "Sv48x4";
+ break;
+ case HGATP_MODE_SV57X4:
+ str = "Sv57x4";
+ break;
+ default:
+ return -ENODEV;
+ }
+ kvm_info("using %s G-stage page table format\n", str);
+
+ kvm_info("VMID %ld bits available\n", kvm_riscv_gstage_vmid_bits());
+
+ if (kvm_riscv_aia_available())
+ kvm_info("AIA available with %d guest external interrupts\n",
+ kvm_riscv_aia_nr_hgei);
+
+ rc = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+ if (rc) {
+ kvm_riscv_aia_exit();
+ return rc;
+ }
+
+ return 0;
+}
+module_init(riscv_kvm_init);
+
+static void __exit riscv_kvm_exit(void)
+{
+ kvm_riscv_aia_exit();
+
+ kvm_exit();
+}
+module_exit(riscv_kvm_exit);
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
new file mode 100644
index 000000000..068c74593
--- /dev/null
+++ b/arch/riscv/kvm/mmu.c
@@ -0,0 +1,793 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Western Digital Corporation or its affiliates.
+ *
+ * Authors:
+ * Anup Patel <anup.patel@wdc.com>
+ */
+
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/hugetlb.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/kvm_host.h>
+#include <linux/sched/signal.h>
+#include <asm/csr.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+#ifdef CONFIG_64BIT
+static unsigned long gstage_mode __ro_after_init = (HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT);
+static unsigned long gstage_pgd_levels __ro_after_init = 3;
+#define gstage_index_bits 9
+#else
+static unsigned long gstage_mode __ro_after_init = (HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT);
+static unsigned long gstage_pgd_levels __ro_after_init = 2;
+#define gstage_index_bits 10
+#endif
+
+#define gstage_pgd_xbits 2
+#define gstage_pgd_size (1UL << (HGATP_PAGE_SHIFT + gstage_pgd_xbits))
+#define gstage_gpa_bits (HGATP_PAGE_SHIFT + \
+ (gstage_pgd_levels * gstage_index_bits) + \
+ gstage_pgd_xbits)
+#define gstage_gpa_size ((gpa_t)(1ULL << gstage_gpa_bits))
+
+#define gstage_pte_leaf(__ptep) \
+ (pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC))
+
+static inline unsigned long gstage_pte_index(gpa_t addr, u32 level)
+{
+ unsigned long mask;
+ unsigned long shift = HGATP_PAGE_SHIFT + (gstage_index_bits * level);
+
+ if (level == (gstage_pgd_levels - 1))
+ mask = (PTRS_PER_PTE * (1UL << gstage_pgd_xbits)) - 1;
+ else
+ mask = PTRS_PER_PTE - 1;
+
+ return (addr >> shift) & mask;
+}
+
+static inline unsigned long gstage_pte_page_vaddr(pte_t pte)
+{
+ return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte)));
+}
+
+static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
+{
+ u32 i;
+ unsigned long psz = 1UL << 12;
+
+ for (i = 0; i < gstage_pgd_levels; i++) {
+ if (page_size == (psz << (i * gstage_index_bits))) {
+ *out_level = i;
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
+static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder)
+{
+ if (gstage_pgd_levels < level)
+ return -EINVAL;
+
+ *out_pgorder = 12 + (level * gstage_index_bits);
+ return 0;
+}
+
+static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize)
+{
+ int rc;
+ unsigned long page_order = PAGE_SHIFT;
+
+ rc = gstage_level_to_page_order(level, &page_order);
+ if (rc)
+ return rc;
+
+ *out_pgsize = BIT(page_order);
+ return 0;
+}
+
+static bool gstage_get_leaf_entry(struct kvm *kvm, gpa_t addr,
+ pte_t **ptepp, u32 *ptep_level)
+{
+ pte_t *ptep;
+ u32 current_level = gstage_pgd_levels - 1;
+
+ *ptep_level = current_level;
+ ptep = (pte_t *)kvm->arch.pgd;
+ ptep = &ptep[gstage_pte_index(addr, current_level)];
+ while (ptep && pte_val(*ptep)) {
+ if (gstage_pte_leaf(ptep)) {
+ *ptep_level = current_level;
+ *ptepp = ptep;
+ return true;
+ }
+
+ if (current_level) {
+ current_level--;
+ *ptep_level = current_level;
+ ptep = (pte_t *)gstage_pte_page_vaddr(*ptep);
+ ptep = &ptep[gstage_pte_index(addr, current_level)];
+ } else {
+ ptep = NULL;
+ }
+ }
+
+ return false;
+}
+
+static void gstage_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr)
+{
+ unsigned long order = PAGE_SHIFT;
+
+ if (gstage_level_to_page_order(level, &order))
+ return;
+ addr &= ~(BIT(order) - 1);
+
+ kvm_riscv_hfence_gvma_vmid_gpa(kvm, -1UL, 0, addr, BIT(order), order);
+}
+
+static int gstage_set_pte(struct kvm *kvm, u32 level,
+ struct kvm_mmu_memory_cache *pcache,
+ gpa_t addr, const pte_t *new_pte)
+{
+ u32 current_level = gstage_pgd_levels - 1;
+ pte_t *next_ptep = (pte_t *)kvm->arch.pgd;
+ pte_t *ptep = &next_ptep[gstage_pte_index(addr, current_level)];
+
+ if (current_level < level)
+ return -EINVAL;
+
+ while (current_level != level) {
+ if (gstage_pte_leaf(ptep))
+ return -EEXIST;
+
+ if (!pte_val(*ptep)) {
+ if (!pcache)
+ return -ENOMEM;
+ next_ptep = kvm_mmu_memory_cache_alloc(pcache);
+ if (!next_ptep)
+ return -ENOMEM;
+ *ptep = pfn_pte(PFN_DOWN(__pa(next_ptep)),
+ __pgprot(_PAGE_TABLE));
+ } else {
+ if (gstage_pte_leaf(ptep))
+ return -EEXIST;
+ next_ptep = (pte_t *)gstage_pte_page_vaddr(*ptep);
+ }
+
+ current_level--;
+ ptep = &next_ptep[gstage_pte_index(addr, current_level)];
+ }
+
+ *ptep = *new_pte;
+ if (gstage_pte_leaf(ptep))
+ gstage_remote_tlb_flush(kvm, current_level, addr);
+
+ return 0;
+}
+
+static int gstage_map_page(struct kvm *kvm,
+ struct kvm_mmu_memory_cache *pcache,
+ gpa_t gpa, phys_addr_t hpa,
+ unsigned long page_size,
+ bool page_rdonly, bool page_exec)
+{
+ int ret;
+ u32 level = 0;
+ pte_t new_pte;
+ pgprot_t prot;
+
+ ret = gstage_page_size_to_level(page_size, &level);
+ if (ret)
+ return ret;
+
+ /*
+ * A RISC-V implementation can choose to either:
+ * 1) Update 'A' and 'D' PTE bits in hardware
+ * 2) Generate page fault when 'A' and/or 'D' bits are not set
+ * PTE so that software can update these bits.
+ *
+ * We support both options mentioned above. To achieve this, we
+ * always set 'A' and 'D' PTE bits at time of creating G-stage
+ * mapping. To support KVM dirty page logging with both options
+ * mentioned above, we will write-protect G-stage PTEs to track
+ * dirty pages.
+ */
+
+ if (page_exec) {
+ if (page_rdonly)
+ prot = PAGE_READ_EXEC;
+ else
+ prot = PAGE_WRITE_EXEC;
+ } else {
+ if (page_rdonly)
+ prot = PAGE_READ;
+ else
+ prot = PAGE_WRITE;
+ }
+ new_pte = pfn_pte(PFN_DOWN(hpa), prot);
+ new_pte = pte_mkdirty(new_pte);
+
+ return gstage_set_pte(kvm, level, pcache, gpa, &new_pte);
+}
+
+enum gstage_op {
+ GSTAGE_OP_NOP = 0, /* Nothing */
+ GSTAGE_OP_CLEAR, /* Clear/Unmap */
+ GSTAGE_OP_WP, /* Write-protect */
+};
+
+static void gstage_op_pte(struct kvm *kvm, gpa_t addr,
+ pte_t *ptep, u32 ptep_level, enum gstage_op op)
+{
+ int i, ret;
+ pte_t *next_ptep;
+ u32 next_ptep_level;
+ unsigned long next_page_size, page_size;
+
+ ret = gstage_level_to_page_size(ptep_level, &page_size);
+ if (ret)
+ return;
+
+ BUG_ON(addr & (page_size - 1));
+
+ if (!pte_val(*ptep))
+ return;
+
+ if (ptep_level && !gstage_pte_leaf(ptep)) {
+ next_ptep = (pte_t *)gstage_pte_page_vaddr(*ptep);
+ next_ptep_level = ptep_level - 1;
+ ret = gstage_level_to_page_size(next_ptep_level,
+ &next_page_size);
+ if (ret)
+ return;
+
+ if (op == GSTAGE_OP_CLEAR)
+ set_pte(ptep, __pte(0));
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ gstage_op_pte(kvm, addr + i * next_page_size,
+ &next_ptep[i], next_ptep_level, op);
+ if (op == GSTAGE_OP_CLEAR)
+ put_page(virt_to_page(next_ptep));
+ } else {
+ if (op == GSTAGE_OP_CLEAR)
+ set_pte(ptep, __pte(0));
+ else if (op == GSTAGE_OP_WP)
+ set_pte(ptep, __pte(pte_val(*ptep) & ~_PAGE_WRITE));
+ gstage_remote_tlb_flush(kvm, ptep_level, addr);
+ }
+}
+
+static void gstage_unmap_range(struct kvm *kvm, gpa_t start,
+ gpa_t size, bool may_block)
+{
+ int ret;
+ pte_t *ptep;
+ u32 ptep_level;
+ bool found_leaf;
+ unsigned long page_size;
+ gpa_t addr = start, end = start + size;
+
+ while (addr < end) {
+ found_leaf = gstage_get_leaf_entry(kvm, addr,
+ &ptep, &ptep_level);
+ ret = gstage_level_to_page_size(ptep_level, &page_size);
+ if (ret)
+ break;
+
+ if (!found_leaf)
+ goto next;
+
+ if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
+ gstage_op_pte(kvm, addr, ptep,
+ ptep_level, GSTAGE_OP_CLEAR);
+
+next:
+ addr += page_size;
+
+ /*
+ * If the range is too large, release the kvm->mmu_lock
+ * to prevent starvation and lockup detector warnings.
+ */
+ if (may_block && addr < end)
+ cond_resched_lock(&kvm->mmu_lock);
+ }
+}
+
+static void gstage_wp_range(struct kvm *kvm, gpa_t start, gpa_t end)
+{
+ int ret;
+ pte_t *ptep;
+ u32 ptep_level;
+ bool found_leaf;
+ gpa_t addr = start;
+ unsigned long page_size;
+
+ while (addr < end) {
+ found_leaf = gstage_get_leaf_entry(kvm, addr,
+ &ptep, &ptep_level);
+ ret = gstage_level_to_page_size(ptep_level, &page_size);
+ if (ret)
+ break;
+
+ if (!found_leaf)
+ goto next;
+
+ if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
+ gstage_op_pte(kvm, addr, ptep,
+ ptep_level, GSTAGE_OP_WP);
+
+next:
+ addr += page_size;
+ }
+}
+
+static void gstage_wp_memory_region(struct kvm *kvm, int slot)
+{
+ struct kvm_memslots *slots = kvm_memslots(kvm);
+ struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
+ phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
+ phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
+
+ spin_lock(&kvm->mmu_lock);
+ gstage_wp_range(kvm, start, end);
+ spin_unlock(&kvm->mmu_lock);
+ kvm_flush_remote_tlbs(kvm);
+}
+
+int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa,
+ phys_addr_t hpa, unsigned long size,
+ bool writable, bool in_atomic)
+{
+ pte_t pte;
+ int ret = 0;
+ unsigned long pfn;
+ phys_addr_t addr, end;
+ struct kvm_mmu_memory_cache pcache = {
+ .gfp_custom = (in_atomic) ? GFP_ATOMIC | __GFP_ACCOUNT : 0,
+ .gfp_zero = __GFP_ZERO,
+ };
+
+ end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK;
+ pfn = __phys_to_pfn(hpa);
+
+ for (addr = gpa; addr < end; addr += PAGE_SIZE) {
+ pte = pfn_pte(pfn, PAGE_KERNEL_IO);
+
+ if (!writable)
+ pte = pte_wrprotect(pte);
+
+ ret = kvm_mmu_topup_memory_cache(&pcache, gstage_pgd_levels);
+ if (ret)
+ goto out;
+
+ spin_lock(&kvm->mmu_lock);
+ ret = gstage_set_pte(kvm, 0, &pcache, addr, &pte);
+ spin_unlock(&kvm->mmu_lock);
+ if (ret)
+ goto out;
+
+ pfn++;
+ }
+
+out:
+ kvm_mmu_free_memory_cache(&pcache);
+ return ret;
+}
+
+void kvm_riscv_gstage_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size)
+{
+ spin_lock(&kvm->mmu_lock);
+ gstage_unmap_range(kvm, gpa, size, false);
+ spin_unlock(&kvm->mmu_lock);
+}
+
+void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset,
+ unsigned long mask)
+{
+ phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
+ phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
+ phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
+
+ gstage_wp_range(kvm, start, end);
+}
+
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
+{
+}
+
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free)
+{
+}
+
+void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
+{
+}
+
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+ kvm_riscv_gstage_free_pgd(kvm);
+}
+
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
+ phys_addr_t size = slot->npages << PAGE_SHIFT;
+
+ spin_lock(&kvm->mmu_lock);
+ gstage_unmap_range(kvm, gpa, size, false);
+ spin_unlock(&kvm->mmu_lock);
+}
+
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *old,
+ const struct kvm_memory_slot *new,
+ enum kvm_mr_change change)
+{
+ /*
+ * At this point memslot has been committed and there is an
+ * allocated dirty_bitmap[], dirty pages will be tracked while
+ * the memory slot is write protected.
+ */
+ if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES)
+ gstage_wp_memory_region(kvm, new->id);
+}
+
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+ const struct kvm_memory_slot *old,
+ struct kvm_memory_slot *new,
+ enum kvm_mr_change change)
+{
+ hva_t hva, reg_end, size;
+ gpa_t base_gpa;
+ bool writable;
+ int ret = 0;
+
+ if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
+ change != KVM_MR_FLAGS_ONLY)
+ return 0;
+
+ /*
+ * Prevent userspace from creating a memory region outside of the GPA
+ * space addressable by the KVM guest GPA space.
+ */
+ if ((new->base_gfn + new->npages) >=
+ (gstage_gpa_size >> PAGE_SHIFT))
+ return -EFAULT;
+
+ hva = new->userspace_addr;
+ size = new->npages << PAGE_SHIFT;
+ reg_end = hva + size;
+ base_gpa = new->base_gfn << PAGE_SHIFT;
+ writable = !(new->flags & KVM_MEM_READONLY);
+
+ mmap_read_lock(current->mm);
+
+ /*
+ * A memory region could potentially cover multiple VMAs, and
+ * any holes between them, so iterate over all of them to find
+ * out if we can map any of them right now.
+ *
+ * +--------------------------------------------+
+ * +---------------+----------------+ +----------------+
+ * | : VMA 1 | VMA 2 | | VMA 3 : |
+ * +---------------+----------------+ +----------------+
+ * | memory region |
+ * +--------------------------------------------+
+ */
+ do {
+ struct vm_area_struct *vma = find_vma(current->mm, hva);
+ hva_t vm_start, vm_end;
+
+ if (!vma || vma->vm_start >= reg_end)
+ break;
+
+ /*
+ * Mapping a read-only VMA is only allowed if the
+ * memory region is configured as read-only.
+ */
+ if (writable && !(vma->vm_flags & VM_WRITE)) {
+ ret = -EPERM;
+ break;
+ }
+
+ /* Take the intersection of this VMA with the memory region */
+ vm_start = max(hva, vma->vm_start);
+ vm_end = min(reg_end, vma->vm_end);
+
+ if (vma->vm_flags & VM_PFNMAP) {
+ gpa_t gpa = base_gpa + (vm_start - hva);
+ phys_addr_t pa;
+
+ pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
+ pa += vm_start - vma->vm_start;
+
+ /* IO region dirty page logging not allowed */
+ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = kvm_riscv_gstage_ioremap(kvm, gpa, pa,
+ vm_end - vm_start,
+ writable, false);
+ if (ret)
+ break;
+ }
+ hva = vm_end;
+ } while (hva < reg_end);
+
+ if (change == KVM_MR_FLAGS_ONLY)
+ goto out;
+
+ if (ret)
+ kvm_riscv_gstage_iounmap(kvm, base_gpa, size);
+
+out:
+ mmap_read_unlock(current->mm);
+ return ret;
+}
+
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ if (!kvm->arch.pgd)
+ return false;
+
+ gstage_unmap_range(kvm, range->start << PAGE_SHIFT,
+ (range->end - range->start) << PAGE_SHIFT,
+ range->may_block);
+ return false;
+}
+
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ int ret;
+ kvm_pfn_t pfn = pte_pfn(range->arg.pte);
+
+ if (!kvm->arch.pgd)
+ return false;
+
+ WARN_ON(range->end - range->start != 1);
+
+ ret = gstage_map_page(kvm, NULL, range->start << PAGE_SHIFT,
+ __pfn_to_phys(pfn), PAGE_SIZE, true, true);
+ if (ret) {
+ kvm_debug("Failed to map G-stage page (error %d)\n", ret);
+ return true;
+ }
+
+ return false;
+}
+
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ pte_t *ptep;
+ u32 ptep_level = 0;
+ u64 size = (range->end - range->start) << PAGE_SHIFT;
+
+ if (!kvm->arch.pgd)
+ return false;
+
+ WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
+
+ if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT,
+ &ptep, &ptep_level))
+ return false;
+
+ return ptep_test_and_clear_young(NULL, 0, ptep);
+}
+
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ pte_t *ptep;
+ u32 ptep_level = 0;
+ u64 size = (range->end - range->start) << PAGE_SHIFT;
+
+ if (!kvm->arch.pgd)
+ return false;
+
+ WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
+
+ if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT,
+ &ptep, &ptep_level))
+ return false;
+
+ return pte_young(*ptep);
+}
+
+int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
+ struct kvm_memory_slot *memslot,
+ gpa_t gpa, unsigned long hva, bool is_write)
+{
+ int ret;
+ kvm_pfn_t hfn;
+ bool writable;
+ short vma_pageshift;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ struct vm_area_struct *vma;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_mmu_memory_cache *pcache = &vcpu->arch.mmu_page_cache;
+ bool logging = (memslot->dirty_bitmap &&
+ !(memslot->flags & KVM_MEM_READONLY)) ? true : false;
+ unsigned long vma_pagesize, mmu_seq;
+
+ /* We need minimum second+third level pages */
+ ret = kvm_mmu_topup_memory_cache(pcache, gstage_pgd_levels);
+ if (ret) {
+ kvm_err("Failed to topup G-stage cache\n");
+ return ret;
+ }
+
+ mmap_read_lock(current->mm);
+
+ vma = vma_lookup(current->mm, hva);
+ if (unlikely(!vma)) {
+ kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
+ mmap_read_unlock(current->mm);
+ return -EFAULT;
+ }
+
+ if (is_vm_hugetlb_page(vma))
+ vma_pageshift = huge_page_shift(hstate_vma(vma));
+ else
+ vma_pageshift = PAGE_SHIFT;
+ vma_pagesize = 1ULL << vma_pageshift;
+ if (logging || (vma->vm_flags & VM_PFNMAP))
+ vma_pagesize = PAGE_SIZE;
+
+ if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
+ gfn = (gpa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
+
+ /*
+ * Read mmu_invalidate_seq so that KVM can detect if the results of
+ * vma_lookup() or gfn_to_pfn_prot() become stale priort to acquiring
+ * kvm->mmu_lock.
+ *
+ * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
+ * with the smp_wmb() in kvm_mmu_invalidate_end().
+ */
+ mmu_seq = kvm->mmu_invalidate_seq;
+ mmap_read_unlock(current->mm);
+
+ if (vma_pagesize != PUD_SIZE &&
+ vma_pagesize != PMD_SIZE &&
+ vma_pagesize != PAGE_SIZE) {
+ kvm_err("Invalid VMA page size 0x%lx\n", vma_pagesize);
+ return -EFAULT;
+ }
+
+ hfn = gfn_to_pfn_prot(kvm, gfn, is_write, &writable);
+ if (hfn == KVM_PFN_ERR_HWPOISON) {
+ send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva,
+ vma_pageshift, current);
+ return 0;
+ }
+ if (is_error_noslot_pfn(hfn))
+ return -EFAULT;
+
+ /*
+ * If logging is active then we allow writable pages only
+ * for write faults.
+ */
+ if (logging && !is_write)
+ writable = false;
+
+ spin_lock(&kvm->mmu_lock);
+
+ if (mmu_invalidate_retry(kvm, mmu_seq))
+ goto out_unlock;
+
+ if (writable) {
+ kvm_set_pfn_dirty(hfn);
+ mark_page_dirty(kvm, gfn);
+ ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT,
+ vma_pagesize, false, true);
+ } else {
+ ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT,
+ vma_pagesize, true, true);
+ }
+
+ if (ret)
+ kvm_err("Failed to map in G-stage\n");
+
+out_unlock:
+ spin_unlock(&kvm->mmu_lock);
+ kvm_set_pfn_accessed(hfn);
+ kvm_release_pfn_clean(hfn);
+ return ret;
+}
+
+int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm)
+{
+ struct page *pgd_page;
+
+ if (kvm->arch.pgd != NULL) {
+ kvm_err("kvm_arch already initialized?\n");
+ return -EINVAL;
+ }
+
+ pgd_page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(gstage_pgd_size));
+ if (!pgd_page)
+ return -ENOMEM;
+ kvm->arch.pgd = page_to_virt(pgd_page);
+ kvm->arch.pgd_phys = page_to_phys(pgd_page);
+
+ return 0;
+}
+
+void kvm_riscv_gstage_free_pgd(struct kvm *kvm)
+{
+ void *pgd = NULL;
+
+ spin_lock(&kvm->mmu_lock);
+ if (kvm->arch.pgd) {
+ gstage_unmap_range(kvm, 0UL, gstage_gpa_size, false);
+ pgd = READ_ONCE(kvm->arch.pgd);
+ kvm->arch.pgd = NULL;
+ kvm->arch.pgd_phys = 0;
+ }
+ spin_unlock(&kvm->mmu_lock);
+
+ if (pgd)
+ free_pages((unsigned long)pgd, get_order(gstage_pgd_size));
+}
+
+void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu)
+{
+ unsigned long hgatp = gstage_mode;
+ struct kvm_arch *k = &vcpu->kvm->arch;
+
+ hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) & HGATP_VMID;
+ hgatp |= (k->pgd_phys >> PAGE_SHIFT) & HGATP_PPN;
+
+ csr_write(CSR_HGATP, hgatp);
+
+ if (!kvm_riscv_gstage_vmid_bits())
+ kvm_riscv_local_hfence_gvma_all();
+}
+
+void __init kvm_riscv_gstage_mode_detect(void)
+{
+#ifdef CONFIG_64BIT
+ /* Try Sv57x4 G-stage mode */
+ csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
+ if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) {
+ gstage_mode = (HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
+ gstage_pgd_levels = 5;
+ goto skip_sv48x4_test;
+ }
+
+ /* Try Sv48x4 G-stage mode */
+ csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
+ if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) {
+ gstage_mode = (HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
+ gstage_pgd_levels = 4;
+ }
+skip_sv48x4_test:
+
+ csr_write(CSR_HGATP, 0);
+ kvm_riscv_local_hfence_gvma_all();
+#endif
+}
+
+unsigned long __init kvm_riscv_gstage_mode(void)
+{
+ return gstage_mode >> HGATP_MODE_SHIFT;
+}
+
+int kvm_riscv_gstage_gpa_bits(void)
+{
+ return gstage_gpa_bits;
+}
diff --git a/arch/riscv/kvm/tlb.c b/arch/riscv/kvm/tlb.c
new file mode 100644
index 000000000..44bc324ae
--- /dev/null
+++ b/arch/riscv/kvm/tlb.c
@@ -0,0 +1,405 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022 Ventana Micro Systems Inc.
+ */
+
+#include <linux/bitmap.h>
+#include <linux/cpumask.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/smp.h>
+#include <linux/kvm_host.h>
+#include <asm/cacheflush.h>
+#include <asm/csr.h>
+#include <asm/hwcap.h>
+#include <asm/insn-def.h>
+
+#define has_svinval() riscv_has_extension_unlikely(RISCV_ISA_EXT_SVINVAL)
+
+void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid,
+ gpa_t gpa, gpa_t gpsz,
+ unsigned long order)
+{
+ gpa_t pos;
+
+ if (PTRS_PER_PTE < (gpsz >> order)) {
+ kvm_riscv_local_hfence_gvma_vmid_all(vmid);
+ return;
+ }
+
+ if (has_svinval()) {
+ asm volatile (SFENCE_W_INVAL() ::: "memory");
+ for (pos = gpa; pos < (gpa + gpsz); pos += BIT(order))
+ asm volatile (HINVAL_GVMA(%0, %1)
+ : : "r" (pos >> 2), "r" (vmid) : "memory");
+ asm volatile (SFENCE_INVAL_IR() ::: "memory");
+ } else {
+ for (pos = gpa; pos < (gpa + gpsz); pos += BIT(order))
+ asm volatile (HFENCE_GVMA(%0, %1)
+ : : "r" (pos >> 2), "r" (vmid) : "memory");
+ }
+}
+
+void kvm_riscv_local_hfence_gvma_vmid_all(unsigned long vmid)
+{
+ asm volatile(HFENCE_GVMA(zero, %0) : : "r" (vmid) : "memory");
+}
+
+void kvm_riscv_local_hfence_gvma_gpa(gpa_t gpa, gpa_t gpsz,
+ unsigned long order)
+{
+ gpa_t pos;
+
+ if (PTRS_PER_PTE < (gpsz >> order)) {
+ kvm_riscv_local_hfence_gvma_all();
+ return;
+ }
+
+ if (has_svinval()) {
+ asm volatile (SFENCE_W_INVAL() ::: "memory");
+ for (pos = gpa; pos < (gpa + gpsz); pos += BIT(order))
+ asm volatile(HINVAL_GVMA(%0, zero)
+ : : "r" (pos >> 2) : "memory");
+ asm volatile (SFENCE_INVAL_IR() ::: "memory");
+ } else {
+ for (pos = gpa; pos < (gpa + gpsz); pos += BIT(order))
+ asm volatile(HFENCE_GVMA(%0, zero)
+ : : "r" (pos >> 2) : "memory");
+ }
+}
+
+void kvm_riscv_local_hfence_gvma_all(void)
+{
+ asm volatile(HFENCE_GVMA(zero, zero) : : : "memory");
+}
+
+void kvm_riscv_local_hfence_vvma_asid_gva(unsigned long vmid,
+ unsigned long asid,
+ unsigned long gva,
+ unsigned long gvsz,
+ unsigned long order)
+{
+ unsigned long pos, hgatp;
+
+ if (PTRS_PER_PTE < (gvsz >> order)) {
+ kvm_riscv_local_hfence_vvma_asid_all(vmid, asid);
+ return;
+ }
+
+ hgatp = csr_swap(CSR_HGATP, vmid << HGATP_VMID_SHIFT);
+
+ if (has_svinval()) {
+ asm volatile (SFENCE_W_INVAL() ::: "memory");
+ for (pos = gva; pos < (gva + gvsz); pos += BIT(order))
+ asm volatile(HINVAL_VVMA(%0, %1)
+ : : "r" (pos), "r" (asid) : "memory");
+ asm volatile (SFENCE_INVAL_IR() ::: "memory");
+ } else {
+ for (pos = gva; pos < (gva + gvsz); pos += BIT(order))
+ asm volatile(HFENCE_VVMA(%0, %1)
+ : : "r" (pos), "r" (asid) : "memory");
+ }
+
+ csr_write(CSR_HGATP, hgatp);
+}
+
+void kvm_riscv_local_hfence_vvma_asid_all(unsigned long vmid,
+ unsigned long asid)
+{
+ unsigned long hgatp;
+
+ hgatp = csr_swap(CSR_HGATP, vmid << HGATP_VMID_SHIFT);
+
+ asm volatile(HFENCE_VVMA(zero, %0) : : "r" (asid) : "memory");
+
+ csr_write(CSR_HGATP, hgatp);
+}
+
+void kvm_riscv_local_hfence_vvma_gva(unsigned long vmid,
+ unsigned long gva, unsigned long gvsz,
+ unsigned long order)
+{
+ unsigned long pos, hgatp;
+
+ if (PTRS_PER_PTE < (gvsz >> order)) {
+ kvm_riscv_local_hfence_vvma_all(vmid);
+ return;
+ }
+
+ hgatp = csr_swap(CSR_HGATP, vmid << HGATP_VMID_SHIFT);
+
+ if (has_svinval()) {
+ asm volatile (SFENCE_W_INVAL() ::: "memory");
+ for (pos = gva; pos < (gva + gvsz); pos += BIT(order))
+ asm volatile(HINVAL_VVMA(%0, zero)
+ : : "r" (pos) : "memory");
+ asm volatile (SFENCE_INVAL_IR() ::: "memory");
+ } else {
+ for (pos = gva; pos < (gva + gvsz); pos += BIT(order))
+ asm volatile(HFENCE_VVMA(%0, zero)
+ : : "r" (pos) : "memory");
+ }
+
+ csr_write(CSR_HGATP, hgatp);
+}
+
+void kvm_riscv_local_hfence_vvma_all(unsigned long vmid)
+{
+ unsigned long hgatp;
+
+ hgatp = csr_swap(CSR_HGATP, vmid << HGATP_VMID_SHIFT);
+
+ asm volatile(HFENCE_VVMA(zero, zero) : : : "memory");
+
+ csr_write(CSR_HGATP, hgatp);
+}
+
+void kvm_riscv_local_tlb_sanitize(struct kvm_vcpu *vcpu)
+{
+ unsigned long vmid;
+
+ if (!kvm_riscv_gstage_vmid_bits() ||
+ vcpu->arch.last_exit_cpu == vcpu->cpu)
+ return;
+
+ /*
+ * On RISC-V platforms with hardware VMID support, we share same
+ * VMID for all VCPUs of a particular Guest/VM. This means we might
+ * have stale G-stage TLB entries on the current Host CPU due to
+ * some other VCPU of the same Guest which ran previously on the
+ * current Host CPU.
+ *
+ * To cleanup stale TLB entries, we simply flush all G-stage TLB
+ * entries by VMID whenever underlying Host CPU changes for a VCPU.
+ */
+
+ vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid);
+ kvm_riscv_local_hfence_gvma_vmid_all(vmid);
+}
+
+void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu)
+{
+ kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_FENCE_I_RCVD);
+ local_flush_icache_all();
+}
+
+void kvm_riscv_hfence_gvma_vmid_all_process(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vmid *vmid;
+
+ vmid = &vcpu->kvm->arch.vmid;
+ kvm_riscv_local_hfence_gvma_vmid_all(READ_ONCE(vmid->vmid));
+}
+
+void kvm_riscv_hfence_vvma_all_process(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vmid *vmid;
+
+ vmid = &vcpu->kvm->arch.vmid;
+ kvm_riscv_local_hfence_vvma_all(READ_ONCE(vmid->vmid));
+}
+
+static bool vcpu_hfence_dequeue(struct kvm_vcpu *vcpu,
+ struct kvm_riscv_hfence *out_data)
+{
+ bool ret = false;
+ struct kvm_vcpu_arch *varch = &vcpu->arch;
+
+ spin_lock(&varch->hfence_lock);
+
+ if (varch->hfence_queue[varch->hfence_head].type) {
+ memcpy(out_data, &varch->hfence_queue[varch->hfence_head],
+ sizeof(*out_data));
+ varch->hfence_queue[varch->hfence_head].type = 0;
+
+ varch->hfence_head++;
+ if (varch->hfence_head == KVM_RISCV_VCPU_MAX_HFENCE)
+ varch->hfence_head = 0;
+
+ ret = true;
+ }
+
+ spin_unlock(&varch->hfence_lock);
+
+ return ret;
+}
+
+static bool vcpu_hfence_enqueue(struct kvm_vcpu *vcpu,
+ const struct kvm_riscv_hfence *data)
+{
+ bool ret = false;
+ struct kvm_vcpu_arch *varch = &vcpu->arch;
+
+ spin_lock(&varch->hfence_lock);
+
+ if (!varch->hfence_queue[varch->hfence_tail].type) {
+ memcpy(&varch->hfence_queue[varch->hfence_tail],
+ data, sizeof(*data));
+
+ varch->hfence_tail++;
+ if (varch->hfence_tail == KVM_RISCV_VCPU_MAX_HFENCE)
+ varch->hfence_tail = 0;
+
+ ret = true;
+ }
+
+ spin_unlock(&varch->hfence_lock);
+
+ return ret;
+}
+
+void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu)
+{
+ struct kvm_riscv_hfence d = { 0 };
+ struct kvm_vmid *v = &vcpu->kvm->arch.vmid;
+
+ while (vcpu_hfence_dequeue(vcpu, &d)) {
+ switch (d.type) {
+ case KVM_RISCV_HFENCE_UNKNOWN:
+ break;
+ case KVM_RISCV_HFENCE_GVMA_VMID_GPA:
+ kvm_riscv_local_hfence_gvma_vmid_gpa(
+ READ_ONCE(v->vmid),
+ d.addr, d.size, d.order);
+ break;
+ case KVM_RISCV_HFENCE_VVMA_ASID_GVA:
+ kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_RCVD);
+ kvm_riscv_local_hfence_vvma_asid_gva(
+ READ_ONCE(v->vmid), d.asid,
+ d.addr, d.size, d.order);
+ break;
+ case KVM_RISCV_HFENCE_VVMA_ASID_ALL:
+ kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_RCVD);
+ kvm_riscv_local_hfence_vvma_asid_all(
+ READ_ONCE(v->vmid), d.asid);
+ break;
+ case KVM_RISCV_HFENCE_VVMA_GVA:
+ kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_RCVD);
+ kvm_riscv_local_hfence_vvma_gva(
+ READ_ONCE(v->vmid),
+ d.addr, d.size, d.order);
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+static void make_xfence_request(struct kvm *kvm,
+ unsigned long hbase, unsigned long hmask,
+ unsigned int req, unsigned int fallback_req,
+ const struct kvm_riscv_hfence *data)
+{
+ unsigned long i;
+ struct kvm_vcpu *vcpu;
+ unsigned int actual_req = req;
+ DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
+
+ bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (hbase != -1UL) {
+ if (vcpu->vcpu_id < hbase)
+ continue;
+ if (!(hmask & (1UL << (vcpu->vcpu_id - hbase))))
+ continue;
+ }
+
+ bitmap_set(vcpu_mask, i, 1);
+
+ if (!data || !data->type)
+ continue;
+
+ /*
+ * Enqueue hfence data to VCPU hfence queue. If we don't
+ * have space in the VCPU hfence queue then fallback to
+ * a more conservative hfence request.
+ */
+ if (!vcpu_hfence_enqueue(vcpu, data))
+ actual_req = fallback_req;
+ }
+
+ kvm_make_vcpus_request_mask(kvm, actual_req, vcpu_mask);
+}
+
+void kvm_riscv_fence_i(struct kvm *kvm,
+ unsigned long hbase, unsigned long hmask)
+{
+ make_xfence_request(kvm, hbase, hmask, KVM_REQ_FENCE_I,
+ KVM_REQ_FENCE_I, NULL);
+}
+
+void kvm_riscv_hfence_gvma_vmid_gpa(struct kvm *kvm,
+ unsigned long hbase, unsigned long hmask,
+ gpa_t gpa, gpa_t gpsz,
+ unsigned long order)
+{
+ struct kvm_riscv_hfence data;
+
+ data.type = KVM_RISCV_HFENCE_GVMA_VMID_GPA;
+ data.asid = 0;
+ data.addr = gpa;
+ data.size = gpsz;
+ data.order = order;
+ make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE,
+ KVM_REQ_HFENCE_GVMA_VMID_ALL, &data);
+}
+
+void kvm_riscv_hfence_gvma_vmid_all(struct kvm *kvm,
+ unsigned long hbase, unsigned long hmask)
+{
+ make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE_GVMA_VMID_ALL,
+ KVM_REQ_HFENCE_GVMA_VMID_ALL, NULL);
+}
+
+void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm,
+ unsigned long hbase, unsigned long hmask,
+ unsigned long gva, unsigned long gvsz,
+ unsigned long order, unsigned long asid)
+{
+ struct kvm_riscv_hfence data;
+
+ data.type = KVM_RISCV_HFENCE_VVMA_ASID_GVA;
+ data.asid = asid;
+ data.addr = gva;
+ data.size = gvsz;
+ data.order = order;
+ make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE,
+ KVM_REQ_HFENCE_VVMA_ALL, &data);
+}
+
+void kvm_riscv_hfence_vvma_asid_all(struct kvm *kvm,
+ unsigned long hbase, unsigned long hmask,
+ unsigned long asid)
+{
+ struct kvm_riscv_hfence data;
+
+ data.type = KVM_RISCV_HFENCE_VVMA_ASID_ALL;
+ data.asid = asid;
+ data.addr = data.size = data.order = 0;
+ make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE,
+ KVM_REQ_HFENCE_VVMA_ALL, &data);
+}
+
+void kvm_riscv_hfence_vvma_gva(struct kvm *kvm,
+ unsigned long hbase, unsigned long hmask,
+ unsigned long gva, unsigned long gvsz,
+ unsigned long order)
+{
+ struct kvm_riscv_hfence data;
+
+ data.type = KVM_RISCV_HFENCE_VVMA_GVA;
+ data.asid = 0;
+ data.addr = gva;
+ data.size = gvsz;
+ data.order = order;
+ make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE,
+ KVM_REQ_HFENCE_VVMA_ALL, &data);
+}
+
+void kvm_riscv_hfence_vvma_all(struct kvm *kvm,
+ unsigned long hbase, unsigned long hmask)
+{
+ make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE_VVMA_ALL,
+ KVM_REQ_HFENCE_VVMA_ALL, NULL);
+}
diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
new file mode 100644
index 000000000..82229db1c
--- /dev/null
+++ b/arch/riscv/kvm/vcpu.c
@@ -0,0 +1,781 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Western Digital Corporation or its affiliates.
+ *
+ * Authors:
+ * Anup Patel <anup.patel@wdc.com>
+ */
+
+#include <linux/bitops.h>
+#include <linux/entry-kvm.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kdebug.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/vmalloc.h>
+#include <linux/sched/signal.h>
+#include <linux/fs.h>
+#include <linux/kvm_host.h>
+#include <asm/csr.h>
+#include <asm/cacheflush.h>
+#include <asm/kvm_vcpu_vector.h>
+
+const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+ KVM_GENERIC_VCPU_STATS(),
+ STATS_DESC_COUNTER(VCPU, ecall_exit_stat),
+ STATS_DESC_COUNTER(VCPU, wfi_exit_stat),
+ STATS_DESC_COUNTER(VCPU, mmio_exit_user),
+ STATS_DESC_COUNTER(VCPU, mmio_exit_kernel),
+ STATS_DESC_COUNTER(VCPU, csr_exit_user),
+ STATS_DESC_COUNTER(VCPU, csr_exit_kernel),
+ STATS_DESC_COUNTER(VCPU, signal_exits),
+ STATS_DESC_COUNTER(VCPU, exits)
+};
+
+const struct kvm_stats_header kvm_vcpu_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vcpu_stats_desc),
+};
+
+static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr;
+ struct kvm_vcpu_csr *reset_csr = &vcpu->arch.guest_reset_csr;
+ struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
+ struct kvm_cpu_context *reset_cntx = &vcpu->arch.guest_reset_context;
+ bool loaded;
+
+ /**
+ * The preemption should be disabled here because it races with
+ * kvm_sched_out/kvm_sched_in(called from preempt notifiers) which
+ * also calls vcpu_load/put.
+ */
+ get_cpu();
+ loaded = (vcpu->cpu != -1);
+ if (loaded)
+ kvm_arch_vcpu_put(vcpu);
+
+ vcpu->arch.last_exit_cpu = -1;
+
+ memcpy(csr, reset_csr, sizeof(*csr));
+
+ memcpy(cntx, reset_cntx, sizeof(*cntx));
+
+ kvm_riscv_vcpu_fp_reset(vcpu);
+
+ kvm_riscv_vcpu_vector_reset(vcpu);
+
+ kvm_riscv_vcpu_timer_reset(vcpu);
+
+ kvm_riscv_vcpu_aia_reset(vcpu);
+
+ bitmap_zero(vcpu->arch.irqs_pending, KVM_RISCV_VCPU_NR_IRQS);
+ bitmap_zero(vcpu->arch.irqs_pending_mask, KVM_RISCV_VCPU_NR_IRQS);
+
+ kvm_riscv_vcpu_pmu_reset(vcpu);
+
+ vcpu->arch.hfence_head = 0;
+ vcpu->arch.hfence_tail = 0;
+ memset(vcpu->arch.hfence_queue, 0, sizeof(vcpu->arch.hfence_queue));
+
+ /* Reset the guest CSRs for hotplug usecase */
+ if (loaded)
+ kvm_arch_vcpu_load(vcpu, smp_processor_id());
+ put_cpu();
+}
+
+int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
+{
+ return 0;
+}
+
+int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
+{
+ int rc;
+ struct kvm_cpu_context *cntx;
+ struct kvm_vcpu_csr *reset_csr = &vcpu->arch.guest_reset_csr;
+
+ /* Mark this VCPU never ran */
+ vcpu->arch.ran_atleast_once = false;
+ vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO;
+ bitmap_zero(vcpu->arch.isa, RISCV_ISA_EXT_MAX);
+
+ /* Setup ISA features available to VCPU */
+ kvm_riscv_vcpu_setup_isa(vcpu);
+
+ /* Setup vendor, arch, and implementation details */
+ vcpu->arch.mvendorid = sbi_get_mvendorid();
+ vcpu->arch.marchid = sbi_get_marchid();
+ vcpu->arch.mimpid = sbi_get_mimpid();
+
+ /* Setup VCPU hfence queue */
+ spin_lock_init(&vcpu->arch.hfence_lock);
+
+ /* Setup reset state of shadow SSTATUS and HSTATUS CSRs */
+ cntx = &vcpu->arch.guest_reset_context;
+ cntx->sstatus = SR_SPP | SR_SPIE;
+ cntx->hstatus = 0;
+ cntx->hstatus |= HSTATUS_VTW;
+ cntx->hstatus |= HSTATUS_SPVP;
+ cntx->hstatus |= HSTATUS_SPV;
+
+ if (kvm_riscv_vcpu_alloc_vector_context(vcpu, cntx))
+ return -ENOMEM;
+
+ /* By default, make CY, TM, and IR counters accessible in VU mode */
+ reset_csr->scounteren = 0x7;
+
+ /* Setup VCPU timer */
+ kvm_riscv_vcpu_timer_init(vcpu);
+
+ /* setup performance monitoring */
+ kvm_riscv_vcpu_pmu_init(vcpu);
+
+ /* Setup VCPU AIA */
+ rc = kvm_riscv_vcpu_aia_init(vcpu);
+ if (rc)
+ return rc;
+
+ /* Reset VCPU */
+ kvm_riscv_reset_vcpu(vcpu);
+
+ return 0;
+}
+
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+ /**
+ * vcpu with id 0 is the designated boot cpu.
+ * Keep all vcpus with non-zero id in power-off state so that
+ * they can be brought up using SBI HSM extension.
+ */
+ if (vcpu->vcpu_idx != 0)
+ kvm_riscv_vcpu_power_off(vcpu);
+}
+
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+ /* Cleanup VCPU AIA context */
+ kvm_riscv_vcpu_aia_deinit(vcpu);
+
+ /* Cleanup VCPU timer */
+ kvm_riscv_vcpu_timer_deinit(vcpu);
+
+ kvm_riscv_vcpu_pmu_deinit(vcpu);
+
+ /* Free unused pages pre-allocated for G-stage page table mappings */
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
+
+ /* Free vector context space for host and guest kernel */
+ kvm_riscv_vcpu_free_vector_context(vcpu);
+}
+
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ return kvm_riscv_vcpu_timer_pending(vcpu);
+}
+
+void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
+{
+ kvm_riscv_aia_wakeon_hgei(vcpu, true);
+}
+
+void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
+{
+ kvm_riscv_aia_wakeon_hgei(vcpu, false);
+}
+
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+{
+ return (kvm_riscv_vcpu_has_interrupts(vcpu, -1UL) &&
+ !vcpu->arch.power_off && !vcpu->arch.pause);
+}
+
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
+}
+
+bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
+{
+ return (vcpu->arch.guest_context.sstatus & SR_SPP) ? true : false;
+}
+
+vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+ return VM_FAULT_SIGBUS;
+}
+
+long kvm_arch_vcpu_async_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ struct kvm_vcpu *vcpu = filp->private_data;
+ void __user *argp = (void __user *)arg;
+
+ if (ioctl == KVM_INTERRUPT) {
+ struct kvm_interrupt irq;
+
+ if (copy_from_user(&irq, argp, sizeof(irq)))
+ return -EFAULT;
+
+ if (irq.irq == KVM_INTERRUPT_SET)
+ return kvm_riscv_vcpu_set_interrupt(vcpu, IRQ_VS_EXT);
+ else
+ return kvm_riscv_vcpu_unset_interrupt(vcpu, IRQ_VS_EXT);
+ }
+
+ return -ENOIOCTLCMD;
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ struct kvm_vcpu *vcpu = filp->private_data;
+ void __user *argp = (void __user *)arg;
+ long r = -EINVAL;
+
+ switch (ioctl) {
+ case KVM_SET_ONE_REG:
+ case KVM_GET_ONE_REG: {
+ struct kvm_one_reg reg;
+
+ r = -EFAULT;
+ if (copy_from_user(&reg, argp, sizeof(reg)))
+ break;
+
+ if (ioctl == KVM_SET_ONE_REG)
+ r = kvm_riscv_vcpu_set_reg(vcpu, &reg);
+ else
+ r = kvm_riscv_vcpu_get_reg(vcpu, &reg);
+ break;
+ }
+ case KVM_GET_REG_LIST: {
+ struct kvm_reg_list __user *user_list = argp;
+ struct kvm_reg_list reg_list;
+ unsigned int n;
+
+ r = -EFAULT;
+ if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
+ break;
+ n = reg_list.n;
+ reg_list.n = kvm_riscv_vcpu_num_regs(vcpu);
+ if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
+ break;
+ r = -E2BIG;
+ if (n < reg_list.n)
+ break;
+ r = kvm_riscv_vcpu_copy_reg_indices(vcpu, user_list->reg);
+ break;
+ }
+ default:
+ break;
+ }
+
+ return r;
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ return -EINVAL;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ return -EINVAL;
+}
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+ return -EINVAL;
+}
+
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+ return -EINVAL;
+}
+
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+ struct kvm_translation *tr)
+{
+ return -EINVAL;
+}
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ return -EINVAL;
+}
+
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ return -EINVAL;
+}
+
+void kvm_riscv_vcpu_flush_interrupts(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr;
+ unsigned long mask, val;
+
+ if (READ_ONCE(vcpu->arch.irqs_pending_mask[0])) {
+ mask = xchg_acquire(&vcpu->arch.irqs_pending_mask[0], 0);
+ val = READ_ONCE(vcpu->arch.irqs_pending[0]) & mask;
+
+ csr->hvip &= ~mask;
+ csr->hvip |= val;
+ }
+
+ /* Flush AIA high interrupts */
+ kvm_riscv_vcpu_aia_flush_interrupts(vcpu);
+}
+
+void kvm_riscv_vcpu_sync_interrupts(struct kvm_vcpu *vcpu)
+{
+ unsigned long hvip;
+ struct kvm_vcpu_arch *v = &vcpu->arch;
+ struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr;
+
+ /* Read current HVIP and VSIE CSRs */
+ csr->vsie = csr_read(CSR_VSIE);
+
+ /* Sync-up HVIP.VSSIP bit changes does by Guest */
+ hvip = csr_read(CSR_HVIP);
+ if ((csr->hvip ^ hvip) & (1UL << IRQ_VS_SOFT)) {
+ if (hvip & (1UL << IRQ_VS_SOFT)) {
+ if (!test_and_set_bit(IRQ_VS_SOFT,
+ v->irqs_pending_mask))
+ set_bit(IRQ_VS_SOFT, v->irqs_pending);
+ } else {
+ if (!test_and_set_bit(IRQ_VS_SOFT,
+ v->irqs_pending_mask))
+ clear_bit(IRQ_VS_SOFT, v->irqs_pending);
+ }
+ }
+
+ /* Sync-up AIA high interrupts */
+ kvm_riscv_vcpu_aia_sync_interrupts(vcpu);
+
+ /* Sync-up timer CSRs */
+ kvm_riscv_vcpu_timer_sync(vcpu);
+}
+
+int kvm_riscv_vcpu_set_interrupt(struct kvm_vcpu *vcpu, unsigned int irq)
+{
+ /*
+ * We only allow VS-mode software, timer, and external
+ * interrupts when irq is one of the local interrupts
+ * defined by RISC-V privilege specification.
+ */
+ if (irq < IRQ_LOCAL_MAX &&
+ irq != IRQ_VS_SOFT &&
+ irq != IRQ_VS_TIMER &&
+ irq != IRQ_VS_EXT)
+ return -EINVAL;
+
+ set_bit(irq, vcpu->arch.irqs_pending);
+ smp_mb__before_atomic();
+ set_bit(irq, vcpu->arch.irqs_pending_mask);
+
+ kvm_vcpu_kick(vcpu);
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_unset_interrupt(struct kvm_vcpu *vcpu, unsigned int irq)
+{
+ /*
+ * We only allow VS-mode software, timer, and external
+ * interrupts when irq is one of the local interrupts
+ * defined by RISC-V privilege specification.
+ */
+ if (irq < IRQ_LOCAL_MAX &&
+ irq != IRQ_VS_SOFT &&
+ irq != IRQ_VS_TIMER &&
+ irq != IRQ_VS_EXT)
+ return -EINVAL;
+
+ clear_bit(irq, vcpu->arch.irqs_pending);
+ smp_mb__before_atomic();
+ set_bit(irq, vcpu->arch.irqs_pending_mask);
+
+ return 0;
+}
+
+bool kvm_riscv_vcpu_has_interrupts(struct kvm_vcpu *vcpu, u64 mask)
+{
+ unsigned long ie;
+
+ ie = ((vcpu->arch.guest_csr.vsie & VSIP_VALID_MASK)
+ << VSIP_TO_HVIP_SHIFT) & (unsigned long)mask;
+ ie |= vcpu->arch.guest_csr.vsie & ~IRQ_LOCAL_MASK &
+ (unsigned long)mask;
+ if (READ_ONCE(vcpu->arch.irqs_pending[0]) & ie)
+ return true;
+
+ /* Check AIA high interrupts */
+ return kvm_riscv_vcpu_aia_has_interrupts(vcpu, mask);
+}
+
+void kvm_riscv_vcpu_power_off(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.power_off = true;
+ kvm_make_request(KVM_REQ_SLEEP, vcpu);
+ kvm_vcpu_kick(vcpu);
+}
+
+void kvm_riscv_vcpu_power_on(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.power_off = false;
+ kvm_vcpu_wake_up(vcpu);
+}
+
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state)
+{
+ if (vcpu->arch.power_off)
+ mp_state->mp_state = KVM_MP_STATE_STOPPED;
+ else
+ mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
+
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state)
+{
+ int ret = 0;
+
+ switch (mp_state->mp_state) {
+ case KVM_MP_STATE_RUNNABLE:
+ vcpu->arch.power_off = false;
+ break;
+ case KVM_MP_STATE_STOPPED:
+ kvm_riscv_vcpu_power_off(vcpu);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg)
+{
+ /* TODO; To be implemented later. */
+ return -EINVAL;
+}
+
+static void kvm_riscv_vcpu_update_config(const unsigned long *isa)
+{
+ u64 henvcfg = 0;
+
+ if (riscv_isa_extension_available(isa, SVPBMT))
+ henvcfg |= ENVCFG_PBMTE;
+
+ if (riscv_isa_extension_available(isa, SSTC))
+ henvcfg |= ENVCFG_STCE;
+
+ if (riscv_isa_extension_available(isa, ZICBOM))
+ henvcfg |= (ENVCFG_CBIE | ENVCFG_CBCFE);
+
+ if (riscv_isa_extension_available(isa, ZICBOZ))
+ henvcfg |= ENVCFG_CBZE;
+
+ csr_write(CSR_HENVCFG, henvcfg);
+#ifdef CONFIG_32BIT
+ csr_write(CSR_HENVCFGH, henvcfg >> 32);
+#endif
+}
+
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr;
+
+ csr_write(CSR_VSSTATUS, csr->vsstatus);
+ csr_write(CSR_VSIE, csr->vsie);
+ csr_write(CSR_VSTVEC, csr->vstvec);
+ csr_write(CSR_VSSCRATCH, csr->vsscratch);
+ csr_write(CSR_VSEPC, csr->vsepc);
+ csr_write(CSR_VSCAUSE, csr->vscause);
+ csr_write(CSR_VSTVAL, csr->vstval);
+ csr_write(CSR_HVIP, csr->hvip);
+ csr_write(CSR_VSATP, csr->vsatp);
+
+ kvm_riscv_vcpu_update_config(vcpu->arch.isa);
+
+ kvm_riscv_gstage_update_hgatp(vcpu);
+
+ kvm_riscv_vcpu_timer_restore(vcpu);
+
+ kvm_riscv_vcpu_host_fp_save(&vcpu->arch.host_context);
+ kvm_riscv_vcpu_guest_fp_restore(&vcpu->arch.guest_context,
+ vcpu->arch.isa);
+ kvm_riscv_vcpu_host_vector_save(&vcpu->arch.host_context);
+ kvm_riscv_vcpu_guest_vector_restore(&vcpu->arch.guest_context,
+ vcpu->arch.isa);
+
+ kvm_riscv_vcpu_aia_load(vcpu, cpu);
+
+ vcpu->cpu = cpu;
+}
+
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr;
+
+ vcpu->cpu = -1;
+
+ kvm_riscv_vcpu_aia_put(vcpu);
+
+ kvm_riscv_vcpu_guest_fp_save(&vcpu->arch.guest_context,
+ vcpu->arch.isa);
+ kvm_riscv_vcpu_host_fp_restore(&vcpu->arch.host_context);
+
+ kvm_riscv_vcpu_timer_save(vcpu);
+ kvm_riscv_vcpu_guest_vector_save(&vcpu->arch.guest_context,
+ vcpu->arch.isa);
+ kvm_riscv_vcpu_host_vector_restore(&vcpu->arch.host_context);
+
+ csr->vsstatus = csr_read(CSR_VSSTATUS);
+ csr->vsie = csr_read(CSR_VSIE);
+ csr->vstvec = csr_read(CSR_VSTVEC);
+ csr->vsscratch = csr_read(CSR_VSSCRATCH);
+ csr->vsepc = csr_read(CSR_VSEPC);
+ csr->vscause = csr_read(CSR_VSCAUSE);
+ csr->vstval = csr_read(CSR_VSTVAL);
+ csr->hvip = csr_read(CSR_HVIP);
+ csr->vsatp = csr_read(CSR_VSATP);
+}
+
+static void kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu)
+{
+ struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
+
+ if (kvm_request_pending(vcpu)) {
+ if (kvm_check_request(KVM_REQ_SLEEP, vcpu)) {
+ kvm_vcpu_srcu_read_unlock(vcpu);
+ rcuwait_wait_event(wait,
+ (!vcpu->arch.power_off) && (!vcpu->arch.pause),
+ TASK_INTERRUPTIBLE);
+ kvm_vcpu_srcu_read_lock(vcpu);
+
+ if (vcpu->arch.power_off || vcpu->arch.pause) {
+ /*
+ * Awaken to handle a signal, request to
+ * sleep again later.
+ */
+ kvm_make_request(KVM_REQ_SLEEP, vcpu);
+ }
+ }
+
+ if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
+ kvm_riscv_reset_vcpu(vcpu);
+
+ if (kvm_check_request(KVM_REQ_UPDATE_HGATP, vcpu))
+ kvm_riscv_gstage_update_hgatp(vcpu);
+
+ if (kvm_check_request(KVM_REQ_FENCE_I, vcpu))
+ kvm_riscv_fence_i_process(vcpu);
+
+ /*
+ * The generic KVM_REQ_TLB_FLUSH is same as
+ * KVM_REQ_HFENCE_GVMA_VMID_ALL
+ */
+ if (kvm_check_request(KVM_REQ_HFENCE_GVMA_VMID_ALL, vcpu))
+ kvm_riscv_hfence_gvma_vmid_all_process(vcpu);
+
+ if (kvm_check_request(KVM_REQ_HFENCE_VVMA_ALL, vcpu))
+ kvm_riscv_hfence_vvma_all_process(vcpu);
+
+ if (kvm_check_request(KVM_REQ_HFENCE, vcpu))
+ kvm_riscv_hfence_process(vcpu);
+ }
+}
+
+static void kvm_riscv_update_hvip(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr;
+
+ csr_write(CSR_HVIP, csr->hvip);
+ kvm_riscv_vcpu_aia_update_hvip(vcpu);
+}
+
+/*
+ * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while
+ * the vCPU is running.
+ *
+ * This must be noinstr as instrumentation may make use of RCU, and this is not
+ * safe during the EQS.
+ */
+static void noinstr kvm_riscv_vcpu_enter_exit(struct kvm_vcpu *vcpu)
+{
+ guest_state_enter_irqoff();
+ __kvm_riscv_switch_to(&vcpu->arch);
+ vcpu->arch.last_exit_cpu = vcpu->cpu;
+ guest_state_exit_irqoff();
+}
+
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
+{
+ int ret;
+ struct kvm_cpu_trap trap;
+ struct kvm_run *run = vcpu->run;
+
+ /* Mark this VCPU ran at least once */
+ vcpu->arch.ran_atleast_once = true;
+
+ kvm_vcpu_srcu_read_lock(vcpu);
+
+ switch (run->exit_reason) {
+ case KVM_EXIT_MMIO:
+ /* Process MMIO value returned from user-space */
+ ret = kvm_riscv_vcpu_mmio_return(vcpu, vcpu->run);
+ break;
+ case KVM_EXIT_RISCV_SBI:
+ /* Process SBI value returned from user-space */
+ ret = kvm_riscv_vcpu_sbi_return(vcpu, vcpu->run);
+ break;
+ case KVM_EXIT_RISCV_CSR:
+ /* Process CSR value returned from user-space */
+ ret = kvm_riscv_vcpu_csr_return(vcpu, vcpu->run);
+ break;
+ default:
+ ret = 0;
+ break;
+ }
+ if (ret) {
+ kvm_vcpu_srcu_read_unlock(vcpu);
+ return ret;
+ }
+
+ if (run->immediate_exit) {
+ kvm_vcpu_srcu_read_unlock(vcpu);
+ return -EINTR;
+ }
+
+ vcpu_load(vcpu);
+
+ kvm_sigset_activate(vcpu);
+
+ ret = 1;
+ run->exit_reason = KVM_EXIT_UNKNOWN;
+ while (ret > 0) {
+ /* Check conditions before entering the guest */
+ ret = xfer_to_guest_mode_handle_work(vcpu);
+ if (ret)
+ continue;
+ ret = 1;
+
+ kvm_riscv_gstage_vmid_update(vcpu);
+
+ kvm_riscv_check_vcpu_requests(vcpu);
+
+ preempt_disable();
+
+ /* Update AIA HW state before entering guest */
+ ret = kvm_riscv_vcpu_aia_update(vcpu);
+ if (ret <= 0) {
+ preempt_enable();
+ continue;
+ }
+
+ local_irq_disable();
+
+ /*
+ * Ensure we set mode to IN_GUEST_MODE after we disable
+ * interrupts and before the final VCPU requests check.
+ * See the comment in kvm_vcpu_exiting_guest_mode() and
+ * Documentation/virt/kvm/vcpu-requests.rst
+ */
+ vcpu->mode = IN_GUEST_MODE;
+
+ kvm_vcpu_srcu_read_unlock(vcpu);
+ smp_mb__after_srcu_read_unlock();
+
+ /*
+ * We might have got VCPU interrupts updated asynchronously
+ * so update it in HW.
+ */
+ kvm_riscv_vcpu_flush_interrupts(vcpu);
+
+ /* Update HVIP CSR for current CPU */
+ kvm_riscv_update_hvip(vcpu);
+
+ if (ret <= 0 ||
+ kvm_riscv_gstage_vmid_ver_changed(&vcpu->kvm->arch.vmid) ||
+ kvm_request_pending(vcpu) ||
+ xfer_to_guest_mode_work_pending()) {
+ vcpu->mode = OUTSIDE_GUEST_MODE;
+ local_irq_enable();
+ preempt_enable();
+ kvm_vcpu_srcu_read_lock(vcpu);
+ continue;
+ }
+
+ /*
+ * Cleanup stale TLB enteries
+ *
+ * Note: This should be done after G-stage VMID has been
+ * updated using kvm_riscv_gstage_vmid_ver_changed()
+ */
+ kvm_riscv_local_tlb_sanitize(vcpu);
+
+ guest_timing_enter_irqoff();
+
+ kvm_riscv_vcpu_enter_exit(vcpu);
+
+ vcpu->mode = OUTSIDE_GUEST_MODE;
+ vcpu->stat.exits++;
+
+ /*
+ * Save SCAUSE, STVAL, HTVAL, and HTINST because we might
+ * get an interrupt between __kvm_riscv_switch_to() and
+ * local_irq_enable() which can potentially change CSRs.
+ */
+ trap.sepc = vcpu->arch.guest_context.sepc;
+ trap.scause = csr_read(CSR_SCAUSE);
+ trap.stval = csr_read(CSR_STVAL);
+ trap.htval = csr_read(CSR_HTVAL);
+ trap.htinst = csr_read(CSR_HTINST);
+
+ /* Syncup interrupts state with HW */
+ kvm_riscv_vcpu_sync_interrupts(vcpu);
+
+ /*
+ * We must ensure that any pending interrupts are taken before
+ * we exit guest timing so that timer ticks are accounted as
+ * guest time. Transiently unmask interrupts so that any
+ * pending interrupts are taken.
+ *
+ * There's no barrier which ensures that pending interrupts are
+ * recognised, so we just hope that the CPU takes any pending
+ * interrupts between the enable and disable.
+ */
+ local_irq_enable();
+ local_irq_disable();
+
+ guest_timing_exit_irqoff();
+
+ local_irq_enable();
+
+ preempt_enable();
+
+ kvm_vcpu_srcu_read_lock(vcpu);
+
+ ret = kvm_riscv_vcpu_exit(vcpu, run, &trap);
+ }
+
+ kvm_sigset_deactivate(vcpu);
+
+ vcpu_put(vcpu);
+
+ kvm_vcpu_srcu_read_unlock(vcpu);
+
+ return ret;
+}
diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c
new file mode 100644
index 000000000..2415722c0
--- /dev/null
+++ b/arch/riscv/kvm/vcpu_exit.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Western Digital Corporation or its affiliates.
+ *
+ * Authors:
+ * Anup Patel <anup.patel@wdc.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/csr.h>
+#include <asm/insn-def.h>
+
+static int gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run,
+ struct kvm_cpu_trap *trap)
+{
+ struct kvm_memory_slot *memslot;
+ unsigned long hva, fault_addr;
+ bool writable;
+ gfn_t gfn;
+ int ret;
+
+ fault_addr = (trap->htval << 2) | (trap->stval & 0x3);
+ gfn = fault_addr >> PAGE_SHIFT;
+ memslot = gfn_to_memslot(vcpu->kvm, gfn);
+ hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
+
+ if (kvm_is_error_hva(hva) ||
+ (trap->scause == EXC_STORE_GUEST_PAGE_FAULT && !writable)) {
+ switch (trap->scause) {
+ case EXC_LOAD_GUEST_PAGE_FAULT:
+ return kvm_riscv_vcpu_mmio_load(vcpu, run,
+ fault_addr,
+ trap->htinst);
+ case EXC_STORE_GUEST_PAGE_FAULT:
+ return kvm_riscv_vcpu_mmio_store(vcpu, run,
+ fault_addr,
+ trap->htinst);
+ default:
+ return -EOPNOTSUPP;
+ };
+ }
+
+ ret = kvm_riscv_gstage_map(vcpu, memslot, fault_addr, hva,
+ (trap->scause == EXC_STORE_GUEST_PAGE_FAULT) ? true : false);
+ if (ret < 0)
+ return ret;
+
+ return 1;
+}
+
+/**
+ * kvm_riscv_vcpu_unpriv_read -- Read machine word from Guest memory
+ *
+ * @vcpu: The VCPU pointer
+ * @read_insn: Flag representing whether we are reading instruction
+ * @guest_addr: Guest address to read
+ * @trap: Output pointer to trap details
+ */
+unsigned long kvm_riscv_vcpu_unpriv_read(struct kvm_vcpu *vcpu,
+ bool read_insn,
+ unsigned long guest_addr,
+ struct kvm_cpu_trap *trap)
+{
+ register unsigned long taddr asm("a0") = (unsigned long)trap;
+ register unsigned long ttmp asm("a1");
+ unsigned long flags, val, tmp, old_stvec, old_hstatus;
+
+ local_irq_save(flags);
+
+ old_hstatus = csr_swap(CSR_HSTATUS, vcpu->arch.guest_context.hstatus);
+ old_stvec = csr_swap(CSR_STVEC, (ulong)&__kvm_riscv_unpriv_trap);
+
+ if (read_insn) {
+ /*
+ * HLVX.HU instruction
+ * 0110010 00011 rs1 100 rd 1110011
+ */
+ asm volatile ("\n"
+ ".option push\n"
+ ".option norvc\n"
+ "add %[ttmp], %[taddr], 0\n"
+ HLVX_HU(%[val], %[addr])
+ "andi %[tmp], %[val], 3\n"
+ "addi %[tmp], %[tmp], -3\n"
+ "bne %[tmp], zero, 2f\n"
+ "addi %[addr], %[addr], 2\n"
+ HLVX_HU(%[tmp], %[addr])
+ "sll %[tmp], %[tmp], 16\n"
+ "add %[val], %[val], %[tmp]\n"
+ "2:\n"
+ ".option pop"
+ : [val] "=&r" (val), [tmp] "=&r" (tmp),
+ [taddr] "+&r" (taddr), [ttmp] "+&r" (ttmp),
+ [addr] "+&r" (guest_addr) : : "memory");
+
+ if (trap->scause == EXC_LOAD_PAGE_FAULT)
+ trap->scause = EXC_INST_PAGE_FAULT;
+ } else {
+ /*
+ * HLV.D instruction
+ * 0110110 00000 rs1 100 rd 1110011
+ *
+ * HLV.W instruction
+ * 0110100 00000 rs1 100 rd 1110011
+ */
+ asm volatile ("\n"
+ ".option push\n"
+ ".option norvc\n"
+ "add %[ttmp], %[taddr], 0\n"
+#ifdef CONFIG_64BIT
+ HLV_D(%[val], %[addr])
+#else
+ HLV_W(%[val], %[addr])
+#endif
+ ".option pop"
+ : [val] "=&r" (val),
+ [taddr] "+&r" (taddr), [ttmp] "+&r" (ttmp)
+ : [addr] "r" (guest_addr) : "memory");
+ }
+
+ csr_write(CSR_STVEC, old_stvec);
+ csr_write(CSR_HSTATUS, old_hstatus);
+
+ local_irq_restore(flags);
+
+ return val;
+}
+
+/**
+ * kvm_riscv_vcpu_trap_redirect -- Redirect trap to Guest
+ *
+ * @vcpu: The VCPU pointer
+ * @trap: Trap details
+ */
+void kvm_riscv_vcpu_trap_redirect(struct kvm_vcpu *vcpu,
+ struct kvm_cpu_trap *trap)
+{
+ unsigned long vsstatus = csr_read(CSR_VSSTATUS);
+
+ /* Change Guest SSTATUS.SPP bit */
+ vsstatus &= ~SR_SPP;
+ if (vcpu->arch.guest_context.sstatus & SR_SPP)
+ vsstatus |= SR_SPP;
+
+ /* Change Guest SSTATUS.SPIE bit */
+ vsstatus &= ~SR_SPIE;
+ if (vsstatus & SR_SIE)
+ vsstatus |= SR_SPIE;
+
+ /* Clear Guest SSTATUS.SIE bit */
+ vsstatus &= ~SR_SIE;
+
+ /* Update Guest SSTATUS */
+ csr_write(CSR_VSSTATUS, vsstatus);
+
+ /* Update Guest SCAUSE, STVAL, and SEPC */
+ csr_write(CSR_VSCAUSE, trap->scause);
+ csr_write(CSR_VSTVAL, trap->stval);
+ csr_write(CSR_VSEPC, trap->sepc);
+
+ /* Set Guest PC to Guest exception vector */
+ vcpu->arch.guest_context.sepc = csr_read(CSR_VSTVEC);
+
+ /* Set Guest privilege mode to supervisor */
+ vcpu->arch.guest_context.sstatus |= SR_SPP;
+}
+
+/*
+ * Return > 0 to return to guest, < 0 on error, 0 (and set exit_reason) on
+ * proper exit to userspace.
+ */
+int kvm_riscv_vcpu_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
+ struct kvm_cpu_trap *trap)
+{
+ int ret;
+
+ /* If we got host interrupt then do nothing */
+ if (trap->scause & CAUSE_IRQ_FLAG)
+ return 1;
+
+ /* Handle guest traps */
+ ret = -EFAULT;
+ run->exit_reason = KVM_EXIT_UNKNOWN;
+ switch (trap->scause) {
+ case EXC_INST_ILLEGAL:
+ case EXC_LOAD_MISALIGNED:
+ case EXC_STORE_MISALIGNED:
+ if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV) {
+ kvm_riscv_vcpu_trap_redirect(vcpu, trap);
+ ret = 1;
+ }
+ break;
+ case EXC_VIRTUAL_INST_FAULT:
+ if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV)
+ ret = kvm_riscv_vcpu_virtual_insn(vcpu, run, trap);
+ break;
+ case EXC_INST_GUEST_PAGE_FAULT:
+ case EXC_LOAD_GUEST_PAGE_FAULT:
+ case EXC_STORE_GUEST_PAGE_FAULT:
+ if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV)
+ ret = gstage_page_fault(vcpu, run, trap);
+ break;
+ case EXC_SUPERVISOR_SYSCALL:
+ if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV)
+ ret = kvm_riscv_vcpu_sbi_ecall(vcpu, run);
+ break;
+ default:
+ break;
+ }
+
+ /* Print details in-case of error */
+ if (ret < 0) {
+ kvm_err("VCPU exit error %d\n", ret);
+ kvm_err("SEPC=0x%lx SSTATUS=0x%lx HSTATUS=0x%lx\n",
+ vcpu->arch.guest_context.sepc,
+ vcpu->arch.guest_context.sstatus,
+ vcpu->arch.guest_context.hstatus);
+ kvm_err("SCAUSE=0x%lx STVAL=0x%lx HTVAL=0x%lx HTINST=0x%lx\n",
+ trap->scause, trap->stval, trap->htval, trap->htinst);
+ }
+
+ return ret;
+}
diff --git a/arch/riscv/kvm/vcpu_fp.c b/arch/riscv/kvm/vcpu_fp.c
new file mode 100644
index 000000000..08ba48a39
--- /dev/null
+++ b/arch/riscv/kvm/vcpu_fp.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ *
+ * Authors:
+ * Atish Patra <atish.patra@wdc.com>
+ * Anup Patel <anup.patel@wdc.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kvm_host.h>
+#include <linux/uaccess.h>
+#include <asm/hwcap.h>
+
+#ifdef CONFIG_FPU
+void kvm_riscv_vcpu_fp_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
+
+ cntx->sstatus &= ~SR_FS;
+ if (riscv_isa_extension_available(vcpu->arch.isa, f) ||
+ riscv_isa_extension_available(vcpu->arch.isa, d))
+ cntx->sstatus |= SR_FS_INITIAL;
+ else
+ cntx->sstatus |= SR_FS_OFF;
+}
+
+static void kvm_riscv_vcpu_fp_clean(struct kvm_cpu_context *cntx)
+{
+ cntx->sstatus &= ~SR_FS;
+ cntx->sstatus |= SR_FS_CLEAN;
+}
+
+void kvm_riscv_vcpu_guest_fp_save(struct kvm_cpu_context *cntx,
+ const unsigned long *isa)
+{
+ if ((cntx->sstatus & SR_FS) == SR_FS_DIRTY) {
+ if (riscv_isa_extension_available(isa, d))
+ __kvm_riscv_fp_d_save(cntx);
+ else if (riscv_isa_extension_available(isa, f))
+ __kvm_riscv_fp_f_save(cntx);
+ kvm_riscv_vcpu_fp_clean(cntx);
+ }
+}
+
+void kvm_riscv_vcpu_guest_fp_restore(struct kvm_cpu_context *cntx,
+ const unsigned long *isa)
+{
+ if ((cntx->sstatus & SR_FS) != SR_FS_OFF) {
+ if (riscv_isa_extension_available(isa, d))
+ __kvm_riscv_fp_d_restore(cntx);
+ else if (riscv_isa_extension_available(isa, f))
+ __kvm_riscv_fp_f_restore(cntx);
+ kvm_riscv_vcpu_fp_clean(cntx);
+ }
+}
+
+void kvm_riscv_vcpu_host_fp_save(struct kvm_cpu_context *cntx)
+{
+ /* No need to check host sstatus as it can be modified outside */
+ if (riscv_isa_extension_available(NULL, d))
+ __kvm_riscv_fp_d_save(cntx);
+ else if (riscv_isa_extension_available(NULL, f))
+ __kvm_riscv_fp_f_save(cntx);
+}
+
+void kvm_riscv_vcpu_host_fp_restore(struct kvm_cpu_context *cntx)
+{
+ if (riscv_isa_extension_available(NULL, d))
+ __kvm_riscv_fp_d_restore(cntx);
+ else if (riscv_isa_extension_available(NULL, f))
+ __kvm_riscv_fp_f_restore(cntx);
+}
+#endif
+
+int kvm_riscv_vcpu_get_reg_fp(struct kvm_vcpu *vcpu,
+ const struct kvm_one_reg *reg,
+ unsigned long rtype)
+{
+ struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
+ unsigned long __user *uaddr =
+ (unsigned long __user *)(unsigned long)reg->addr;
+ unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+ KVM_REG_SIZE_MASK |
+ rtype);
+ void *reg_val;
+
+ if ((rtype == KVM_REG_RISCV_FP_F) &&
+ riscv_isa_extension_available(vcpu->arch.isa, f)) {
+ if (KVM_REG_SIZE(reg->id) != sizeof(u32))
+ return -EINVAL;
+ if (reg_num == KVM_REG_RISCV_FP_F_REG(fcsr))
+ reg_val = &cntx->fp.f.fcsr;
+ else if ((KVM_REG_RISCV_FP_F_REG(f[0]) <= reg_num) &&
+ reg_num <= KVM_REG_RISCV_FP_F_REG(f[31]))
+ reg_val = &cntx->fp.f.f[reg_num];
+ else
+ return -ENOENT;
+ } else if ((rtype == KVM_REG_RISCV_FP_D) &&
+ riscv_isa_extension_available(vcpu->arch.isa, d)) {
+ if (reg_num == KVM_REG_RISCV_FP_D_REG(fcsr)) {
+ if (KVM_REG_SIZE(reg->id) != sizeof(u32))
+ return -EINVAL;
+ reg_val = &cntx->fp.d.fcsr;
+ } else if ((KVM_REG_RISCV_FP_D_REG(f[0]) <= reg_num) &&
+ reg_num <= KVM_REG_RISCV_FP_D_REG(f[31])) {
+ if (KVM_REG_SIZE(reg->id) != sizeof(u64))
+ return -EINVAL;
+ reg_val = &cntx->fp.d.f[reg_num];
+ } else
+ return -ENOENT;
+ } else
+ return -ENOENT;
+
+ if (copy_to_user(uaddr, reg_val, KVM_REG_SIZE(reg->id)))
+ return -EFAULT;
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_set_reg_fp(struct kvm_vcpu *vcpu,
+ const struct kvm_one_reg *reg,
+ unsigned long rtype)
+{
+ struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
+ unsigned long __user *uaddr =
+ (unsigned long __user *)(unsigned long)reg->addr;
+ unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+ KVM_REG_SIZE_MASK |
+ rtype);
+ void *reg_val;
+
+ if ((rtype == KVM_REG_RISCV_FP_F) &&
+ riscv_isa_extension_available(vcpu->arch.isa, f)) {
+ if (KVM_REG_SIZE(reg->id) != sizeof(u32))
+ return -EINVAL;
+ if (reg_num == KVM_REG_RISCV_FP_F_REG(fcsr))
+ reg_val = &cntx->fp.f.fcsr;
+ else if ((KVM_REG_RISCV_FP_F_REG(f[0]) <= reg_num) &&
+ reg_num <= KVM_REG_RISCV_FP_F_REG(f[31]))
+ reg_val = &cntx->fp.f.f[reg_num];
+ else
+ return -ENOENT;
+ } else if ((rtype == KVM_REG_RISCV_FP_D) &&
+ riscv_isa_extension_available(vcpu->arch.isa, d)) {
+ if (reg_num == KVM_REG_RISCV_FP_D_REG(fcsr)) {
+ if (KVM_REG_SIZE(reg->id) != sizeof(u32))
+ return -EINVAL;
+ reg_val = &cntx->fp.d.fcsr;
+ } else if ((KVM_REG_RISCV_FP_D_REG(f[0]) <= reg_num) &&
+ reg_num <= KVM_REG_RISCV_FP_D_REG(f[31])) {
+ if (KVM_REG_SIZE(reg->id) != sizeof(u64))
+ return -EINVAL;
+ reg_val = &cntx->fp.d.f[reg_num];
+ } else
+ return -ENOENT;
+ } else
+ return -ENOENT;
+
+ if (copy_from_user(reg_val, uaddr, KVM_REG_SIZE(reg->id)))
+ return -EFAULT;
+
+ return 0;
+}
diff --git a/arch/riscv/kvm/vcpu_insn.c b/arch/riscv/kvm/vcpu_insn.c
new file mode 100644
index 000000000..7a6abed41
--- /dev/null
+++ b/arch/riscv/kvm/vcpu_insn.c
@@ -0,0 +1,754 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Western Digital Corporation or its affiliates.
+ * Copyright (c) 2022 Ventana Micro Systems Inc.
+ */
+
+#include <linux/bitops.h>
+#include <linux/kvm_host.h>
+
+#define INSN_OPCODE_MASK 0x007c
+#define INSN_OPCODE_SHIFT 2
+#define INSN_OPCODE_SYSTEM 28
+
+#define INSN_MASK_WFI 0xffffffff
+#define INSN_MATCH_WFI 0x10500073
+
+#define INSN_MATCH_CSRRW 0x1073
+#define INSN_MASK_CSRRW 0x707f
+#define INSN_MATCH_CSRRS 0x2073
+#define INSN_MASK_CSRRS 0x707f
+#define INSN_MATCH_CSRRC 0x3073
+#define INSN_MASK_CSRRC 0x707f
+#define INSN_MATCH_CSRRWI 0x5073
+#define INSN_MASK_CSRRWI 0x707f
+#define INSN_MATCH_CSRRSI 0x6073
+#define INSN_MASK_CSRRSI 0x707f
+#define INSN_MATCH_CSRRCI 0x7073
+#define INSN_MASK_CSRRCI 0x707f
+
+#define INSN_MATCH_LB 0x3
+#define INSN_MASK_LB 0x707f
+#define INSN_MATCH_LH 0x1003
+#define INSN_MASK_LH 0x707f
+#define INSN_MATCH_LW 0x2003
+#define INSN_MASK_LW 0x707f
+#define INSN_MATCH_LD 0x3003
+#define INSN_MASK_LD 0x707f
+#define INSN_MATCH_LBU 0x4003
+#define INSN_MASK_LBU 0x707f
+#define INSN_MATCH_LHU 0x5003
+#define INSN_MASK_LHU 0x707f
+#define INSN_MATCH_LWU 0x6003
+#define INSN_MASK_LWU 0x707f
+#define INSN_MATCH_SB 0x23
+#define INSN_MASK_SB 0x707f
+#define INSN_MATCH_SH 0x1023
+#define INSN_MASK_SH 0x707f
+#define INSN_MATCH_SW 0x2023
+#define INSN_MASK_SW 0x707f
+#define INSN_MATCH_SD 0x3023
+#define INSN_MASK_SD 0x707f
+
+#define INSN_MATCH_C_LD 0x6000
+#define INSN_MASK_C_LD 0xe003
+#define INSN_MATCH_C_SD 0xe000
+#define INSN_MASK_C_SD 0xe003
+#define INSN_MATCH_C_LW 0x4000
+#define INSN_MASK_C_LW 0xe003
+#define INSN_MATCH_C_SW 0xc000
+#define INSN_MASK_C_SW 0xe003
+#define INSN_MATCH_C_LDSP 0x6002
+#define INSN_MASK_C_LDSP 0xe003
+#define INSN_MATCH_C_SDSP 0xe002
+#define INSN_MASK_C_SDSP 0xe003
+#define INSN_MATCH_C_LWSP 0x4002
+#define INSN_MASK_C_LWSP 0xe003
+#define INSN_MATCH_C_SWSP 0xc002
+#define INSN_MASK_C_SWSP 0xe003
+
+#define INSN_16BIT_MASK 0x3
+
+#define INSN_IS_16BIT(insn) (((insn) & INSN_16BIT_MASK) != INSN_16BIT_MASK)
+
+#define INSN_LEN(insn) (INSN_IS_16BIT(insn) ? 2 : 4)
+
+#ifdef CONFIG_64BIT
+#define LOG_REGBYTES 3
+#else
+#define LOG_REGBYTES 2
+#endif
+#define REGBYTES (1 << LOG_REGBYTES)
+
+#define SH_RD 7
+#define SH_RS1 15
+#define SH_RS2 20
+#define SH_RS2C 2
+#define MASK_RX 0x1f
+
+#define RV_X(x, s, n) (((x) >> (s)) & ((1 << (n)) - 1))
+#define RVC_LW_IMM(x) ((RV_X(x, 6, 1) << 2) | \
+ (RV_X(x, 10, 3) << 3) | \
+ (RV_X(x, 5, 1) << 6))
+#define RVC_LD_IMM(x) ((RV_X(x, 10, 3) << 3) | \
+ (RV_X(x, 5, 2) << 6))
+#define RVC_LWSP_IMM(x) ((RV_X(x, 4, 3) << 2) | \
+ (RV_X(x, 12, 1) << 5) | \
+ (RV_X(x, 2, 2) << 6))
+#define RVC_LDSP_IMM(x) ((RV_X(x, 5, 2) << 3) | \
+ (RV_X(x, 12, 1) << 5) | \
+ (RV_X(x, 2, 3) << 6))
+#define RVC_SWSP_IMM(x) ((RV_X(x, 9, 4) << 2) | \
+ (RV_X(x, 7, 2) << 6))
+#define RVC_SDSP_IMM(x) ((RV_X(x, 10, 3) << 3) | \
+ (RV_X(x, 7, 3) << 6))
+#define RVC_RS1S(insn) (8 + RV_X(insn, SH_RD, 3))
+#define RVC_RS2S(insn) (8 + RV_X(insn, SH_RS2C, 3))
+#define RVC_RS2(insn) RV_X(insn, SH_RS2C, 5)
+
+#define SHIFT_RIGHT(x, y) \
+ ((y) < 0 ? ((x) << -(y)) : ((x) >> (y)))
+
+#define REG_MASK \
+ ((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES))
+
+#define REG_OFFSET(insn, pos) \
+ (SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK)
+
+#define REG_PTR(insn, pos, regs) \
+ ((ulong *)((ulong)(regs) + REG_OFFSET(insn, pos)))
+
+#define GET_FUNCT3(insn) (((insn) >> 12) & 7)
+
+#define GET_RS1(insn, regs) (*REG_PTR(insn, SH_RS1, regs))
+#define GET_RS2(insn, regs) (*REG_PTR(insn, SH_RS2, regs))
+#define GET_RS1S(insn, regs) (*REG_PTR(RVC_RS1S(insn), 0, regs))
+#define GET_RS2S(insn, regs) (*REG_PTR(RVC_RS2S(insn), 0, regs))
+#define GET_RS2C(insn, regs) (*REG_PTR(insn, SH_RS2C, regs))
+#define GET_SP(regs) (*REG_PTR(2, 0, regs))
+#define SET_RD(insn, regs, val) (*REG_PTR(insn, SH_RD, regs) = (val))
+#define IMM_I(insn) ((s32)(insn) >> 20)
+#define IMM_S(insn) (((s32)(insn) >> 25 << 5) | \
+ (s32)(((insn) >> 7) & 0x1f))
+
+struct insn_func {
+ unsigned long mask;
+ unsigned long match;
+ /*
+ * Possible return values are as follows:
+ * 1) Returns < 0 for error case
+ * 2) Returns 0 for exit to user-space
+ * 3) Returns 1 to continue with next sepc
+ * 4) Returns 2 to continue with same sepc
+ * 5) Returns 3 to inject illegal instruction trap and continue
+ * 6) Returns 4 to inject virtual instruction trap and continue
+ *
+ * Use enum kvm_insn_return for return values
+ */
+ int (*func)(struct kvm_vcpu *vcpu, struct kvm_run *run, ulong insn);
+};
+
+static int truly_illegal_insn(struct kvm_vcpu *vcpu, struct kvm_run *run,
+ ulong insn)
+{
+ struct kvm_cpu_trap utrap = { 0 };
+
+ /* Redirect trap to Guest VCPU */
+ utrap.sepc = vcpu->arch.guest_context.sepc;
+ utrap.scause = EXC_INST_ILLEGAL;
+ utrap.stval = insn;
+ utrap.htval = 0;
+ utrap.htinst = 0;
+ kvm_riscv_vcpu_trap_redirect(vcpu, &utrap);
+
+ return 1;
+}
+
+static int truly_virtual_insn(struct kvm_vcpu *vcpu, struct kvm_run *run,
+ ulong insn)
+{
+ struct kvm_cpu_trap utrap = { 0 };
+
+ /* Redirect trap to Guest VCPU */
+ utrap.sepc = vcpu->arch.guest_context.sepc;
+ utrap.scause = EXC_VIRTUAL_INST_FAULT;
+ utrap.stval = insn;
+ utrap.htval = 0;
+ utrap.htinst = 0;
+ kvm_riscv_vcpu_trap_redirect(vcpu, &utrap);
+
+ return 1;
+}
+
+/**
+ * kvm_riscv_vcpu_wfi -- Emulate wait for interrupt (WFI) behaviour
+ *
+ * @vcpu: The VCPU pointer
+ */
+void kvm_riscv_vcpu_wfi(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_arch_vcpu_runnable(vcpu)) {
+ kvm_vcpu_srcu_read_unlock(vcpu);
+ kvm_vcpu_halt(vcpu);
+ kvm_vcpu_srcu_read_lock(vcpu);
+ }
+}
+
+static int wfi_insn(struct kvm_vcpu *vcpu, struct kvm_run *run, ulong insn)
+{
+ vcpu->stat.wfi_exit_stat++;
+ kvm_riscv_vcpu_wfi(vcpu);
+ return KVM_INSN_CONTINUE_NEXT_SEPC;
+}
+
+struct csr_func {
+ unsigned int base;
+ unsigned int count;
+ /*
+ * Possible return values are as same as "func" callback in
+ * "struct insn_func".
+ */
+ int (*func)(struct kvm_vcpu *vcpu, unsigned int csr_num,
+ unsigned long *val, unsigned long new_val,
+ unsigned long wr_mask);
+};
+
+static const struct csr_func csr_funcs[] = {
+ KVM_RISCV_VCPU_AIA_CSR_FUNCS
+ KVM_RISCV_VCPU_HPMCOUNTER_CSR_FUNCS
+};
+
+/**
+ * kvm_riscv_vcpu_csr_return -- Handle CSR read/write after user space
+ * emulation or in-kernel emulation
+ *
+ * @vcpu: The VCPU pointer
+ * @run: The VCPU run struct containing the CSR data
+ *
+ * Returns > 0 upon failure and 0 upon success
+ */
+int kvm_riscv_vcpu_csr_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+ ulong insn;
+
+ if (vcpu->arch.csr_decode.return_handled)
+ return 0;
+ vcpu->arch.csr_decode.return_handled = 1;
+
+ /* Update destination register for CSR reads */
+ insn = vcpu->arch.csr_decode.insn;
+ if ((insn >> SH_RD) & MASK_RX)
+ SET_RD(insn, &vcpu->arch.guest_context,
+ run->riscv_csr.ret_value);
+
+ /* Move to next instruction */
+ vcpu->arch.guest_context.sepc += INSN_LEN(insn);
+
+ return 0;
+}
+
+static int csr_insn(struct kvm_vcpu *vcpu, struct kvm_run *run, ulong insn)
+{
+ int i, rc = KVM_INSN_ILLEGAL_TRAP;
+ unsigned int csr_num = insn >> SH_RS2;
+ unsigned int rs1_num = (insn >> SH_RS1) & MASK_RX;
+ ulong rs1_val = GET_RS1(insn, &vcpu->arch.guest_context);
+ const struct csr_func *tcfn, *cfn = NULL;
+ ulong val = 0, wr_mask = 0, new_val = 0;
+
+ /* Decode the CSR instruction */
+ switch (GET_FUNCT3(insn)) {
+ case GET_FUNCT3(INSN_MATCH_CSRRW):
+ wr_mask = -1UL;
+ new_val = rs1_val;
+ break;
+ case GET_FUNCT3(INSN_MATCH_CSRRS):
+ wr_mask = rs1_val;
+ new_val = -1UL;
+ break;
+ case GET_FUNCT3(INSN_MATCH_CSRRC):
+ wr_mask = rs1_val;
+ new_val = 0;
+ break;
+ case GET_FUNCT3(INSN_MATCH_CSRRWI):
+ wr_mask = -1UL;
+ new_val = rs1_num;
+ break;
+ case GET_FUNCT3(INSN_MATCH_CSRRSI):
+ wr_mask = rs1_num;
+ new_val = -1UL;
+ break;
+ case GET_FUNCT3(INSN_MATCH_CSRRCI):
+ wr_mask = rs1_num;
+ new_val = 0;
+ break;
+ default:
+ return rc;
+ }
+
+ /* Save instruction decode info */
+ vcpu->arch.csr_decode.insn = insn;
+ vcpu->arch.csr_decode.return_handled = 0;
+
+ /* Update CSR details in kvm_run struct */
+ run->riscv_csr.csr_num = csr_num;
+ run->riscv_csr.new_value = new_val;
+ run->riscv_csr.write_mask = wr_mask;
+ run->riscv_csr.ret_value = 0;
+
+ /* Find in-kernel CSR function */
+ for (i = 0; i < ARRAY_SIZE(csr_funcs); i++) {
+ tcfn = &csr_funcs[i];
+ if ((tcfn->base <= csr_num) &&
+ (csr_num < (tcfn->base + tcfn->count))) {
+ cfn = tcfn;
+ break;
+ }
+ }
+
+ /* First try in-kernel CSR emulation */
+ if (cfn && cfn->func) {
+ rc = cfn->func(vcpu, csr_num, &val, new_val, wr_mask);
+ if (rc > KVM_INSN_EXIT_TO_USER_SPACE) {
+ if (rc == KVM_INSN_CONTINUE_NEXT_SEPC) {
+ run->riscv_csr.ret_value = val;
+ vcpu->stat.csr_exit_kernel++;
+ kvm_riscv_vcpu_csr_return(vcpu, run);
+ rc = KVM_INSN_CONTINUE_SAME_SEPC;
+ }
+ return rc;
+ }
+ }
+
+ /* Exit to user-space for CSR emulation */
+ if (rc <= KVM_INSN_EXIT_TO_USER_SPACE) {
+ vcpu->stat.csr_exit_user++;
+ run->exit_reason = KVM_EXIT_RISCV_CSR;
+ }
+
+ return rc;
+}
+
+static const struct insn_func system_opcode_funcs[] = {
+ {
+ .mask = INSN_MASK_CSRRW,
+ .match = INSN_MATCH_CSRRW,
+ .func = csr_insn,
+ },
+ {
+ .mask = INSN_MASK_CSRRS,
+ .match = INSN_MATCH_CSRRS,
+ .func = csr_insn,
+ },
+ {
+ .mask = INSN_MASK_CSRRC,
+ .match = INSN_MATCH_CSRRC,
+ .func = csr_insn,
+ },
+ {
+ .mask = INSN_MASK_CSRRWI,
+ .match = INSN_MATCH_CSRRWI,
+ .func = csr_insn,
+ },
+ {
+ .mask = INSN_MASK_CSRRSI,
+ .match = INSN_MATCH_CSRRSI,
+ .func = csr_insn,
+ },
+ {
+ .mask = INSN_MASK_CSRRCI,
+ .match = INSN_MATCH_CSRRCI,
+ .func = csr_insn,
+ },
+ {
+ .mask = INSN_MASK_WFI,
+ .match = INSN_MATCH_WFI,
+ .func = wfi_insn,
+ },
+};
+
+static int system_opcode_insn(struct kvm_vcpu *vcpu, struct kvm_run *run,
+ ulong insn)
+{
+ int i, rc = KVM_INSN_ILLEGAL_TRAP;
+ const struct insn_func *ifn;
+
+ for (i = 0; i < ARRAY_SIZE(system_opcode_funcs); i++) {
+ ifn = &system_opcode_funcs[i];
+ if ((insn & ifn->mask) == ifn->match) {
+ rc = ifn->func(vcpu, run, insn);
+ break;
+ }
+ }
+
+ switch (rc) {
+ case KVM_INSN_ILLEGAL_TRAP:
+ return truly_illegal_insn(vcpu, run, insn);
+ case KVM_INSN_VIRTUAL_TRAP:
+ return truly_virtual_insn(vcpu, run, insn);
+ case KVM_INSN_CONTINUE_NEXT_SEPC:
+ vcpu->arch.guest_context.sepc += INSN_LEN(insn);
+ break;
+ default:
+ break;
+ }
+
+ return (rc <= 0) ? rc : 1;
+}
+
+/**
+ * kvm_riscv_vcpu_virtual_insn -- Handle virtual instruction trap
+ *
+ * @vcpu: The VCPU pointer
+ * @run: The VCPU run struct containing the mmio data
+ * @trap: Trap details
+ *
+ * Returns > 0 to continue run-loop
+ * Returns 0 to exit run-loop and handle in user-space.
+ * Returns < 0 to report failure and exit run-loop
+ */
+int kvm_riscv_vcpu_virtual_insn(struct kvm_vcpu *vcpu, struct kvm_run *run,
+ struct kvm_cpu_trap *trap)
+{
+ unsigned long insn = trap->stval;
+ struct kvm_cpu_trap utrap = { 0 };
+ struct kvm_cpu_context *ct;
+
+ if (unlikely(INSN_IS_16BIT(insn))) {
+ if (insn == 0) {
+ ct = &vcpu->arch.guest_context;
+ insn = kvm_riscv_vcpu_unpriv_read(vcpu, true,
+ ct->sepc,
+ &utrap);
+ if (utrap.scause) {
+ utrap.sepc = ct->sepc;
+ kvm_riscv_vcpu_trap_redirect(vcpu, &utrap);
+ return 1;
+ }
+ }
+ if (INSN_IS_16BIT(insn))
+ return truly_illegal_insn(vcpu, run, insn);
+ }
+
+ switch ((insn & INSN_OPCODE_MASK) >> INSN_OPCODE_SHIFT) {
+ case INSN_OPCODE_SYSTEM:
+ return system_opcode_insn(vcpu, run, insn);
+ default:
+ return truly_illegal_insn(vcpu, run, insn);
+ }
+}
+
+/**
+ * kvm_riscv_vcpu_mmio_load -- Emulate MMIO load instruction
+ *
+ * @vcpu: The VCPU pointer
+ * @run: The VCPU run struct containing the mmio data
+ * @fault_addr: Guest physical address to load
+ * @htinst: Transformed encoding of the load instruction
+ *
+ * Returns > 0 to continue run-loop
+ * Returns 0 to exit run-loop and handle in user-space.
+ * Returns < 0 to report failure and exit run-loop
+ */
+int kvm_riscv_vcpu_mmio_load(struct kvm_vcpu *vcpu, struct kvm_run *run,
+ unsigned long fault_addr,
+ unsigned long htinst)
+{
+ u8 data_buf[8];
+ unsigned long insn;
+ int shift = 0, len = 0, insn_len = 0;
+ struct kvm_cpu_trap utrap = { 0 };
+ struct kvm_cpu_context *ct = &vcpu->arch.guest_context;
+
+ /* Determine trapped instruction */
+ if (htinst & 0x1) {
+ /*
+ * Bit[0] == 1 implies trapped instruction value is
+ * transformed instruction or custom instruction.
+ */
+ insn = htinst | INSN_16BIT_MASK;
+ insn_len = (htinst & BIT(1)) ? INSN_LEN(insn) : 2;
+ } else {
+ /*
+ * Bit[0] == 0 implies trapped instruction value is
+ * zero or special value.
+ */
+ insn = kvm_riscv_vcpu_unpriv_read(vcpu, true, ct->sepc,
+ &utrap);
+ if (utrap.scause) {
+ /* Redirect trap if we failed to read instruction */
+ utrap.sepc = ct->sepc;
+ kvm_riscv_vcpu_trap_redirect(vcpu, &utrap);
+ return 1;
+ }
+ insn_len = INSN_LEN(insn);
+ }
+
+ /* Decode length of MMIO and shift */
+ if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) {
+ len = 4;
+ shift = 8 * (sizeof(ulong) - len);
+ } else if ((insn & INSN_MASK_LB) == INSN_MATCH_LB) {
+ len = 1;
+ shift = 8 * (sizeof(ulong) - len);
+ } else if ((insn & INSN_MASK_LBU) == INSN_MATCH_LBU) {
+ len = 1;
+ shift = 8 * (sizeof(ulong) - len);
+#ifdef CONFIG_64BIT
+ } else if ((insn & INSN_MASK_LD) == INSN_MATCH_LD) {
+ len = 8;
+ shift = 8 * (sizeof(ulong) - len);
+ } else if ((insn & INSN_MASK_LWU) == INSN_MATCH_LWU) {
+ len = 4;
+#endif
+ } else if ((insn & INSN_MASK_LH) == INSN_MATCH_LH) {
+ len = 2;
+ shift = 8 * (sizeof(ulong) - len);
+ } else if ((insn & INSN_MASK_LHU) == INSN_MATCH_LHU) {
+ len = 2;
+#ifdef CONFIG_64BIT
+ } else if ((insn & INSN_MASK_C_LD) == INSN_MATCH_C_LD) {
+ len = 8;
+ shift = 8 * (sizeof(ulong) - len);
+ insn = RVC_RS2S(insn) << SH_RD;
+ } else if ((insn & INSN_MASK_C_LDSP) == INSN_MATCH_C_LDSP &&
+ ((insn >> SH_RD) & 0x1f)) {
+ len = 8;
+ shift = 8 * (sizeof(ulong) - len);
+#endif
+ } else if ((insn & INSN_MASK_C_LW) == INSN_MATCH_C_LW) {
+ len = 4;
+ shift = 8 * (sizeof(ulong) - len);
+ insn = RVC_RS2S(insn) << SH_RD;
+ } else if ((insn & INSN_MASK_C_LWSP) == INSN_MATCH_C_LWSP &&
+ ((insn >> SH_RD) & 0x1f)) {
+ len = 4;
+ shift = 8 * (sizeof(ulong) - len);
+ } else {
+ return -EOPNOTSUPP;
+ }
+
+ /* Fault address should be aligned to length of MMIO */
+ if (fault_addr & (len - 1))
+ return -EIO;
+
+ /* Save instruction decode info */
+ vcpu->arch.mmio_decode.insn = insn;
+ vcpu->arch.mmio_decode.insn_len = insn_len;
+ vcpu->arch.mmio_decode.shift = shift;
+ vcpu->arch.mmio_decode.len = len;
+ vcpu->arch.mmio_decode.return_handled = 0;
+
+ /* Update MMIO details in kvm_run struct */
+ run->mmio.is_write = false;
+ run->mmio.phys_addr = fault_addr;
+ run->mmio.len = len;
+
+ /* Try to handle MMIO access in the kernel */
+ if (!kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_addr, len, data_buf)) {
+ /* Successfully handled MMIO access in the kernel so resume */
+ memcpy(run->mmio.data, data_buf, len);
+ vcpu->stat.mmio_exit_kernel++;
+ kvm_riscv_vcpu_mmio_return(vcpu, run);
+ return 1;
+ }
+
+ /* Exit to userspace for MMIO emulation */
+ vcpu->stat.mmio_exit_user++;
+ run->exit_reason = KVM_EXIT_MMIO;
+
+ return 0;
+}
+
+/**
+ * kvm_riscv_vcpu_mmio_store -- Emulate MMIO store instruction
+ *
+ * @vcpu: The VCPU pointer
+ * @run: The VCPU run struct containing the mmio data
+ * @fault_addr: Guest physical address to store
+ * @htinst: Transformed encoding of the store instruction
+ *
+ * Returns > 0 to continue run-loop
+ * Returns 0 to exit run-loop and handle in user-space.
+ * Returns < 0 to report failure and exit run-loop
+ */
+int kvm_riscv_vcpu_mmio_store(struct kvm_vcpu *vcpu, struct kvm_run *run,
+ unsigned long fault_addr,
+ unsigned long htinst)
+{
+ u8 data8;
+ u16 data16;
+ u32 data32;
+ u64 data64;
+ ulong data;
+ unsigned long insn;
+ int len = 0, insn_len = 0;
+ struct kvm_cpu_trap utrap = { 0 };
+ struct kvm_cpu_context *ct = &vcpu->arch.guest_context;
+
+ /* Determine trapped instruction */
+ if (htinst & 0x1) {
+ /*
+ * Bit[0] == 1 implies trapped instruction value is
+ * transformed instruction or custom instruction.
+ */
+ insn = htinst | INSN_16BIT_MASK;
+ insn_len = (htinst & BIT(1)) ? INSN_LEN(insn) : 2;
+ } else {
+ /*
+ * Bit[0] == 0 implies trapped instruction value is
+ * zero or special value.
+ */
+ insn = kvm_riscv_vcpu_unpriv_read(vcpu, true, ct->sepc,
+ &utrap);
+ if (utrap.scause) {
+ /* Redirect trap if we failed to read instruction */
+ utrap.sepc = ct->sepc;
+ kvm_riscv_vcpu_trap_redirect(vcpu, &utrap);
+ return 1;
+ }
+ insn_len = INSN_LEN(insn);
+ }
+
+ data = GET_RS2(insn, &vcpu->arch.guest_context);
+ data8 = data16 = data32 = data64 = data;
+
+ if ((insn & INSN_MASK_SW) == INSN_MATCH_SW) {
+ len = 4;
+ } else if ((insn & INSN_MASK_SB) == INSN_MATCH_SB) {
+ len = 1;
+#ifdef CONFIG_64BIT
+ } else if ((insn & INSN_MASK_SD) == INSN_MATCH_SD) {
+ len = 8;
+#endif
+ } else if ((insn & INSN_MASK_SH) == INSN_MATCH_SH) {
+ len = 2;
+#ifdef CONFIG_64BIT
+ } else if ((insn & INSN_MASK_C_SD) == INSN_MATCH_C_SD) {
+ len = 8;
+ data64 = GET_RS2S(insn, &vcpu->arch.guest_context);
+ } else if ((insn & INSN_MASK_C_SDSP) == INSN_MATCH_C_SDSP &&
+ ((insn >> SH_RD) & 0x1f)) {
+ len = 8;
+ data64 = GET_RS2C(insn, &vcpu->arch.guest_context);
+#endif
+ } else if ((insn & INSN_MASK_C_SW) == INSN_MATCH_C_SW) {
+ len = 4;
+ data32 = GET_RS2S(insn, &vcpu->arch.guest_context);
+ } else if ((insn & INSN_MASK_C_SWSP) == INSN_MATCH_C_SWSP &&
+ ((insn >> SH_RD) & 0x1f)) {
+ len = 4;
+ data32 = GET_RS2C(insn, &vcpu->arch.guest_context);
+ } else {
+ return -EOPNOTSUPP;
+ }
+
+ /* Fault address should be aligned to length of MMIO */
+ if (fault_addr & (len - 1))
+ return -EIO;
+
+ /* Save instruction decode info */
+ vcpu->arch.mmio_decode.insn = insn;
+ vcpu->arch.mmio_decode.insn_len = insn_len;
+ vcpu->arch.mmio_decode.shift = 0;
+ vcpu->arch.mmio_decode.len = len;
+ vcpu->arch.mmio_decode.return_handled = 0;
+
+ /* Copy data to kvm_run instance */
+ switch (len) {
+ case 1:
+ *((u8 *)run->mmio.data) = data8;
+ break;
+ case 2:
+ *((u16 *)run->mmio.data) = data16;
+ break;
+ case 4:
+ *((u32 *)run->mmio.data) = data32;
+ break;
+ case 8:
+ *((u64 *)run->mmio.data) = data64;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ /* Update MMIO details in kvm_run struct */
+ run->mmio.is_write = true;
+ run->mmio.phys_addr = fault_addr;
+ run->mmio.len = len;
+
+ /* Try to handle MMIO access in the kernel */
+ if (!kvm_io_bus_write(vcpu, KVM_MMIO_BUS,
+ fault_addr, len, run->mmio.data)) {
+ /* Successfully handled MMIO access in the kernel so resume */
+ vcpu->stat.mmio_exit_kernel++;
+ kvm_riscv_vcpu_mmio_return(vcpu, run);
+ return 1;
+ }
+
+ /* Exit to userspace for MMIO emulation */
+ vcpu->stat.mmio_exit_user++;
+ run->exit_reason = KVM_EXIT_MMIO;
+
+ return 0;
+}
+
+/**
+ * kvm_riscv_vcpu_mmio_return -- Handle MMIO loads after user space emulation
+ * or in-kernel IO emulation
+ *
+ * @vcpu: The VCPU pointer
+ * @run: The VCPU run struct containing the mmio data
+ */
+int kvm_riscv_vcpu_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+ u8 data8;
+ u16 data16;
+ u32 data32;
+ u64 data64;
+ ulong insn;
+ int len, shift;
+
+ if (vcpu->arch.mmio_decode.return_handled)
+ return 0;
+
+ vcpu->arch.mmio_decode.return_handled = 1;
+ insn = vcpu->arch.mmio_decode.insn;
+
+ if (run->mmio.is_write)
+ goto done;
+
+ len = vcpu->arch.mmio_decode.len;
+ shift = vcpu->arch.mmio_decode.shift;
+
+ switch (len) {
+ case 1:
+ data8 = *((u8 *)run->mmio.data);
+ SET_RD(insn, &vcpu->arch.guest_context,
+ (ulong)data8 << shift >> shift);
+ break;
+ case 2:
+ data16 = *((u16 *)run->mmio.data);
+ SET_RD(insn, &vcpu->arch.guest_context,
+ (ulong)data16 << shift >> shift);
+ break;
+ case 4:
+ data32 = *((u32 *)run->mmio.data);
+ SET_RD(insn, &vcpu->arch.guest_context,
+ (ulong)data32 << shift >> shift);
+ break;
+ case 8:
+ data64 = *((u64 *)run->mmio.data);
+ SET_RD(insn, &vcpu->arch.guest_context,
+ (ulong)data64 << shift >> shift);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+done:
+ /* Move to next instruction */
+ vcpu->arch.guest_context.sepc += vcpu->arch.mmio_decode.insn_len;
+
+ return 0;
+}
diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c
new file mode 100644
index 000000000..b7e0e03c6
--- /dev/null
+++ b/arch/riscv/kvm/vcpu_onereg.c
@@ -0,0 +1,1054 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2023 Ventana Micro Systems Inc.
+ *
+ * Authors:
+ * Anup Patel <apatel@ventanamicro.com>
+ */
+
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/uaccess.h>
+#include <linux/kvm_host.h>
+#include <asm/cacheflush.h>
+#include <asm/hwcap.h>
+#include <asm/kvm_vcpu_vector.h>
+#include <asm/vector.h>
+
+#define KVM_RISCV_BASE_ISA_MASK GENMASK(25, 0)
+
+#define KVM_ISA_EXT_ARR(ext) \
+[KVM_RISCV_ISA_EXT_##ext] = RISCV_ISA_EXT_##ext
+
+/* Mapping between KVM ISA Extension ID & Host ISA extension ID */
+static const unsigned long kvm_isa_ext_arr[] = {
+ /* Single letter extensions (alphabetically sorted) */
+ [KVM_RISCV_ISA_EXT_A] = RISCV_ISA_EXT_a,
+ [KVM_RISCV_ISA_EXT_C] = RISCV_ISA_EXT_c,
+ [KVM_RISCV_ISA_EXT_D] = RISCV_ISA_EXT_d,
+ [KVM_RISCV_ISA_EXT_F] = RISCV_ISA_EXT_f,
+ [KVM_RISCV_ISA_EXT_H] = RISCV_ISA_EXT_h,
+ [KVM_RISCV_ISA_EXT_I] = RISCV_ISA_EXT_i,
+ [KVM_RISCV_ISA_EXT_M] = RISCV_ISA_EXT_m,
+ [KVM_RISCV_ISA_EXT_V] = RISCV_ISA_EXT_v,
+ /* Multi letter extensions (alphabetically sorted) */
+ KVM_ISA_EXT_ARR(SSAIA),
+ KVM_ISA_EXT_ARR(SSTC),
+ KVM_ISA_EXT_ARR(SVINVAL),
+ KVM_ISA_EXT_ARR(SVNAPOT),
+ KVM_ISA_EXT_ARR(SVPBMT),
+ KVM_ISA_EXT_ARR(ZBA),
+ KVM_ISA_EXT_ARR(ZBB),
+ KVM_ISA_EXT_ARR(ZBS),
+ KVM_ISA_EXT_ARR(ZICBOM),
+ KVM_ISA_EXT_ARR(ZICBOZ),
+ KVM_ISA_EXT_ARR(ZICNTR),
+ KVM_ISA_EXT_ARR(ZICSR),
+ KVM_ISA_EXT_ARR(ZIFENCEI),
+ KVM_ISA_EXT_ARR(ZIHINTPAUSE),
+ KVM_ISA_EXT_ARR(ZIHPM),
+};
+
+static unsigned long kvm_riscv_vcpu_base2isa_ext(unsigned long base_ext)
+{
+ unsigned long i;
+
+ for (i = 0; i < KVM_RISCV_ISA_EXT_MAX; i++) {
+ if (kvm_isa_ext_arr[i] == base_ext)
+ return i;
+ }
+
+ return KVM_RISCV_ISA_EXT_MAX;
+}
+
+static bool kvm_riscv_vcpu_isa_enable_allowed(unsigned long ext)
+{
+ switch (ext) {
+ case KVM_RISCV_ISA_EXT_H:
+ return false;
+ case KVM_RISCV_ISA_EXT_V:
+ return riscv_v_vstate_ctrl_user_allowed();
+ default:
+ break;
+ }
+
+ return true;
+}
+
+static bool kvm_riscv_vcpu_isa_disable_allowed(unsigned long ext)
+{
+ switch (ext) {
+ case KVM_RISCV_ISA_EXT_A:
+ case KVM_RISCV_ISA_EXT_C:
+ case KVM_RISCV_ISA_EXT_I:
+ case KVM_RISCV_ISA_EXT_M:
+ case KVM_RISCV_ISA_EXT_SSAIA:
+ case KVM_RISCV_ISA_EXT_SSTC:
+ case KVM_RISCV_ISA_EXT_SVINVAL:
+ case KVM_RISCV_ISA_EXT_SVNAPOT:
+ case KVM_RISCV_ISA_EXT_ZBA:
+ case KVM_RISCV_ISA_EXT_ZBB:
+ case KVM_RISCV_ISA_EXT_ZBS:
+ case KVM_RISCV_ISA_EXT_ZICNTR:
+ case KVM_RISCV_ISA_EXT_ZICSR:
+ case KVM_RISCV_ISA_EXT_ZIFENCEI:
+ case KVM_RISCV_ISA_EXT_ZIHINTPAUSE:
+ case KVM_RISCV_ISA_EXT_ZIHPM:
+ return false;
+ default:
+ break;
+ }
+
+ return true;
+}
+
+void kvm_riscv_vcpu_setup_isa(struct kvm_vcpu *vcpu)
+{
+ unsigned long host_isa, i;
+
+ for (i = 0; i < ARRAY_SIZE(kvm_isa_ext_arr); i++) {
+ host_isa = kvm_isa_ext_arr[i];
+ if (__riscv_isa_extension_available(NULL, host_isa) &&
+ kvm_riscv_vcpu_isa_enable_allowed(i))
+ set_bit(host_isa, vcpu->arch.isa);
+ }
+}
+
+static int kvm_riscv_vcpu_get_reg_config(struct kvm_vcpu *vcpu,
+ const struct kvm_one_reg *reg)
+{
+ unsigned long __user *uaddr =
+ (unsigned long __user *)(unsigned long)reg->addr;
+ unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+ KVM_REG_SIZE_MASK |
+ KVM_REG_RISCV_CONFIG);
+ unsigned long reg_val;
+
+ if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
+ return -EINVAL;
+
+ switch (reg_num) {
+ case KVM_REG_RISCV_CONFIG_REG(isa):
+ reg_val = vcpu->arch.isa[0] & KVM_RISCV_BASE_ISA_MASK;
+ break;
+ case KVM_REG_RISCV_CONFIG_REG(zicbom_block_size):
+ if (!riscv_isa_extension_available(vcpu->arch.isa, ZICBOM))
+ return -ENOENT;
+ reg_val = riscv_cbom_block_size;
+ break;
+ case KVM_REG_RISCV_CONFIG_REG(zicboz_block_size):
+ if (!riscv_isa_extension_available(vcpu->arch.isa, ZICBOZ))
+ return -ENOENT;
+ reg_val = riscv_cboz_block_size;
+ break;
+ case KVM_REG_RISCV_CONFIG_REG(mvendorid):
+ reg_val = vcpu->arch.mvendorid;
+ break;
+ case KVM_REG_RISCV_CONFIG_REG(marchid):
+ reg_val = vcpu->arch.marchid;
+ break;
+ case KVM_REG_RISCV_CONFIG_REG(mimpid):
+ reg_val = vcpu->arch.mimpid;
+ break;
+ case KVM_REG_RISCV_CONFIG_REG(satp_mode):
+ reg_val = satp_mode >> SATP_MODE_SHIFT;
+ break;
+ default:
+ return -ENOENT;
+ }
+
+ if (copy_to_user(uaddr, &reg_val, KVM_REG_SIZE(reg->id)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int kvm_riscv_vcpu_set_reg_config(struct kvm_vcpu *vcpu,
+ const struct kvm_one_reg *reg)
+{
+ unsigned long __user *uaddr =
+ (unsigned long __user *)(unsigned long)reg->addr;
+ unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+ KVM_REG_SIZE_MASK |
+ KVM_REG_RISCV_CONFIG);
+ unsigned long i, isa_ext, reg_val;
+
+ if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
+ return -EINVAL;
+
+ if (copy_from_user(&reg_val, uaddr, KVM_REG_SIZE(reg->id)))
+ return -EFAULT;
+
+ switch (reg_num) {
+ case KVM_REG_RISCV_CONFIG_REG(isa):
+ /*
+ * This ONE REG interface is only defined for
+ * single letter extensions.
+ */
+ if (fls(reg_val) >= RISCV_ISA_EXT_BASE)
+ return -EINVAL;
+
+ /*
+ * Return early (i.e. do nothing) if reg_val is the same
+ * value retrievable via kvm_riscv_vcpu_get_reg_config().
+ */
+ if (reg_val == (vcpu->arch.isa[0] & KVM_RISCV_BASE_ISA_MASK))
+ break;
+
+ if (!vcpu->arch.ran_atleast_once) {
+ /* Ignore the enable/disable request for certain extensions */
+ for (i = 0; i < RISCV_ISA_EXT_BASE; i++) {
+ isa_ext = kvm_riscv_vcpu_base2isa_ext(i);
+ if (isa_ext >= KVM_RISCV_ISA_EXT_MAX) {
+ reg_val &= ~BIT(i);
+ continue;
+ }
+ if (!kvm_riscv_vcpu_isa_enable_allowed(isa_ext))
+ if (reg_val & BIT(i))
+ reg_val &= ~BIT(i);
+ if (!kvm_riscv_vcpu_isa_disable_allowed(isa_ext))
+ if (!(reg_val & BIT(i)))
+ reg_val |= BIT(i);
+ }
+ reg_val &= riscv_isa_extension_base(NULL);
+ /* Do not modify anything beyond single letter extensions */
+ reg_val = (vcpu->arch.isa[0] & ~KVM_RISCV_BASE_ISA_MASK) |
+ (reg_val & KVM_RISCV_BASE_ISA_MASK);
+ vcpu->arch.isa[0] = reg_val;
+ kvm_riscv_vcpu_fp_reset(vcpu);
+ } else {
+ return -EBUSY;
+ }
+ break;
+ case KVM_REG_RISCV_CONFIG_REG(zicbom_block_size):
+ if (!riscv_isa_extension_available(vcpu->arch.isa, ZICBOM))
+ return -ENOENT;
+ if (reg_val != riscv_cbom_block_size)
+ return -EINVAL;
+ break;
+ case KVM_REG_RISCV_CONFIG_REG(zicboz_block_size):
+ if (!riscv_isa_extension_available(vcpu->arch.isa, ZICBOZ))
+ return -ENOENT;
+ if (reg_val != riscv_cboz_block_size)
+ return -EINVAL;
+ break;
+ case KVM_REG_RISCV_CONFIG_REG(mvendorid):
+ if (reg_val == vcpu->arch.mvendorid)
+ break;
+ if (!vcpu->arch.ran_atleast_once)
+ vcpu->arch.mvendorid = reg_val;
+ else
+ return -EBUSY;
+ break;
+ case KVM_REG_RISCV_CONFIG_REG(marchid):
+ if (reg_val == vcpu->arch.marchid)
+ break;
+ if (!vcpu->arch.ran_atleast_once)
+ vcpu->arch.marchid = reg_val;
+ else
+ return -EBUSY;
+ break;
+ case KVM_REG_RISCV_CONFIG_REG(mimpid):
+ if (reg_val == vcpu->arch.mimpid)
+ break;
+ if (!vcpu->arch.ran_atleast_once)
+ vcpu->arch.mimpid = reg_val;
+ else
+ return -EBUSY;
+ break;
+ case KVM_REG_RISCV_CONFIG_REG(satp_mode):
+ if (reg_val != (satp_mode >> SATP_MODE_SHIFT))
+ return -EINVAL;
+ break;
+ default:
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+static int kvm_riscv_vcpu_get_reg_core(struct kvm_vcpu *vcpu,
+ const struct kvm_one_reg *reg)
+{
+ struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
+ unsigned long __user *uaddr =
+ (unsigned long __user *)(unsigned long)reg->addr;
+ unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+ KVM_REG_SIZE_MASK |
+ KVM_REG_RISCV_CORE);
+ unsigned long reg_val;
+
+ if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
+ return -EINVAL;
+ if (reg_num >= sizeof(struct kvm_riscv_core) / sizeof(unsigned long))
+ return -ENOENT;
+
+ if (reg_num == KVM_REG_RISCV_CORE_REG(regs.pc))
+ reg_val = cntx->sepc;
+ else if (KVM_REG_RISCV_CORE_REG(regs.pc) < reg_num &&
+ reg_num <= KVM_REG_RISCV_CORE_REG(regs.t6))
+ reg_val = ((unsigned long *)cntx)[reg_num];
+ else if (reg_num == KVM_REG_RISCV_CORE_REG(mode))
+ reg_val = (cntx->sstatus & SR_SPP) ?
+ KVM_RISCV_MODE_S : KVM_RISCV_MODE_U;
+ else
+ return -ENOENT;
+
+ if (copy_to_user(uaddr, &reg_val, KVM_REG_SIZE(reg->id)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int kvm_riscv_vcpu_set_reg_core(struct kvm_vcpu *vcpu,
+ const struct kvm_one_reg *reg)
+{
+ struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
+ unsigned long __user *uaddr =
+ (unsigned long __user *)(unsigned long)reg->addr;
+ unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+ KVM_REG_SIZE_MASK |
+ KVM_REG_RISCV_CORE);
+ unsigned long reg_val;
+
+ if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
+ return -EINVAL;
+ if (reg_num >= sizeof(struct kvm_riscv_core) / sizeof(unsigned long))
+ return -ENOENT;
+
+ if (copy_from_user(&reg_val, uaddr, KVM_REG_SIZE(reg->id)))
+ return -EFAULT;
+
+ if (reg_num == KVM_REG_RISCV_CORE_REG(regs.pc))
+ cntx->sepc = reg_val;
+ else if (KVM_REG_RISCV_CORE_REG(regs.pc) < reg_num &&
+ reg_num <= KVM_REG_RISCV_CORE_REG(regs.t6))
+ ((unsigned long *)cntx)[reg_num] = reg_val;
+ else if (reg_num == KVM_REG_RISCV_CORE_REG(mode)) {
+ if (reg_val == KVM_RISCV_MODE_S)
+ cntx->sstatus |= SR_SPP;
+ else
+ cntx->sstatus &= ~SR_SPP;
+ } else
+ return -ENOENT;
+
+ return 0;
+}
+
+static int kvm_riscv_vcpu_general_get_csr(struct kvm_vcpu *vcpu,
+ unsigned long reg_num,
+ unsigned long *out_val)
+{
+ struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr;
+
+ if (reg_num >= sizeof(struct kvm_riscv_csr) / sizeof(unsigned long))
+ return -ENOENT;
+
+ if (reg_num == KVM_REG_RISCV_CSR_REG(sip)) {
+ kvm_riscv_vcpu_flush_interrupts(vcpu);
+ *out_val = (csr->hvip >> VSIP_TO_HVIP_SHIFT) & VSIP_VALID_MASK;
+ *out_val |= csr->hvip & ~IRQ_LOCAL_MASK;
+ } else
+ *out_val = ((unsigned long *)csr)[reg_num];
+
+ return 0;
+}
+
+static int kvm_riscv_vcpu_general_set_csr(struct kvm_vcpu *vcpu,
+ unsigned long reg_num,
+ unsigned long reg_val)
+{
+ struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr;
+
+ if (reg_num >= sizeof(struct kvm_riscv_csr) / sizeof(unsigned long))
+ return -ENOENT;
+
+ if (reg_num == KVM_REG_RISCV_CSR_REG(sip)) {
+ reg_val &= VSIP_VALID_MASK;
+ reg_val <<= VSIP_TO_HVIP_SHIFT;
+ }
+
+ ((unsigned long *)csr)[reg_num] = reg_val;
+
+ if (reg_num == KVM_REG_RISCV_CSR_REG(sip))
+ WRITE_ONCE(vcpu->arch.irqs_pending_mask[0], 0);
+
+ return 0;
+}
+
+static int kvm_riscv_vcpu_get_reg_csr(struct kvm_vcpu *vcpu,
+ const struct kvm_one_reg *reg)
+{
+ int rc;
+ unsigned long __user *uaddr =
+ (unsigned long __user *)(unsigned long)reg->addr;
+ unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+ KVM_REG_SIZE_MASK |
+ KVM_REG_RISCV_CSR);
+ unsigned long reg_val, reg_subtype;
+
+ if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
+ return -EINVAL;
+
+ reg_subtype = reg_num & KVM_REG_RISCV_SUBTYPE_MASK;
+ reg_num &= ~KVM_REG_RISCV_SUBTYPE_MASK;
+ switch (reg_subtype) {
+ case KVM_REG_RISCV_CSR_GENERAL:
+ rc = kvm_riscv_vcpu_general_get_csr(vcpu, reg_num, &reg_val);
+ break;
+ case KVM_REG_RISCV_CSR_AIA:
+ rc = kvm_riscv_vcpu_aia_get_csr(vcpu, reg_num, &reg_val);
+ break;
+ default:
+ rc = -ENOENT;
+ break;
+ }
+ if (rc)
+ return rc;
+
+ if (copy_to_user(uaddr, &reg_val, KVM_REG_SIZE(reg->id)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int kvm_riscv_vcpu_set_reg_csr(struct kvm_vcpu *vcpu,
+ const struct kvm_one_reg *reg)
+{
+ int rc;
+ unsigned long __user *uaddr =
+ (unsigned long __user *)(unsigned long)reg->addr;
+ unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+ KVM_REG_SIZE_MASK |
+ KVM_REG_RISCV_CSR);
+ unsigned long reg_val, reg_subtype;
+
+ if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
+ return -EINVAL;
+
+ if (copy_from_user(&reg_val, uaddr, KVM_REG_SIZE(reg->id)))
+ return -EFAULT;
+
+ reg_subtype = reg_num & KVM_REG_RISCV_SUBTYPE_MASK;
+ reg_num &= ~KVM_REG_RISCV_SUBTYPE_MASK;
+ switch (reg_subtype) {
+ case KVM_REG_RISCV_CSR_GENERAL:
+ rc = kvm_riscv_vcpu_general_set_csr(vcpu, reg_num, reg_val);
+ break;
+ case KVM_REG_RISCV_CSR_AIA:
+ rc = kvm_riscv_vcpu_aia_set_csr(vcpu, reg_num, reg_val);
+ break;
+ default:
+ rc = -ENOENT;
+ break;
+ }
+ if (rc)
+ return rc;
+
+ return 0;
+}
+
+static int riscv_vcpu_get_isa_ext_single(struct kvm_vcpu *vcpu,
+ unsigned long reg_num,
+ unsigned long *reg_val)
+{
+ unsigned long host_isa_ext;
+
+ if (reg_num >= KVM_RISCV_ISA_EXT_MAX ||
+ reg_num >= ARRAY_SIZE(kvm_isa_ext_arr))
+ return -ENOENT;
+
+ host_isa_ext = kvm_isa_ext_arr[reg_num];
+ if (!__riscv_isa_extension_available(NULL, host_isa_ext))
+ return -ENOENT;
+
+ *reg_val = 0;
+ if (__riscv_isa_extension_available(vcpu->arch.isa, host_isa_ext))
+ *reg_val = 1; /* Mark the given extension as available */
+
+ return 0;
+}
+
+static int riscv_vcpu_set_isa_ext_single(struct kvm_vcpu *vcpu,
+ unsigned long reg_num,
+ unsigned long reg_val)
+{
+ unsigned long host_isa_ext;
+
+ if (reg_num >= KVM_RISCV_ISA_EXT_MAX ||
+ reg_num >= ARRAY_SIZE(kvm_isa_ext_arr))
+ return -ENOENT;
+
+ host_isa_ext = kvm_isa_ext_arr[reg_num];
+ if (!__riscv_isa_extension_available(NULL, host_isa_ext))
+ return -ENOENT;
+
+ if (reg_val == test_bit(host_isa_ext, vcpu->arch.isa))
+ return 0;
+
+ if (!vcpu->arch.ran_atleast_once) {
+ /*
+ * All multi-letter extension and a few single letter
+ * extension can be disabled
+ */
+ if (reg_val == 1 &&
+ kvm_riscv_vcpu_isa_enable_allowed(reg_num))
+ set_bit(host_isa_ext, vcpu->arch.isa);
+ else if (!reg_val &&
+ kvm_riscv_vcpu_isa_disable_allowed(reg_num))
+ clear_bit(host_isa_ext, vcpu->arch.isa);
+ else
+ return -EINVAL;
+ kvm_riscv_vcpu_fp_reset(vcpu);
+ } else {
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+static int riscv_vcpu_get_isa_ext_multi(struct kvm_vcpu *vcpu,
+ unsigned long reg_num,
+ unsigned long *reg_val)
+{
+ unsigned long i, ext_id, ext_val;
+
+ if (reg_num > KVM_REG_RISCV_ISA_MULTI_REG_LAST)
+ return -ENOENT;
+
+ for (i = 0; i < BITS_PER_LONG; i++) {
+ ext_id = i + reg_num * BITS_PER_LONG;
+ if (ext_id >= KVM_RISCV_ISA_EXT_MAX)
+ break;
+
+ ext_val = 0;
+ riscv_vcpu_get_isa_ext_single(vcpu, ext_id, &ext_val);
+ if (ext_val)
+ *reg_val |= KVM_REG_RISCV_ISA_MULTI_MASK(ext_id);
+ }
+
+ return 0;
+}
+
+static int riscv_vcpu_set_isa_ext_multi(struct kvm_vcpu *vcpu,
+ unsigned long reg_num,
+ unsigned long reg_val, bool enable)
+{
+ unsigned long i, ext_id;
+
+ if (reg_num > KVM_REG_RISCV_ISA_MULTI_REG_LAST)
+ return -ENOENT;
+
+ for_each_set_bit(i, &reg_val, BITS_PER_LONG) {
+ ext_id = i + reg_num * BITS_PER_LONG;
+ if (ext_id >= KVM_RISCV_ISA_EXT_MAX)
+ break;
+
+ riscv_vcpu_set_isa_ext_single(vcpu, ext_id, enable);
+ }
+
+ return 0;
+}
+
+static int kvm_riscv_vcpu_get_reg_isa_ext(struct kvm_vcpu *vcpu,
+ const struct kvm_one_reg *reg)
+{
+ int rc;
+ unsigned long __user *uaddr =
+ (unsigned long __user *)(unsigned long)reg->addr;
+ unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+ KVM_REG_SIZE_MASK |
+ KVM_REG_RISCV_ISA_EXT);
+ unsigned long reg_val, reg_subtype;
+
+ if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
+ return -EINVAL;
+
+ reg_subtype = reg_num & KVM_REG_RISCV_SUBTYPE_MASK;
+ reg_num &= ~KVM_REG_RISCV_SUBTYPE_MASK;
+
+ reg_val = 0;
+ switch (reg_subtype) {
+ case KVM_REG_RISCV_ISA_SINGLE:
+ rc = riscv_vcpu_get_isa_ext_single(vcpu, reg_num, &reg_val);
+ break;
+ case KVM_REG_RISCV_ISA_MULTI_EN:
+ case KVM_REG_RISCV_ISA_MULTI_DIS:
+ rc = riscv_vcpu_get_isa_ext_multi(vcpu, reg_num, &reg_val);
+ if (!rc && reg_subtype == KVM_REG_RISCV_ISA_MULTI_DIS)
+ reg_val = ~reg_val;
+ break;
+ default:
+ rc = -ENOENT;
+ }
+ if (rc)
+ return rc;
+
+ if (copy_to_user(uaddr, &reg_val, KVM_REG_SIZE(reg->id)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int kvm_riscv_vcpu_set_reg_isa_ext(struct kvm_vcpu *vcpu,
+ const struct kvm_one_reg *reg)
+{
+ unsigned long __user *uaddr =
+ (unsigned long __user *)(unsigned long)reg->addr;
+ unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+ KVM_REG_SIZE_MASK |
+ KVM_REG_RISCV_ISA_EXT);
+ unsigned long reg_val, reg_subtype;
+
+ if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
+ return -EINVAL;
+
+ reg_subtype = reg_num & KVM_REG_RISCV_SUBTYPE_MASK;
+ reg_num &= ~KVM_REG_RISCV_SUBTYPE_MASK;
+
+ if (copy_from_user(&reg_val, uaddr, KVM_REG_SIZE(reg->id)))
+ return -EFAULT;
+
+ switch (reg_subtype) {
+ case KVM_REG_RISCV_ISA_SINGLE:
+ return riscv_vcpu_set_isa_ext_single(vcpu, reg_num, reg_val);
+ case KVM_REG_RISCV_SBI_MULTI_EN:
+ return riscv_vcpu_set_isa_ext_multi(vcpu, reg_num, reg_val, true);
+ case KVM_REG_RISCV_SBI_MULTI_DIS:
+ return riscv_vcpu_set_isa_ext_multi(vcpu, reg_num, reg_val, false);
+ default:
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+static int copy_config_reg_indices(const struct kvm_vcpu *vcpu,
+ u64 __user *uindices)
+{
+ int n = 0;
+
+ for (int i = 0; i < sizeof(struct kvm_riscv_config)/sizeof(unsigned long);
+ i++) {
+ u64 size;
+ u64 reg;
+
+ /*
+ * Avoid reporting config reg if the corresponding extension
+ * was not available.
+ */
+ if (i == KVM_REG_RISCV_CONFIG_REG(zicbom_block_size) &&
+ !riscv_isa_extension_available(vcpu->arch.isa, ZICBOM))
+ continue;
+ else if (i == KVM_REG_RISCV_CONFIG_REG(zicboz_block_size) &&
+ !riscv_isa_extension_available(vcpu->arch.isa, ZICBOZ))
+ continue;
+
+ size = IS_ENABLED(CONFIG_32BIT) ? KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64;
+ reg = KVM_REG_RISCV | size | KVM_REG_RISCV_CONFIG | i;
+
+ if (uindices) {
+ if (put_user(reg, uindices))
+ return -EFAULT;
+ uindices++;
+ }
+
+ n++;
+ }
+
+ return n;
+}
+
+static unsigned long num_config_regs(const struct kvm_vcpu *vcpu)
+{
+ return copy_config_reg_indices(vcpu, NULL);
+}
+
+static inline unsigned long num_core_regs(void)
+{
+ return sizeof(struct kvm_riscv_core) / sizeof(unsigned long);
+}
+
+static int copy_core_reg_indices(u64 __user *uindices)
+{
+ int n = num_core_regs();
+
+ for (int i = 0; i < n; i++) {
+ u64 size = IS_ENABLED(CONFIG_32BIT) ?
+ KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64;
+ u64 reg = KVM_REG_RISCV | size | KVM_REG_RISCV_CORE | i;
+
+ if (uindices) {
+ if (put_user(reg, uindices))
+ return -EFAULT;
+ uindices++;
+ }
+ }
+
+ return n;
+}
+
+static inline unsigned long num_csr_regs(const struct kvm_vcpu *vcpu)
+{
+ unsigned long n = sizeof(struct kvm_riscv_csr) / sizeof(unsigned long);
+
+ if (riscv_isa_extension_available(vcpu->arch.isa, SSAIA))
+ n += sizeof(struct kvm_riscv_aia_csr) / sizeof(unsigned long);
+
+ return n;
+}
+
+static int copy_csr_reg_indices(const struct kvm_vcpu *vcpu,
+ u64 __user *uindices)
+{
+ int n1 = sizeof(struct kvm_riscv_csr) / sizeof(unsigned long);
+ int n2 = 0;
+
+ /* copy general csr regs */
+ for (int i = 0; i < n1; i++) {
+ u64 size = IS_ENABLED(CONFIG_32BIT) ?
+ KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64;
+ u64 reg = KVM_REG_RISCV | size | KVM_REG_RISCV_CSR |
+ KVM_REG_RISCV_CSR_GENERAL | i;
+
+ if (uindices) {
+ if (put_user(reg, uindices))
+ return -EFAULT;
+ uindices++;
+ }
+ }
+
+ /* copy AIA csr regs */
+ if (riscv_isa_extension_available(vcpu->arch.isa, SSAIA)) {
+ n2 = sizeof(struct kvm_riscv_aia_csr) / sizeof(unsigned long);
+
+ for (int i = 0; i < n2; i++) {
+ u64 size = IS_ENABLED(CONFIG_32BIT) ?
+ KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64;
+ u64 reg = KVM_REG_RISCV | size | KVM_REG_RISCV_CSR |
+ KVM_REG_RISCV_CSR_AIA | i;
+
+ if (uindices) {
+ if (put_user(reg, uindices))
+ return -EFAULT;
+ uindices++;
+ }
+ }
+ }
+
+ return n1 + n2;
+}
+
+static inline unsigned long num_timer_regs(void)
+{
+ return sizeof(struct kvm_riscv_timer) / sizeof(u64);
+}
+
+static int copy_timer_reg_indices(u64 __user *uindices)
+{
+ int n = num_timer_regs();
+
+ for (int i = 0; i < n; i++) {
+ u64 reg = KVM_REG_RISCV | KVM_REG_SIZE_U64 |
+ KVM_REG_RISCV_TIMER | i;
+
+ if (uindices) {
+ if (put_user(reg, uindices))
+ return -EFAULT;
+ uindices++;
+ }
+ }
+
+ return n;
+}
+
+static inline unsigned long num_fp_f_regs(const struct kvm_vcpu *vcpu)
+{
+ const struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
+
+ if (riscv_isa_extension_available(vcpu->arch.isa, f))
+ return sizeof(cntx->fp.f) / sizeof(u32);
+ else
+ return 0;
+}
+
+static int copy_fp_f_reg_indices(const struct kvm_vcpu *vcpu,
+ u64 __user *uindices)
+{
+ int n = num_fp_f_regs(vcpu);
+
+ for (int i = 0; i < n; i++) {
+ u64 reg = KVM_REG_RISCV | KVM_REG_SIZE_U32 |
+ KVM_REG_RISCV_FP_F | i;
+
+ if (uindices) {
+ if (put_user(reg, uindices))
+ return -EFAULT;
+ uindices++;
+ }
+ }
+
+ return n;
+}
+
+static inline unsigned long num_fp_d_regs(const struct kvm_vcpu *vcpu)
+{
+ const struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
+
+ if (riscv_isa_extension_available(vcpu->arch.isa, d))
+ return sizeof(cntx->fp.d.f) / sizeof(u64) + 1;
+ else
+ return 0;
+}
+
+static int copy_fp_d_reg_indices(const struct kvm_vcpu *vcpu,
+ u64 __user *uindices)
+{
+ int i;
+ int n = num_fp_d_regs(vcpu);
+ u64 reg;
+
+ /* copy fp.d.f indices */
+ for (i = 0; i < n-1; i++) {
+ reg = KVM_REG_RISCV | KVM_REG_SIZE_U64 |
+ KVM_REG_RISCV_FP_D | i;
+
+ if (uindices) {
+ if (put_user(reg, uindices))
+ return -EFAULT;
+ uindices++;
+ }
+ }
+
+ /* copy fp.d.fcsr indices */
+ reg = KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_D | i;
+ if (uindices) {
+ if (put_user(reg, uindices))
+ return -EFAULT;
+ uindices++;
+ }
+
+ return n;
+}
+
+static int copy_isa_ext_reg_indices(const struct kvm_vcpu *vcpu,
+ u64 __user *uindices)
+{
+ unsigned int n = 0;
+ unsigned long isa_ext;
+
+ for (int i = 0; i < KVM_RISCV_ISA_EXT_MAX; i++) {
+ u64 size = IS_ENABLED(CONFIG_32BIT) ?
+ KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64;
+ u64 reg = KVM_REG_RISCV | size | KVM_REG_RISCV_ISA_EXT | i;
+
+ isa_ext = kvm_isa_ext_arr[i];
+ if (!__riscv_isa_extension_available(NULL, isa_ext))
+ continue;
+
+ if (uindices) {
+ if (put_user(reg, uindices))
+ return -EFAULT;
+ uindices++;
+ }
+
+ n++;
+ }
+
+ return n;
+}
+
+static inline unsigned long num_isa_ext_regs(const struct kvm_vcpu *vcpu)
+{
+ return copy_isa_ext_reg_indices(vcpu, NULL);;
+}
+
+static inline unsigned long num_sbi_ext_regs(void)
+{
+ /*
+ * number of KVM_REG_RISCV_SBI_SINGLE +
+ * 2 x (number of KVM_REG_RISCV_SBI_MULTI)
+ */
+ return KVM_RISCV_SBI_EXT_MAX + 2*(KVM_REG_RISCV_SBI_MULTI_REG_LAST+1);
+}
+
+static int copy_sbi_ext_reg_indices(u64 __user *uindices)
+{
+ int n;
+
+ /* copy KVM_REG_RISCV_SBI_SINGLE */
+ n = KVM_RISCV_SBI_EXT_MAX;
+ for (int i = 0; i < n; i++) {
+ u64 size = IS_ENABLED(CONFIG_32BIT) ?
+ KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64;
+ u64 reg = KVM_REG_RISCV | size | KVM_REG_RISCV_SBI_EXT |
+ KVM_REG_RISCV_SBI_SINGLE | i;
+
+ if (uindices) {
+ if (put_user(reg, uindices))
+ return -EFAULT;
+ uindices++;
+ }
+ }
+
+ /* copy KVM_REG_RISCV_SBI_MULTI */
+ n = KVM_REG_RISCV_SBI_MULTI_REG_LAST + 1;
+ for (int i = 0; i < n; i++) {
+ u64 size = IS_ENABLED(CONFIG_32BIT) ?
+ KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64;
+ u64 reg = KVM_REG_RISCV | size | KVM_REG_RISCV_SBI_EXT |
+ KVM_REG_RISCV_SBI_MULTI_EN | i;
+
+ if (uindices) {
+ if (put_user(reg, uindices))
+ return -EFAULT;
+ uindices++;
+ }
+
+ reg = KVM_REG_RISCV | size | KVM_REG_RISCV_SBI_EXT |
+ KVM_REG_RISCV_SBI_MULTI_DIS | i;
+
+ if (uindices) {
+ if (put_user(reg, uindices))
+ return -EFAULT;
+ uindices++;
+ }
+ }
+
+ return num_sbi_ext_regs();
+}
+
+/*
+ * kvm_riscv_vcpu_num_regs - how many registers do we present via KVM_GET/SET_ONE_REG
+ *
+ * This is for all registers.
+ */
+unsigned long kvm_riscv_vcpu_num_regs(struct kvm_vcpu *vcpu)
+{
+ unsigned long res = 0;
+
+ res += num_config_regs(vcpu);
+ res += num_core_regs();
+ res += num_csr_regs(vcpu);
+ res += num_timer_regs();
+ res += num_fp_f_regs(vcpu);
+ res += num_fp_d_regs(vcpu);
+ res += num_isa_ext_regs(vcpu);
+ res += num_sbi_ext_regs();
+
+ return res;
+}
+
+/*
+ * kvm_riscv_vcpu_copy_reg_indices - get indices of all registers.
+ */
+int kvm_riscv_vcpu_copy_reg_indices(struct kvm_vcpu *vcpu,
+ u64 __user *uindices)
+{
+ int ret;
+
+ ret = copy_config_reg_indices(vcpu, uindices);
+ if (ret < 0)
+ return ret;
+ uindices += ret;
+
+ ret = copy_core_reg_indices(uindices);
+ if (ret < 0)
+ return ret;
+ uindices += ret;
+
+ ret = copy_csr_reg_indices(vcpu, uindices);
+ if (ret < 0)
+ return ret;
+ uindices += ret;
+
+ ret = copy_timer_reg_indices(uindices);
+ if (ret < 0)
+ return ret;
+ uindices += ret;
+
+ ret = copy_fp_f_reg_indices(vcpu, uindices);
+ if (ret < 0)
+ return ret;
+ uindices += ret;
+
+ ret = copy_fp_d_reg_indices(vcpu, uindices);
+ if (ret < 0)
+ return ret;
+ uindices += ret;
+
+ ret = copy_isa_ext_reg_indices(vcpu, uindices);
+ if (ret < 0)
+ return ret;
+ uindices += ret;
+
+ ret = copy_sbi_ext_reg_indices(uindices);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_set_reg(struct kvm_vcpu *vcpu,
+ const struct kvm_one_reg *reg)
+{
+ switch (reg->id & KVM_REG_RISCV_TYPE_MASK) {
+ case KVM_REG_RISCV_CONFIG:
+ return kvm_riscv_vcpu_set_reg_config(vcpu, reg);
+ case KVM_REG_RISCV_CORE:
+ return kvm_riscv_vcpu_set_reg_core(vcpu, reg);
+ case KVM_REG_RISCV_CSR:
+ return kvm_riscv_vcpu_set_reg_csr(vcpu, reg);
+ case KVM_REG_RISCV_TIMER:
+ return kvm_riscv_vcpu_set_reg_timer(vcpu, reg);
+ case KVM_REG_RISCV_FP_F:
+ return kvm_riscv_vcpu_set_reg_fp(vcpu, reg,
+ KVM_REG_RISCV_FP_F);
+ case KVM_REG_RISCV_FP_D:
+ return kvm_riscv_vcpu_set_reg_fp(vcpu, reg,
+ KVM_REG_RISCV_FP_D);
+ case KVM_REG_RISCV_ISA_EXT:
+ return kvm_riscv_vcpu_set_reg_isa_ext(vcpu, reg);
+ case KVM_REG_RISCV_SBI_EXT:
+ return kvm_riscv_vcpu_set_reg_sbi_ext(vcpu, reg);
+ case KVM_REG_RISCV_VECTOR:
+ return kvm_riscv_vcpu_set_reg_vector(vcpu, reg);
+ default:
+ break;
+ }
+
+ return -ENOENT;
+}
+
+int kvm_riscv_vcpu_get_reg(struct kvm_vcpu *vcpu,
+ const struct kvm_one_reg *reg)
+{
+ switch (reg->id & KVM_REG_RISCV_TYPE_MASK) {
+ case KVM_REG_RISCV_CONFIG:
+ return kvm_riscv_vcpu_get_reg_config(vcpu, reg);
+ case KVM_REG_RISCV_CORE:
+ return kvm_riscv_vcpu_get_reg_core(vcpu, reg);
+ case KVM_REG_RISCV_CSR:
+ return kvm_riscv_vcpu_get_reg_csr(vcpu, reg);
+ case KVM_REG_RISCV_TIMER:
+ return kvm_riscv_vcpu_get_reg_timer(vcpu, reg);
+ case KVM_REG_RISCV_FP_F:
+ return kvm_riscv_vcpu_get_reg_fp(vcpu, reg,
+ KVM_REG_RISCV_FP_F);
+ case KVM_REG_RISCV_FP_D:
+ return kvm_riscv_vcpu_get_reg_fp(vcpu, reg,
+ KVM_REG_RISCV_FP_D);
+ case KVM_REG_RISCV_ISA_EXT:
+ return kvm_riscv_vcpu_get_reg_isa_ext(vcpu, reg);
+ case KVM_REG_RISCV_SBI_EXT:
+ return kvm_riscv_vcpu_get_reg_sbi_ext(vcpu, reg);
+ case KVM_REG_RISCV_VECTOR:
+ return kvm_riscv_vcpu_get_reg_vector(vcpu, reg);
+ default:
+ break;
+ }
+
+ return -ENOENT;
+}
diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c
new file mode 100644
index 000000000..86391a506
--- /dev/null
+++ b/arch/riscv/kvm/vcpu_pmu.c
@@ -0,0 +1,633 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023 Rivos Inc
+ *
+ * Authors:
+ * Atish Patra <atishp@rivosinc.com>
+ */
+
+#define pr_fmt(fmt) "riscv-kvm-pmu: " fmt
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kvm_host.h>
+#include <linux/perf/riscv_pmu.h>
+#include <asm/csr.h>
+#include <asm/kvm_vcpu_sbi.h>
+#include <asm/kvm_vcpu_pmu.h>
+#include <linux/bitops.h>
+
+#define kvm_pmu_num_counters(pmu) ((pmu)->num_hw_ctrs + (pmu)->num_fw_ctrs)
+#define get_event_type(x) (((x) & SBI_PMU_EVENT_IDX_TYPE_MASK) >> 16)
+#define get_event_code(x) ((x) & SBI_PMU_EVENT_IDX_CODE_MASK)
+
+static enum perf_hw_id hw_event_perf_map[SBI_PMU_HW_GENERAL_MAX] = {
+ [SBI_PMU_HW_CPU_CYCLES] = PERF_COUNT_HW_CPU_CYCLES,
+ [SBI_PMU_HW_INSTRUCTIONS] = PERF_COUNT_HW_INSTRUCTIONS,
+ [SBI_PMU_HW_CACHE_REFERENCES] = PERF_COUNT_HW_CACHE_REFERENCES,
+ [SBI_PMU_HW_CACHE_MISSES] = PERF_COUNT_HW_CACHE_MISSES,
+ [SBI_PMU_HW_BRANCH_INSTRUCTIONS] = PERF_COUNT_HW_BRANCH_INSTRUCTIONS,
+ [SBI_PMU_HW_BRANCH_MISSES] = PERF_COUNT_HW_BRANCH_MISSES,
+ [SBI_PMU_HW_BUS_CYCLES] = PERF_COUNT_HW_BUS_CYCLES,
+ [SBI_PMU_HW_STALLED_CYCLES_FRONTEND] = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND,
+ [SBI_PMU_HW_STALLED_CYCLES_BACKEND] = PERF_COUNT_HW_STALLED_CYCLES_BACKEND,
+ [SBI_PMU_HW_REF_CPU_CYCLES] = PERF_COUNT_HW_REF_CPU_CYCLES,
+};
+
+static u64 kvm_pmu_get_sample_period(struct kvm_pmc *pmc)
+{
+ u64 counter_val_mask = GENMASK(pmc->cinfo.width, 0);
+ u64 sample_period;
+
+ if (!pmc->counter_val)
+ sample_period = counter_val_mask + 1;
+ else
+ sample_period = (-pmc->counter_val) & counter_val_mask;
+
+ return sample_period;
+}
+
+static u32 kvm_pmu_get_perf_event_type(unsigned long eidx)
+{
+ enum sbi_pmu_event_type etype = get_event_type(eidx);
+ u32 type = PERF_TYPE_MAX;
+
+ switch (etype) {
+ case SBI_PMU_EVENT_TYPE_HW:
+ type = PERF_TYPE_HARDWARE;
+ break;
+ case SBI_PMU_EVENT_TYPE_CACHE:
+ type = PERF_TYPE_HW_CACHE;
+ break;
+ case SBI_PMU_EVENT_TYPE_RAW:
+ case SBI_PMU_EVENT_TYPE_FW:
+ type = PERF_TYPE_RAW;
+ break;
+ default:
+ break;
+ }
+
+ return type;
+}
+
+static bool kvm_pmu_is_fw_event(unsigned long eidx)
+{
+ return get_event_type(eidx) == SBI_PMU_EVENT_TYPE_FW;
+}
+
+static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc)
+{
+ if (pmc->perf_event) {
+ perf_event_disable(pmc->perf_event);
+ perf_event_release_kernel(pmc->perf_event);
+ pmc->perf_event = NULL;
+ }
+}
+
+static u64 kvm_pmu_get_perf_event_hw_config(u32 sbi_event_code)
+{
+ return hw_event_perf_map[sbi_event_code];
+}
+
+static u64 kvm_pmu_get_perf_event_cache_config(u32 sbi_event_code)
+{
+ u64 config = U64_MAX;
+ unsigned int cache_type, cache_op, cache_result;
+
+ /* All the cache event masks lie within 0xFF. No separate masking is necessary */
+ cache_type = (sbi_event_code & SBI_PMU_EVENT_CACHE_ID_CODE_MASK) >>
+ SBI_PMU_EVENT_CACHE_ID_SHIFT;
+ cache_op = (sbi_event_code & SBI_PMU_EVENT_CACHE_OP_ID_CODE_MASK) >>
+ SBI_PMU_EVENT_CACHE_OP_SHIFT;
+ cache_result = sbi_event_code & SBI_PMU_EVENT_CACHE_RESULT_ID_CODE_MASK;
+
+ if (cache_type >= PERF_COUNT_HW_CACHE_MAX ||
+ cache_op >= PERF_COUNT_HW_CACHE_OP_MAX ||
+ cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+ return config;
+
+ config = cache_type | (cache_op << 8) | (cache_result << 16);
+
+ return config;
+}
+
+static u64 kvm_pmu_get_perf_event_config(unsigned long eidx, uint64_t evt_data)
+{
+ enum sbi_pmu_event_type etype = get_event_type(eidx);
+ u32 ecode = get_event_code(eidx);
+ u64 config = U64_MAX;
+
+ switch (etype) {
+ case SBI_PMU_EVENT_TYPE_HW:
+ if (ecode < SBI_PMU_HW_GENERAL_MAX)
+ config = kvm_pmu_get_perf_event_hw_config(ecode);
+ break;
+ case SBI_PMU_EVENT_TYPE_CACHE:
+ config = kvm_pmu_get_perf_event_cache_config(ecode);
+ break;
+ case SBI_PMU_EVENT_TYPE_RAW:
+ config = evt_data & RISCV_PMU_RAW_EVENT_MASK;
+ break;
+ case SBI_PMU_EVENT_TYPE_FW:
+ if (ecode < SBI_PMU_FW_MAX)
+ config = (1ULL << 63) | ecode;
+ break;
+ default:
+ break;
+ }
+
+ return config;
+}
+
+static int kvm_pmu_get_fixed_pmc_index(unsigned long eidx)
+{
+ u32 etype = kvm_pmu_get_perf_event_type(eidx);
+ u32 ecode = get_event_code(eidx);
+
+ if (etype != SBI_PMU_EVENT_TYPE_HW)
+ return -EINVAL;
+
+ if (ecode == SBI_PMU_HW_CPU_CYCLES)
+ return 0;
+ else if (ecode == SBI_PMU_HW_INSTRUCTIONS)
+ return 2;
+ else
+ return -EINVAL;
+}
+
+static int kvm_pmu_get_programmable_pmc_index(struct kvm_pmu *kvpmu, unsigned long eidx,
+ unsigned long cbase, unsigned long cmask)
+{
+ int ctr_idx = -1;
+ int i, pmc_idx;
+ int min, max;
+
+ if (kvm_pmu_is_fw_event(eidx)) {
+ /* Firmware counters are mapped 1:1 starting from num_hw_ctrs for simplicity */
+ min = kvpmu->num_hw_ctrs;
+ max = min + kvpmu->num_fw_ctrs;
+ } else {
+ /* First 3 counters are reserved for fixed counters */
+ min = 3;
+ max = kvpmu->num_hw_ctrs;
+ }
+
+ for_each_set_bit(i, &cmask, BITS_PER_LONG) {
+ pmc_idx = i + cbase;
+ if ((pmc_idx >= min && pmc_idx < max) &&
+ !test_bit(pmc_idx, kvpmu->pmc_in_use)) {
+ ctr_idx = pmc_idx;
+ break;
+ }
+ }
+
+ return ctr_idx;
+}
+
+static int pmu_get_pmc_index(struct kvm_pmu *pmu, unsigned long eidx,
+ unsigned long cbase, unsigned long cmask)
+{
+ int ret;
+
+ /* Fixed counters need to be have fixed mapping as they have different width */
+ ret = kvm_pmu_get_fixed_pmc_index(eidx);
+ if (ret >= 0)
+ return ret;
+
+ return kvm_pmu_get_programmable_pmc_index(pmu, eidx, cbase, cmask);
+}
+
+static int pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx,
+ unsigned long *out_val)
+{
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u64 enabled, running;
+ int fevent_code;
+
+ pmc = &kvpmu->pmc[cidx];
+
+ if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) {
+ fevent_code = get_event_code(pmc->event_idx);
+ pmc->counter_val = kvpmu->fw_event[fevent_code].value;
+ } else if (pmc->perf_event) {
+ pmc->counter_val += perf_event_read_value(pmc->perf_event, &enabled, &running);
+ } else {
+ return -EINVAL;
+ }
+ *out_val = pmc->counter_val;
+
+ return 0;
+}
+
+static int kvm_pmu_validate_counter_mask(struct kvm_pmu *kvpmu, unsigned long ctr_base,
+ unsigned long ctr_mask)
+{
+ /* Make sure the we have a valid counter mask requested from the caller */
+ if (!ctr_mask || (ctr_base + __fls(ctr_mask) >= kvm_pmu_num_counters(kvpmu)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int kvm_pmu_create_perf_event(struct kvm_pmc *pmc, struct perf_event_attr *attr,
+ unsigned long flags, unsigned long eidx, unsigned long evtdata)
+{
+ struct perf_event *event;
+
+ kvm_pmu_release_perf_event(pmc);
+ attr->config = kvm_pmu_get_perf_event_config(eidx, evtdata);
+ if (flags & SBI_PMU_CFG_FLAG_CLEAR_VALUE) {
+ //TODO: Do we really want to clear the value in hardware counter
+ pmc->counter_val = 0;
+ }
+
+ /*
+ * Set the default sample_period for now. The guest specified value
+ * will be updated in the start call.
+ */
+ attr->sample_period = kvm_pmu_get_sample_period(pmc);
+
+ event = perf_event_create_kernel_counter(attr, -1, current, NULL, pmc);
+ if (IS_ERR(event)) {
+ pr_err("kvm pmu event creation failed for eidx %lx: %ld\n", eidx, PTR_ERR(event));
+ return PTR_ERR(event);
+ }
+
+ pmc->perf_event = event;
+ if (flags & SBI_PMU_CFG_FLAG_AUTO_START)
+ perf_event_enable(pmc->perf_event);
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_pmu_incr_fw(struct kvm_vcpu *vcpu, unsigned long fid)
+{
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+ struct kvm_fw_event *fevent;
+
+ if (!kvpmu || fid >= SBI_PMU_FW_MAX)
+ return -EINVAL;
+
+ fevent = &kvpmu->fw_event[fid];
+ if (fevent->started)
+ fevent->value++;
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_pmu_read_hpm(struct kvm_vcpu *vcpu, unsigned int csr_num,
+ unsigned long *val, unsigned long new_val,
+ unsigned long wr_mask)
+{
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+ int cidx, ret = KVM_INSN_CONTINUE_NEXT_SEPC;
+
+ if (!kvpmu || !kvpmu->init_done) {
+ /*
+ * In absence of sscofpmf in the platform, the guest OS may use
+ * the legacy PMU driver to read cycle/instret. In that case,
+ * just return 0 to avoid any illegal trap. However, any other
+ * hpmcounter access should result in illegal trap as they must
+ * be access through SBI PMU only.
+ */
+ if (csr_num == CSR_CYCLE || csr_num == CSR_INSTRET) {
+ *val = 0;
+ return ret;
+ } else {
+ return KVM_INSN_ILLEGAL_TRAP;
+ }
+ }
+
+ /* The counter CSR are read only. Thus, any write should result in illegal traps */
+ if (wr_mask)
+ return KVM_INSN_ILLEGAL_TRAP;
+
+ cidx = csr_num - CSR_CYCLE;
+
+ if (pmu_ctr_read(vcpu, cidx, val) < 0)
+ return KVM_INSN_ILLEGAL_TRAP;
+
+ return ret;
+}
+
+int kvm_riscv_vcpu_pmu_num_ctrs(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+
+ retdata->out_val = kvm_pmu_num_counters(kvpmu);
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_pmu_ctr_info(struct kvm_vcpu *vcpu, unsigned long cidx,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+
+ if (cidx > RISCV_KVM_MAX_COUNTERS || cidx == 1) {
+ retdata->err_val = SBI_ERR_INVALID_PARAM;
+ return 0;
+ }
+
+ retdata->out_val = kvpmu->pmc[cidx].cinfo.value;
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_pmu_ctr_start(struct kvm_vcpu *vcpu, unsigned long ctr_base,
+ unsigned long ctr_mask, unsigned long flags, u64 ival,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+ int i, pmc_index, sbiret = 0;
+ struct kvm_pmc *pmc;
+ int fevent_code;
+
+ if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) {
+ sbiret = SBI_ERR_INVALID_PARAM;
+ goto out;
+ }
+
+ /* Start the counters that have been configured and requested by the guest */
+ for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) {
+ pmc_index = i + ctr_base;
+ if (!test_bit(pmc_index, kvpmu->pmc_in_use))
+ continue;
+ pmc = &kvpmu->pmc[pmc_index];
+ if (flags & SBI_PMU_START_FLAG_SET_INIT_VALUE)
+ pmc->counter_val = ival;
+ if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) {
+ fevent_code = get_event_code(pmc->event_idx);
+ if (fevent_code >= SBI_PMU_FW_MAX) {
+ sbiret = SBI_ERR_INVALID_PARAM;
+ goto out;
+ }
+
+ /* Check if the counter was already started for some reason */
+ if (kvpmu->fw_event[fevent_code].started) {
+ sbiret = SBI_ERR_ALREADY_STARTED;
+ continue;
+ }
+
+ kvpmu->fw_event[fevent_code].started = true;
+ kvpmu->fw_event[fevent_code].value = pmc->counter_val;
+ } else if (pmc->perf_event) {
+ if (unlikely(pmc->started)) {
+ sbiret = SBI_ERR_ALREADY_STARTED;
+ continue;
+ }
+ perf_event_period(pmc->perf_event, kvm_pmu_get_sample_period(pmc));
+ perf_event_enable(pmc->perf_event);
+ pmc->started = true;
+ } else {
+ sbiret = SBI_ERR_INVALID_PARAM;
+ }
+ }
+
+out:
+ retdata->err_val = sbiret;
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_pmu_ctr_stop(struct kvm_vcpu *vcpu, unsigned long ctr_base,
+ unsigned long ctr_mask, unsigned long flags,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+ int i, pmc_index, sbiret = 0;
+ u64 enabled, running;
+ struct kvm_pmc *pmc;
+ int fevent_code;
+
+ if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) {
+ sbiret = SBI_ERR_INVALID_PARAM;
+ goto out;
+ }
+
+ /* Stop the counters that have been configured and requested by the guest */
+ for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) {
+ pmc_index = i + ctr_base;
+ if (!test_bit(pmc_index, kvpmu->pmc_in_use))
+ continue;
+ pmc = &kvpmu->pmc[pmc_index];
+ if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) {
+ fevent_code = get_event_code(pmc->event_idx);
+ if (fevent_code >= SBI_PMU_FW_MAX) {
+ sbiret = SBI_ERR_INVALID_PARAM;
+ goto out;
+ }
+
+ if (!kvpmu->fw_event[fevent_code].started)
+ sbiret = SBI_ERR_ALREADY_STOPPED;
+
+ kvpmu->fw_event[fevent_code].started = false;
+ } else if (pmc->perf_event) {
+ if (pmc->started) {
+ /* Stop counting the counter */
+ perf_event_disable(pmc->perf_event);
+ pmc->started = false;
+ } else {
+ sbiret = SBI_ERR_ALREADY_STOPPED;
+ }
+
+ if (flags & SBI_PMU_STOP_FLAG_RESET) {
+ /* Relase the counter if this is a reset request */
+ pmc->counter_val += perf_event_read_value(pmc->perf_event,
+ &enabled, &running);
+ kvm_pmu_release_perf_event(pmc);
+ }
+ } else {
+ sbiret = SBI_ERR_INVALID_PARAM;
+ }
+ if (flags & SBI_PMU_STOP_FLAG_RESET) {
+ pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID;
+ clear_bit(pmc_index, kvpmu->pmc_in_use);
+ }
+ }
+
+out:
+ retdata->err_val = sbiret;
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_pmu_ctr_cfg_match(struct kvm_vcpu *vcpu, unsigned long ctr_base,
+ unsigned long ctr_mask, unsigned long flags,
+ unsigned long eidx, u64 evtdata,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ int ctr_idx, ret, sbiret = 0;
+ bool is_fevent;
+ unsigned long event_code;
+ u32 etype = kvm_pmu_get_perf_event_type(eidx);
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc = NULL;
+ struct perf_event_attr attr = {
+ .type = etype,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = true,
+ /*
+ * It should never reach here if the platform doesn't support the sscofpmf
+ * extension as mode filtering won't work without it.
+ */
+ .exclude_host = true,
+ .exclude_hv = true,
+ .exclude_user = !!(flags & SBI_PMU_CFG_FLAG_SET_UINH),
+ .exclude_kernel = !!(flags & SBI_PMU_CFG_FLAG_SET_SINH),
+ .config1 = RISCV_PMU_CONFIG1_GUEST_EVENTS,
+ };
+
+ if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) {
+ sbiret = SBI_ERR_INVALID_PARAM;
+ goto out;
+ }
+
+ event_code = get_event_code(eidx);
+ is_fevent = kvm_pmu_is_fw_event(eidx);
+ if (is_fevent && event_code >= SBI_PMU_FW_MAX) {
+ sbiret = SBI_ERR_NOT_SUPPORTED;
+ goto out;
+ }
+
+ /*
+ * SKIP_MATCH flag indicates the caller is aware of the assigned counter
+ * for this event. Just do a sanity check if it already marked used.
+ */
+ if (flags & SBI_PMU_CFG_FLAG_SKIP_MATCH) {
+ if (!test_bit(ctr_base + __ffs(ctr_mask), kvpmu->pmc_in_use)) {
+ sbiret = SBI_ERR_FAILURE;
+ goto out;
+ }
+ ctr_idx = ctr_base + __ffs(ctr_mask);
+ } else {
+ ctr_idx = pmu_get_pmc_index(kvpmu, eidx, ctr_base, ctr_mask);
+ if (ctr_idx < 0) {
+ sbiret = SBI_ERR_NOT_SUPPORTED;
+ goto out;
+ }
+ }
+
+ pmc = &kvpmu->pmc[ctr_idx];
+ pmc->idx = ctr_idx;
+
+ if (is_fevent) {
+ if (flags & SBI_PMU_CFG_FLAG_AUTO_START)
+ kvpmu->fw_event[event_code].started = true;
+ } else {
+ ret = kvm_pmu_create_perf_event(pmc, &attr, flags, eidx, evtdata);
+ if (ret)
+ return ret;
+ }
+
+ set_bit(ctr_idx, kvpmu->pmc_in_use);
+ pmc->event_idx = eidx;
+ retdata->out_val = ctr_idx;
+out:
+ retdata->err_val = sbiret;
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ int ret;
+
+ ret = pmu_ctr_read(vcpu, cidx, &retdata->out_val);
+ if (ret == -EINVAL)
+ retdata->err_val = SBI_ERR_INVALID_PARAM;
+
+ return 0;
+}
+
+void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu)
+{
+ int i = 0, ret, num_hw_ctrs = 0, hpm_width = 0;
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+
+ /*
+ * PMU functionality should be only available to guests if privilege mode
+ * filtering is available in the host. Otherwise, guest will always count
+ * events while the execution is in hypervisor mode.
+ */
+ if (!riscv_isa_extension_available(NULL, SSCOFPMF))
+ return;
+
+ ret = riscv_pmu_get_hpm_info(&hpm_width, &num_hw_ctrs);
+ if (ret < 0 || !hpm_width || !num_hw_ctrs)
+ return;
+
+ /*
+ * Increase the number of hardware counters to offset the time counter.
+ */
+ kvpmu->num_hw_ctrs = num_hw_ctrs + 1;
+ kvpmu->num_fw_ctrs = SBI_PMU_FW_MAX;
+ memset(&kvpmu->fw_event, 0, SBI_PMU_FW_MAX * sizeof(struct kvm_fw_event));
+
+ if (kvpmu->num_hw_ctrs > RISCV_KVM_MAX_HW_CTRS) {
+ pr_warn_once("Limiting the hardware counters to 32 as specified by the ISA");
+ kvpmu->num_hw_ctrs = RISCV_KVM_MAX_HW_CTRS;
+ }
+
+ /*
+ * There is no correlation between the logical hardware counter and virtual counters.
+ * However, we need to encode a hpmcounter CSR in the counter info field so that
+ * KVM can trap n emulate the read. This works well in the migration use case as
+ * KVM doesn't care if the actual hpmcounter is available in the hardware or not.
+ */
+ for (i = 0; i < kvm_pmu_num_counters(kvpmu); i++) {
+ /* TIME CSR shouldn't be read from perf interface */
+ if (i == 1)
+ continue;
+ pmc = &kvpmu->pmc[i];
+ pmc->idx = i;
+ pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID;
+ if (i < kvpmu->num_hw_ctrs) {
+ pmc->cinfo.type = SBI_PMU_CTR_TYPE_HW;
+ if (i < 3)
+ /* CY, IR counters */
+ pmc->cinfo.width = 63;
+ else
+ pmc->cinfo.width = hpm_width;
+ /*
+ * The CSR number doesn't have any relation with the logical
+ * hardware counters. The CSR numbers are encoded sequentially
+ * to avoid maintaining a map between the virtual counter
+ * and CSR number.
+ */
+ pmc->cinfo.csr = CSR_CYCLE + i;
+ } else {
+ pmc->cinfo.type = SBI_PMU_CTR_TYPE_FW;
+ pmc->cinfo.width = BITS_PER_LONG - 1;
+ }
+ }
+
+ kvpmu->init_done = true;
+}
+
+void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ int i;
+
+ if (!kvpmu)
+ return;
+
+ for_each_set_bit(i, kvpmu->pmc_in_use, RISCV_MAX_COUNTERS) {
+ pmc = &kvpmu->pmc[i];
+ pmc->counter_val = 0;
+ kvm_pmu_release_perf_event(pmc);
+ pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID;
+ }
+ bitmap_zero(kvpmu->pmc_in_use, RISCV_MAX_COUNTERS);
+ memset(&kvpmu->fw_event, 0, SBI_PMU_FW_MAX * sizeof(struct kvm_fw_event));
+}
+
+void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu)
+{
+ kvm_riscv_vcpu_pmu_deinit(vcpu);
+}
diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c
new file mode 100644
index 000000000..9cd97091c
--- /dev/null
+++ b/arch/riscv/kvm/vcpu_sbi.c
@@ -0,0 +1,421 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Western Digital Corporation or its affiliates.
+ *
+ * Authors:
+ * Atish Patra <atish.patra@wdc.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kvm_host.h>
+#include <asm/sbi.h>
+#include <asm/kvm_vcpu_sbi.h>
+
+#ifndef CONFIG_RISCV_SBI_V01
+static const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_v01 = {
+ .extid_start = -1UL,
+ .extid_end = -1UL,
+ .handler = NULL,
+};
+#endif
+
+#ifndef CONFIG_RISCV_PMU_SBI
+static const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_pmu = {
+ .extid_start = -1UL,
+ .extid_end = -1UL,
+ .handler = NULL,
+};
+#endif
+
+struct kvm_riscv_sbi_extension_entry {
+ enum KVM_RISCV_SBI_EXT_ID ext_idx;
+ const struct kvm_vcpu_sbi_extension *ext_ptr;
+};
+
+static const struct kvm_riscv_sbi_extension_entry sbi_ext[] = {
+ {
+ .ext_idx = KVM_RISCV_SBI_EXT_V01,
+ .ext_ptr = &vcpu_sbi_ext_v01,
+ },
+ {
+ .ext_idx = KVM_RISCV_SBI_EXT_MAX, /* Can't be disabled */
+ .ext_ptr = &vcpu_sbi_ext_base,
+ },
+ {
+ .ext_idx = KVM_RISCV_SBI_EXT_TIME,
+ .ext_ptr = &vcpu_sbi_ext_time,
+ },
+ {
+ .ext_idx = KVM_RISCV_SBI_EXT_IPI,
+ .ext_ptr = &vcpu_sbi_ext_ipi,
+ },
+ {
+ .ext_idx = KVM_RISCV_SBI_EXT_RFENCE,
+ .ext_ptr = &vcpu_sbi_ext_rfence,
+ },
+ {
+ .ext_idx = KVM_RISCV_SBI_EXT_SRST,
+ .ext_ptr = &vcpu_sbi_ext_srst,
+ },
+ {
+ .ext_idx = KVM_RISCV_SBI_EXT_HSM,
+ .ext_ptr = &vcpu_sbi_ext_hsm,
+ },
+ {
+ .ext_idx = KVM_RISCV_SBI_EXT_PMU,
+ .ext_ptr = &vcpu_sbi_ext_pmu,
+ },
+ {
+ .ext_idx = KVM_RISCV_SBI_EXT_EXPERIMENTAL,
+ .ext_ptr = &vcpu_sbi_ext_experimental,
+ },
+ {
+ .ext_idx = KVM_RISCV_SBI_EXT_VENDOR,
+ .ext_ptr = &vcpu_sbi_ext_vendor,
+ },
+};
+
+void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+ struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
+
+ vcpu->arch.sbi_context.return_handled = 0;
+ vcpu->stat.ecall_exit_stat++;
+ run->exit_reason = KVM_EXIT_RISCV_SBI;
+ run->riscv_sbi.extension_id = cp->a7;
+ run->riscv_sbi.function_id = cp->a6;
+ run->riscv_sbi.args[0] = cp->a0;
+ run->riscv_sbi.args[1] = cp->a1;
+ run->riscv_sbi.args[2] = cp->a2;
+ run->riscv_sbi.args[3] = cp->a3;
+ run->riscv_sbi.args[4] = cp->a4;
+ run->riscv_sbi.args[5] = cp->a5;
+ run->riscv_sbi.ret[0] = cp->a0;
+ run->riscv_sbi.ret[1] = cp->a1;
+}
+
+void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,
+ struct kvm_run *run,
+ u32 type, u64 reason)
+{
+ unsigned long i;
+ struct kvm_vcpu *tmp;
+
+ kvm_for_each_vcpu(i, tmp, vcpu->kvm)
+ tmp->arch.power_off = true;
+ kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP);
+
+ memset(&run->system_event, 0, sizeof(run->system_event));
+ run->system_event.type = type;
+ run->system_event.ndata = 1;
+ run->system_event.data[0] = reason;
+ run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+}
+
+int kvm_riscv_vcpu_sbi_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+ struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
+
+ /* Handle SBI return only once */
+ if (vcpu->arch.sbi_context.return_handled)
+ return 0;
+ vcpu->arch.sbi_context.return_handled = 1;
+
+ /* Update return values */
+ cp->a0 = run->riscv_sbi.ret[0];
+ cp->a1 = run->riscv_sbi.ret[1];
+
+ /* Move to next instruction */
+ vcpu->arch.guest_context.sepc += 4;
+
+ return 0;
+}
+
+static int riscv_vcpu_set_sbi_ext_single(struct kvm_vcpu *vcpu,
+ unsigned long reg_num,
+ unsigned long reg_val)
+{
+ unsigned long i;
+ const struct kvm_riscv_sbi_extension_entry *sext = NULL;
+ struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context;
+
+ if (reg_num >= KVM_RISCV_SBI_EXT_MAX)
+ return -ENOENT;
+
+ if (reg_val != 1 && reg_val != 0)
+ return -EINVAL;
+
+ for (i = 0; i < ARRAY_SIZE(sbi_ext); i++) {
+ if (sbi_ext[i].ext_idx == reg_num) {
+ sext = &sbi_ext[i];
+ break;
+ }
+ }
+ if (!sext)
+ return -ENOENT;
+
+ /*
+ * We can't set the extension status to available here, since it may
+ * have a probe() function which needs to confirm availability first,
+ * but it may be too early to call that here. We can set the status to
+ * unavailable, though.
+ */
+ if (!reg_val)
+ scontext->ext_status[sext->ext_idx] =
+ KVM_RISCV_SBI_EXT_UNAVAILABLE;
+
+ return 0;
+}
+
+static int riscv_vcpu_get_sbi_ext_single(struct kvm_vcpu *vcpu,
+ unsigned long reg_num,
+ unsigned long *reg_val)
+{
+ unsigned long i;
+ const struct kvm_riscv_sbi_extension_entry *sext = NULL;
+ struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context;
+
+ if (reg_num >= KVM_RISCV_SBI_EXT_MAX)
+ return -ENOENT;
+
+ for (i = 0; i < ARRAY_SIZE(sbi_ext); i++) {
+ if (sbi_ext[i].ext_idx == reg_num) {
+ sext = &sbi_ext[i];
+ break;
+ }
+ }
+ if (!sext)
+ return -ENOENT;
+
+ /*
+ * If the extension status is still uninitialized, then we should probe
+ * to determine if it's available, but it may be too early to do that
+ * here. The best we can do is report that the extension has not been
+ * disabled, i.e. we return 1 when the extension is available and also
+ * when it only may be available.
+ */
+ *reg_val = scontext->ext_status[sext->ext_idx] !=
+ KVM_RISCV_SBI_EXT_UNAVAILABLE;
+
+ return 0;
+}
+
+static int riscv_vcpu_set_sbi_ext_multi(struct kvm_vcpu *vcpu,
+ unsigned long reg_num,
+ unsigned long reg_val, bool enable)
+{
+ unsigned long i, ext_id;
+
+ if (reg_num > KVM_REG_RISCV_SBI_MULTI_REG_LAST)
+ return -ENOENT;
+
+ for_each_set_bit(i, &reg_val, BITS_PER_LONG) {
+ ext_id = i + reg_num * BITS_PER_LONG;
+ if (ext_id >= KVM_RISCV_SBI_EXT_MAX)
+ break;
+
+ riscv_vcpu_set_sbi_ext_single(vcpu, ext_id, enable);
+ }
+
+ return 0;
+}
+
+static int riscv_vcpu_get_sbi_ext_multi(struct kvm_vcpu *vcpu,
+ unsigned long reg_num,
+ unsigned long *reg_val)
+{
+ unsigned long i, ext_id, ext_val;
+
+ if (reg_num > KVM_REG_RISCV_SBI_MULTI_REG_LAST)
+ return -ENOENT;
+
+ for (i = 0; i < BITS_PER_LONG; i++) {
+ ext_id = i + reg_num * BITS_PER_LONG;
+ if (ext_id >= KVM_RISCV_SBI_EXT_MAX)
+ break;
+
+ ext_val = 0;
+ riscv_vcpu_get_sbi_ext_single(vcpu, ext_id, &ext_val);
+ if (ext_val)
+ *reg_val |= KVM_REG_RISCV_SBI_MULTI_MASK(ext_id);
+ }
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_set_reg_sbi_ext(struct kvm_vcpu *vcpu,
+ const struct kvm_one_reg *reg)
+{
+ unsigned long __user *uaddr =
+ (unsigned long __user *)(unsigned long)reg->addr;
+ unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+ KVM_REG_SIZE_MASK |
+ KVM_REG_RISCV_SBI_EXT);
+ unsigned long reg_val, reg_subtype;
+
+ if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
+ return -EINVAL;
+
+ if (vcpu->arch.ran_atleast_once)
+ return -EBUSY;
+
+ reg_subtype = reg_num & KVM_REG_RISCV_SUBTYPE_MASK;
+ reg_num &= ~KVM_REG_RISCV_SUBTYPE_MASK;
+
+ if (copy_from_user(&reg_val, uaddr, KVM_REG_SIZE(reg->id)))
+ return -EFAULT;
+
+ switch (reg_subtype) {
+ case KVM_REG_RISCV_SBI_SINGLE:
+ return riscv_vcpu_set_sbi_ext_single(vcpu, reg_num, reg_val);
+ case KVM_REG_RISCV_SBI_MULTI_EN:
+ return riscv_vcpu_set_sbi_ext_multi(vcpu, reg_num, reg_val, true);
+ case KVM_REG_RISCV_SBI_MULTI_DIS:
+ return riscv_vcpu_set_sbi_ext_multi(vcpu, reg_num, reg_val, false);
+ default:
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_get_reg_sbi_ext(struct kvm_vcpu *vcpu,
+ const struct kvm_one_reg *reg)
+{
+ int rc;
+ unsigned long __user *uaddr =
+ (unsigned long __user *)(unsigned long)reg->addr;
+ unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+ KVM_REG_SIZE_MASK |
+ KVM_REG_RISCV_SBI_EXT);
+ unsigned long reg_val, reg_subtype;
+
+ if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long))
+ return -EINVAL;
+
+ reg_subtype = reg_num & KVM_REG_RISCV_SUBTYPE_MASK;
+ reg_num &= ~KVM_REG_RISCV_SUBTYPE_MASK;
+
+ reg_val = 0;
+ switch (reg_subtype) {
+ case KVM_REG_RISCV_SBI_SINGLE:
+ rc = riscv_vcpu_get_sbi_ext_single(vcpu, reg_num, &reg_val);
+ break;
+ case KVM_REG_RISCV_SBI_MULTI_EN:
+ case KVM_REG_RISCV_SBI_MULTI_DIS:
+ rc = riscv_vcpu_get_sbi_ext_multi(vcpu, reg_num, &reg_val);
+ if (!rc && reg_subtype == KVM_REG_RISCV_SBI_MULTI_DIS)
+ reg_val = ~reg_val;
+ break;
+ default:
+ rc = -ENOENT;
+ }
+ if (rc)
+ return rc;
+
+ if (copy_to_user(uaddr, &reg_val, KVM_REG_SIZE(reg->id)))
+ return -EFAULT;
+
+ return 0;
+}
+
+const struct kvm_vcpu_sbi_extension *kvm_vcpu_sbi_find_ext(
+ struct kvm_vcpu *vcpu, unsigned long extid)
+{
+ struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context;
+ const struct kvm_riscv_sbi_extension_entry *entry;
+ const struct kvm_vcpu_sbi_extension *ext;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(sbi_ext); i++) {
+ entry = &sbi_ext[i];
+ ext = entry->ext_ptr;
+
+ if (ext->extid_start <= extid && ext->extid_end >= extid) {
+ if (entry->ext_idx >= KVM_RISCV_SBI_EXT_MAX ||
+ scontext->ext_status[entry->ext_idx] ==
+ KVM_RISCV_SBI_EXT_AVAILABLE)
+ return ext;
+ if (scontext->ext_status[entry->ext_idx] ==
+ KVM_RISCV_SBI_EXT_UNAVAILABLE)
+ return NULL;
+ if (ext->probe && !ext->probe(vcpu)) {
+ scontext->ext_status[entry->ext_idx] =
+ KVM_RISCV_SBI_EXT_UNAVAILABLE;
+ return NULL;
+ }
+
+ scontext->ext_status[entry->ext_idx] =
+ KVM_RISCV_SBI_EXT_AVAILABLE;
+ return ext;
+ }
+ }
+
+ return NULL;
+}
+
+int kvm_riscv_vcpu_sbi_ecall(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+ int ret = 1;
+ bool next_sepc = true;
+ struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
+ const struct kvm_vcpu_sbi_extension *sbi_ext;
+ struct kvm_cpu_trap utrap = {0};
+ struct kvm_vcpu_sbi_return sbi_ret = {
+ .out_val = 0,
+ .err_val = 0,
+ .utrap = &utrap,
+ };
+ bool ext_is_v01 = false;
+
+ sbi_ext = kvm_vcpu_sbi_find_ext(vcpu, cp->a7);
+ if (sbi_ext && sbi_ext->handler) {
+#ifdef CONFIG_RISCV_SBI_V01
+ if (cp->a7 >= SBI_EXT_0_1_SET_TIMER &&
+ cp->a7 <= SBI_EXT_0_1_SHUTDOWN)
+ ext_is_v01 = true;
+#endif
+ ret = sbi_ext->handler(vcpu, run, &sbi_ret);
+ } else {
+ /* Return error for unsupported SBI calls */
+ cp->a0 = SBI_ERR_NOT_SUPPORTED;
+ goto ecall_done;
+ }
+
+ /*
+ * When the SBI extension returns a Linux error code, it exits the ioctl
+ * loop and forwards the error to userspace.
+ */
+ if (ret < 0) {
+ next_sepc = false;
+ goto ecall_done;
+ }
+
+ /* Handle special error cases i.e trap, exit or userspace forward */
+ if (sbi_ret.utrap->scause) {
+ /* No need to increment sepc or exit ioctl loop */
+ ret = 1;
+ sbi_ret.utrap->sepc = cp->sepc;
+ kvm_riscv_vcpu_trap_redirect(vcpu, sbi_ret.utrap);
+ next_sepc = false;
+ goto ecall_done;
+ }
+
+ /* Exit ioctl loop or Propagate the error code the guest */
+ if (sbi_ret.uexit) {
+ next_sepc = false;
+ ret = 0;
+ } else {
+ cp->a0 = sbi_ret.err_val;
+ ret = 1;
+ }
+ecall_done:
+ if (next_sepc)
+ cp->sepc += 4;
+ /* a1 should only be updated when we continue the ioctl loop */
+ if (!ext_is_v01 && ret == 1)
+ cp->a1 = sbi_ret.out_val;
+
+ return ret;
+}
diff --git a/arch/riscv/kvm/vcpu_sbi_base.c b/arch/riscv/kvm/vcpu_sbi_base.c
new file mode 100644
index 000000000..5bc570b98
--- /dev/null
+++ b/arch/riscv/kvm/vcpu_sbi_base.c
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2021 Western Digital Corporation or its affiliates.
+ *
+ * Authors:
+ * Atish Patra <atish.patra@wdc.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kvm_host.h>
+#include <linux/version.h>
+#include <asm/sbi.h>
+#include <asm/kvm_vcpu_sbi.h>
+
+static int kvm_sbi_ext_base_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
+ const struct kvm_vcpu_sbi_extension *sbi_ext;
+ unsigned long *out_val = &retdata->out_val;
+
+ switch (cp->a6) {
+ case SBI_EXT_BASE_GET_SPEC_VERSION:
+ *out_val = (KVM_SBI_VERSION_MAJOR <<
+ SBI_SPEC_VERSION_MAJOR_SHIFT) |
+ KVM_SBI_VERSION_MINOR;
+ break;
+ case SBI_EXT_BASE_GET_IMP_ID:
+ *out_val = KVM_SBI_IMPID;
+ break;
+ case SBI_EXT_BASE_GET_IMP_VERSION:
+ *out_val = LINUX_VERSION_CODE;
+ break;
+ case SBI_EXT_BASE_PROBE_EXT:
+ if ((cp->a0 >= SBI_EXT_EXPERIMENTAL_START &&
+ cp->a0 <= SBI_EXT_EXPERIMENTAL_END) ||
+ (cp->a0 >= SBI_EXT_VENDOR_START &&
+ cp->a0 <= SBI_EXT_VENDOR_END)) {
+ /*
+ * For experimental/vendor extensions
+ * forward it to the userspace
+ */
+ kvm_riscv_vcpu_sbi_forward(vcpu, run);
+ retdata->uexit = true;
+ } else {
+ sbi_ext = kvm_vcpu_sbi_find_ext(vcpu, cp->a0);
+ *out_val = sbi_ext && sbi_ext->probe ?
+ sbi_ext->probe(vcpu) : !!sbi_ext;
+ }
+ break;
+ case SBI_EXT_BASE_GET_MVENDORID:
+ *out_val = vcpu->arch.mvendorid;
+ break;
+ case SBI_EXT_BASE_GET_MARCHID:
+ *out_val = vcpu->arch.marchid;
+ break;
+ case SBI_EXT_BASE_GET_MIMPID:
+ *out_val = vcpu->arch.mimpid;
+ break;
+ default:
+ retdata->err_val = SBI_ERR_NOT_SUPPORTED;
+ break;
+ }
+
+ return 0;
+}
+
+const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_base = {
+ .extid_start = SBI_EXT_BASE,
+ .extid_end = SBI_EXT_BASE,
+ .handler = kvm_sbi_ext_base_handler,
+};
+
+static int kvm_sbi_ext_forward_handler(struct kvm_vcpu *vcpu,
+ struct kvm_run *run,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ /*
+ * Both SBI experimental and vendor extensions are
+ * unconditionally forwarded to userspace.
+ */
+ kvm_riscv_vcpu_sbi_forward(vcpu, run);
+ retdata->uexit = true;
+ return 0;
+}
+
+const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_experimental = {
+ .extid_start = SBI_EXT_EXPERIMENTAL_START,
+ .extid_end = SBI_EXT_EXPERIMENTAL_END,
+ .handler = kvm_sbi_ext_forward_handler,
+};
+
+const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_vendor = {
+ .extid_start = SBI_EXT_VENDOR_START,
+ .extid_end = SBI_EXT_VENDOR_END,
+ .handler = kvm_sbi_ext_forward_handler,
+};
diff --git a/arch/riscv/kvm/vcpu_sbi_hsm.c b/arch/riscv/kvm/vcpu_sbi_hsm.c
new file mode 100644
index 000000000..7dca0e938
--- /dev/null
+++ b/arch/riscv/kvm/vcpu_sbi_hsm.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2021 Western Digital Corporation or its affiliates.
+ *
+ * Authors:
+ * Atish Patra <atish.patra@wdc.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kvm_host.h>
+#include <asm/sbi.h>
+#include <asm/kvm_vcpu_sbi.h>
+
+static int kvm_sbi_hsm_vcpu_start(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpu_context *reset_cntx;
+ struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
+ struct kvm_vcpu *target_vcpu;
+ unsigned long target_vcpuid = cp->a0;
+
+ target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, target_vcpuid);
+ if (!target_vcpu)
+ return SBI_ERR_INVALID_PARAM;
+ if (!target_vcpu->arch.power_off)
+ return SBI_ERR_ALREADY_AVAILABLE;
+
+ reset_cntx = &target_vcpu->arch.guest_reset_context;
+ /* start address */
+ reset_cntx->sepc = cp->a1;
+ /* target vcpu id to start */
+ reset_cntx->a0 = target_vcpuid;
+ /* private data passed from kernel */
+ reset_cntx->a1 = cp->a2;
+ kvm_make_request(KVM_REQ_VCPU_RESET, target_vcpu);
+
+ kvm_riscv_vcpu_power_on(target_vcpu);
+
+ return 0;
+}
+
+static int kvm_sbi_hsm_vcpu_stop(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.power_off)
+ return SBI_ERR_FAILURE;
+
+ kvm_riscv_vcpu_power_off(vcpu);
+
+ return 0;
+}
+
+static int kvm_sbi_hsm_vcpu_get_status(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
+ unsigned long target_vcpuid = cp->a0;
+ struct kvm_vcpu *target_vcpu;
+
+ target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, target_vcpuid);
+ if (!target_vcpu)
+ return SBI_ERR_INVALID_PARAM;
+ if (!target_vcpu->arch.power_off)
+ return SBI_HSM_STATE_STARTED;
+ else if (vcpu->stat.generic.blocking)
+ return SBI_HSM_STATE_SUSPENDED;
+ else
+ return SBI_HSM_STATE_STOPPED;
+}
+
+static int kvm_sbi_ext_hsm_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ int ret = 0;
+ struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
+ struct kvm *kvm = vcpu->kvm;
+ unsigned long funcid = cp->a6;
+
+ switch (funcid) {
+ case SBI_EXT_HSM_HART_START:
+ mutex_lock(&kvm->lock);
+ ret = kvm_sbi_hsm_vcpu_start(vcpu);
+ mutex_unlock(&kvm->lock);
+ break;
+ case SBI_EXT_HSM_HART_STOP:
+ ret = kvm_sbi_hsm_vcpu_stop(vcpu);
+ break;
+ case SBI_EXT_HSM_HART_STATUS:
+ ret = kvm_sbi_hsm_vcpu_get_status(vcpu);
+ if (ret >= 0) {
+ retdata->out_val = ret;
+ retdata->err_val = 0;
+ }
+ return 0;
+ case SBI_EXT_HSM_HART_SUSPEND:
+ switch (cp->a0) {
+ case SBI_HSM_SUSPEND_RET_DEFAULT:
+ kvm_riscv_vcpu_wfi(vcpu);
+ break;
+ case SBI_HSM_SUSPEND_NON_RET_DEFAULT:
+ ret = SBI_ERR_NOT_SUPPORTED;
+ break;
+ default:
+ ret = SBI_ERR_INVALID_PARAM;
+ }
+ break;
+ default:
+ ret = SBI_ERR_NOT_SUPPORTED;
+ }
+
+ retdata->err_val = ret;
+
+ return 0;
+}
+
+const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_hsm = {
+ .extid_start = SBI_EXT_HSM,
+ .extid_end = SBI_EXT_HSM,
+ .handler = kvm_sbi_ext_hsm_handler,
+};
diff --git a/arch/riscv/kvm/vcpu_sbi_pmu.c b/arch/riscv/kvm/vcpu_sbi_pmu.c
new file mode 100644
index 000000000..7eca72df2
--- /dev/null
+++ b/arch/riscv/kvm/vcpu_sbi_pmu.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023 Rivos Inc
+ *
+ * Authors:
+ * Atish Patra <atishp@rivosinc.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kvm_host.h>
+#include <asm/csr.h>
+#include <asm/sbi.h>
+#include <asm/kvm_vcpu_sbi.h>
+
+static int kvm_sbi_ext_pmu_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ int ret = 0;
+ struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+ unsigned long funcid = cp->a6;
+ u64 temp;
+
+ if (!kvpmu->init_done) {
+ retdata->err_val = SBI_ERR_NOT_SUPPORTED;
+ return 0;
+ }
+
+ switch (funcid) {
+ case SBI_EXT_PMU_NUM_COUNTERS:
+ ret = kvm_riscv_vcpu_pmu_num_ctrs(vcpu, retdata);
+ break;
+ case SBI_EXT_PMU_COUNTER_GET_INFO:
+ ret = kvm_riscv_vcpu_pmu_ctr_info(vcpu, cp->a0, retdata);
+ break;
+ case SBI_EXT_PMU_COUNTER_CFG_MATCH:
+#if defined(CONFIG_32BIT)
+ temp = ((uint64_t)cp->a5 << 32) | cp->a4;
+#else
+ temp = cp->a4;
+#endif
+ /*
+ * This can fail if perf core framework fails to create an event.
+ * Forward the error to userspace because it's an error which
+ * happened within the host kernel. The other option would be
+ * to convert to an SBI error and forward to the guest.
+ */
+ ret = kvm_riscv_vcpu_pmu_ctr_cfg_match(vcpu, cp->a0, cp->a1,
+ cp->a2, cp->a3, temp, retdata);
+ break;
+ case SBI_EXT_PMU_COUNTER_START:
+#if defined(CONFIG_32BIT)
+ temp = ((uint64_t)cp->a4 << 32) | cp->a3;
+#else
+ temp = cp->a3;
+#endif
+ ret = kvm_riscv_vcpu_pmu_ctr_start(vcpu, cp->a0, cp->a1, cp->a2,
+ temp, retdata);
+ break;
+ case SBI_EXT_PMU_COUNTER_STOP:
+ ret = kvm_riscv_vcpu_pmu_ctr_stop(vcpu, cp->a0, cp->a1, cp->a2, retdata);
+ break;
+ case SBI_EXT_PMU_COUNTER_FW_READ:
+ ret = kvm_riscv_vcpu_pmu_ctr_read(vcpu, cp->a0, retdata);
+ break;
+ default:
+ retdata->err_val = SBI_ERR_NOT_SUPPORTED;
+ }
+
+ return ret;
+}
+
+static unsigned long kvm_sbi_ext_pmu_probe(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+
+ return kvpmu->init_done;
+}
+
+const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_pmu = {
+ .extid_start = SBI_EXT_PMU,
+ .extid_end = SBI_EXT_PMU,
+ .handler = kvm_sbi_ext_pmu_handler,
+ .probe = kvm_sbi_ext_pmu_probe,
+};
diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c
new file mode 100644
index 000000000..7c4d5d38a
--- /dev/null
+++ b/arch/riscv/kvm/vcpu_sbi_replace.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2021 Western Digital Corporation or its affiliates.
+ *
+ * Authors:
+ * Atish Patra <atish.patra@wdc.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kvm_host.h>
+#include <asm/sbi.h>
+#include <asm/kvm_vcpu_timer.h>
+#include <asm/kvm_vcpu_pmu.h>
+#include <asm/kvm_vcpu_sbi.h>
+
+static int kvm_sbi_ext_time_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
+ u64 next_cycle;
+
+ if (cp->a6 != SBI_EXT_TIME_SET_TIMER) {
+ retdata->err_val = SBI_ERR_INVALID_PARAM;
+ return 0;
+ }
+
+ kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_SET_TIMER);
+#if __riscv_xlen == 32
+ next_cycle = ((u64)cp->a1 << 32) | (u64)cp->a0;
+#else
+ next_cycle = (u64)cp->a0;
+#endif
+ kvm_riscv_vcpu_timer_next_event(vcpu, next_cycle);
+
+ return 0;
+}
+
+const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_time = {
+ .extid_start = SBI_EXT_TIME,
+ .extid_end = SBI_EXT_TIME,
+ .handler = kvm_sbi_ext_time_handler,
+};
+
+static int kvm_sbi_ext_ipi_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ int ret = 0;
+ unsigned long i;
+ struct kvm_vcpu *tmp;
+ struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
+ unsigned long hmask = cp->a0;
+ unsigned long hbase = cp->a1;
+
+ if (cp->a6 != SBI_EXT_IPI_SEND_IPI) {
+ retdata->err_val = SBI_ERR_INVALID_PARAM;
+ return 0;
+ }
+
+ kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_IPI_SENT);
+ kvm_for_each_vcpu(i, tmp, vcpu->kvm) {
+ if (hbase != -1UL) {
+ if (tmp->vcpu_id < hbase)
+ continue;
+ if (!(hmask & (1UL << (tmp->vcpu_id - hbase))))
+ continue;
+ }
+ ret = kvm_riscv_vcpu_set_interrupt(tmp, IRQ_VS_SOFT);
+ if (ret < 0)
+ break;
+ kvm_riscv_vcpu_pmu_incr_fw(tmp, SBI_PMU_FW_IPI_RCVD);
+ }
+
+ return ret;
+}
+
+const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_ipi = {
+ .extid_start = SBI_EXT_IPI,
+ .extid_end = SBI_EXT_IPI,
+ .handler = kvm_sbi_ext_ipi_handler,
+};
+
+static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
+ unsigned long hmask = cp->a0;
+ unsigned long hbase = cp->a1;
+ unsigned long funcid = cp->a6;
+
+ switch (funcid) {
+ case SBI_EXT_RFENCE_REMOTE_FENCE_I:
+ kvm_riscv_fence_i(vcpu->kvm, hbase, hmask);
+ kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_FENCE_I_SENT);
+ break;
+ case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA:
+ if (cp->a2 == 0 && cp->a3 == 0)
+ kvm_riscv_hfence_vvma_all(vcpu->kvm, hbase, hmask);
+ else
+ kvm_riscv_hfence_vvma_gva(vcpu->kvm, hbase, hmask,
+ cp->a2, cp->a3, PAGE_SHIFT);
+ kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_SENT);
+ break;
+ case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID:
+ if (cp->a2 == 0 && cp->a3 == 0)
+ kvm_riscv_hfence_vvma_asid_all(vcpu->kvm,
+ hbase, hmask, cp->a4);
+ else
+ kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm,
+ hbase, hmask,
+ cp->a2, cp->a3,
+ PAGE_SHIFT, cp->a4);
+ kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_SENT);
+ break;
+ case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA:
+ case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID:
+ case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA:
+ case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID:
+ /*
+ * Until nested virtualization is implemented, the
+ * SBI HFENCE calls should be treated as NOPs
+ */
+ break;
+ default:
+ retdata->err_val = SBI_ERR_NOT_SUPPORTED;
+ }
+
+ return 0;
+}
+
+const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_rfence = {
+ .extid_start = SBI_EXT_RFENCE,
+ .extid_end = SBI_EXT_RFENCE,
+ .handler = kvm_sbi_ext_rfence_handler,
+};
+
+static int kvm_sbi_ext_srst_handler(struct kvm_vcpu *vcpu,
+ struct kvm_run *run,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
+ unsigned long funcid = cp->a6;
+ u32 reason = cp->a1;
+ u32 type = cp->a0;
+
+ switch (funcid) {
+ case SBI_EXT_SRST_RESET:
+ switch (type) {
+ case SBI_SRST_RESET_TYPE_SHUTDOWN:
+ kvm_riscv_vcpu_sbi_system_reset(vcpu, run,
+ KVM_SYSTEM_EVENT_SHUTDOWN,
+ reason);
+ retdata->uexit = true;
+ break;
+ case SBI_SRST_RESET_TYPE_COLD_REBOOT:
+ case SBI_SRST_RESET_TYPE_WARM_REBOOT:
+ kvm_riscv_vcpu_sbi_system_reset(vcpu, run,
+ KVM_SYSTEM_EVENT_RESET,
+ reason);
+ retdata->uexit = true;
+ break;
+ default:
+ retdata->err_val = SBI_ERR_NOT_SUPPORTED;
+ }
+ break;
+ default:
+ retdata->err_val = SBI_ERR_NOT_SUPPORTED;
+ }
+
+ return 0;
+}
+
+const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_srst = {
+ .extid_start = SBI_EXT_SRST,
+ .extid_end = SBI_EXT_SRST,
+ .handler = kvm_sbi_ext_srst_handler,
+};
diff --git a/arch/riscv/kvm/vcpu_sbi_v01.c b/arch/riscv/kvm/vcpu_sbi_v01.c
new file mode 100644
index 000000000..8f4c4fa16
--- /dev/null
+++ b/arch/riscv/kvm/vcpu_sbi_v01.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2021 Western Digital Corporation or its affiliates.
+ *
+ * Authors:
+ * Atish Patra <atish.patra@wdc.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kvm_host.h>
+#include <asm/sbi.h>
+#include <asm/kvm_vcpu_timer.h>
+#include <asm/kvm_vcpu_sbi.h>
+
+static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ ulong hmask;
+ int i, ret = 0;
+ u64 next_cycle;
+ struct kvm_vcpu *rvcpu;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
+ struct kvm_cpu_trap *utrap = retdata->utrap;
+
+ switch (cp->a7) {
+ case SBI_EXT_0_1_CONSOLE_GETCHAR:
+ case SBI_EXT_0_1_CONSOLE_PUTCHAR:
+ /*
+ * The CONSOLE_GETCHAR/CONSOLE_PUTCHAR SBI calls cannot be
+ * handled in kernel so we forward these to user-space
+ */
+ kvm_riscv_vcpu_sbi_forward(vcpu, run);
+ retdata->uexit = true;
+ break;
+ case SBI_EXT_0_1_SET_TIMER:
+#if __riscv_xlen == 32
+ next_cycle = ((u64)cp->a1 << 32) | (u64)cp->a0;
+#else
+ next_cycle = (u64)cp->a0;
+#endif
+ ret = kvm_riscv_vcpu_timer_next_event(vcpu, next_cycle);
+ break;
+ case SBI_EXT_0_1_CLEAR_IPI:
+ ret = kvm_riscv_vcpu_unset_interrupt(vcpu, IRQ_VS_SOFT);
+ break;
+ case SBI_EXT_0_1_SEND_IPI:
+ if (cp->a0)
+ hmask = kvm_riscv_vcpu_unpriv_read(vcpu, false, cp->a0, utrap);
+ else
+ hmask = (1UL << atomic_read(&kvm->online_vcpus)) - 1;
+ if (utrap->scause)
+ break;
+
+ for_each_set_bit(i, &hmask, BITS_PER_LONG) {
+ rvcpu = kvm_get_vcpu_by_id(vcpu->kvm, i);
+ ret = kvm_riscv_vcpu_set_interrupt(rvcpu, IRQ_VS_SOFT);
+ if (ret < 0)
+ break;
+ }
+ break;
+ case SBI_EXT_0_1_SHUTDOWN:
+ kvm_riscv_vcpu_sbi_system_reset(vcpu, run,
+ KVM_SYSTEM_EVENT_SHUTDOWN, 0);
+ retdata->uexit = true;
+ break;
+ case SBI_EXT_0_1_REMOTE_FENCE_I:
+ case SBI_EXT_0_1_REMOTE_SFENCE_VMA:
+ case SBI_EXT_0_1_REMOTE_SFENCE_VMA_ASID:
+ if (cp->a0)
+ hmask = kvm_riscv_vcpu_unpriv_read(vcpu, false, cp->a0, utrap);
+ else
+ hmask = (1UL << atomic_read(&kvm->online_vcpus)) - 1;
+ if (utrap->scause)
+ break;
+
+ if (cp->a7 == SBI_EXT_0_1_REMOTE_FENCE_I)
+ kvm_riscv_fence_i(vcpu->kvm, 0, hmask);
+ else if (cp->a7 == SBI_EXT_0_1_REMOTE_SFENCE_VMA) {
+ if (cp->a1 == 0 && cp->a2 == 0)
+ kvm_riscv_hfence_vvma_all(vcpu->kvm,
+ 0, hmask);
+ else
+ kvm_riscv_hfence_vvma_gva(vcpu->kvm,
+ 0, hmask,
+ cp->a1, cp->a2,
+ PAGE_SHIFT);
+ } else {
+ if (cp->a1 == 0 && cp->a2 == 0)
+ kvm_riscv_hfence_vvma_asid_all(vcpu->kvm,
+ 0, hmask,
+ cp->a3);
+ else
+ kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm,
+ 0, hmask,
+ cp->a1, cp->a2,
+ PAGE_SHIFT,
+ cp->a3);
+ }
+ break;
+ default:
+ retdata->err_val = SBI_ERR_NOT_SUPPORTED;
+ break;
+ }
+
+ return ret;
+}
+
+const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_v01 = {
+ .extid_start = SBI_EXT_0_1_SET_TIMER,
+ .extid_end = SBI_EXT_0_1_SHUTDOWN,
+ .handler = kvm_sbi_ext_v01_handler,
+};
diff --git a/arch/riscv/kvm/vcpu_switch.S b/arch/riscv/kvm/vcpu_switch.S
new file mode 100644
index 000000000..d74df8eb4
--- /dev/null
+++ b/arch/riscv/kvm/vcpu_switch.S
@@ -0,0 +1,408 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019 Western Digital Corporation or its affiliates.
+ *
+ * Authors:
+ * Anup Patel <anup.patel@wdc.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/csr.h>
+
+ .text
+ .altmacro
+ .option norelax
+
+ENTRY(__kvm_riscv_switch_to)
+ /* Save Host GPRs (except A0 and T0-T6) */
+ REG_S ra, (KVM_ARCH_HOST_RA)(a0)
+ REG_S sp, (KVM_ARCH_HOST_SP)(a0)
+ REG_S gp, (KVM_ARCH_HOST_GP)(a0)
+ REG_S tp, (KVM_ARCH_HOST_TP)(a0)
+ REG_S s0, (KVM_ARCH_HOST_S0)(a0)
+ REG_S s1, (KVM_ARCH_HOST_S1)(a0)
+ REG_S a1, (KVM_ARCH_HOST_A1)(a0)
+ REG_S a2, (KVM_ARCH_HOST_A2)(a0)
+ REG_S a3, (KVM_ARCH_HOST_A3)(a0)
+ REG_S a4, (KVM_ARCH_HOST_A4)(a0)
+ REG_S a5, (KVM_ARCH_HOST_A5)(a0)
+ REG_S a6, (KVM_ARCH_HOST_A6)(a0)
+ REG_S a7, (KVM_ARCH_HOST_A7)(a0)
+ REG_S s2, (KVM_ARCH_HOST_S2)(a0)
+ REG_S s3, (KVM_ARCH_HOST_S3)(a0)
+ REG_S s4, (KVM_ARCH_HOST_S4)(a0)
+ REG_S s5, (KVM_ARCH_HOST_S5)(a0)
+ REG_S s6, (KVM_ARCH_HOST_S6)(a0)
+ REG_S s7, (KVM_ARCH_HOST_S7)(a0)
+ REG_S s8, (KVM_ARCH_HOST_S8)(a0)
+ REG_S s9, (KVM_ARCH_HOST_S9)(a0)
+ REG_S s10, (KVM_ARCH_HOST_S10)(a0)
+ REG_S s11, (KVM_ARCH_HOST_S11)(a0)
+
+ /* Load Guest CSR values */
+ REG_L t0, (KVM_ARCH_GUEST_SSTATUS)(a0)
+ REG_L t1, (KVM_ARCH_GUEST_HSTATUS)(a0)
+ REG_L t2, (KVM_ARCH_GUEST_SCOUNTEREN)(a0)
+ la t4, __kvm_switch_return
+ REG_L t5, (KVM_ARCH_GUEST_SEPC)(a0)
+
+ /* Save Host and Restore Guest SSTATUS */
+ csrrw t0, CSR_SSTATUS, t0
+
+ /* Save Host and Restore Guest HSTATUS */
+ csrrw t1, CSR_HSTATUS, t1
+
+ /* Save Host and Restore Guest SCOUNTEREN */
+ csrrw t2, CSR_SCOUNTEREN, t2
+
+ /* Save Host STVEC and change it to return path */
+ csrrw t4, CSR_STVEC, t4
+
+ /* Save Host SSCRATCH and change it to struct kvm_vcpu_arch pointer */
+ csrrw t3, CSR_SSCRATCH, a0
+
+ /* Restore Guest SEPC */
+ csrw CSR_SEPC, t5
+
+ /* Store Host CSR values */
+ REG_S t0, (KVM_ARCH_HOST_SSTATUS)(a0)
+ REG_S t1, (KVM_ARCH_HOST_HSTATUS)(a0)
+ REG_S t2, (KVM_ARCH_HOST_SCOUNTEREN)(a0)
+ REG_S t3, (KVM_ARCH_HOST_SSCRATCH)(a0)
+ REG_S t4, (KVM_ARCH_HOST_STVEC)(a0)
+
+ /* Restore Guest GPRs (except A0) */
+ REG_L ra, (KVM_ARCH_GUEST_RA)(a0)
+ REG_L sp, (KVM_ARCH_GUEST_SP)(a0)
+ REG_L gp, (KVM_ARCH_GUEST_GP)(a0)
+ REG_L tp, (KVM_ARCH_GUEST_TP)(a0)
+ REG_L t0, (KVM_ARCH_GUEST_T0)(a0)
+ REG_L t1, (KVM_ARCH_GUEST_T1)(a0)
+ REG_L t2, (KVM_ARCH_GUEST_T2)(a0)
+ REG_L s0, (KVM_ARCH_GUEST_S0)(a0)
+ REG_L s1, (KVM_ARCH_GUEST_S1)(a0)
+ REG_L a1, (KVM_ARCH_GUEST_A1)(a0)
+ REG_L a2, (KVM_ARCH_GUEST_A2)(a0)
+ REG_L a3, (KVM_ARCH_GUEST_A3)(a0)
+ REG_L a4, (KVM_ARCH_GUEST_A4)(a0)
+ REG_L a5, (KVM_ARCH_GUEST_A5)(a0)
+ REG_L a6, (KVM_ARCH_GUEST_A6)(a0)
+ REG_L a7, (KVM_ARCH_GUEST_A7)(a0)
+ REG_L s2, (KVM_ARCH_GUEST_S2)(a0)
+ REG_L s3, (KVM_ARCH_GUEST_S3)(a0)
+ REG_L s4, (KVM_ARCH_GUEST_S4)(a0)
+ REG_L s5, (KVM_ARCH_GUEST_S5)(a0)
+ REG_L s6, (KVM_ARCH_GUEST_S6)(a0)
+ REG_L s7, (KVM_ARCH_GUEST_S7)(a0)
+ REG_L s8, (KVM_ARCH_GUEST_S8)(a0)
+ REG_L s9, (KVM_ARCH_GUEST_S9)(a0)
+ REG_L s10, (KVM_ARCH_GUEST_S10)(a0)
+ REG_L s11, (KVM_ARCH_GUEST_S11)(a0)
+ REG_L t3, (KVM_ARCH_GUEST_T3)(a0)
+ REG_L t4, (KVM_ARCH_GUEST_T4)(a0)
+ REG_L t5, (KVM_ARCH_GUEST_T5)(a0)
+ REG_L t6, (KVM_ARCH_GUEST_T6)(a0)
+
+ /* Restore Guest A0 */
+ REG_L a0, (KVM_ARCH_GUEST_A0)(a0)
+
+ /* Resume Guest */
+ sret
+
+ /* Back to Host */
+ .align 2
+__kvm_switch_return:
+ /* Swap Guest A0 with SSCRATCH */
+ csrrw a0, CSR_SSCRATCH, a0
+
+ /* Save Guest GPRs (except A0) */
+ REG_S ra, (KVM_ARCH_GUEST_RA)(a0)
+ REG_S sp, (KVM_ARCH_GUEST_SP)(a0)
+ REG_S gp, (KVM_ARCH_GUEST_GP)(a0)
+ REG_S tp, (KVM_ARCH_GUEST_TP)(a0)
+ REG_S t0, (KVM_ARCH_GUEST_T0)(a0)
+ REG_S t1, (KVM_ARCH_GUEST_T1)(a0)
+ REG_S t2, (KVM_ARCH_GUEST_T2)(a0)
+ REG_S s0, (KVM_ARCH_GUEST_S0)(a0)
+ REG_S s1, (KVM_ARCH_GUEST_S1)(a0)
+ REG_S a1, (KVM_ARCH_GUEST_A1)(a0)
+ REG_S a2, (KVM_ARCH_GUEST_A2)(a0)
+ REG_S a3, (KVM_ARCH_GUEST_A3)(a0)
+ REG_S a4, (KVM_ARCH_GUEST_A4)(a0)
+ REG_S a5, (KVM_ARCH_GUEST_A5)(a0)
+ REG_S a6, (KVM_ARCH_GUEST_A6)(a0)
+ REG_S a7, (KVM_ARCH_GUEST_A7)(a0)
+ REG_S s2, (KVM_ARCH_GUEST_S2)(a0)
+ REG_S s3, (KVM_ARCH_GUEST_S3)(a0)
+ REG_S s4, (KVM_ARCH_GUEST_S4)(a0)
+ REG_S s5, (KVM_ARCH_GUEST_S5)(a0)
+ REG_S s6, (KVM_ARCH_GUEST_S6)(a0)
+ REG_S s7, (KVM_ARCH_GUEST_S7)(a0)
+ REG_S s8, (KVM_ARCH_GUEST_S8)(a0)
+ REG_S s9, (KVM_ARCH_GUEST_S9)(a0)
+ REG_S s10, (KVM_ARCH_GUEST_S10)(a0)
+ REG_S s11, (KVM_ARCH_GUEST_S11)(a0)
+ REG_S t3, (KVM_ARCH_GUEST_T3)(a0)
+ REG_S t4, (KVM_ARCH_GUEST_T4)(a0)
+ REG_S t5, (KVM_ARCH_GUEST_T5)(a0)
+ REG_S t6, (KVM_ARCH_GUEST_T6)(a0)
+
+ /* Load Host CSR values */
+ REG_L t1, (KVM_ARCH_HOST_STVEC)(a0)
+ REG_L t2, (KVM_ARCH_HOST_SSCRATCH)(a0)
+ REG_L t3, (KVM_ARCH_HOST_SCOUNTEREN)(a0)
+ REG_L t4, (KVM_ARCH_HOST_HSTATUS)(a0)
+ REG_L t5, (KVM_ARCH_HOST_SSTATUS)(a0)
+
+ /* Save Guest SEPC */
+ csrr t0, CSR_SEPC
+
+ /* Save Guest A0 and Restore Host SSCRATCH */
+ csrrw t2, CSR_SSCRATCH, t2
+
+ /* Restore Host STVEC */
+ csrw CSR_STVEC, t1
+
+ /* Save Guest and Restore Host SCOUNTEREN */
+ csrrw t3, CSR_SCOUNTEREN, t3
+
+ /* Save Guest and Restore Host HSTATUS */
+ csrrw t4, CSR_HSTATUS, t4
+
+ /* Save Guest and Restore Host SSTATUS */
+ csrrw t5, CSR_SSTATUS, t5
+
+ /* Store Guest CSR values */
+ REG_S t0, (KVM_ARCH_GUEST_SEPC)(a0)
+ REG_S t2, (KVM_ARCH_GUEST_A0)(a0)
+ REG_S t3, (KVM_ARCH_GUEST_SCOUNTEREN)(a0)
+ REG_S t4, (KVM_ARCH_GUEST_HSTATUS)(a0)
+ REG_S t5, (KVM_ARCH_GUEST_SSTATUS)(a0)
+
+ /* Restore Host GPRs (except A0 and T0-T6) */
+ REG_L ra, (KVM_ARCH_HOST_RA)(a0)
+ REG_L sp, (KVM_ARCH_HOST_SP)(a0)
+ REG_L gp, (KVM_ARCH_HOST_GP)(a0)
+ REG_L tp, (KVM_ARCH_HOST_TP)(a0)
+ REG_L s0, (KVM_ARCH_HOST_S0)(a0)
+ REG_L s1, (KVM_ARCH_HOST_S1)(a0)
+ REG_L a1, (KVM_ARCH_HOST_A1)(a0)
+ REG_L a2, (KVM_ARCH_HOST_A2)(a0)
+ REG_L a3, (KVM_ARCH_HOST_A3)(a0)
+ REG_L a4, (KVM_ARCH_HOST_A4)(a0)
+ REG_L a5, (KVM_ARCH_HOST_A5)(a0)
+ REG_L a6, (KVM_ARCH_HOST_A6)(a0)
+ REG_L a7, (KVM_ARCH_HOST_A7)(a0)
+ REG_L s2, (KVM_ARCH_HOST_S2)(a0)
+ REG_L s3, (KVM_ARCH_HOST_S3)(a0)
+ REG_L s4, (KVM_ARCH_HOST_S4)(a0)
+ REG_L s5, (KVM_ARCH_HOST_S5)(a0)
+ REG_L s6, (KVM_ARCH_HOST_S6)(a0)
+ REG_L s7, (KVM_ARCH_HOST_S7)(a0)
+ REG_L s8, (KVM_ARCH_HOST_S8)(a0)
+ REG_L s9, (KVM_ARCH_HOST_S9)(a0)
+ REG_L s10, (KVM_ARCH_HOST_S10)(a0)
+ REG_L s11, (KVM_ARCH_HOST_S11)(a0)
+
+ /* Return to C code */
+ ret
+ENDPROC(__kvm_riscv_switch_to)
+
+ENTRY(__kvm_riscv_unpriv_trap)
+ /*
+ * We assume that faulting unpriv load/store instruction is
+ * 4-byte long and blindly increment SEPC by 4.
+ *
+ * The trap details will be saved at address pointed by 'A0'
+ * register and we use 'A1' register as temporary.
+ */
+ csrr a1, CSR_SEPC
+ REG_S a1, (KVM_ARCH_TRAP_SEPC)(a0)
+ addi a1, a1, 4
+ csrw CSR_SEPC, a1
+ csrr a1, CSR_SCAUSE
+ REG_S a1, (KVM_ARCH_TRAP_SCAUSE)(a0)
+ csrr a1, CSR_STVAL
+ REG_S a1, (KVM_ARCH_TRAP_STVAL)(a0)
+ csrr a1, CSR_HTVAL
+ REG_S a1, (KVM_ARCH_TRAP_HTVAL)(a0)
+ csrr a1, CSR_HTINST
+ REG_S a1, (KVM_ARCH_TRAP_HTINST)(a0)
+ sret
+ENDPROC(__kvm_riscv_unpriv_trap)
+
+#ifdef CONFIG_FPU
+ .align 3
+ .global __kvm_riscv_fp_f_save
+__kvm_riscv_fp_f_save:
+ csrr t2, CSR_SSTATUS
+ li t1, SR_FS
+ csrs CSR_SSTATUS, t1
+ frcsr t0
+ fsw f0, KVM_ARCH_FP_F_F0(a0)
+ fsw f1, KVM_ARCH_FP_F_F1(a0)
+ fsw f2, KVM_ARCH_FP_F_F2(a0)
+ fsw f3, KVM_ARCH_FP_F_F3(a0)
+ fsw f4, KVM_ARCH_FP_F_F4(a0)
+ fsw f5, KVM_ARCH_FP_F_F5(a0)
+ fsw f6, KVM_ARCH_FP_F_F6(a0)
+ fsw f7, KVM_ARCH_FP_F_F7(a0)
+ fsw f8, KVM_ARCH_FP_F_F8(a0)
+ fsw f9, KVM_ARCH_FP_F_F9(a0)
+ fsw f10, KVM_ARCH_FP_F_F10(a0)
+ fsw f11, KVM_ARCH_FP_F_F11(a0)
+ fsw f12, KVM_ARCH_FP_F_F12(a0)
+ fsw f13, KVM_ARCH_FP_F_F13(a0)
+ fsw f14, KVM_ARCH_FP_F_F14(a0)
+ fsw f15, KVM_ARCH_FP_F_F15(a0)
+ fsw f16, KVM_ARCH_FP_F_F16(a0)
+ fsw f17, KVM_ARCH_FP_F_F17(a0)
+ fsw f18, KVM_ARCH_FP_F_F18(a0)
+ fsw f19, KVM_ARCH_FP_F_F19(a0)
+ fsw f20, KVM_ARCH_FP_F_F20(a0)
+ fsw f21, KVM_ARCH_FP_F_F21(a0)
+ fsw f22, KVM_ARCH_FP_F_F22(a0)
+ fsw f23, KVM_ARCH_FP_F_F23(a0)
+ fsw f24, KVM_ARCH_FP_F_F24(a0)
+ fsw f25, KVM_ARCH_FP_F_F25(a0)
+ fsw f26, KVM_ARCH_FP_F_F26(a0)
+ fsw f27, KVM_ARCH_FP_F_F27(a0)
+ fsw f28, KVM_ARCH_FP_F_F28(a0)
+ fsw f29, KVM_ARCH_FP_F_F29(a0)
+ fsw f30, KVM_ARCH_FP_F_F30(a0)
+ fsw f31, KVM_ARCH_FP_F_F31(a0)
+ sw t0, KVM_ARCH_FP_F_FCSR(a0)
+ csrw CSR_SSTATUS, t2
+ ret
+
+ .align 3
+ .global __kvm_riscv_fp_d_save
+__kvm_riscv_fp_d_save:
+ csrr t2, CSR_SSTATUS
+ li t1, SR_FS
+ csrs CSR_SSTATUS, t1
+ frcsr t0
+ fsd f0, KVM_ARCH_FP_D_F0(a0)
+ fsd f1, KVM_ARCH_FP_D_F1(a0)
+ fsd f2, KVM_ARCH_FP_D_F2(a0)
+ fsd f3, KVM_ARCH_FP_D_F3(a0)
+ fsd f4, KVM_ARCH_FP_D_F4(a0)
+ fsd f5, KVM_ARCH_FP_D_F5(a0)
+ fsd f6, KVM_ARCH_FP_D_F6(a0)
+ fsd f7, KVM_ARCH_FP_D_F7(a0)
+ fsd f8, KVM_ARCH_FP_D_F8(a0)
+ fsd f9, KVM_ARCH_FP_D_F9(a0)
+ fsd f10, KVM_ARCH_FP_D_F10(a0)
+ fsd f11, KVM_ARCH_FP_D_F11(a0)
+ fsd f12, KVM_ARCH_FP_D_F12(a0)
+ fsd f13, KVM_ARCH_FP_D_F13(a0)
+ fsd f14, KVM_ARCH_FP_D_F14(a0)
+ fsd f15, KVM_ARCH_FP_D_F15(a0)
+ fsd f16, KVM_ARCH_FP_D_F16(a0)
+ fsd f17, KVM_ARCH_FP_D_F17(a0)
+ fsd f18, KVM_ARCH_FP_D_F18(a0)
+ fsd f19, KVM_ARCH_FP_D_F19(a0)
+ fsd f20, KVM_ARCH_FP_D_F20(a0)
+ fsd f21, KVM_ARCH_FP_D_F21(a0)
+ fsd f22, KVM_ARCH_FP_D_F22(a0)
+ fsd f23, KVM_ARCH_FP_D_F23(a0)
+ fsd f24, KVM_ARCH_FP_D_F24(a0)
+ fsd f25, KVM_ARCH_FP_D_F25(a0)
+ fsd f26, KVM_ARCH_FP_D_F26(a0)
+ fsd f27, KVM_ARCH_FP_D_F27(a0)
+ fsd f28, KVM_ARCH_FP_D_F28(a0)
+ fsd f29, KVM_ARCH_FP_D_F29(a0)
+ fsd f30, KVM_ARCH_FP_D_F30(a0)
+ fsd f31, KVM_ARCH_FP_D_F31(a0)
+ sw t0, KVM_ARCH_FP_D_FCSR(a0)
+ csrw CSR_SSTATUS, t2
+ ret
+
+ .align 3
+ .global __kvm_riscv_fp_f_restore
+__kvm_riscv_fp_f_restore:
+ csrr t2, CSR_SSTATUS
+ li t1, SR_FS
+ lw t0, KVM_ARCH_FP_F_FCSR(a0)
+ csrs CSR_SSTATUS, t1
+ flw f0, KVM_ARCH_FP_F_F0(a0)
+ flw f1, KVM_ARCH_FP_F_F1(a0)
+ flw f2, KVM_ARCH_FP_F_F2(a0)
+ flw f3, KVM_ARCH_FP_F_F3(a0)
+ flw f4, KVM_ARCH_FP_F_F4(a0)
+ flw f5, KVM_ARCH_FP_F_F5(a0)
+ flw f6, KVM_ARCH_FP_F_F6(a0)
+ flw f7, KVM_ARCH_FP_F_F7(a0)
+ flw f8, KVM_ARCH_FP_F_F8(a0)
+ flw f9, KVM_ARCH_FP_F_F9(a0)
+ flw f10, KVM_ARCH_FP_F_F10(a0)
+ flw f11, KVM_ARCH_FP_F_F11(a0)
+ flw f12, KVM_ARCH_FP_F_F12(a0)
+ flw f13, KVM_ARCH_FP_F_F13(a0)
+ flw f14, KVM_ARCH_FP_F_F14(a0)
+ flw f15, KVM_ARCH_FP_F_F15(a0)
+ flw f16, KVM_ARCH_FP_F_F16(a0)
+ flw f17, KVM_ARCH_FP_F_F17(a0)
+ flw f18, KVM_ARCH_FP_F_F18(a0)
+ flw f19, KVM_ARCH_FP_F_F19(a0)
+ flw f20, KVM_ARCH_FP_F_F20(a0)
+ flw f21, KVM_ARCH_FP_F_F21(a0)
+ flw f22, KVM_ARCH_FP_F_F22(a0)
+ flw f23, KVM_ARCH_FP_F_F23(a0)
+ flw f24, KVM_ARCH_FP_F_F24(a0)
+ flw f25, KVM_ARCH_FP_F_F25(a0)
+ flw f26, KVM_ARCH_FP_F_F26(a0)
+ flw f27, KVM_ARCH_FP_F_F27(a0)
+ flw f28, KVM_ARCH_FP_F_F28(a0)
+ flw f29, KVM_ARCH_FP_F_F29(a0)
+ flw f30, KVM_ARCH_FP_F_F30(a0)
+ flw f31, KVM_ARCH_FP_F_F31(a0)
+ fscsr t0
+ csrw CSR_SSTATUS, t2
+ ret
+
+ .align 3
+ .global __kvm_riscv_fp_d_restore
+__kvm_riscv_fp_d_restore:
+ csrr t2, CSR_SSTATUS
+ li t1, SR_FS
+ lw t0, KVM_ARCH_FP_D_FCSR(a0)
+ csrs CSR_SSTATUS, t1
+ fld f0, KVM_ARCH_FP_D_F0(a0)
+ fld f1, KVM_ARCH_FP_D_F1(a0)
+ fld f2, KVM_ARCH_FP_D_F2(a0)
+ fld f3, KVM_ARCH_FP_D_F3(a0)
+ fld f4, KVM_ARCH_FP_D_F4(a0)
+ fld f5, KVM_ARCH_FP_D_F5(a0)
+ fld f6, KVM_ARCH_FP_D_F6(a0)
+ fld f7, KVM_ARCH_FP_D_F7(a0)
+ fld f8, KVM_ARCH_FP_D_F8(a0)
+ fld f9, KVM_ARCH_FP_D_F9(a0)
+ fld f10, KVM_ARCH_FP_D_F10(a0)
+ fld f11, KVM_ARCH_FP_D_F11(a0)
+ fld f12, KVM_ARCH_FP_D_F12(a0)
+ fld f13, KVM_ARCH_FP_D_F13(a0)
+ fld f14, KVM_ARCH_FP_D_F14(a0)
+ fld f15, KVM_ARCH_FP_D_F15(a0)
+ fld f16, KVM_ARCH_FP_D_F16(a0)
+ fld f17, KVM_ARCH_FP_D_F17(a0)
+ fld f18, KVM_ARCH_FP_D_F18(a0)
+ fld f19, KVM_ARCH_FP_D_F19(a0)
+ fld f20, KVM_ARCH_FP_D_F20(a0)
+ fld f21, KVM_ARCH_FP_D_F21(a0)
+ fld f22, KVM_ARCH_FP_D_F22(a0)
+ fld f23, KVM_ARCH_FP_D_F23(a0)
+ fld f24, KVM_ARCH_FP_D_F24(a0)
+ fld f25, KVM_ARCH_FP_D_F25(a0)
+ fld f26, KVM_ARCH_FP_D_F26(a0)
+ fld f27, KVM_ARCH_FP_D_F27(a0)
+ fld f28, KVM_ARCH_FP_D_F28(a0)
+ fld f29, KVM_ARCH_FP_D_F29(a0)
+ fld f30, KVM_ARCH_FP_D_F30(a0)
+ fld f31, KVM_ARCH_FP_D_F31(a0)
+ fscsr t0
+ csrw CSR_SSTATUS, t2
+ ret
+#endif
diff --git a/arch/riscv/kvm/vcpu_timer.c b/arch/riscv/kvm/vcpu_timer.c
new file mode 100644
index 000000000..75486b25a
--- /dev/null
+++ b/arch/riscv/kvm/vcpu_timer.c
@@ -0,0 +1,363 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Western Digital Corporation or its affiliates.
+ *
+ * Authors:
+ * Atish Patra <atish.patra@wdc.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kvm_host.h>
+#include <linux/uaccess.h>
+#include <clocksource/timer-riscv.h>
+#include <asm/csr.h>
+#include <asm/delay.h>
+#include <asm/kvm_vcpu_timer.h>
+
+static u64 kvm_riscv_current_cycles(struct kvm_guest_timer *gt)
+{
+ return get_cycles64() + gt->time_delta;
+}
+
+static u64 kvm_riscv_delta_cycles2ns(u64 cycles,
+ struct kvm_guest_timer *gt,
+ struct kvm_vcpu_timer *t)
+{
+ unsigned long flags;
+ u64 cycles_now, cycles_delta, delta_ns;
+
+ local_irq_save(flags);
+ cycles_now = kvm_riscv_current_cycles(gt);
+ if (cycles_now < cycles)
+ cycles_delta = cycles - cycles_now;
+ else
+ cycles_delta = 0;
+ delta_ns = (cycles_delta * gt->nsec_mult) >> gt->nsec_shift;
+ local_irq_restore(flags);
+
+ return delta_ns;
+}
+
+static enum hrtimer_restart kvm_riscv_vcpu_hrtimer_expired(struct hrtimer *h)
+{
+ u64 delta_ns;
+ struct kvm_vcpu_timer *t = container_of(h, struct kvm_vcpu_timer, hrt);
+ struct kvm_vcpu *vcpu = container_of(t, struct kvm_vcpu, arch.timer);
+ struct kvm_guest_timer *gt = &vcpu->kvm->arch.timer;
+
+ if (kvm_riscv_current_cycles(gt) < t->next_cycles) {
+ delta_ns = kvm_riscv_delta_cycles2ns(t->next_cycles, gt, t);
+ hrtimer_forward_now(&t->hrt, ktime_set(0, delta_ns));
+ return HRTIMER_RESTART;
+ }
+
+ t->next_set = false;
+ kvm_riscv_vcpu_set_interrupt(vcpu, IRQ_VS_TIMER);
+
+ return HRTIMER_NORESTART;
+}
+
+static int kvm_riscv_vcpu_timer_cancel(struct kvm_vcpu_timer *t)
+{
+ if (!t->init_done || !t->next_set)
+ return -EINVAL;
+
+ hrtimer_cancel(&t->hrt);
+ t->next_set = false;
+
+ return 0;
+}
+
+static int kvm_riscv_vcpu_update_vstimecmp(struct kvm_vcpu *vcpu, u64 ncycles)
+{
+#if defined(CONFIG_32BIT)
+ csr_write(CSR_VSTIMECMP, ncycles & 0xFFFFFFFF);
+ csr_write(CSR_VSTIMECMPH, ncycles >> 32);
+#else
+ csr_write(CSR_VSTIMECMP, ncycles);
+#endif
+ return 0;
+}
+
+static int kvm_riscv_vcpu_update_hrtimer(struct kvm_vcpu *vcpu, u64 ncycles)
+{
+ struct kvm_vcpu_timer *t = &vcpu->arch.timer;
+ struct kvm_guest_timer *gt = &vcpu->kvm->arch.timer;
+ u64 delta_ns;
+
+ if (!t->init_done)
+ return -EINVAL;
+
+ kvm_riscv_vcpu_unset_interrupt(vcpu, IRQ_VS_TIMER);
+
+ delta_ns = kvm_riscv_delta_cycles2ns(ncycles, gt, t);
+ t->next_cycles = ncycles;
+ hrtimer_start(&t->hrt, ktime_set(0, delta_ns), HRTIMER_MODE_REL);
+ t->next_set = true;
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_timer_next_event(struct kvm_vcpu *vcpu, u64 ncycles)
+{
+ struct kvm_vcpu_timer *t = &vcpu->arch.timer;
+
+ return t->timer_next_event(vcpu, ncycles);
+}
+
+static enum hrtimer_restart kvm_riscv_vcpu_vstimer_expired(struct hrtimer *h)
+{
+ u64 delta_ns;
+ struct kvm_vcpu_timer *t = container_of(h, struct kvm_vcpu_timer, hrt);
+ struct kvm_vcpu *vcpu = container_of(t, struct kvm_vcpu, arch.timer);
+ struct kvm_guest_timer *gt = &vcpu->kvm->arch.timer;
+
+ if (kvm_riscv_current_cycles(gt) < t->next_cycles) {
+ delta_ns = kvm_riscv_delta_cycles2ns(t->next_cycles, gt, t);
+ hrtimer_forward_now(&t->hrt, ktime_set(0, delta_ns));
+ return HRTIMER_RESTART;
+ }
+
+ t->next_set = false;
+ kvm_vcpu_kick(vcpu);
+
+ return HRTIMER_NORESTART;
+}
+
+bool kvm_riscv_vcpu_timer_pending(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_timer *t = &vcpu->arch.timer;
+ struct kvm_guest_timer *gt = &vcpu->kvm->arch.timer;
+
+ if (!kvm_riscv_delta_cycles2ns(t->next_cycles, gt, t) ||
+ kvm_riscv_vcpu_has_interrupts(vcpu, 1UL << IRQ_VS_TIMER))
+ return true;
+ else
+ return false;
+}
+
+static void kvm_riscv_vcpu_timer_blocking(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_timer *t = &vcpu->arch.timer;
+ struct kvm_guest_timer *gt = &vcpu->kvm->arch.timer;
+ u64 delta_ns;
+
+ if (!t->init_done)
+ return;
+
+ delta_ns = kvm_riscv_delta_cycles2ns(t->next_cycles, gt, t);
+ hrtimer_start(&t->hrt, ktime_set(0, delta_ns), HRTIMER_MODE_REL);
+ t->next_set = true;
+}
+
+static void kvm_riscv_vcpu_timer_unblocking(struct kvm_vcpu *vcpu)
+{
+ kvm_riscv_vcpu_timer_cancel(&vcpu->arch.timer);
+}
+
+int kvm_riscv_vcpu_get_reg_timer(struct kvm_vcpu *vcpu,
+ const struct kvm_one_reg *reg)
+{
+ struct kvm_vcpu_timer *t = &vcpu->arch.timer;
+ struct kvm_guest_timer *gt = &vcpu->kvm->arch.timer;
+ u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr;
+ unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+ KVM_REG_SIZE_MASK |
+ KVM_REG_RISCV_TIMER);
+ u64 reg_val;
+
+ if (KVM_REG_SIZE(reg->id) != sizeof(u64))
+ return -EINVAL;
+ if (reg_num >= sizeof(struct kvm_riscv_timer) / sizeof(u64))
+ return -ENOENT;
+
+ switch (reg_num) {
+ case KVM_REG_RISCV_TIMER_REG(frequency):
+ reg_val = riscv_timebase;
+ break;
+ case KVM_REG_RISCV_TIMER_REG(time):
+ reg_val = kvm_riscv_current_cycles(gt);
+ break;
+ case KVM_REG_RISCV_TIMER_REG(compare):
+ reg_val = t->next_cycles;
+ break;
+ case KVM_REG_RISCV_TIMER_REG(state):
+ reg_val = (t->next_set) ? KVM_RISCV_TIMER_STATE_ON :
+ KVM_RISCV_TIMER_STATE_OFF;
+ break;
+ default:
+ return -ENOENT;
+ }
+
+ if (copy_to_user(uaddr, &reg_val, KVM_REG_SIZE(reg->id)))
+ return -EFAULT;
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_set_reg_timer(struct kvm_vcpu *vcpu,
+ const struct kvm_one_reg *reg)
+{
+ struct kvm_vcpu_timer *t = &vcpu->arch.timer;
+ struct kvm_guest_timer *gt = &vcpu->kvm->arch.timer;
+ u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr;
+ unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+ KVM_REG_SIZE_MASK |
+ KVM_REG_RISCV_TIMER);
+ u64 reg_val;
+ int ret = 0;
+
+ if (KVM_REG_SIZE(reg->id) != sizeof(u64))
+ return -EINVAL;
+ if (reg_num >= sizeof(struct kvm_riscv_timer) / sizeof(u64))
+ return -ENOENT;
+
+ if (copy_from_user(&reg_val, uaddr, KVM_REG_SIZE(reg->id)))
+ return -EFAULT;
+
+ switch (reg_num) {
+ case KVM_REG_RISCV_TIMER_REG(frequency):
+ if (reg_val != riscv_timebase)
+ return -EINVAL;
+ break;
+ case KVM_REG_RISCV_TIMER_REG(time):
+ gt->time_delta = reg_val - get_cycles64();
+ break;
+ case KVM_REG_RISCV_TIMER_REG(compare):
+ t->next_cycles = reg_val;
+ break;
+ case KVM_REG_RISCV_TIMER_REG(state):
+ if (reg_val == KVM_RISCV_TIMER_STATE_ON)
+ ret = kvm_riscv_vcpu_timer_next_event(vcpu, reg_val);
+ else
+ ret = kvm_riscv_vcpu_timer_cancel(t);
+ break;
+ default:
+ ret = -ENOENT;
+ break;
+ }
+
+ return ret;
+}
+
+int kvm_riscv_vcpu_timer_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_timer *t = &vcpu->arch.timer;
+
+ if (t->init_done)
+ return -EINVAL;
+
+ hrtimer_init(&t->hrt, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ t->init_done = true;
+ t->next_set = false;
+
+ /* Enable sstc for every vcpu if available in hardware */
+ if (riscv_isa_extension_available(NULL, SSTC)) {
+ t->sstc_enabled = true;
+ t->hrt.function = kvm_riscv_vcpu_vstimer_expired;
+ t->timer_next_event = kvm_riscv_vcpu_update_vstimecmp;
+ } else {
+ t->sstc_enabled = false;
+ t->hrt.function = kvm_riscv_vcpu_hrtimer_expired;
+ t->timer_next_event = kvm_riscv_vcpu_update_hrtimer;
+ }
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_timer_deinit(struct kvm_vcpu *vcpu)
+{
+ int ret;
+
+ ret = kvm_riscv_vcpu_timer_cancel(&vcpu->arch.timer);
+ vcpu->arch.timer.init_done = false;
+
+ return ret;
+}
+
+int kvm_riscv_vcpu_timer_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_timer *t = &vcpu->arch.timer;
+
+ t->next_cycles = -1ULL;
+ return kvm_riscv_vcpu_timer_cancel(&vcpu->arch.timer);
+}
+
+static void kvm_riscv_vcpu_update_timedelta(struct kvm_vcpu *vcpu)
+{
+ struct kvm_guest_timer *gt = &vcpu->kvm->arch.timer;
+
+#if defined(CONFIG_32BIT)
+ csr_write(CSR_HTIMEDELTA, (u32)(gt->time_delta));
+ csr_write(CSR_HTIMEDELTAH, (u32)(gt->time_delta >> 32));
+#else
+ csr_write(CSR_HTIMEDELTA, gt->time_delta);
+#endif
+}
+
+void kvm_riscv_vcpu_timer_restore(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_timer *t = &vcpu->arch.timer;
+
+ kvm_riscv_vcpu_update_timedelta(vcpu);
+
+ if (!t->sstc_enabled)
+ return;
+
+#if defined(CONFIG_32BIT)
+ csr_write(CSR_VSTIMECMP, (u32)t->next_cycles);
+ csr_write(CSR_VSTIMECMPH, (u32)(t->next_cycles >> 32));
+#else
+ csr_write(CSR_VSTIMECMP, t->next_cycles);
+#endif
+
+ /* timer should be enabled for the remaining operations */
+ if (unlikely(!t->init_done))
+ return;
+
+ kvm_riscv_vcpu_timer_unblocking(vcpu);
+}
+
+void kvm_riscv_vcpu_timer_sync(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_timer *t = &vcpu->arch.timer;
+
+ if (!t->sstc_enabled)
+ return;
+
+#if defined(CONFIG_32BIT)
+ t->next_cycles = csr_read(CSR_VSTIMECMP);
+ t->next_cycles |= (u64)csr_read(CSR_VSTIMECMPH) << 32;
+#else
+ t->next_cycles = csr_read(CSR_VSTIMECMP);
+#endif
+}
+
+void kvm_riscv_vcpu_timer_save(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_timer *t = &vcpu->arch.timer;
+
+ if (!t->sstc_enabled)
+ return;
+
+ /*
+ * The vstimecmp CSRs are saved by kvm_riscv_vcpu_timer_sync()
+ * upon every VM exit so no need to save here.
+ */
+
+ /* timer should be enabled for the remaining operations */
+ if (unlikely(!t->init_done))
+ return;
+
+ if (kvm_vcpu_is_blocking(vcpu))
+ kvm_riscv_vcpu_timer_blocking(vcpu);
+}
+
+void kvm_riscv_guest_timer_init(struct kvm *kvm)
+{
+ struct kvm_guest_timer *gt = &kvm->arch.timer;
+
+ riscv_cs_get_mult_shift(&gt->nsec_mult, &gt->nsec_shift);
+ gt->time_delta = -get_cycles64();
+}
diff --git a/arch/riscv/kvm/vcpu_vector.c b/arch/riscv/kvm/vcpu_vector.c
new file mode 100644
index 000000000..b430cbb69
--- /dev/null
+++ b/arch/riscv/kvm/vcpu_vector.c
@@ -0,0 +1,184 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022 SiFive
+ *
+ * Authors:
+ * Vincent Chen <vincent.chen@sifive.com>
+ * Greentime Hu <greentime.hu@sifive.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kvm_host.h>
+#include <linux/uaccess.h>
+#include <asm/hwcap.h>
+#include <asm/kvm_vcpu_vector.h>
+#include <asm/vector.h>
+
+#ifdef CONFIG_RISCV_ISA_V
+void kvm_riscv_vcpu_vector_reset(struct kvm_vcpu *vcpu)
+{
+ unsigned long *isa = vcpu->arch.isa;
+ struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
+
+ cntx->sstatus &= ~SR_VS;
+ if (riscv_isa_extension_available(isa, v)) {
+ cntx->sstatus |= SR_VS_INITIAL;
+ WARN_ON(!cntx->vector.datap);
+ memset(cntx->vector.datap, 0, riscv_v_vsize);
+ } else {
+ cntx->sstatus |= SR_VS_OFF;
+ }
+}
+
+static void kvm_riscv_vcpu_vector_clean(struct kvm_cpu_context *cntx)
+{
+ cntx->sstatus &= ~SR_VS;
+ cntx->sstatus |= SR_VS_CLEAN;
+}
+
+void kvm_riscv_vcpu_guest_vector_save(struct kvm_cpu_context *cntx,
+ unsigned long *isa)
+{
+ if ((cntx->sstatus & SR_VS) == SR_VS_DIRTY) {
+ if (riscv_isa_extension_available(isa, v))
+ __kvm_riscv_vector_save(cntx);
+ kvm_riscv_vcpu_vector_clean(cntx);
+ }
+}
+
+void kvm_riscv_vcpu_guest_vector_restore(struct kvm_cpu_context *cntx,
+ unsigned long *isa)
+{
+ if ((cntx->sstatus & SR_VS) != SR_VS_OFF) {
+ if (riscv_isa_extension_available(isa, v))
+ __kvm_riscv_vector_restore(cntx);
+ kvm_riscv_vcpu_vector_clean(cntx);
+ }
+}
+
+void kvm_riscv_vcpu_host_vector_save(struct kvm_cpu_context *cntx)
+{
+ /* No need to check host sstatus as it can be modified outside */
+ if (riscv_isa_extension_available(NULL, v))
+ __kvm_riscv_vector_save(cntx);
+}
+
+void kvm_riscv_vcpu_host_vector_restore(struct kvm_cpu_context *cntx)
+{
+ if (riscv_isa_extension_available(NULL, v))
+ __kvm_riscv_vector_restore(cntx);
+}
+
+int kvm_riscv_vcpu_alloc_vector_context(struct kvm_vcpu *vcpu,
+ struct kvm_cpu_context *cntx)
+{
+ cntx->vector.datap = kmalloc(riscv_v_vsize, GFP_KERNEL);
+ if (!cntx->vector.datap)
+ return -ENOMEM;
+
+ vcpu->arch.host_context.vector.datap = kzalloc(riscv_v_vsize, GFP_KERNEL);
+ if (!vcpu->arch.host_context.vector.datap)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void kvm_riscv_vcpu_free_vector_context(struct kvm_vcpu *vcpu)
+{
+ kfree(vcpu->arch.guest_reset_context.vector.datap);
+ kfree(vcpu->arch.host_context.vector.datap);
+}
+#endif
+
+static int kvm_riscv_vcpu_vreg_addr(struct kvm_vcpu *vcpu,
+ unsigned long reg_num,
+ size_t reg_size,
+ void **reg_addr)
+{
+ struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
+ size_t vlenb = riscv_v_vsize / 32;
+
+ if (reg_num < KVM_REG_RISCV_VECTOR_REG(0)) {
+ if (reg_size != sizeof(unsigned long))
+ return -EINVAL;
+ switch (reg_num) {
+ case KVM_REG_RISCV_VECTOR_CSR_REG(vstart):
+ *reg_addr = &cntx->vector.vstart;
+ break;
+ case KVM_REG_RISCV_VECTOR_CSR_REG(vl):
+ *reg_addr = &cntx->vector.vl;
+ break;
+ case KVM_REG_RISCV_VECTOR_CSR_REG(vtype):
+ *reg_addr = &cntx->vector.vtype;
+ break;
+ case KVM_REG_RISCV_VECTOR_CSR_REG(vcsr):
+ *reg_addr = &cntx->vector.vcsr;
+ break;
+ case KVM_REG_RISCV_VECTOR_CSR_REG(datap):
+ default:
+ return -ENOENT;
+ }
+ } else if (reg_num <= KVM_REG_RISCV_VECTOR_REG(31)) {
+ if (reg_size != vlenb)
+ return -EINVAL;
+ *reg_addr = cntx->vector.datap +
+ (reg_num - KVM_REG_RISCV_VECTOR_REG(0)) * vlenb;
+ } else {
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_get_reg_vector(struct kvm_vcpu *vcpu,
+ const struct kvm_one_reg *reg)
+{
+ unsigned long *isa = vcpu->arch.isa;
+ unsigned long __user *uaddr =
+ (unsigned long __user *)(unsigned long)reg->addr;
+ unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+ KVM_REG_SIZE_MASK |
+ KVM_REG_RISCV_VECTOR);
+ size_t reg_size = KVM_REG_SIZE(reg->id);
+ void *reg_addr;
+ int rc;
+
+ if (!riscv_isa_extension_available(isa, v))
+ return -ENOENT;
+
+ rc = kvm_riscv_vcpu_vreg_addr(vcpu, reg_num, reg_size, &reg_addr);
+ if (rc)
+ return rc;
+
+ if (copy_to_user(uaddr, reg_addr, reg_size))
+ return -EFAULT;
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_set_reg_vector(struct kvm_vcpu *vcpu,
+ const struct kvm_one_reg *reg)
+{
+ unsigned long *isa = vcpu->arch.isa;
+ unsigned long __user *uaddr =
+ (unsigned long __user *)(unsigned long)reg->addr;
+ unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK |
+ KVM_REG_SIZE_MASK |
+ KVM_REG_RISCV_VECTOR);
+ size_t reg_size = KVM_REG_SIZE(reg->id);
+ void *reg_addr;
+ int rc;
+
+ if (!riscv_isa_extension_available(isa, v))
+ return -ENOENT;
+
+ rc = kvm_riscv_vcpu_vreg_addr(vcpu, reg_num, reg_size, &reg_addr);
+ if (rc)
+ return rc;
+
+ if (copy_from_user(reg_addr, uaddr, reg_size))
+ return -EFAULT;
+
+ return 0;
+}
diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c
new file mode 100644
index 000000000..7e2b50c69
--- /dev/null
+++ b/arch/riscv/kvm/vm.c
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Western Digital Corporation or its affiliates.
+ *
+ * Authors:
+ * Anup Patel <anup.patel@wdc.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/kvm_host.h>
+
+const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
+ KVM_GENERIC_VM_STATS()
+};
+static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
+ sizeof(struct kvm_vm_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vm_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vm_stats_desc),
+};
+
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
+{
+ int r;
+
+ r = kvm_riscv_gstage_alloc_pgd(kvm);
+ if (r)
+ return r;
+
+ r = kvm_riscv_gstage_vmid_init(kvm);
+ if (r) {
+ kvm_riscv_gstage_free_pgd(kvm);
+ return r;
+ }
+
+ kvm_riscv_aia_init_vm(kvm);
+
+ kvm_riscv_guest_timer_init(kvm);
+
+ return 0;
+}
+
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+ kvm_destroy_vcpus(kvm);
+
+ kvm_riscv_aia_destroy_vm(kvm);
+}
+
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irql,
+ bool line_status)
+{
+ if (!irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ return kvm_riscv_aia_inject_irq(kvm, irql->irq, irql->level);
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id,
+ int level, bool line_status)
+{
+ struct kvm_msi msi;
+
+ if (!level)
+ return -1;
+
+ msi.address_lo = e->msi.address_lo;
+ msi.address_hi = e->msi.address_hi;
+ msi.data = e->msi.data;
+ msi.flags = e->msi.flags;
+ msi.devid = e->msi.devid;
+
+ return kvm_riscv_aia_inject_msi(kvm, &msi);
+}
+
+static int kvm_riscv_set_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id,
+ int level, bool line_status)
+{
+ return kvm_riscv_aia_inject_irq(kvm, e->irqchip.pin, level);
+}
+
+int kvm_riscv_setup_default_irq_routing(struct kvm *kvm, u32 lines)
+{
+ struct kvm_irq_routing_entry *ents;
+ int i, rc;
+
+ ents = kcalloc(lines, sizeof(*ents), GFP_KERNEL);
+ if (!ents)
+ return -ENOMEM;
+
+ for (i = 0; i < lines; i++) {
+ ents[i].gsi = i;
+ ents[i].type = KVM_IRQ_ROUTING_IRQCHIP;
+ ents[i].u.irqchip.irqchip = 0;
+ ents[i].u.irqchip.pin = i;
+ }
+ rc = kvm_set_irq_routing(kvm, ents, lines, 0);
+ kfree(ents);
+
+ return rc;
+}
+
+bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm);
+}
+
+int kvm_set_routing_entry(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+{
+ int r = -EINVAL;
+
+ switch (ue->type) {
+ case KVM_IRQ_ROUTING_IRQCHIP:
+ e->set = kvm_riscv_set_irq;
+ e->irqchip.irqchip = ue->u.irqchip.irqchip;
+ e->irqchip.pin = ue->u.irqchip.pin;
+ if ((e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS) ||
+ (e->irqchip.irqchip >= KVM_NR_IRQCHIPS))
+ goto out;
+ break;
+ case KVM_IRQ_ROUTING_MSI:
+ e->set = kvm_set_msi;
+ e->msi.address_lo = ue->u.msi.address_lo;
+ e->msi.address_hi = ue->u.msi.address_hi;
+ e->msi.data = ue->u.msi.data;
+ e->msi.flags = ue->flags;
+ e->msi.devid = ue->u.msi.devid;
+ break;
+ default:
+ goto out;
+ }
+ r = 0;
+out:
+ return r;
+}
+
+int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ if (!level)
+ return -EWOULDBLOCK;
+
+ switch (e->type) {
+ case KVM_IRQ_ROUTING_MSI:
+ return kvm_set_msi(e, kvm, irq_source_id, level, line_status);
+
+ case KVM_IRQ_ROUTING_IRQCHIP:
+ return kvm_riscv_set_irq(e, kvm, irq_source_id,
+ level, line_status);
+ }
+
+ return -EWOULDBLOCK;
+}
+
+bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm);
+}
+
+int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
+{
+ int r;
+
+ switch (ext) {
+ case KVM_CAP_IRQCHIP:
+ r = kvm_riscv_aia_available();
+ break;
+ case KVM_CAP_IOEVENTFD:
+ case KVM_CAP_DEVICE_CTRL:
+ case KVM_CAP_USER_MEMORY:
+ case KVM_CAP_SYNC_MMU:
+ case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
+ case KVM_CAP_ONE_REG:
+ case KVM_CAP_READONLY_MEM:
+ case KVM_CAP_MP_STATE:
+ case KVM_CAP_IMMEDIATE_EXIT:
+ r = 1;
+ break;
+ case KVM_CAP_NR_VCPUS:
+ r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS);
+ break;
+ case KVM_CAP_MAX_VCPUS:
+ r = KVM_MAX_VCPUS;
+ break;
+ case KVM_CAP_NR_MEMSLOTS:
+ r = KVM_USER_MEM_SLOTS;
+ break;
+ case KVM_CAP_VM_GPA_BITS:
+ r = kvm_riscv_gstage_gpa_bits();
+ break;
+ default:
+ r = 0;
+ break;
+ }
+
+ return r;
+}
+
+int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
+{
+ return -EINVAL;
+}
diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c
new file mode 100644
index 000000000..ddc98714c
--- /dev/null
+++ b/arch/riscv/kvm/vmid.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Western Digital Corporation or its affiliates.
+ *
+ * Authors:
+ * Anup Patel <anup.patel@wdc.com>
+ */
+
+#include <linux/bitops.h>
+#include <linux/cpumask.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/smp.h>
+#include <linux/kvm_host.h>
+#include <asm/csr.h>
+
+static unsigned long vmid_version = 1;
+static unsigned long vmid_next;
+static unsigned long vmid_bits __ro_after_init;
+static DEFINE_SPINLOCK(vmid_lock);
+
+void __init kvm_riscv_gstage_vmid_detect(void)
+{
+ unsigned long old;
+
+ /* Figure-out number of VMID bits in HW */
+ old = csr_read(CSR_HGATP);
+ csr_write(CSR_HGATP, old | HGATP_VMID);
+ vmid_bits = csr_read(CSR_HGATP);
+ vmid_bits = (vmid_bits & HGATP_VMID) >> HGATP_VMID_SHIFT;
+ vmid_bits = fls_long(vmid_bits);
+ csr_write(CSR_HGATP, old);
+
+ /* We polluted local TLB so flush all guest TLB */
+ kvm_riscv_local_hfence_gvma_all();
+
+ /* We don't use VMID bits if they are not sufficient */
+ if ((1UL << vmid_bits) < num_possible_cpus())
+ vmid_bits = 0;
+}
+
+unsigned long kvm_riscv_gstage_vmid_bits(void)
+{
+ return vmid_bits;
+}
+
+int kvm_riscv_gstage_vmid_init(struct kvm *kvm)
+{
+ /* Mark the initial VMID and VMID version invalid */
+ kvm->arch.vmid.vmid_version = 0;
+ kvm->arch.vmid.vmid = 0;
+
+ return 0;
+}
+
+bool kvm_riscv_gstage_vmid_ver_changed(struct kvm_vmid *vmid)
+{
+ if (!vmid_bits)
+ return false;
+
+ return unlikely(READ_ONCE(vmid->vmid_version) !=
+ READ_ONCE(vmid_version));
+}
+
+static void __local_hfence_gvma_all(void *info)
+{
+ kvm_riscv_local_hfence_gvma_all();
+}
+
+void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu)
+{
+ unsigned long i;
+ struct kvm_vcpu *v;
+ struct kvm_vmid *vmid = &vcpu->kvm->arch.vmid;
+
+ if (!kvm_riscv_gstage_vmid_ver_changed(vmid))
+ return;
+
+ spin_lock(&vmid_lock);
+
+ /*
+ * We need to re-check the vmid_version here to ensure that if
+ * another vcpu already allocated a valid vmid for this vm.
+ */
+ if (!kvm_riscv_gstage_vmid_ver_changed(vmid)) {
+ spin_unlock(&vmid_lock);
+ return;
+ }
+
+ /* First user of a new VMID version? */
+ if (unlikely(vmid_next == 0)) {
+ WRITE_ONCE(vmid_version, READ_ONCE(vmid_version) + 1);
+ vmid_next = 1;
+
+ /*
+ * We ran out of VMIDs so we increment vmid_version and
+ * start assigning VMIDs from 1.
+ *
+ * This also means existing VMIDs assignment to all Guest
+ * instances is invalid and we have force VMID re-assignement
+ * for all Guest instances. The Guest instances that were not
+ * running will automatically pick-up new VMIDs because will
+ * call kvm_riscv_gstage_vmid_update() whenever they enter
+ * in-kernel run loop. For Guest instances that are already
+ * running, we force VM exits on all host CPUs using IPI and
+ * flush all Guest TLBs.
+ */
+ on_each_cpu_mask(cpu_online_mask, __local_hfence_gvma_all,
+ NULL, 1);
+ }
+
+ vmid->vmid = vmid_next;
+ vmid_next++;
+ vmid_next &= (1 << vmid_bits) - 1;
+
+ WRITE_ONCE(vmid->vmid_version, READ_ONCE(vmid_version));
+
+ spin_unlock(&vmid_lock);
+
+ /* Request G-stage page table update for all VCPUs */
+ kvm_for_each_vcpu(i, v, vcpu->kvm)
+ kvm_make_request(KVM_REQ_UPDATE_HGATP, v);
+}