diff options
Diffstat (limited to '')
-rw-r--r-- | arch/loongarch/kvm/Kconfig | 40 | ||||
-rw-r--r-- | arch/loongarch/kvm/Makefile | 22 | ||||
-rw-r--r-- | arch/loongarch/kvm/exit.c | 696 | ||||
-rw-r--r-- | arch/loongarch/kvm/interrupt.c | 183 | ||||
-rw-r--r-- | arch/loongarch/kvm/main.c | 420 | ||||
-rw-r--r-- | arch/loongarch/kvm/mmu.c | 914 | ||||
-rw-r--r-- | arch/loongarch/kvm/switch.S | 250 | ||||
-rw-r--r-- | arch/loongarch/kvm/timer.c | 197 | ||||
-rw-r--r-- | arch/loongarch/kvm/tlb.c | 32 | ||||
-rw-r--r-- | arch/loongarch/kvm/trace.h | 162 | ||||
-rw-r--r-- | arch/loongarch/kvm/vcpu.c | 939 | ||||
-rw-r--r-- | arch/loongarch/kvm/vm.c | 94 |
12 files changed, 3949 insertions, 0 deletions
diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig new file mode 100644 index 0000000000..fda425babf --- /dev/null +++ b/arch/loongarch/kvm/Kconfig @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# KVM configuration +# + +source "virt/kvm/Kconfig" + +menuconfig VIRTUALIZATION + bool "Virtualization" + help + Say Y here to get to see options for using your Linux host to run + other operating systems inside virtual machines (guests). + This option alone does not add any kernel code. + + If you say N, all options in this submenu will be skipped and + disabled. + +if VIRTUALIZATION + +config KVM + tristate "Kernel-based Virtual Machine (KVM) support" + depends on AS_HAS_LVZ_EXTENSION + depends on HAVE_KVM + select HAVE_KVM_DIRTY_RING_ACQ_REL + select HAVE_KVM_EVENTFD + select HAVE_KVM_VCPU_ASYNC_IOCTL + select KVM_GENERIC_DIRTYLOG_READ_PROTECT + select KVM_GENERIC_HARDWARE_ENABLING + select KVM_MMIO + select KVM_XFER_TO_GUEST_WORK + select MMU_NOTIFIER + select PREEMPT_NOTIFIERS + help + Support hosting virtualized guest machines using + hardware virtualization extensions. You will need + a processor equipped with virtualization extensions. + + If unsure, say N. + +endif # VIRTUALIZATION diff --git a/arch/loongarch/kvm/Makefile b/arch/loongarch/kvm/Makefile new file mode 100644 index 0000000000..244467d779 --- /dev/null +++ b/arch/loongarch/kvm/Makefile @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for LoongArch KVM support +# + +ccflags-y += -I $(srctree)/$(src) + +include $(srctree)/virt/kvm/Makefile.kvm + +obj-$(CONFIG_KVM) += kvm.o + +kvm-y += exit.o +kvm-y += interrupt.o +kvm-y += main.o +kvm-y += mmu.o +kvm-y += switch.o +kvm-y += timer.o +kvm-y += tlb.o +kvm-y += vcpu.o +kvm-y += vm.o + +CFLAGS_exit.o += $(call cc-option,-Wno-override-init,) diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c new file mode 100644 index 0000000000..ce8de3fa47 --- /dev/null +++ b/arch/loongarch/kvm/exit.c @@ -0,0 +1,696 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#include <linux/err.h> +#include <linux/errno.h> +#include <linux/kvm_host.h> +#include <linux/module.h> +#include <linux/preempt.h> +#include <linux/vmalloc.h> +#include <asm/fpu.h> +#include <asm/inst.h> +#include <asm/loongarch.h> +#include <asm/mmzone.h> +#include <asm/numa.h> +#include <asm/time.h> +#include <asm/tlb.h> +#include <asm/kvm_csr.h> +#include <asm/kvm_vcpu.h> +#include "trace.h" + +static unsigned long kvm_emu_read_csr(struct kvm_vcpu *vcpu, int csrid) +{ + unsigned long val = 0; + struct loongarch_csrs *csr = vcpu->arch.csr; + + /* + * From LoongArch Reference Manual Volume 1 Chapter 4.2.1 + * For undefined CSR id, return value is 0 + */ + if (get_gcsr_flag(csrid) & SW_GCSR) + val = kvm_read_sw_gcsr(csr, csrid); + else + pr_warn_once("Unsupported csrrd 0x%x with pc %lx\n", csrid, vcpu->arch.pc); + + return val; +} + +static unsigned long kvm_emu_write_csr(struct kvm_vcpu *vcpu, int csrid, unsigned long val) +{ + unsigned long old = 0; + struct loongarch_csrs *csr = vcpu->arch.csr; + + if (get_gcsr_flag(csrid) & SW_GCSR) { + old = kvm_read_sw_gcsr(csr, csrid); + kvm_write_sw_gcsr(csr, csrid, val); + } else + pr_warn_once("Unsupported csrwr 0x%x with pc %lx\n", csrid, vcpu->arch.pc); + + return old; +} + +static unsigned long kvm_emu_xchg_csr(struct kvm_vcpu *vcpu, int csrid, + unsigned long csr_mask, unsigned long val) +{ + unsigned long old = 0; + struct loongarch_csrs *csr = vcpu->arch.csr; + + if (get_gcsr_flag(csrid) & SW_GCSR) { + old = kvm_read_sw_gcsr(csr, csrid); + val = (old & ~csr_mask) | (val & csr_mask); + kvm_write_sw_gcsr(csr, csrid, val); + old = old & csr_mask; + } else + pr_warn_once("Unsupported csrxchg 0x%x with pc %lx\n", csrid, vcpu->arch.pc); + + return old; +} + +static int kvm_handle_csr(struct kvm_vcpu *vcpu, larch_inst inst) +{ + unsigned int rd, rj, csrid; + unsigned long csr_mask, val = 0; + + /* + * CSR value mask imm + * rj = 0 means csrrd + * rj = 1 means csrwr + * rj != 0,1 means csrxchg + */ + rd = inst.reg2csr_format.rd; + rj = inst.reg2csr_format.rj; + csrid = inst.reg2csr_format.csr; + + /* Process CSR ops */ + switch (rj) { + case 0: /* process csrrd */ + val = kvm_emu_read_csr(vcpu, csrid); + vcpu->arch.gprs[rd] = val; + break; + case 1: /* process csrwr */ + val = vcpu->arch.gprs[rd]; + val = kvm_emu_write_csr(vcpu, csrid, val); + vcpu->arch.gprs[rd] = val; + break; + default: /* process csrxchg */ + val = vcpu->arch.gprs[rd]; + csr_mask = vcpu->arch.gprs[rj]; + val = kvm_emu_xchg_csr(vcpu, csrid, csr_mask, val); + vcpu->arch.gprs[rd] = val; + } + + return EMULATE_DONE; +} + +int kvm_emu_iocsr(larch_inst inst, struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + int ret; + unsigned long val; + u32 addr, rd, rj, opcode; + + /* + * Each IOCSR with different opcode + */ + rd = inst.reg2_format.rd; + rj = inst.reg2_format.rj; + opcode = inst.reg2_format.opcode; + addr = vcpu->arch.gprs[rj]; + ret = EMULATE_DO_IOCSR; + run->iocsr_io.phys_addr = addr; + run->iocsr_io.is_write = 0; + + /* LoongArch is Little endian */ + switch (opcode) { + case iocsrrdb_op: + run->iocsr_io.len = 1; + break; + case iocsrrdh_op: + run->iocsr_io.len = 2; + break; + case iocsrrdw_op: + run->iocsr_io.len = 4; + break; + case iocsrrdd_op: + run->iocsr_io.len = 8; + break; + case iocsrwrb_op: + run->iocsr_io.len = 1; + run->iocsr_io.is_write = 1; + break; + case iocsrwrh_op: + run->iocsr_io.len = 2; + run->iocsr_io.is_write = 1; + break; + case iocsrwrw_op: + run->iocsr_io.len = 4; + run->iocsr_io.is_write = 1; + break; + case iocsrwrd_op: + run->iocsr_io.len = 8; + run->iocsr_io.is_write = 1; + break; + default: + ret = EMULATE_FAIL; + break; + } + + if (ret == EMULATE_DO_IOCSR) { + if (run->iocsr_io.is_write) { + val = vcpu->arch.gprs[rd]; + memcpy(run->iocsr_io.data, &val, run->iocsr_io.len); + } + vcpu->arch.io_gpr = rd; + } + + return ret; +} + +int kvm_complete_iocsr_read(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + enum emulation_result er = EMULATE_DONE; + unsigned long *gpr = &vcpu->arch.gprs[vcpu->arch.io_gpr]; + + switch (run->iocsr_io.len) { + case 1: + *gpr = *(s8 *)run->iocsr_io.data; + break; + case 2: + *gpr = *(s16 *)run->iocsr_io.data; + break; + case 4: + *gpr = *(s32 *)run->iocsr_io.data; + break; + case 8: + *gpr = *(s64 *)run->iocsr_io.data; + break; + default: + kvm_err("Bad IOCSR length: %d, addr is 0x%lx\n", + run->iocsr_io.len, vcpu->arch.badv); + er = EMULATE_FAIL; + break; + } + + return er; +} + +int kvm_emu_idle(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.idle_exits; + trace_kvm_exit_idle(vcpu, KVM_TRACE_EXIT_IDLE); + + if (!kvm_arch_vcpu_runnable(vcpu)) { + /* + * Switch to the software timer before halt-polling/blocking as + * the guest's timer may be a break event for the vCPU, and the + * hypervisor timer runs only when the CPU is in guest mode. + * Switch before halt-polling so that KVM recognizes an expired + * timer before blocking. + */ + kvm_save_timer(vcpu); + kvm_vcpu_block(vcpu); + } + + return EMULATE_DONE; +} + +static int kvm_trap_handle_gspr(struct kvm_vcpu *vcpu) +{ + int rd, rj; + unsigned int index; + unsigned long curr_pc; + larch_inst inst; + enum emulation_result er = EMULATE_DONE; + struct kvm_run *run = vcpu->run; + + /* Fetch the instruction */ + inst.word = vcpu->arch.badi; + curr_pc = vcpu->arch.pc; + update_pc(&vcpu->arch); + + trace_kvm_exit_gspr(vcpu, inst.word); + er = EMULATE_FAIL; + switch (((inst.word >> 24) & 0xff)) { + case 0x0: /* CPUCFG GSPR */ + if (inst.reg2_format.opcode == 0x1B) { + rd = inst.reg2_format.rd; + rj = inst.reg2_format.rj; + ++vcpu->stat.cpucfg_exits; + index = vcpu->arch.gprs[rj]; + er = EMULATE_DONE; + /* + * By LoongArch Reference Manual 2.2.10.5 + * return value is 0 for undefined cpucfg index + */ + if (index < KVM_MAX_CPUCFG_REGS) + vcpu->arch.gprs[rd] = vcpu->arch.cpucfg[index]; + else + vcpu->arch.gprs[rd] = 0; + } + break; + case 0x4: /* CSR{RD,WR,XCHG} GSPR */ + er = kvm_handle_csr(vcpu, inst); + break; + case 0x6: /* Cache, Idle and IOCSR GSPR */ + switch (((inst.word >> 22) & 0x3ff)) { + case 0x18: /* Cache GSPR */ + er = EMULATE_DONE; + trace_kvm_exit_cache(vcpu, KVM_TRACE_EXIT_CACHE); + break; + case 0x19: /* Idle/IOCSR GSPR */ + switch (((inst.word >> 15) & 0x1ffff)) { + case 0xc90: /* IOCSR GSPR */ + er = kvm_emu_iocsr(inst, run, vcpu); + break; + case 0xc91: /* Idle GSPR */ + er = kvm_emu_idle(vcpu); + break; + default: + er = EMULATE_FAIL; + break; + } + break; + default: + er = EMULATE_FAIL; + break; + } + break; + default: + er = EMULATE_FAIL; + break; + } + + /* Rollback PC only if emulation was unsuccessful */ + if (er == EMULATE_FAIL) { + kvm_err("[%#lx]%s: unsupported gspr instruction 0x%08x\n", + curr_pc, __func__, inst.word); + + kvm_arch_vcpu_dump_regs(vcpu); + vcpu->arch.pc = curr_pc; + } + + return er; +} + +/* + * Trigger GSPR: + * 1) Execute CPUCFG instruction; + * 2) Execute CACOP/IDLE instructions; + * 3) Access to unimplemented CSRs/IOCSRs. + */ +static int kvm_handle_gspr(struct kvm_vcpu *vcpu) +{ + int ret = RESUME_GUEST; + enum emulation_result er = EMULATE_DONE; + + er = kvm_trap_handle_gspr(vcpu); + + if (er == EMULATE_DONE) { + ret = RESUME_GUEST; + } else if (er == EMULATE_DO_MMIO) { + vcpu->run->exit_reason = KVM_EXIT_MMIO; + ret = RESUME_HOST; + } else if (er == EMULATE_DO_IOCSR) { + vcpu->run->exit_reason = KVM_EXIT_LOONGARCH_IOCSR; + ret = RESUME_HOST; + } else { + kvm_queue_exception(vcpu, EXCCODE_INE, 0); + ret = RESUME_GUEST; + } + + return ret; +} + +int kvm_emu_mmio_read(struct kvm_vcpu *vcpu, larch_inst inst) +{ + int ret; + unsigned int op8, opcode, rd; + struct kvm_run *run = vcpu->run; + + run->mmio.phys_addr = vcpu->arch.badv; + vcpu->mmio_needed = 2; /* signed */ + op8 = (inst.word >> 24) & 0xff; + ret = EMULATE_DO_MMIO; + + switch (op8) { + case 0x24 ... 0x27: /* ldptr.w/d process */ + rd = inst.reg2i14_format.rd; + opcode = inst.reg2i14_format.opcode; + + switch (opcode) { + case ldptrw_op: + run->mmio.len = 4; + break; + case ldptrd_op: + run->mmio.len = 8; + break; + default: + break; + } + break; + case 0x28 ... 0x2e: /* ld.b/h/w/d, ld.bu/hu/wu process */ + rd = inst.reg2i12_format.rd; + opcode = inst.reg2i12_format.opcode; + + switch (opcode) { + case ldb_op: + run->mmio.len = 1; + break; + case ldbu_op: + vcpu->mmio_needed = 1; /* unsigned */ + run->mmio.len = 1; + break; + case ldh_op: + run->mmio.len = 2; + break; + case ldhu_op: + vcpu->mmio_needed = 1; /* unsigned */ + run->mmio.len = 2; + break; + case ldw_op: + run->mmio.len = 4; + break; + case ldwu_op: + vcpu->mmio_needed = 1; /* unsigned */ + run->mmio.len = 4; + break; + case ldd_op: + run->mmio.len = 8; + break; + default: + ret = EMULATE_FAIL; + break; + } + break; + case 0x38: /* ldx.b/h/w/d, ldx.bu/hu/wu process */ + rd = inst.reg3_format.rd; + opcode = inst.reg3_format.opcode; + + switch (opcode) { + case ldxb_op: + run->mmio.len = 1; + break; + case ldxbu_op: + run->mmio.len = 1; + vcpu->mmio_needed = 1; /* unsigned */ + break; + case ldxh_op: + run->mmio.len = 2; + break; + case ldxhu_op: + run->mmio.len = 2; + vcpu->mmio_needed = 1; /* unsigned */ + break; + case ldxw_op: + run->mmio.len = 4; + break; + case ldxwu_op: + run->mmio.len = 4; + vcpu->mmio_needed = 1; /* unsigned */ + break; + case ldxd_op: + run->mmio.len = 8; + break; + default: + ret = EMULATE_FAIL; + break; + } + break; + default: + ret = EMULATE_FAIL; + } + + if (ret == EMULATE_DO_MMIO) { + /* Set for kvm_complete_mmio_read() use */ + vcpu->arch.io_gpr = rd; + run->mmio.is_write = 0; + vcpu->mmio_is_write = 0; + } else { + kvm_err("Read not supported Inst=0x%08x @%lx BadVaddr:%#lx\n", + inst.word, vcpu->arch.pc, vcpu->arch.badv); + kvm_arch_vcpu_dump_regs(vcpu); + vcpu->mmio_needed = 0; + } + + return ret; +} + +int kvm_complete_mmio_read(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + enum emulation_result er = EMULATE_DONE; + unsigned long *gpr = &vcpu->arch.gprs[vcpu->arch.io_gpr]; + + /* Update with new PC */ + update_pc(&vcpu->arch); + switch (run->mmio.len) { + case 1: + if (vcpu->mmio_needed == 2) + *gpr = *(s8 *)run->mmio.data; + else + *gpr = *(u8 *)run->mmio.data; + break; + case 2: + if (vcpu->mmio_needed == 2) + *gpr = *(s16 *)run->mmio.data; + else + *gpr = *(u16 *)run->mmio.data; + break; + case 4: + if (vcpu->mmio_needed == 2) + *gpr = *(s32 *)run->mmio.data; + else + *gpr = *(u32 *)run->mmio.data; + break; + case 8: + *gpr = *(s64 *)run->mmio.data; + break; + default: + kvm_err("Bad MMIO length: %d, addr is 0x%lx\n", + run->mmio.len, vcpu->arch.badv); + er = EMULATE_FAIL; + break; + } + + return er; +} + +int kvm_emu_mmio_write(struct kvm_vcpu *vcpu, larch_inst inst) +{ + int ret; + unsigned int rd, op8, opcode; + unsigned long curr_pc, rd_val = 0; + struct kvm_run *run = vcpu->run; + void *data = run->mmio.data; + + /* + * Update PC and hold onto current PC in case there is + * an error and we want to rollback the PC + */ + curr_pc = vcpu->arch.pc; + update_pc(&vcpu->arch); + + op8 = (inst.word >> 24) & 0xff; + run->mmio.phys_addr = vcpu->arch.badv; + ret = EMULATE_DO_MMIO; + switch (op8) { + case 0x24 ... 0x27: /* stptr.w/d process */ + rd = inst.reg2i14_format.rd; + opcode = inst.reg2i14_format.opcode; + + switch (opcode) { + case stptrw_op: + run->mmio.len = 4; + *(unsigned int *)data = vcpu->arch.gprs[rd]; + break; + case stptrd_op: + run->mmio.len = 8; + *(unsigned long *)data = vcpu->arch.gprs[rd]; + break; + default: + ret = EMULATE_FAIL; + break; + } + break; + case 0x28 ... 0x2e: /* st.b/h/w/d process */ + rd = inst.reg2i12_format.rd; + opcode = inst.reg2i12_format.opcode; + rd_val = vcpu->arch.gprs[rd]; + + switch (opcode) { + case stb_op: + run->mmio.len = 1; + *(unsigned char *)data = rd_val; + break; + case sth_op: + run->mmio.len = 2; + *(unsigned short *)data = rd_val; + break; + case stw_op: + run->mmio.len = 4; + *(unsigned int *)data = rd_val; + break; + case std_op: + run->mmio.len = 8; + *(unsigned long *)data = rd_val; + break; + default: + ret = EMULATE_FAIL; + break; + } + break; + case 0x38: /* stx.b/h/w/d process */ + rd = inst.reg3_format.rd; + opcode = inst.reg3_format.opcode; + + switch (opcode) { + case stxb_op: + run->mmio.len = 1; + *(unsigned char *)data = vcpu->arch.gprs[rd]; + break; + case stxh_op: + run->mmio.len = 2; + *(unsigned short *)data = vcpu->arch.gprs[rd]; + break; + case stxw_op: + run->mmio.len = 4; + *(unsigned int *)data = vcpu->arch.gprs[rd]; + break; + case stxd_op: + run->mmio.len = 8; + *(unsigned long *)data = vcpu->arch.gprs[rd]; + break; + default: + ret = EMULATE_FAIL; + break; + } + break; + default: + ret = EMULATE_FAIL; + } + + if (ret == EMULATE_DO_MMIO) { + run->mmio.is_write = 1; + vcpu->mmio_needed = 1; + vcpu->mmio_is_write = 1; + } else { + vcpu->arch.pc = curr_pc; + kvm_err("Write not supported Inst=0x%08x @%lx BadVaddr:%#lx\n", + inst.word, vcpu->arch.pc, vcpu->arch.badv); + kvm_arch_vcpu_dump_regs(vcpu); + /* Rollback PC if emulation was unsuccessful */ + } + + return ret; +} + +static int kvm_handle_rdwr_fault(struct kvm_vcpu *vcpu, bool write) +{ + int ret; + larch_inst inst; + enum emulation_result er = EMULATE_DONE; + struct kvm_run *run = vcpu->run; + unsigned long badv = vcpu->arch.badv; + + ret = kvm_handle_mm_fault(vcpu, badv, write); + if (ret) { + /* Treat as MMIO */ + inst.word = vcpu->arch.badi; + if (write) { + er = kvm_emu_mmio_write(vcpu, inst); + } else { + /* A code fetch fault doesn't count as an MMIO */ + if (kvm_is_ifetch_fault(&vcpu->arch)) { + kvm_queue_exception(vcpu, EXCCODE_ADE, EXSUBCODE_ADEF); + return RESUME_GUEST; + } + + er = kvm_emu_mmio_read(vcpu, inst); + } + } + + if (er == EMULATE_DONE) { + ret = RESUME_GUEST; + } else if (er == EMULATE_DO_MMIO) { + run->exit_reason = KVM_EXIT_MMIO; + ret = RESUME_HOST; + } else { + kvm_queue_exception(vcpu, EXCCODE_ADE, EXSUBCODE_ADEM); + ret = RESUME_GUEST; + } + + return ret; +} + +static int kvm_handle_read_fault(struct kvm_vcpu *vcpu) +{ + return kvm_handle_rdwr_fault(vcpu, false); +} + +static int kvm_handle_write_fault(struct kvm_vcpu *vcpu) +{ + return kvm_handle_rdwr_fault(vcpu, true); +} + +/** + * kvm_handle_fpu_disabled() - Guest used fpu however it is disabled at host + * @vcpu: Virtual CPU context. + * + * Handle when the guest attempts to use fpu which hasn't been allowed + * by the root context. + */ +static int kvm_handle_fpu_disabled(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + + /* + * If guest FPU not present, the FPU operation should have been + * treated as a reserved instruction! + * If FPU already in use, we shouldn't get this at all. + */ + if (WARN_ON(vcpu->arch.aux_inuse & KVM_LARCH_FPU)) { + kvm_err("%s internal error\n", __func__); + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + + kvm_own_fpu(vcpu); + + return RESUME_GUEST; +} + +/* + * LoongArch KVM callback handling for unimplemented guest exiting + */ +static int kvm_fault_ni(struct kvm_vcpu *vcpu) +{ + unsigned int ecode, inst; + unsigned long estat, badv; + + /* Fetch the instruction */ + inst = vcpu->arch.badi; + badv = vcpu->arch.badv; + estat = vcpu->arch.host_estat; + ecode = (estat & CSR_ESTAT_EXC) >> CSR_ESTAT_EXC_SHIFT; + kvm_err("ECode: %d PC=%#lx Inst=0x%08x BadVaddr=%#lx ESTAT=%#lx\n", + ecode, vcpu->arch.pc, inst, badv, read_gcsr_estat()); + kvm_arch_vcpu_dump_regs(vcpu); + kvm_queue_exception(vcpu, EXCCODE_INE, 0); + + return RESUME_GUEST; +} + +static exit_handle_fn kvm_fault_tables[EXCCODE_INT_START] = { + [0 ... EXCCODE_INT_START - 1] = kvm_fault_ni, + [EXCCODE_TLBI] = kvm_handle_read_fault, + [EXCCODE_TLBL] = kvm_handle_read_fault, + [EXCCODE_TLBS] = kvm_handle_write_fault, + [EXCCODE_TLBM] = kvm_handle_write_fault, + [EXCCODE_FPDIS] = kvm_handle_fpu_disabled, + [EXCCODE_GSPR] = kvm_handle_gspr, +}; + +int kvm_handle_fault(struct kvm_vcpu *vcpu, int fault) +{ + return kvm_fault_tables[fault](vcpu); +} diff --git a/arch/loongarch/kvm/interrupt.c b/arch/loongarch/kvm/interrupt.c new file mode 100644 index 0000000000..4c3f22de4b --- /dev/null +++ b/arch/loongarch/kvm/interrupt.c @@ -0,0 +1,183 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#include <linux/err.h> +#include <linux/errno.h> +#include <asm/kvm_csr.h> +#include <asm/kvm_vcpu.h> + +static unsigned int priority_to_irq[EXCCODE_INT_NUM] = { + [INT_TI] = CPU_TIMER, + [INT_IPI] = CPU_IPI, + [INT_SWI0] = CPU_SIP0, + [INT_SWI1] = CPU_SIP1, + [INT_HWI0] = CPU_IP0, + [INT_HWI1] = CPU_IP1, + [INT_HWI2] = CPU_IP2, + [INT_HWI3] = CPU_IP3, + [INT_HWI4] = CPU_IP4, + [INT_HWI5] = CPU_IP5, + [INT_HWI6] = CPU_IP6, + [INT_HWI7] = CPU_IP7, +}; + +static int kvm_irq_deliver(struct kvm_vcpu *vcpu, unsigned int priority) +{ + unsigned int irq = 0; + + clear_bit(priority, &vcpu->arch.irq_pending); + if (priority < EXCCODE_INT_NUM) + irq = priority_to_irq[priority]; + + switch (priority) { + case INT_TI: + case INT_IPI: + case INT_SWI0: + case INT_SWI1: + set_gcsr_estat(irq); + break; + + case INT_HWI0 ... INT_HWI7: + set_csr_gintc(irq); + break; + + default: + break; + } + + return 1; +} + +static int kvm_irq_clear(struct kvm_vcpu *vcpu, unsigned int priority) +{ + unsigned int irq = 0; + + clear_bit(priority, &vcpu->arch.irq_clear); + if (priority < EXCCODE_INT_NUM) + irq = priority_to_irq[priority]; + + switch (priority) { + case INT_TI: + case INT_IPI: + case INT_SWI0: + case INT_SWI1: + clear_gcsr_estat(irq); + break; + + case INT_HWI0 ... INT_HWI7: + clear_csr_gintc(irq); + break; + + default: + break; + } + + return 1; +} + +void kvm_deliver_intr(struct kvm_vcpu *vcpu) +{ + unsigned int priority; + unsigned long *pending = &vcpu->arch.irq_pending; + unsigned long *pending_clr = &vcpu->arch.irq_clear; + + if (!(*pending) && !(*pending_clr)) + return; + + if (*pending_clr) { + priority = __ffs(*pending_clr); + while (priority <= INT_IPI) { + kvm_irq_clear(vcpu, priority); + priority = find_next_bit(pending_clr, + BITS_PER_BYTE * sizeof(*pending_clr), + priority + 1); + } + } + + if (*pending) { + priority = __ffs(*pending); + while (priority <= INT_IPI) { + kvm_irq_deliver(vcpu, priority); + priority = find_next_bit(pending, + BITS_PER_BYTE * sizeof(*pending), + priority + 1); + } + } +} + +int kvm_pending_timer(struct kvm_vcpu *vcpu) +{ + return test_bit(INT_TI, &vcpu->arch.irq_pending); +} + +/* + * Only support illegal instruction or illegal Address Error exception, + * Other exceptions are injected by hardware in kvm mode + */ +static void _kvm_deliver_exception(struct kvm_vcpu *vcpu, + unsigned int code, unsigned int subcode) +{ + unsigned long val, vec_size; + + /* + * BADV is added for EXCCODE_ADE exception + * Use PC register (GVA address) if it is instruction exeception + * Else use BADV from host side (GPA address) for data exeception + */ + if (code == EXCCODE_ADE) { + if (subcode == EXSUBCODE_ADEF) + val = vcpu->arch.pc; + else + val = vcpu->arch.badv; + kvm_write_hw_gcsr(LOONGARCH_CSR_BADV, val); + } + + /* Set exception instruction */ + kvm_write_hw_gcsr(LOONGARCH_CSR_BADI, vcpu->arch.badi); + + /* + * Save CRMD in PRMD + * Set IRQ disabled and PLV0 with CRMD + */ + val = kvm_read_hw_gcsr(LOONGARCH_CSR_CRMD); + kvm_write_hw_gcsr(LOONGARCH_CSR_PRMD, val); + val = val & ~(CSR_CRMD_PLV | CSR_CRMD_IE); + kvm_write_hw_gcsr(LOONGARCH_CSR_CRMD, val); + + /* Set exception PC address */ + kvm_write_hw_gcsr(LOONGARCH_CSR_ERA, vcpu->arch.pc); + + /* + * Set exception code + * Exception and interrupt can be inject at the same time + * Hardware will handle exception first and then extern interrupt + * Exception code is Ecode in ESTAT[16:21] + * Interrupt code in ESTAT[0:12] + */ + val = kvm_read_hw_gcsr(LOONGARCH_CSR_ESTAT); + val = (val & ~CSR_ESTAT_EXC) | code; + kvm_write_hw_gcsr(LOONGARCH_CSR_ESTAT, val); + + /* Calculate expcetion entry address */ + val = kvm_read_hw_gcsr(LOONGARCH_CSR_ECFG); + vec_size = (val & CSR_ECFG_VS) >> CSR_ECFG_VS_SHIFT; + if (vec_size) + vec_size = (1 << vec_size) * 4; + val = kvm_read_hw_gcsr(LOONGARCH_CSR_EENTRY); + vcpu->arch.pc = val + code * vec_size; +} + +void kvm_deliver_exception(struct kvm_vcpu *vcpu) +{ + unsigned int code; + unsigned long *pending = &vcpu->arch.exception_pending; + + if (*pending) { + code = __ffs(*pending); + _kvm_deliver_exception(vcpu, code, vcpu->arch.esubcode); + *pending = 0; + vcpu->arch.esubcode = 0; + } +} diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c new file mode 100644 index 0000000000..1c1d519950 --- /dev/null +++ b/arch/loongarch/kvm/main.c @@ -0,0 +1,420 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#include <linux/err.h> +#include <linux/module.h> +#include <linux/kvm_host.h> +#include <asm/cacheflush.h> +#include <asm/cpufeature.h> +#include <asm/kvm_csr.h> +#include "trace.h" + +unsigned long vpid_mask; +struct kvm_world_switch *kvm_loongarch_ops; +static int gcsr_flag[CSR_MAX_NUMS]; +static struct kvm_context __percpu *vmcs; + +int get_gcsr_flag(int csr) +{ + if (csr < CSR_MAX_NUMS) + return gcsr_flag[csr]; + + return INVALID_GCSR; +} + +static inline void set_gcsr_sw_flag(int csr) +{ + if (csr < CSR_MAX_NUMS) + gcsr_flag[csr] |= SW_GCSR; +} + +static inline void set_gcsr_hw_flag(int csr) +{ + if (csr < CSR_MAX_NUMS) + gcsr_flag[csr] |= HW_GCSR; +} + +/* + * The default value of gcsr_flag[CSR] is 0, and we use this + * function to set the flag to 1 (SW_GCSR) or 2 (HW_GCSR) if the + * gcsr is software or hardware. It will be used by get/set_gcsr, + * if gcsr_flag is HW we should use gcsrrd/gcsrwr to access it, + * else use software csr to emulate it. + */ +static void kvm_init_gcsr_flag(void) +{ + set_gcsr_hw_flag(LOONGARCH_CSR_CRMD); + set_gcsr_hw_flag(LOONGARCH_CSR_PRMD); + set_gcsr_hw_flag(LOONGARCH_CSR_EUEN); + set_gcsr_hw_flag(LOONGARCH_CSR_MISC); + set_gcsr_hw_flag(LOONGARCH_CSR_ECFG); + set_gcsr_hw_flag(LOONGARCH_CSR_ESTAT); + set_gcsr_hw_flag(LOONGARCH_CSR_ERA); + set_gcsr_hw_flag(LOONGARCH_CSR_BADV); + set_gcsr_hw_flag(LOONGARCH_CSR_BADI); + set_gcsr_hw_flag(LOONGARCH_CSR_EENTRY); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBIDX); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBEHI); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBELO0); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBELO1); + set_gcsr_hw_flag(LOONGARCH_CSR_ASID); + set_gcsr_hw_flag(LOONGARCH_CSR_PGDL); + set_gcsr_hw_flag(LOONGARCH_CSR_PGDH); + set_gcsr_hw_flag(LOONGARCH_CSR_PGD); + set_gcsr_hw_flag(LOONGARCH_CSR_PWCTL0); + set_gcsr_hw_flag(LOONGARCH_CSR_PWCTL1); + set_gcsr_hw_flag(LOONGARCH_CSR_STLBPGSIZE); + set_gcsr_hw_flag(LOONGARCH_CSR_RVACFG); + set_gcsr_hw_flag(LOONGARCH_CSR_CPUID); + set_gcsr_hw_flag(LOONGARCH_CSR_PRCFG1); + set_gcsr_hw_flag(LOONGARCH_CSR_PRCFG2); + set_gcsr_hw_flag(LOONGARCH_CSR_PRCFG3); + set_gcsr_hw_flag(LOONGARCH_CSR_KS0); + set_gcsr_hw_flag(LOONGARCH_CSR_KS1); + set_gcsr_hw_flag(LOONGARCH_CSR_KS2); + set_gcsr_hw_flag(LOONGARCH_CSR_KS3); + set_gcsr_hw_flag(LOONGARCH_CSR_KS4); + set_gcsr_hw_flag(LOONGARCH_CSR_KS5); + set_gcsr_hw_flag(LOONGARCH_CSR_KS6); + set_gcsr_hw_flag(LOONGARCH_CSR_KS7); + set_gcsr_hw_flag(LOONGARCH_CSR_TMID); + set_gcsr_hw_flag(LOONGARCH_CSR_TCFG); + set_gcsr_hw_flag(LOONGARCH_CSR_TVAL); + set_gcsr_hw_flag(LOONGARCH_CSR_TINTCLR); + set_gcsr_hw_flag(LOONGARCH_CSR_CNTC); + set_gcsr_hw_flag(LOONGARCH_CSR_LLBCTL); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBRENTRY); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBRBADV); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBRERA); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBRSAVE); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBRELO0); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBRELO1); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBREHI); + set_gcsr_hw_flag(LOONGARCH_CSR_TLBRPRMD); + set_gcsr_hw_flag(LOONGARCH_CSR_DMWIN0); + set_gcsr_hw_flag(LOONGARCH_CSR_DMWIN1); + set_gcsr_hw_flag(LOONGARCH_CSR_DMWIN2); + set_gcsr_hw_flag(LOONGARCH_CSR_DMWIN3); + + set_gcsr_sw_flag(LOONGARCH_CSR_IMPCTL1); + set_gcsr_sw_flag(LOONGARCH_CSR_IMPCTL2); + set_gcsr_sw_flag(LOONGARCH_CSR_MERRCTL); + set_gcsr_sw_flag(LOONGARCH_CSR_MERRINFO1); + set_gcsr_sw_flag(LOONGARCH_CSR_MERRINFO2); + set_gcsr_sw_flag(LOONGARCH_CSR_MERRENTRY); + set_gcsr_sw_flag(LOONGARCH_CSR_MERRERA); + set_gcsr_sw_flag(LOONGARCH_CSR_MERRSAVE); + set_gcsr_sw_flag(LOONGARCH_CSR_CTAG); + set_gcsr_sw_flag(LOONGARCH_CSR_DEBUG); + set_gcsr_sw_flag(LOONGARCH_CSR_DERA); + set_gcsr_sw_flag(LOONGARCH_CSR_DESAVE); + + set_gcsr_sw_flag(LOONGARCH_CSR_FWPC); + set_gcsr_sw_flag(LOONGARCH_CSR_FWPS); + set_gcsr_sw_flag(LOONGARCH_CSR_MWPC); + set_gcsr_sw_flag(LOONGARCH_CSR_MWPS); + + set_gcsr_sw_flag(LOONGARCH_CSR_DB0ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_DB0MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_DB0CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_DB0ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_DB1ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_DB1MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_DB1CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_DB1ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_DB2ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_DB2MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_DB2CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_DB2ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_DB3ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_DB3MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_DB3CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_DB3ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_DB4ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_DB4MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_DB4CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_DB4ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_DB5ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_DB5MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_DB5CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_DB5ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_DB6ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_DB6MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_DB6CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_DB6ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_DB7ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_DB7MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_DB7CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_DB7ASID); + + set_gcsr_sw_flag(LOONGARCH_CSR_IB0ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_IB0MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_IB0CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_IB0ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_IB1ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_IB1MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_IB1CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_IB1ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_IB2ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_IB2MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_IB2CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_IB2ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_IB3ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_IB3MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_IB3CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_IB3ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_IB4ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_IB4MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_IB4CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_IB4ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_IB5ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_IB5MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_IB5CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_IB5ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_IB6ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_IB6MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_IB6CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_IB6ASID); + set_gcsr_sw_flag(LOONGARCH_CSR_IB7ADDR); + set_gcsr_sw_flag(LOONGARCH_CSR_IB7MASK); + set_gcsr_sw_flag(LOONGARCH_CSR_IB7CTRL); + set_gcsr_sw_flag(LOONGARCH_CSR_IB7ASID); + + set_gcsr_sw_flag(LOONGARCH_CSR_PERFCTRL0); + set_gcsr_sw_flag(LOONGARCH_CSR_PERFCNTR0); + set_gcsr_sw_flag(LOONGARCH_CSR_PERFCTRL1); + set_gcsr_sw_flag(LOONGARCH_CSR_PERFCNTR1); + set_gcsr_sw_flag(LOONGARCH_CSR_PERFCTRL2); + set_gcsr_sw_flag(LOONGARCH_CSR_PERFCNTR2); + set_gcsr_sw_flag(LOONGARCH_CSR_PERFCTRL3); + set_gcsr_sw_flag(LOONGARCH_CSR_PERFCNTR3); +} + +static void kvm_update_vpid(struct kvm_vcpu *vcpu, int cpu) +{ + unsigned long vpid; + struct kvm_context *context; + + context = per_cpu_ptr(vcpu->kvm->arch.vmcs, cpu); + vpid = context->vpid_cache + 1; + if (!(vpid & vpid_mask)) { + /* finish round of vpid loop */ + if (unlikely(!vpid)) + vpid = vpid_mask + 1; + + ++vpid; /* vpid 0 reserved for root */ + + /* start new vpid cycle */ + kvm_flush_tlb_all(); + } + + context->vpid_cache = vpid; + vcpu->arch.vpid = vpid; +} + +void kvm_check_vpid(struct kvm_vcpu *vcpu) +{ + int cpu; + bool migrated; + unsigned long ver, old, vpid; + struct kvm_context *context; + + cpu = smp_processor_id(); + /* + * Are we entering guest context on a different CPU to last time? + * If so, the vCPU's guest TLB state on this CPU may be stale. + */ + context = per_cpu_ptr(vcpu->kvm->arch.vmcs, cpu); + migrated = (vcpu->cpu != cpu); + + /* + * Check if our vpid is of an older version + * + * We also discard the stored vpid if we've executed on + * another CPU, as the guest mappings may have changed without + * hypervisor knowledge. + */ + ver = vcpu->arch.vpid & ~vpid_mask; + old = context->vpid_cache & ~vpid_mask; + if (migrated || (ver != old)) { + kvm_update_vpid(vcpu, cpu); + trace_kvm_vpid_change(vcpu, vcpu->arch.vpid); + vcpu->cpu = cpu; + } + + /* Restore GSTAT(0x50).vpid */ + vpid = (vcpu->arch.vpid & vpid_mask) << CSR_GSTAT_GID_SHIFT; + change_csr_gstat(vpid_mask << CSR_GSTAT_GID_SHIFT, vpid); +} + +void kvm_init_vmcs(struct kvm *kvm) +{ + kvm->arch.vmcs = vmcs; +} + +long kvm_arch_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + return -ENOIOCTLCMD; +} + +int kvm_arch_hardware_enable(void) +{ + unsigned long env, gcfg = 0; + + env = read_csr_gcfg(); + + /* First init gcfg, gstat, gintc, gtlbc. All guest use the same config */ + write_csr_gcfg(0); + write_csr_gstat(0); + write_csr_gintc(0); + clear_csr_gtlbc(CSR_GTLBC_USETGID | CSR_GTLBC_TOTI); + + /* + * Enable virtualization features granting guest direct control of + * certain features: + * GCI=2: Trap on init or unimplement cache instruction. + * TORU=0: Trap on Root Unimplement. + * CACTRL=1: Root control cache. + * TOP=0: Trap on Previlege. + * TOE=0: Trap on Exception. + * TIT=0: Trap on Timer. + */ + if (env & CSR_GCFG_GCIP_ALL) + gcfg |= CSR_GCFG_GCI_SECURE; + if (env & CSR_GCFG_MATC_ROOT) + gcfg |= CSR_GCFG_MATC_ROOT; + + gcfg |= CSR_GCFG_TIT; + write_csr_gcfg(gcfg); + + kvm_flush_tlb_all(); + + /* Enable using TGID */ + set_csr_gtlbc(CSR_GTLBC_USETGID); + kvm_debug("GCFG:%lx GSTAT:%lx GINTC:%lx GTLBC:%lx", + read_csr_gcfg(), read_csr_gstat(), read_csr_gintc(), read_csr_gtlbc()); + + return 0; +} + +void kvm_arch_hardware_disable(void) +{ + write_csr_gcfg(0); + write_csr_gstat(0); + write_csr_gintc(0); + clear_csr_gtlbc(CSR_GTLBC_USETGID | CSR_GTLBC_TOTI); + + /* Flush any remaining guest TLB entries */ + kvm_flush_tlb_all(); +} + +static int kvm_loongarch_env_init(void) +{ + int cpu, order; + void *addr; + struct kvm_context *context; + + vmcs = alloc_percpu(struct kvm_context); + if (!vmcs) { + pr_err("kvm: failed to allocate percpu kvm_context\n"); + return -ENOMEM; + } + + kvm_loongarch_ops = kzalloc(sizeof(*kvm_loongarch_ops), GFP_KERNEL); + if (!kvm_loongarch_ops) { + free_percpu(vmcs); + vmcs = NULL; + return -ENOMEM; + } + + /* + * PGD register is shared between root kernel and kvm hypervisor. + * So world switch entry should be in DMW area rather than TLB area + * to avoid page fault reenter. + * + * In future if hardware pagetable walking is supported, we won't + * need to copy world switch code to DMW area. + */ + order = get_order(kvm_exception_size + kvm_enter_guest_size); + addr = (void *)__get_free_pages(GFP_KERNEL, order); + if (!addr) { + free_percpu(vmcs); + vmcs = NULL; + kfree(kvm_loongarch_ops); + kvm_loongarch_ops = NULL; + return -ENOMEM; + } + + memcpy(addr, kvm_exc_entry, kvm_exception_size); + memcpy(addr + kvm_exception_size, kvm_enter_guest, kvm_enter_guest_size); + flush_icache_range((unsigned long)addr, (unsigned long)addr + kvm_exception_size + kvm_enter_guest_size); + kvm_loongarch_ops->exc_entry = addr; + kvm_loongarch_ops->enter_guest = addr + kvm_exception_size; + kvm_loongarch_ops->page_order = order; + + vpid_mask = read_csr_gstat(); + vpid_mask = (vpid_mask & CSR_GSTAT_GIDBIT) >> CSR_GSTAT_GIDBIT_SHIFT; + if (vpid_mask) + vpid_mask = GENMASK(vpid_mask - 1, 0); + + for_each_possible_cpu(cpu) { + context = per_cpu_ptr(vmcs, cpu); + context->vpid_cache = vpid_mask + 1; + context->last_vcpu = NULL; + } + + kvm_init_gcsr_flag(); + + return 0; +} + +static void kvm_loongarch_env_exit(void) +{ + unsigned long addr; + + if (vmcs) + free_percpu(vmcs); + + if (kvm_loongarch_ops) { + if (kvm_loongarch_ops->exc_entry) { + addr = (unsigned long)kvm_loongarch_ops->exc_entry; + free_pages(addr, kvm_loongarch_ops->page_order); + } + kfree(kvm_loongarch_ops); + } +} + +static int kvm_loongarch_init(void) +{ + int r; + + if (!cpu_has_lvz) { + kvm_info("Hardware virtualization not available\n"); + return -ENODEV; + } + r = kvm_loongarch_env_init(); + if (r) + return r; + + return kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); +} + +static void kvm_loongarch_exit(void) +{ + kvm_exit(); + kvm_loongarch_env_exit(); +} + +module_init(kvm_loongarch_init); +module_exit(kvm_loongarch_exit); + +#ifdef MODULE +static const struct cpu_feature kvm_feature[] = { + { .feature = cpu_feature(LOONGARCH_LVZ) }, + {}, +}; +MODULE_DEVICE_TABLE(cpu, kvm_feature); +#endif diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c new file mode 100644 index 0000000000..80480df5f5 --- /dev/null +++ b/arch/loongarch/kvm/mmu.c @@ -0,0 +1,914 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#include <linux/highmem.h> +#include <linux/hugetlb.h> +#include <linux/kvm_host.h> +#include <linux/page-flags.h> +#include <linux/uaccess.h> +#include <asm/mmu_context.h> +#include <asm/pgalloc.h> +#include <asm/tlb.h> +#include <asm/kvm_mmu.h> + +static inline void kvm_ptw_prepare(struct kvm *kvm, kvm_ptw_ctx *ctx) +{ + ctx->level = kvm->arch.root_level; + /* pte table */ + ctx->invalid_ptes = kvm->arch.invalid_ptes; + ctx->pte_shifts = kvm->arch.pte_shifts; + ctx->pgtable_shift = ctx->pte_shifts[ctx->level]; + ctx->invalid_entry = ctx->invalid_ptes[ctx->level]; + ctx->opaque = kvm; +} + +/* + * Mark a range of guest physical address space old (all accesses fault) in the + * VM's GPA page table to allow detection of commonly used pages. + */ +static int kvm_mkold_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx) +{ + if (kvm_pte_young(*pte)) { + *pte = kvm_pte_mkold(*pte); + return 1; + } + + return 0; +} + +/* + * Mark a range of guest physical address space clean (writes fault) in the VM's + * GPA page table to allow dirty page tracking. + */ +static int kvm_mkclean_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx) +{ + gfn_t offset; + kvm_pte_t val; + + val = *pte; + /* + * For kvm_arch_mmu_enable_log_dirty_pt_masked with mask, start and end + * may cross hugepage, for first huge page parameter addr is equal to + * start, however for the second huge page addr is base address of + * this huge page, rather than start or end address + */ + if ((ctx->flag & _KVM_HAS_PGMASK) && !kvm_pte_huge(val)) { + offset = (addr >> PAGE_SHIFT) - ctx->gfn; + if (!(BIT(offset) & ctx->mask)) + return 0; + } + + /* + * Need not split huge page now, just set write-proect pte bit + * Split huge page until next write fault + */ + if (kvm_pte_dirty(val)) { + *pte = kvm_pte_mkclean(val); + return 1; + } + + return 0; +} + +/* + * Clear pte entry + */ +static int kvm_flush_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx) +{ + struct kvm *kvm; + + kvm = ctx->opaque; + if (ctx->level) + kvm->stat.hugepages--; + else + kvm->stat.pages--; + + *pte = ctx->invalid_entry; + + return 1; +} + +/* + * kvm_pgd_alloc() - Allocate and initialise a KVM GPA page directory. + * + * Allocate a blank KVM GPA page directory (PGD) for representing guest physical + * to host physical page mappings. + * + * Returns: Pointer to new KVM GPA page directory. + * NULL on allocation failure. + */ +kvm_pte_t *kvm_pgd_alloc(void) +{ + kvm_pte_t *pgd; + + pgd = (kvm_pte_t *)__get_free_pages(GFP_KERNEL, 0); + if (pgd) + pgd_init((void *)pgd); + + return pgd; +} + +static void _kvm_pte_init(void *addr, unsigned long val) +{ + unsigned long *p, *end; + + p = (unsigned long *)addr; + end = p + PTRS_PER_PTE; + do { + p[0] = val; + p[1] = val; + p[2] = val; + p[3] = val; + p[4] = val; + p += 8; + p[-3] = val; + p[-2] = val; + p[-1] = val; + } while (p != end); +} + +/* + * Caller must hold kvm->mm_lock + * + * Walk the page tables of kvm to find the PTE corresponding to the + * address @addr. If page tables don't exist for @addr, they will be created + * from the MMU cache if @cache is not NULL. + */ +static kvm_pte_t *kvm_populate_gpa(struct kvm *kvm, + struct kvm_mmu_memory_cache *cache, + unsigned long addr, int level) +{ + kvm_ptw_ctx ctx; + kvm_pte_t *entry, *child; + + kvm_ptw_prepare(kvm, &ctx); + child = kvm->arch.pgd; + while (ctx.level > level) { + entry = kvm_pgtable_offset(&ctx, child, addr); + if (kvm_pte_none(&ctx, entry)) { + if (!cache) + return NULL; + + child = kvm_mmu_memory_cache_alloc(cache); + _kvm_pte_init(child, ctx.invalid_ptes[ctx.level - 1]); + kvm_set_pte(entry, __pa(child)); + } else if (kvm_pte_huge(*entry)) { + return entry; + } else + child = (kvm_pte_t *)__va(PHYSADDR(*entry)); + kvm_ptw_enter(&ctx); + } + + entry = kvm_pgtable_offset(&ctx, child, addr); + + return entry; +} + +/* + * Page walker for VM shadow mmu at last level + * The last level is small pte page or huge pmd page + */ +static int kvm_ptw_leaf(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx) +{ + int ret; + phys_addr_t next, start, size; + struct list_head *list; + kvm_pte_t *entry, *child; + + ret = 0; + start = addr; + child = (kvm_pte_t *)__va(PHYSADDR(*dir)); + entry = kvm_pgtable_offset(ctx, child, addr); + do { + next = addr + (0x1UL << ctx->pgtable_shift); + if (!kvm_pte_present(ctx, entry)) + continue; + + ret |= ctx->ops(entry, addr, ctx); + } while (entry++, addr = next, addr < end); + + if (kvm_need_flush(ctx)) { + size = 0x1UL << (ctx->pgtable_shift + PAGE_SHIFT - 3); + if (start + size == end) { + list = (struct list_head *)child; + list_add_tail(list, &ctx->list); + *dir = ctx->invalid_ptes[ctx->level + 1]; + } + } + + return ret; +} + +/* + * Page walker for VM shadow mmu at page table dir level + */ +static int kvm_ptw_dir(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx) +{ + int ret; + phys_addr_t next, start, size; + struct list_head *list; + kvm_pte_t *entry, *child; + + ret = 0; + start = addr; + child = (kvm_pte_t *)__va(PHYSADDR(*dir)); + entry = kvm_pgtable_offset(ctx, child, addr); + do { + next = kvm_pgtable_addr_end(ctx, addr, end); + if (!kvm_pte_present(ctx, entry)) + continue; + + if (kvm_pte_huge(*entry)) { + ret |= ctx->ops(entry, addr, ctx); + continue; + } + + kvm_ptw_enter(ctx); + if (ctx->level == 0) + ret |= kvm_ptw_leaf(entry, addr, next, ctx); + else + ret |= kvm_ptw_dir(entry, addr, next, ctx); + kvm_ptw_exit(ctx); + } while (entry++, addr = next, addr < end); + + if (kvm_need_flush(ctx)) { + size = 0x1UL << (ctx->pgtable_shift + PAGE_SHIFT - 3); + if (start + size == end) { + list = (struct list_head *)child; + list_add_tail(list, &ctx->list); + *dir = ctx->invalid_ptes[ctx->level + 1]; + } + } + + return ret; +} + +/* + * Page walker for VM shadow mmu at page root table + */ +static int kvm_ptw_top(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx) +{ + int ret; + phys_addr_t next; + kvm_pte_t *entry; + + ret = 0; + entry = kvm_pgtable_offset(ctx, dir, addr); + do { + next = kvm_pgtable_addr_end(ctx, addr, end); + if (!kvm_pte_present(ctx, entry)) + continue; + + kvm_ptw_enter(ctx); + ret |= kvm_ptw_dir(entry, addr, next, ctx); + kvm_ptw_exit(ctx); + } while (entry++, addr = next, addr < end); + + return ret; +} + +/* + * kvm_flush_range() - Flush a range of guest physical addresses. + * @kvm: KVM pointer. + * @start_gfn: Guest frame number of first page in GPA range to flush. + * @end_gfn: Guest frame number of last page in GPA range to flush. + * @lock: Whether to hold mmu_lock or not + * + * Flushes a range of GPA mappings from the GPA page tables. + */ +static void kvm_flush_range(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn, int lock) +{ + int ret; + kvm_ptw_ctx ctx; + struct list_head *pos, *temp; + + ctx.ops = kvm_flush_pte; + ctx.flag = _KVM_FLUSH_PGTABLE; + kvm_ptw_prepare(kvm, &ctx); + INIT_LIST_HEAD(&ctx.list); + + if (lock) { + spin_lock(&kvm->mmu_lock); + ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT, + end_gfn << PAGE_SHIFT, &ctx); + spin_unlock(&kvm->mmu_lock); + } else + ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT, + end_gfn << PAGE_SHIFT, &ctx); + + /* Flush vpid for each vCPU individually */ + if (ret) + kvm_flush_remote_tlbs(kvm); + + /* + * free pte table page after mmu_lock + * the pte table page is linked together with ctx.list + */ + list_for_each_safe(pos, temp, &ctx.list) { + list_del(pos); + free_page((unsigned long)pos); + } +} + +/* + * kvm_mkclean_gpa_pt() - Make a range of guest physical addresses clean. + * @kvm: KVM pointer. + * @start_gfn: Guest frame number of first page in GPA range to flush. + * @end_gfn: Guest frame number of last page in GPA range to flush. + * + * Make a range of GPA mappings clean so that guest writes will fault and + * trigger dirty page logging. + * + * The caller must hold the @kvm->mmu_lock spinlock. + * + * Returns: Whether any GPA mappings were modified, which would require + * derived mappings (GVA page tables & TLB enties) to be + * invalidated. + */ +static int kvm_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn) +{ + kvm_ptw_ctx ctx; + + ctx.ops = kvm_mkclean_pte; + ctx.flag = 0; + kvm_ptw_prepare(kvm, &ctx); + return kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT, end_gfn << PAGE_SHIFT, &ctx); +} + +/* + * kvm_arch_mmu_enable_log_dirty_pt_masked() - write protect dirty pages + * @kvm: The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory + * slot to be write protected + * + * Walks bits set in mask write protects the associated pte's. Caller must + * acquire @kvm->mmu_lock. + */ +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) +{ + kvm_ptw_ctx ctx; + gfn_t base_gfn = slot->base_gfn + gfn_offset; + gfn_t start = base_gfn + __ffs(mask); + gfn_t end = base_gfn + __fls(mask) + 1; + + ctx.ops = kvm_mkclean_pte; + ctx.flag = _KVM_HAS_PGMASK; + ctx.mask = mask; + ctx.gfn = base_gfn; + kvm_ptw_prepare(kvm, &ctx); + + kvm_ptw_top(kvm->arch.pgd, start << PAGE_SHIFT, end << PAGE_SHIFT, &ctx); +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_memory_slot *old, + const struct kvm_memory_slot *new, + enum kvm_mr_change change) +{ + int needs_flush; + + /* + * If dirty page logging is enabled, write protect all pages in the slot + * ready for dirty logging. + * + * There is no need to do this in any of the following cases: + * CREATE: No dirty mappings will already exist. + * MOVE/DELETE: The old mappings will already have been cleaned up by + * kvm_arch_flush_shadow_memslot() + */ + if (change == KVM_MR_FLAGS_ONLY && + (!(old->flags & KVM_MEM_LOG_DIRTY_PAGES) && + new->flags & KVM_MEM_LOG_DIRTY_PAGES)) { + spin_lock(&kvm->mmu_lock); + /* Write protect GPA page table entries */ + needs_flush = kvm_mkclean_gpa_pt(kvm, new->base_gfn, + new->base_gfn + new->npages); + spin_unlock(&kvm->mmu_lock); + if (needs_flush) + kvm_flush_remote_tlbs(kvm); + } +} + +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ + kvm_flush_range(kvm, 0, kvm->arch.gpa_size >> PAGE_SHIFT, 0); +} + +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + /* + * The slot has been made invalid (ready for moving or deletion), so we + * need to ensure that it can no longer be accessed by any guest vCPUs. + */ + kvm_flush_range(kvm, slot->base_gfn, slot->base_gfn + slot->npages, 1); +} + +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) +{ + kvm_ptw_ctx ctx; + + ctx.flag = 0; + ctx.ops = kvm_flush_pte; + kvm_ptw_prepare(kvm, &ctx); + INIT_LIST_HEAD(&ctx.list); + + return kvm_ptw_top(kvm->arch.pgd, range->start << PAGE_SHIFT, + range->end << PAGE_SHIFT, &ctx); +} + +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + unsigned long prot_bits; + kvm_pte_t *ptep; + kvm_pfn_t pfn = pte_pfn(range->arg.pte); + gpa_t gpa = range->start << PAGE_SHIFT; + + ptep = kvm_populate_gpa(kvm, NULL, gpa, 0); + if (!ptep) + return false; + + /* Replacing an absent or old page doesn't need flushes */ + if (!kvm_pte_present(NULL, ptep) || !kvm_pte_young(*ptep)) { + kvm_set_pte(ptep, 0); + return false; + } + + /* Fill new pte if write protected or page migrated */ + prot_bits = _PAGE_PRESENT | __READABLE; + prot_bits |= _CACHE_MASK & pte_val(range->arg.pte); + + /* + * Set _PAGE_WRITE or _PAGE_DIRTY iff old and new pte both support + * _PAGE_WRITE for map_page_fast if next page write fault + * _PAGE_DIRTY since gpa has already recorded as dirty page + */ + prot_bits |= __WRITEABLE & *ptep & pte_val(range->arg.pte); + kvm_set_pte(ptep, kvm_pfn_pte(pfn, __pgprot(prot_bits))); + + return true; +} + +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + kvm_ptw_ctx ctx; + + ctx.flag = 0; + ctx.ops = kvm_mkold_pte; + kvm_ptw_prepare(kvm, &ctx); + + return kvm_ptw_top(kvm->arch.pgd, range->start << PAGE_SHIFT, + range->end << PAGE_SHIFT, &ctx); +} + +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + gpa_t gpa = range->start << PAGE_SHIFT; + kvm_pte_t *ptep = kvm_populate_gpa(kvm, NULL, gpa, 0); + + if (ptep && kvm_pte_present(NULL, ptep) && kvm_pte_young(*ptep)) + return true; + + return false; +} + +/* + * kvm_map_page_fast() - Fast path GPA fault handler. + * @vcpu: vCPU pointer. + * @gpa: Guest physical address of fault. + * @write: Whether the fault was due to a write. + * + * Perform fast path GPA fault handling, doing all that can be done without + * calling into KVM. This handles marking old pages young (for idle page + * tracking), and dirtying of clean pages (for dirty page logging). + * + * Returns: 0 on success, in which case we can update derived mappings and + * resume guest execution. + * -EFAULT on failure due to absent GPA mapping or write to + * read-only page, in which case KVM must be consulted. + */ +static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) +{ + int ret = 0; + kvm_pfn_t pfn = 0; + kvm_pte_t *ptep, changed, new; + gfn_t gfn = gpa >> PAGE_SHIFT; + struct kvm *kvm = vcpu->kvm; + struct kvm_memory_slot *slot; + + spin_lock(&kvm->mmu_lock); + + /* Fast path - just check GPA page table for an existing entry */ + ptep = kvm_populate_gpa(kvm, NULL, gpa, 0); + if (!ptep || !kvm_pte_present(NULL, ptep)) { + ret = -EFAULT; + goto out; + } + + /* Track access to pages marked old */ + new = *ptep; + if (!kvm_pte_young(new)) + new = kvm_pte_mkyoung(new); + /* call kvm_set_pfn_accessed() after unlock */ + + if (write && !kvm_pte_dirty(new)) { + if (!kvm_pte_write(new)) { + ret = -EFAULT; + goto out; + } + + if (kvm_pte_huge(new)) { + /* + * Do not set write permission when dirty logging is + * enabled for HugePages + */ + slot = gfn_to_memslot(kvm, gfn); + if (kvm_slot_dirty_track_enabled(slot)) { + ret = -EFAULT; + goto out; + } + } + + /* Track dirtying of writeable pages */ + new = kvm_pte_mkdirty(new); + } + + changed = new ^ (*ptep); + if (changed) { + kvm_set_pte(ptep, new); + pfn = kvm_pte_pfn(new); + } + spin_unlock(&kvm->mmu_lock); + + /* + * Fixme: pfn may be freed after mmu_lock + * kvm_try_get_pfn(pfn)/kvm_release_pfn pair to prevent this? + */ + if (kvm_pte_young(changed)) + kvm_set_pfn_accessed(pfn); + + if (kvm_pte_dirty(changed)) { + mark_page_dirty(kvm, gfn); + kvm_set_pfn_dirty(pfn); + } + return ret; +out: + spin_unlock(&kvm->mmu_lock); + return ret; +} + +static bool fault_supports_huge_mapping(struct kvm_memory_slot *memslot, + unsigned long hva, unsigned long map_size, bool write) +{ + size_t size; + gpa_t gpa_start; + hva_t uaddr_start, uaddr_end; + + /* Disable dirty logging on HugePages */ + if (kvm_slot_dirty_track_enabled(memslot) && write) + return false; + + size = memslot->npages * PAGE_SIZE; + gpa_start = memslot->base_gfn << PAGE_SHIFT; + uaddr_start = memslot->userspace_addr; + uaddr_end = uaddr_start + size; + + /* + * Pages belonging to memslots that don't have the same alignment + * within a PMD for userspace and GPA cannot be mapped with stage-2 + * PMD entries, because we'll end up mapping the wrong pages. + * + * Consider a layout like the following: + * + * memslot->userspace_addr: + * +-----+--------------------+--------------------+---+ + * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| + * +-----+--------------------+--------------------+---+ + * + * memslot->base_gfn << PAGE_SIZE: + * +---+--------------------+--------------------+-----+ + * |abc|def Stage-2 block | Stage-2 block |tvxyz| + * +---+--------------------+--------------------+-----+ + * + * If we create those stage-2 blocks, we'll end up with this incorrect + * mapping: + * d -> f + * e -> g + * f -> h + */ + if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))) + return false; + + /* + * Next, let's make sure we're not trying to map anything not covered + * by the memslot. This means we have to prohibit block size mappings + * for the beginning and end of a non-block aligned and non-block sized + * memory slot (illustrated by the head and tail parts of the + * userspace view above containing pages 'abcde' and 'xyz', + * respectively). + * + * Note that it doesn't matter if we do the check using the + * userspace_addr or the base_gfn, as both are equally aligned (per + * the check above) and equally sized. + */ + return (hva & ~(map_size - 1)) >= uaddr_start && + (hva & ~(map_size - 1)) + map_size <= uaddr_end; +} + +/* + * Lookup the mapping level for @gfn in the current mm. + * + * WARNING! Use of host_pfn_mapping_level() requires the caller and the end + * consumer to be tied into KVM's handlers for MMU notifier events! + * + * There are several ways to safely use this helper: + * + * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before + * consuming it. In this case, mmu_lock doesn't need to be held during the + * lookup, but it does need to be held while checking the MMU notifier. + * + * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation + * event for the hva. This can be done by explicit checking the MMU notifier + * or by ensuring that KVM already has a valid mapping that covers the hva. + * + * - Do not use the result to install new mappings, e.g. use the host mapping + * level only to decide whether or not to zap an entry. In this case, it's + * not required to hold mmu_lock (though it's highly likely the caller will + * want to hold mmu_lock anyways, e.g. to modify SPTEs). + * + * Note! The lookup can still race with modifications to host page tables, but + * the above "rules" ensure KVM will not _consume_ the result of the walk if a + * race with the primary MMU occurs. + */ +static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, + const struct kvm_memory_slot *slot) +{ + int level = 0; + unsigned long hva; + unsigned long flags; + pgd_t pgd; + p4d_t p4d; + pud_t pud; + pmd_t pmd; + + /* + * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() + * is not solely for performance, it's also necessary to avoid the + * "writable" check in __gfn_to_hva_many(), which will always fail on + * read-only memslots due to gfn_to_hva() assuming writes. Earlier + * page fault steps have already verified the guest isn't writing a + * read-only memslot. + */ + hva = __gfn_to_hva_memslot(slot, gfn); + + /* + * Disable IRQs to prevent concurrent tear down of host page tables, + * e.g. if the primary MMU promotes a P*D to a huge page and then frees + * the original page table. + */ + local_irq_save(flags); + + /* + * Read each entry once. As above, a non-leaf entry can be promoted to + * a huge page _during_ this walk. Re-reading the entry could send the + * walk into the weeks, e.g. p*d_large() returns false (sees the old + * value) and then p*d_offset() walks into the target huge page instead + * of the old page table (sees the new value). + */ + pgd = READ_ONCE(*pgd_offset(kvm->mm, hva)); + if (pgd_none(pgd)) + goto out; + + p4d = READ_ONCE(*p4d_offset(&pgd, hva)); + if (p4d_none(p4d) || !p4d_present(p4d)) + goto out; + + pud = READ_ONCE(*pud_offset(&p4d, hva)); + if (pud_none(pud) || !pud_present(pud)) + goto out; + + pmd = READ_ONCE(*pmd_offset(&pud, hva)); + if (pmd_none(pmd) || !pmd_present(pmd)) + goto out; + + if (kvm_pte_huge(pmd_val(pmd))) + level = 1; + +out: + local_irq_restore(flags); + return level; +} + +/* + * Split huge page + */ +static kvm_pte_t *kvm_split_huge(struct kvm_vcpu *vcpu, kvm_pte_t *ptep, gfn_t gfn) +{ + int i; + kvm_pte_t val, *child; + struct kvm *kvm = vcpu->kvm; + struct kvm_mmu_memory_cache *memcache; + + memcache = &vcpu->arch.mmu_page_cache; + child = kvm_mmu_memory_cache_alloc(memcache); + val = kvm_pte_mksmall(*ptep); + for (i = 0; i < PTRS_PER_PTE; i++) { + kvm_set_pte(child + i, val); + val += PAGE_SIZE; + } + + /* The later kvm_flush_tlb_gpa() will flush hugepage tlb */ + kvm_set_pte(ptep, __pa(child)); + + kvm->stat.hugepages--; + kvm->stat.pages += PTRS_PER_PTE; + + return child + (gfn & (PTRS_PER_PTE - 1)); +} + +/* + * kvm_map_page() - Map a guest physical page. + * @vcpu: vCPU pointer. + * @gpa: Guest physical address of fault. + * @write: Whether the fault was due to a write. + * + * Handle GPA faults by creating a new GPA mapping (or updating an existing + * one). + * + * This takes care of marking pages young or dirty (idle/dirty page tracking), + * asking KVM for the corresponding PFN, and creating a mapping in the GPA page + * tables. Derived mappings (GVA page tables and TLBs) must be handled by the + * caller. + * + * Returns: 0 on success + * -EFAULT if there is no memory region at @gpa or a write was + * attempted to a read-only memory region. This is usually handled + * as an MMIO access. + */ +static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) +{ + bool writeable; + int srcu_idx, err, retry_no = 0, level; + unsigned long hva, mmu_seq, prot_bits; + kvm_pfn_t pfn; + kvm_pte_t *ptep, new_pte; + gfn_t gfn = gpa >> PAGE_SHIFT; + struct kvm *kvm = vcpu->kvm; + struct kvm_memory_slot *memslot; + struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; + + /* Try the fast path to handle old / clean pages */ + srcu_idx = srcu_read_lock(&kvm->srcu); + err = kvm_map_page_fast(vcpu, gpa, write); + if (!err) + goto out; + + memslot = gfn_to_memslot(kvm, gfn); + hva = gfn_to_hva_memslot_prot(memslot, gfn, &writeable); + if (kvm_is_error_hva(hva) || (write && !writeable)) { + err = -EFAULT; + goto out; + } + + /* We need a minimum of cached pages ready for page table creation */ + err = kvm_mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES); + if (err) + goto out; + +retry: + /* + * Used to check for invalidations in progress, of the pfn that is + * returned by pfn_to_pfn_prot below. + */ + mmu_seq = kvm->mmu_invalidate_seq; + /* + * Ensure the read of mmu_invalidate_seq isn't reordered with PTE reads in + * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't + * risk the page we get a reference to getting unmapped before we have a + * chance to grab the mmu_lock without mmu_invalidate_retry() noticing. + * + * This smp_rmb() pairs with the effective smp_wmb() of the combination + * of the pte_unmap_unlock() after the PTE is zapped, and the + * spin_lock() in kvm_mmu_invalidate_invalidate_<page|range_end>() before + * mmu_invalidate_seq is incremented. + */ + smp_rmb(); + + /* Slow path - ask KVM core whether we can access this GPA */ + pfn = gfn_to_pfn_prot(kvm, gfn, write, &writeable); + if (is_error_noslot_pfn(pfn)) { + err = -EFAULT; + goto out; + } + + /* Check if an invalidation has taken place since we got pfn */ + spin_lock(&kvm->mmu_lock); + if (mmu_invalidate_retry_hva(kvm, mmu_seq, hva)) { + /* + * This can happen when mappings are changed asynchronously, but + * also synchronously if a COW is triggered by + * gfn_to_pfn_prot(). + */ + spin_unlock(&kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + if (retry_no > 100) { + retry_no = 0; + schedule(); + } + retry_no++; + goto retry; + } + + /* + * For emulated devices such virtio device, actual cache attribute is + * determined by physical machine. + * For pass through physical device, it should be uncachable + */ + prot_bits = _PAGE_PRESENT | __READABLE; + if (pfn_valid(pfn)) + prot_bits |= _CACHE_CC; + else + prot_bits |= _CACHE_SUC; + + if (writeable) { + prot_bits |= _PAGE_WRITE; + if (write) + prot_bits |= __WRITEABLE; + } + + /* Disable dirty logging on HugePages */ + level = 0; + if (!fault_supports_huge_mapping(memslot, hva, PMD_SIZE, write)) { + level = 0; + } else { + level = host_pfn_mapping_level(kvm, gfn, memslot); + if (level == 1) { + gfn = gfn & ~(PTRS_PER_PTE - 1); + pfn = pfn & ~(PTRS_PER_PTE - 1); + } + } + + /* Ensure page tables are allocated */ + ptep = kvm_populate_gpa(kvm, memcache, gpa, level); + new_pte = kvm_pfn_pte(pfn, __pgprot(prot_bits)); + if (level == 1) { + new_pte = kvm_pte_mkhuge(new_pte); + /* + * previous pmd entry is invalid_pte_table + * there is invalid tlb with small page + * need flush these invalid tlbs for current vcpu + */ + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + ++kvm->stat.hugepages; + } else if (kvm_pte_huge(*ptep) && write) + ptep = kvm_split_huge(vcpu, ptep, gfn); + else + ++kvm->stat.pages; + kvm_set_pte(ptep, new_pte); + spin_unlock(&kvm->mmu_lock); + + if (prot_bits & _PAGE_DIRTY) { + mark_page_dirty_in_slot(kvm, memslot, gfn); + kvm_set_pfn_dirty(pfn); + } + + kvm_set_pfn_accessed(pfn); + kvm_release_pfn_clean(pfn); +out: + srcu_read_unlock(&kvm->srcu, srcu_idx); + return err; +} + +int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) +{ + int ret; + + ret = kvm_map_page(vcpu, gpa, write); + if (ret) + return ret; + + /* Invalidate this entry in the TLB */ + kvm_flush_tlb_gpa(vcpu, gpa); + + return 0; +} + +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) +{ +} + +int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, + struct kvm_memory_slot *new, enum kvm_mr_change change) +{ + return 0; +} + +void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, + const struct kvm_memory_slot *memslot) +{ + kvm_flush_remote_tlbs(kvm); +} diff --git a/arch/loongarch/kvm/switch.S b/arch/loongarch/kvm/switch.S new file mode 100644 index 0000000000..0ed9040307 --- /dev/null +++ b/arch/loongarch/kvm/switch.S @@ -0,0 +1,250 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#include <linux/linkage.h> +#include <asm/asm.h> +#include <asm/asmmacro.h> +#include <asm/loongarch.h> +#include <asm/regdef.h> +#include <asm/stackframe.h> + +#define HGPR_OFFSET(x) (PT_R0 + 8*x) +#define GGPR_OFFSET(x) (KVM_ARCH_GGPR + 8*x) + +.macro kvm_save_host_gpr base + .irp n,1,2,3,22,23,24,25,26,27,28,29,30,31 + st.d $r\n, \base, HGPR_OFFSET(\n) + .endr +.endm + +.macro kvm_restore_host_gpr base + .irp n,1,2,3,22,23,24,25,26,27,28,29,30,31 + ld.d $r\n, \base, HGPR_OFFSET(\n) + .endr +.endm + +/* + * Save and restore all GPRs except base register, + * and default value of base register is a2. + */ +.macro kvm_save_guest_gprs base + .irp n,1,2,3,4,5,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 + st.d $r\n, \base, GGPR_OFFSET(\n) + .endr +.endm + +.macro kvm_restore_guest_gprs base + .irp n,1,2,3,4,5,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 + ld.d $r\n, \base, GGPR_OFFSET(\n) + .endr +.endm + +/* + * Prepare switch to guest, save host regs and restore guest regs. + * a2: kvm_vcpu_arch, don't touch it until 'ertn' + * t0, t1: temp register + */ +.macro kvm_switch_to_guest + /* Set host ECFG.VS=0, all exceptions share one exception entry */ + csrrd t0, LOONGARCH_CSR_ECFG + bstrins.w t0, zero, CSR_ECFG_VS_SHIFT_END, CSR_ECFG_VS_SHIFT + csrwr t0, LOONGARCH_CSR_ECFG + + /* Load up the new EENTRY */ + ld.d t0, a2, KVM_ARCH_GEENTRY + csrwr t0, LOONGARCH_CSR_EENTRY + + /* Set Guest ERA */ + ld.d t0, a2, KVM_ARCH_GPC + csrwr t0, LOONGARCH_CSR_ERA + + /* Save host PGDL */ + csrrd t0, LOONGARCH_CSR_PGDL + st.d t0, a2, KVM_ARCH_HPGD + + /* Switch to kvm */ + ld.d t1, a2, KVM_VCPU_KVM - KVM_VCPU_ARCH + + /* Load guest PGDL */ + li.w t0, KVM_GPGD + ldx.d t0, t1, t0 + csrwr t0, LOONGARCH_CSR_PGDL + + /* Mix GID and RID */ + csrrd t1, LOONGARCH_CSR_GSTAT + bstrpick.w t1, t1, CSR_GSTAT_GID_SHIFT_END, CSR_GSTAT_GID_SHIFT + csrrd t0, LOONGARCH_CSR_GTLBC + bstrins.w t0, t1, CSR_GTLBC_TGID_SHIFT_END, CSR_GTLBC_TGID_SHIFT + csrwr t0, LOONGARCH_CSR_GTLBC + + /* + * Enable intr in root mode with future ertn so that host interrupt + * can be responsed during VM runs + * Guest CRMD comes from separate GCSR_CRMD register + */ + ori t0, zero, CSR_PRMD_PIE + csrxchg t0, t0, LOONGARCH_CSR_PRMD + + /* Set PVM bit to setup ertn to guest context */ + ori t0, zero, CSR_GSTAT_PVM + csrxchg t0, t0, LOONGARCH_CSR_GSTAT + + /* Load Guest GPRs */ + kvm_restore_guest_gprs a2 + /* Load KVM_ARCH register */ + ld.d a2, a2, (KVM_ARCH_GGPR + 8 * REG_A2) + + ertn /* Switch to guest: GSTAT.PGM = 1, ERRCTL.ISERR = 0, TLBRPRMD.ISTLBR = 0 */ +.endm + + /* + * Exception entry for general exception from guest mode + * - IRQ is disabled + * - kernel privilege in root mode + * - page mode keep unchanged from previous PRMD in root mode + * - Fixme: tlb exception cannot happen since registers relative with TLB + * - is still in guest mode, such as pgd table/vmid registers etc, + * - will fix with hw page walk enabled in future + * load kvm_vcpu from reserved CSR KVM_VCPU_KS, and save a2 to KVM_TEMP_KS + */ + .text + .cfi_sections .debug_frame +SYM_CODE_START(kvm_exc_entry) + csrwr a2, KVM_TEMP_KS + csrrd a2, KVM_VCPU_KS + addi.d a2, a2, KVM_VCPU_ARCH + + /* After save GPRs, free to use any GPR */ + kvm_save_guest_gprs a2 + /* Save guest A2 */ + csrrd t0, KVM_TEMP_KS + st.d t0, a2, (KVM_ARCH_GGPR + 8 * REG_A2) + + /* A2 is kvm_vcpu_arch, A1 is free to use */ + csrrd s1, KVM_VCPU_KS + ld.d s0, s1, KVM_VCPU_RUN + + csrrd t0, LOONGARCH_CSR_ESTAT + st.d t0, a2, KVM_ARCH_HESTAT + csrrd t0, LOONGARCH_CSR_ERA + st.d t0, a2, KVM_ARCH_GPC + csrrd t0, LOONGARCH_CSR_BADV + st.d t0, a2, KVM_ARCH_HBADV + csrrd t0, LOONGARCH_CSR_BADI + st.d t0, a2, KVM_ARCH_HBADI + + /* Restore host ECFG.VS */ + csrrd t0, LOONGARCH_CSR_ECFG + ld.d t1, a2, KVM_ARCH_HECFG + or t0, t0, t1 + csrwr t0, LOONGARCH_CSR_ECFG + + /* Restore host EENTRY */ + ld.d t0, a2, KVM_ARCH_HEENTRY + csrwr t0, LOONGARCH_CSR_EENTRY + + /* Restore host pgd table */ + ld.d t0, a2, KVM_ARCH_HPGD + csrwr t0, LOONGARCH_CSR_PGDL + + /* + * Disable PGM bit to enter root mode by default with next ertn + */ + ori t0, zero, CSR_GSTAT_PVM + csrxchg zero, t0, LOONGARCH_CSR_GSTAT + + /* + * Clear GTLBC.TGID field + * 0: for root tlb update in future tlb instr + * others: for guest tlb update like gpa to hpa in future tlb instr + */ + csrrd t0, LOONGARCH_CSR_GTLBC + bstrins.w t0, zero, CSR_GTLBC_TGID_SHIFT_END, CSR_GTLBC_TGID_SHIFT + csrwr t0, LOONGARCH_CSR_GTLBC + ld.d tp, a2, KVM_ARCH_HTP + ld.d sp, a2, KVM_ARCH_HSP + /* restore per cpu register */ + ld.d u0, a2, KVM_ARCH_HPERCPU + addi.d sp, sp, -PT_SIZE + + /* Prepare handle exception */ + or a0, s0, zero + or a1, s1, zero + ld.d t8, a2, KVM_ARCH_HANDLE_EXIT + jirl ra, t8, 0 + + or a2, s1, zero + addi.d a2, a2, KVM_VCPU_ARCH + + /* Resume host when ret <= 0 */ + blez a0, ret_to_host + + /* + * Return to guest + * Save per cpu register again, maybe switched to another cpu + */ + st.d u0, a2, KVM_ARCH_HPERCPU + + /* Save kvm_vcpu to kscratch */ + csrwr s1, KVM_VCPU_KS + kvm_switch_to_guest + +ret_to_host: + ld.d a2, a2, KVM_ARCH_HSP + addi.d a2, a2, -PT_SIZE + kvm_restore_host_gpr a2 + jr ra + +SYM_INNER_LABEL(kvm_exc_entry_end, SYM_L_LOCAL) +SYM_CODE_END(kvm_exc_entry) + +/* + * int kvm_enter_guest(struct kvm_run *run, struct kvm_vcpu *vcpu) + * + * @register_param: + * a0: kvm_run* run + * a1: kvm_vcpu* vcpu + */ +SYM_FUNC_START(kvm_enter_guest) + /* Allocate space in stack bottom */ + addi.d a2, sp, -PT_SIZE + /* Save host GPRs */ + kvm_save_host_gpr a2 + + /* Save host CRMD, PRMD to stack */ + csrrd a3, LOONGARCH_CSR_CRMD + st.d a3, a2, PT_CRMD + csrrd a3, LOONGARCH_CSR_PRMD + st.d a3, a2, PT_PRMD + + addi.d a2, a1, KVM_VCPU_ARCH + st.d sp, a2, KVM_ARCH_HSP + st.d tp, a2, KVM_ARCH_HTP + /* Save per cpu register */ + st.d u0, a2, KVM_ARCH_HPERCPU + + /* Save kvm_vcpu to kscratch */ + csrwr a1, KVM_VCPU_KS + kvm_switch_to_guest +SYM_INNER_LABEL(kvm_enter_guest_end, SYM_L_LOCAL) +SYM_FUNC_END(kvm_enter_guest) + +SYM_FUNC_START(kvm_save_fpu) + fpu_save_csr a0 t1 + fpu_save_double a0 t1 + fpu_save_cc a0 t1 t2 + jr ra +SYM_FUNC_END(kvm_save_fpu) + +SYM_FUNC_START(kvm_restore_fpu) + fpu_restore_double a0 t1 + fpu_restore_csr a0 t1 t2 + fpu_restore_cc a0 t1 t2 + jr ra +SYM_FUNC_END(kvm_restore_fpu) + + .section ".rodata" +SYM_DATA(kvm_exception_size, .quad kvm_exc_entry_end - kvm_exc_entry) +SYM_DATA(kvm_enter_guest_size, .quad kvm_enter_guest_end - kvm_enter_guest) diff --git a/arch/loongarch/kvm/timer.c b/arch/loongarch/kvm/timer.c new file mode 100644 index 0000000000..284bf553fe --- /dev/null +++ b/arch/loongarch/kvm/timer.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#include <linux/kvm_host.h> +#include <asm/kvm_csr.h> +#include <asm/kvm_vcpu.h> + +/* + * ktime_to_tick() - Scale ktime_t to timer tick value. + */ +static inline u64 ktime_to_tick(struct kvm_vcpu *vcpu, ktime_t now) +{ + u64 delta; + + delta = ktime_to_ns(now); + return div_u64(delta * vcpu->arch.timer_mhz, MNSEC_PER_SEC); +} + +static inline u64 tick_to_ns(struct kvm_vcpu *vcpu, u64 tick) +{ + return div_u64(tick * MNSEC_PER_SEC, vcpu->arch.timer_mhz); +} + +/* + * Push timer forward on timeout. + * Handle an hrtimer event by push the hrtimer forward a period. + */ +static enum hrtimer_restart kvm_count_timeout(struct kvm_vcpu *vcpu) +{ + unsigned long cfg, period; + + /* Add periodic tick to current expire time */ + cfg = kvm_read_sw_gcsr(vcpu->arch.csr, LOONGARCH_CSR_TCFG); + if (cfg & CSR_TCFG_PERIOD) { + period = tick_to_ns(vcpu, cfg & CSR_TCFG_VAL); + hrtimer_add_expires_ns(&vcpu->arch.swtimer, period); + return HRTIMER_RESTART; + } else + return HRTIMER_NORESTART; +} + +/* Low level hrtimer wake routine */ +enum hrtimer_restart kvm_swtimer_wakeup(struct hrtimer *timer) +{ + struct kvm_vcpu *vcpu; + + vcpu = container_of(timer, struct kvm_vcpu, arch.swtimer); + kvm_queue_irq(vcpu, INT_TI); + rcuwait_wake_up(&vcpu->wait); + + return kvm_count_timeout(vcpu); +} + +/* + * Initialise the timer to the specified frequency, zero it + */ +void kvm_init_timer(struct kvm_vcpu *vcpu, unsigned long timer_hz) +{ + vcpu->arch.timer_mhz = timer_hz >> 20; + + /* Starting at 0 */ + kvm_write_sw_gcsr(vcpu->arch.csr, LOONGARCH_CSR_TVAL, 0); +} + +/* + * Restore hard timer state and enable guest to access timer registers + * without trap, should be called with irq disabled + */ +void kvm_acquire_timer(struct kvm_vcpu *vcpu) +{ + unsigned long cfg; + + cfg = read_csr_gcfg(); + if (!(cfg & CSR_GCFG_TIT)) + return; + + /* Enable guest access to hard timer */ + write_csr_gcfg(cfg & ~CSR_GCFG_TIT); + + /* + * Freeze the soft-timer and sync the guest stable timer with it. We do + * this with interrupts disabled to avoid latency. + */ + hrtimer_cancel(&vcpu->arch.swtimer); +} + +/* + * Restore soft timer state from saved context. + */ +void kvm_restore_timer(struct kvm_vcpu *vcpu) +{ + unsigned long cfg, delta, period; + ktime_t expire, now; + struct loongarch_csrs *csr = vcpu->arch.csr; + + /* + * Set guest stable timer cfg csr + */ + cfg = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_TCFG); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ESTAT); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TCFG); + if (!(cfg & CSR_TCFG_EN)) { + /* Guest timer is disabled, just restore timer registers */ + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TVAL); + return; + } + + /* + * Set remainder tick value if not expired + */ + now = ktime_get(); + expire = vcpu->arch.expire; + if (ktime_before(now, expire)) + delta = ktime_to_tick(vcpu, ktime_sub(expire, now)); + else { + if (cfg & CSR_TCFG_PERIOD) { + period = cfg & CSR_TCFG_VAL; + delta = ktime_to_tick(vcpu, ktime_sub(now, expire)); + delta = period - (delta % period); + } else + delta = 0; + /* + * Inject timer here though sw timer should inject timer + * interrupt async already, since sw timer may be cancelled + * during injecting intr async in function kvm_acquire_timer + */ + kvm_queue_irq(vcpu, INT_TI); + } + + write_gcsr_timertick(delta); +} + +/* + * Save guest timer state and switch to software emulation of guest + * timer. The hard timer must already be in use, so preemption should be + * disabled. + */ +static void _kvm_save_timer(struct kvm_vcpu *vcpu) +{ + unsigned long ticks, delta; + ktime_t expire; + struct loongarch_csrs *csr = vcpu->arch.csr; + + ticks = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_TVAL); + delta = tick_to_ns(vcpu, ticks); + expire = ktime_add_ns(ktime_get(), delta); + vcpu->arch.expire = expire; + if (ticks) { + /* + * Update hrtimer to use new timeout + * HRTIMER_MODE_PINNED is suggested since vcpu may run in + * the same physical cpu in next time + */ + hrtimer_cancel(&vcpu->arch.swtimer); + hrtimer_start(&vcpu->arch.swtimer, expire, HRTIMER_MODE_ABS_PINNED); + } else + /* + * Inject timer interrupt so that hall polling can dectect and exit + */ + kvm_queue_irq(vcpu, INT_TI); +} + +/* + * Save guest timer state and switch to soft guest timer if hard timer was in + * use. + */ +void kvm_save_timer(struct kvm_vcpu *vcpu) +{ + unsigned long cfg; + struct loongarch_csrs *csr = vcpu->arch.csr; + + preempt_disable(); + cfg = read_csr_gcfg(); + if (!(cfg & CSR_GCFG_TIT)) { + /* Disable guest use of hard timer */ + write_csr_gcfg(cfg | CSR_GCFG_TIT); + + /* Save hard timer state */ + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TCFG); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TVAL); + if (kvm_read_sw_gcsr(csr, LOONGARCH_CSR_TCFG) & CSR_TCFG_EN) + _kvm_save_timer(vcpu); + } + + /* Save timer-related state to vCPU context */ + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ESTAT); + preempt_enable(); +} + +void kvm_reset_timer(struct kvm_vcpu *vcpu) +{ + write_gcsr_timercfg(0); + kvm_write_sw_gcsr(vcpu->arch.csr, LOONGARCH_CSR_TCFG, 0); + hrtimer_cancel(&vcpu->arch.swtimer); +} diff --git a/arch/loongarch/kvm/tlb.c b/arch/loongarch/kvm/tlb.c new file mode 100644 index 0000000000..02535df6b5 --- /dev/null +++ b/arch/loongarch/kvm/tlb.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#include <linux/kvm_host.h> +#include <asm/tlb.h> +#include <asm/kvm_csr.h> + +/* + * kvm_flush_tlb_all() - Flush all root TLB entries for guests. + * + * Invalidate all entries including GVA-->GPA and GPA-->HPA mappings. + */ +void kvm_flush_tlb_all(void) +{ + unsigned long flags; + + local_irq_save(flags); + invtlb_all(INVTLB_ALLGID, 0, 0); + local_irq_restore(flags); +} + +void kvm_flush_tlb_gpa(struct kvm_vcpu *vcpu, unsigned long gpa) +{ + unsigned long flags; + + local_irq_save(flags); + gpa &= (PAGE_MASK << 1); + invtlb(INVTLB_GID_ADDR, read_csr_gstat() & CSR_GSTAT_GID, gpa); + local_irq_restore(flags); +} diff --git a/arch/loongarch/kvm/trace.h b/arch/loongarch/kvm/trace.h new file mode 100644 index 0000000000..a1e35d6554 --- /dev/null +++ b/arch/loongarch/kvm/trace.h @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_H + +#include <linux/tracepoint.h> +#include <asm/kvm_csr.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm + +/* + * Tracepoints for VM enters + */ +DECLARE_EVENT_CLASS(kvm_transition, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), + TP_STRUCT__entry( + __field(unsigned long, pc) + ), + + TP_fast_assign( + __entry->pc = vcpu->arch.pc; + ), + + TP_printk("PC: 0x%08lx", __entry->pc) +); + +DEFINE_EVENT(kvm_transition, kvm_enter, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu)); + +DEFINE_EVENT(kvm_transition, kvm_reenter, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu)); + +DEFINE_EVENT(kvm_transition, kvm_out, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu)); + +/* Further exit reasons */ +#define KVM_TRACE_EXIT_IDLE 64 +#define KVM_TRACE_EXIT_CACHE 65 + +/* Tracepoints for VM exits */ +#define kvm_trace_symbol_exit_types \ + { KVM_TRACE_EXIT_IDLE, "IDLE" }, \ + { KVM_TRACE_EXIT_CACHE, "CACHE" } + +DECLARE_EVENT_CLASS(kvm_exit, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason), + TP_ARGS(vcpu, reason), + TP_STRUCT__entry( + __field(unsigned long, pc) + __field(unsigned int, reason) + ), + + TP_fast_assign( + __entry->pc = vcpu->arch.pc; + __entry->reason = reason; + ), + + TP_printk("[%s]PC: 0x%08lx", + __print_symbolic(__entry->reason, + kvm_trace_symbol_exit_types), + __entry->pc) +); + +DEFINE_EVENT(kvm_exit, kvm_exit_idle, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason), + TP_ARGS(vcpu, reason)); + +DEFINE_EVENT(kvm_exit, kvm_exit_cache, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason), + TP_ARGS(vcpu, reason)); + +DEFINE_EVENT(kvm_exit, kvm_exit, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason), + TP_ARGS(vcpu, reason)); + +TRACE_EVENT(kvm_exit_gspr, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int inst_word), + TP_ARGS(vcpu, inst_word), + TP_STRUCT__entry( + __field(unsigned int, inst_word) + ), + + TP_fast_assign( + __entry->inst_word = inst_word; + ), + + TP_printk("Inst word: 0x%08x", __entry->inst_word) +); + +#define KVM_TRACE_AUX_SAVE 0 +#define KVM_TRACE_AUX_RESTORE 1 +#define KVM_TRACE_AUX_ENABLE 2 +#define KVM_TRACE_AUX_DISABLE 3 +#define KVM_TRACE_AUX_DISCARD 4 + +#define KVM_TRACE_AUX_FPU 1 + +#define kvm_trace_symbol_aux_op \ + { KVM_TRACE_AUX_SAVE, "save" }, \ + { KVM_TRACE_AUX_RESTORE, "restore" }, \ + { KVM_TRACE_AUX_ENABLE, "enable" }, \ + { KVM_TRACE_AUX_DISABLE, "disable" }, \ + { KVM_TRACE_AUX_DISCARD, "discard" } + +#define kvm_trace_symbol_aux_state \ + { KVM_TRACE_AUX_FPU, "FPU" } + +TRACE_EVENT(kvm_aux, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int op, + unsigned int state), + TP_ARGS(vcpu, op, state), + TP_STRUCT__entry( + __field(unsigned long, pc) + __field(u8, op) + __field(u8, state) + ), + + TP_fast_assign( + __entry->pc = vcpu->arch.pc; + __entry->op = op; + __entry->state = state; + ), + + TP_printk("%s %s PC: 0x%08lx", + __print_symbolic(__entry->op, + kvm_trace_symbol_aux_op), + __print_symbolic(__entry->state, + kvm_trace_symbol_aux_state), + __entry->pc) +); + +TRACE_EVENT(kvm_vpid_change, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned long vpid), + TP_ARGS(vcpu, vpid), + TP_STRUCT__entry( + __field(unsigned long, vpid) + ), + + TP_fast_assign( + __entry->vpid = vpid; + ), + + TP_printk("VPID: 0x%08lx", __entry->vpid) +); + +#endif /* _TRACE_KVM_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../arch/loongarch/kvm +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c new file mode 100644 index 0000000000..73d0c2b9c1 --- /dev/null +++ b/arch/loongarch/kvm/vcpu.c @@ -0,0 +1,939 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#include <linux/kvm_host.h> +#include <linux/entry-kvm.h> +#include <asm/fpu.h> +#include <asm/loongarch.h> +#include <asm/setup.h> +#include <asm/time.h> + +#define CREATE_TRACE_POINTS +#include "trace.h" + +const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { + KVM_GENERIC_VCPU_STATS(), + STATS_DESC_COUNTER(VCPU, int_exits), + STATS_DESC_COUNTER(VCPU, idle_exits), + STATS_DESC_COUNTER(VCPU, cpucfg_exits), + STATS_DESC_COUNTER(VCPU, signal_exits), +}; + +const struct kvm_stats_header kvm_vcpu_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vcpu_stats_desc), +}; + +/* + * kvm_check_requests - check and handle pending vCPU requests + * + * Return: RESUME_GUEST if we should enter the guest + * RESUME_HOST if we should exit to userspace + */ +static int kvm_check_requests(struct kvm_vcpu *vcpu) +{ + if (!kvm_request_pending(vcpu)) + return RESUME_GUEST; + + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) + vcpu->arch.vpid = 0; /* Drop vpid for this vCPU */ + + if (kvm_dirty_ring_check_request(vcpu)) + return RESUME_HOST; + + return RESUME_GUEST; +} + +/* + * Check and handle pending signal and vCPU requests etc + * Run with irq enabled and preempt enabled + * + * Return: RESUME_GUEST if we should enter the guest + * RESUME_HOST if we should exit to userspace + * < 0 if we should exit to userspace, where the return value + * indicates an error + */ +static int kvm_enter_guest_check(struct kvm_vcpu *vcpu) +{ + int ret; + + /* + * Check conditions before entering the guest + */ + ret = xfer_to_guest_mode_handle_work(vcpu); + if (ret < 0) + return ret; + + ret = kvm_check_requests(vcpu); + + return ret; +} + +/* + * Called with irq enabled + * + * Return: RESUME_GUEST if we should enter the guest, and irq disabled + * Others if we should exit to userspace + */ +static int kvm_pre_enter_guest(struct kvm_vcpu *vcpu) +{ + int ret; + + do { + ret = kvm_enter_guest_check(vcpu); + if (ret != RESUME_GUEST) + break; + + /* + * Handle vcpu timer, interrupts, check requests and + * check vmid before vcpu enter guest + */ + local_irq_disable(); + kvm_acquire_timer(vcpu); + kvm_deliver_intr(vcpu); + kvm_deliver_exception(vcpu); + /* Make sure the vcpu mode has been written */ + smp_store_mb(vcpu->mode, IN_GUEST_MODE); + kvm_check_vpid(vcpu); + vcpu->arch.host_eentry = csr_read64(LOONGARCH_CSR_EENTRY); + /* Clear KVM_LARCH_SWCSR_LATEST as CSR will change when enter guest */ + vcpu->arch.aux_inuse &= ~KVM_LARCH_SWCSR_LATEST; + + if (kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending()) { + /* make sure the vcpu mode has been written */ + smp_store_mb(vcpu->mode, OUTSIDE_GUEST_MODE); + local_irq_enable(); + ret = -EAGAIN; + } + } while (ret != RESUME_GUEST); + + return ret; +} + +/* + * Return 1 for resume guest and "<= 0" for resume host. + */ +static int kvm_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + int ret = RESUME_GUEST; + unsigned long estat = vcpu->arch.host_estat; + u32 intr = estat & 0x1fff; /* Ignore NMI */ + u32 ecode = (estat & CSR_ESTAT_EXC) >> CSR_ESTAT_EXC_SHIFT; + + vcpu->mode = OUTSIDE_GUEST_MODE; + + /* Set a default exit reason */ + run->exit_reason = KVM_EXIT_UNKNOWN; + + guest_timing_exit_irqoff(); + guest_state_exit_irqoff(); + local_irq_enable(); + + trace_kvm_exit(vcpu, ecode); + if (ecode) { + ret = kvm_handle_fault(vcpu, ecode); + } else { + WARN(!intr, "vm exiting with suspicious irq\n"); + ++vcpu->stat.int_exits; + } + + if (ret == RESUME_GUEST) + ret = kvm_pre_enter_guest(vcpu); + + if (ret != RESUME_GUEST) { + local_irq_disable(); + return ret; + } + + guest_timing_enter_irqoff(); + guest_state_enter_irqoff(); + trace_kvm_reenter(vcpu); + + return RESUME_GUEST; +} + +int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) +{ + return !!(vcpu->arch.irq_pending) && + vcpu->arch.mp_state.mp_state == KVM_MP_STATE_RUNNABLE; +} + +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) +{ + return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; +} + +bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) +{ + return false; +} + +vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + +int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, + struct kvm_translation *tr) +{ + return -EINVAL; +} + +int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) +{ + return kvm_pending_timer(vcpu) || + kvm_read_hw_gcsr(LOONGARCH_CSR_ESTAT) & (1 << INT_TI); +} + +int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu) +{ + int i; + + kvm_debug("vCPU Register Dump:\n"); + kvm_debug("\tPC = 0x%08lx\n", vcpu->arch.pc); + kvm_debug("\tExceptions: %08lx\n", vcpu->arch.irq_pending); + + for (i = 0; i < 32; i += 4) { + kvm_debug("\tGPR%02d: %08lx %08lx %08lx %08lx\n", i, + vcpu->arch.gprs[i], vcpu->arch.gprs[i + 1], + vcpu->arch.gprs[i + 2], vcpu->arch.gprs[i + 3]); + } + + kvm_debug("\tCRMD: 0x%08lx, ESTAT: 0x%08lx\n", + kvm_read_hw_gcsr(LOONGARCH_CSR_CRMD), + kvm_read_hw_gcsr(LOONGARCH_CSR_ESTAT)); + + kvm_debug("\tERA: 0x%08lx\n", kvm_read_hw_gcsr(LOONGARCH_CSR_ERA)); + + return 0; +} + +int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + *mp_state = vcpu->arch.mp_state; + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + int ret = 0; + + switch (mp_state->mp_state) { + case KVM_MP_STATE_RUNNABLE: + vcpu->arch.mp_state = *mp_state; + break; + default: + ret = -EINVAL; + } + + return ret; +} + +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) +{ + return -EINVAL; +} + +/** + * kvm_migrate_count() - Migrate timer. + * @vcpu: Virtual CPU. + * + * Migrate hrtimer to the current CPU by cancelling and restarting it + * if the hrtimer is active. + * + * Must be called when the vCPU is migrated to a different CPU, so that + * the timer can interrupt the guest at the new CPU, and the timer irq can + * be delivered to the vCPU. + */ +static void kvm_migrate_count(struct kvm_vcpu *vcpu) +{ + if (hrtimer_cancel(&vcpu->arch.swtimer)) + hrtimer_restart(&vcpu->arch.swtimer); +} + +static int _kvm_getcsr(struct kvm_vcpu *vcpu, unsigned int id, u64 *val) +{ + unsigned long gintc; + struct loongarch_csrs *csr = vcpu->arch.csr; + + if (get_gcsr_flag(id) & INVALID_GCSR) + return -EINVAL; + + if (id == LOONGARCH_CSR_ESTAT) { + /* ESTAT IP0~IP7 get from GINTC */ + gintc = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_GINTC) & 0xff; + *val = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_ESTAT) | (gintc << 2); + return 0; + } + + /* + * Get software CSR state since software state is consistent + * with hardware for synchronous ioctl + */ + *val = kvm_read_sw_gcsr(csr, id); + + return 0; +} + +static int _kvm_setcsr(struct kvm_vcpu *vcpu, unsigned int id, u64 val) +{ + int ret = 0, gintc; + struct loongarch_csrs *csr = vcpu->arch.csr; + + if (get_gcsr_flag(id) & INVALID_GCSR) + return -EINVAL; + + if (id == LOONGARCH_CSR_ESTAT) { + /* ESTAT IP0~IP7 inject through GINTC */ + gintc = (val >> 2) & 0xff; + kvm_set_sw_gcsr(csr, LOONGARCH_CSR_GINTC, gintc); + + gintc = val & ~(0xffUL << 2); + kvm_set_sw_gcsr(csr, LOONGARCH_CSR_ESTAT, gintc); + + return ret; + } + + kvm_write_sw_gcsr(csr, id, val); + + return ret; +} + +static int kvm_get_one_reg(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg, u64 *v) +{ + int id, ret = 0; + u64 type = reg->id & KVM_REG_LOONGARCH_MASK; + + switch (type) { + case KVM_REG_LOONGARCH_CSR: + id = KVM_GET_IOC_CSR_IDX(reg->id); + ret = _kvm_getcsr(vcpu, id, v); + break; + case KVM_REG_LOONGARCH_CPUCFG: + id = KVM_GET_IOC_CPUCFG_IDX(reg->id); + if (id >= 0 && id < KVM_MAX_CPUCFG_REGS) + *v = vcpu->arch.cpucfg[id]; + else + ret = -EINVAL; + break; + case KVM_REG_LOONGARCH_KVM: + switch (reg->id) { + case KVM_REG_LOONGARCH_COUNTER: + *v = drdtime() + vcpu->kvm->arch.time_offset; + break; + default: + ret = -EINVAL; + break; + } + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int kvm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + int ret = 0; + u64 v, size = reg->id & KVM_REG_SIZE_MASK; + + switch (size) { + case KVM_REG_SIZE_U64: + ret = kvm_get_one_reg(vcpu, reg, &v); + if (ret) + return ret; + ret = put_user(v, (u64 __user *)(long)reg->addr); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int kvm_set_one_reg(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg, u64 v) +{ + int id, ret = 0; + u64 type = reg->id & KVM_REG_LOONGARCH_MASK; + + switch (type) { + case KVM_REG_LOONGARCH_CSR: + id = KVM_GET_IOC_CSR_IDX(reg->id); + ret = _kvm_setcsr(vcpu, id, v); + break; + case KVM_REG_LOONGARCH_CPUCFG: + id = KVM_GET_IOC_CPUCFG_IDX(reg->id); + if (id >= 0 && id < KVM_MAX_CPUCFG_REGS) + vcpu->arch.cpucfg[id] = (u32)v; + else + ret = -EINVAL; + break; + case KVM_REG_LOONGARCH_KVM: + switch (reg->id) { + case KVM_REG_LOONGARCH_COUNTER: + /* + * gftoffset is relative with board, not vcpu + * only set for the first time for smp system + */ + if (vcpu->vcpu_id == 0) + vcpu->kvm->arch.time_offset = (signed long)(v - drdtime()); + break; + case KVM_REG_LOONGARCH_VCPU_RESET: + kvm_reset_timer(vcpu); + memset(&vcpu->arch.irq_pending, 0, sizeof(vcpu->arch.irq_pending)); + memset(&vcpu->arch.irq_clear, 0, sizeof(vcpu->arch.irq_clear)); + break; + default: + ret = -EINVAL; + break; + } + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int kvm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + int ret = 0; + u64 v, size = reg->id & KVM_REG_SIZE_MASK; + + switch (size) { + case KVM_REG_SIZE_U64: + ret = get_user(v, (u64 __user *)(long)reg->addr); + if (ret) + return ret; + break; + default: + return -EINVAL; + } + + return kvm_set_one_reg(vcpu, reg, v); +} + +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + return -ENOIOCTLCMD; +} + +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + return -ENOIOCTLCMD; +} + +int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(vcpu->arch.gprs); i++) + regs->gpr[i] = vcpu->arch.gprs[i]; + + regs->pc = vcpu->arch.pc; + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + int i; + + for (i = 1; i < ARRAY_SIZE(vcpu->arch.gprs); i++) + vcpu->arch.gprs[i] = regs->gpr[i]; + + vcpu->arch.gprs[0] = 0; /* zero is special, and cannot be set. */ + vcpu->arch.pc = regs->pc; + + return 0; +} + +static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, + struct kvm_enable_cap *cap) +{ + /* FPU is enabled by default, will support LSX/LASX later. */ + return -EINVAL; +} + +long kvm_arch_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + long r; + void __user *argp = (void __user *)arg; + struct kvm_vcpu *vcpu = filp->private_data; + + /* + * Only software CSR should be modified + * + * If any hardware CSR register is modified, vcpu_load/vcpu_put pair + * should be used. Since CSR registers owns by this vcpu, if switch + * to other vcpus, other vcpus need reload CSR registers. + * + * If software CSR is modified, bit KVM_LARCH_HWCSR_USABLE should + * be clear in vcpu->arch.aux_inuse, and vcpu_load will check + * aux_inuse flag and reload CSR registers form software. + */ + + switch (ioctl) { + case KVM_SET_ONE_REG: + case KVM_GET_ONE_REG: { + struct kvm_one_reg reg; + + r = -EFAULT; + if (copy_from_user(®, argp, sizeof(reg))) + break; + if (ioctl == KVM_SET_ONE_REG) { + r = kvm_set_reg(vcpu, ®); + vcpu->arch.aux_inuse &= ~KVM_LARCH_HWCSR_USABLE; + } else + r = kvm_get_reg(vcpu, ®); + break; + } + case KVM_ENABLE_CAP: { + struct kvm_enable_cap cap; + + r = -EFAULT; + if (copy_from_user(&cap, argp, sizeof(cap))) + break; + r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); + break; + } + default: + r = -ENOIOCTLCMD; + break; + } + + return r; +} + +int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + int i = 0; + + fpu->fcc = vcpu->arch.fpu.fcc; + fpu->fcsr = vcpu->arch.fpu.fcsr; + for (i = 0; i < NUM_FPU_REGS; i++) + memcpy(&fpu->fpr[i], &vcpu->arch.fpu.fpr[i], FPU_REG_WIDTH / 64); + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + int i = 0; + + vcpu->arch.fpu.fcc = fpu->fcc; + vcpu->arch.fpu.fcsr = fpu->fcsr; + for (i = 0; i < NUM_FPU_REGS; i++) + memcpy(&vcpu->arch.fpu.fpr[i], &fpu->fpr[i], FPU_REG_WIDTH / 64); + + return 0; +} + +/* Enable FPU and restore context */ +void kvm_own_fpu(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + + /* Enable FPU */ + set_csr_euen(CSR_EUEN_FPEN); + + kvm_restore_fpu(&vcpu->arch.fpu); + vcpu->arch.aux_inuse |= KVM_LARCH_FPU; + trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, KVM_TRACE_AUX_FPU); + + preempt_enable(); +} + +/* Save context and disable FPU */ +void kvm_lose_fpu(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + + if (vcpu->arch.aux_inuse & KVM_LARCH_FPU) { + kvm_save_fpu(&vcpu->arch.fpu); + vcpu->arch.aux_inuse &= ~KVM_LARCH_FPU; + trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_FPU); + + /* Disable FPU */ + clear_csr_euen(CSR_EUEN_FPEN); + } + + preempt_enable(); +} + +int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) +{ + int intr = (int)irq->irq; + + if (intr > 0) + kvm_queue_irq(vcpu, intr); + else if (intr < 0) + kvm_dequeue_irq(vcpu, -intr); + else { + kvm_err("%s: invalid interrupt ioctl %d\n", __func__, irq->irq); + return -EINVAL; + } + + kvm_vcpu_kick(vcpu); + + return 0; +} + +long kvm_arch_vcpu_async_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct kvm_vcpu *vcpu = filp->private_data; + + if (ioctl == KVM_INTERRUPT) { + struct kvm_interrupt irq; + + if (copy_from_user(&irq, argp, sizeof(irq))) + return -EFAULT; + + kvm_debug("[%d] %s: irq: %d\n", vcpu->vcpu_id, __func__, irq.irq); + + return kvm_vcpu_ioctl_interrupt(vcpu, &irq); + } + + return -ENOIOCTLCMD; +} + +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) +{ + return 0; +} + +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) +{ + unsigned long timer_hz; + struct loongarch_csrs *csr; + + vcpu->arch.vpid = 0; + + hrtimer_init(&vcpu->arch.swtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + vcpu->arch.swtimer.function = kvm_swtimer_wakeup; + + vcpu->arch.handle_exit = kvm_handle_exit; + vcpu->arch.guest_eentry = (unsigned long)kvm_loongarch_ops->exc_entry; + vcpu->arch.csr = kzalloc(sizeof(struct loongarch_csrs), GFP_KERNEL); + if (!vcpu->arch.csr) + return -ENOMEM; + + /* + * All kvm exceptions share one exception entry, and host <-> guest + * switch also switch ECFG.VS field, keep host ECFG.VS info here. + */ + vcpu->arch.host_ecfg = (read_csr_ecfg() & CSR_ECFG_VS); + + /* Init */ + vcpu->arch.last_sched_cpu = -1; + + /* + * Initialize guest register state to valid architectural reset state. + */ + timer_hz = calc_const_freq(); + kvm_init_timer(vcpu, timer_hz); + + /* Set Initialize mode for guest */ + csr = vcpu->arch.csr; + kvm_write_sw_gcsr(csr, LOONGARCH_CSR_CRMD, CSR_CRMD_DA); + + /* Set cpuid */ + kvm_write_sw_gcsr(csr, LOONGARCH_CSR_TMID, vcpu->vcpu_id); + + /* Start with no pending virtual guest interrupts */ + csr->csrs[LOONGARCH_CSR_GINTC] = 0; + + return 0; +} + +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ +} + +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + int cpu; + struct kvm_context *context; + + hrtimer_cancel(&vcpu->arch.swtimer); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); + kfree(vcpu->arch.csr); + + /* + * If the vCPU is freed and reused as another vCPU, we don't want the + * matching pointer wrongly hanging around in last_vcpu. + */ + for_each_possible_cpu(cpu) { + context = per_cpu_ptr(vcpu->kvm->arch.vmcs, cpu); + if (context->last_vcpu == vcpu) + context->last_vcpu = NULL; + } +} + +static int _kvm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + bool migrated; + struct kvm_context *context; + struct loongarch_csrs *csr = vcpu->arch.csr; + + /* + * Have we migrated to a different CPU? + * If so, any old guest TLB state may be stale. + */ + migrated = (vcpu->arch.last_sched_cpu != cpu); + + /* + * Was this the last vCPU to run on this CPU? + * If not, any old guest state from this vCPU will have been clobbered. + */ + context = per_cpu_ptr(vcpu->kvm->arch.vmcs, cpu); + if (migrated || (context->last_vcpu != vcpu)) + vcpu->arch.aux_inuse &= ~KVM_LARCH_HWCSR_USABLE; + context->last_vcpu = vcpu; + + /* Restore timer state regardless */ + kvm_restore_timer(vcpu); + + /* Control guest page CCA attribute */ + change_csr_gcfg(CSR_GCFG_MATC_MASK, CSR_GCFG_MATC_ROOT); + + /* Don't bother restoring registers multiple times unless necessary */ + if (vcpu->arch.aux_inuse & KVM_LARCH_HWCSR_USABLE) + return 0; + + write_csr_gcntc((ulong)vcpu->kvm->arch.time_offset); + + /* Restore guest CSR registers */ + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_CRMD); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PRMD); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_EUEN); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_MISC); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ECFG); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ERA); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_BADV); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_BADI); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_EENTRY); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBIDX); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBEHI); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBELO0); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBELO1); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ASID); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PGDL); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PGDH); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PWCTL0); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PWCTL1); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_STLBPGSIZE); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_RVACFG); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_CPUID); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_KS0); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_KS1); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_KS2); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_KS3); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_KS4); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_KS5); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_KS6); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_KS7); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TMID); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_CNTC); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBRENTRY); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBRBADV); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBRERA); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBRSAVE); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBRELO0); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBRELO1); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBREHI); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_TLBRPRMD); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_DMWIN0); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_DMWIN1); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_DMWIN2); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_DMWIN3); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_LLBCTL); + + /* Restore Root.GINTC from unused Guest.GINTC register */ + write_csr_gintc(csr->csrs[LOONGARCH_CSR_GINTC]); + + /* + * We should clear linked load bit to break interrupted atomics. This + * prevents a SC on the next vCPU from succeeding by matching a LL on + * the previous vCPU. + */ + if (vcpu->kvm->created_vcpus > 1) + set_gcsr_llbctl(CSR_LLBCTL_WCLLB); + + vcpu->arch.aux_inuse |= KVM_LARCH_HWCSR_USABLE; + + return 0; +} + +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + unsigned long flags; + + local_irq_save(flags); + if (vcpu->arch.last_sched_cpu != cpu) { + kvm_debug("[%d->%d]KVM vCPU[%d] switch\n", + vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id); + /* + * Migrate the timer interrupt to the current CPU so that it + * always interrupts the guest and synchronously triggers a + * guest timer interrupt. + */ + kvm_migrate_count(vcpu); + } + + /* Restore guest state to registers */ + _kvm_vcpu_load(vcpu, cpu); + local_irq_restore(flags); +} + +static int _kvm_vcpu_put(struct kvm_vcpu *vcpu, int cpu) +{ + struct loongarch_csrs *csr = vcpu->arch.csr; + + kvm_lose_fpu(vcpu); + + /* + * Update CSR state from hardware if software CSR state is stale, + * most CSR registers are kept unchanged during process context + * switch except CSR registers like remaining timer tick value and + * injected interrupt state. + */ + if (vcpu->arch.aux_inuse & KVM_LARCH_SWCSR_LATEST) + goto out; + + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_CRMD); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PRMD); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_EUEN); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_MISC); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ECFG); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ERA); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_BADV); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_BADI); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_EENTRY); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBIDX); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBEHI); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBELO0); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBELO1); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ASID); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PGDL); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PGDH); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PWCTL0); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PWCTL1); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_STLBPGSIZE); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_RVACFG); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_CPUID); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PRCFG1); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PRCFG2); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PRCFG3); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_KS0); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_KS1); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_KS2); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_KS3); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_KS4); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_KS5); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_KS6); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_KS7); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TMID); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_CNTC); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_LLBCTL); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBRENTRY); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBRBADV); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBRERA); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBRSAVE); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBRELO0); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBRELO1); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBREHI); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_TLBRPRMD); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_DMWIN0); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_DMWIN1); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_DMWIN2); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_DMWIN3); + + vcpu->arch.aux_inuse |= KVM_LARCH_SWCSR_LATEST; + +out: + kvm_save_timer(vcpu); + /* Save Root.GINTC into unused Guest.GINTC register */ + csr->csrs[LOONGARCH_CSR_GINTC] = read_csr_gintc(); + + return 0; +} + +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) +{ + int cpu; + unsigned long flags; + + local_irq_save(flags); + cpu = smp_processor_id(); + vcpu->arch.last_sched_cpu = cpu; + + /* Save guest state in registers */ + _kvm_vcpu_put(vcpu, cpu); + local_irq_restore(flags); +} + +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) +{ + int r = -EINTR; + struct kvm_run *run = vcpu->run; + + if (vcpu->mmio_needed) { + if (!vcpu->mmio_is_write) + kvm_complete_mmio_read(vcpu, run); + vcpu->mmio_needed = 0; + } + + if (run->exit_reason == KVM_EXIT_LOONGARCH_IOCSR) { + if (!run->iocsr_io.is_write) + kvm_complete_iocsr_read(vcpu, run); + } + + if (run->immediate_exit) + return r; + + /* Clear exit_reason */ + run->exit_reason = KVM_EXIT_UNKNOWN; + lose_fpu(1); + vcpu_load(vcpu); + kvm_sigset_activate(vcpu); + r = kvm_pre_enter_guest(vcpu); + if (r != RESUME_GUEST) + goto out; + + guest_timing_enter_irqoff(); + guest_state_enter_irqoff(); + trace_kvm_enter(vcpu); + r = kvm_loongarch_ops->enter_guest(run, vcpu); + + trace_kvm_out(vcpu); + /* + * Guest exit is already recorded at kvm_handle_exit() + * return value must not be RESUME_GUEST + */ + local_irq_enable(); +out: + kvm_sigset_deactivate(vcpu); + vcpu_put(vcpu); + + return r; +} diff --git a/arch/loongarch/kvm/vm.c b/arch/loongarch/kvm/vm.c new file mode 100644 index 0000000000..0a37f6fa8f --- /dev/null +++ b/arch/loongarch/kvm/vm.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#include <linux/kvm_host.h> +#include <asm/kvm_mmu.h> + +const struct _kvm_stats_desc kvm_vm_stats_desc[] = { + KVM_GENERIC_VM_STATS(), + STATS_DESC_ICOUNTER(VM, pages), + STATS_DESC_ICOUNTER(VM, hugepages), +}; + +const struct kvm_stats_header kvm_vm_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vm_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vm_stats_desc), +}; + +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) +{ + int i; + + /* Allocate page table to map GPA -> RPA */ + kvm->arch.pgd = kvm_pgd_alloc(); + if (!kvm->arch.pgd) + return -ENOMEM; + + kvm_init_vmcs(kvm); + kvm->arch.gpa_size = BIT(cpu_vabits - 1); + kvm->arch.root_level = CONFIG_PGTABLE_LEVELS - 1; + kvm->arch.invalid_ptes[0] = 0; + kvm->arch.invalid_ptes[1] = (unsigned long)invalid_pte_table; +#if CONFIG_PGTABLE_LEVELS > 2 + kvm->arch.invalid_ptes[2] = (unsigned long)invalid_pmd_table; +#endif +#if CONFIG_PGTABLE_LEVELS > 3 + kvm->arch.invalid_ptes[3] = (unsigned long)invalid_pud_table; +#endif + for (i = 0; i <= kvm->arch.root_level; i++) + kvm->arch.pte_shifts[i] = PAGE_SHIFT + i * (PAGE_SHIFT - 3); + + return 0; +} + +void kvm_arch_destroy_vm(struct kvm *kvm) +{ + kvm_destroy_vcpus(kvm); + free_page((unsigned long)kvm->arch.pgd); + kvm->arch.pgd = NULL; +} + +int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) +{ + int r; + + switch (ext) { + case KVM_CAP_ONE_REG: + case KVM_CAP_ENABLE_CAP: + case KVM_CAP_READONLY_MEM: + case KVM_CAP_SYNC_MMU: + case KVM_CAP_IMMEDIATE_EXIT: + case KVM_CAP_IOEVENTFD: + case KVM_CAP_MP_STATE: + r = 1; + break; + case KVM_CAP_NR_VCPUS: + r = num_online_cpus(); + break; + case KVM_CAP_MAX_VCPUS: + r = KVM_MAX_VCPUS; + break; + case KVM_CAP_MAX_VCPU_ID: + r = KVM_MAX_VCPU_IDS; + break; + case KVM_CAP_NR_MEMSLOTS: + r = KVM_USER_MEM_SLOTS; + break; + default: + r = 0; + break; + } + + return r; +} + +int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + return -ENOIOCTLCMD; +} |