From 5d1646d90e1f2cceb9f0828f4b28318cd0ec7744 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 27 Apr 2024 12:05:51 +0200 Subject: Adding upstream version 5.10.209. Signed-off-by: Daniel Baumann --- tools/testing/selftests/kvm/.gitignore | 33 + tools/testing/selftests/kvm/Makefile | 144 ++ .../selftests/kvm/aarch64/get-reg-list-sve.c | 3 + tools/testing/selftests/kvm/aarch64/get-reg-list.c | 841 ++++++++ tools/testing/selftests/kvm/config | 3 + tools/testing/selftests/kvm/demand_paging_test.c | 498 +++++ tools/testing/selftests/kvm/dirty_log_perf_test.c | 376 ++++ tools/testing/selftests/kvm/dirty_log_test.c | 639 ++++++ .../selftests/kvm/include/aarch64/processor.h | 59 + tools/testing/selftests/kvm/include/evmcs.h | 1102 +++++++++++ tools/testing/selftests/kvm/include/kvm_util.h | 348 ++++ .../testing/selftests/kvm/include/perf_test_util.h | 198 ++ .../selftests/kvm/include/s390x/processor.h | 22 + tools/testing/selftests/kvm/include/sparsebit.h | 73 + tools/testing/selftests/kvm/include/test_util.h | 70 + .../selftests/kvm/include/x86_64/processor.h | 422 ++++ tools/testing/selftests/kvm/include/x86_64/svm.h | 297 +++ .../selftests/kvm/include/x86_64/svm_util.h | 49 + tools/testing/selftests/kvm/include/x86_64/vmx.h | 625 ++++++ tools/testing/selftests/kvm/kvm_create_max_vcpus.c | 96 + .../testing/selftests/kvm/lib/aarch64/processor.c | 356 ++++ tools/testing/selftests/kvm/lib/aarch64/ucall.c | 114 ++ tools/testing/selftests/kvm/lib/assert.c | 93 + tools/testing/selftests/kvm/lib/elf.c | 196 ++ tools/testing/selftests/kvm/lib/io.c | 157 ++ tools/testing/selftests/kvm/lib/kvm_util.c | 1865 +++++++++++++++++ .../testing/selftests/kvm/lib/kvm_util_internal.h | 113 ++ tools/testing/selftests/kvm/lib/s390x/processor.c | 247 +++ tools/testing/selftests/kvm/lib/s390x/ucall.c | 59 + tools/testing/selftests/kvm/lib/sparsebit.c | 2086 ++++++++++++++++++++ tools/testing/selftests/kvm/lib/test_util.c | 111 ++ tools/testing/selftests/kvm/lib/x86_64/handlers.S | 81 + tools/testing/selftests/kvm/lib/x86_64/processor.c | 1258 ++++++++++++ tools/testing/selftests/kvm/lib/x86_64/svm.c | 177 ++ tools/testing/selftests/kvm/lib/x86_64/ucall.c | 59 + tools/testing/selftests/kvm/lib/x86_64/vmx.c | 553 ++++++ tools/testing/selftests/kvm/s390x/memop.c | 166 ++ tools/testing/selftests/kvm/s390x/resets.c | 279 +++ tools/testing/selftests/kvm/s390x/sync_regs_test.c | 193 ++ .../testing/selftests/kvm/set_memory_region_test.c | 417 ++++ tools/testing/selftests/kvm/steal_time.c | 352 ++++ .../selftests/kvm/x86_64/cr4_cpuid_sync_test.c | 117 ++ tools/testing/selftests/kvm/x86_64/debug_regs.c | 202 ++ tools/testing/selftests/kvm/x86_64/evmcs_test.c | 166 ++ tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c | 190 ++ tools/testing/selftests/kvm/x86_64/kvm_pv_test.c | 234 +++ .../selftests/kvm/x86_64/mmio_warning_test.c | 127 ++ .../selftests/kvm/x86_64/platform_info_test.c | 107 + .../testing/selftests/kvm/x86_64/set_sregs_test.c | 52 + tools/testing/selftests/kvm/x86_64/smm_test.c | 164 ++ tools/testing/selftests/kvm/x86_64/state_test.c | 233 +++ .../testing/selftests/kvm/x86_64/svm_vmcall_test.c | 77 + .../testing/selftests/kvm/x86_64/sync_regs_test.c | 243 +++ tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c | 168 ++ tools/testing/selftests/kvm/x86_64/user_msr_test.c | 248 +++ .../selftests/kvm/x86_64/vmx_apic_access_test.c | 142 ++ .../kvm/x86_64/vmx_close_while_nested_test.c | 87 + .../selftests/kvm/x86_64/vmx_dirty_log_test.c | 157 ++ .../kvm/x86_64/vmx_preemption_timer_test.c | 259 +++ .../kvm/x86_64/vmx_set_nested_state_test.c | 298 +++ .../selftests/kvm/x86_64/vmx_tsc_adjust_test.c | 168 ++ tools/testing/selftests/kvm/x86_64/xss_msr_test.c | 76 + 62 files changed, 18345 insertions(+) create mode 100644 tools/testing/selftests/kvm/.gitignore create mode 100644 tools/testing/selftests/kvm/Makefile create mode 100644 tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c create mode 100644 tools/testing/selftests/kvm/aarch64/get-reg-list.c create mode 100644 tools/testing/selftests/kvm/config create mode 100644 tools/testing/selftests/kvm/demand_paging_test.c create mode 100644 tools/testing/selftests/kvm/dirty_log_perf_test.c create mode 100644 tools/testing/selftests/kvm/dirty_log_test.c create mode 100644 tools/testing/selftests/kvm/include/aarch64/processor.h create mode 100644 tools/testing/selftests/kvm/include/evmcs.h create mode 100644 tools/testing/selftests/kvm/include/kvm_util.h create mode 100644 tools/testing/selftests/kvm/include/perf_test_util.h create mode 100644 tools/testing/selftests/kvm/include/s390x/processor.h create mode 100644 tools/testing/selftests/kvm/include/sparsebit.h create mode 100644 tools/testing/selftests/kvm/include/test_util.h create mode 100644 tools/testing/selftests/kvm/include/x86_64/processor.h create mode 100644 tools/testing/selftests/kvm/include/x86_64/svm.h create mode 100644 tools/testing/selftests/kvm/include/x86_64/svm_util.h create mode 100644 tools/testing/selftests/kvm/include/x86_64/vmx.h create mode 100644 tools/testing/selftests/kvm/kvm_create_max_vcpus.c create mode 100644 tools/testing/selftests/kvm/lib/aarch64/processor.c create mode 100644 tools/testing/selftests/kvm/lib/aarch64/ucall.c create mode 100644 tools/testing/selftests/kvm/lib/assert.c create mode 100644 tools/testing/selftests/kvm/lib/elf.c create mode 100644 tools/testing/selftests/kvm/lib/io.c create mode 100644 tools/testing/selftests/kvm/lib/kvm_util.c create mode 100644 tools/testing/selftests/kvm/lib/kvm_util_internal.h create mode 100644 tools/testing/selftests/kvm/lib/s390x/processor.c create mode 100644 tools/testing/selftests/kvm/lib/s390x/ucall.c create mode 100644 tools/testing/selftests/kvm/lib/sparsebit.c create mode 100644 tools/testing/selftests/kvm/lib/test_util.c create mode 100644 tools/testing/selftests/kvm/lib/x86_64/handlers.S create mode 100644 tools/testing/selftests/kvm/lib/x86_64/processor.c create mode 100644 tools/testing/selftests/kvm/lib/x86_64/svm.c create mode 100644 tools/testing/selftests/kvm/lib/x86_64/ucall.c create mode 100644 tools/testing/selftests/kvm/lib/x86_64/vmx.c create mode 100644 tools/testing/selftests/kvm/s390x/memop.c create mode 100644 tools/testing/selftests/kvm/s390x/resets.c create mode 100644 tools/testing/selftests/kvm/s390x/sync_regs_test.c create mode 100644 tools/testing/selftests/kvm/set_memory_region_test.c create mode 100644 tools/testing/selftests/kvm/steal_time.c create mode 100644 tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c create mode 100644 tools/testing/selftests/kvm/x86_64/debug_regs.c create mode 100644 tools/testing/selftests/kvm/x86_64/evmcs_test.c create mode 100644 tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c create mode 100644 tools/testing/selftests/kvm/x86_64/kvm_pv_test.c create mode 100644 tools/testing/selftests/kvm/x86_64/mmio_warning_test.c create mode 100644 tools/testing/selftests/kvm/x86_64/platform_info_test.c create mode 100644 tools/testing/selftests/kvm/x86_64/set_sregs_test.c create mode 100644 tools/testing/selftests/kvm/x86_64/smm_test.c create mode 100644 tools/testing/selftests/kvm/x86_64/state_test.c create mode 100644 tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c create mode 100644 tools/testing/selftests/kvm/x86_64/sync_regs_test.c create mode 100644 tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c create mode 100644 tools/testing/selftests/kvm/x86_64/user_msr_test.c create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c create mode 100644 tools/testing/selftests/kvm/x86_64/xss_msr_test.c (limited to 'tools/testing/selftests/kvm') diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore new file mode 100644 index 000000000..7a2c242b7 --- /dev/null +++ b/tools/testing/selftests/kvm/.gitignore @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: GPL-2.0-only +/aarch64/get-reg-list +/aarch64/get-reg-list-sve +/s390x/memop +/s390x/resets +/s390x/sync_regs_test +/x86_64/cr4_cpuid_sync_test +/x86_64/debug_regs +/x86_64/evmcs_test +/x86_64/kvm_pv_test +/x86_64/hyperv_cpuid +/x86_64/mmio_warning_test +/x86_64/platform_info_test +/x86_64/set_sregs_test +/x86_64/smm_test +/x86_64/state_test +/x86_64/user_msr_test +/x86_64/vmx_preemption_timer_test +/x86_64/svm_vmcall_test +/x86_64/sync_regs_test +/x86_64/vmx_apic_access_test +/x86_64/vmx_close_while_nested_test +/x86_64/vmx_dirty_log_test +/x86_64/vmx_set_nested_state_test +/x86_64/vmx_tsc_adjust_test +/x86_64/xss_msr_test +/clear_dirty_log_test +/demand_paging_test +/dirty_log_test +/dirty_log_perf_test +/kvm_create_max_vcpus +/set_memory_region_test +/steal_time diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile new file mode 100644 index 000000000..3d14ef777 --- /dev/null +++ b/tools/testing/selftests/kvm/Makefile @@ -0,0 +1,144 @@ +# SPDX-License-Identifier: GPL-2.0-only +include ../../../../scripts/Kbuild.include + +all: + +top_srcdir = ../../../.. +KSFT_KHDR_INSTALL := 1 + +# For cross-builds to work, UNAME_M has to map to ARCH and arch specific +# directories and targets in this Makefile. "uname -m" doesn't map to +# arch specific sub-directory names. +# +# UNAME_M variable to used to run the compiles pointing to the right arch +# directories and build the right targets for these supported architectures. +# +# TEST_GEN_PROGS and LIBKVM are set using UNAME_M variable. +# LINUX_TOOL_ARCH_INCLUDE is set using ARCH variable. +# +# x86_64 targets are named to include x86_64 as a suffix and directories +# for includes are in x86_64 sub-directory. s390x and aarch64 follow the +# same convention. "uname -m" doesn't result in the correct mapping for +# s390x and aarch64. +# +# No change necessary for x86_64 +UNAME_M := $(shell uname -m) + +# Set UNAME_M for arm64 compile/install to work +ifeq ($(ARCH),arm64) + UNAME_M := aarch64 +endif +# Set UNAME_M s390x compile/install to work +ifeq ($(ARCH),s390) + UNAME_M := s390x +endif + +LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c lib/test_util.c +LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c lib/x86_64/handlers.S +LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c +LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c + +TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test +TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid +TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test +TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test +TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test +TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test +TEST_GEN_PROGS_x86_64 += x86_64/smm_test +TEST_GEN_PROGS_x86_64 += x86_64/state_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test +TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test +TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test +TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test +TEST_GEN_PROGS_x86_64 += x86_64/debug_regs +TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test +TEST_GEN_PROGS_x86_64 += x86_64/user_msr_test +TEST_GEN_PROGS_x86_64 += demand_paging_test +TEST_GEN_PROGS_x86_64 += dirty_log_test +TEST_GEN_PROGS_x86_64 += dirty_log_perf_test +TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus +TEST_GEN_PROGS_x86_64 += set_memory_region_test +TEST_GEN_PROGS_x86_64 += steal_time + +TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list +TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list-sve +TEST_GEN_PROGS_aarch64 += demand_paging_test +TEST_GEN_PROGS_aarch64 += dirty_log_test +TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus +TEST_GEN_PROGS_aarch64 += set_memory_region_test +TEST_GEN_PROGS_aarch64 += steal_time + +TEST_GEN_PROGS_s390x = s390x/memop +TEST_GEN_PROGS_s390x += s390x/resets +TEST_GEN_PROGS_s390x += s390x/sync_regs_test +TEST_GEN_PROGS_s390x += demand_paging_test +TEST_GEN_PROGS_s390x += dirty_log_test +TEST_GEN_PROGS_s390x += kvm_create_max_vcpus +TEST_GEN_PROGS_s390x += set_memory_region_test + +TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M)) +LIBKVM += $(LIBKVM_$(UNAME_M)) + +INSTALL_HDR_PATH = $(top_srcdir)/usr +LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/ +LINUX_TOOL_INCLUDE = $(top_srcdir)/tools/include +ifeq ($(ARCH),x86_64) +LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/x86/include +else +LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include +endif +CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ + -fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \ + -I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \ + -I$( cscope.files + cscope -b diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c b/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c new file mode 100644 index 000000000..efba76682 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c @@ -0,0 +1,3 @@ +// SPDX-License-Identifier: GPL-2.0 +#define REG_LIST_SVE +#include "get-reg-list.c" diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c new file mode 100644 index 000000000..33218a395 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -0,0 +1,841 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Check for KVM_GET_REG_LIST regressions. + * + * Copyright (C) 2020, Red Hat, Inc. + * + * When attempting to migrate from a host with an older kernel to a host + * with a newer kernel we allow the newer kernel on the destination to + * list new registers with get-reg-list. We assume they'll be unused, at + * least until the guest reboots, and so they're relatively harmless. + * However, if the destination host with the newer kernel is missing + * registers which the source host with the older kernel has, then that's + * a regression in get-reg-list. This test checks for that regression by + * checking the current list against a blessed list. We should never have + * missing registers, but if new ones appear then they can probably be + * added to the blessed list. A completely new blessed list can be created + * by running the test with the --list command line argument. + * + * Note, the blessed list should be created from the oldest possible + * kernel. We can't go older than v4.15, though, because that's the first + * release to expose the ID system registers in KVM_GET_REG_LIST, see + * commit 93390c0a1b20 ("arm64: KVM: Hide unsupported AArch64 CPU features + * from guests"). Also, one must use the --core-reg-fixup command line + * option when running on an older kernel that doesn't include df205b5c6328 + * ("KVM: arm64: Filter out invalid core register IDs in KVM_GET_REG_LIST") + */ +#include +#include +#include +#include "kvm_util.h" +#include "test_util.h" +#include "processor.h" + +#ifdef REG_LIST_SVE +#define reg_list_sve() (true) +#else +#define reg_list_sve() (false) +#endif + +#define REG_MASK (KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_COPROC_MASK) + +#define for_each_reg(i) \ + for ((i) = 0; (i) < reg_list->n; ++(i)) + +#define for_each_missing_reg(i) \ + for ((i) = 0; (i) < blessed_n; ++(i)) \ + if (!find_reg(reg_list->reg, reg_list->n, blessed_reg[i])) + +#define for_each_new_reg(i) \ + for ((i) = 0; (i) < reg_list->n; ++(i)) \ + if (!find_reg(blessed_reg, blessed_n, reg_list->reg[i])) + + +static struct kvm_reg_list *reg_list; + +static __u64 base_regs[], vregs[], sve_regs[], rejects_set[]; +static __u64 base_regs_n, vregs_n, sve_regs_n, rejects_set_n; +static __u64 *blessed_reg, blessed_n; + +static bool find_reg(__u64 regs[], __u64 nr_regs, __u64 reg) +{ + int i; + + for (i = 0; i < nr_regs; ++i) + if (reg == regs[i]) + return true; + return false; +} + +static const char *str_with_index(const char *template, __u64 index) +{ + char *str, *p; + int n; + + str = strdup(template); + p = strstr(str, "##"); + n = sprintf(p, "%lld", index); + strcat(p + n, strstr(template, "##") + 2); + + return (const char *)str; +} + +#define CORE_REGS_XX_NR_WORDS 2 +#define CORE_SPSR_XX_NR_WORDS 2 +#define CORE_FPREGS_XX_NR_WORDS 4 + +static const char *core_id_to_str(__u64 id) +{ + __u64 core_off = id & ~REG_MASK, idx; + + /* + * core_off is the offset into struct kvm_regs + */ + switch (core_off) { + case KVM_REG_ARM_CORE_REG(regs.regs[0]) ... + KVM_REG_ARM_CORE_REG(regs.regs[30]): + idx = (core_off - KVM_REG_ARM_CORE_REG(regs.regs[0])) / CORE_REGS_XX_NR_WORDS; + TEST_ASSERT(idx < 31, "Unexpected regs.regs index: %lld", idx); + return str_with_index("KVM_REG_ARM_CORE_REG(regs.regs[##])", idx); + case KVM_REG_ARM_CORE_REG(regs.sp): + return "KVM_REG_ARM_CORE_REG(regs.sp)"; + case KVM_REG_ARM_CORE_REG(regs.pc): + return "KVM_REG_ARM_CORE_REG(regs.pc)"; + case KVM_REG_ARM_CORE_REG(regs.pstate): + return "KVM_REG_ARM_CORE_REG(regs.pstate)"; + case KVM_REG_ARM_CORE_REG(sp_el1): + return "KVM_REG_ARM_CORE_REG(sp_el1)"; + case KVM_REG_ARM_CORE_REG(elr_el1): + return "KVM_REG_ARM_CORE_REG(elr_el1)"; + case KVM_REG_ARM_CORE_REG(spsr[0]) ... + KVM_REG_ARM_CORE_REG(spsr[KVM_NR_SPSR - 1]): + idx = (core_off - KVM_REG_ARM_CORE_REG(spsr[0])) / CORE_SPSR_XX_NR_WORDS; + TEST_ASSERT(idx < KVM_NR_SPSR, "Unexpected spsr index: %lld", idx); + return str_with_index("KVM_REG_ARM_CORE_REG(spsr[##])", idx); + case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ... + KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]): + idx = (core_off - KVM_REG_ARM_CORE_REG(fp_regs.vregs[0])) / CORE_FPREGS_XX_NR_WORDS; + TEST_ASSERT(idx < 32, "Unexpected fp_regs.vregs index: %lld", idx); + return str_with_index("KVM_REG_ARM_CORE_REG(fp_regs.vregs[##])", idx); + case KVM_REG_ARM_CORE_REG(fp_regs.fpsr): + return "KVM_REG_ARM_CORE_REG(fp_regs.fpsr)"; + case KVM_REG_ARM_CORE_REG(fp_regs.fpcr): + return "KVM_REG_ARM_CORE_REG(fp_regs.fpcr)"; + } + + TEST_FAIL("Unknown core reg id: 0x%llx", id); + return NULL; +} + +static const char *sve_id_to_str(__u64 id) +{ + __u64 sve_off, n, i; + + if (id == KVM_REG_ARM64_SVE_VLS) + return "KVM_REG_ARM64_SVE_VLS"; + + sve_off = id & ~(REG_MASK | ((1ULL << 5) - 1)); + i = id & (KVM_ARM64_SVE_MAX_SLICES - 1); + + TEST_ASSERT(i == 0, "Currently we don't expect slice > 0, reg id 0x%llx", id); + + switch (sve_off) { + case KVM_REG_ARM64_SVE_ZREG_BASE ... + KVM_REG_ARM64_SVE_ZREG_BASE + (1ULL << 5) * KVM_ARM64_SVE_NUM_ZREGS - 1: + n = (id >> 5) & (KVM_ARM64_SVE_NUM_ZREGS - 1); + TEST_ASSERT(id == KVM_REG_ARM64_SVE_ZREG(n, 0), + "Unexpected bits set in SVE ZREG id: 0x%llx", id); + return str_with_index("KVM_REG_ARM64_SVE_ZREG(##, 0)", n); + case KVM_REG_ARM64_SVE_PREG_BASE ... + KVM_REG_ARM64_SVE_PREG_BASE + (1ULL << 5) * KVM_ARM64_SVE_NUM_PREGS - 1: + n = (id >> 5) & (KVM_ARM64_SVE_NUM_PREGS - 1); + TEST_ASSERT(id == KVM_REG_ARM64_SVE_PREG(n, 0), + "Unexpected bits set in SVE PREG id: 0x%llx", id); + return str_with_index("KVM_REG_ARM64_SVE_PREG(##, 0)", n); + case KVM_REG_ARM64_SVE_FFR_BASE: + TEST_ASSERT(id == KVM_REG_ARM64_SVE_FFR(0), + "Unexpected bits set in SVE FFR id: 0x%llx", id); + return "KVM_REG_ARM64_SVE_FFR(0)"; + } + + return NULL; +} + +static void print_reg(__u64 id) +{ + unsigned op0, op1, crn, crm, op2; + const char *reg_size = NULL; + + TEST_ASSERT((id & KVM_REG_ARCH_MASK) == KVM_REG_ARM64, + "KVM_REG_ARM64 missing in reg id: 0x%llx", id); + + switch (id & KVM_REG_SIZE_MASK) { + case KVM_REG_SIZE_U8: + reg_size = "KVM_REG_SIZE_U8"; + break; + case KVM_REG_SIZE_U16: + reg_size = "KVM_REG_SIZE_U16"; + break; + case KVM_REG_SIZE_U32: + reg_size = "KVM_REG_SIZE_U32"; + break; + case KVM_REG_SIZE_U64: + reg_size = "KVM_REG_SIZE_U64"; + break; + case KVM_REG_SIZE_U128: + reg_size = "KVM_REG_SIZE_U128"; + break; + case KVM_REG_SIZE_U256: + reg_size = "KVM_REG_SIZE_U256"; + break; + case KVM_REG_SIZE_U512: + reg_size = "KVM_REG_SIZE_U512"; + break; + case KVM_REG_SIZE_U1024: + reg_size = "KVM_REG_SIZE_U1024"; + break; + case KVM_REG_SIZE_U2048: + reg_size = "KVM_REG_SIZE_U2048"; + break; + default: + TEST_FAIL("Unexpected reg size: 0x%llx in reg id: 0x%llx", + (id & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT, id); + } + + switch (id & KVM_REG_ARM_COPROC_MASK) { + case KVM_REG_ARM_CORE: + printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_CORE | %s,\n", reg_size, core_id_to_str(id)); + break; + case KVM_REG_ARM_DEMUX: + TEST_ASSERT(!(id & ~(REG_MASK | KVM_REG_ARM_DEMUX_ID_MASK | KVM_REG_ARM_DEMUX_VAL_MASK)), + "Unexpected bits set in DEMUX reg id: 0x%llx", id); + printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | %lld,\n", + reg_size, id & KVM_REG_ARM_DEMUX_VAL_MASK); + break; + case KVM_REG_ARM64_SYSREG: + op0 = (id & KVM_REG_ARM64_SYSREG_OP0_MASK) >> KVM_REG_ARM64_SYSREG_OP0_SHIFT; + op1 = (id & KVM_REG_ARM64_SYSREG_OP1_MASK) >> KVM_REG_ARM64_SYSREG_OP1_SHIFT; + crn = (id & KVM_REG_ARM64_SYSREG_CRN_MASK) >> KVM_REG_ARM64_SYSREG_CRN_SHIFT; + crm = (id & KVM_REG_ARM64_SYSREG_CRM_MASK) >> KVM_REG_ARM64_SYSREG_CRM_SHIFT; + op2 = (id & KVM_REG_ARM64_SYSREG_OP2_MASK) >> KVM_REG_ARM64_SYSREG_OP2_SHIFT; + TEST_ASSERT(id == ARM64_SYS_REG(op0, op1, crn, crm, op2), + "Unexpected bits set in SYSREG reg id: 0x%llx", id); + printf("\tARM64_SYS_REG(%d, %d, %d, %d, %d),\n", op0, op1, crn, crm, op2); + break; + case KVM_REG_ARM_FW: + TEST_ASSERT(id == KVM_REG_ARM_FW_REG(id & 0xffff), + "Unexpected bits set in FW reg id: 0x%llx", id); + printf("\tKVM_REG_ARM_FW_REG(%lld),\n", id & 0xffff); + break; + case KVM_REG_ARM64_SVE: + if (reg_list_sve()) + printf("\t%s,\n", sve_id_to_str(id)); + else + TEST_FAIL("KVM_REG_ARM64_SVE is an unexpected coproc type in reg id: 0x%llx", id); + break; + default: + TEST_FAIL("Unexpected coproc type: 0x%llx in reg id: 0x%llx", + (id & KVM_REG_ARM_COPROC_MASK) >> KVM_REG_ARM_COPROC_SHIFT, id); + } +} + +/* + * Older kernels listed each 32-bit word of CORE registers separately. + * For 64 and 128-bit registers we need to ignore the extra words. We + * also need to fixup the sizes, because the older kernels stated all + * registers were 64-bit, even when they weren't. + */ +static void core_reg_fixup(void) +{ + struct kvm_reg_list *tmp; + __u64 id, core_off; + int i; + + tmp = calloc(1, sizeof(*tmp) + reg_list->n * sizeof(__u64)); + + for (i = 0; i < reg_list->n; ++i) { + id = reg_list->reg[i]; + + if ((id & KVM_REG_ARM_COPROC_MASK) != KVM_REG_ARM_CORE) { + tmp->reg[tmp->n++] = id; + continue; + } + + core_off = id & ~REG_MASK; + + switch (core_off) { + case 0x52: case 0xd2: case 0xd6: + /* + * These offsets are pointing at padding. + * We need to ignore them too. + */ + continue; + case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ... + KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]): + if (core_off & 3) + continue; + id &= ~KVM_REG_SIZE_MASK; + id |= KVM_REG_SIZE_U128; + tmp->reg[tmp->n++] = id; + continue; + case KVM_REG_ARM_CORE_REG(fp_regs.fpsr): + case KVM_REG_ARM_CORE_REG(fp_regs.fpcr): + id &= ~KVM_REG_SIZE_MASK; + id |= KVM_REG_SIZE_U32; + tmp->reg[tmp->n++] = id; + continue; + default: + if (core_off & 1) + continue; + tmp->reg[tmp->n++] = id; + break; + } + } + + free(reg_list); + reg_list = tmp; +} + +static void prepare_vcpu_init(struct kvm_vcpu_init *init) +{ + if (reg_list_sve()) + init->features[0] |= 1 << KVM_ARM_VCPU_SVE; +} + +static void finalize_vcpu(struct kvm_vm *vm, uint32_t vcpuid) +{ + int feature; + + if (reg_list_sve()) { + feature = KVM_ARM_VCPU_SVE; + vcpu_ioctl(vm, vcpuid, KVM_ARM_VCPU_FINALIZE, &feature); + } +} + +static void check_supported(void) +{ + if (reg_list_sve() && !kvm_check_cap(KVM_CAP_ARM_SVE)) { + fprintf(stderr, "SVE not available, skipping tests\n"); + exit(KSFT_SKIP); + } +} + +int main(int ac, char **av) +{ + struct kvm_vcpu_init init = { .target = -1, }; + int new_regs = 0, missing_regs = 0, i; + int failed_get = 0, failed_set = 0, failed_reject = 0; + bool print_list = false, fixup_core_regs = false; + struct kvm_vm *vm; + __u64 *vec_regs; + + check_supported(); + + for (i = 1; i < ac; ++i) { + if (strcmp(av[i], "--core-reg-fixup") == 0) + fixup_core_regs = true; + else if (strcmp(av[i], "--list") == 0) + print_list = true; + else + fprintf(stderr, "Ignoring unknown option: %s\n", av[i]); + } + + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + prepare_vcpu_init(&init); + aarch64_vcpu_add_default(vm, 0, &init, NULL); + finalize_vcpu(vm, 0); + + reg_list = vcpu_get_reg_list(vm, 0); + + if (fixup_core_regs) + core_reg_fixup(); + + if (print_list) { + putchar('\n'); + for_each_reg(i) + print_reg(reg_list->reg[i]); + putchar('\n'); + return 0; + } + + /* + * We only test that we can get the register and then write back the + * same value. Some registers may allow other values to be written + * back, but others only allow some bits to be changed, and at least + * for ID registers set will fail if the value does not exactly match + * what was returned by get. If registers that allow other values to + * be written need to have the other values tested, then we should + * create a new set of tests for those in a new independent test + * executable. + */ + for_each_reg(i) { + uint8_t addr[2048 / 8]; + struct kvm_one_reg reg = { + .id = reg_list->reg[i], + .addr = (__u64)&addr, + }; + int ret; + + ret = _vcpu_ioctl(vm, 0, KVM_GET_ONE_REG, ®); + if (ret) { + puts("Failed to get "); + print_reg(reg.id); + putchar('\n'); + ++failed_get; + } + + /* rejects_set registers are rejected after KVM_ARM_VCPU_FINALIZE */ + if (find_reg(rejects_set, rejects_set_n, reg.id)) { + ret = _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); + if (ret != -1 || errno != EPERM) { + printf("Failed to reject (ret=%d, errno=%d) ", ret, errno); + print_reg(reg.id); + putchar('\n'); + ++failed_reject; + } + continue; + } + + ret = _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, ®); + if (ret) { + puts("Failed to set "); + print_reg(reg.id); + putchar('\n'); + ++failed_set; + } + } + + if (reg_list_sve()) { + blessed_n = base_regs_n + sve_regs_n; + vec_regs = sve_regs; + } else { + blessed_n = base_regs_n + vregs_n; + vec_regs = vregs; + } + + blessed_reg = calloc(blessed_n, sizeof(__u64)); + for (i = 0; i < base_regs_n; ++i) + blessed_reg[i] = base_regs[i]; + for (i = 0; i < blessed_n - base_regs_n; ++i) + blessed_reg[base_regs_n + i] = vec_regs[i]; + + for_each_new_reg(i) + ++new_regs; + + for_each_missing_reg(i) + ++missing_regs; + + if (new_regs || missing_regs) { + printf("Number blessed registers: %5lld\n", blessed_n); + printf("Number registers: %5lld\n", reg_list->n); + } + + if (new_regs) { + printf("\nThere are %d new registers.\n" + "Consider adding them to the blessed reg " + "list with the following lines:\n\n", new_regs); + for_each_new_reg(i) + print_reg(reg_list->reg[i]); + putchar('\n'); + } + + if (missing_regs) { + printf("\nThere are %d missing registers.\n" + "The following lines are missing registers:\n\n", missing_regs); + for_each_missing_reg(i) + print_reg(blessed_reg[i]); + putchar('\n'); + } + + TEST_ASSERT(!missing_regs && !failed_get && !failed_set && !failed_reject, + "There are %d missing registers; " + "%d registers failed get; %d registers failed set; %d registers failed reject", + missing_regs, failed_get, failed_set, failed_reject); + + return 0; +} + +/* + * The current blessed list was primed with the output of kernel version + * v4.15 with --core-reg-fixup and then later updated with new registers. + */ +static __u64 base_regs[] = { + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[0]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[1]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[2]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[3]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[4]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[5]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[6]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[7]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[8]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[9]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[10]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[11]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[12]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[13]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[14]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[15]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[16]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[17]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[18]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[19]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[20]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[21]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[22]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[23]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[24]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[25]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[26]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[27]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[28]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[29]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[30]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.sp), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.pc), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.pstate), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(sp_el1), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(elr_el1), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[0]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[1]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[2]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[3]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[4]), + KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.fpsr), + KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.fpcr), + KVM_REG_ARM_FW_REG(0), + KVM_REG_ARM_FW_REG(1), + KVM_REG_ARM_FW_REG(2), + ARM64_SYS_REG(3, 3, 14, 3, 1), /* CNTV_CTL_EL0 */ + ARM64_SYS_REG(3, 3, 14, 3, 2), /* CNTV_CVAL_EL0 */ + ARM64_SYS_REG(3, 3, 14, 0, 2), + ARM64_SYS_REG(3, 0, 0, 0, 0), /* MIDR_EL1 */ + ARM64_SYS_REG(3, 0, 0, 0, 6), /* REVIDR_EL1 */ + ARM64_SYS_REG(3, 1, 0, 0, 1), /* CLIDR_EL1 */ + ARM64_SYS_REG(3, 1, 0, 0, 7), /* AIDR_EL1 */ + ARM64_SYS_REG(3, 3, 0, 0, 1), /* CTR_EL0 */ + ARM64_SYS_REG(2, 0, 0, 0, 4), + ARM64_SYS_REG(2, 0, 0, 0, 5), + ARM64_SYS_REG(2, 0, 0, 0, 6), + ARM64_SYS_REG(2, 0, 0, 0, 7), + ARM64_SYS_REG(2, 0, 0, 1, 4), + ARM64_SYS_REG(2, 0, 0, 1, 5), + ARM64_SYS_REG(2, 0, 0, 1, 6), + ARM64_SYS_REG(2, 0, 0, 1, 7), + ARM64_SYS_REG(2, 0, 0, 2, 0), /* MDCCINT_EL1 */ + ARM64_SYS_REG(2, 0, 0, 2, 2), /* MDSCR_EL1 */ + ARM64_SYS_REG(2, 0, 0, 2, 4), + ARM64_SYS_REG(2, 0, 0, 2, 5), + ARM64_SYS_REG(2, 0, 0, 2, 6), + ARM64_SYS_REG(2, 0, 0, 2, 7), + ARM64_SYS_REG(2, 0, 0, 3, 4), + ARM64_SYS_REG(2, 0, 0, 3, 5), + ARM64_SYS_REG(2, 0, 0, 3, 6), + ARM64_SYS_REG(2, 0, 0, 3, 7), + ARM64_SYS_REG(2, 0, 0, 4, 4), + ARM64_SYS_REG(2, 0, 0, 4, 5), + ARM64_SYS_REG(2, 0, 0, 4, 6), + ARM64_SYS_REG(2, 0, 0, 4, 7), + ARM64_SYS_REG(2, 0, 0, 5, 4), + ARM64_SYS_REG(2, 0, 0, 5, 5), + ARM64_SYS_REG(2, 0, 0, 5, 6), + ARM64_SYS_REG(2, 0, 0, 5, 7), + ARM64_SYS_REG(2, 0, 0, 6, 4), + ARM64_SYS_REG(2, 0, 0, 6, 5), + ARM64_SYS_REG(2, 0, 0, 6, 6), + ARM64_SYS_REG(2, 0, 0, 6, 7), + ARM64_SYS_REG(2, 0, 0, 7, 4), + ARM64_SYS_REG(2, 0, 0, 7, 5), + ARM64_SYS_REG(2, 0, 0, 7, 6), + ARM64_SYS_REG(2, 0, 0, 7, 7), + ARM64_SYS_REG(2, 0, 0, 8, 4), + ARM64_SYS_REG(2, 0, 0, 8, 5), + ARM64_SYS_REG(2, 0, 0, 8, 6), + ARM64_SYS_REG(2, 0, 0, 8, 7), + ARM64_SYS_REG(2, 0, 0, 9, 4), + ARM64_SYS_REG(2, 0, 0, 9, 5), + ARM64_SYS_REG(2, 0, 0, 9, 6), + ARM64_SYS_REG(2, 0, 0, 9, 7), + ARM64_SYS_REG(2, 0, 0, 10, 4), + ARM64_SYS_REG(2, 0, 0, 10, 5), + ARM64_SYS_REG(2, 0, 0, 10, 6), + ARM64_SYS_REG(2, 0, 0, 10, 7), + ARM64_SYS_REG(2, 0, 0, 11, 4), + ARM64_SYS_REG(2, 0, 0, 11, 5), + ARM64_SYS_REG(2, 0, 0, 11, 6), + ARM64_SYS_REG(2, 0, 0, 11, 7), + ARM64_SYS_REG(2, 0, 0, 12, 4), + ARM64_SYS_REG(2, 0, 0, 12, 5), + ARM64_SYS_REG(2, 0, 0, 12, 6), + ARM64_SYS_REG(2, 0, 0, 12, 7), + ARM64_SYS_REG(2, 0, 0, 13, 4), + ARM64_SYS_REG(2, 0, 0, 13, 5), + ARM64_SYS_REG(2, 0, 0, 13, 6), + ARM64_SYS_REG(2, 0, 0, 13, 7), + ARM64_SYS_REG(2, 0, 0, 14, 4), + ARM64_SYS_REG(2, 0, 0, 14, 5), + ARM64_SYS_REG(2, 0, 0, 14, 6), + ARM64_SYS_REG(2, 0, 0, 14, 7), + ARM64_SYS_REG(2, 0, 0, 15, 4), + ARM64_SYS_REG(2, 0, 0, 15, 5), + ARM64_SYS_REG(2, 0, 0, 15, 6), + ARM64_SYS_REG(2, 0, 0, 15, 7), + ARM64_SYS_REG(2, 4, 0, 7, 0), /* DBGVCR32_EL2 */ + ARM64_SYS_REG(3, 0, 0, 0, 5), /* MPIDR_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 0), /* ID_PFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 1), /* ID_PFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 2), /* ID_DFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 3), /* ID_AFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 4), /* ID_MMFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 5), /* ID_MMFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 6), /* ID_MMFR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 7), /* ID_MMFR3_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 0), /* ID_ISAR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 1), /* ID_ISAR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 2), /* ID_ISAR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 3), /* ID_ISAR3_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 4), /* ID_ISAR4_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 5), /* ID_ISAR5_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 6), /* ID_MMFR4_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 7), /* ID_ISAR6_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 0), /* MVFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 1), /* MVFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 2), /* MVFR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 3), + ARM64_SYS_REG(3, 0, 0, 3, 4), /* ID_PFR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 5), /* ID_DFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 6), /* ID_MMFR5_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 7), + ARM64_SYS_REG(3, 0, 0, 4, 0), /* ID_AA64PFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 4, 1), /* ID_AA64PFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 4, 2), + ARM64_SYS_REG(3, 0, 0, 4, 3), + ARM64_SYS_REG(3, 0, 0, 4, 4), /* ID_AA64ZFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 4, 5), + ARM64_SYS_REG(3, 0, 0, 4, 6), + ARM64_SYS_REG(3, 0, 0, 4, 7), + ARM64_SYS_REG(3, 0, 0, 5, 0), /* ID_AA64DFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 5, 1), /* ID_AA64DFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 5, 2), + ARM64_SYS_REG(3, 0, 0, 5, 3), + ARM64_SYS_REG(3, 0, 0, 5, 4), /* ID_AA64AFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 5, 5), /* ID_AA64AFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 5, 6), + ARM64_SYS_REG(3, 0, 0, 5, 7), + ARM64_SYS_REG(3, 0, 0, 6, 0), /* ID_AA64ISAR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 6, 1), /* ID_AA64ISAR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 6, 2), + ARM64_SYS_REG(3, 0, 0, 6, 3), + ARM64_SYS_REG(3, 0, 0, 6, 4), + ARM64_SYS_REG(3, 0, 0, 6, 5), + ARM64_SYS_REG(3, 0, 0, 6, 6), + ARM64_SYS_REG(3, 0, 0, 6, 7), + ARM64_SYS_REG(3, 0, 0, 7, 0), /* ID_AA64MMFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 1), /* ID_AA64MMFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 2), /* ID_AA64MMFR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 3), + ARM64_SYS_REG(3, 0, 0, 7, 4), + ARM64_SYS_REG(3, 0, 0, 7, 5), + ARM64_SYS_REG(3, 0, 0, 7, 6), + ARM64_SYS_REG(3, 0, 0, 7, 7), + ARM64_SYS_REG(3, 0, 1, 0, 0), /* SCTLR_EL1 */ + ARM64_SYS_REG(3, 0, 1, 0, 1), /* ACTLR_EL1 */ + ARM64_SYS_REG(3, 0, 1, 0, 2), /* CPACR_EL1 */ + ARM64_SYS_REG(3, 0, 2, 0, 0), /* TTBR0_EL1 */ + ARM64_SYS_REG(3, 0, 2, 0, 1), /* TTBR1_EL1 */ + ARM64_SYS_REG(3, 0, 2, 0, 2), /* TCR_EL1 */ + ARM64_SYS_REG(3, 0, 5, 1, 0), /* AFSR0_EL1 */ + ARM64_SYS_REG(3, 0, 5, 1, 1), /* AFSR1_EL1 */ + ARM64_SYS_REG(3, 0, 5, 2, 0), /* ESR_EL1 */ + ARM64_SYS_REG(3, 0, 6, 0, 0), /* FAR_EL1 */ + ARM64_SYS_REG(3, 0, 7, 4, 0), /* PAR_EL1 */ + ARM64_SYS_REG(3, 0, 9, 14, 1), /* PMINTENSET_EL1 */ + ARM64_SYS_REG(3, 0, 9, 14, 2), /* PMINTENCLR_EL1 */ + ARM64_SYS_REG(3, 0, 10, 2, 0), /* MAIR_EL1 */ + ARM64_SYS_REG(3, 0, 10, 3, 0), /* AMAIR_EL1 */ + ARM64_SYS_REG(3, 0, 12, 0, 0), /* VBAR_EL1 */ + ARM64_SYS_REG(3, 0, 12, 1, 1), /* DISR_EL1 */ + ARM64_SYS_REG(3, 0, 13, 0, 1), /* CONTEXTIDR_EL1 */ + ARM64_SYS_REG(3, 0, 13, 0, 4), /* TPIDR_EL1 */ + ARM64_SYS_REG(3, 0, 14, 1, 0), /* CNTKCTL_EL1 */ + ARM64_SYS_REG(3, 2, 0, 0, 0), /* CSSELR_EL1 */ + ARM64_SYS_REG(3, 3, 9, 12, 0), /* PMCR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 1), /* PMCNTENSET_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 2), /* PMCNTENCLR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 3), /* PMOVSCLR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 4), /* PMSWINC_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 5), /* PMSELR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 13, 0), /* PMCCNTR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 14, 0), /* PMUSERENR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 14, 3), /* PMOVSSET_EL0 */ + ARM64_SYS_REG(3, 3, 13, 0, 2), /* TPIDR_EL0 */ + ARM64_SYS_REG(3, 3, 13, 0, 3), /* TPIDRRO_EL0 */ + ARM64_SYS_REG(3, 3, 14, 8, 0), + ARM64_SYS_REG(3, 3, 14, 8, 1), + ARM64_SYS_REG(3, 3, 14, 8, 2), + ARM64_SYS_REG(3, 3, 14, 8, 3), + ARM64_SYS_REG(3, 3, 14, 8, 4), + ARM64_SYS_REG(3, 3, 14, 8, 5), + ARM64_SYS_REG(3, 3, 14, 8, 6), + ARM64_SYS_REG(3, 3, 14, 8, 7), + ARM64_SYS_REG(3, 3, 14, 9, 0), + ARM64_SYS_REG(3, 3, 14, 9, 1), + ARM64_SYS_REG(3, 3, 14, 9, 2), + ARM64_SYS_REG(3, 3, 14, 9, 3), + ARM64_SYS_REG(3, 3, 14, 9, 4), + ARM64_SYS_REG(3, 3, 14, 9, 5), + ARM64_SYS_REG(3, 3, 14, 9, 6), + ARM64_SYS_REG(3, 3, 14, 9, 7), + ARM64_SYS_REG(3, 3, 14, 10, 0), + ARM64_SYS_REG(3, 3, 14, 10, 1), + ARM64_SYS_REG(3, 3, 14, 10, 2), + ARM64_SYS_REG(3, 3, 14, 10, 3), + ARM64_SYS_REG(3, 3, 14, 10, 4), + ARM64_SYS_REG(3, 3, 14, 10, 5), + ARM64_SYS_REG(3, 3, 14, 10, 6), + ARM64_SYS_REG(3, 3, 14, 10, 7), + ARM64_SYS_REG(3, 3, 14, 11, 0), + ARM64_SYS_REG(3, 3, 14, 11, 1), + ARM64_SYS_REG(3, 3, 14, 11, 2), + ARM64_SYS_REG(3, 3, 14, 11, 3), + ARM64_SYS_REG(3, 3, 14, 11, 4), + ARM64_SYS_REG(3, 3, 14, 11, 5), + ARM64_SYS_REG(3, 3, 14, 11, 6), + ARM64_SYS_REG(3, 3, 14, 12, 0), + ARM64_SYS_REG(3, 3, 14, 12, 1), + ARM64_SYS_REG(3, 3, 14, 12, 2), + ARM64_SYS_REG(3, 3, 14, 12, 3), + ARM64_SYS_REG(3, 3, 14, 12, 4), + ARM64_SYS_REG(3, 3, 14, 12, 5), + ARM64_SYS_REG(3, 3, 14, 12, 6), + ARM64_SYS_REG(3, 3, 14, 12, 7), + ARM64_SYS_REG(3, 3, 14, 13, 0), + ARM64_SYS_REG(3, 3, 14, 13, 1), + ARM64_SYS_REG(3, 3, 14, 13, 2), + ARM64_SYS_REG(3, 3, 14, 13, 3), + ARM64_SYS_REG(3, 3, 14, 13, 4), + ARM64_SYS_REG(3, 3, 14, 13, 5), + ARM64_SYS_REG(3, 3, 14, 13, 6), + ARM64_SYS_REG(3, 3, 14, 13, 7), + ARM64_SYS_REG(3, 3, 14, 14, 0), + ARM64_SYS_REG(3, 3, 14, 14, 1), + ARM64_SYS_REG(3, 3, 14, 14, 2), + ARM64_SYS_REG(3, 3, 14, 14, 3), + ARM64_SYS_REG(3, 3, 14, 14, 4), + ARM64_SYS_REG(3, 3, 14, 14, 5), + ARM64_SYS_REG(3, 3, 14, 14, 6), + ARM64_SYS_REG(3, 3, 14, 14, 7), + ARM64_SYS_REG(3, 3, 14, 15, 0), + ARM64_SYS_REG(3, 3, 14, 15, 1), + ARM64_SYS_REG(3, 3, 14, 15, 2), + ARM64_SYS_REG(3, 3, 14, 15, 3), + ARM64_SYS_REG(3, 3, 14, 15, 4), + ARM64_SYS_REG(3, 3, 14, 15, 5), + ARM64_SYS_REG(3, 3, 14, 15, 6), + ARM64_SYS_REG(3, 3, 14, 15, 7), /* PMCCFILTR_EL0 */ + ARM64_SYS_REG(3, 4, 3, 0, 0), /* DACR32_EL2 */ + ARM64_SYS_REG(3, 4, 5, 0, 1), /* IFSR32_EL2 */ + ARM64_SYS_REG(3, 4, 5, 3, 0), /* FPEXC32_EL2 */ + KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | 0, + KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | 1, + KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | 2, +}; +static __u64 base_regs_n = ARRAY_SIZE(base_regs); + +static __u64 vregs[] = { + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[1]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[2]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[3]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[4]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[5]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[6]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[7]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[8]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[9]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[10]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[11]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[12]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[13]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[14]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[15]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[16]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[17]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[18]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[19]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[20]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[21]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[22]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[23]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[24]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[25]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[26]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[27]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[28]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[29]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[30]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]), +}; +static __u64 vregs_n = ARRAY_SIZE(vregs); + +static __u64 sve_regs[] = { + KVM_REG_ARM64_SVE_VLS, + KVM_REG_ARM64_SVE_ZREG(0, 0), + KVM_REG_ARM64_SVE_ZREG(1, 0), + KVM_REG_ARM64_SVE_ZREG(2, 0), + KVM_REG_ARM64_SVE_ZREG(3, 0), + KVM_REG_ARM64_SVE_ZREG(4, 0), + KVM_REG_ARM64_SVE_ZREG(5, 0), + KVM_REG_ARM64_SVE_ZREG(6, 0), + KVM_REG_ARM64_SVE_ZREG(7, 0), + KVM_REG_ARM64_SVE_ZREG(8, 0), + KVM_REG_ARM64_SVE_ZREG(9, 0), + KVM_REG_ARM64_SVE_ZREG(10, 0), + KVM_REG_ARM64_SVE_ZREG(11, 0), + KVM_REG_ARM64_SVE_ZREG(12, 0), + KVM_REG_ARM64_SVE_ZREG(13, 0), + KVM_REG_ARM64_SVE_ZREG(14, 0), + KVM_REG_ARM64_SVE_ZREG(15, 0), + KVM_REG_ARM64_SVE_ZREG(16, 0), + KVM_REG_ARM64_SVE_ZREG(17, 0), + KVM_REG_ARM64_SVE_ZREG(18, 0), + KVM_REG_ARM64_SVE_ZREG(19, 0), + KVM_REG_ARM64_SVE_ZREG(20, 0), + KVM_REG_ARM64_SVE_ZREG(21, 0), + KVM_REG_ARM64_SVE_ZREG(22, 0), + KVM_REG_ARM64_SVE_ZREG(23, 0), + KVM_REG_ARM64_SVE_ZREG(24, 0), + KVM_REG_ARM64_SVE_ZREG(25, 0), + KVM_REG_ARM64_SVE_ZREG(26, 0), + KVM_REG_ARM64_SVE_ZREG(27, 0), + KVM_REG_ARM64_SVE_ZREG(28, 0), + KVM_REG_ARM64_SVE_ZREG(29, 0), + KVM_REG_ARM64_SVE_ZREG(30, 0), + KVM_REG_ARM64_SVE_ZREG(31, 0), + KVM_REG_ARM64_SVE_PREG(0, 0), + KVM_REG_ARM64_SVE_PREG(1, 0), + KVM_REG_ARM64_SVE_PREG(2, 0), + KVM_REG_ARM64_SVE_PREG(3, 0), + KVM_REG_ARM64_SVE_PREG(4, 0), + KVM_REG_ARM64_SVE_PREG(5, 0), + KVM_REG_ARM64_SVE_PREG(6, 0), + KVM_REG_ARM64_SVE_PREG(7, 0), + KVM_REG_ARM64_SVE_PREG(8, 0), + KVM_REG_ARM64_SVE_PREG(9, 0), + KVM_REG_ARM64_SVE_PREG(10, 0), + KVM_REG_ARM64_SVE_PREG(11, 0), + KVM_REG_ARM64_SVE_PREG(12, 0), + KVM_REG_ARM64_SVE_PREG(13, 0), + KVM_REG_ARM64_SVE_PREG(14, 0), + KVM_REG_ARM64_SVE_PREG(15, 0), + KVM_REG_ARM64_SVE_FFR(0), + ARM64_SYS_REG(3, 0, 1, 2, 0), /* ZCR_EL1 */ +}; +static __u64 sve_regs_n = ARRAY_SIZE(sve_regs); + +static __u64 rejects_set[] = { +#ifdef REG_LIST_SVE + KVM_REG_ARM64_SVE_VLS, +#endif +}; +static __u64 rejects_set_n = ARRAY_SIZE(rejects_set); diff --git a/tools/testing/selftests/kvm/config b/tools/testing/selftests/kvm/config new file mode 100644 index 000000000..63ed533f7 --- /dev/null +++ b/tools/testing/selftests/kvm/config @@ -0,0 +1,3 @@ +CONFIG_KVM=y +CONFIG_KVM_INTEL=y +CONFIG_KVM_AMD=y diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c new file mode 100644 index 000000000..3d96a7bfa --- /dev/null +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -0,0 +1,498 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM demand paging test + * Adapted from dirty_log_test.c + * + * Copyright (C) 2018, Red Hat, Inc. + * Copyright (C) 2019, Google, Inc. + */ + +#define _GNU_SOURCE /* for program_invocation_name */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "perf_test_util.h" +#include "processor.h" +#include "test_util.h" + +#ifdef __NR_userfaultfd + +#ifdef PRINT_PER_PAGE_UPDATES +#define PER_PAGE_DEBUG(...) printf(__VA_ARGS__) +#else +#define PER_PAGE_DEBUG(...) _no_printf(__VA_ARGS__) +#endif + +#ifdef PRINT_PER_VCPU_UPDATES +#define PER_VCPU_DEBUG(...) printf(__VA_ARGS__) +#else +#define PER_VCPU_DEBUG(...) _no_printf(__VA_ARGS__) +#endif + +static char *guest_data_prototype; + +static void *vcpu_worker(void *data) +{ + int ret; + struct vcpu_args *vcpu_args = (struct vcpu_args *)data; + int vcpu_id = vcpu_args->vcpu_id; + struct kvm_vm *vm = perf_test_args.vm; + struct kvm_run *run; + struct timespec start; + struct timespec ts_diff; + + vcpu_args_set(vm, vcpu_id, 1, vcpu_id); + run = vcpu_state(vm, vcpu_id); + + clock_gettime(CLOCK_MONOTONIC, &start); + + /* Let the guest access its memory */ + ret = _vcpu_run(vm, vcpu_id); + TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); + if (get_ucall(vm, vcpu_id, NULL) != UCALL_SYNC) { + TEST_ASSERT(false, + "Invalid guest sync status: exit_reason=%s\n", + exit_reason_str(run->exit_reason)); + } + + ts_diff = timespec_diff_now(start); + PER_VCPU_DEBUG("vCPU %d execution time: %ld.%.9lds\n", vcpu_id, + ts_diff.tv_sec, ts_diff.tv_nsec); + + return NULL; +} + +static int handle_uffd_page_request(int uffd, uint64_t addr) +{ + pid_t tid; + struct timespec start; + struct timespec ts_diff; + struct uffdio_copy copy; + int r; + + tid = syscall(__NR_gettid); + + copy.src = (uint64_t)guest_data_prototype; + copy.dst = addr; + copy.len = perf_test_args.host_page_size; + copy.mode = 0; + + clock_gettime(CLOCK_MONOTONIC, &start); + + r = ioctl(uffd, UFFDIO_COPY, ©); + if (r == -1) { + pr_info("Failed Paged in 0x%lx from thread %d with errno: %d\n", + addr, tid, errno); + return r; + } + + ts_diff = timespec_diff_now(start); + + PER_PAGE_DEBUG("UFFDIO_COPY %d \t%ld ns\n", tid, + timespec_to_ns(ts_diff)); + PER_PAGE_DEBUG("Paged in %ld bytes at 0x%lx from thread %d\n", + perf_test_args.host_page_size, addr, tid); + + return 0; +} + +bool quit_uffd_thread; + +struct uffd_handler_args { + int uffd; + int pipefd; + useconds_t delay; +}; + +static void *uffd_handler_thread_fn(void *arg) +{ + struct uffd_handler_args *uffd_args = (struct uffd_handler_args *)arg; + int uffd = uffd_args->uffd; + int pipefd = uffd_args->pipefd; + useconds_t delay = uffd_args->delay; + int64_t pages = 0; + struct timespec start; + struct timespec ts_diff; + + clock_gettime(CLOCK_MONOTONIC, &start); + while (!quit_uffd_thread) { + struct uffd_msg msg; + struct pollfd pollfd[2]; + char tmp_chr; + int r; + uint64_t addr; + + pollfd[0].fd = uffd; + pollfd[0].events = POLLIN; + pollfd[1].fd = pipefd; + pollfd[1].events = POLLIN; + + r = poll(pollfd, 2, -1); + switch (r) { + case -1: + pr_info("poll err"); + continue; + case 0: + continue; + case 1: + break; + default: + pr_info("Polling uffd returned %d", r); + return NULL; + } + + if (pollfd[0].revents & POLLERR) { + pr_info("uffd revents has POLLERR"); + return NULL; + } + + if (pollfd[1].revents & POLLIN) { + r = read(pollfd[1].fd, &tmp_chr, 1); + TEST_ASSERT(r == 1, + "Error reading pipefd in UFFD thread\n"); + return NULL; + } + + if (!pollfd[0].revents & POLLIN) + continue; + + r = read(uffd, &msg, sizeof(msg)); + if (r == -1) { + if (errno == EAGAIN) + continue; + pr_info("Read of uffd gor errno %d", errno); + return NULL; + } + + if (r != sizeof(msg)) { + pr_info("Read on uffd returned unexpected size: %d bytes", r); + return NULL; + } + + if (!(msg.event & UFFD_EVENT_PAGEFAULT)) + continue; + + if (delay) + usleep(delay); + addr = msg.arg.pagefault.address; + r = handle_uffd_page_request(uffd, addr); + if (r < 0) + return NULL; + pages++; + } + + ts_diff = timespec_diff_now(start); + PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n", + pages, ts_diff.tv_sec, ts_diff.tv_nsec, + pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0)); + + return NULL; +} + +static int setup_demand_paging(struct kvm_vm *vm, + pthread_t *uffd_handler_thread, int pipefd, + useconds_t uffd_delay, + struct uffd_handler_args *uffd_args, + void *hva, uint64_t len) +{ + int uffd; + struct uffdio_api uffdio_api; + struct uffdio_register uffdio_register; + + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (uffd == -1) { + pr_info("uffd creation failed\n"); + return -1; + } + + uffdio_api.api = UFFD_API; + uffdio_api.features = 0; + if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) { + pr_info("ioctl uffdio_api failed\n"); + return -1; + } + + uffdio_register.range.start = (uint64_t)hva; + uffdio_register.range.len = len; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { + pr_info("ioctl uffdio_register failed\n"); + return -1; + } + + if ((uffdio_register.ioctls & UFFD_API_RANGE_IOCTLS) != + UFFD_API_RANGE_IOCTLS) { + pr_info("unexpected userfaultfd ioctl set\n"); + return -1; + } + + uffd_args->uffd = uffd; + uffd_args->pipefd = pipefd; + uffd_args->delay = uffd_delay; + pthread_create(uffd_handler_thread, NULL, uffd_handler_thread_fn, + uffd_args); + + PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n", + hva, hva + len); + + return 0; +} + +static void run_test(enum vm_guest_mode mode, bool use_uffd, + useconds_t uffd_delay) +{ + pthread_t *vcpu_threads; + pthread_t *uffd_handler_threads = NULL; + struct uffd_handler_args *uffd_args = NULL; + struct timespec start; + struct timespec ts_diff; + int *pipefds = NULL; + struct kvm_vm *vm; + int vcpu_id; + int r; + + vm = create_vm(mode, nr_vcpus, guest_percpu_mem_size); + + perf_test_args.wr_fract = 1; + + guest_data_prototype = malloc(perf_test_args.host_page_size); + TEST_ASSERT(guest_data_prototype, + "Failed to allocate buffer for guest data pattern"); + memset(guest_data_prototype, 0xAB, perf_test_args.host_page_size); + + vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); + TEST_ASSERT(vcpu_threads, "Memory allocation failed"); + + add_vcpus(vm, nr_vcpus, guest_percpu_mem_size); + + if (use_uffd) { + uffd_handler_threads = + malloc(nr_vcpus * sizeof(*uffd_handler_threads)); + TEST_ASSERT(uffd_handler_threads, "Memory allocation failed"); + + uffd_args = malloc(nr_vcpus * sizeof(*uffd_args)); + TEST_ASSERT(uffd_args, "Memory allocation failed"); + + pipefds = malloc(sizeof(int) * nr_vcpus * 2); + TEST_ASSERT(pipefds, "Unable to allocate memory for pipefd"); + + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { + vm_paddr_t vcpu_gpa; + void *vcpu_hva; + + vcpu_gpa = guest_test_phys_mem + (vcpu_id * guest_percpu_mem_size); + PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n", + vcpu_id, vcpu_gpa, vcpu_gpa + guest_percpu_mem_size); + + /* Cache the HVA pointer of the region */ + vcpu_hva = addr_gpa2hva(vm, vcpu_gpa); + + /* + * Set up user fault fd to handle demand paging + * requests. + */ + r = pipe2(&pipefds[vcpu_id * 2], + O_CLOEXEC | O_NONBLOCK); + TEST_ASSERT(!r, "Failed to set up pipefd"); + + r = setup_demand_paging(vm, + &uffd_handler_threads[vcpu_id], + pipefds[vcpu_id * 2], + uffd_delay, &uffd_args[vcpu_id], + vcpu_hva, guest_percpu_mem_size); + if (r < 0) + exit(-r); + } + } + + /* Export the shared variables to the guest */ + sync_global_to_guest(vm, perf_test_args); + + pr_info("Finished creating vCPUs and starting uffd threads\n"); + + clock_gettime(CLOCK_MONOTONIC, &start); + + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { + pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker, + &perf_test_args.vcpu_args[vcpu_id]); + } + + pr_info("Started all vCPUs\n"); + + /* Wait for the vcpu threads to quit */ + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { + pthread_join(vcpu_threads[vcpu_id], NULL); + PER_VCPU_DEBUG("Joined thread for vCPU %d\n", vcpu_id); + } + + ts_diff = timespec_diff_now(start); + + pr_info("All vCPU threads joined\n"); + + if (use_uffd) { + char c; + + /* Tell the user fault fd handler threads to quit */ + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { + r = write(pipefds[vcpu_id * 2 + 1], &c, 1); + TEST_ASSERT(r == 1, "Unable to write to pipefd"); + + pthread_join(uffd_handler_threads[vcpu_id], NULL); + } + } + + pr_info("Total guest execution time: %ld.%.9lds\n", + ts_diff.tv_sec, ts_diff.tv_nsec); + pr_info("Overall demand paging rate: %f pgs/sec\n", + perf_test_args.vcpu_args[0].pages * nr_vcpus / + ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0)); + + ucall_uninit(vm); + kvm_vm_free(vm); + + free(guest_data_prototype); + free(vcpu_threads); + if (use_uffd) { + free(uffd_handler_threads); + free(uffd_args); + free(pipefds); + } +} + +struct guest_mode { + bool supported; + bool enabled; +}; +static struct guest_mode guest_modes[NUM_VM_MODES]; + +#define guest_mode_init(mode, supported, enabled) ({ \ + guest_modes[mode] = (struct guest_mode){ supported, enabled }; \ +}) + +static void help(char *name) +{ + int i; + + puts(""); + printf("usage: %s [-h] [-m mode] [-u] [-d uffd_delay_usec]\n" + " [-b memory] [-v vcpus]\n", name); + printf(" -m: specify the guest mode ID to test\n" + " (default: test all supported modes)\n" + " This option may be used multiple times.\n" + " Guest mode IDs:\n"); + for (i = 0; i < NUM_VM_MODES; ++i) { + printf(" %d: %s%s\n", i, vm_guest_mode_string(i), + guest_modes[i].supported ? " (supported)" : ""); + } + printf(" -u: use User Fault FD to handle vCPU page\n" + " faults.\n"); + printf(" -d: add a delay in usec to the User Fault\n" + " FD handler to simulate demand paging\n" + " overheads. Ignored without -u.\n"); + printf(" -b: specify the size of the memory region which should be\n" + " demand paged by each vCPU. e.g. 10M or 3G.\n" + " Default: 1G\n"); + printf(" -v: specify the number of vCPUs to run.\n"); + puts(""); + exit(0); +} + +int main(int argc, char *argv[]) +{ + int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); + bool mode_selected = false; + unsigned int mode; + int opt, i; + bool use_uffd = false; + useconds_t uffd_delay = 0; + +#ifdef __x86_64__ + guest_mode_init(VM_MODE_PXXV48_4K, true, true); +#endif +#ifdef __aarch64__ + guest_mode_init(VM_MODE_P40V48_4K, true, true); + guest_mode_init(VM_MODE_P40V48_64K, true, true); + { + unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE); + + if (limit >= 52) + guest_mode_init(VM_MODE_P52V48_64K, true, true); + if (limit >= 48) { + guest_mode_init(VM_MODE_P48V48_4K, true, true); + guest_mode_init(VM_MODE_P48V48_64K, true, true); + } + } +#endif +#ifdef __s390x__ + guest_mode_init(VM_MODE_P40V48_4K, true, true); +#endif + + while ((opt = getopt(argc, argv, "hm:ud:b:v:")) != -1) { + switch (opt) { + case 'm': + if (!mode_selected) { + for (i = 0; i < NUM_VM_MODES; ++i) + guest_modes[i].enabled = false; + mode_selected = true; + } + mode = strtoul(optarg, NULL, 10); + TEST_ASSERT(mode < NUM_VM_MODES, + "Guest mode ID %d too big", mode); + guest_modes[mode].enabled = true; + break; + case 'u': + use_uffd = true; + break; + case 'd': + uffd_delay = strtoul(optarg, NULL, 0); + TEST_ASSERT(uffd_delay >= 0, + "A negative UFFD delay is not supported."); + break; + case 'b': + guest_percpu_mem_size = parse_size(optarg); + break; + case 'v': + nr_vcpus = atoi(optarg); + TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus, + "Invalid number of vcpus, must be between 1 and %d", max_vcpus); + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + for (i = 0; i < NUM_VM_MODES; ++i) { + if (!guest_modes[i].enabled) + continue; + TEST_ASSERT(guest_modes[i].supported, + "Guest mode ID %d (%s) not supported.", + i, vm_guest_mode_string(i)); + run_test(i, use_uffd, uffd_delay); + } + + return 0; +} + +#else /* __NR_userfaultfd */ + +#warning "missing __NR_userfaultfd definition" + +int main(void) +{ + print_skip("__NR_userfaultfd must be present for userfaultfd test"); + return KSFT_SKIP; +} + +#endif /* __NR_userfaultfd */ diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c new file mode 100644 index 000000000..85c9b8f73 --- /dev/null +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -0,0 +1,376 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM dirty page logging performance test + * + * Based on dirty_log_test.c + * + * Copyright (C) 2018, Red Hat, Inc. + * Copyright (C) 2020, Google, Inc. + */ + +#define _GNU_SOURCE /* for program_invocation_name */ + +#include +#include +#include +#include +#include +#include +#include + +#include "kvm_util.h" +#include "perf_test_util.h" +#include "processor.h" +#include "test_util.h" + +/* How many host loops to run by default (one KVM_GET_DIRTY_LOG for each loop)*/ +#define TEST_HOST_LOOP_N 2UL + +/* Host variables */ +static bool host_quit; +static uint64_t iteration; +static uint64_t vcpu_last_completed_iteration[MAX_VCPUS]; + +static void *vcpu_worker(void *data) +{ + int ret; + struct kvm_vm *vm = perf_test_args.vm; + uint64_t pages_count = 0; + struct kvm_run *run; + struct timespec start; + struct timespec ts_diff; + struct timespec total = (struct timespec){0}; + struct timespec avg; + struct vcpu_args *vcpu_args = (struct vcpu_args *)data; + int vcpu_id = vcpu_args->vcpu_id; + + vcpu_args_set(vm, vcpu_id, 1, vcpu_id); + run = vcpu_state(vm, vcpu_id); + + while (!READ_ONCE(host_quit)) { + uint64_t current_iteration = READ_ONCE(iteration); + + clock_gettime(CLOCK_MONOTONIC, &start); + ret = _vcpu_run(vm, vcpu_id); + ts_diff = timespec_diff_now(start); + + TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); + TEST_ASSERT(get_ucall(vm, vcpu_id, NULL) == UCALL_SYNC, + "Invalid guest sync status: exit_reason=%s\n", + exit_reason_str(run->exit_reason)); + + pr_debug("Got sync event from vCPU %d\n", vcpu_id); + vcpu_last_completed_iteration[vcpu_id] = current_iteration; + pr_debug("vCPU %d updated last completed iteration to %lu\n", + vcpu_id, vcpu_last_completed_iteration[vcpu_id]); + + if (current_iteration) { + pages_count += vcpu_args->pages; + total = timespec_add(total, ts_diff); + pr_debug("vCPU %d iteration %lu dirty memory time: %ld.%.9lds\n", + vcpu_id, current_iteration, ts_diff.tv_sec, + ts_diff.tv_nsec); + } else { + pr_debug("vCPU %d iteration %lu populate memory time: %ld.%.9lds\n", + vcpu_id, current_iteration, ts_diff.tv_sec, + ts_diff.tv_nsec); + } + + while (current_iteration == READ_ONCE(iteration) && + !READ_ONCE(host_quit)) {} + } + + avg = timespec_div(total, vcpu_last_completed_iteration[vcpu_id]); + pr_debug("\nvCPU %d dirtied 0x%lx pages over %lu iterations in %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n", + vcpu_id, pages_count, vcpu_last_completed_iteration[vcpu_id], + total.tv_sec, total.tv_nsec, avg.tv_sec, avg.tv_nsec); + + return NULL; +} + +#ifdef USE_CLEAR_DIRTY_LOG +static u64 dirty_log_manual_caps; +#endif + +static void run_test(enum vm_guest_mode mode, unsigned long iterations, + uint64_t phys_offset, int wr_fract) +{ + pthread_t *vcpu_threads; + struct kvm_vm *vm; + unsigned long *bmap; + uint64_t guest_num_pages; + uint64_t host_num_pages; + int vcpu_id; + struct timespec start; + struct timespec ts_diff; + struct timespec get_dirty_log_total = (struct timespec){0}; + struct timespec vcpu_dirty_total = (struct timespec){0}; + struct timespec avg; +#ifdef USE_CLEAR_DIRTY_LOG + struct kvm_enable_cap cap = {}; + struct timespec clear_dirty_log_total = (struct timespec){0}; +#endif + + vm = create_vm(mode, nr_vcpus, guest_percpu_mem_size); + + perf_test_args.wr_fract = wr_fract; + + guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm_get_page_shift(vm); + guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); + host_num_pages = vm_num_host_pages(mode, guest_num_pages); + bmap = bitmap_alloc(host_num_pages); + +#ifdef USE_CLEAR_DIRTY_LOG + cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2; + cap.args[0] = dirty_log_manual_caps; + vm_enable_cap(vm, &cap); +#endif + + vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); + TEST_ASSERT(vcpu_threads, "Memory allocation failed"); + + add_vcpus(vm, nr_vcpus, guest_percpu_mem_size); + + sync_global_to_guest(vm, perf_test_args); + + /* Start the iterations */ + iteration = 0; + host_quit = false; + + clock_gettime(CLOCK_MONOTONIC, &start); + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { + pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker, + &perf_test_args.vcpu_args[vcpu_id]); + } + + /* Allow the vCPU to populate memory */ + pr_debug("Starting iteration %lu - Populating\n", iteration); + while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) != iteration) + pr_debug("Waiting for vcpu_last_completed_iteration == %lu\n", + iteration); + + ts_diff = timespec_diff_now(start); + pr_info("Populate memory time: %ld.%.9lds\n", + ts_diff.tv_sec, ts_diff.tv_nsec); + + /* Enable dirty logging */ + clock_gettime(CLOCK_MONOTONIC, &start); + vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX, + KVM_MEM_LOG_DIRTY_PAGES); + ts_diff = timespec_diff_now(start); + pr_info("Enabling dirty logging time: %ld.%.9lds\n\n", + ts_diff.tv_sec, ts_diff.tv_nsec); + + while (iteration < iterations) { + /* + * Incrementing the iteration number will start the vCPUs + * dirtying memory again. + */ + clock_gettime(CLOCK_MONOTONIC, &start); + iteration++; + + pr_debug("Starting iteration %lu\n", iteration); + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { + while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) != iteration) + pr_debug("Waiting for vCPU %d vcpu_last_completed_iteration == %lu\n", + vcpu_id, iteration); + } + + ts_diff = timespec_diff_now(start); + vcpu_dirty_total = timespec_add(vcpu_dirty_total, ts_diff); + pr_info("Iteration %lu dirty memory time: %ld.%.9lds\n", + iteration, ts_diff.tv_sec, ts_diff.tv_nsec); + + clock_gettime(CLOCK_MONOTONIC, &start); + kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); + + ts_diff = timespec_diff_now(start); + get_dirty_log_total = timespec_add(get_dirty_log_total, + ts_diff); + pr_info("Iteration %lu get dirty log time: %ld.%.9lds\n", + iteration, ts_diff.tv_sec, ts_diff.tv_nsec); + +#ifdef USE_CLEAR_DIRTY_LOG + clock_gettime(CLOCK_MONOTONIC, &start); + kvm_vm_clear_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap, 0, + host_num_pages); + + ts_diff = timespec_diff_now(start); + clear_dirty_log_total = timespec_add(clear_dirty_log_total, + ts_diff); + pr_info("Iteration %lu clear dirty log time: %ld.%.9lds\n", + iteration, ts_diff.tv_sec, ts_diff.tv_nsec); +#endif + } + + /* Tell the vcpu thread to quit */ + host_quit = true; + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) + pthread_join(vcpu_threads[vcpu_id], NULL); + + /* Disable dirty logging */ + clock_gettime(CLOCK_MONOTONIC, &start); + vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX, 0); + ts_diff = timespec_diff_now(start); + pr_info("Disabling dirty logging time: %ld.%.9lds\n", + ts_diff.tv_sec, ts_diff.tv_nsec); + + avg = timespec_div(get_dirty_log_total, iterations); + pr_info("Get dirty log over %lu iterations took %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n", + iterations, get_dirty_log_total.tv_sec, + get_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec); + +#ifdef USE_CLEAR_DIRTY_LOG + avg = timespec_div(clear_dirty_log_total, iterations); + pr_info("Clear dirty log over %lu iterations took %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n", + iterations, clear_dirty_log_total.tv_sec, + clear_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec); +#endif + + free(bmap); + free(vcpu_threads); + ucall_uninit(vm); + kvm_vm_free(vm); +} + +struct guest_mode { + bool supported; + bool enabled; +}; +static struct guest_mode guest_modes[NUM_VM_MODES]; + +#define guest_mode_init(mode, supported, enabled) ({ \ + guest_modes[mode] = (struct guest_mode){ supported, enabled }; \ +}) + +static void help(char *name) +{ + int i; + + puts(""); + printf("usage: %s [-h] [-i iterations] [-p offset] " + "[-m mode] [-b vcpu bytes] [-v vcpus]\n", name); + puts(""); + printf(" -i: specify iteration counts (default: %"PRIu64")\n", + TEST_HOST_LOOP_N); + printf(" -p: specify guest physical test memory offset\n" + " Warning: a low offset can conflict with the loaded test code.\n"); + printf(" -m: specify the guest mode ID to test " + "(default: test all supported modes)\n" + " This option may be used multiple times.\n" + " Guest mode IDs:\n"); + for (i = 0; i < NUM_VM_MODES; ++i) { + printf(" %d: %s%s\n", i, vm_guest_mode_string(i), + guest_modes[i].supported ? " (supported)" : ""); + } + printf(" -b: specify the size of the memory region which should be\n" + " dirtied by each vCPU. e.g. 10M or 3G.\n" + " (default: 1G)\n"); + printf(" -f: specify the fraction of pages which should be written to\n" + " as opposed to simply read, in the form\n" + " 1/.\n" + " (default: 1 i.e. all pages are written to.)\n"); + printf(" -v: specify the number of vCPUs to run.\n"); + puts(""); + exit(0); +} + +int main(int argc, char *argv[]) +{ + unsigned long iterations = TEST_HOST_LOOP_N; + bool mode_selected = false; + uint64_t phys_offset = 0; + unsigned int mode; + int opt, i; + int wr_fract = 1; + +#ifdef USE_CLEAR_DIRTY_LOG + dirty_log_manual_caps = + kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); + if (!dirty_log_manual_caps) { + print_skip("KVM_CLEAR_DIRTY_LOG not available"); + exit(KSFT_SKIP); + } + dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | + KVM_DIRTY_LOG_INITIALLY_SET); +#endif + +#ifdef __x86_64__ + guest_mode_init(VM_MODE_PXXV48_4K, true, true); +#endif +#ifdef __aarch64__ + guest_mode_init(VM_MODE_P40V48_4K, true, true); + guest_mode_init(VM_MODE_P40V48_64K, true, true); + + { + unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE); + + if (limit >= 52) + guest_mode_init(VM_MODE_P52V48_64K, true, true); + if (limit >= 48) { + guest_mode_init(VM_MODE_P48V48_4K, true, true); + guest_mode_init(VM_MODE_P48V48_64K, true, true); + } + } +#endif +#ifdef __s390x__ + guest_mode_init(VM_MODE_P40V48_4K, true, true); +#endif + + while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:")) != -1) { + switch (opt) { + case 'i': + iterations = strtol(optarg, NULL, 10); + break; + case 'p': + phys_offset = strtoull(optarg, NULL, 0); + break; + case 'm': + if (!mode_selected) { + for (i = 0; i < NUM_VM_MODES; ++i) + guest_modes[i].enabled = false; + mode_selected = true; + } + mode = strtoul(optarg, NULL, 10); + TEST_ASSERT(mode < NUM_VM_MODES, + "Guest mode ID %d too big", mode); + guest_modes[mode].enabled = true; + break; + case 'b': + guest_percpu_mem_size = parse_size(optarg); + break; + case 'f': + wr_fract = atoi(optarg); + TEST_ASSERT(wr_fract >= 1, + "Write fraction cannot be less than one"); + break; + case 'v': + nr_vcpus = atoi(optarg); + TEST_ASSERT(nr_vcpus > 0, + "Must have a positive number of vCPUs"); + TEST_ASSERT(nr_vcpus <= MAX_VCPUS, + "This test does not currently support\n" + "more than %d vCPUs.", MAX_VCPUS); + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + TEST_ASSERT(iterations >= 2, "The test should have at least two iterations"); + + pr_info("Test iterations: %"PRIu64"\n", iterations); + + for (i = 0; i < NUM_VM_MODES; ++i) { + if (!guest_modes[i].enabled) + continue; + TEST_ASSERT(guest_modes[i].supported, + "Guest mode ID %d (%s) not supported.", + i, vm_guest_mode_string(i)); + run_test(i, iterations, phys_offset, wr_fract); + } + + return 0; +} diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c new file mode 100644 index 000000000..54da9cc20 --- /dev/null +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -0,0 +1,639 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM dirty page logging test + * + * Copyright (C) 2018, Red Hat, Inc. + */ + +#define _GNU_SOURCE /* for program_invocation_name */ + +#include +#include +#include +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#define VCPU_ID 1 + +/* The memory slot index to track dirty pages */ +#define TEST_MEM_SLOT_INDEX 1 + +/* Default guest test virtual memory offset */ +#define DEFAULT_GUEST_TEST_MEM 0xc0000000 + +/* How many pages to dirty for each guest loop */ +#define TEST_PAGES_PER_LOOP 1024 + +/* How many host loops to run (one KVM_GET_DIRTY_LOG for each loop) */ +#define TEST_HOST_LOOP_N 32UL + +/* Interval for each host loop (ms) */ +#define TEST_HOST_LOOP_INTERVAL 10UL + +/* Dirty bitmaps are always little endian, so we need to swap on big endian */ +#if defined(__s390x__) +# define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7) +# define test_bit_le(nr, addr) \ + test_bit((nr) ^ BITOP_LE_SWIZZLE, addr) +# define set_bit_le(nr, addr) \ + set_bit((nr) ^ BITOP_LE_SWIZZLE, addr) +# define clear_bit_le(nr, addr) \ + clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr) +# define test_and_set_bit_le(nr, addr) \ + test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, addr) +# define test_and_clear_bit_le(nr, addr) \ + test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr) +#else +# define test_bit_le test_bit +# define set_bit_le set_bit +# define clear_bit_le clear_bit +# define test_and_set_bit_le test_and_set_bit +# define test_and_clear_bit_le test_and_clear_bit +#endif + +/* + * Guest/Host shared variables. Ensure addr_gva2hva() and/or + * sync_global_to/from_guest() are used when accessing from + * the host. READ/WRITE_ONCE() should also be used with anything + * that may change. + */ +static uint64_t host_page_size; +static uint64_t guest_page_size; +static uint64_t guest_num_pages; +static uint64_t random_array[TEST_PAGES_PER_LOOP]; +static uint64_t iteration; + +/* + * Guest physical memory offset of the testing memory slot. + * This will be set to the topmost valid physical address minus + * the test memory size. + */ +static uint64_t guest_test_phys_mem; + +/* + * Guest virtual memory offset of the testing memory slot. + * Must not conflict with identity mapped test code. + */ +static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; + +/* + * Continuously write to the first 8 bytes of a random pages within + * the testing memory region. + */ +static void guest_code(void) +{ + uint64_t addr; + int i; + + /* + * On s390x, all pages of a 1M segment are initially marked as dirty + * when a page of the segment is written to for the very first time. + * To compensate this specialty in this test, we need to touch all + * pages during the first iteration. + */ + for (i = 0; i < guest_num_pages; i++) { + addr = guest_test_virt_mem + i * guest_page_size; + *(uint64_t *)addr = READ_ONCE(iteration); + } + + while (true) { + for (i = 0; i < TEST_PAGES_PER_LOOP; i++) { + addr = guest_test_virt_mem; + addr += (READ_ONCE(random_array[i]) % guest_num_pages) + * guest_page_size; + addr &= ~(host_page_size - 1); + *(uint64_t *)addr = READ_ONCE(iteration); + } + + /* Tell the host that we need more random numbers */ + GUEST_SYNC(1); + } +} + +/* Host variables */ +static bool host_quit; + +/* Points to the test VM memory region on which we track dirty logs */ +static void *host_test_mem; +static uint64_t host_num_pages; + +/* For statistics only */ +static uint64_t host_dirty_count; +static uint64_t host_clear_count; +static uint64_t host_track_next_count; + +enum log_mode_t { + /* Only use KVM_GET_DIRTY_LOG for logging */ + LOG_MODE_DIRTY_LOG = 0, + + /* Use both KVM_[GET|CLEAR]_DIRTY_LOG for logging */ + LOG_MODE_CLEAR_LOG = 1, + + LOG_MODE_NUM, + + /* Run all supported modes */ + LOG_MODE_ALL = LOG_MODE_NUM, +}; + +/* Mode of logging to test. Default is to run all supported modes */ +static enum log_mode_t host_log_mode_option = LOG_MODE_ALL; +/* Logging mode for current run */ +static enum log_mode_t host_log_mode; + +static bool clear_log_supported(void) +{ + return kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); +} + +static void clear_log_create_vm_done(struct kvm_vm *vm) +{ + struct kvm_enable_cap cap = {}; + u64 manual_caps; + + manual_caps = kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); + TEST_ASSERT(manual_caps, "MANUAL_CAPS is zero!"); + manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | + KVM_DIRTY_LOG_INITIALLY_SET); + cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2; + cap.args[0] = manual_caps; + vm_enable_cap(vm, &cap); +} + +static void dirty_log_collect_dirty_pages(struct kvm_vm *vm, int slot, + void *bitmap, uint32_t num_pages) +{ + kvm_vm_get_dirty_log(vm, slot, bitmap); +} + +static void clear_log_collect_dirty_pages(struct kvm_vm *vm, int slot, + void *bitmap, uint32_t num_pages) +{ + kvm_vm_get_dirty_log(vm, slot, bitmap); + kvm_vm_clear_dirty_log(vm, slot, bitmap, 0, num_pages); +} + +struct log_mode { + const char *name; + /* Return true if this mode is supported, otherwise false */ + bool (*supported)(void); + /* Hook when the vm creation is done (before vcpu creation) */ + void (*create_vm_done)(struct kvm_vm *vm); + /* Hook to collect the dirty pages into the bitmap provided */ + void (*collect_dirty_pages) (struct kvm_vm *vm, int slot, + void *bitmap, uint32_t num_pages); +} log_modes[LOG_MODE_NUM] = { + { + .name = "dirty-log", + .collect_dirty_pages = dirty_log_collect_dirty_pages, + }, + { + .name = "clear-log", + .supported = clear_log_supported, + .create_vm_done = clear_log_create_vm_done, + .collect_dirty_pages = clear_log_collect_dirty_pages, + }, +}; + +/* + * We use this bitmap to track some pages that should have its dirty + * bit set in the _next_ iteration. For example, if we detected the + * page value changed to current iteration but at the same time the + * page bit is cleared in the latest bitmap, then the system must + * report that write in the next get dirty log call. + */ +static unsigned long *host_bmap_track; + +static void log_modes_dump(void) +{ + int i; + + printf("all"); + for (i = 0; i < LOG_MODE_NUM; i++) + printf(", %s", log_modes[i].name); + printf("\n"); +} + +static bool log_mode_supported(void) +{ + struct log_mode *mode = &log_modes[host_log_mode]; + + if (mode->supported) + return mode->supported(); + + return true; +} + +static void log_mode_create_vm_done(struct kvm_vm *vm) +{ + struct log_mode *mode = &log_modes[host_log_mode]; + + if (mode->create_vm_done) + mode->create_vm_done(vm); +} + +static void log_mode_collect_dirty_pages(struct kvm_vm *vm, int slot, + void *bitmap, uint32_t num_pages) +{ + struct log_mode *mode = &log_modes[host_log_mode]; + + TEST_ASSERT(mode->collect_dirty_pages != NULL, + "collect_dirty_pages() is required for any log mode!"); + mode->collect_dirty_pages(vm, slot, bitmap, num_pages); +} + +static void generate_random_array(uint64_t *guest_array, uint64_t size) +{ + uint64_t i; + + for (i = 0; i < size; i++) + guest_array[i] = random(); +} + +static void *vcpu_worker(void *data) +{ + int ret; + struct kvm_vm *vm = data; + uint64_t *guest_array; + uint64_t pages_count = 0; + struct kvm_run *run; + + run = vcpu_state(vm, VCPU_ID); + + guest_array = addr_gva2hva(vm, (vm_vaddr_t)random_array); + generate_random_array(guest_array, TEST_PAGES_PER_LOOP); + + while (!READ_ONCE(host_quit)) { + /* Let the guest dirty the random pages */ + ret = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); + if (get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC) { + pages_count += TEST_PAGES_PER_LOOP; + generate_random_array(guest_array, TEST_PAGES_PER_LOOP); + } else { + TEST_FAIL("Invalid guest sync status: " + "exit_reason=%s\n", + exit_reason_str(run->exit_reason)); + } + } + + pr_info("Dirtied %"PRIu64" pages\n", pages_count); + + return NULL; +} + +static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap) +{ + uint64_t step = vm_num_host_pages(mode, 1); + uint64_t page; + uint64_t *value_ptr; + + for (page = 0; page < host_num_pages; page += step) { + value_ptr = host_test_mem + page * host_page_size; + + /* If this is a special page that we were tracking... */ + if (test_and_clear_bit_le(page, host_bmap_track)) { + host_track_next_count++; + TEST_ASSERT(test_bit_le(page, bmap), + "Page %"PRIu64" should have its dirty bit " + "set in this iteration but it is missing", + page); + } + + if (test_and_clear_bit_le(page, bmap)) { + host_dirty_count++; + /* + * If the bit is set, the value written onto + * the corresponding page should be either the + * previous iteration number or the current one. + */ + TEST_ASSERT(*value_ptr == iteration || + *value_ptr == iteration - 1, + "Set page %"PRIu64" value %"PRIu64 + " incorrect (iteration=%"PRIu64")", + page, *value_ptr, iteration); + } else { + host_clear_count++; + /* + * If cleared, the value written can be any + * value smaller or equals to the iteration + * number. Note that the value can be exactly + * (iteration-1) if that write can happen + * like this: + * + * (1) increase loop count to "iteration-1" + * (2) write to page P happens (with value + * "iteration-1") + * (3) get dirty log for "iteration-1"; we'll + * see that page P bit is set (dirtied), + * and not set the bit in host_bmap_track + * (4) increase loop count to "iteration" + * (which is current iteration) + * (5) get dirty log for current iteration, + * we'll see that page P is cleared, with + * value "iteration-1". + */ + TEST_ASSERT(*value_ptr <= iteration, + "Clear page %"PRIu64" value %"PRIu64 + " incorrect (iteration=%"PRIu64")", + page, *value_ptr, iteration); + if (*value_ptr == iteration) { + /* + * This page is _just_ modified; it + * should report its dirtyness in the + * next run + */ + set_bit_le(page, host_bmap_track); + } + } + } +} + +static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, + uint64_t extra_mem_pages, void *guest_code) +{ + struct kvm_vm *vm; + uint64_t extra_pg_pages = extra_mem_pages / 512 * 2; + + pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); + + vm = vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); + kvm_vm_elf_load(vm, program_invocation_name, 0, 0); +#ifdef __x86_64__ + vm_create_irqchip(vm); +#endif + log_mode_create_vm_done(vm); + vm_vcpu_add_default(vm, vcpuid, guest_code); + return vm; +} + +#define DIRTY_MEM_BITS 30 /* 1G */ +#define PAGE_SHIFT_4K 12 + +static void run_test(enum vm_guest_mode mode, unsigned long iterations, + unsigned long interval, uint64_t phys_offset) +{ + pthread_t vcpu_thread; + struct kvm_vm *vm; + unsigned long *bmap; + + if (!log_mode_supported()) { + print_skip("Log mode '%s' not supported", + log_modes[host_log_mode].name); + return; + } + + /* + * We reserve page table for 2 times of extra dirty mem which + * will definitely cover the original (1G+) test range. Here + * we do the calculation with 4K page size which is the + * smallest so the page number will be enough for all archs + * (e.g., 64K page size guest will need even less memory for + * page tables). + */ + vm = create_vm(mode, VCPU_ID, + 2ul << (DIRTY_MEM_BITS - PAGE_SHIFT_4K), + guest_code); + + guest_page_size = vm_get_page_size(vm); + /* + * A little more than 1G of guest page sized pages. Cover the + * case where the size is not aligned to 64 pages. + */ + guest_num_pages = (1ul << (DIRTY_MEM_BITS - + vm_get_page_shift(vm))) + 3; + guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); + + host_page_size = getpagesize(); + host_num_pages = vm_num_host_pages(mode, guest_num_pages); + + if (!phys_offset) { + guest_test_phys_mem = (vm_get_max_gfn(vm) - + guest_num_pages) * guest_page_size; + guest_test_phys_mem &= ~(host_page_size - 1); + } else { + guest_test_phys_mem = phys_offset; + } + +#ifdef __s390x__ + /* Align to 1M (segment size) */ + guest_test_phys_mem &= ~((1 << 20) - 1); +#endif + + pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); + + bmap = bitmap_alloc(host_num_pages); + host_bmap_track = bitmap_alloc(host_num_pages); + + /* Add an extra memory slot for testing dirty logging */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + guest_test_phys_mem, + TEST_MEM_SLOT_INDEX, + guest_num_pages, + KVM_MEM_LOG_DIRTY_PAGES); + + /* Do mapping for the dirty track memory slot */ + virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0); + + /* Cache the HVA pointer of the region */ + host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem); + +#ifdef __x86_64__ + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); +#endif + ucall_init(vm, NULL); + + /* Export the shared variables to the guest */ + sync_global_to_guest(vm, host_page_size); + sync_global_to_guest(vm, guest_page_size); + sync_global_to_guest(vm, guest_test_virt_mem); + sync_global_to_guest(vm, guest_num_pages); + + /* Start the iterations */ + iteration = 1; + sync_global_to_guest(vm, iteration); + host_quit = false; + host_dirty_count = 0; + host_clear_count = 0; + host_track_next_count = 0; + + pthread_create(&vcpu_thread, NULL, vcpu_worker, vm); + + while (iteration < iterations) { + /* Give the vcpu thread some time to dirty some pages */ + usleep(interval * 1000); + log_mode_collect_dirty_pages(vm, TEST_MEM_SLOT_INDEX, + bmap, host_num_pages); + vm_dirty_log_verify(mode, bmap); + iteration++; + sync_global_to_guest(vm, iteration); + } + + /* Tell the vcpu thread to quit */ + host_quit = true; + pthread_join(vcpu_thread, NULL); + + pr_info("Total bits checked: dirty (%"PRIu64"), clear (%"PRIu64"), " + "track_next (%"PRIu64")\n", host_dirty_count, host_clear_count, + host_track_next_count); + + free(bmap); + free(host_bmap_track); + ucall_uninit(vm); + kvm_vm_free(vm); +} + +struct guest_mode { + bool supported; + bool enabled; +}; +static struct guest_mode guest_modes[NUM_VM_MODES]; + +#define guest_mode_init(mode, supported, enabled) ({ \ + guest_modes[mode] = (struct guest_mode){ supported, enabled }; \ +}) + +static void help(char *name) +{ + int i; + + puts(""); + printf("usage: %s [-h] [-i iterations] [-I interval] " + "[-p offset] [-m mode]\n", name); + puts(""); + printf(" -i: specify iteration counts (default: %"PRIu64")\n", + TEST_HOST_LOOP_N); + printf(" -I: specify interval in ms (default: %"PRIu64" ms)\n", + TEST_HOST_LOOP_INTERVAL); + printf(" -p: specify guest physical test memory offset\n" + " Warning: a low offset can conflict with the loaded test code.\n"); + printf(" -M: specify the host logging mode " + "(default: run all log modes). Supported modes: \n\t"); + log_modes_dump(); + printf(" -m: specify the guest mode ID to test " + "(default: test all supported modes)\n" + " This option may be used multiple times.\n" + " Guest mode IDs:\n"); + for (i = 0; i < NUM_VM_MODES; ++i) { + printf(" %d: %s%s\n", i, vm_guest_mode_string(i), + guest_modes[i].supported ? " (supported)" : ""); + } + puts(""); + exit(0); +} + +int main(int argc, char *argv[]) +{ + unsigned long iterations = TEST_HOST_LOOP_N; + unsigned long interval = TEST_HOST_LOOP_INTERVAL; + bool mode_selected = false; + uint64_t phys_offset = 0; + unsigned int mode; + int opt, i, j; + +#ifdef __x86_64__ + guest_mode_init(VM_MODE_PXXV48_4K, true, true); +#endif +#ifdef __aarch64__ + guest_mode_init(VM_MODE_P40V48_4K, true, true); + guest_mode_init(VM_MODE_P40V48_64K, true, true); + + { + unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE); + + if (limit >= 52) + guest_mode_init(VM_MODE_P52V48_64K, true, true); + if (limit >= 48) { + guest_mode_init(VM_MODE_P48V48_4K, true, true); + guest_mode_init(VM_MODE_P48V48_64K, true, true); + } + } +#endif +#ifdef __s390x__ + guest_mode_init(VM_MODE_P40V48_4K, true, true); +#endif + + while ((opt = getopt(argc, argv, "hi:I:p:m:M:")) != -1) { + switch (opt) { + case 'i': + iterations = strtol(optarg, NULL, 10); + break; + case 'I': + interval = strtol(optarg, NULL, 10); + break; + case 'p': + phys_offset = strtoull(optarg, NULL, 0); + break; + case 'm': + if (!mode_selected) { + for (i = 0; i < NUM_VM_MODES; ++i) + guest_modes[i].enabled = false; + mode_selected = true; + } + mode = strtoul(optarg, NULL, 10); + TEST_ASSERT(mode < NUM_VM_MODES, + "Guest mode ID %d too big", mode); + guest_modes[mode].enabled = true; + break; + case 'M': + if (!strcmp(optarg, "all")) { + host_log_mode_option = LOG_MODE_ALL; + break; + } + for (i = 0; i < LOG_MODE_NUM; i++) { + if (!strcmp(optarg, log_modes[i].name)) { + pr_info("Setting log mode to: '%s'\n", + optarg); + host_log_mode_option = i; + break; + } + } + if (i == LOG_MODE_NUM) { + printf("Log mode '%s' invalid. Please choose " + "from: ", optarg); + log_modes_dump(); + exit(1); + } + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + TEST_ASSERT(iterations > 2, "Iterations must be greater than two"); + TEST_ASSERT(interval > 0, "Interval must be greater than zero"); + + pr_info("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n", + iterations, interval); + + srandom(time(0)); + + for (i = 0; i < NUM_VM_MODES; ++i) { + if (!guest_modes[i].enabled) + continue; + TEST_ASSERT(guest_modes[i].supported, + "Guest mode ID %d (%s) not supported.", + i, vm_guest_mode_string(i)); + if (host_log_mode_option == LOG_MODE_ALL) { + /* Run each log mode */ + for (j = 0; j < LOG_MODE_NUM; j++) { + pr_info("Testing Log Mode '%s'\n", + log_modes[j].name); + host_log_mode = j; + run_test(i, iterations, interval, phys_offset); + } + } else { + host_log_mode = host_log_mode_option; + run_test(i, iterations, interval, phys_offset); + } + } + + return 0; +} diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h new file mode 100644 index 000000000..b7fa0c855 --- /dev/null +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AArch64 processor specific defines + * + * Copyright (C) 2018, Red Hat, Inc. + */ +#ifndef SELFTEST_KVM_PROCESSOR_H +#define SELFTEST_KVM_PROCESSOR_H + +#include "kvm_util.h" + + +#define ARM64_CORE_REG(x) (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \ + KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x)) + +#define CPACR_EL1 3, 0, 1, 0, 2 +#define TCR_EL1 3, 0, 2, 0, 2 +#define MAIR_EL1 3, 0, 10, 2, 0 +#define TTBR0_EL1 3, 0, 2, 0, 0 +#define SCTLR_EL1 3, 0, 1, 0, 0 + +/* + * Default MAIR + * index attribute + * DEVICE_nGnRnE 0 0000:0000 + * DEVICE_nGnRE 1 0000:0100 + * DEVICE_GRE 2 0000:1100 + * NORMAL_NC 3 0100:0100 + * NORMAL 4 1111:1111 + * NORMAL_WT 5 1011:1011 + */ +#define DEFAULT_MAIR_EL1 ((0x00ul << (0 * 8)) | \ + (0x04ul << (1 * 8)) | \ + (0x0cul << (2 * 8)) | \ + (0x44ul << (3 * 8)) | \ + (0xfful << (4 * 8)) | \ + (0xbbul << (5 * 8))) + +static inline void get_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, uint64_t *addr) +{ + struct kvm_one_reg reg; + reg.id = id; + reg.addr = (uint64_t)addr; + vcpu_ioctl(vm, vcpuid, KVM_GET_ONE_REG, ®); +} + +static inline void set_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, uint64_t val) +{ + struct kvm_one_reg reg; + reg.id = id; + reg.addr = (uint64_t)&val; + vcpu_ioctl(vm, vcpuid, KVM_SET_ONE_REG, ®); +} + +void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *init); +void aarch64_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_vcpu_init *init, void *guest_code); + +#endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/include/evmcs.h b/tools/testing/selftests/kvm/include/evmcs.h new file mode 100644 index 000000000..a034438b6 --- /dev/null +++ b/tools/testing/selftests/kvm/include/evmcs.h @@ -0,0 +1,1102 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * tools/testing/selftests/kvm/include/vmx.h + * + * Copyright (C) 2018, Red Hat, Inc. + * + */ + +#ifndef SELFTEST_KVM_EVMCS_H +#define SELFTEST_KVM_EVMCS_H + +#include +#include "vmx.h" + +#define u16 uint16_t +#define u32 uint32_t +#define u64 uint64_t + +#define EVMCS_VERSION 1 + +extern bool enable_evmcs; + +struct hv_vp_assist_page { + __u32 apic_assist; + __u32 reserved; + __u64 vtl_control[2]; + __u64 nested_enlightenments_control[2]; + __u32 enlighten_vmentry; + __u64 current_nested_vmcs; +}; + +struct hv_enlightened_vmcs { + u32 revision_id; + u32 abort; + + u16 host_es_selector; + u16 host_cs_selector; + u16 host_ss_selector; + u16 host_ds_selector; + u16 host_fs_selector; + u16 host_gs_selector; + u16 host_tr_selector; + + u64 host_ia32_pat; + u64 host_ia32_efer; + + u64 host_cr0; + u64 host_cr3; + u64 host_cr4; + + u64 host_ia32_sysenter_esp; + u64 host_ia32_sysenter_eip; + u64 host_rip; + u32 host_ia32_sysenter_cs; + + u32 pin_based_vm_exec_control; + u32 vm_exit_controls; + u32 secondary_vm_exec_control; + + u64 io_bitmap_a; + u64 io_bitmap_b; + u64 msr_bitmap; + + u16 guest_es_selector; + u16 guest_cs_selector; + u16 guest_ss_selector; + u16 guest_ds_selector; + u16 guest_fs_selector; + u16 guest_gs_selector; + u16 guest_ldtr_selector; + u16 guest_tr_selector; + + u32 guest_es_limit; + u32 guest_cs_limit; + u32 guest_ss_limit; + u32 guest_ds_limit; + u32 guest_fs_limit; + u32 guest_gs_limit; + u32 guest_ldtr_limit; + u32 guest_tr_limit; + u32 guest_gdtr_limit; + u32 guest_idtr_limit; + + u32 guest_es_ar_bytes; + u32 guest_cs_ar_bytes; + u32 guest_ss_ar_bytes; + u32 guest_ds_ar_bytes; + u32 guest_fs_ar_bytes; + u32 guest_gs_ar_bytes; + u32 guest_ldtr_ar_bytes; + u32 guest_tr_ar_bytes; + + u64 guest_es_base; + u64 guest_cs_base; + u64 guest_ss_base; + u64 guest_ds_base; + u64 guest_fs_base; + u64 guest_gs_base; + u64 guest_ldtr_base; + u64 guest_tr_base; + u64 guest_gdtr_base; + u64 guest_idtr_base; + + u64 padding64_1[3]; + + u64 vm_exit_msr_store_addr; + u64 vm_exit_msr_load_addr; + u64 vm_entry_msr_load_addr; + + u64 cr3_target_value0; + u64 cr3_target_value1; + u64 cr3_target_value2; + u64 cr3_target_value3; + + u32 page_fault_error_code_mask; + u32 page_fault_error_code_match; + + u32 cr3_target_count; + u32 vm_exit_msr_store_count; + u32 vm_exit_msr_load_count; + u32 vm_entry_msr_load_count; + + u64 tsc_offset; + u64 virtual_apic_page_addr; + u64 vmcs_link_pointer; + + u64 guest_ia32_debugctl; + u64 guest_ia32_pat; + u64 guest_ia32_efer; + + u64 guest_pdptr0; + u64 guest_pdptr1; + u64 guest_pdptr2; + u64 guest_pdptr3; + + u64 guest_pending_dbg_exceptions; + u64 guest_sysenter_esp; + u64 guest_sysenter_eip; + + u32 guest_activity_state; + u32 guest_sysenter_cs; + + u64 cr0_guest_host_mask; + u64 cr4_guest_host_mask; + u64 cr0_read_shadow; + u64 cr4_read_shadow; + u64 guest_cr0; + u64 guest_cr3; + u64 guest_cr4; + u64 guest_dr7; + + u64 host_fs_base; + u64 host_gs_base; + u64 host_tr_base; + u64 host_gdtr_base; + u64 host_idtr_base; + u64 host_rsp; + + u64 ept_pointer; + + u16 virtual_processor_id; + u16 padding16[3]; + + u64 padding64_2[5]; + u64 guest_physical_address; + + u32 vm_instruction_error; + u32 vm_exit_reason; + u32 vm_exit_intr_info; + u32 vm_exit_intr_error_code; + u32 idt_vectoring_info_field; + u32 idt_vectoring_error_code; + u32 vm_exit_instruction_len; + u32 vmx_instruction_info; + + u64 exit_qualification; + u64 exit_io_instruction_ecx; + u64 exit_io_instruction_esi; + u64 exit_io_instruction_edi; + u64 exit_io_instruction_eip; + + u64 guest_linear_address; + u64 guest_rsp; + u64 guest_rflags; + + u32 guest_interruptibility_info; + u32 cpu_based_vm_exec_control; + u32 exception_bitmap; + u32 vm_entry_controls; + u32 vm_entry_intr_info_field; + u32 vm_entry_exception_error_code; + u32 vm_entry_instruction_len; + u32 tpr_threshold; + + u64 guest_rip; + + u32 hv_clean_fields; + u32 hv_padding_32; + u32 hv_synthetic_controls; + struct { + u32 nested_flush_hypercall:1; + u32 msr_bitmap:1; + u32 reserved:30; + } hv_enlightenments_control; + u32 hv_vp_id; + + u64 hv_vm_id; + u64 partition_assist_page; + u64 padding64_4[4]; + u64 guest_bndcfgs; + u64 padding64_5[7]; + u64 xss_exit_bitmap; + u64 padding64_6[7]; +}; + +#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073 +#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 +#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) + +extern struct hv_enlightened_vmcs *current_evmcs; +extern struct hv_vp_assist_page *current_vp_assist; + +int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id); + +static inline int enable_vp_assist(uint64_t vp_assist_pa, void *vp_assist) +{ + u64 val = (vp_assist_pa & HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK) | + HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; + + wrmsr(HV_X64_MSR_VP_ASSIST_PAGE, val); + + current_vp_assist = vp_assist; + + enable_evmcs = true; + + return 0; +} + +static inline int evmcs_vmptrld(uint64_t vmcs_pa, void *vmcs) +{ + current_vp_assist->current_nested_vmcs = vmcs_pa; + current_vp_assist->enlighten_vmentry = 1; + + current_evmcs = vmcs; + + return 0; +} + +static inline int evmcs_vmptrst(uint64_t *value) +{ + *value = current_vp_assist->current_nested_vmcs & + ~HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; + + return 0; +} + +static inline int evmcs_vmread(uint64_t encoding, uint64_t *value) +{ + switch (encoding) { + case GUEST_RIP: + *value = current_evmcs->guest_rip; + break; + case GUEST_RSP: + *value = current_evmcs->guest_rsp; + break; + case GUEST_RFLAGS: + *value = current_evmcs->guest_rflags; + break; + case HOST_IA32_PAT: + *value = current_evmcs->host_ia32_pat; + break; + case HOST_IA32_EFER: + *value = current_evmcs->host_ia32_efer; + break; + case HOST_CR0: + *value = current_evmcs->host_cr0; + break; + case HOST_CR3: + *value = current_evmcs->host_cr3; + break; + case HOST_CR4: + *value = current_evmcs->host_cr4; + break; + case HOST_IA32_SYSENTER_ESP: + *value = current_evmcs->host_ia32_sysenter_esp; + break; + case HOST_IA32_SYSENTER_EIP: + *value = current_evmcs->host_ia32_sysenter_eip; + break; + case HOST_RIP: + *value = current_evmcs->host_rip; + break; + case IO_BITMAP_A: + *value = current_evmcs->io_bitmap_a; + break; + case IO_BITMAP_B: + *value = current_evmcs->io_bitmap_b; + break; + case MSR_BITMAP: + *value = current_evmcs->msr_bitmap; + break; + case GUEST_ES_BASE: + *value = current_evmcs->guest_es_base; + break; + case GUEST_CS_BASE: + *value = current_evmcs->guest_cs_base; + break; + case GUEST_SS_BASE: + *value = current_evmcs->guest_ss_base; + break; + case GUEST_DS_BASE: + *value = current_evmcs->guest_ds_base; + break; + case GUEST_FS_BASE: + *value = current_evmcs->guest_fs_base; + break; + case GUEST_GS_BASE: + *value = current_evmcs->guest_gs_base; + break; + case GUEST_LDTR_BASE: + *value = current_evmcs->guest_ldtr_base; + break; + case GUEST_TR_BASE: + *value = current_evmcs->guest_tr_base; + break; + case GUEST_GDTR_BASE: + *value = current_evmcs->guest_gdtr_base; + break; + case GUEST_IDTR_BASE: + *value = current_evmcs->guest_idtr_base; + break; + case TSC_OFFSET: + *value = current_evmcs->tsc_offset; + break; + case VIRTUAL_APIC_PAGE_ADDR: + *value = current_evmcs->virtual_apic_page_addr; + break; + case VMCS_LINK_POINTER: + *value = current_evmcs->vmcs_link_pointer; + break; + case GUEST_IA32_DEBUGCTL: + *value = current_evmcs->guest_ia32_debugctl; + break; + case GUEST_IA32_PAT: + *value = current_evmcs->guest_ia32_pat; + break; + case GUEST_IA32_EFER: + *value = current_evmcs->guest_ia32_efer; + break; + case GUEST_PDPTR0: + *value = current_evmcs->guest_pdptr0; + break; + case GUEST_PDPTR1: + *value = current_evmcs->guest_pdptr1; + break; + case GUEST_PDPTR2: + *value = current_evmcs->guest_pdptr2; + break; + case GUEST_PDPTR3: + *value = current_evmcs->guest_pdptr3; + break; + case GUEST_PENDING_DBG_EXCEPTIONS: + *value = current_evmcs->guest_pending_dbg_exceptions; + break; + case GUEST_SYSENTER_ESP: + *value = current_evmcs->guest_sysenter_esp; + break; + case GUEST_SYSENTER_EIP: + *value = current_evmcs->guest_sysenter_eip; + break; + case CR0_GUEST_HOST_MASK: + *value = current_evmcs->cr0_guest_host_mask; + break; + case CR4_GUEST_HOST_MASK: + *value = current_evmcs->cr4_guest_host_mask; + break; + case CR0_READ_SHADOW: + *value = current_evmcs->cr0_read_shadow; + break; + case CR4_READ_SHADOW: + *value = current_evmcs->cr4_read_shadow; + break; + case GUEST_CR0: + *value = current_evmcs->guest_cr0; + break; + case GUEST_CR3: + *value = current_evmcs->guest_cr3; + break; + case GUEST_CR4: + *value = current_evmcs->guest_cr4; + break; + case GUEST_DR7: + *value = current_evmcs->guest_dr7; + break; + case HOST_FS_BASE: + *value = current_evmcs->host_fs_base; + break; + case HOST_GS_BASE: + *value = current_evmcs->host_gs_base; + break; + case HOST_TR_BASE: + *value = current_evmcs->host_tr_base; + break; + case HOST_GDTR_BASE: + *value = current_evmcs->host_gdtr_base; + break; + case HOST_IDTR_BASE: + *value = current_evmcs->host_idtr_base; + break; + case HOST_RSP: + *value = current_evmcs->host_rsp; + break; + case EPT_POINTER: + *value = current_evmcs->ept_pointer; + break; + case GUEST_BNDCFGS: + *value = current_evmcs->guest_bndcfgs; + break; + case XSS_EXIT_BITMAP: + *value = current_evmcs->xss_exit_bitmap; + break; + case GUEST_PHYSICAL_ADDRESS: + *value = current_evmcs->guest_physical_address; + break; + case EXIT_QUALIFICATION: + *value = current_evmcs->exit_qualification; + break; + case GUEST_LINEAR_ADDRESS: + *value = current_evmcs->guest_linear_address; + break; + case VM_EXIT_MSR_STORE_ADDR: + *value = current_evmcs->vm_exit_msr_store_addr; + break; + case VM_EXIT_MSR_LOAD_ADDR: + *value = current_evmcs->vm_exit_msr_load_addr; + break; + case VM_ENTRY_MSR_LOAD_ADDR: + *value = current_evmcs->vm_entry_msr_load_addr; + break; + case CR3_TARGET_VALUE0: + *value = current_evmcs->cr3_target_value0; + break; + case CR3_TARGET_VALUE1: + *value = current_evmcs->cr3_target_value1; + break; + case CR3_TARGET_VALUE2: + *value = current_evmcs->cr3_target_value2; + break; + case CR3_TARGET_VALUE3: + *value = current_evmcs->cr3_target_value3; + break; + case TPR_THRESHOLD: + *value = current_evmcs->tpr_threshold; + break; + case GUEST_INTERRUPTIBILITY_INFO: + *value = current_evmcs->guest_interruptibility_info; + break; + case CPU_BASED_VM_EXEC_CONTROL: + *value = current_evmcs->cpu_based_vm_exec_control; + break; + case EXCEPTION_BITMAP: + *value = current_evmcs->exception_bitmap; + break; + case VM_ENTRY_CONTROLS: + *value = current_evmcs->vm_entry_controls; + break; + case VM_ENTRY_INTR_INFO_FIELD: + *value = current_evmcs->vm_entry_intr_info_field; + break; + case VM_ENTRY_EXCEPTION_ERROR_CODE: + *value = current_evmcs->vm_entry_exception_error_code; + break; + case VM_ENTRY_INSTRUCTION_LEN: + *value = current_evmcs->vm_entry_instruction_len; + break; + case HOST_IA32_SYSENTER_CS: + *value = current_evmcs->host_ia32_sysenter_cs; + break; + case PIN_BASED_VM_EXEC_CONTROL: + *value = current_evmcs->pin_based_vm_exec_control; + break; + case VM_EXIT_CONTROLS: + *value = current_evmcs->vm_exit_controls; + break; + case SECONDARY_VM_EXEC_CONTROL: + *value = current_evmcs->secondary_vm_exec_control; + break; + case GUEST_ES_LIMIT: + *value = current_evmcs->guest_es_limit; + break; + case GUEST_CS_LIMIT: + *value = current_evmcs->guest_cs_limit; + break; + case GUEST_SS_LIMIT: + *value = current_evmcs->guest_ss_limit; + break; + case GUEST_DS_LIMIT: + *value = current_evmcs->guest_ds_limit; + break; + case GUEST_FS_LIMIT: + *value = current_evmcs->guest_fs_limit; + break; + case GUEST_GS_LIMIT: + *value = current_evmcs->guest_gs_limit; + break; + case GUEST_LDTR_LIMIT: + *value = current_evmcs->guest_ldtr_limit; + break; + case GUEST_TR_LIMIT: + *value = current_evmcs->guest_tr_limit; + break; + case GUEST_GDTR_LIMIT: + *value = current_evmcs->guest_gdtr_limit; + break; + case GUEST_IDTR_LIMIT: + *value = current_evmcs->guest_idtr_limit; + break; + case GUEST_ES_AR_BYTES: + *value = current_evmcs->guest_es_ar_bytes; + break; + case GUEST_CS_AR_BYTES: + *value = current_evmcs->guest_cs_ar_bytes; + break; + case GUEST_SS_AR_BYTES: + *value = current_evmcs->guest_ss_ar_bytes; + break; + case GUEST_DS_AR_BYTES: + *value = current_evmcs->guest_ds_ar_bytes; + break; + case GUEST_FS_AR_BYTES: + *value = current_evmcs->guest_fs_ar_bytes; + break; + case GUEST_GS_AR_BYTES: + *value = current_evmcs->guest_gs_ar_bytes; + break; + case GUEST_LDTR_AR_BYTES: + *value = current_evmcs->guest_ldtr_ar_bytes; + break; + case GUEST_TR_AR_BYTES: + *value = current_evmcs->guest_tr_ar_bytes; + break; + case GUEST_ACTIVITY_STATE: + *value = current_evmcs->guest_activity_state; + break; + case GUEST_SYSENTER_CS: + *value = current_evmcs->guest_sysenter_cs; + break; + case VM_INSTRUCTION_ERROR: + *value = current_evmcs->vm_instruction_error; + break; + case VM_EXIT_REASON: + *value = current_evmcs->vm_exit_reason; + break; + case VM_EXIT_INTR_INFO: + *value = current_evmcs->vm_exit_intr_info; + break; + case VM_EXIT_INTR_ERROR_CODE: + *value = current_evmcs->vm_exit_intr_error_code; + break; + case IDT_VECTORING_INFO_FIELD: + *value = current_evmcs->idt_vectoring_info_field; + break; + case IDT_VECTORING_ERROR_CODE: + *value = current_evmcs->idt_vectoring_error_code; + break; + case VM_EXIT_INSTRUCTION_LEN: + *value = current_evmcs->vm_exit_instruction_len; + break; + case VMX_INSTRUCTION_INFO: + *value = current_evmcs->vmx_instruction_info; + break; + case PAGE_FAULT_ERROR_CODE_MASK: + *value = current_evmcs->page_fault_error_code_mask; + break; + case PAGE_FAULT_ERROR_CODE_MATCH: + *value = current_evmcs->page_fault_error_code_match; + break; + case CR3_TARGET_COUNT: + *value = current_evmcs->cr3_target_count; + break; + case VM_EXIT_MSR_STORE_COUNT: + *value = current_evmcs->vm_exit_msr_store_count; + break; + case VM_EXIT_MSR_LOAD_COUNT: + *value = current_evmcs->vm_exit_msr_load_count; + break; + case VM_ENTRY_MSR_LOAD_COUNT: + *value = current_evmcs->vm_entry_msr_load_count; + break; + case HOST_ES_SELECTOR: + *value = current_evmcs->host_es_selector; + break; + case HOST_CS_SELECTOR: + *value = current_evmcs->host_cs_selector; + break; + case HOST_SS_SELECTOR: + *value = current_evmcs->host_ss_selector; + break; + case HOST_DS_SELECTOR: + *value = current_evmcs->host_ds_selector; + break; + case HOST_FS_SELECTOR: + *value = current_evmcs->host_fs_selector; + break; + case HOST_GS_SELECTOR: + *value = current_evmcs->host_gs_selector; + break; + case HOST_TR_SELECTOR: + *value = current_evmcs->host_tr_selector; + break; + case GUEST_ES_SELECTOR: + *value = current_evmcs->guest_es_selector; + break; + case GUEST_CS_SELECTOR: + *value = current_evmcs->guest_cs_selector; + break; + case GUEST_SS_SELECTOR: + *value = current_evmcs->guest_ss_selector; + break; + case GUEST_DS_SELECTOR: + *value = current_evmcs->guest_ds_selector; + break; + case GUEST_FS_SELECTOR: + *value = current_evmcs->guest_fs_selector; + break; + case GUEST_GS_SELECTOR: + *value = current_evmcs->guest_gs_selector; + break; + case GUEST_LDTR_SELECTOR: + *value = current_evmcs->guest_ldtr_selector; + break; + case GUEST_TR_SELECTOR: + *value = current_evmcs->guest_tr_selector; + break; + case VIRTUAL_PROCESSOR_ID: + *value = current_evmcs->virtual_processor_id; + break; + default: return 1; + } + + return 0; +} + +static inline int evmcs_vmwrite(uint64_t encoding, uint64_t value) +{ + switch (encoding) { + case GUEST_RIP: + current_evmcs->guest_rip = value; + break; + case GUEST_RSP: + current_evmcs->guest_rsp = value; + break; + case GUEST_RFLAGS: + current_evmcs->guest_rflags = value; + break; + case HOST_IA32_PAT: + current_evmcs->host_ia32_pat = value; + break; + case HOST_IA32_EFER: + current_evmcs->host_ia32_efer = value; + break; + case HOST_CR0: + current_evmcs->host_cr0 = value; + break; + case HOST_CR3: + current_evmcs->host_cr3 = value; + break; + case HOST_CR4: + current_evmcs->host_cr4 = value; + break; + case HOST_IA32_SYSENTER_ESP: + current_evmcs->host_ia32_sysenter_esp = value; + break; + case HOST_IA32_SYSENTER_EIP: + current_evmcs->host_ia32_sysenter_eip = value; + break; + case HOST_RIP: + current_evmcs->host_rip = value; + break; + case IO_BITMAP_A: + current_evmcs->io_bitmap_a = value; + break; + case IO_BITMAP_B: + current_evmcs->io_bitmap_b = value; + break; + case MSR_BITMAP: + current_evmcs->msr_bitmap = value; + break; + case GUEST_ES_BASE: + current_evmcs->guest_es_base = value; + break; + case GUEST_CS_BASE: + current_evmcs->guest_cs_base = value; + break; + case GUEST_SS_BASE: + current_evmcs->guest_ss_base = value; + break; + case GUEST_DS_BASE: + current_evmcs->guest_ds_base = value; + break; + case GUEST_FS_BASE: + current_evmcs->guest_fs_base = value; + break; + case GUEST_GS_BASE: + current_evmcs->guest_gs_base = value; + break; + case GUEST_LDTR_BASE: + current_evmcs->guest_ldtr_base = value; + break; + case GUEST_TR_BASE: + current_evmcs->guest_tr_base = value; + break; + case GUEST_GDTR_BASE: + current_evmcs->guest_gdtr_base = value; + break; + case GUEST_IDTR_BASE: + current_evmcs->guest_idtr_base = value; + break; + case TSC_OFFSET: + current_evmcs->tsc_offset = value; + break; + case VIRTUAL_APIC_PAGE_ADDR: + current_evmcs->virtual_apic_page_addr = value; + break; + case VMCS_LINK_POINTER: + current_evmcs->vmcs_link_pointer = value; + break; + case GUEST_IA32_DEBUGCTL: + current_evmcs->guest_ia32_debugctl = value; + break; + case GUEST_IA32_PAT: + current_evmcs->guest_ia32_pat = value; + break; + case GUEST_IA32_EFER: + current_evmcs->guest_ia32_efer = value; + break; + case GUEST_PDPTR0: + current_evmcs->guest_pdptr0 = value; + break; + case GUEST_PDPTR1: + current_evmcs->guest_pdptr1 = value; + break; + case GUEST_PDPTR2: + current_evmcs->guest_pdptr2 = value; + break; + case GUEST_PDPTR3: + current_evmcs->guest_pdptr3 = value; + break; + case GUEST_PENDING_DBG_EXCEPTIONS: + current_evmcs->guest_pending_dbg_exceptions = value; + break; + case GUEST_SYSENTER_ESP: + current_evmcs->guest_sysenter_esp = value; + break; + case GUEST_SYSENTER_EIP: + current_evmcs->guest_sysenter_eip = value; + break; + case CR0_GUEST_HOST_MASK: + current_evmcs->cr0_guest_host_mask = value; + break; + case CR4_GUEST_HOST_MASK: + current_evmcs->cr4_guest_host_mask = value; + break; + case CR0_READ_SHADOW: + current_evmcs->cr0_read_shadow = value; + break; + case CR4_READ_SHADOW: + current_evmcs->cr4_read_shadow = value; + break; + case GUEST_CR0: + current_evmcs->guest_cr0 = value; + break; + case GUEST_CR3: + current_evmcs->guest_cr3 = value; + break; + case GUEST_CR4: + current_evmcs->guest_cr4 = value; + break; + case GUEST_DR7: + current_evmcs->guest_dr7 = value; + break; + case HOST_FS_BASE: + current_evmcs->host_fs_base = value; + break; + case HOST_GS_BASE: + current_evmcs->host_gs_base = value; + break; + case HOST_TR_BASE: + current_evmcs->host_tr_base = value; + break; + case HOST_GDTR_BASE: + current_evmcs->host_gdtr_base = value; + break; + case HOST_IDTR_BASE: + current_evmcs->host_idtr_base = value; + break; + case HOST_RSP: + current_evmcs->host_rsp = value; + break; + case EPT_POINTER: + current_evmcs->ept_pointer = value; + break; + case GUEST_BNDCFGS: + current_evmcs->guest_bndcfgs = value; + break; + case XSS_EXIT_BITMAP: + current_evmcs->xss_exit_bitmap = value; + break; + case GUEST_PHYSICAL_ADDRESS: + current_evmcs->guest_physical_address = value; + break; + case EXIT_QUALIFICATION: + current_evmcs->exit_qualification = value; + break; + case GUEST_LINEAR_ADDRESS: + current_evmcs->guest_linear_address = value; + break; + case VM_EXIT_MSR_STORE_ADDR: + current_evmcs->vm_exit_msr_store_addr = value; + break; + case VM_EXIT_MSR_LOAD_ADDR: + current_evmcs->vm_exit_msr_load_addr = value; + break; + case VM_ENTRY_MSR_LOAD_ADDR: + current_evmcs->vm_entry_msr_load_addr = value; + break; + case CR3_TARGET_VALUE0: + current_evmcs->cr3_target_value0 = value; + break; + case CR3_TARGET_VALUE1: + current_evmcs->cr3_target_value1 = value; + break; + case CR3_TARGET_VALUE2: + current_evmcs->cr3_target_value2 = value; + break; + case CR3_TARGET_VALUE3: + current_evmcs->cr3_target_value3 = value; + break; + case TPR_THRESHOLD: + current_evmcs->tpr_threshold = value; + break; + case GUEST_INTERRUPTIBILITY_INFO: + current_evmcs->guest_interruptibility_info = value; + break; + case CPU_BASED_VM_EXEC_CONTROL: + current_evmcs->cpu_based_vm_exec_control = value; + break; + case EXCEPTION_BITMAP: + current_evmcs->exception_bitmap = value; + break; + case VM_ENTRY_CONTROLS: + current_evmcs->vm_entry_controls = value; + break; + case VM_ENTRY_INTR_INFO_FIELD: + current_evmcs->vm_entry_intr_info_field = value; + break; + case VM_ENTRY_EXCEPTION_ERROR_CODE: + current_evmcs->vm_entry_exception_error_code = value; + break; + case VM_ENTRY_INSTRUCTION_LEN: + current_evmcs->vm_entry_instruction_len = value; + break; + case HOST_IA32_SYSENTER_CS: + current_evmcs->host_ia32_sysenter_cs = value; + break; + case PIN_BASED_VM_EXEC_CONTROL: + current_evmcs->pin_based_vm_exec_control = value; + break; + case VM_EXIT_CONTROLS: + current_evmcs->vm_exit_controls = value; + break; + case SECONDARY_VM_EXEC_CONTROL: + current_evmcs->secondary_vm_exec_control = value; + break; + case GUEST_ES_LIMIT: + current_evmcs->guest_es_limit = value; + break; + case GUEST_CS_LIMIT: + current_evmcs->guest_cs_limit = value; + break; + case GUEST_SS_LIMIT: + current_evmcs->guest_ss_limit = value; + break; + case GUEST_DS_LIMIT: + current_evmcs->guest_ds_limit = value; + break; + case GUEST_FS_LIMIT: + current_evmcs->guest_fs_limit = value; + break; + case GUEST_GS_LIMIT: + current_evmcs->guest_gs_limit = value; + break; + case GUEST_LDTR_LIMIT: + current_evmcs->guest_ldtr_limit = value; + break; + case GUEST_TR_LIMIT: + current_evmcs->guest_tr_limit = value; + break; + case GUEST_GDTR_LIMIT: + current_evmcs->guest_gdtr_limit = value; + break; + case GUEST_IDTR_LIMIT: + current_evmcs->guest_idtr_limit = value; + break; + case GUEST_ES_AR_BYTES: + current_evmcs->guest_es_ar_bytes = value; + break; + case GUEST_CS_AR_BYTES: + current_evmcs->guest_cs_ar_bytes = value; + break; + case GUEST_SS_AR_BYTES: + current_evmcs->guest_ss_ar_bytes = value; + break; + case GUEST_DS_AR_BYTES: + current_evmcs->guest_ds_ar_bytes = value; + break; + case GUEST_FS_AR_BYTES: + current_evmcs->guest_fs_ar_bytes = value; + break; + case GUEST_GS_AR_BYTES: + current_evmcs->guest_gs_ar_bytes = value; + break; + case GUEST_LDTR_AR_BYTES: + current_evmcs->guest_ldtr_ar_bytes = value; + break; + case GUEST_TR_AR_BYTES: + current_evmcs->guest_tr_ar_bytes = value; + break; + case GUEST_ACTIVITY_STATE: + current_evmcs->guest_activity_state = value; + break; + case GUEST_SYSENTER_CS: + current_evmcs->guest_sysenter_cs = value; + break; + case VM_INSTRUCTION_ERROR: + current_evmcs->vm_instruction_error = value; + break; + case VM_EXIT_REASON: + current_evmcs->vm_exit_reason = value; + break; + case VM_EXIT_INTR_INFO: + current_evmcs->vm_exit_intr_info = value; + break; + case VM_EXIT_INTR_ERROR_CODE: + current_evmcs->vm_exit_intr_error_code = value; + break; + case IDT_VECTORING_INFO_FIELD: + current_evmcs->idt_vectoring_info_field = value; + break; + case IDT_VECTORING_ERROR_CODE: + current_evmcs->idt_vectoring_error_code = value; + break; + case VM_EXIT_INSTRUCTION_LEN: + current_evmcs->vm_exit_instruction_len = value; + break; + case VMX_INSTRUCTION_INFO: + current_evmcs->vmx_instruction_info = value; + break; + case PAGE_FAULT_ERROR_CODE_MASK: + current_evmcs->page_fault_error_code_mask = value; + break; + case PAGE_FAULT_ERROR_CODE_MATCH: + current_evmcs->page_fault_error_code_match = value; + break; + case CR3_TARGET_COUNT: + current_evmcs->cr3_target_count = value; + break; + case VM_EXIT_MSR_STORE_COUNT: + current_evmcs->vm_exit_msr_store_count = value; + break; + case VM_EXIT_MSR_LOAD_COUNT: + current_evmcs->vm_exit_msr_load_count = value; + break; + case VM_ENTRY_MSR_LOAD_COUNT: + current_evmcs->vm_entry_msr_load_count = value; + break; + case HOST_ES_SELECTOR: + current_evmcs->host_es_selector = value; + break; + case HOST_CS_SELECTOR: + current_evmcs->host_cs_selector = value; + break; + case HOST_SS_SELECTOR: + current_evmcs->host_ss_selector = value; + break; + case HOST_DS_SELECTOR: + current_evmcs->host_ds_selector = value; + break; + case HOST_FS_SELECTOR: + current_evmcs->host_fs_selector = value; + break; + case HOST_GS_SELECTOR: + current_evmcs->host_gs_selector = value; + break; + case HOST_TR_SELECTOR: + current_evmcs->host_tr_selector = value; + break; + case GUEST_ES_SELECTOR: + current_evmcs->guest_es_selector = value; + break; + case GUEST_CS_SELECTOR: + current_evmcs->guest_cs_selector = value; + break; + case GUEST_SS_SELECTOR: + current_evmcs->guest_ss_selector = value; + break; + case GUEST_DS_SELECTOR: + current_evmcs->guest_ds_selector = value; + break; + case GUEST_FS_SELECTOR: + current_evmcs->guest_fs_selector = value; + break; + case GUEST_GS_SELECTOR: + current_evmcs->guest_gs_selector = value; + break; + case GUEST_LDTR_SELECTOR: + current_evmcs->guest_ldtr_selector = value; + break; + case GUEST_TR_SELECTOR: + current_evmcs->guest_tr_selector = value; + break; + case VIRTUAL_PROCESSOR_ID: + current_evmcs->virtual_processor_id = value; + break; + default: return 1; + } + + return 0; +} + +static inline int evmcs_vmlaunch(void) +{ + int ret; + + current_evmcs->hv_clean_fields = 0; + + __asm__ __volatile__("push %%rbp;" + "push %%rcx;" + "push %%rdx;" + "push %%rsi;" + "push %%rdi;" + "push $0;" + "mov %%rsp, (%[host_rsp]);" + "lea 1f(%%rip), %%rax;" + "mov %%rax, (%[host_rip]);" + "vmlaunch;" + "incq (%%rsp);" + "1: pop %%rax;" + "pop %%rdi;" + "pop %%rsi;" + "pop %%rdx;" + "pop %%rcx;" + "pop %%rbp;" + : [ret]"=&a"(ret) + : [host_rsp]"r" + ((uint64_t)¤t_evmcs->host_rsp), + [host_rip]"r" + ((uint64_t)¤t_evmcs->host_rip) + : "memory", "cc", "rbx", "r8", "r9", "r10", + "r11", "r12", "r13", "r14", "r15"); + return ret; +} + +/* + * No guest state (e.g. GPRs) is established by this vmresume. + */ +static inline int evmcs_vmresume(void) +{ + int ret; + + current_evmcs->hv_clean_fields = 0; + + __asm__ __volatile__("push %%rbp;" + "push %%rcx;" + "push %%rdx;" + "push %%rsi;" + "push %%rdi;" + "push $0;" + "mov %%rsp, (%[host_rsp]);" + "lea 1f(%%rip), %%rax;" + "mov %%rax, (%[host_rip]);" + "vmresume;" + "incq (%%rsp);" + "1: pop %%rax;" + "pop %%rdi;" + "pop %%rsi;" + "pop %%rdx;" + "pop %%rcx;" + "pop %%rbp;" + : [ret]"=&a"(ret) + : [host_rsp]"r" + ((uint64_t)¤t_evmcs->host_rsp), + [host_rip]"r" + ((uint64_t)¤t_evmcs->host_rip) + : "memory", "cc", "rbx", "r8", "r9", "r10", + "r11", "r12", "r13", "r14", "r15"); + return ret; +} + +#endif /* !SELFTEST_KVM_EVMCS_H */ diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h new file mode 100644 index 000000000..7d29aa786 --- /dev/null +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -0,0 +1,348 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * tools/testing/selftests/kvm/include/kvm_util.h + * + * Copyright (C) 2018, Google LLC. + */ +#ifndef SELFTEST_KVM_UTIL_H +#define SELFTEST_KVM_UTIL_H + +#include "test_util.h" + +#include "asm/kvm.h" +#include "linux/list.h" +#include "linux/kvm.h" +#include + +#include "sparsebit.h" + + +/* + * Callers of kvm_util only have an incomplete/opaque description of the + * structure kvm_util is using to maintain the state of a VM. + */ +struct kvm_vm; + +typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */ +typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */ + +/* Minimum allocated guest virtual and physical addresses */ +#define KVM_UTIL_MIN_VADDR 0x2000 + +#define DEFAULT_GUEST_PHY_PAGES 512 +#define DEFAULT_GUEST_STACK_VADDR_MIN 0xab6000 +#define DEFAULT_STACK_PGS 5 + +enum vm_guest_mode { + VM_MODE_P52V48_4K, + VM_MODE_P52V48_64K, + VM_MODE_P48V48_4K, + VM_MODE_P48V48_64K, + VM_MODE_P40V48_4K, + VM_MODE_P40V48_64K, + VM_MODE_PXXV48_4K, /* For 48bits VA but ANY bits PA */ + NUM_VM_MODES, +}; + +#if defined(__aarch64__) +#define VM_MODE_DEFAULT VM_MODE_P40V48_4K +#elif defined(__x86_64__) +#define VM_MODE_DEFAULT VM_MODE_PXXV48_4K +#else +#define VM_MODE_DEFAULT VM_MODE_P52V48_4K +#endif + +#define vm_guest_mode_string(m) vm_guest_mode_string[m] +extern const char * const vm_guest_mode_string[]; + +enum vm_mem_backing_src_type { + VM_MEM_SRC_ANONYMOUS, + VM_MEM_SRC_ANONYMOUS_THP, + VM_MEM_SRC_ANONYMOUS_HUGETLB, +}; + +int kvm_check_cap(long cap); +int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); +int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id, + struct kvm_enable_cap *cap); +void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size); + +struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); +void kvm_vm_free(struct kvm_vm *vmp); +void kvm_vm_restart(struct kvm_vm *vmp, int perm); +void kvm_vm_release(struct kvm_vm *vmp); +void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log); +void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, + uint64_t first_page, uint32_t num_pages); + +int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva, + size_t len); + +void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename, + uint32_t data_memslot, uint32_t pgd_memslot); + +void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); + +/* + * VM VCPU Dump + * + * Input Args: + * stream - Output FILE stream + * vm - Virtual Machine + * vcpuid - VCPU ID + * indent - Left margin indent amount + * + * Output Args: None + * + * Return: None + * + * Dumps the current state of the VCPU specified by @vcpuid, within the VM + * given by @vm, to the FILE stream given by @stream. + */ +void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, + uint8_t indent); + +void vm_create_irqchip(struct kvm_vm *vm); + +void vm_userspace_mem_region_add(struct kvm_vm *vm, + enum vm_mem_backing_src_type src_type, + uint64_t guest_paddr, uint32_t slot, uint64_t npages, + uint32_t flags); + +void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, + void *arg); +int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, + void *arg); +void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); +void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); +void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); +void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); +void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid); +vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, + uint32_t data_memslot, uint32_t pgd_memslot); +void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + unsigned int npages, uint32_t pgd_memslot); +void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa); +void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva); +vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva); + +/* + * Address Guest Virtual to Guest Physical + * + * Input Args: + * vm - Virtual Machine + * gva - VM virtual address + * + * Output Args: None + * + * Return: + * Equivalent VM physical address + * + * Returns the VM physical address of the translated VM virtual + * address given by @gva. + */ +vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva); + +struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid); +void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid); +int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid); +void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid); +void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_guest_debug *debug); +void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_mp_state *mp_state); +struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid); +void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs); +void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs); + +/* + * VM VCPU Args Set + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * num - number of arguments + * ... - arguments, each of type uint64_t + * + * Output Args: None + * + * Return: None + * + * Sets the first @num function input registers of the VCPU with @vcpuid, + * per the C calling convention of the architecture, to the values given + * as variable args. Each of the variable args is expected to be of type + * uint64_t. The maximum @num can be is specific to the architecture. + */ +void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...); + +void vcpu_sregs_get(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_sregs *sregs); +void vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_sregs *sregs); +int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_sregs *sregs); +void vcpu_fpu_get(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_fpu *fpu); +void vcpu_fpu_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_fpu *fpu); +void vcpu_get_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg); +void vcpu_set_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg); +#ifdef __KVM_HAVE_VCPU_EVENTS +void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_vcpu_events *events); +void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_vcpu_events *events); +#endif +#ifdef __x86_64__ +void vcpu_nested_state_get(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_nested_state *state); +int vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_nested_state *state, bool ignore_error); +#endif + +const char *exit_reason_str(unsigned int exit_reason); + +void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot); + +/* + * VM Virtual Page Map + * + * Input Args: + * vm - Virtual Machine + * vaddr - VM Virtual Address + * paddr - VM Physical Address + * memslot - Memory region slot for new virtual translation tables + * + * Output Args: None + * + * Return: None + * + * Within @vm, creates a virtual translation for the page starting + * at @vaddr to the page starting at @paddr. + */ +void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + uint32_t memslot); + +vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min, + uint32_t memslot); +vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, + vm_paddr_t paddr_min, uint32_t memslot); + +/* + * Create a VM with reasonable defaults + * + * Input Args: + * vcpuid - The id of the single VCPU to add to the VM. + * extra_mem_pages - The number of extra pages to add (this will + * decide how much extra space we will need to + * setup the page tables using memslot 0) + * guest_code - The vCPU's entry point + * + * Output Args: None + * + * Return: + * Pointer to opaque structure that describes the created VM. + */ +struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, + void *guest_code); + +/* + * Adds a vCPU with reasonable defaults (e.g. a stack) + * + * Input Args: + * vm - Virtual Machine + * vcpuid - The id of the VCPU to add to the VM. + * guest_code - The vCPU's entry point + */ +void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code); + +bool vm_is_unrestricted_guest(struct kvm_vm *vm); + +unsigned int vm_get_page_size(struct kvm_vm *vm); +unsigned int vm_get_page_shift(struct kvm_vm *vm); +unsigned int vm_get_max_gfn(struct kvm_vm *vm); +int vm_get_fd(struct kvm_vm *vm); + +unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size); +unsigned int vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages); +unsigned int vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages); +static inline unsigned int +vm_adjust_num_guest_pages(enum vm_guest_mode mode, unsigned int num_guest_pages) +{ + unsigned int n; + n = vm_num_guest_pages(mode, vm_num_host_pages(mode, num_guest_pages)); +#ifdef __s390x__ + /* s390 requires 1M aligned guest sizes */ + n = (n + 255) & ~255; +#endif + return n; +} + +struct kvm_userspace_memory_region * +kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, + uint64_t end); + +struct kvm_dirty_log * +allocate_kvm_dirty_log(struct kvm_userspace_memory_region *region); + +int vm_create_device(struct kvm_vm *vm, struct kvm_create_device *cd); + +#define sync_global_to_guest(vm, g) ({ \ + typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ + memcpy(_p, &(g), sizeof(g)); \ +}) + +#define sync_global_from_guest(vm, g) ({ \ + typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ + memcpy(&(g), _p, sizeof(g)); \ +}) + +void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid); + +/* Common ucalls */ +enum { + UCALL_NONE, + UCALL_SYNC, + UCALL_ABORT, + UCALL_DONE, +}; + +#define UCALL_MAX_ARGS 6 + +struct ucall { + uint64_t cmd; + uint64_t args[UCALL_MAX_ARGS]; +}; + +void ucall_init(struct kvm_vm *vm, void *arg); +void ucall_uninit(struct kvm_vm *vm); +void ucall(uint64_t cmd, int nargs, ...); +uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc); + +#define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4) \ + ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4) +#define GUEST_SYNC(stage) ucall(UCALL_SYNC, 2, "hello", stage) +#define GUEST_DONE() ucall(UCALL_DONE, 0) +#define __GUEST_ASSERT(_condition, _nargs, _args...) do { \ + if (!(_condition)) \ + ucall(UCALL_ABORT, 2 + _nargs, \ + "Failed guest assert: " \ + #_condition, __LINE__, _args); \ +} while (0) + +#define GUEST_ASSERT(_condition) \ + __GUEST_ASSERT((_condition), 0, 0) + +#define GUEST_ASSERT_1(_condition, arg1) \ + __GUEST_ASSERT((_condition), 1, (arg1)) + +#define GUEST_ASSERT_2(_condition, arg1, arg2) \ + __GUEST_ASSERT((_condition), 2, (arg1), (arg2)) + +#define GUEST_ASSERT_3(_condition, arg1, arg2, arg3) \ + __GUEST_ASSERT((_condition), 3, (arg1), (arg2), (arg3)) + +#define GUEST_ASSERT_4(_condition, arg1, arg2, arg3, arg4) \ + __GUEST_ASSERT((_condition), 4, (arg1), (arg2), (arg3), (arg4)) + +#endif /* SELFTEST_KVM_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h new file mode 100644 index 000000000..261805205 --- /dev/null +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -0,0 +1,198 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * tools/testing/selftests/kvm/include/perf_test_util.h + * + * Copyright (C) 2020, Google LLC. + */ + +#ifndef SELFTEST_KVM_PERF_TEST_UTIL_H +#define SELFTEST_KVM_PERF_TEST_UTIL_H + +#include "kvm_util.h" +#include "processor.h" + +#define MAX_VCPUS 512 + +#define PAGE_SHIFT_4K 12 +#define PTES_PER_4K_PT 512 + +#define TEST_MEM_SLOT_INDEX 1 + +/* Default guest test virtual memory offset */ +#define DEFAULT_GUEST_TEST_MEM 0xc0000000 + +#define DEFAULT_PER_VCPU_MEM_SIZE (1 << 30) /* 1G */ + +/* + * Guest physical memory offset of the testing memory slot. + * This will be set to the topmost valid physical address minus + * the test memory size. + */ +static uint64_t guest_test_phys_mem; + +/* + * Guest virtual memory offset of the testing memory slot. + * Must not conflict with identity mapped test code. + */ +static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; +static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; + +/* Number of VCPUs for the test */ +static int nr_vcpus = 1; + +struct vcpu_args { + uint64_t gva; + uint64_t pages; + + /* Only used by the host userspace part of the vCPU thread */ + int vcpu_id; +}; + +struct perf_test_args { + struct kvm_vm *vm; + uint64_t host_page_size; + uint64_t guest_page_size; + int wr_fract; + + struct vcpu_args vcpu_args[MAX_VCPUS]; +}; + +static struct perf_test_args perf_test_args; + +/* + * Continuously write to the first 8 bytes of each page in the + * specified region. + */ +static void guest_code(uint32_t vcpu_id) +{ + struct vcpu_args *vcpu_args = &perf_test_args.vcpu_args[vcpu_id]; + uint64_t gva; + uint64_t pages; + int i; + + /* Make sure vCPU args data structure is not corrupt. */ + GUEST_ASSERT(vcpu_args->vcpu_id == vcpu_id); + + gva = vcpu_args->gva; + pages = vcpu_args->pages; + + while (true) { + for (i = 0; i < pages; i++) { + uint64_t addr = gva + (i * perf_test_args.guest_page_size); + + if (i % perf_test_args.wr_fract == 0) + *(uint64_t *)addr = 0x0123456789ABCDEF; + else + READ_ONCE(*(uint64_t *)addr); + } + + GUEST_SYNC(1); + } +} + +static struct kvm_vm *create_vm(enum vm_guest_mode mode, int vcpus, + uint64_t vcpu_memory_bytes) +{ + struct kvm_vm *vm; + uint64_t pages = DEFAULT_GUEST_PHY_PAGES; + uint64_t guest_num_pages; + + /* Account for a few pages per-vCPU for stacks */ + pages += DEFAULT_STACK_PGS * vcpus; + + /* + * Reserve twice the ammount of memory needed to map the test region and + * the page table / stacks region, at 4k, for page tables. Do the + * calculation with 4K page size: the smallest of all archs. (e.g., 64K + * page size guest will need even less memory for page tables). + */ + pages += (2 * pages) / PTES_PER_4K_PT; + pages += ((2 * vcpus * vcpu_memory_bytes) >> PAGE_SHIFT_4K) / + PTES_PER_4K_PT; + pages = vm_adjust_num_guest_pages(mode, pages); + + pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); + + vm = vm_create(mode, pages, O_RDWR); + kvm_vm_elf_load(vm, program_invocation_name, 0, 0); +#ifdef __x86_64__ + vm_create_irqchip(vm); +#endif + + perf_test_args.vm = vm; + perf_test_args.guest_page_size = vm_get_page_size(vm); + perf_test_args.host_page_size = getpagesize(); + + TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0, + "Guest memory size is not guest page size aligned."); + + guest_num_pages = (vcpus * vcpu_memory_bytes) / + perf_test_args.guest_page_size; + guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); + + /* + * If there should be more memory in the guest test region than there + * can be pages in the guest, it will definitely cause problems. + */ + TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm), + "Requested more guest memory than address space allows.\n" + " guest pages: %lx max gfn: %x vcpus: %d wss: %lx]\n", + guest_num_pages, vm_get_max_gfn(vm), vcpus, + vcpu_memory_bytes); + + TEST_ASSERT(vcpu_memory_bytes % perf_test_args.host_page_size == 0, + "Guest memory size is not host page size aligned."); + + guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) * + perf_test_args.guest_page_size; + guest_test_phys_mem &= ~(perf_test_args.host_page_size - 1); + +#ifdef __s390x__ + /* Align to 1M (segment size) */ + guest_test_phys_mem &= ~((1 << 20) - 1); +#endif + + pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); + + /* Add an extra memory slot for testing */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + guest_test_phys_mem, + TEST_MEM_SLOT_INDEX, + guest_num_pages, 0); + + /* Do mapping for the demand paging memory slot */ + virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0); + + ucall_init(vm, NULL); + + return vm; +} + +static void add_vcpus(struct kvm_vm *vm, int vcpus, uint64_t vcpu_memory_bytes) +{ + vm_paddr_t vcpu_gpa; + struct vcpu_args *vcpu_args; + int vcpu_id; + + for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { + vcpu_args = &perf_test_args.vcpu_args[vcpu_id]; + + vm_vcpu_add_default(vm, vcpu_id, guest_code); + +#ifdef __x86_64__ + vcpu_set_cpuid(vm, vcpu_id, kvm_get_supported_cpuid()); +#endif + + vcpu_args->vcpu_id = vcpu_id; + vcpu_args->gva = guest_test_virt_mem + + (vcpu_id * vcpu_memory_bytes); + vcpu_args->pages = vcpu_memory_bytes / + perf_test_args.guest_page_size; + + vcpu_gpa = guest_test_phys_mem + (vcpu_id * vcpu_memory_bytes); + pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n", + vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_memory_bytes); + } +} + +#endif /* SELFTEST_KVM_PERF_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/s390x/processor.h b/tools/testing/selftests/kvm/include/s390x/processor.h new file mode 100644 index 000000000..e0e96a5f6 --- /dev/null +++ b/tools/testing/selftests/kvm/include/s390x/processor.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * s390x processor specific defines + */ +#ifndef SELFTEST_KVM_PROCESSOR_H +#define SELFTEST_KVM_PROCESSOR_H + +/* Bits in the region/segment table entry */ +#define REGION_ENTRY_ORIGIN ~0xfffUL /* region/segment table origin */ +#define REGION_ENTRY_PROTECT 0x200 /* region protection bit */ +#define REGION_ENTRY_NOEXEC 0x100 /* region no-execute bit */ +#define REGION_ENTRY_OFFSET 0xc0 /* region table offset */ +#define REGION_ENTRY_INVALID 0x20 /* invalid region table entry */ +#define REGION_ENTRY_TYPE 0x0c /* region/segment table type mask */ +#define REGION_ENTRY_LENGTH 0x03 /* region third length */ + +/* Bits in the page table entry */ +#define PAGE_INVALID 0x400 /* HW invalid bit */ +#define PAGE_PROTECT 0x200 /* HW read-only bit */ +#define PAGE_NOEXEC 0x100 /* HW no-execute bit */ + +#endif diff --git a/tools/testing/selftests/kvm/include/sparsebit.h b/tools/testing/selftests/kvm/include/sparsebit.h new file mode 100644 index 000000000..12a9a4b9c --- /dev/null +++ b/tools/testing/selftests/kvm/include/sparsebit.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * tools/testing/selftests/kvm/include/sparsebit.h + * + * Copyright (C) 2018, Google LLC. + * + * Header file that describes API to the sparsebit library. + * This library provides a memory efficient means of storing + * the settings of bits indexed via a uint64_t. Memory usage + * is reasonable, significantly less than (2^64 / 8) bytes, as + * long as bits that are mostly set or mostly cleared are close + * to each other. This library is efficient in memory usage + * even in the case where most bits are set. + */ + +#ifndef SELFTEST_KVM_SPARSEBIT_H +#define SELFTEST_KVM_SPARSEBIT_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct sparsebit; +typedef uint64_t sparsebit_idx_t; +typedef uint64_t sparsebit_num_t; + +struct sparsebit *sparsebit_alloc(void); +void sparsebit_free(struct sparsebit **sbitp); +void sparsebit_copy(struct sparsebit *dstp, struct sparsebit *src); + +bool sparsebit_is_set(struct sparsebit *sbit, sparsebit_idx_t idx); +bool sparsebit_is_set_num(struct sparsebit *sbit, + sparsebit_idx_t idx, sparsebit_num_t num); +bool sparsebit_is_clear(struct sparsebit *sbit, sparsebit_idx_t idx); +bool sparsebit_is_clear_num(struct sparsebit *sbit, + sparsebit_idx_t idx, sparsebit_num_t num); +sparsebit_num_t sparsebit_num_set(struct sparsebit *sbit); +bool sparsebit_any_set(struct sparsebit *sbit); +bool sparsebit_any_clear(struct sparsebit *sbit); +bool sparsebit_all_set(struct sparsebit *sbit); +bool sparsebit_all_clear(struct sparsebit *sbit); +sparsebit_idx_t sparsebit_first_set(struct sparsebit *sbit); +sparsebit_idx_t sparsebit_first_clear(struct sparsebit *sbit); +sparsebit_idx_t sparsebit_next_set(struct sparsebit *sbit, sparsebit_idx_t prev); +sparsebit_idx_t sparsebit_next_clear(struct sparsebit *sbit, sparsebit_idx_t prev); +sparsebit_idx_t sparsebit_next_set_num(struct sparsebit *sbit, + sparsebit_idx_t start, sparsebit_num_t num); +sparsebit_idx_t sparsebit_next_clear_num(struct sparsebit *sbit, + sparsebit_idx_t start, sparsebit_num_t num); + +void sparsebit_set(struct sparsebit *sbitp, sparsebit_idx_t idx); +void sparsebit_set_num(struct sparsebit *sbitp, sparsebit_idx_t start, + sparsebit_num_t num); +void sparsebit_set_all(struct sparsebit *sbitp); + +void sparsebit_clear(struct sparsebit *sbitp, sparsebit_idx_t idx); +void sparsebit_clear_num(struct sparsebit *sbitp, + sparsebit_idx_t start, sparsebit_num_t num); +void sparsebit_clear_all(struct sparsebit *sbitp); + +void sparsebit_dump(FILE *stream, struct sparsebit *sbit, + unsigned int indent); +void sparsebit_validate_internal(struct sparsebit *sbit); + +#ifdef __cplusplus +} +#endif + +#endif /* SELFTEST_KVM_SPARSEBIT_H */ diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h new file mode 100644 index 000000000..ffffa5604 --- /dev/null +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * tools/testing/selftests/kvm/include/test_util.h + * + * Copyright (C) 2018, Google LLC. + */ + +#ifndef SELFTEST_KVM_TEST_UTIL_H +#define SELFTEST_KVM_TEST_UTIL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "kselftest.h" + +static inline int _no_printf(const char *format, ...) { return 0; } + +#ifdef DEBUG +#define pr_debug(...) printf(__VA_ARGS__) +#else +#define pr_debug(...) _no_printf(__VA_ARGS__) +#endif +#ifndef QUIET +#define pr_info(...) printf(__VA_ARGS__) +#else +#define pr_info(...) _no_printf(__VA_ARGS__) +#endif + +void print_skip(const char *fmt, ...) __attribute__((format(printf, 1, 2))); + +ssize_t test_write(int fd, const void *buf, size_t count); +ssize_t test_read(int fd, void *buf, size_t count); +int test_seq_read(const char *path, char **bufp, size_t *sizep); + +void test_assert(bool exp, const char *exp_str, + const char *file, unsigned int line, const char *fmt, ...) + __attribute__((format(printf, 5, 6))); + +#define TEST_ASSERT(e, fmt, ...) \ + test_assert((e), #e, __FILE__, __LINE__, fmt, ##__VA_ARGS__) + +#define ASSERT_EQ(a, b) do { \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + TEST_ASSERT(__a == __b, \ + "ASSERT_EQ(%s, %s) failed.\n" \ + "\t%s is %#lx\n" \ + "\t%s is %#lx", \ + #a, #b, #a, (unsigned long) __a, #b, (unsigned long) __b); \ +} while (0) + +#define TEST_FAIL(fmt, ...) \ + TEST_ASSERT(false, fmt, ##__VA_ARGS__) + +size_t parse_size(const char *size); + +int64_t timespec_to_ns(struct timespec ts); +struct timespec timespec_add_ns(struct timespec ts, int64_t ns); +struct timespec timespec_add(struct timespec ts1, struct timespec ts2); +struct timespec timespec_sub(struct timespec ts1, struct timespec ts2); +struct timespec timespec_diff_now(struct timespec start); +struct timespec timespec_div(struct timespec ts, int divisor); + +#endif /* SELFTEST_KVM_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h new file mode 100644 index 000000000..8e61340b3 --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -0,0 +1,422 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * tools/testing/selftests/kvm/include/x86_64/processor.h + * + * Copyright (C) 2018, Google LLC. + */ + +#ifndef SELFTEST_KVM_PROCESSOR_H +#define SELFTEST_KVM_PROCESSOR_H + +#include +#include + +#include + +#define X86_EFLAGS_FIXED (1u << 1) + +#define X86_CR4_VME (1ul << 0) +#define X86_CR4_PVI (1ul << 1) +#define X86_CR4_TSD (1ul << 2) +#define X86_CR4_DE (1ul << 3) +#define X86_CR4_PSE (1ul << 4) +#define X86_CR4_PAE (1ul << 5) +#define X86_CR4_MCE (1ul << 6) +#define X86_CR4_PGE (1ul << 7) +#define X86_CR4_PCE (1ul << 8) +#define X86_CR4_OSFXSR (1ul << 9) +#define X86_CR4_OSXMMEXCPT (1ul << 10) +#define X86_CR4_UMIP (1ul << 11) +#define X86_CR4_VMXE (1ul << 13) +#define X86_CR4_SMXE (1ul << 14) +#define X86_CR4_FSGSBASE (1ul << 16) +#define X86_CR4_PCIDE (1ul << 17) +#define X86_CR4_OSXSAVE (1ul << 18) +#define X86_CR4_SMEP (1ul << 20) +#define X86_CR4_SMAP (1ul << 21) +#define X86_CR4_PKE (1ul << 22) + +#define UNEXPECTED_VECTOR_PORT 0xfff0u + +/* General Registers in 64-Bit Mode */ +struct gpr64_regs { + u64 rax; + u64 rcx; + u64 rdx; + u64 rbx; + u64 rsp; + u64 rbp; + u64 rsi; + u64 rdi; + u64 r8; + u64 r9; + u64 r10; + u64 r11; + u64 r12; + u64 r13; + u64 r14; + u64 r15; +}; + +struct desc64 { + uint16_t limit0; + uint16_t base0; + unsigned base1:8, type:4, s:1, dpl:2, p:1; + unsigned limit1:4, avl:1, l:1, db:1, g:1, base2:8; + uint32_t base3; + uint32_t zero1; +} __attribute__((packed)); + +struct desc_ptr { + uint16_t size; + uint64_t address; +} __attribute__((packed)); + +static inline uint64_t get_desc64_base(const struct desc64 *desc) +{ + return ((uint64_t)desc->base3 << 32) | + (desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24)); +} + +static inline uint64_t rdtsc(void) +{ + uint32_t eax, edx; + uint64_t tsc_val; + /* + * The lfence is to wait (on Intel CPUs) until all previous + * instructions have been executed. If software requires RDTSC to be + * executed prior to execution of any subsequent instruction, it can + * execute LFENCE immediately after RDTSC + */ + __asm__ __volatile__("lfence; rdtsc; lfence" : "=a"(eax), "=d"(edx)); + tsc_val = ((uint64_t)edx) << 32 | eax; + return tsc_val; +} + +static inline uint64_t rdtscp(uint32_t *aux) +{ + uint32_t eax, edx; + + __asm__ __volatile__("rdtscp" : "=a"(eax), "=d"(edx), "=c"(*aux)); + return ((uint64_t)edx) << 32 | eax; +} + +static inline uint64_t rdmsr(uint32_t msr) +{ + uint32_t a, d; + + __asm__ __volatile__("rdmsr" : "=a"(a), "=d"(d) : "c"(msr) : "memory"); + + return a | ((uint64_t) d << 32); +} + +static inline void wrmsr(uint32_t msr, uint64_t value) +{ + uint32_t a = value; + uint32_t d = value >> 32; + + __asm__ __volatile__("wrmsr" :: "a"(a), "d"(d), "c"(msr) : "memory"); +} + + +static inline uint16_t inw(uint16_t port) +{ + uint16_t tmp; + + __asm__ __volatile__("in %%dx, %%ax" + : /* output */ "=a" (tmp) + : /* input */ "d" (port)); + + return tmp; +} + +static inline uint16_t get_es(void) +{ + uint16_t es; + + __asm__ __volatile__("mov %%es, %[es]" + : /* output */ [es]"=rm"(es)); + return es; +} + +static inline uint16_t get_cs(void) +{ + uint16_t cs; + + __asm__ __volatile__("mov %%cs, %[cs]" + : /* output */ [cs]"=rm"(cs)); + return cs; +} + +static inline uint16_t get_ss(void) +{ + uint16_t ss; + + __asm__ __volatile__("mov %%ss, %[ss]" + : /* output */ [ss]"=rm"(ss)); + return ss; +} + +static inline uint16_t get_ds(void) +{ + uint16_t ds; + + __asm__ __volatile__("mov %%ds, %[ds]" + : /* output */ [ds]"=rm"(ds)); + return ds; +} + +static inline uint16_t get_fs(void) +{ + uint16_t fs; + + __asm__ __volatile__("mov %%fs, %[fs]" + : /* output */ [fs]"=rm"(fs)); + return fs; +} + +static inline uint16_t get_gs(void) +{ + uint16_t gs; + + __asm__ __volatile__("mov %%gs, %[gs]" + : /* output */ [gs]"=rm"(gs)); + return gs; +} + +static inline uint16_t get_tr(void) +{ + uint16_t tr; + + __asm__ __volatile__("str %[tr]" + : /* output */ [tr]"=rm"(tr)); + return tr; +} + +static inline uint64_t get_cr0(void) +{ + uint64_t cr0; + + __asm__ __volatile__("mov %%cr0, %[cr0]" + : /* output */ [cr0]"=r"(cr0)); + return cr0; +} + +static inline uint64_t get_cr3(void) +{ + uint64_t cr3; + + __asm__ __volatile__("mov %%cr3, %[cr3]" + : /* output */ [cr3]"=r"(cr3)); + return cr3; +} + +static inline uint64_t get_cr4(void) +{ + uint64_t cr4; + + __asm__ __volatile__("mov %%cr4, %[cr4]" + : /* output */ [cr4]"=r"(cr4)); + return cr4; +} + +static inline void set_cr4(uint64_t val) +{ + __asm__ __volatile__("mov %0, %%cr4" : : "r" (val) : "memory"); +} + +static inline struct desc_ptr get_gdt(void) +{ + struct desc_ptr gdt; + __asm__ __volatile__("sgdt %[gdt]" + : /* output */ [gdt]"=m"(gdt)); + return gdt; +} + +static inline struct desc_ptr get_idt(void) +{ + struct desc_ptr idt; + __asm__ __volatile__("sidt %[idt]" + : /* output */ [idt]"=m"(idt)); + return idt; +} + +static inline void outl(uint16_t port, uint32_t value) +{ + __asm__ __volatile__("outl %%eax, %%dx" : : "d"(port), "a"(value)); +} + +#define SET_XMM(__var, __xmm) \ + asm volatile("movq %0, %%"#__xmm : : "r"(__var) : #__xmm) + +static inline void set_xmm(int n, unsigned long val) +{ + switch (n) { + case 0: + SET_XMM(val, xmm0); + break; + case 1: + SET_XMM(val, xmm1); + break; + case 2: + SET_XMM(val, xmm2); + break; + case 3: + SET_XMM(val, xmm3); + break; + case 4: + SET_XMM(val, xmm4); + break; + case 5: + SET_XMM(val, xmm5); + break; + case 6: + SET_XMM(val, xmm6); + break; + case 7: + SET_XMM(val, xmm7); + break; + } +} + +typedef unsigned long v1di __attribute__ ((vector_size (8))); +static inline unsigned long get_xmm(int n) +{ + assert(n >= 0 && n <= 7); + + register v1di xmm0 __asm__("%xmm0"); + register v1di xmm1 __asm__("%xmm1"); + register v1di xmm2 __asm__("%xmm2"); + register v1di xmm3 __asm__("%xmm3"); + register v1di xmm4 __asm__("%xmm4"); + register v1di xmm5 __asm__("%xmm5"); + register v1di xmm6 __asm__("%xmm6"); + register v1di xmm7 __asm__("%xmm7"); + switch (n) { + case 0: + return (unsigned long)xmm0; + case 1: + return (unsigned long)xmm1; + case 2: + return (unsigned long)xmm2; + case 3: + return (unsigned long)xmm3; + case 4: + return (unsigned long)xmm4; + case 5: + return (unsigned long)xmm5; + case 6: + return (unsigned long)xmm6; + case 7: + return (unsigned long)xmm7; + } + return 0; +} + +bool is_intel_cpu(void); + +struct kvm_x86_state; +struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid); +void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_x86_state *state); + +struct kvm_msr_list *kvm_get_msr_index_list(void); + +struct kvm_cpuid2 *kvm_get_supported_cpuid(void); +void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_cpuid2 *cpuid); + +struct kvm_cpuid_entry2 * +kvm_get_supported_cpuid_index(uint32_t function, uint32_t index); + +static inline struct kvm_cpuid_entry2 * +kvm_get_supported_cpuid_entry(uint32_t function) +{ + return kvm_get_supported_cpuid_index(function, 0); +} + +uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index); +int _vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, + uint64_t msr_value); +void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, + uint64_t msr_value); + +uint32_t kvm_get_cpuid_max_basic(void); +uint32_t kvm_get_cpuid_max_extended(void); +void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits); + +struct ex_regs { + uint64_t rax, rcx, rdx, rbx; + uint64_t rbp, rsi, rdi; + uint64_t r8, r9, r10, r11; + uint64_t r12, r13, r14, r15; + uint64_t vector; + uint64_t error_code; + uint64_t rip; + uint64_t cs; + uint64_t rflags; +}; + +void vm_init_descriptor_tables(struct kvm_vm *vm); +void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid); +void vm_handle_exception(struct kvm_vm *vm, int vector, + void (*handler)(struct ex_regs *)); + +/* + * set_cpuid() - overwrites a matching cpuid entry with the provided value. + * matches based on ent->function && ent->index. returns true + * if a match was found and successfully overwritten. + * @cpuid: the kvm cpuid list to modify. + * @ent: cpuid entry to insert + */ +bool set_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *ent); + +uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, + uint64_t a3); + +/* + * Basic CPU control in CR0 + */ +#define X86_CR0_PE (1UL<<0) /* Protection Enable */ +#define X86_CR0_MP (1UL<<1) /* Monitor Coprocessor */ +#define X86_CR0_EM (1UL<<2) /* Emulation */ +#define X86_CR0_TS (1UL<<3) /* Task Switched */ +#define X86_CR0_ET (1UL<<4) /* Extension Type */ +#define X86_CR0_NE (1UL<<5) /* Numeric Error */ +#define X86_CR0_WP (1UL<<16) /* Write Protect */ +#define X86_CR0_AM (1UL<<18) /* Alignment Mask */ +#define X86_CR0_NW (1UL<<29) /* Not Write-through */ +#define X86_CR0_CD (1UL<<30) /* Cache Disable */ +#define X86_CR0_PG (1UL<<31) /* Paging */ + +#define APIC_BASE_MSR 0x800 +#define X2APIC_ENABLE (1UL << 10) +#define APIC_ICR 0x300 +#define APIC_DEST_SELF 0x40000 +#define APIC_DEST_ALLINC 0x80000 +#define APIC_DEST_ALLBUT 0xC0000 +#define APIC_ICR_RR_MASK 0x30000 +#define APIC_ICR_RR_INVALID 0x00000 +#define APIC_ICR_RR_INPROG 0x10000 +#define APIC_ICR_RR_VALID 0x20000 +#define APIC_INT_LEVELTRIG 0x08000 +#define APIC_INT_ASSERT 0x04000 +#define APIC_ICR_BUSY 0x01000 +#define APIC_DEST_LOGICAL 0x00800 +#define APIC_DEST_PHYSICAL 0x00000 +#define APIC_DM_FIXED 0x00000 +#define APIC_DM_FIXED_MASK 0x00700 +#define APIC_DM_LOWEST 0x00100 +#define APIC_DM_SMI 0x00200 +#define APIC_DM_REMRD 0x00300 +#define APIC_DM_NMI 0x00400 +#define APIC_DM_INIT 0x00500 +#define APIC_DM_STARTUP 0x00600 +#define APIC_DM_EXTINT 0x00700 +#define APIC_VECTOR_MASK 0x000FF +#define APIC_ICR2 0x310 + +/* VMX_EPT_VPID_CAP bits */ +#define VMX_EPT_VPID_CAP_AD_BITS (1ULL << 21) + +#endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/svm.h b/tools/testing/selftests/kvm/include/x86_64/svm.h new file mode 100644 index 000000000..f4ea2355d --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86_64/svm.h @@ -0,0 +1,297 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * tools/testing/selftests/kvm/include/x86_64/svm.h + * This is a copy of arch/x86/include/asm/svm.h + * + */ + +#ifndef SELFTEST_KVM_SVM_H +#define SELFTEST_KVM_SVM_H + +enum { + INTERCEPT_INTR, + INTERCEPT_NMI, + INTERCEPT_SMI, + INTERCEPT_INIT, + INTERCEPT_VINTR, + INTERCEPT_SELECTIVE_CR0, + INTERCEPT_STORE_IDTR, + INTERCEPT_STORE_GDTR, + INTERCEPT_STORE_LDTR, + INTERCEPT_STORE_TR, + INTERCEPT_LOAD_IDTR, + INTERCEPT_LOAD_GDTR, + INTERCEPT_LOAD_LDTR, + INTERCEPT_LOAD_TR, + INTERCEPT_RDTSC, + INTERCEPT_RDPMC, + INTERCEPT_PUSHF, + INTERCEPT_POPF, + INTERCEPT_CPUID, + INTERCEPT_RSM, + INTERCEPT_IRET, + INTERCEPT_INTn, + INTERCEPT_INVD, + INTERCEPT_PAUSE, + INTERCEPT_HLT, + INTERCEPT_INVLPG, + INTERCEPT_INVLPGA, + INTERCEPT_IOIO_PROT, + INTERCEPT_MSR_PROT, + INTERCEPT_TASK_SWITCH, + INTERCEPT_FERR_FREEZE, + INTERCEPT_SHUTDOWN, + INTERCEPT_VMRUN, + INTERCEPT_VMMCALL, + INTERCEPT_VMLOAD, + INTERCEPT_VMSAVE, + INTERCEPT_STGI, + INTERCEPT_CLGI, + INTERCEPT_SKINIT, + INTERCEPT_RDTSCP, + INTERCEPT_ICEBP, + INTERCEPT_WBINVD, + INTERCEPT_MONITOR, + INTERCEPT_MWAIT, + INTERCEPT_MWAIT_COND, + INTERCEPT_XSETBV, + INTERCEPT_RDPRU, +}; + + +struct __attribute__ ((__packed__)) vmcb_control_area { + u32 intercept_cr; + u32 intercept_dr; + u32 intercept_exceptions; + u64 intercept; + u8 reserved_1[40]; + u16 pause_filter_thresh; + u16 pause_filter_count; + u64 iopm_base_pa; + u64 msrpm_base_pa; + u64 tsc_offset; + u32 asid; + u8 tlb_ctl; + u8 reserved_2[3]; + u32 int_ctl; + u32 int_vector; + u32 int_state; + u8 reserved_3[4]; + u32 exit_code; + u32 exit_code_hi; + u64 exit_info_1; + u64 exit_info_2; + u32 exit_int_info; + u32 exit_int_info_err; + u64 nested_ctl; + u64 avic_vapic_bar; + u8 reserved_4[8]; + u32 event_inj; + u32 event_inj_err; + u64 nested_cr3; + u64 virt_ext; + u32 clean; + u32 reserved_5; + u64 next_rip; + u8 insn_len; + u8 insn_bytes[15]; + u64 avic_backing_page; /* Offset 0xe0 */ + u8 reserved_6[8]; /* Offset 0xe8 */ + u64 avic_logical_id; /* Offset 0xf0 */ + u64 avic_physical_id; /* Offset 0xf8 */ + u8 reserved_7[768]; +}; + + +#define TLB_CONTROL_DO_NOTHING 0 +#define TLB_CONTROL_FLUSH_ALL_ASID 1 +#define TLB_CONTROL_FLUSH_ASID 3 +#define TLB_CONTROL_FLUSH_ASID_LOCAL 7 + +#define V_TPR_MASK 0x0f + +#define V_IRQ_SHIFT 8 +#define V_IRQ_MASK (1 << V_IRQ_SHIFT) + +#define V_GIF_SHIFT 9 +#define V_GIF_MASK (1 << V_GIF_SHIFT) + +#define V_INTR_PRIO_SHIFT 16 +#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT) + +#define V_IGN_TPR_SHIFT 20 +#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT) + +#define V_INTR_MASKING_SHIFT 24 +#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) + +#define V_GIF_ENABLE_SHIFT 25 +#define V_GIF_ENABLE_MASK (1 << V_GIF_ENABLE_SHIFT) + +#define AVIC_ENABLE_SHIFT 31 +#define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) + +#define LBR_CTL_ENABLE_MASK BIT_ULL(0) +#define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1) + +#define SVM_INTERRUPT_SHADOW_MASK 1 + +#define SVM_IOIO_STR_SHIFT 2 +#define SVM_IOIO_REP_SHIFT 3 +#define SVM_IOIO_SIZE_SHIFT 4 +#define SVM_IOIO_ASIZE_SHIFT 7 + +#define SVM_IOIO_TYPE_MASK 1 +#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT) +#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT) +#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) +#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) + +#define SVM_VM_CR_VALID_MASK 0x001fULL +#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL +#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL + +#define SVM_NESTED_CTL_NP_ENABLE BIT(0) +#define SVM_NESTED_CTL_SEV_ENABLE BIT(1) + +struct __attribute__ ((__packed__)) vmcb_seg { + u16 selector; + u16 attrib; + u32 limit; + u64 base; +}; + +struct __attribute__ ((__packed__)) vmcb_save_area { + struct vmcb_seg es; + struct vmcb_seg cs; + struct vmcb_seg ss; + struct vmcb_seg ds; + struct vmcb_seg fs; + struct vmcb_seg gs; + struct vmcb_seg gdtr; + struct vmcb_seg ldtr; + struct vmcb_seg idtr; + struct vmcb_seg tr; + u8 reserved_1[43]; + u8 cpl; + u8 reserved_2[4]; + u64 efer; + u8 reserved_3[112]; + u64 cr4; + u64 cr3; + u64 cr0; + u64 dr7; + u64 dr6; + u64 rflags; + u64 rip; + u8 reserved_4[88]; + u64 rsp; + u8 reserved_5[24]; + u64 rax; + u64 star; + u64 lstar; + u64 cstar; + u64 sfmask; + u64 kernel_gs_base; + u64 sysenter_cs; + u64 sysenter_esp; + u64 sysenter_eip; + u64 cr2; + u8 reserved_6[32]; + u64 g_pat; + u64 dbgctl; + u64 br_from; + u64 br_to; + u64 last_excp_from; + u64 last_excp_to; +}; + +struct __attribute__ ((__packed__)) vmcb { + struct vmcb_control_area control; + struct vmcb_save_area save; +}; + +#define SVM_CPUID_FUNC 0x8000000a + +#define SVM_VM_CR_SVM_DISABLE 4 + +#define SVM_SELECTOR_S_SHIFT 4 +#define SVM_SELECTOR_DPL_SHIFT 5 +#define SVM_SELECTOR_P_SHIFT 7 +#define SVM_SELECTOR_AVL_SHIFT 8 +#define SVM_SELECTOR_L_SHIFT 9 +#define SVM_SELECTOR_DB_SHIFT 10 +#define SVM_SELECTOR_G_SHIFT 11 + +#define SVM_SELECTOR_TYPE_MASK (0xf) +#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT) +#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT) +#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT) +#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT) +#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT) +#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT) +#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT) + +#define SVM_SELECTOR_WRITE_MASK (1 << 1) +#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK +#define SVM_SELECTOR_CODE_MASK (1 << 3) + +#define INTERCEPT_CR0_READ 0 +#define INTERCEPT_CR3_READ 3 +#define INTERCEPT_CR4_READ 4 +#define INTERCEPT_CR8_READ 8 +#define INTERCEPT_CR0_WRITE (16 + 0) +#define INTERCEPT_CR3_WRITE (16 + 3) +#define INTERCEPT_CR4_WRITE (16 + 4) +#define INTERCEPT_CR8_WRITE (16 + 8) + +#define INTERCEPT_DR0_READ 0 +#define INTERCEPT_DR1_READ 1 +#define INTERCEPT_DR2_READ 2 +#define INTERCEPT_DR3_READ 3 +#define INTERCEPT_DR4_READ 4 +#define INTERCEPT_DR5_READ 5 +#define INTERCEPT_DR6_READ 6 +#define INTERCEPT_DR7_READ 7 +#define INTERCEPT_DR0_WRITE (16 + 0) +#define INTERCEPT_DR1_WRITE (16 + 1) +#define INTERCEPT_DR2_WRITE (16 + 2) +#define INTERCEPT_DR3_WRITE (16 + 3) +#define INTERCEPT_DR4_WRITE (16 + 4) +#define INTERCEPT_DR5_WRITE (16 + 5) +#define INTERCEPT_DR6_WRITE (16 + 6) +#define INTERCEPT_DR7_WRITE (16 + 7) + +#define SVM_EVTINJ_VEC_MASK 0xff + +#define SVM_EVTINJ_TYPE_SHIFT 8 +#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT) + +#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT) + +#define SVM_EVTINJ_VALID (1 << 31) +#define SVM_EVTINJ_VALID_ERR (1 << 11) + +#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK +#define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK + +#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR +#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI +#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT +#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT + +#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID +#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR + +#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36 +#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38 +#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44 + +#define SVM_EXITINFO_REG_MASK 0x0F + +#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP) + +#endif /* SELFTEST_KVM_SVM_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/svm_util.h b/tools/testing/selftests/kvm/include/x86_64/svm_util.h new file mode 100644 index 000000000..b7531c83b --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86_64/svm_util.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * tools/testing/selftests/kvm/include/x86_64/svm_utils.h + * Header for nested SVM testing + * + * Copyright (C) 2020, Red Hat, Inc. + */ + +#ifndef SELFTEST_KVM_SVM_UTILS_H +#define SELFTEST_KVM_SVM_UTILS_H + +#include +#include "svm.h" +#include "processor.h" + +#define CPUID_SVM_BIT 2 +#define CPUID_SVM BIT_ULL(CPUID_SVM_BIT) + +#define SVM_EXIT_VMMCALL 0x081 + +struct svm_test_data { + /* VMCB */ + struct vmcb *vmcb; /* gva */ + void *vmcb_hva; + uint64_t vmcb_gpa; + + /* host state-save area */ + struct vmcb_save_area *save_area; /* gva */ + void *save_area_hva; + uint64_t save_area_gpa; +}; + +struct svm_test_data *vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva); +void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp); +void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa); +bool nested_svm_supported(void); +void nested_svm_check_supported(void); + +static inline bool cpu_has_svm(void) +{ + u32 eax = 0x80000001, ecx; + + asm("cpuid" : + "=a" (eax), "=c" (ecx) : "0" (eax) : "ebx", "edx"); + + return ecx & CPUID_SVM; +} + +#endif /* SELFTEST_KVM_SVM_UTILS_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h new file mode 100644 index 000000000..e78d7e26b --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -0,0 +1,625 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * tools/testing/selftests/kvm/include/x86_64/vmx.h + * + * Copyright (C) 2018, Google LLC. + */ + +#ifndef SELFTEST_KVM_VMX_H +#define SELFTEST_KVM_VMX_H + +#include +#include "processor.h" + +#define CPUID_VMX_BIT 5 + +#define CPUID_VMX (1 << 5) + +/* + * Definitions of Primary Processor-Based VM-Execution Controls. + */ +#define CPU_BASED_INTR_WINDOW_EXITING 0x00000004 +#define CPU_BASED_USE_TSC_OFFSETTING 0x00000008 +#define CPU_BASED_HLT_EXITING 0x00000080 +#define CPU_BASED_INVLPG_EXITING 0x00000200 +#define CPU_BASED_MWAIT_EXITING 0x00000400 +#define CPU_BASED_RDPMC_EXITING 0x00000800 +#define CPU_BASED_RDTSC_EXITING 0x00001000 +#define CPU_BASED_CR3_LOAD_EXITING 0x00008000 +#define CPU_BASED_CR3_STORE_EXITING 0x00010000 +#define CPU_BASED_CR8_LOAD_EXITING 0x00080000 +#define CPU_BASED_CR8_STORE_EXITING 0x00100000 +#define CPU_BASED_TPR_SHADOW 0x00200000 +#define CPU_BASED_NMI_WINDOW_EXITING 0x00400000 +#define CPU_BASED_MOV_DR_EXITING 0x00800000 +#define CPU_BASED_UNCOND_IO_EXITING 0x01000000 +#define CPU_BASED_USE_IO_BITMAPS 0x02000000 +#define CPU_BASED_MONITOR_TRAP 0x08000000 +#define CPU_BASED_USE_MSR_BITMAPS 0x10000000 +#define CPU_BASED_MONITOR_EXITING 0x20000000 +#define CPU_BASED_PAUSE_EXITING 0x40000000 +#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 + +#define CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x0401e172 + +/* + * Definitions of Secondary Processor-Based VM-Execution Controls. + */ +#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 +#define SECONDARY_EXEC_ENABLE_EPT 0x00000002 +#define SECONDARY_EXEC_DESC 0x00000004 +#define SECONDARY_EXEC_ENABLE_RDTSCP 0x00000008 +#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010 +#define SECONDARY_EXEC_ENABLE_VPID 0x00000020 +#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 +#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 +#define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100 +#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200 +#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 +#define SECONDARY_EXEC_RDRAND_EXITING 0x00000800 +#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 +#define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000 +#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 +#define SECONDARY_EXEC_RDSEED_EXITING 0x00010000 +#define SECONDARY_EXEC_ENABLE_PML 0x00020000 +#define SECONDARY_EPT_VE 0x00040000 +#define SECONDARY_ENABLE_XSAV_RESTORE 0x00100000 +#define SECONDARY_EXEC_TSC_SCALING 0x02000000 + +#define PIN_BASED_EXT_INTR_MASK 0x00000001 +#define PIN_BASED_NMI_EXITING 0x00000008 +#define PIN_BASED_VIRTUAL_NMIS 0x00000020 +#define PIN_BASED_VMX_PREEMPTION_TIMER 0x00000040 +#define PIN_BASED_POSTED_INTR 0x00000080 + +#define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x00000016 + +#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000004 +#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 +#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000 +#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 +#define VM_EXIT_SAVE_IA32_PAT 0x00040000 +#define VM_EXIT_LOAD_IA32_PAT 0x00080000 +#define VM_EXIT_SAVE_IA32_EFER 0x00100000 +#define VM_EXIT_LOAD_IA32_EFER 0x00200000 +#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000 + +#define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff + +#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000004 +#define VM_ENTRY_IA32E_MODE 0x00000200 +#define VM_ENTRY_SMM 0x00000400 +#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 +#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000 +#define VM_ENTRY_LOAD_IA32_PAT 0x00004000 +#define VM_ENTRY_LOAD_IA32_EFER 0x00008000 + +#define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff + +#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f +#define VMX_MISC_SAVE_EFER_LMA 0x00000020 + +#define EXIT_REASON_FAILED_VMENTRY 0x80000000 +#define EXIT_REASON_EXCEPTION_NMI 0 +#define EXIT_REASON_EXTERNAL_INTERRUPT 1 +#define EXIT_REASON_TRIPLE_FAULT 2 +#define EXIT_REASON_INTERRUPT_WINDOW 7 +#define EXIT_REASON_NMI_WINDOW 8 +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVD 13 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 +#define EXIT_REASON_VMCALL 18 +#define EXIT_REASON_VMCLEAR 19 +#define EXIT_REASON_VMLAUNCH 20 +#define EXIT_REASON_VMPTRLD 21 +#define EXIT_REASON_VMPTRST 22 +#define EXIT_REASON_VMREAD 23 +#define EXIT_REASON_VMRESUME 24 +#define EXIT_REASON_VMWRITE 25 +#define EXIT_REASON_VMOFF 26 +#define EXIT_REASON_VMON 27 +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_IO_INSTRUCTION 30 +#define EXIT_REASON_MSR_READ 31 +#define EXIT_REASON_MSR_WRITE 32 +#define EXIT_REASON_INVALID_STATE 33 +#define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_MONITOR_INSTRUCTION 39 +#define EXIT_REASON_PAUSE_INSTRUCTION 40 +#define EXIT_REASON_MCE_DURING_VMENTRY 41 +#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 +#define EXIT_REASON_APIC_ACCESS 44 +#define EXIT_REASON_EOI_INDUCED 45 +#define EXIT_REASON_EPT_VIOLATION 48 +#define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_INVEPT 50 +#define EXIT_REASON_RDTSCP 51 +#define EXIT_REASON_PREEMPTION_TIMER 52 +#define EXIT_REASON_INVVPID 53 +#define EXIT_REASON_WBINVD 54 +#define EXIT_REASON_XSETBV 55 +#define EXIT_REASON_APIC_WRITE 56 +#define EXIT_REASON_INVPCID 58 +#define EXIT_REASON_PML_FULL 62 +#define EXIT_REASON_XSAVES 63 +#define EXIT_REASON_XRSTORS 64 +#define LAST_EXIT_REASON 64 + +enum vmcs_field { + VIRTUAL_PROCESSOR_ID = 0x00000000, + POSTED_INTR_NV = 0x00000002, + GUEST_ES_SELECTOR = 0x00000800, + GUEST_CS_SELECTOR = 0x00000802, + GUEST_SS_SELECTOR = 0x00000804, + GUEST_DS_SELECTOR = 0x00000806, + GUEST_FS_SELECTOR = 0x00000808, + GUEST_GS_SELECTOR = 0x0000080a, + GUEST_LDTR_SELECTOR = 0x0000080c, + GUEST_TR_SELECTOR = 0x0000080e, + GUEST_INTR_STATUS = 0x00000810, + GUEST_PML_INDEX = 0x00000812, + HOST_ES_SELECTOR = 0x00000c00, + HOST_CS_SELECTOR = 0x00000c02, + HOST_SS_SELECTOR = 0x00000c04, + HOST_DS_SELECTOR = 0x00000c06, + HOST_FS_SELECTOR = 0x00000c08, + HOST_GS_SELECTOR = 0x00000c0a, + HOST_TR_SELECTOR = 0x00000c0c, + IO_BITMAP_A = 0x00002000, + IO_BITMAP_A_HIGH = 0x00002001, + IO_BITMAP_B = 0x00002002, + IO_BITMAP_B_HIGH = 0x00002003, + MSR_BITMAP = 0x00002004, + MSR_BITMAP_HIGH = 0x00002005, + VM_EXIT_MSR_STORE_ADDR = 0x00002006, + VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007, + VM_EXIT_MSR_LOAD_ADDR = 0x00002008, + VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009, + VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a, + VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b, + PML_ADDRESS = 0x0000200e, + PML_ADDRESS_HIGH = 0x0000200f, + TSC_OFFSET = 0x00002010, + TSC_OFFSET_HIGH = 0x00002011, + VIRTUAL_APIC_PAGE_ADDR = 0x00002012, + VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, + APIC_ACCESS_ADDR = 0x00002014, + APIC_ACCESS_ADDR_HIGH = 0x00002015, + POSTED_INTR_DESC_ADDR = 0x00002016, + POSTED_INTR_DESC_ADDR_HIGH = 0x00002017, + EPT_POINTER = 0x0000201a, + EPT_POINTER_HIGH = 0x0000201b, + EOI_EXIT_BITMAP0 = 0x0000201c, + EOI_EXIT_BITMAP0_HIGH = 0x0000201d, + EOI_EXIT_BITMAP1 = 0x0000201e, + EOI_EXIT_BITMAP1_HIGH = 0x0000201f, + EOI_EXIT_BITMAP2 = 0x00002020, + EOI_EXIT_BITMAP2_HIGH = 0x00002021, + EOI_EXIT_BITMAP3 = 0x00002022, + EOI_EXIT_BITMAP3_HIGH = 0x00002023, + VMREAD_BITMAP = 0x00002026, + VMREAD_BITMAP_HIGH = 0x00002027, + VMWRITE_BITMAP = 0x00002028, + VMWRITE_BITMAP_HIGH = 0x00002029, + XSS_EXIT_BITMAP = 0x0000202C, + XSS_EXIT_BITMAP_HIGH = 0x0000202D, + TSC_MULTIPLIER = 0x00002032, + TSC_MULTIPLIER_HIGH = 0x00002033, + GUEST_PHYSICAL_ADDRESS = 0x00002400, + GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, + VMCS_LINK_POINTER = 0x00002800, + VMCS_LINK_POINTER_HIGH = 0x00002801, + GUEST_IA32_DEBUGCTL = 0x00002802, + GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, + GUEST_IA32_PAT = 0x00002804, + GUEST_IA32_PAT_HIGH = 0x00002805, + GUEST_IA32_EFER = 0x00002806, + GUEST_IA32_EFER_HIGH = 0x00002807, + GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808, + GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809, + GUEST_PDPTR0 = 0x0000280a, + GUEST_PDPTR0_HIGH = 0x0000280b, + GUEST_PDPTR1 = 0x0000280c, + GUEST_PDPTR1_HIGH = 0x0000280d, + GUEST_PDPTR2 = 0x0000280e, + GUEST_PDPTR2_HIGH = 0x0000280f, + GUEST_PDPTR3 = 0x00002810, + GUEST_PDPTR3_HIGH = 0x00002811, + GUEST_BNDCFGS = 0x00002812, + GUEST_BNDCFGS_HIGH = 0x00002813, + HOST_IA32_PAT = 0x00002c00, + HOST_IA32_PAT_HIGH = 0x00002c01, + HOST_IA32_EFER = 0x00002c02, + HOST_IA32_EFER_HIGH = 0x00002c03, + HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04, + HOST_IA32_PERF_GLOBAL_CTRL_HIGH = 0x00002c05, + PIN_BASED_VM_EXEC_CONTROL = 0x00004000, + CPU_BASED_VM_EXEC_CONTROL = 0x00004002, + EXCEPTION_BITMAP = 0x00004004, + PAGE_FAULT_ERROR_CODE_MASK = 0x00004006, + PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008, + CR3_TARGET_COUNT = 0x0000400a, + VM_EXIT_CONTROLS = 0x0000400c, + VM_EXIT_MSR_STORE_COUNT = 0x0000400e, + VM_EXIT_MSR_LOAD_COUNT = 0x00004010, + VM_ENTRY_CONTROLS = 0x00004012, + VM_ENTRY_MSR_LOAD_COUNT = 0x00004014, + VM_ENTRY_INTR_INFO_FIELD = 0x00004016, + VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018, + VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, + TPR_THRESHOLD = 0x0000401c, + SECONDARY_VM_EXEC_CONTROL = 0x0000401e, + PLE_GAP = 0x00004020, + PLE_WINDOW = 0x00004022, + VM_INSTRUCTION_ERROR = 0x00004400, + VM_EXIT_REASON = 0x00004402, + VM_EXIT_INTR_INFO = 0x00004404, + VM_EXIT_INTR_ERROR_CODE = 0x00004406, + IDT_VECTORING_INFO_FIELD = 0x00004408, + IDT_VECTORING_ERROR_CODE = 0x0000440a, + VM_EXIT_INSTRUCTION_LEN = 0x0000440c, + VMX_INSTRUCTION_INFO = 0x0000440e, + GUEST_ES_LIMIT = 0x00004800, + GUEST_CS_LIMIT = 0x00004802, + GUEST_SS_LIMIT = 0x00004804, + GUEST_DS_LIMIT = 0x00004806, + GUEST_FS_LIMIT = 0x00004808, + GUEST_GS_LIMIT = 0x0000480a, + GUEST_LDTR_LIMIT = 0x0000480c, + GUEST_TR_LIMIT = 0x0000480e, + GUEST_GDTR_LIMIT = 0x00004810, + GUEST_IDTR_LIMIT = 0x00004812, + GUEST_ES_AR_BYTES = 0x00004814, + GUEST_CS_AR_BYTES = 0x00004816, + GUEST_SS_AR_BYTES = 0x00004818, + GUEST_DS_AR_BYTES = 0x0000481a, + GUEST_FS_AR_BYTES = 0x0000481c, + GUEST_GS_AR_BYTES = 0x0000481e, + GUEST_LDTR_AR_BYTES = 0x00004820, + GUEST_TR_AR_BYTES = 0x00004822, + GUEST_INTERRUPTIBILITY_INFO = 0x00004824, + GUEST_ACTIVITY_STATE = 0X00004826, + GUEST_SYSENTER_CS = 0x0000482A, + VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, + HOST_IA32_SYSENTER_CS = 0x00004c00, + CR0_GUEST_HOST_MASK = 0x00006000, + CR4_GUEST_HOST_MASK = 0x00006002, + CR0_READ_SHADOW = 0x00006004, + CR4_READ_SHADOW = 0x00006006, + CR3_TARGET_VALUE0 = 0x00006008, + CR3_TARGET_VALUE1 = 0x0000600a, + CR3_TARGET_VALUE2 = 0x0000600c, + CR3_TARGET_VALUE3 = 0x0000600e, + EXIT_QUALIFICATION = 0x00006400, + GUEST_LINEAR_ADDRESS = 0x0000640a, + GUEST_CR0 = 0x00006800, + GUEST_CR3 = 0x00006802, + GUEST_CR4 = 0x00006804, + GUEST_ES_BASE = 0x00006806, + GUEST_CS_BASE = 0x00006808, + GUEST_SS_BASE = 0x0000680a, + GUEST_DS_BASE = 0x0000680c, + GUEST_FS_BASE = 0x0000680e, + GUEST_GS_BASE = 0x00006810, + GUEST_LDTR_BASE = 0x00006812, + GUEST_TR_BASE = 0x00006814, + GUEST_GDTR_BASE = 0x00006816, + GUEST_IDTR_BASE = 0x00006818, + GUEST_DR7 = 0x0000681a, + GUEST_RSP = 0x0000681c, + GUEST_RIP = 0x0000681e, + GUEST_RFLAGS = 0x00006820, + GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822, + GUEST_SYSENTER_ESP = 0x00006824, + GUEST_SYSENTER_EIP = 0x00006826, + HOST_CR0 = 0x00006c00, + HOST_CR3 = 0x00006c02, + HOST_CR4 = 0x00006c04, + HOST_FS_BASE = 0x00006c06, + HOST_GS_BASE = 0x00006c08, + HOST_TR_BASE = 0x00006c0a, + HOST_GDTR_BASE = 0x00006c0c, + HOST_IDTR_BASE = 0x00006c0e, + HOST_IA32_SYSENTER_ESP = 0x00006c10, + HOST_IA32_SYSENTER_EIP = 0x00006c12, + HOST_RSP = 0x00006c14, + HOST_RIP = 0x00006c16, +}; + +struct vmx_msr_entry { + uint32_t index; + uint32_t reserved; + uint64_t value; +} __attribute__ ((aligned(16))); + +#include "evmcs.h" + +static inline int vmxon(uint64_t phys) +{ + uint8_t ret; + + __asm__ __volatile__ ("vmxon %[pa]; setna %[ret]" + : [ret]"=rm"(ret) + : [pa]"m"(phys) + : "cc", "memory"); + + return ret; +} + +static inline void vmxoff(void) +{ + __asm__ __volatile__("vmxoff"); +} + +static inline int vmclear(uint64_t vmcs_pa) +{ + uint8_t ret; + + __asm__ __volatile__ ("vmclear %[pa]; setna %[ret]" + : [ret]"=rm"(ret) + : [pa]"m"(vmcs_pa) + : "cc", "memory"); + + return ret; +} + +static inline int vmptrld(uint64_t vmcs_pa) +{ + uint8_t ret; + + if (enable_evmcs) + return -1; + + __asm__ __volatile__ ("vmptrld %[pa]; setna %[ret]" + : [ret]"=rm"(ret) + : [pa]"m"(vmcs_pa) + : "cc", "memory"); + + return ret; +} + +static inline int vmptrst(uint64_t *value) +{ + uint64_t tmp; + uint8_t ret; + + if (enable_evmcs) + return evmcs_vmptrst(value); + + __asm__ __volatile__("vmptrst %[value]; setna %[ret]" + : [value]"=m"(tmp), [ret]"=rm"(ret) + : : "cc", "memory"); + + *value = tmp; + return ret; +} + +/* + * A wrapper around vmptrst that ignores errors and returns zero if the + * vmptrst instruction fails. + */ +static inline uint64_t vmptrstz(void) +{ + uint64_t value = 0; + vmptrst(&value); + return value; +} + +/* + * No guest state (e.g. GPRs) is established by this vmlaunch. + */ +static inline int vmlaunch(void) +{ + int ret; + + if (enable_evmcs) + return evmcs_vmlaunch(); + + __asm__ __volatile__("push %%rbp;" + "push %%rcx;" + "push %%rdx;" + "push %%rsi;" + "push %%rdi;" + "push $0;" + "vmwrite %%rsp, %[host_rsp];" + "lea 1f(%%rip), %%rax;" + "vmwrite %%rax, %[host_rip];" + "vmlaunch;" + "incq (%%rsp);" + "1: pop %%rax;" + "pop %%rdi;" + "pop %%rsi;" + "pop %%rdx;" + "pop %%rcx;" + "pop %%rbp;" + : [ret]"=&a"(ret) + : [host_rsp]"r"((uint64_t)HOST_RSP), + [host_rip]"r"((uint64_t)HOST_RIP) + : "memory", "cc", "rbx", "r8", "r9", "r10", + "r11", "r12", "r13", "r14", "r15"); + return ret; +} + +/* + * No guest state (e.g. GPRs) is established by this vmresume. + */ +static inline int vmresume(void) +{ + int ret; + + if (enable_evmcs) + return evmcs_vmresume(); + + __asm__ __volatile__("push %%rbp;" + "push %%rcx;" + "push %%rdx;" + "push %%rsi;" + "push %%rdi;" + "push $0;" + "vmwrite %%rsp, %[host_rsp];" + "lea 1f(%%rip), %%rax;" + "vmwrite %%rax, %[host_rip];" + "vmresume;" + "incq (%%rsp);" + "1: pop %%rax;" + "pop %%rdi;" + "pop %%rsi;" + "pop %%rdx;" + "pop %%rcx;" + "pop %%rbp;" + : [ret]"=&a"(ret) + : [host_rsp]"r"((uint64_t)HOST_RSP), + [host_rip]"r"((uint64_t)HOST_RIP) + : "memory", "cc", "rbx", "r8", "r9", "r10", + "r11", "r12", "r13", "r14", "r15"); + return ret; +} + +static inline void vmcall(void) +{ + /* Currently, L1 destroys our GPRs during vmexits. */ + __asm__ __volatile__("push %%rbp; vmcall; pop %%rbp" : : : + "rax", "rbx", "rcx", "rdx", + "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", + "r13", "r14", "r15"); +} + +static inline int vmread(uint64_t encoding, uint64_t *value) +{ + uint64_t tmp; + uint8_t ret; + + if (enable_evmcs) + return evmcs_vmread(encoding, value); + + __asm__ __volatile__("vmread %[encoding], %[value]; setna %[ret]" + : [value]"=rm"(tmp), [ret]"=rm"(ret) + : [encoding]"r"(encoding) + : "cc", "memory"); + + *value = tmp; + return ret; +} + +/* + * A wrapper around vmread that ignores errors and returns zero if the + * vmread instruction fails. + */ +static inline uint64_t vmreadz(uint64_t encoding) +{ + uint64_t value = 0; + vmread(encoding, &value); + return value; +} + +static inline int vmwrite(uint64_t encoding, uint64_t value) +{ + uint8_t ret; + + if (enable_evmcs) + return evmcs_vmwrite(encoding, value); + + __asm__ __volatile__ ("vmwrite %[value], %[encoding]; setna %[ret]" + : [ret]"=rm"(ret) + : [value]"rm"(value), [encoding]"r"(encoding) + : "cc", "memory"); + + return ret; +} + +static inline uint32_t vmcs_revision(void) +{ + return rdmsr(MSR_IA32_VMX_BASIC); +} + +struct vmx_pages { + void *vmxon_hva; + uint64_t vmxon_gpa; + void *vmxon; + + void *vmcs_hva; + uint64_t vmcs_gpa; + void *vmcs; + + void *msr_hva; + uint64_t msr_gpa; + void *msr; + + void *shadow_vmcs_hva; + uint64_t shadow_vmcs_gpa; + void *shadow_vmcs; + + void *vmread_hva; + uint64_t vmread_gpa; + void *vmread; + + void *vmwrite_hva; + uint64_t vmwrite_gpa; + void *vmwrite; + + void *vp_assist_hva; + uint64_t vp_assist_gpa; + void *vp_assist; + + void *enlightened_vmcs_hva; + uint64_t enlightened_vmcs_gpa; + void *enlightened_vmcs; + + void *eptp_hva; + uint64_t eptp_gpa; + void *eptp; + + void *apic_access_hva; + uint64_t apic_access_gpa; + void *apic_access; +}; + +union vmx_basic { + u64 val; + struct { + u32 revision; + u32 size:13, + reserved1:3, + width:1, + dual:1, + type:4, + insouts:1, + ctrl:1, + vm_entry_exception_ctrl:1, + reserved2:7; + }; +}; + +union vmx_ctrl_msr { + u64 val; + struct { + u32 set, clr; + }; +}; + +struct vmx_pages *vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva); +bool prepare_for_vmx_operation(struct vmx_pages *vmx); +void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp); +bool load_vmcs(struct vmx_pages *vmx); + +bool nested_vmx_supported(void); +void nested_vmx_check_supported(void); + +void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint32_t eptp_memslot); +void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint64_t size, + uint32_t eptp_memslot); +void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, + uint32_t memslot, uint32_t eptp_memslot); +void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, + uint32_t eptp_memslot); +void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm, + uint32_t eptp_memslot); + +#endif /* SELFTEST_KVM_VMX_H */ diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c new file mode 100644 index 000000000..aa3795cd7 --- /dev/null +++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kvm_create_max_vcpus + * + * Copyright (C) 2019, Google LLC. + * + * Test for KVM_CAP_MAX_VCPUS and KVM_CAP_MAX_VCPU_ID. + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include +#include +#include +#include + +#include "test_util.h" + +#include "kvm_util.h" +#include "asm/kvm.h" +#include "linux/kvm.h" + +void test_vcpu_creation(int first_vcpu_id, int num_vcpus) +{ + struct kvm_vm *vm; + int i; + + pr_info("Testing creating %d vCPUs, with IDs %d...%d.\n", + num_vcpus, first_vcpu_id, first_vcpu_id + num_vcpus - 1); + + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + + for (i = first_vcpu_id; i < first_vcpu_id + num_vcpus; i++) + /* This asserts that the vCPU was created. */ + vm_vcpu_add(vm, i); + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + int kvm_max_vcpu_id = kvm_check_cap(KVM_CAP_MAX_VCPU_ID); + int kvm_max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); + /* + * Number of file descriptors reqired, KVM_CAP_MAX_VCPUS for vCPU fds + + * an arbitrary number for everything else. + */ + int nr_fds_wanted = kvm_max_vcpus + 100; + struct rlimit rl; + + pr_info("KVM_CAP_MAX_VCPU_ID: %d\n", kvm_max_vcpu_id); + pr_info("KVM_CAP_MAX_VCPUS: %d\n", kvm_max_vcpus); + + /* + * Check that we're allowed to open nr_fds_wanted file descriptors and + * try raising the limits if needed. + */ + TEST_ASSERT(!getrlimit(RLIMIT_NOFILE, &rl), "getrlimit() failed!"); + + if (rl.rlim_cur < nr_fds_wanted) { + rl.rlim_cur = nr_fds_wanted; + if (rl.rlim_max < nr_fds_wanted) { + int old_rlim_max = rl.rlim_max; + rl.rlim_max = nr_fds_wanted; + + int r = setrlimit(RLIMIT_NOFILE, &rl); + if (r < 0) { + printf("RLIMIT_NOFILE hard limit is too low (%d, wanted %d)\n", + old_rlim_max, nr_fds_wanted); + exit(KSFT_SKIP); + } + } else { + TEST_ASSERT(!setrlimit(RLIMIT_NOFILE, &rl), "setrlimit() failed!"); + } + } + + /* + * Upstream KVM prior to 4.8 does not support KVM_CAP_MAX_VCPU_ID. + * Userspace is supposed to use KVM_CAP_MAX_VCPUS as the maximum ID + * in this case. + */ + if (!kvm_max_vcpu_id) + kvm_max_vcpu_id = kvm_max_vcpus; + + TEST_ASSERT(kvm_max_vcpu_id >= kvm_max_vcpus, + "KVM_MAX_VCPU_ID (%d) must be at least as large as KVM_MAX_VCPUS (%d).", + kvm_max_vcpu_id, kvm_max_vcpus); + + test_vcpu_creation(0, kvm_max_vcpus); + + if (kvm_max_vcpu_id > kvm_max_vcpus) + test_vcpu_creation( + kvm_max_vcpu_id - kvm_max_vcpus, kvm_max_vcpus); + + return 0; +} diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c new file mode 100644 index 000000000..d6c32c328 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -0,0 +1,356 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AArch64 code + * + * Copyright (C) 2018, Red Hat, Inc. + */ + +#define _GNU_SOURCE /* for program_invocation_name */ + +#include + +#include "kvm_util.h" +#include "../kvm_util_internal.h" +#include "processor.h" + +#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 +#define DEFAULT_ARM64_GUEST_STACK_VADDR_MIN 0xac0000 + +static uint64_t page_align(struct kvm_vm *vm, uint64_t v) +{ + return (v + vm->page_size) & ~(vm->page_size - 1); +} + +static uint64_t pgd_index(struct kvm_vm *vm, vm_vaddr_t gva) +{ + unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift; + uint64_t mask = (1UL << (vm->va_bits - shift)) - 1; + + return (gva >> shift) & mask; +} + +static uint64_t pud_index(struct kvm_vm *vm, vm_vaddr_t gva) +{ + unsigned int shift = 2 * (vm->page_shift - 3) + vm->page_shift; + uint64_t mask = (1UL << (vm->page_shift - 3)) - 1; + + TEST_ASSERT(vm->pgtable_levels == 4, + "Mode %d does not have 4 page table levels", vm->mode); + + return (gva >> shift) & mask; +} + +static uint64_t pmd_index(struct kvm_vm *vm, vm_vaddr_t gva) +{ + unsigned int shift = (vm->page_shift - 3) + vm->page_shift; + uint64_t mask = (1UL << (vm->page_shift - 3)) - 1; + + TEST_ASSERT(vm->pgtable_levels >= 3, + "Mode %d does not have >= 3 page table levels", vm->mode); + + return (gva >> shift) & mask; +} + +static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva) +{ + uint64_t mask = (1UL << (vm->page_shift - 3)) - 1; + return (gva >> vm->page_shift) & mask; +} + +static uint64_t pte_addr(struct kvm_vm *vm, uint64_t entry) +{ + uint64_t mask = ((1UL << (vm->va_bits - vm->page_shift)) - 1) << vm->page_shift; + return entry & mask; +} + +static uint64_t ptrs_per_pgd(struct kvm_vm *vm) +{ + unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift; + return 1 << (vm->va_bits - shift); +} + +static uint64_t __maybe_unused ptrs_per_pte(struct kvm_vm *vm) +{ + return 1 << (vm->page_shift - 3); +} + +void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot) +{ + if (!vm->pgd_created) { + vm_paddr_t paddr = vm_phy_pages_alloc(vm, + page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); + vm->pgd = paddr; + vm->pgd_created = true; + } +} + +void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + uint32_t pgd_memslot, uint64_t flags) +{ + uint8_t attr_idx = flags & 7; + uint64_t *ptep; + + TEST_ASSERT((vaddr % vm->page_size) == 0, + "Virtual address not on page boundary,\n" + " vaddr: 0x%lx vm->page_size: 0x%x", vaddr, vm->page_size); + TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, + (vaddr >> vm->page_shift)), + "Invalid virtual address, vaddr: 0x%lx", vaddr); + TEST_ASSERT((paddr % vm->page_size) == 0, + "Physical address not on page boundary,\n" + " paddr: 0x%lx vm->page_size: 0x%x", paddr, vm->page_size); + TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, + "Physical address beyond beyond maximum supported,\n" + " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", + paddr, vm->max_gfn, vm->page_size); + + ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, vaddr) * 8; + if (!*ptep) { + *ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); + *ptep |= 3; + } + + switch (vm->pgtable_levels) { + case 4: + ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, vaddr) * 8; + if (!*ptep) { + *ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); + *ptep |= 3; + } + /* fall through */ + case 3: + ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pmd_index(vm, vaddr) * 8; + if (!*ptep) { + *ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); + *ptep |= 3; + } + /* fall through */ + case 2: + ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pte_index(vm, vaddr) * 8; + break; + default: + TEST_FAIL("Page table levels must be 2, 3, or 4"); + } + + *ptep = paddr | 3; + *ptep |= (attr_idx << 2) | (1 << 10) /* Access Flag */; +} + +void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + uint32_t pgd_memslot) +{ + uint64_t attr_idx = 4; /* NORMAL (See DEFAULT_MAIR_EL1) */ + + _virt_pg_map(vm, vaddr, paddr, pgd_memslot, attr_idx); +} + +vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +{ + uint64_t *ptep; + + if (!vm->pgd_created) + goto unmapped_gva; + + ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, gva) * 8; + if (!ptep) + goto unmapped_gva; + + switch (vm->pgtable_levels) { + case 4: + ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, gva) * 8; + if (!ptep) + goto unmapped_gva; + /* fall through */ + case 3: + ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pmd_index(vm, gva) * 8; + if (!ptep) + goto unmapped_gva; + /* fall through */ + case 2: + ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pte_index(vm, gva) * 8; + if (!ptep) + goto unmapped_gva; + break; + default: + TEST_FAIL("Page table levels must be 2, 3, or 4"); + } + + return pte_addr(vm, *ptep) + (gva & (vm->page_size - 1)); + +unmapped_gva: + TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva); + exit(1); +} + +static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, uint64_t page, int level) +{ +#ifdef DEBUG + static const char * const type[] = { "", "pud", "pmd", "pte" }; + uint64_t pte, *ptep; + + if (level == 4) + return; + + for (pte = page; pte < page + ptrs_per_pte(vm) * 8; pte += 8) { + ptep = addr_gpa2hva(vm, pte); + if (!*ptep) + continue; + fprintf(stream, "%*s%s: %lx: %lx at %p\n", indent, "", type[level], pte, *ptep, ptep); + pte_dump(stream, vm, indent + 1, pte_addr(vm, *ptep), level + 1); + } +#endif +} + +void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +{ + int level = 4 - (vm->pgtable_levels - 1); + uint64_t pgd, *ptep; + + if (!vm->pgd_created) + return; + + for (pgd = vm->pgd; pgd < vm->pgd + ptrs_per_pgd(vm) * 8; pgd += 8) { + ptep = addr_gpa2hva(vm, pgd); + if (!*ptep) + continue; + fprintf(stream, "%*spgd: %lx: %lx at %p\n", indent, "", pgd, *ptep, ptep); + pte_dump(stream, vm, indent + 1, pte_addr(vm, *ptep), level); + } +} + +struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, + void *guest_code) +{ + uint64_t ptrs_per_4k_pte = 512; + uint64_t extra_pg_pages = (extra_mem_pages / ptrs_per_4k_pte) * 2; + struct kvm_vm *vm; + + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); + + kvm_vm_elf_load(vm, program_invocation_name, 0, 0); + vm_vcpu_add_default(vm, vcpuid, guest_code); + + return vm; +} + +void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *init) +{ + struct kvm_vcpu_init default_init = { .target = -1, }; + uint64_t sctlr_el1, tcr_el1; + + if (!init) + init = &default_init; + + if (init->target == -1) { + struct kvm_vcpu_init preferred; + vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &preferred); + init->target = preferred.target; + } + + vcpu_ioctl(vm, vcpuid, KVM_ARM_VCPU_INIT, init); + + /* + * Enable FP/ASIMD to avoid trapping when accessing Q0-Q15 + * registers, which the variable argument list macros do. + */ + set_reg(vm, vcpuid, ARM64_SYS_REG(CPACR_EL1), 3 << 20); + + get_reg(vm, vcpuid, ARM64_SYS_REG(SCTLR_EL1), &sctlr_el1); + get_reg(vm, vcpuid, ARM64_SYS_REG(TCR_EL1), &tcr_el1); + + switch (vm->mode) { + case VM_MODE_P52V48_4K: + TEST_FAIL("AArch64 does not support 4K sized pages " + "with 52-bit physical address ranges"); + case VM_MODE_PXXV48_4K: + TEST_FAIL("AArch64 does not support 4K sized pages " + "with ANY-bit physical address ranges"); + case VM_MODE_P52V48_64K: + tcr_el1 |= 1ul << 14; /* TG0 = 64KB */ + tcr_el1 |= 6ul << 32; /* IPS = 52 bits */ + break; + case VM_MODE_P48V48_4K: + tcr_el1 |= 0ul << 14; /* TG0 = 4KB */ + tcr_el1 |= 5ul << 32; /* IPS = 48 bits */ + break; + case VM_MODE_P48V48_64K: + tcr_el1 |= 1ul << 14; /* TG0 = 64KB */ + tcr_el1 |= 5ul << 32; /* IPS = 48 bits */ + break; + case VM_MODE_P40V48_4K: + tcr_el1 |= 0ul << 14; /* TG0 = 4KB */ + tcr_el1 |= 2ul << 32; /* IPS = 40 bits */ + break; + case VM_MODE_P40V48_64K: + tcr_el1 |= 1ul << 14; /* TG0 = 64KB */ + tcr_el1 |= 2ul << 32; /* IPS = 40 bits */ + break; + default: + TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); + } + + sctlr_el1 |= (1 << 0) | (1 << 2) | (1 << 12) /* M | C | I */; + /* TCR_EL1 |= IRGN0:WBWA | ORGN0:WBWA | SH0:Inner-Shareable */; + tcr_el1 |= (1 << 8) | (1 << 10) | (3 << 12); + tcr_el1 |= (64 - vm->va_bits) /* T0SZ */; + + set_reg(vm, vcpuid, ARM64_SYS_REG(SCTLR_EL1), sctlr_el1); + set_reg(vm, vcpuid, ARM64_SYS_REG(TCR_EL1), tcr_el1); + set_reg(vm, vcpuid, ARM64_SYS_REG(MAIR_EL1), DEFAULT_MAIR_EL1); + set_reg(vm, vcpuid, ARM64_SYS_REG(TTBR0_EL1), vm->pgd); +} + +void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) +{ + uint64_t pstate, pc; + + get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pstate), &pstate); + get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), &pc); + + fprintf(stream, "%*spstate: 0x%.16lx pc: 0x%.16lx\n", + indent, "", pstate, pc); +} + +void aarch64_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_vcpu_init *init, void *guest_code) +{ + size_t stack_size = vm->page_size == 4096 ? + DEFAULT_STACK_PGS * vm->page_size : + vm->page_size; + uint64_t stack_vaddr = vm_vaddr_alloc(vm, stack_size, + DEFAULT_ARM64_GUEST_STACK_VADDR_MIN, 0, 0); + + vm_vcpu_add(vm, vcpuid); + aarch64_vcpu_setup(vm, vcpuid, init); + + set_reg(vm, vcpuid, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size); + set_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code); +} + +void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) +{ + aarch64_vcpu_add_default(vm, vcpuid, NULL, guest_code); +} + +void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) +{ + va_list ap; + int i; + + TEST_ASSERT(num >= 1 && num <= 8, "Unsupported number of args,\n" + " num: %u\n", num); + + va_start(ap, num); + + for (i = 0; i < num; i++) { + set_reg(vm, vcpuid, ARM64_CORE_REG(regs.regs[i]), + va_arg(ap, uint64_t)); + } + + va_end(ap); +} + +void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid) +{ +} diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c new file mode 100644 index 000000000..f600311fd --- /dev/null +++ b/tools/testing/selftests/kvm/lib/aarch64/ucall.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ucall support. A ucall is a "hypercall to userspace". + * + * Copyright (C) 2018, Red Hat, Inc. + */ +#include "kvm_util.h" +#include "../kvm_util_internal.h" + +static vm_vaddr_t *ucall_exit_mmio_addr; + +static bool ucall_mmio_init(struct kvm_vm *vm, vm_paddr_t gpa) +{ + if (kvm_userspace_memory_region_find(vm, gpa, gpa + 1)) + return false; + + virt_pg_map(vm, gpa, gpa, 0); + + ucall_exit_mmio_addr = (vm_vaddr_t *)gpa; + sync_global_to_guest(vm, ucall_exit_mmio_addr); + + return true; +} + +void ucall_init(struct kvm_vm *vm, void *arg) +{ + vm_paddr_t gpa, start, end, step, offset; + unsigned int bits; + bool ret; + + if (arg) { + gpa = (vm_paddr_t)arg; + ret = ucall_mmio_init(vm, gpa); + TEST_ASSERT(ret, "Can't set ucall mmio address to %lx", gpa); + return; + } + + /* + * Find an address within the allowed physical and virtual address + * spaces, that does _not_ have a KVM memory region associated with + * it. Identity mapping an address like this allows the guest to + * access it, but as KVM doesn't know what to do with it, it + * will assume it's something userspace handles and exit with + * KVM_EXIT_MMIO. Well, at least that's how it works for AArch64. + * Here we start with a guess that the addresses around 5/8th + * of the allowed space are unmapped and then work both down and + * up from there in 1/16th allowed space sized steps. + * + * Note, we need to use VA-bits - 1 when calculating the allowed + * virtual address space for an identity mapping because the upper + * half of the virtual address space is the two's complement of the + * lower and won't match physical addresses. + */ + bits = vm->va_bits - 1; + bits = vm->pa_bits < bits ? vm->pa_bits : bits; + end = 1ul << bits; + start = end * 5 / 8; + step = end / 16; + for (offset = 0; offset < end - start; offset += step) { + if (ucall_mmio_init(vm, start - offset)) + return; + if (ucall_mmio_init(vm, start + offset)) + return; + } + TEST_FAIL("Can't find a ucall mmio address"); +} + +void ucall_uninit(struct kvm_vm *vm) +{ + ucall_exit_mmio_addr = 0; + sync_global_to_guest(vm, ucall_exit_mmio_addr); +} + +void ucall(uint64_t cmd, int nargs, ...) +{ + struct ucall uc = {}; + va_list va; + int i; + + WRITE_ONCE(uc.cmd, cmd); + nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS; + + va_start(va, nargs); + for (i = 0; i < nargs; ++i) + WRITE_ONCE(uc.args[i], va_arg(va, uint64_t)); + va_end(va); + + WRITE_ONCE(*ucall_exit_mmio_addr, (vm_vaddr_t)&uc); +} + +uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) +{ + struct kvm_run *run = vcpu_state(vm, vcpu_id); + struct ucall ucall = {}; + + if (uc) + memset(uc, 0, sizeof(*uc)); + + if (run->exit_reason == KVM_EXIT_MMIO && + run->mmio.phys_addr == (uint64_t)ucall_exit_mmio_addr) { + vm_vaddr_t gva; + + TEST_ASSERT(run->mmio.is_write && run->mmio.len == 8, + "Unexpected ucall exit mmio address access"); + memcpy(&gva, run->mmio.data, sizeof(gva)); + memcpy(&ucall, addr_gva2hva(vm, gva), sizeof(ucall)); + + vcpu_run_complete_io(vm, vcpu_id); + if (uc) + memcpy(uc, &ucall, sizeof(ucall)); + } + + return ucall.cmd; +} diff --git a/tools/testing/selftests/kvm/lib/assert.c b/tools/testing/selftests/kvm/lib/assert.c new file mode 100644 index 000000000..5ebbd0d6b --- /dev/null +++ b/tools/testing/selftests/kvm/lib/assert.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * tools/testing/selftests/kvm/lib/assert.c + * + * Copyright (C) 2018, Google LLC. + */ + +#define _GNU_SOURCE /* for getline(3) and strchrnul(3)*/ + +#include "test_util.h" + +#include +#include + +#include "kselftest.h" + +/* Dumps the current stack trace to stderr. */ +static void __attribute__((noinline)) test_dump_stack(void); +static void test_dump_stack(void) +{ + /* + * Build and run this command: + * + * addr2line -s -e /proc/$PPID/exe -fpai {backtrace addresses} | \ + * grep -v test_dump_stack | cat -n 1>&2 + * + * Note that the spacing is different and there's no newline. + */ + size_t i; + size_t n = 20; + void *stack[n]; + const char *addr2line = "addr2line -s -e /proc/$PPID/exe -fpai"; + const char *pipeline = "|cat -n 1>&2"; + char cmd[strlen(addr2line) + strlen(pipeline) + + /* N bytes per addr * 2 digits per byte + 1 space per addr: */ + n * (((sizeof(void *)) * 2) + 1) + + /* Null terminator: */ + 1]; + char *c; + + n = backtrace(stack, n); + c = &cmd[0]; + c += sprintf(c, "%s", addr2line); + /* + * Skip the first 3 frames: backtrace, test_dump_stack, and + * test_assert. We hope that backtrace isn't inlined and the other two + * we've declared noinline. + */ + for (i = 2; i < n; i++) + c += sprintf(c, " %lx", ((unsigned long) stack[i]) - 1); + c += sprintf(c, "%s", pipeline); +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-result" + system(cmd); +#pragma GCC diagnostic pop +} + +static pid_t _gettid(void) +{ + return syscall(SYS_gettid); +} + +void __attribute__((noinline)) +test_assert(bool exp, const char *exp_str, + const char *file, unsigned int line, const char *fmt, ...) +{ + va_list ap; + + if (!(exp)) { + va_start(ap, fmt); + + fprintf(stderr, "==== Test Assertion Failure ====\n" + " %s:%u: %s\n" + " pid=%d tid=%d - %s\n", + file, line, exp_str, getpid(), _gettid(), + strerror(errno)); + test_dump_stack(); + if (fmt) { + fputs(" ", stderr); + vfprintf(stderr, fmt, ap); + fputs("\n", stderr); + } + va_end(ap); + + if (errno == EACCES) { + print_skip("Access denied - Exiting"); + exit(KSFT_SKIP); + } + exit(254); + } + + return; +} diff --git a/tools/testing/selftests/kvm/lib/elf.c b/tools/testing/selftests/kvm/lib/elf.c new file mode 100644 index 000000000..bc75a91e0 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/elf.c @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * tools/testing/selftests/kvm/lib/elf.c + * + * Copyright (C) 2018, Google LLC. + */ + +#include "test_util.h" + +#include +#include + +#include "kvm_util.h" +#include "kvm_util_internal.h" + +static void elfhdr_get(const char *filename, Elf64_Ehdr *hdrp) +{ + off_t offset_rv; + + /* Open the ELF file. */ + int fd; + fd = open(filename, O_RDONLY); + TEST_ASSERT(fd >= 0, "Failed to open ELF file,\n" + " filename: %s\n" + " rv: %i errno: %i", filename, fd, errno); + + /* Read in and validate ELF Identification Record. + * The ELF Identification record is the first 16 (EI_NIDENT) bytes + * of the ELF header, which is at the beginning of the ELF file. + * For now it is only safe to read the first EI_NIDENT bytes. Once + * read and validated, the value of e_ehsize can be used to determine + * the real size of the ELF header. + */ + unsigned char ident[EI_NIDENT]; + test_read(fd, ident, sizeof(ident)); + TEST_ASSERT((ident[EI_MAG0] == ELFMAG0) && (ident[EI_MAG1] == ELFMAG1) + && (ident[EI_MAG2] == ELFMAG2) && (ident[EI_MAG3] == ELFMAG3), + "ELF MAGIC Mismatch,\n" + " filename: %s\n" + " ident[EI_MAG0 - EI_MAG3]: %02x %02x %02x %02x\n" + " Expected: %02x %02x %02x %02x", + filename, + ident[EI_MAG0], ident[EI_MAG1], ident[EI_MAG2], ident[EI_MAG3], + ELFMAG0, ELFMAG1, ELFMAG2, ELFMAG3); + TEST_ASSERT(ident[EI_CLASS] == ELFCLASS64, + "Current implementation only able to handle ELFCLASS64,\n" + " filename: %s\n" + " ident[EI_CLASS]: %02x\n" + " expected: %02x", + filename, + ident[EI_CLASS], ELFCLASS64); + TEST_ASSERT(((BYTE_ORDER == LITTLE_ENDIAN) + && (ident[EI_DATA] == ELFDATA2LSB)) + || ((BYTE_ORDER == BIG_ENDIAN) + && (ident[EI_DATA] == ELFDATA2MSB)), "Current " + "implementation only able to handle\n" + "cases where the host and ELF file endianness\n" + "is the same:\n" + " host BYTE_ORDER: %u\n" + " host LITTLE_ENDIAN: %u\n" + " host BIG_ENDIAN: %u\n" + " ident[EI_DATA]: %u\n" + " ELFDATA2LSB: %u\n" + " ELFDATA2MSB: %u", + BYTE_ORDER, LITTLE_ENDIAN, BIG_ENDIAN, + ident[EI_DATA], ELFDATA2LSB, ELFDATA2MSB); + TEST_ASSERT(ident[EI_VERSION] == EV_CURRENT, + "Current implementation only able to handle current " + "ELF version,\n" + " filename: %s\n" + " ident[EI_VERSION]: %02x\n" + " expected: %02x", + filename, ident[EI_VERSION], EV_CURRENT); + + /* Read in the ELF header. + * With the ELF Identification portion of the ELF header + * validated, especially that the value at EI_VERSION is + * as expected, it is now safe to read the entire ELF header. + */ + offset_rv = lseek(fd, 0, SEEK_SET); + TEST_ASSERT(offset_rv == 0, "Seek to ELF header failed,\n" + " rv: %zi expected: %i", offset_rv, 0); + test_read(fd, hdrp, sizeof(*hdrp)); + TEST_ASSERT(hdrp->e_phentsize == sizeof(Elf64_Phdr), + "Unexpected physical header size,\n" + " hdrp->e_phentsize: %x\n" + " expected: %zx", + hdrp->e_phentsize, sizeof(Elf64_Phdr)); + TEST_ASSERT(hdrp->e_shentsize == sizeof(Elf64_Shdr), + "Unexpected section header size,\n" + " hdrp->e_shentsize: %x\n" + " expected: %zx", + hdrp->e_shentsize, sizeof(Elf64_Shdr)); +} + +/* VM ELF Load + * + * Input Args: + * filename - Path to ELF file + * + * Output Args: None + * + * Input/Output Args: + * vm - Pointer to opaque type that describes the VM. + * + * Return: None, TEST_ASSERT failures for all error conditions + * + * Loads the program image of the ELF file specified by filename, + * into the virtual address space of the VM pointed to by vm. On entry + * the VM needs to not be using any of the virtual address space used + * by the image and it needs to have sufficient available physical pages, to + * back the virtual pages used to load the image. + */ +void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename, + uint32_t data_memslot, uint32_t pgd_memslot) +{ + off_t offset, offset_rv; + Elf64_Ehdr hdr; + + /* Open the ELF file. */ + int fd; + fd = open(filename, O_RDONLY); + TEST_ASSERT(fd >= 0, "Failed to open ELF file,\n" + " filename: %s\n" + " rv: %i errno: %i", filename, fd, errno); + + /* Read in the ELF header. */ + elfhdr_get(filename, &hdr); + + /* For each program header. + * The following ELF header members specify the location + * and size of the program headers: + * + * e_phoff - File offset to start of program headers + * e_phentsize - Size of each program header + * e_phnum - Number of program header entries + */ + for (unsigned int n1 = 0; n1 < hdr.e_phnum; n1++) { + /* Seek to the beginning of the program header. */ + offset = hdr.e_phoff + (n1 * hdr.e_phentsize); + offset_rv = lseek(fd, offset, SEEK_SET); + TEST_ASSERT(offset_rv == offset, + "Failed to seek to begining of program header %u,\n" + " filename: %s\n" + " rv: %jd errno: %i", + n1, filename, (intmax_t) offset_rv, errno); + + /* Read in the program header. */ + Elf64_Phdr phdr; + test_read(fd, &phdr, sizeof(phdr)); + + /* Skip if this header doesn't describe a loadable segment. */ + if (phdr.p_type != PT_LOAD) + continue; + + /* Allocate memory for this segment within the VM. */ + TEST_ASSERT(phdr.p_memsz > 0, "Unexpected loadable segment " + "memsize of 0,\n" + " phdr index: %u p_memsz: 0x%" PRIx64, + n1, (uint64_t) phdr.p_memsz); + vm_vaddr_t seg_vstart = phdr.p_vaddr; + seg_vstart &= ~(vm_vaddr_t)(vm->page_size - 1); + vm_vaddr_t seg_vend = phdr.p_vaddr + phdr.p_memsz - 1; + seg_vend |= vm->page_size - 1; + size_t seg_size = seg_vend - seg_vstart + 1; + + vm_vaddr_t vaddr = vm_vaddr_alloc(vm, seg_size, seg_vstart, + data_memslot, pgd_memslot); + TEST_ASSERT(vaddr == seg_vstart, "Unable to allocate " + "virtual memory for segment at requested min addr,\n" + " segment idx: %u\n" + " seg_vstart: 0x%lx\n" + " vaddr: 0x%lx", + n1, seg_vstart, vaddr); + memset(addr_gva2hva(vm, vaddr), 0, seg_size); + /* TODO(lhuemill): Set permissions of each memory segment + * based on the least-significant 3 bits of phdr.p_flags. + */ + + /* Load portion of initial state that is contained within + * the ELF file. + */ + if (phdr.p_filesz) { + offset_rv = lseek(fd, phdr.p_offset, SEEK_SET); + TEST_ASSERT(offset_rv == phdr.p_offset, + "Seek to program segment offset failed,\n" + " program header idx: %u errno: %i\n" + " offset_rv: 0x%jx\n" + " expected: 0x%jx\n", + n1, errno, (intmax_t) offset_rv, + (intmax_t) phdr.p_offset); + test_read(fd, addr_gva2hva(vm, phdr.p_vaddr), + phdr.p_filesz); + } + } +} diff --git a/tools/testing/selftests/kvm/lib/io.c b/tools/testing/selftests/kvm/lib/io.c new file mode 100644 index 000000000..fedb2a741 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/io.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * tools/testing/selftests/kvm/lib/io.c + * + * Copyright (C) 2018, Google LLC. + */ + +#include "test_util.h" + +/* Test Write + * + * A wrapper for write(2), that automatically handles the following + * special conditions: + * + * + Interrupted system call (EINTR) + * + Write of less than requested amount + * + Non-block return (EAGAIN) + * + * For each of the above, an additional write is performed to automatically + * continue writing the requested data. + * There are also many cases where write(2) can return an unexpected + * error (e.g. EIO). Such errors cause a TEST_ASSERT failure. + * + * Note, for function signature compatibility with write(2), this function + * returns the number of bytes written, but that value will always be equal + * to the number of requested bytes. All other conditions in this and + * future enhancements to this function either automatically issue another + * write(2) or cause a TEST_ASSERT failure. + * + * Args: + * fd - Opened file descriptor to file to be written. + * count - Number of bytes to write. + * + * Output: + * buf - Starting address of data to be written. + * + * Return: + * On success, number of bytes written. + * On failure, a TEST_ASSERT failure is caused. + */ +ssize_t test_write(int fd, const void *buf, size_t count) +{ + ssize_t rc; + ssize_t num_written = 0; + size_t num_left = count; + const char *ptr = buf; + + /* Note: Count of zero is allowed (see "RETURN VALUE" portion of + * write(2) manpage for details. + */ + TEST_ASSERT(count >= 0, "Unexpected count, count: %li", count); + + do { + rc = write(fd, ptr, num_left); + + switch (rc) { + case -1: + TEST_ASSERT(errno == EAGAIN || errno == EINTR, + "Unexpected write failure,\n" + " rc: %zi errno: %i", rc, errno); + continue; + + case 0: + TEST_FAIL("Unexpected EOF,\n" + " rc: %zi num_written: %zi num_left: %zu", + rc, num_written, num_left); + break; + + default: + TEST_ASSERT(rc >= 0, "Unexpected ret from write,\n" + " rc: %zi errno: %i", rc, errno); + num_written += rc; + num_left -= rc; + ptr += rc; + break; + } + } while (num_written < count); + + return num_written; +} + +/* Test Read + * + * A wrapper for read(2), that automatically handles the following + * special conditions: + * + * + Interrupted system call (EINTR) + * + Read of less than requested amount + * + Non-block return (EAGAIN) + * + * For each of the above, an additional read is performed to automatically + * continue reading the requested data. + * There are also many cases where read(2) can return an unexpected + * error (e.g. EIO). Such errors cause a TEST_ASSERT failure. Note, + * it is expected that the file opened by fd at the current file position + * contains at least the number of requested bytes to be read. A TEST_ASSERT + * failure is produced if an End-Of-File condition occurs, before all the + * data is read. It is the callers responsibility to assure that sufficient + * data exists. + * + * Note, for function signature compatibility with read(2), this function + * returns the number of bytes read, but that value will always be equal + * to the number of requested bytes. All other conditions in this and + * future enhancements to this function either automatically issue another + * read(2) or cause a TEST_ASSERT failure. + * + * Args: + * fd - Opened file descriptor to file to be read. + * count - Number of bytes to read. + * + * Output: + * buf - Starting address of where to write the bytes read. + * + * Return: + * On success, number of bytes read. + * On failure, a TEST_ASSERT failure is caused. + */ +ssize_t test_read(int fd, void *buf, size_t count) +{ + ssize_t rc; + ssize_t num_read = 0; + size_t num_left = count; + char *ptr = buf; + + /* Note: Count of zero is allowed (see "If count is zero" portion of + * read(2) manpage for details. + */ + TEST_ASSERT(count >= 0, "Unexpected count, count: %li", count); + + do { + rc = read(fd, ptr, num_left); + + switch (rc) { + case -1: + TEST_ASSERT(errno == EAGAIN || errno == EINTR, + "Unexpected read failure,\n" + " rc: %zi errno: %i", rc, errno); + break; + + case 0: + TEST_FAIL("Unexpected EOF,\n" + " rc: %zi num_read: %zi num_left: %zu", + rc, num_read, num_left); + break; + + default: + TEST_ASSERT(rc > 0, "Unexpected ret from read,\n" + " rc: %zi errno: %i", rc, errno); + num_read += rc; + num_left -= rc; + ptr += rc; + break; + } + } while (num_read < count); + + return num_read; +} diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c new file mode 100644 index 000000000..49805fd16 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -0,0 +1,1865 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * tools/testing/selftests/kvm/lib/kvm_util.c + * + * Copyright (C) 2018, Google LLC. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "kvm_util_internal.h" +#include "processor.h" + +#include +#include +#include +#include +#include +#include + +#define KVM_UTIL_PGS_PER_HUGEPG 512 +#define KVM_UTIL_MIN_PFN 2 + +/* Aligns x up to the next multiple of size. Size must be a power of 2. */ +static void *align(void *x, size_t size) +{ + size_t mask = size - 1; + TEST_ASSERT(size != 0 && !(size & (size - 1)), + "size not a power of 2: %lu", size); + return (void *) (((size_t) x + mask) & ~mask); +} + +/* + * Capability + * + * Input Args: + * cap - Capability + * + * Output Args: None + * + * Return: + * On success, the Value corresponding to the capability (KVM_CAP_*) + * specified by the value of cap. On failure a TEST_ASSERT failure + * is produced. + * + * Looks up and returns the value corresponding to the capability + * (KVM_CAP_*) given by cap. + */ +int kvm_check_cap(long cap) +{ + int ret; + int kvm_fd; + + kvm_fd = open(KVM_DEV_PATH, O_RDONLY); + if (kvm_fd < 0) + exit(KSFT_SKIP); + + ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap); + TEST_ASSERT(ret >= 0, "KVM_CHECK_EXTENSION IOCTL failed,\n" + " rc: %i errno: %i", ret, errno); + + close(kvm_fd); + + return ret; +} + +/* VM Enable Capability + * + * Input Args: + * vm - Virtual Machine + * cap - Capability + * + * Output Args: None + * + * Return: On success, 0. On failure a TEST_ASSERT failure is produced. + * + * Enables a capability (KVM_CAP_*) on the VM. + */ +int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap) +{ + int ret; + + ret = ioctl(vm->fd, KVM_ENABLE_CAP, cap); + TEST_ASSERT(ret == 0, "KVM_ENABLE_CAP IOCTL failed,\n" + " rc: %i errno: %i", ret, errno); + + return ret; +} + +/* VCPU Enable Capability + * + * Input Args: + * vm - Virtual Machine + * vcpu_id - VCPU + * cap - Capability + * + * Output Args: None + * + * Return: On success, 0. On failure a TEST_ASSERT failure is produced. + * + * Enables a capability (KVM_CAP_*) on the VCPU. + */ +int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id, + struct kvm_enable_cap *cap) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpu_id); + int r; + + TEST_ASSERT(vcpu, "cannot find vcpu %d", vcpu_id); + + r = ioctl(vcpu->fd, KVM_ENABLE_CAP, cap); + TEST_ASSERT(!r, "KVM_ENABLE_CAP vCPU ioctl failed,\n" + " rc: %i, errno: %i", r, errno); + + return r; +} + +static void vm_open(struct kvm_vm *vm, int perm) +{ + vm->kvm_fd = open(KVM_DEV_PATH, perm); + if (vm->kvm_fd < 0) + exit(KSFT_SKIP); + + if (!kvm_check_cap(KVM_CAP_IMMEDIATE_EXIT)) { + print_skip("immediate_exit not available"); + exit(KSFT_SKIP); + } + + vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, vm->type); + TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, " + "rc: %i errno: %i", vm->fd, errno); +} + +const char * const vm_guest_mode_string[] = { + "PA-bits:52, VA-bits:48, 4K pages", + "PA-bits:52, VA-bits:48, 64K pages", + "PA-bits:48, VA-bits:48, 4K pages", + "PA-bits:48, VA-bits:48, 64K pages", + "PA-bits:40, VA-bits:48, 4K pages", + "PA-bits:40, VA-bits:48, 64K pages", + "PA-bits:ANY, VA-bits:48, 4K pages", +}; +_Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES, + "Missing new mode strings?"); + +struct vm_guest_mode_params { + unsigned int pa_bits; + unsigned int va_bits; + unsigned int page_size; + unsigned int page_shift; +}; + +static const struct vm_guest_mode_params vm_guest_mode_params[] = { + { 52, 48, 0x1000, 12 }, + { 52, 48, 0x10000, 16 }, + { 48, 48, 0x1000, 12 }, + { 48, 48, 0x10000, 16 }, + { 40, 48, 0x1000, 12 }, + { 40, 48, 0x10000, 16 }, + { 0, 0, 0x1000, 12 }, +}; +_Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES, + "Missing new mode params?"); + +/* + * VM Create + * + * Input Args: + * mode - VM Mode (e.g. VM_MODE_P52V48_4K) + * phy_pages - Physical memory pages + * perm - permission + * + * Output Args: None + * + * Return: + * Pointer to opaque structure that describes the created VM. + * + * Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K). + * When phy_pages is non-zero, a memory region of phy_pages physical pages + * is created and mapped starting at guest physical address 0. The file + * descriptor to control the created VM is created with the permissions + * given by perm (e.g. O_RDWR). + */ +struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) +{ + struct kvm_vm *vm; + + pr_debug("%s: mode='%s' pages='%ld' perm='%d'\n", __func__, + vm_guest_mode_string(mode), phy_pages, perm); + + vm = calloc(1, sizeof(*vm)); + TEST_ASSERT(vm != NULL, "Insufficient Memory"); + + INIT_LIST_HEAD(&vm->vcpus); + INIT_LIST_HEAD(&vm->userspace_mem_regions); + + vm->mode = mode; + vm->type = 0; + + vm->pa_bits = vm_guest_mode_params[mode].pa_bits; + vm->va_bits = vm_guest_mode_params[mode].va_bits; + vm->page_size = vm_guest_mode_params[mode].page_size; + vm->page_shift = vm_guest_mode_params[mode].page_shift; + + /* Setup mode specific traits. */ + switch (vm->mode) { + case VM_MODE_P52V48_4K: + vm->pgtable_levels = 4; + break; + case VM_MODE_P52V48_64K: + vm->pgtable_levels = 3; + break; + case VM_MODE_P48V48_4K: + vm->pgtable_levels = 4; + break; + case VM_MODE_P48V48_64K: + vm->pgtable_levels = 3; + break; + case VM_MODE_P40V48_4K: + vm->pgtable_levels = 4; + break; + case VM_MODE_P40V48_64K: + vm->pgtable_levels = 3; + break; + case VM_MODE_PXXV48_4K: +#ifdef __x86_64__ + kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits); + /* + * Ignore KVM support for 5-level paging (vm->va_bits == 57), + * it doesn't take effect unless a CR4.LA57 is set, which it + * isn't for this VM_MODE. + */ + TEST_ASSERT(vm->va_bits == 48 || vm->va_bits == 57, + "Linear address width (%d bits) not supported", + vm->va_bits); + pr_debug("Guest physical address width detected: %d\n", + vm->pa_bits); + vm->pgtable_levels = 4; + vm->va_bits = 48; +#else + TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms"); +#endif + break; + default: + TEST_FAIL("Unknown guest mode, mode: 0x%x", mode); + } + +#ifdef __aarch64__ + if (vm->pa_bits != 40) + vm->type = KVM_VM_TYPE_ARM_IPA_SIZE(vm->pa_bits); +#endif + + vm_open(vm, perm); + + /* Limit to VA-bit canonical virtual addresses. */ + vm->vpages_valid = sparsebit_alloc(); + sparsebit_set_num(vm->vpages_valid, + 0, (1ULL << (vm->va_bits - 1)) >> vm->page_shift); + sparsebit_set_num(vm->vpages_valid, + (~((1ULL << (vm->va_bits - 1)) - 1)) >> vm->page_shift, + (1ULL << (vm->va_bits - 1)) >> vm->page_shift); + + /* Limit physical addresses to PA-bits. */ + vm->max_gfn = ((1ULL << vm->pa_bits) >> vm->page_shift) - 1; + + /* Allocate and setup memory for guest. */ + vm->vpages_mapped = sparsebit_alloc(); + if (phy_pages != 0) + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + 0, 0, phy_pages, 0); + + return vm; +} + +/* + * VM Restart + * + * Input Args: + * vm - VM that has been released before + * perm - permission + * + * Output Args: None + * + * Reopens the file descriptors associated to the VM and reinstates the + * global state, such as the irqchip and the memory regions that are mapped + * into the guest. + */ +void kvm_vm_restart(struct kvm_vm *vmp, int perm) +{ + struct userspace_mem_region *region; + + vm_open(vmp, perm); + if (vmp->has_irqchip) + vm_create_irqchip(vmp); + + list_for_each_entry(region, &vmp->userspace_mem_regions, list) { + int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); + TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" + " rc: %i errno: %i\n" + " slot: %u flags: 0x%x\n" + " guest_phys_addr: 0x%llx size: 0x%llx", + ret, errno, region->region.slot, + region->region.flags, + region->region.guest_phys_addr, + region->region.memory_size); + } +} + +void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log) +{ + struct kvm_dirty_log args = { .dirty_bitmap = log, .slot = slot }; + int ret; + + ret = ioctl(vm->fd, KVM_GET_DIRTY_LOG, &args); + TEST_ASSERT(ret == 0, "%s: KVM_GET_DIRTY_LOG failed: %s", + __func__, strerror(-ret)); +} + +void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, + uint64_t first_page, uint32_t num_pages) +{ + struct kvm_clear_dirty_log args = { .dirty_bitmap = log, .slot = slot, + .first_page = first_page, + .num_pages = num_pages }; + int ret; + + ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args); + TEST_ASSERT(ret == 0, "%s: KVM_CLEAR_DIRTY_LOG failed: %s", + __func__, strerror(-ret)); +} + +/* + * Userspace Memory Region Find + * + * Input Args: + * vm - Virtual Machine + * start - Starting VM physical address + * end - Ending VM physical address, inclusive. + * + * Output Args: None + * + * Return: + * Pointer to overlapping region, NULL if no such region. + * + * Searches for a region with any physical memory that overlaps with + * any portion of the guest physical addresses from start to end + * inclusive. If multiple overlapping regions exist, a pointer to any + * of the regions is returned. Null is returned only when no overlapping + * region exists. + */ +static struct userspace_mem_region * +userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end) +{ + struct userspace_mem_region *region; + + list_for_each_entry(region, &vm->userspace_mem_regions, list) { + uint64_t existing_start = region->region.guest_phys_addr; + uint64_t existing_end = region->region.guest_phys_addr + + region->region.memory_size - 1; + if (start <= existing_end && end >= existing_start) + return region; + } + + return NULL; +} + +/* + * KVM Userspace Memory Region Find + * + * Input Args: + * vm - Virtual Machine + * start - Starting VM physical address + * end - Ending VM physical address, inclusive. + * + * Output Args: None + * + * Return: + * Pointer to overlapping region, NULL if no such region. + * + * Public interface to userspace_mem_region_find. Allows tests to look up + * the memslot datastructure for a given range of guest physical memory. + */ +struct kvm_userspace_memory_region * +kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, + uint64_t end) +{ + struct userspace_mem_region *region; + + region = userspace_mem_region_find(vm, start, end); + if (!region) + return NULL; + + return ®ion->region; +} + +/* + * VCPU Find + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * + * Output Args: None + * + * Return: + * Pointer to VCPU structure + * + * Locates a vcpu structure that describes the VCPU specified by vcpuid and + * returns a pointer to it. Returns NULL if the VM doesn't contain a VCPU + * for the specified vcpuid. + */ +struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct vcpu *vcpu; + + list_for_each_entry(vcpu, &vm->vcpus, list) { + if (vcpu->id == vcpuid) + return vcpu; + } + + return NULL; +} + +/* + * VM VCPU Remove + * + * Input Args: + * vcpu - VCPU to remove + * + * Output Args: None + * + * Return: None, TEST_ASSERT failures for all error conditions + * + * Removes a vCPU from a VM and frees its resources. + */ +static void vm_vcpu_rm(struct vcpu *vcpu) +{ + int ret; + + ret = munmap(vcpu->state, sizeof(*vcpu->state)); + TEST_ASSERT(ret == 0, "munmap of VCPU fd failed, rc: %i " + "errno: %i", ret, errno); + close(vcpu->fd); + TEST_ASSERT(ret == 0, "Close of VCPU fd failed, rc: %i " + "errno: %i", ret, errno); + + list_del(&vcpu->list); + free(vcpu); +} + +void kvm_vm_release(struct kvm_vm *vmp) +{ + struct vcpu *vcpu, *tmp; + int ret; + + list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list) + vm_vcpu_rm(vcpu); + + ret = close(vmp->fd); + TEST_ASSERT(ret == 0, "Close of vm fd failed,\n" + " vmp->fd: %i rc: %i errno: %i", vmp->fd, ret, errno); + + close(vmp->kvm_fd); + TEST_ASSERT(ret == 0, "Close of /dev/kvm fd failed,\n" + " vmp->kvm_fd: %i rc: %i errno: %i", vmp->kvm_fd, ret, errno); +} + +static void __vm_mem_region_delete(struct kvm_vm *vm, + struct userspace_mem_region *region) +{ + int ret; + + list_del(®ion->list); + + region->region.memory_size = 0; + ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); + TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed, " + "rc: %i errno: %i", ret, errno); + + sparsebit_free(®ion->unused_phy_pages); + ret = munmap(region->mmap_start, region->mmap_size); + TEST_ASSERT(ret == 0, "munmap failed, rc: %i errno: %i", ret, errno); + + free(region); +} + +/* + * Destroys and frees the VM pointed to by vmp. + */ +void kvm_vm_free(struct kvm_vm *vmp) +{ + struct userspace_mem_region *region, *tmp; + + if (vmp == NULL) + return; + + /* Free userspace_mem_regions. */ + list_for_each_entry_safe(region, tmp, &vmp->userspace_mem_regions, list) + __vm_mem_region_delete(vmp, region); + + /* Free sparsebit arrays. */ + sparsebit_free(&vmp->vpages_valid); + sparsebit_free(&vmp->vpages_mapped); + + kvm_vm_release(vmp); + + /* Free the structure describing the VM. */ + free(vmp); +} + +/* + * Memory Compare, host virtual to guest virtual + * + * Input Args: + * hva - Starting host virtual address + * vm - Virtual Machine + * gva - Starting guest virtual address + * len - number of bytes to compare + * + * Output Args: None + * + * Input/Output Args: None + * + * Return: + * Returns 0 if the bytes starting at hva for a length of len + * are equal the guest virtual bytes starting at gva. Returns + * a value < 0, if bytes at hva are less than those at gva. + * Otherwise a value > 0 is returned. + * + * Compares the bytes starting at the host virtual address hva, for + * a length of len, to the guest bytes starting at the guest virtual + * address given by gva. + */ +int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, vm_vaddr_t gva, size_t len) +{ + size_t amt; + + /* + * Compare a batch of bytes until either a match is found + * or all the bytes have been compared. + */ + for (uintptr_t offset = 0; offset < len; offset += amt) { + uintptr_t ptr1 = (uintptr_t)hva + offset; + + /* + * Determine host address for guest virtual address + * at offset. + */ + uintptr_t ptr2 = (uintptr_t)addr_gva2hva(vm, gva + offset); + + /* + * Determine amount to compare on this pass. + * Don't allow the comparsion to cross a page boundary. + */ + amt = len - offset; + if ((ptr1 >> vm->page_shift) != ((ptr1 + amt) >> vm->page_shift)) + amt = vm->page_size - (ptr1 % vm->page_size); + if ((ptr2 >> vm->page_shift) != ((ptr2 + amt) >> vm->page_shift)) + amt = vm->page_size - (ptr2 % vm->page_size); + + assert((ptr1 >> vm->page_shift) == ((ptr1 + amt - 1) >> vm->page_shift)); + assert((ptr2 >> vm->page_shift) == ((ptr2 + amt - 1) >> vm->page_shift)); + + /* + * Perform the comparison. If there is a difference + * return that result to the caller, otherwise need + * to continue on looking for a mismatch. + */ + int ret = memcmp((void *)ptr1, (void *)ptr2, amt); + if (ret != 0) + return ret; + } + + /* + * No mismatch found. Let the caller know the two memory + * areas are equal. + */ + return 0; +} + +/* + * VM Userspace Memory Region Add + * + * Input Args: + * vm - Virtual Machine + * backing_src - Storage source for this region. + * NULL to use anonymous memory. + * guest_paddr - Starting guest physical address + * slot - KVM region slot + * npages - Number of physical pages + * flags - KVM memory region flags (e.g. KVM_MEM_LOG_DIRTY_PAGES) + * + * Output Args: None + * + * Return: None + * + * Allocates a memory area of the number of pages specified by npages + * and maps it to the VM specified by vm, at a starting physical address + * given by guest_paddr. The region is created with a KVM region slot + * given by slot, which must be unique and < KVM_MEM_SLOTS_NUM. The + * region is created with the flags given by flags. + */ +void vm_userspace_mem_region_add(struct kvm_vm *vm, + enum vm_mem_backing_src_type src_type, + uint64_t guest_paddr, uint32_t slot, uint64_t npages, + uint32_t flags) +{ + int ret; + struct userspace_mem_region *region; + size_t huge_page_size = KVM_UTIL_PGS_PER_HUGEPG * vm->page_size; + size_t alignment; + + TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages, + "Number of guest pages is not compatible with the host. " + "Try npages=%d", vm_adjust_num_guest_pages(vm->mode, npages)); + + TEST_ASSERT((guest_paddr % vm->page_size) == 0, "Guest physical " + "address not on a page boundary.\n" + " guest_paddr: 0x%lx vm->page_size: 0x%x", + guest_paddr, vm->page_size); + TEST_ASSERT((((guest_paddr >> vm->page_shift) + npages) - 1) + <= vm->max_gfn, "Physical range beyond maximum " + "supported physical address,\n" + " guest_paddr: 0x%lx npages: 0x%lx\n" + " vm->max_gfn: 0x%lx vm->page_size: 0x%x", + guest_paddr, npages, vm->max_gfn, vm->page_size); + + /* + * Confirm a mem region with an overlapping address doesn't + * already exist. + */ + region = (struct userspace_mem_region *) userspace_mem_region_find( + vm, guest_paddr, (guest_paddr + npages * vm->page_size) - 1); + if (region != NULL) + TEST_FAIL("overlapping userspace_mem_region already " + "exists\n" + " requested guest_paddr: 0x%lx npages: 0x%lx " + "page_size: 0x%x\n" + " existing guest_paddr: 0x%lx size: 0x%lx", + guest_paddr, npages, vm->page_size, + (uint64_t) region->region.guest_phys_addr, + (uint64_t) region->region.memory_size); + + /* Confirm no region with the requested slot already exists. */ + list_for_each_entry(region, &vm->userspace_mem_regions, list) { + if (region->region.slot != slot) + continue; + + TEST_FAIL("A mem region with the requested slot " + "already exists.\n" + " requested slot: %u paddr: 0x%lx npages: 0x%lx\n" + " existing slot: %u paddr: 0x%lx size: 0x%lx", + slot, guest_paddr, npages, + region->region.slot, + (uint64_t) region->region.guest_phys_addr, + (uint64_t) region->region.memory_size); + } + + /* Allocate and initialize new mem region structure. */ + region = calloc(1, sizeof(*region)); + TEST_ASSERT(region != NULL, "Insufficient Memory"); + region->mmap_size = npages * vm->page_size; + +#ifdef __s390x__ + /* On s390x, the host address must be aligned to 1M (due to PGSTEs) */ + alignment = 0x100000; +#else + alignment = 1; +#endif + + if (src_type == VM_MEM_SRC_ANONYMOUS_THP) + alignment = max(huge_page_size, alignment); + + /* Add enough memory to align up if necessary */ + if (alignment > 1) + region->mmap_size += alignment; + + region->mmap_start = mmap(NULL, region->mmap_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS + | (src_type == VM_MEM_SRC_ANONYMOUS_HUGETLB ? MAP_HUGETLB : 0), + -1, 0); + TEST_ASSERT(region->mmap_start != MAP_FAILED, + "test_malloc failed, mmap_start: %p errno: %i", + region->mmap_start, errno); + + /* Align host address */ + region->host_mem = align(region->mmap_start, alignment); + + /* As needed perform madvise */ + if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == VM_MEM_SRC_ANONYMOUS_THP) { + struct stat statbuf; + + ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf); + TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT), + "stat /sys/kernel/mm/transparent_hugepage"); + + TEST_ASSERT(ret == 0 || src_type != VM_MEM_SRC_ANONYMOUS_THP, + "VM_MEM_SRC_ANONYMOUS_THP requires THP to be configured in the host kernel"); + + if (ret == 0) { + ret = madvise(region->host_mem, npages * vm->page_size, + src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE); + TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %x", + region->host_mem, npages * vm->page_size, src_type); + } + } + + region->unused_phy_pages = sparsebit_alloc(); + sparsebit_set_num(region->unused_phy_pages, + guest_paddr >> vm->page_shift, npages); + region->region.slot = slot; + region->region.flags = flags; + region->region.guest_phys_addr = guest_paddr; + region->region.memory_size = npages * vm->page_size; + region->region.userspace_addr = (uintptr_t) region->host_mem; + ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); + TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" + " rc: %i errno: %i\n" + " slot: %u flags: 0x%x\n" + " guest_phys_addr: 0x%lx size: 0x%lx", + ret, errno, slot, flags, + guest_paddr, (uint64_t) region->region.memory_size); + + /* Add to linked-list of memory regions. */ + list_add(®ion->list, &vm->userspace_mem_regions); +} + +/* + * Memslot to region + * + * Input Args: + * vm - Virtual Machine + * memslot - KVM memory slot ID + * + * Output Args: None + * + * Return: + * Pointer to memory region structure that describe memory region + * using kvm memory slot ID given by memslot. TEST_ASSERT failure + * on error (e.g. currently no memory region using memslot as a KVM + * memory slot ID). + */ +struct userspace_mem_region * +memslot2region(struct kvm_vm *vm, uint32_t memslot) +{ + struct userspace_mem_region *region; + + list_for_each_entry(region, &vm->userspace_mem_regions, list) { + if (region->region.slot == memslot) + return region; + } + + fprintf(stderr, "No mem region with the requested slot found,\n" + " requested slot: %u\n", memslot); + fputs("---- vm dump ----\n", stderr); + vm_dump(stderr, vm, 2); + TEST_FAIL("Mem region not found"); + return NULL; +} + +/* + * VM Memory Region Flags Set + * + * Input Args: + * vm - Virtual Machine + * flags - Starting guest physical address + * + * Output Args: None + * + * Return: None + * + * Sets the flags of the memory region specified by the value of slot, + * to the values given by flags. + */ +void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags) +{ + int ret; + struct userspace_mem_region *region; + + region = memslot2region(vm, slot); + + region->region.flags = flags; + + ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); + + TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" + " rc: %i errno: %i slot: %u flags: 0x%x", + ret, errno, slot, flags); +} + +/* + * VM Memory Region Move + * + * Input Args: + * vm - Virtual Machine + * slot - Slot of the memory region to move + * new_gpa - Starting guest physical address + * + * Output Args: None + * + * Return: None + * + * Change the gpa of a memory region. + */ +void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa) +{ + struct userspace_mem_region *region; + int ret; + + region = memslot2region(vm, slot); + + region->region.guest_phys_addr = new_gpa; + + ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); + + TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION failed\n" + "ret: %i errno: %i slot: %u new_gpa: 0x%lx", + ret, errno, slot, new_gpa); +} + +/* + * VM Memory Region Delete + * + * Input Args: + * vm - Virtual Machine + * slot - Slot of the memory region to delete + * + * Output Args: None + * + * Return: None + * + * Delete a memory region. + */ +void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot) +{ + __vm_mem_region_delete(vm, memslot2region(vm, slot)); +} + +/* + * VCPU mmap Size + * + * Input Args: None + * + * Output Args: None + * + * Return: + * Size of VCPU state + * + * Returns the size of the structure pointed to by the return value + * of vcpu_state(). + */ +static int vcpu_mmap_sz(void) +{ + int dev_fd, ret; + + dev_fd = open(KVM_DEV_PATH, O_RDONLY); + if (dev_fd < 0) + exit(KSFT_SKIP); + + ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL); + TEST_ASSERT(ret >= sizeof(struct kvm_run), + "%s KVM_GET_VCPU_MMAP_SIZE ioctl failed, rc: %i errno: %i", + __func__, ret, errno); + + close(dev_fd); + + return ret; +} + +/* + * VM VCPU Add + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * + * Output Args: None + * + * Return: None + * + * Adds a virtual CPU to the VM specified by vm with the ID given by vcpuid. + * No additional VCPU setup is done. + */ +void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct vcpu *vcpu; + + /* Confirm a vcpu with the specified id doesn't already exist. */ + vcpu = vcpu_find(vm, vcpuid); + if (vcpu != NULL) + TEST_FAIL("vcpu with the specified id " + "already exists,\n" + " requested vcpuid: %u\n" + " existing vcpuid: %u state: %p", + vcpuid, vcpu->id, vcpu->state); + + /* Allocate and initialize new vcpu structure. */ + vcpu = calloc(1, sizeof(*vcpu)); + TEST_ASSERT(vcpu != NULL, "Insufficient Memory"); + vcpu->id = vcpuid; + vcpu->fd = ioctl(vm->fd, KVM_CREATE_VCPU, vcpuid); + TEST_ASSERT(vcpu->fd >= 0, "KVM_CREATE_VCPU failed, rc: %i errno: %i", + vcpu->fd, errno); + + TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->state), "vcpu mmap size " + "smaller than expected, vcpu_mmap_sz: %i expected_min: %zi", + vcpu_mmap_sz(), sizeof(*vcpu->state)); + vcpu->state = (struct kvm_run *) mmap(NULL, sizeof(*vcpu->state), + PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, 0); + TEST_ASSERT(vcpu->state != MAP_FAILED, "mmap vcpu_state failed, " + "vcpu id: %u errno: %i", vcpuid, errno); + + /* Add to linked-list of VCPUs. */ + list_add(&vcpu->list, &vm->vcpus); +} + +/* + * VM Virtual Address Unused Gap + * + * Input Args: + * vm - Virtual Machine + * sz - Size (bytes) + * vaddr_min - Minimum Virtual Address + * + * Output Args: None + * + * Return: + * Lowest virtual address at or below vaddr_min, with at least + * sz unused bytes. TEST_ASSERT failure if no area of at least + * size sz is available. + * + * Within the VM specified by vm, locates the lowest starting virtual + * address >= vaddr_min, that has at least sz unallocated bytes. A + * TEST_ASSERT failure occurs for invalid input or no area of at least + * sz unallocated bytes >= vaddr_min is available. + */ +static vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, + vm_vaddr_t vaddr_min) +{ + uint64_t pages = (sz + vm->page_size - 1) >> vm->page_shift; + + /* Determine lowest permitted virtual page index. */ + uint64_t pgidx_start = (vaddr_min + vm->page_size - 1) >> vm->page_shift; + if ((pgidx_start * vm->page_size) < vaddr_min) + goto no_va_found; + + /* Loop over section with enough valid virtual page indexes. */ + if (!sparsebit_is_set_num(vm->vpages_valid, + pgidx_start, pages)) + pgidx_start = sparsebit_next_set_num(vm->vpages_valid, + pgidx_start, pages); + do { + /* + * Are there enough unused virtual pages available at + * the currently proposed starting virtual page index. + * If not, adjust proposed starting index to next + * possible. + */ + if (sparsebit_is_clear_num(vm->vpages_mapped, + pgidx_start, pages)) + goto va_found; + pgidx_start = sparsebit_next_clear_num(vm->vpages_mapped, + pgidx_start, pages); + if (pgidx_start == 0) + goto no_va_found; + + /* + * If needed, adjust proposed starting virtual address, + * to next range of valid virtual addresses. + */ + if (!sparsebit_is_set_num(vm->vpages_valid, + pgidx_start, pages)) { + pgidx_start = sparsebit_next_set_num( + vm->vpages_valid, pgidx_start, pages); + if (pgidx_start == 0) + goto no_va_found; + } + } while (pgidx_start != 0); + +no_va_found: + TEST_FAIL("No vaddr of specified pages available, pages: 0x%lx", pages); + + /* NOT REACHED */ + return -1; + +va_found: + TEST_ASSERT(sparsebit_is_set_num(vm->vpages_valid, + pgidx_start, pages), + "Unexpected, invalid virtual page index range,\n" + " pgidx_start: 0x%lx\n" + " pages: 0x%lx", + pgidx_start, pages); + TEST_ASSERT(sparsebit_is_clear_num(vm->vpages_mapped, + pgidx_start, pages), + "Unexpected, pages already mapped,\n" + " pgidx_start: 0x%lx\n" + " pages: 0x%lx", + pgidx_start, pages); + + return pgidx_start * vm->page_size; +} + +/* + * VM Virtual Address Allocate + * + * Input Args: + * vm - Virtual Machine + * sz - Size in bytes + * vaddr_min - Minimum starting virtual address + * data_memslot - Memory region slot for data pages + * pgd_memslot - Memory region slot for new virtual translation tables + * + * Output Args: None + * + * Return: + * Starting guest virtual address + * + * Allocates at least sz bytes within the virtual address space of the vm + * given by vm. The allocated bytes are mapped to a virtual address >= + * the address given by vaddr_min. Note that each allocation uses a + * a unique set of pages, with the minimum real allocation being at least + * a page. + */ +vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, + uint32_t data_memslot, uint32_t pgd_memslot) +{ + uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0); + + virt_pgd_alloc(vm, pgd_memslot); + + /* + * Find an unused range of virtual page addresses of at least + * pages in length. + */ + vm_vaddr_t vaddr_start = vm_vaddr_unused_gap(vm, sz, vaddr_min); + + /* Map the virtual pages. */ + for (vm_vaddr_t vaddr = vaddr_start; pages > 0; + pages--, vaddr += vm->page_size) { + vm_paddr_t paddr; + + paddr = vm_phy_page_alloc(vm, + KVM_UTIL_MIN_PFN * vm->page_size, data_memslot); + + virt_pg_map(vm, vaddr, paddr, pgd_memslot); + + sparsebit_set(vm->vpages_mapped, + vaddr >> vm->page_shift); + } + + return vaddr_start; +} + +/* + * Map a range of VM virtual address to the VM's physical address + * + * Input Args: + * vm - Virtual Machine + * vaddr - Virtuall address to map + * paddr - VM Physical Address + * npages - The number of pages to map + * pgd_memslot - Memory region slot for new virtual translation tables + * + * Output Args: None + * + * Return: None + * + * Within the VM given by @vm, creates a virtual translation for + * @npages starting at @vaddr to the page range starting at @paddr. + */ +void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + unsigned int npages, uint32_t pgd_memslot) +{ + size_t page_size = vm->page_size; + size_t size = npages * page_size; + + TEST_ASSERT(vaddr + size > vaddr, "Vaddr overflow"); + TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); + + while (npages--) { + virt_pg_map(vm, vaddr, paddr, pgd_memslot); + vaddr += page_size; + paddr += page_size; + } +} + +/* + * Address VM Physical to Host Virtual + * + * Input Args: + * vm - Virtual Machine + * gpa - VM physical address + * + * Output Args: None + * + * Return: + * Equivalent host virtual address + * + * Locates the memory region containing the VM physical address given + * by gpa, within the VM given by vm. When found, the host virtual + * address providing the memory to the vm physical address is returned. + * A TEST_ASSERT failure occurs if no region containing gpa exists. + */ +void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) +{ + struct userspace_mem_region *region; + + list_for_each_entry(region, &vm->userspace_mem_regions, list) { + if ((gpa >= region->region.guest_phys_addr) + && (gpa <= (region->region.guest_phys_addr + + region->region.memory_size - 1))) + return (void *) ((uintptr_t) region->host_mem + + (gpa - region->region.guest_phys_addr)); + } + + TEST_FAIL("No vm physical memory at 0x%lx", gpa); + return NULL; +} + +/* + * Address Host Virtual to VM Physical + * + * Input Args: + * vm - Virtual Machine + * hva - Host virtual address + * + * Output Args: None + * + * Return: + * Equivalent VM physical address + * + * Locates the memory region containing the host virtual address given + * by hva, within the VM given by vm. When found, the equivalent + * VM physical address is returned. A TEST_ASSERT failure occurs if no + * region containing hva exists. + */ +vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva) +{ + struct userspace_mem_region *region; + + list_for_each_entry(region, &vm->userspace_mem_regions, list) { + if ((hva >= region->host_mem) + && (hva <= (region->host_mem + + region->region.memory_size - 1))) + return (vm_paddr_t) ((uintptr_t) + region->region.guest_phys_addr + + (hva - (uintptr_t) region->host_mem)); + } + + TEST_FAIL("No mapping to a guest physical address, hva: %p", hva); + return -1; +} + +/* + * VM Create IRQ Chip + * + * Input Args: + * vm - Virtual Machine + * + * Output Args: None + * + * Return: None + * + * Creates an interrupt controller chip for the VM specified by vm. + */ +void vm_create_irqchip(struct kvm_vm *vm) +{ + int ret; + + ret = ioctl(vm->fd, KVM_CREATE_IRQCHIP, 0); + TEST_ASSERT(ret == 0, "KVM_CREATE_IRQCHIP IOCTL failed, " + "rc: %i errno: %i", ret, errno); + + vm->has_irqchip = true; +} + +/* + * VM VCPU State + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * + * Output Args: None + * + * Return: + * Pointer to structure that describes the state of the VCPU. + * + * Locates and returns a pointer to a structure that describes the + * state of the VCPU with the given vcpuid. + */ +struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + return vcpu->state; +} + +/* + * VM VCPU Run + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * + * Output Args: None + * + * Return: None + * + * Switch to executing the code for the VCPU given by vcpuid, within the VM + * given by vm. + */ +void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) +{ + int ret = _vcpu_run(vm, vcpuid); + TEST_ASSERT(ret == 0, "KVM_RUN IOCTL failed, " + "rc: %i errno: %i", ret, errno); +} + +int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int rc; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + do { + rc = ioctl(vcpu->fd, KVM_RUN, NULL); + } while (rc == -1 && errno == EINTR); + + assert_on_unhandled_exception(vm, vcpuid); + + return rc; +} + +void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int ret; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + vcpu->state->immediate_exit = 1; + ret = ioctl(vcpu->fd, KVM_RUN, NULL); + vcpu->state->immediate_exit = 0; + + TEST_ASSERT(ret == -1 && errno == EINTR, + "KVM_RUN IOCTL didn't exit immediately, rc: %i, errno: %i", + ret, errno); +} + +void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_guest_debug *debug) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int ret = ioctl(vcpu->fd, KVM_SET_GUEST_DEBUG, debug); + + TEST_ASSERT(ret == 0, "KVM_SET_GUEST_DEBUG failed: %d", ret); +} + +/* + * VM VCPU Set MP State + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * mp_state - mp_state to be set + * + * Output Args: None + * + * Return: None + * + * Sets the MP state of the VCPU given by vcpuid, to the state given + * by mp_state. + */ +void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_mp_state *mp_state) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int ret; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + ret = ioctl(vcpu->fd, KVM_SET_MP_STATE, mp_state); + TEST_ASSERT(ret == 0, "KVM_SET_MP_STATE IOCTL failed, " + "rc: %i errno: %i", ret, errno); +} + +/* + * VM VCPU Get Reg List + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * + * Output Args: + * None + * + * Return: + * A pointer to an allocated struct kvm_reg_list + * + * Get the list of guest registers which are supported for + * KVM_GET_ONE_REG/KVM_SET_ONE_REG calls + */ +struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct kvm_reg_list reg_list_n = { .n = 0 }, *reg_list; + int ret; + + ret = _vcpu_ioctl(vm, vcpuid, KVM_GET_REG_LIST, ®_list_n); + TEST_ASSERT(ret == -1 && errno == E2BIG, "KVM_GET_REG_LIST n=0"); + reg_list = calloc(1, sizeof(*reg_list) + reg_list_n.n * sizeof(__u64)); + reg_list->n = reg_list_n.n; + vcpu_ioctl(vm, vcpuid, KVM_GET_REG_LIST, reg_list); + return reg_list; +} + +/* + * VM VCPU Regs Get + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * + * Output Args: + * regs - current state of VCPU regs + * + * Return: None + * + * Obtains the current register state for the VCPU specified by vcpuid + * and stores it at the location given by regs. + */ +void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int ret; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + ret = ioctl(vcpu->fd, KVM_GET_REGS, regs); + TEST_ASSERT(ret == 0, "KVM_GET_REGS failed, rc: %i errno: %i", + ret, errno); +} + +/* + * VM VCPU Regs Set + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * regs - Values to set VCPU regs to + * + * Output Args: None + * + * Return: None + * + * Sets the regs of the VCPU specified by vcpuid to the values + * given by regs. + */ +void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int ret; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + ret = ioctl(vcpu->fd, KVM_SET_REGS, regs); + TEST_ASSERT(ret == 0, "KVM_SET_REGS failed, rc: %i errno: %i", + ret, errno); +} + +#ifdef __KVM_HAVE_VCPU_EVENTS +void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_vcpu_events *events) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int ret; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + ret = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, events); + TEST_ASSERT(ret == 0, "KVM_GET_VCPU_EVENTS, failed, rc: %i errno: %i", + ret, errno); +} + +void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_vcpu_events *events) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int ret; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + ret = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, events); + TEST_ASSERT(ret == 0, "KVM_SET_VCPU_EVENTS, failed, rc: %i errno: %i", + ret, errno); +} +#endif + +#ifdef __x86_64__ +void vcpu_nested_state_get(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_nested_state *state) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int ret; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + ret = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, state); + TEST_ASSERT(ret == 0, + "KVM_SET_NESTED_STATE failed, ret: %i errno: %i", + ret, errno); +} + +int vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_nested_state *state, bool ignore_error) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int ret; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + ret = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, state); + if (!ignore_error) { + TEST_ASSERT(ret == 0, + "KVM_SET_NESTED_STATE failed, ret: %i errno: %i", + ret, errno); + } + + return ret; +} +#endif + +/* + * VM VCPU System Regs Get + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * + * Output Args: + * sregs - current state of VCPU system regs + * + * Return: None + * + * Obtains the current system register state for the VCPU specified by + * vcpuid and stores it at the location given by sregs. + */ +void vcpu_sregs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int ret; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + ret = ioctl(vcpu->fd, KVM_GET_SREGS, sregs); + TEST_ASSERT(ret == 0, "KVM_GET_SREGS failed, rc: %i errno: %i", + ret, errno); +} + +/* + * VM VCPU System Regs Set + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * sregs - Values to set VCPU system regs to + * + * Output Args: None + * + * Return: None + * + * Sets the system regs of the VCPU specified by vcpuid to the values + * given by sregs. + */ +void vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs) +{ + int ret = _vcpu_sregs_set(vm, vcpuid, sregs); + TEST_ASSERT(ret == 0, "KVM_RUN IOCTL failed, " + "rc: %i errno: %i", ret, errno); +} + +int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + return ioctl(vcpu->fd, KVM_SET_SREGS, sregs); +} + +void vcpu_fpu_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_fpu *fpu) +{ + int ret; + + ret = _vcpu_ioctl(vm, vcpuid, KVM_GET_FPU, fpu); + TEST_ASSERT(ret == 0, "KVM_GET_FPU failed, rc: %i errno: %i (%s)", + ret, errno, strerror(errno)); +} + +void vcpu_fpu_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_fpu *fpu) +{ + int ret; + + ret = _vcpu_ioctl(vm, vcpuid, KVM_SET_FPU, fpu); + TEST_ASSERT(ret == 0, "KVM_SET_FPU failed, rc: %i errno: %i (%s)", + ret, errno, strerror(errno)); +} + +void vcpu_get_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg) +{ + int ret; + + ret = _vcpu_ioctl(vm, vcpuid, KVM_GET_ONE_REG, reg); + TEST_ASSERT(ret == 0, "KVM_GET_ONE_REG failed, rc: %i errno: %i (%s)", + ret, errno, strerror(errno)); +} + +void vcpu_set_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg) +{ + int ret; + + ret = _vcpu_ioctl(vm, vcpuid, KVM_SET_ONE_REG, reg); + TEST_ASSERT(ret == 0, "KVM_SET_ONE_REG failed, rc: %i errno: %i (%s)", + ret, errno, strerror(errno)); +} + +/* + * VCPU Ioctl + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * cmd - Ioctl number + * arg - Argument to pass to the ioctl + * + * Return: None + * + * Issues an arbitrary ioctl on a VCPU fd. + */ +void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, + unsigned long cmd, void *arg) +{ + int ret; + + ret = _vcpu_ioctl(vm, vcpuid, cmd, arg); + TEST_ASSERT(ret == 0, "vcpu ioctl %lu failed, rc: %i errno: %i (%s)", + cmd, ret, errno, strerror(errno)); +} + +int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, + unsigned long cmd, void *arg) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int ret; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + ret = ioctl(vcpu->fd, cmd, arg); + + return ret; +} + +/* + * VM Ioctl + * + * Input Args: + * vm - Virtual Machine + * cmd - Ioctl number + * arg - Argument to pass to the ioctl + * + * Return: None + * + * Issues an arbitrary ioctl on a VM fd. + */ +void vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) +{ + int ret; + + ret = ioctl(vm->fd, cmd, arg); + TEST_ASSERT(ret == 0, "vm ioctl %lu failed, rc: %i errno: %i (%s)", + cmd, ret, errno, strerror(errno)); +} + +/* + * VM Dump + * + * Input Args: + * vm - Virtual Machine + * indent - Left margin indent amount + * + * Output Args: + * stream - Output FILE stream + * + * Return: None + * + * Dumps the current state of the VM given by vm, to the FILE stream + * given by stream. + */ +void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +{ + struct userspace_mem_region *region; + struct vcpu *vcpu; + + fprintf(stream, "%*smode: 0x%x\n", indent, "", vm->mode); + fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd); + fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size); + fprintf(stream, "%*sMem Regions:\n", indent, ""); + list_for_each_entry(region, &vm->userspace_mem_regions, list) { + fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx " + "host_virt: %p\n", indent + 2, "", + (uint64_t) region->region.guest_phys_addr, + (uint64_t) region->region.memory_size, + region->host_mem); + fprintf(stream, "%*sunused_phy_pages: ", indent + 2, ""); + sparsebit_dump(stream, region->unused_phy_pages, 0); + } + fprintf(stream, "%*sMapped Virtual Pages:\n", indent, ""); + sparsebit_dump(stream, vm->vpages_mapped, indent + 2); + fprintf(stream, "%*spgd_created: %u\n", indent, "", + vm->pgd_created); + if (vm->pgd_created) { + fprintf(stream, "%*sVirtual Translation Tables:\n", + indent + 2, ""); + virt_dump(stream, vm, indent + 4); + } + fprintf(stream, "%*sVCPUs:\n", indent, ""); + list_for_each_entry(vcpu, &vm->vcpus, list) + vcpu_dump(stream, vm, vcpu->id, indent + 2); +} + +/* Known KVM exit reasons */ +static struct exit_reason { + unsigned int reason; + const char *name; +} exit_reasons_known[] = { + {KVM_EXIT_UNKNOWN, "UNKNOWN"}, + {KVM_EXIT_EXCEPTION, "EXCEPTION"}, + {KVM_EXIT_IO, "IO"}, + {KVM_EXIT_HYPERCALL, "HYPERCALL"}, + {KVM_EXIT_DEBUG, "DEBUG"}, + {KVM_EXIT_HLT, "HLT"}, + {KVM_EXIT_MMIO, "MMIO"}, + {KVM_EXIT_IRQ_WINDOW_OPEN, "IRQ_WINDOW_OPEN"}, + {KVM_EXIT_SHUTDOWN, "SHUTDOWN"}, + {KVM_EXIT_FAIL_ENTRY, "FAIL_ENTRY"}, + {KVM_EXIT_INTR, "INTR"}, + {KVM_EXIT_SET_TPR, "SET_TPR"}, + {KVM_EXIT_TPR_ACCESS, "TPR_ACCESS"}, + {KVM_EXIT_S390_SIEIC, "S390_SIEIC"}, + {KVM_EXIT_S390_RESET, "S390_RESET"}, + {KVM_EXIT_DCR, "DCR"}, + {KVM_EXIT_NMI, "NMI"}, + {KVM_EXIT_INTERNAL_ERROR, "INTERNAL_ERROR"}, + {KVM_EXIT_OSI, "OSI"}, + {KVM_EXIT_PAPR_HCALL, "PAPR_HCALL"}, +#ifdef KVM_EXIT_MEMORY_NOT_PRESENT + {KVM_EXIT_MEMORY_NOT_PRESENT, "MEMORY_NOT_PRESENT"}, +#endif +}; + +/* + * Exit Reason String + * + * Input Args: + * exit_reason - Exit reason + * + * Output Args: None + * + * Return: + * Constant string pointer describing the exit reason. + * + * Locates and returns a constant string that describes the KVM exit + * reason given by exit_reason. If no such string is found, a constant + * string of "Unknown" is returned. + */ +const char *exit_reason_str(unsigned int exit_reason) +{ + unsigned int n1; + + for (n1 = 0; n1 < ARRAY_SIZE(exit_reasons_known); n1++) { + if (exit_reason == exit_reasons_known[n1].reason) + return exit_reasons_known[n1].name; + } + + return "Unknown"; +} + +/* + * Physical Contiguous Page Allocator + * + * Input Args: + * vm - Virtual Machine + * num - number of pages + * paddr_min - Physical address minimum + * memslot - Memory region to allocate page from + * + * Output Args: None + * + * Return: + * Starting physical address + * + * Within the VM specified by vm, locates a range of available physical + * pages at or above paddr_min. If found, the pages are marked as in use + * and their base address is returned. A TEST_ASSERT failure occurs if + * not enough pages are available at or above paddr_min. + */ +vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, + vm_paddr_t paddr_min, uint32_t memslot) +{ + struct userspace_mem_region *region; + sparsebit_idx_t pg, base; + + TEST_ASSERT(num > 0, "Must allocate at least one page"); + + TEST_ASSERT((paddr_min % vm->page_size) == 0, "Min physical address " + "not divisible by page size.\n" + " paddr_min: 0x%lx page_size: 0x%x", + paddr_min, vm->page_size); + + region = memslot2region(vm, memslot); + base = pg = paddr_min >> vm->page_shift; + + do { + for (; pg < base + num; ++pg) { + if (!sparsebit_is_set(region->unused_phy_pages, pg)) { + base = pg = sparsebit_next_set(region->unused_phy_pages, pg); + break; + } + } + } while (pg && pg != base + num); + + if (pg == 0) { + fprintf(stderr, "No guest physical page available, " + "paddr_min: 0x%lx page_size: 0x%x memslot: %u\n", + paddr_min, vm->page_size, memslot); + fputs("---- vm dump ----\n", stderr); + vm_dump(stderr, vm, 2); + abort(); + } + + for (pg = base; pg < base + num; ++pg) + sparsebit_clear(region->unused_phy_pages, pg); + + return base * vm->page_size; +} + +vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min, + uint32_t memslot) +{ + return vm_phy_pages_alloc(vm, 1, paddr_min, memslot); +} + +/* + * Address Guest Virtual to Host Virtual + * + * Input Args: + * vm - Virtual Machine + * gva - VM virtual address + * + * Output Args: None + * + * Return: + * Equivalent host virtual address + */ +void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva) +{ + return addr_gpa2hva(vm, addr_gva2gpa(vm, gva)); +} + +/* + * Is Unrestricted Guest + * + * Input Args: + * vm - Virtual Machine + * + * Output Args: None + * + * Return: True if the unrestricted guest is set to 'Y', otherwise return false. + * + * Check if the unrestricted guest flag is enabled. + */ +bool vm_is_unrestricted_guest(struct kvm_vm *vm) +{ + char val = 'N'; + size_t count; + FILE *f; + + if (vm == NULL) { + /* Ensure that the KVM vendor-specific module is loaded. */ + f = fopen(KVM_DEV_PATH, "r"); + TEST_ASSERT(f != NULL, "Error in opening KVM dev file: %d", + errno); + fclose(f); + } + + f = fopen("/sys/module/kvm_intel/parameters/unrestricted_guest", "r"); + if (f) { + count = fread(&val, sizeof(char), 1, f); + TEST_ASSERT(count == 1, "Unable to read from param file."); + fclose(f); + } + + return val == 'Y'; +} + +unsigned int vm_get_page_size(struct kvm_vm *vm) +{ + return vm->page_size; +} + +unsigned int vm_get_page_shift(struct kvm_vm *vm) +{ + return vm->page_shift; +} + +unsigned int vm_get_max_gfn(struct kvm_vm *vm) +{ + return vm->max_gfn; +} + +int vm_get_fd(struct kvm_vm *vm) +{ + return vm->fd; +} + +static unsigned int vm_calc_num_pages(unsigned int num_pages, + unsigned int page_shift, + unsigned int new_page_shift, + bool ceil) +{ + unsigned int n = 1 << (new_page_shift - page_shift); + + if (page_shift >= new_page_shift) + return num_pages * (1 << (page_shift - new_page_shift)); + + return num_pages / n + !!(ceil && num_pages % n); +} + +static inline int getpageshift(void) +{ + return __builtin_ffs(getpagesize()) - 1; +} + +unsigned int +vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages) +{ + return vm_calc_num_pages(num_guest_pages, + vm_guest_mode_params[mode].page_shift, + getpageshift(), true); +} + +unsigned int +vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages) +{ + return vm_calc_num_pages(num_host_pages, getpageshift(), + vm_guest_mode_params[mode].page_shift, false); +} + +unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size) +{ + unsigned int n; + n = DIV_ROUND_UP(size, vm_guest_mode_params[mode].page_size); + return vm_adjust_num_guest_pages(mode, n); +} diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h new file mode 100644 index 000000000..f07d383d0 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * tools/testing/selftests/kvm/lib/kvm_util_internal.h + * + * Copyright (C) 2018, Google LLC. + */ + +#ifndef SELFTEST_KVM_UTIL_INTERNAL_H +#define SELFTEST_KVM_UTIL_INTERNAL_H + +#include "sparsebit.h" + +#define KVM_DEV_PATH "/dev/kvm" + +struct userspace_mem_region { + struct kvm_userspace_memory_region region; + struct sparsebit *unused_phy_pages; + int fd; + off_t offset; + void *host_mem; + void *mmap_start; + size_t mmap_size; + struct list_head list; +}; + +struct vcpu { + struct list_head list; + uint32_t id; + int fd; + struct kvm_run *state; +}; + +struct kvm_vm { + int mode; + unsigned long type; + int kvm_fd; + int fd; + unsigned int pgtable_levels; + unsigned int page_size; + unsigned int page_shift; + unsigned int pa_bits; + unsigned int va_bits; + uint64_t max_gfn; + struct list_head vcpus; + struct list_head userspace_mem_regions; + struct sparsebit *vpages_valid; + struct sparsebit *vpages_mapped; + bool has_irqchip; + bool pgd_created; + vm_paddr_t pgd; + vm_vaddr_t gdt; + vm_vaddr_t tss; + vm_vaddr_t idt; + vm_vaddr_t handlers; +}; + +struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid); + +/* + * Virtual Translation Tables Dump + * + * Input Args: + * stream - Output FILE stream + * vm - Virtual Machine + * indent - Left margin indent amount + * + * Output Args: None + * + * Return: None + * + * Dumps to the FILE stream given by @stream, the contents of all the + * virtual translation tables for the VM given by @vm. + */ +void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); + +/* + * Register Dump + * + * Input Args: + * stream - Output FILE stream + * regs - Registers + * indent - Left margin indent amount + * + * Output Args: None + * + * Return: None + * + * Dumps the state of the registers given by @regs, to the FILE stream + * given by @stream. + */ +void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent); + +/* + * System Register Dump + * + * Input Args: + * stream - Output FILE stream + * sregs - System registers + * indent - Left margin indent amount + * + * Output Args: None + * + * Return: None + * + * Dumps the state of the system registers given by @sregs, to the FILE stream + * given by @stream. + */ +void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent); + +struct userspace_mem_region * +memslot2region(struct kvm_vm *vm, uint32_t memslot); + +#endif /* SELFTEST_KVM_UTIL_INTERNAL_H */ diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c new file mode 100644 index 000000000..7349bb2e1 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/s390x/processor.c @@ -0,0 +1,247 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM selftest s390x library code - CPU-related functions (page tables...) + * + * Copyright (C) 2019, Red Hat, Inc. + */ + +#define _GNU_SOURCE /* for program_invocation_name */ + +#include "processor.h" +#include "kvm_util.h" +#include "../kvm_util_internal.h" + +#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 + +#define PAGES_PER_REGION 4 + +void virt_pgd_alloc(struct kvm_vm *vm, uint32_t memslot) +{ + vm_paddr_t paddr; + + TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x", + vm->page_size); + + if (vm->pgd_created) + return; + + paddr = vm_phy_pages_alloc(vm, PAGES_PER_REGION, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, memslot); + memset(addr_gpa2hva(vm, paddr), 0xff, PAGES_PER_REGION * vm->page_size); + + vm->pgd = paddr; + vm->pgd_created = true; +} + +/* + * Allocate 4 pages for a region/segment table (ri < 4), or one page for + * a page table (ri == 4). Returns a suitable region/segment table entry + * which points to the freshly allocated pages. + */ +static uint64_t virt_alloc_region(struct kvm_vm *vm, int ri, uint32_t memslot) +{ + uint64_t taddr; + + taddr = vm_phy_pages_alloc(vm, ri < 4 ? PAGES_PER_REGION : 1, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, memslot); + memset(addr_gpa2hva(vm, taddr), 0xff, PAGES_PER_REGION * vm->page_size); + + return (taddr & REGION_ENTRY_ORIGIN) + | (((4 - ri) << 2) & REGION_ENTRY_TYPE) + | ((ri < 4 ? (PAGES_PER_REGION - 1) : 0) & REGION_ENTRY_LENGTH); +} + +void virt_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa, + uint32_t memslot) +{ + int ri, idx; + uint64_t *entry; + + TEST_ASSERT((gva % vm->page_size) == 0, + "Virtual address not on page boundary,\n" + " vaddr: 0x%lx vm->page_size: 0x%x", + gva, vm->page_size); + TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, + (gva >> vm->page_shift)), + "Invalid virtual address, vaddr: 0x%lx", + gva); + TEST_ASSERT((gpa % vm->page_size) == 0, + "Physical address not on page boundary,\n" + " paddr: 0x%lx vm->page_size: 0x%x", + gva, vm->page_size); + TEST_ASSERT((gpa >> vm->page_shift) <= vm->max_gfn, + "Physical address beyond beyond maximum supported,\n" + " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", + gva, vm->max_gfn, vm->page_size); + + /* Walk through region and segment tables */ + entry = addr_gpa2hva(vm, vm->pgd); + for (ri = 1; ri <= 4; ri++) { + idx = (gva >> (64 - 11 * ri)) & 0x7ffu; + if (entry[idx] & REGION_ENTRY_INVALID) + entry[idx] = virt_alloc_region(vm, ri, memslot); + entry = addr_gpa2hva(vm, entry[idx] & REGION_ENTRY_ORIGIN); + } + + /* Fill in page table entry */ + idx = (gva >> 12) & 0x0ffu; /* page index */ + if (!(entry[idx] & PAGE_INVALID)) + fprintf(stderr, + "WARNING: PTE for gpa=0x%"PRIx64" already set!\n", gpa); + entry[idx] = gpa; +} + +vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +{ + int ri, idx; + uint64_t *entry; + + TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x", + vm->page_size); + + entry = addr_gpa2hva(vm, vm->pgd); + for (ri = 1; ri <= 4; ri++) { + idx = (gva >> (64 - 11 * ri)) & 0x7ffu; + TEST_ASSERT(!(entry[idx] & REGION_ENTRY_INVALID), + "No region mapping for vm virtual address 0x%lx", + gva); + entry = addr_gpa2hva(vm, entry[idx] & REGION_ENTRY_ORIGIN); + } + + idx = (gva >> 12) & 0x0ffu; /* page index */ + + TEST_ASSERT(!(entry[idx] & PAGE_INVALID), + "No page mapping for vm virtual address 0x%lx", gva); + + return (entry[idx] & ~0xffful) + (gva & 0xffful); +} + +static void virt_dump_ptes(FILE *stream, struct kvm_vm *vm, uint8_t indent, + uint64_t ptea_start) +{ + uint64_t *pte, ptea; + + for (ptea = ptea_start; ptea < ptea_start + 0x100 * 8; ptea += 8) { + pte = addr_gpa2hva(vm, ptea); + if (*pte & PAGE_INVALID) + continue; + fprintf(stream, "%*spte @ 0x%lx: 0x%016lx\n", + indent, "", ptea, *pte); + } +} + +static void virt_dump_region(FILE *stream, struct kvm_vm *vm, uint8_t indent, + uint64_t reg_tab_addr) +{ + uint64_t addr, *entry; + + for (addr = reg_tab_addr; addr < reg_tab_addr + 0x400 * 8; addr += 8) { + entry = addr_gpa2hva(vm, addr); + if (*entry & REGION_ENTRY_INVALID) + continue; + fprintf(stream, "%*srt%lde @ 0x%lx: 0x%016lx\n", + indent, "", 4 - ((*entry & REGION_ENTRY_TYPE) >> 2), + addr, *entry); + if (*entry & REGION_ENTRY_TYPE) { + virt_dump_region(stream, vm, indent + 2, + *entry & REGION_ENTRY_ORIGIN); + } else { + virt_dump_ptes(stream, vm, indent + 2, + *entry & REGION_ENTRY_ORIGIN); + } + } +} + +void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +{ + if (!vm->pgd_created) + return; + + virt_dump_region(stream, vm, indent, vm->pgd); +} + +struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, + void *guest_code) +{ + /* + * The additional amount of pages required for the page tables is: + * 1 * n / 256 + 4 * (n / 256) / 2048 + 4 * (n / 256) / 2048^2 + ... + * which is definitely smaller than (n / 256) * 2. + */ + uint64_t extra_pg_pages = extra_mem_pages / 256 * 2; + struct kvm_vm *vm; + + vm = vm_create(VM_MODE_DEFAULT, + DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); + + kvm_vm_elf_load(vm, program_invocation_name, 0, 0); + vm_vcpu_add_default(vm, vcpuid, guest_code); + + return vm; +} + +void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) +{ + size_t stack_size = DEFAULT_STACK_PGS * getpagesize(); + uint64_t stack_vaddr; + struct kvm_regs regs; + struct kvm_sregs sregs; + struct kvm_run *run; + + TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x", + vm->page_size); + + stack_vaddr = vm_vaddr_alloc(vm, stack_size, + DEFAULT_GUEST_STACK_VADDR_MIN, 0, 0); + + vm_vcpu_add(vm, vcpuid); + + /* Setup guest registers */ + vcpu_regs_get(vm, vcpuid, ®s); + regs.gprs[15] = stack_vaddr + (DEFAULT_STACK_PGS * getpagesize()) - 160; + vcpu_regs_set(vm, vcpuid, ®s); + + vcpu_sregs_get(vm, vcpuid, &sregs); + sregs.crs[0] |= 0x00040000; /* Enable floating point regs */ + sregs.crs[1] = vm->pgd | 0xf; /* Primary region table */ + vcpu_sregs_set(vm, vcpuid, &sregs); + + run = vcpu_state(vm, vcpuid); + run->psw_mask = 0x0400000180000000ULL; /* DAT enabled + 64 bit mode */ + run->psw_addr = (uintptr_t)guest_code; +} + +void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) +{ + va_list ap; + struct kvm_regs regs; + int i; + + TEST_ASSERT(num >= 1 && num <= 5, "Unsupported number of args,\n" + " num: %u\n", + num); + + va_start(ap, num); + vcpu_regs_get(vm, vcpuid, ®s); + + for (i = 0; i < num; i++) + regs.gprs[i + 2] = va_arg(ap, uint64_t); + + vcpu_regs_set(vm, vcpuid, ®s); + va_end(ap); +} + +void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + + if (!vcpu) + return; + + fprintf(stream, "%*spstate: psw: 0x%.16llx:0x%.16llx\n", + indent, "", vcpu->state->psw_mask, vcpu->state->psw_addr); +} + +void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid) +{ +} diff --git a/tools/testing/selftests/kvm/lib/s390x/ucall.c b/tools/testing/selftests/kvm/lib/s390x/ucall.c new file mode 100644 index 000000000..9d3b0f152 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/s390x/ucall.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ucall support. A ucall is a "hypercall to userspace". + * + * Copyright (C) 2019 Red Hat, Inc. + */ +#include "kvm_util.h" + +void ucall_init(struct kvm_vm *vm, void *arg) +{ +} + +void ucall_uninit(struct kvm_vm *vm) +{ +} + +void ucall(uint64_t cmd, int nargs, ...) +{ + struct ucall uc = { + .cmd = cmd, + }; + va_list va; + int i; + + nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS; + + va_start(va, nargs); + for (i = 0; i < nargs; ++i) + uc.args[i] = va_arg(va, uint64_t); + va_end(va); + + /* Exit via DIAGNOSE 0x501 (normally used for breakpoints) */ + asm volatile ("diag 0,%0,0x501" : : "a"(&uc) : "memory"); +} + +uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) +{ + struct kvm_run *run = vcpu_state(vm, vcpu_id); + struct ucall ucall = {}; + + if (uc) + memset(uc, 0, sizeof(*uc)); + + if (run->exit_reason == KVM_EXIT_S390_SIEIC && + run->s390_sieic.icptcode == 4 && + (run->s390_sieic.ipa >> 8) == 0x83 && /* 0x83 means DIAGNOSE */ + (run->s390_sieic.ipb >> 16) == 0x501) { + int reg = run->s390_sieic.ipa & 0xf; + + memcpy(&ucall, addr_gva2hva(vm, run->s.regs.gprs[reg]), + sizeof(ucall)); + + vcpu_run_complete_io(vm, vcpu_id); + if (uc) + memcpy(uc, &ucall, sizeof(ucall)); + } + + return ucall.cmd; +} diff --git a/tools/testing/selftests/kvm/lib/sparsebit.c b/tools/testing/selftests/kvm/lib/sparsebit.c new file mode 100644 index 000000000..031ba3c93 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/sparsebit.c @@ -0,0 +1,2086 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Sparse bit array + * + * Copyright (C) 2018, Google LLC. + * Copyright (C) 2018, Red Hat, Inc. (code style cleanup and fuzzing driver) + * + * This library provides functions to support a memory efficient bit array, + * with an index size of 2^64. A sparsebit array is allocated through + * the use sparsebit_alloc() and free'd via sparsebit_free(), + * such as in the following: + * + * struct sparsebit *s; + * s = sparsebit_alloc(); + * sparsebit_free(&s); + * + * The struct sparsebit type resolves down to a struct sparsebit. + * Note that, sparsebit_free() takes a pointer to the sparsebit + * structure. This is so that sparsebit_free() is able to poison + * the pointer (e.g. set it to NULL) to the struct sparsebit before + * returning to the caller. + * + * Between the return of sparsebit_alloc() and the call of + * sparsebit_free(), there are multiple query and modifying operations + * that can be performed on the allocated sparsebit array. All of + * these operations take as a parameter the value returned from + * sparsebit_alloc() and most also take a bit index. Frequently + * used routines include: + * + * ---- Query Operations + * sparsebit_is_set(s, idx) + * sparsebit_is_clear(s, idx) + * sparsebit_any_set(s) + * sparsebit_first_set(s) + * sparsebit_next_set(s, prev_idx) + * + * ---- Modifying Operations + * sparsebit_set(s, idx) + * sparsebit_clear(s, idx) + * sparsebit_set_num(s, idx, num); + * sparsebit_clear_num(s, idx, num); + * + * A common operation, is to itterate over all the bits set in a test + * sparsebit array. This can be done via code with the following structure: + * + * sparsebit_idx_t idx; + * if (sparsebit_any_set(s)) { + * idx = sparsebit_first_set(s); + * do { + * ... + * idx = sparsebit_next_set(s, idx); + * } while (idx != 0); + * } + * + * The index of the first bit set needs to be obtained via + * sparsebit_first_set(), because sparsebit_next_set(), needs + * the index of the previously set. The sparsebit_idx_t type is + * unsigned, so there is no previous index before 0 that is available. + * Also, the call to sparsebit_first_set() is not made unless there + * is at least 1 bit in the array set. This is because sparsebit_first_set() + * aborts if sparsebit_first_set() is called with no bits set. + * It is the callers responsibility to assure that the + * sparsebit array has at least a single bit set before calling + * sparsebit_first_set(). + * + * ==== Implementation Overview ==== + * For the most part the internal implementation of sparsebit is + * opaque to the caller. One important implementation detail that the + * caller may need to be aware of is the spatial complexity of the + * implementation. This implementation of a sparsebit array is not + * only sparse, in that it uses memory proportional to the number of bits + * set. It is also efficient in memory usage when most of the bits are + * set. + * + * At a high-level the state of the bit settings are maintained through + * the use of a binary-search tree, where each node contains at least + * the following members: + * + * typedef uint64_t sparsebit_idx_t; + * typedef uint64_t sparsebit_num_t; + * + * sparsebit_idx_t idx; + * uint32_t mask; + * sparsebit_num_t num_after; + * + * The idx member contains the bit index of the first bit described by this + * node, while the mask member stores the setting of the first 32-bits. + * The setting of the bit at idx + n, where 0 <= n < 32, is located in the + * mask member at 1 << n. + * + * Nodes are sorted by idx and the bits described by two nodes will never + * overlap. The idx member is always aligned to the mask size, i.e. a + * multiple of 32. + * + * Beyond a typical implementation, the nodes in this implementation also + * contains a member named num_after. The num_after member holds the + * number of bits immediately after the mask bits that are contiguously set. + * The use of the num_after member allows this implementation to efficiently + * represent cases where most bits are set. For example, the case of all + * but the last two bits set, is represented by the following two nodes: + * + * node 0 - idx: 0x0 mask: 0xffffffff num_after: 0xffffffffffffffc0 + * node 1 - idx: 0xffffffffffffffe0 mask: 0x3fffffff num_after: 0 + * + * ==== Invariants ==== + * This implementation usses the following invariants: + * + * + Node are only used to represent bits that are set. + * Nodes with a mask of 0 and num_after of 0 are not allowed. + * + * + Sum of bits set in all the nodes is equal to the value of + * the struct sparsebit_pvt num_set member. + * + * + The setting of at least one bit is always described in a nodes + * mask (mask >= 1). + * + * + A node with all mask bits set only occurs when the last bit + * described by the previous node is not equal to this nodes + * starting index - 1. All such occurences of this condition are + * avoided by moving the setting of the nodes mask bits into + * the previous nodes num_after setting. + * + * + Node starting index is evenly divisible by the number of bits + * within a nodes mask member. + * + * + Nodes never represent a range of bits that wrap around the + * highest supported index. + * + * (idx + MASK_BITS + num_after - 1) <= ((sparsebit_idx_t) 0) - 1) + * + * As a consequence of the above, the num_after member of a node + * will always be <=: + * + * maximum_index - nodes_starting_index - number_of_mask_bits + * + * + Nodes within the binary search tree are sorted based on each + * nodes starting index. + * + * + The range of bits described by any two nodes do not overlap. The + * range of bits described by a single node is: + * + * start: node->idx + * end (inclusive): node->idx + MASK_BITS + node->num_after - 1; + * + * Note, at times these invariants are temporarily violated for a + * specific portion of the code. For example, when setting a mask + * bit, there is a small delay between when the mask bit is set and the + * value in the struct sparsebit_pvt num_set member is updated. Other + * temporary violations occur when node_split() is called with a specified + * index and assures that a node where its mask represents the bit + * at the specified index exists. At times to do this node_split() + * must split an existing node into two nodes or create a node that + * has no bits set. Such temporary violations must be corrected before + * returning to the caller. These corrections are typically performed + * by the local function node_reduce(). + */ + +#include "test_util.h" +#include "sparsebit.h" +#include +#include + +#define DUMP_LINE_MAX 100 /* Does not include indent amount */ + +typedef uint32_t mask_t; +#define MASK_BITS (sizeof(mask_t) * CHAR_BIT) + +struct node { + struct node *parent; + struct node *left; + struct node *right; + sparsebit_idx_t idx; /* index of least-significant bit in mask */ + sparsebit_num_t num_after; /* num contiguously set after mask */ + mask_t mask; +}; + +struct sparsebit { + /* + * Points to root node of the binary search + * tree. Equal to NULL when no bits are set in + * the entire sparsebit array. + */ + struct node *root; + + /* + * A redundant count of the total number of bits set. Used for + * diagnostic purposes and to change the time complexity of + * sparsebit_num_set() from O(n) to O(1). + * Note: Due to overflow, a value of 0 means none or all set. + */ + sparsebit_num_t num_set; +}; + +/* Returns the number of set bits described by the settings + * of the node pointed to by nodep. + */ +static sparsebit_num_t node_num_set(struct node *nodep) +{ + return nodep->num_after + __builtin_popcount(nodep->mask); +} + +/* Returns a pointer to the node that describes the + * lowest bit index. + */ +static struct node *node_first(struct sparsebit *s) +{ + struct node *nodep; + + for (nodep = s->root; nodep && nodep->left; nodep = nodep->left) + ; + + return nodep; +} + +/* Returns a pointer to the node that describes the + * lowest bit index > the index of the node pointed to by np. + * Returns NULL if no node with a higher index exists. + */ +static struct node *node_next(struct sparsebit *s, struct node *np) +{ + struct node *nodep = np; + + /* + * If current node has a right child, next node is the left-most + * of the right child. + */ + if (nodep->right) { + for (nodep = nodep->right; nodep->left; nodep = nodep->left) + ; + return nodep; + } + + /* + * No right child. Go up until node is left child of a parent. + * That parent is then the next node. + */ + while (nodep->parent && nodep == nodep->parent->right) + nodep = nodep->parent; + + return nodep->parent; +} + +/* Searches for and returns a pointer to the node that describes the + * highest index < the index of the node pointed to by np. + * Returns NULL if no node with a lower index exists. + */ +static struct node *node_prev(struct sparsebit *s, struct node *np) +{ + struct node *nodep = np; + + /* + * If current node has a left child, next node is the right-most + * of the left child. + */ + if (nodep->left) { + for (nodep = nodep->left; nodep->right; nodep = nodep->right) + ; + return (struct node *) nodep; + } + + /* + * No left child. Go up until node is right child of a parent. + * That parent is then the next node. + */ + while (nodep->parent && nodep == nodep->parent->left) + nodep = nodep->parent; + + return (struct node *) nodep->parent; +} + + +/* Allocates space to hold a copy of the node sub-tree pointed to by + * subtree and duplicates the bit settings to the newly allocated nodes. + * Returns the newly allocated copy of subtree. + */ +static struct node *node_copy_subtree(struct node *subtree) +{ + struct node *root; + + /* Duplicate the node at the root of the subtree */ + root = calloc(1, sizeof(*root)); + if (!root) { + perror("calloc"); + abort(); + } + + root->idx = subtree->idx; + root->mask = subtree->mask; + root->num_after = subtree->num_after; + + /* As needed, recursively duplicate the left and right subtrees */ + if (subtree->left) { + root->left = node_copy_subtree(subtree->left); + root->left->parent = root; + } + + if (subtree->right) { + root->right = node_copy_subtree(subtree->right); + root->right->parent = root; + } + + return root; +} + +/* Searches for and returns a pointer to the node that describes the setting + * of the bit given by idx. A node describes the setting of a bit if its + * index is within the bits described by the mask bits or the number of + * contiguous bits set after the mask. Returns NULL if there is no such node. + */ +static struct node *node_find(struct sparsebit *s, sparsebit_idx_t idx) +{ + struct node *nodep; + + /* Find the node that describes the setting of the bit at idx */ + for (nodep = s->root; nodep; + nodep = nodep->idx > idx ? nodep->left : nodep->right) { + if (idx >= nodep->idx && + idx <= nodep->idx + MASK_BITS + nodep->num_after - 1) + break; + } + + return nodep; +} + +/* Entry Requirements: + * + A node that describes the setting of idx is not already present. + * + * Adds a new node to describe the setting of the bit at the index given + * by idx. Returns a pointer to the newly added node. + * + * TODO(lhuemill): Degenerate cases causes the tree to get unbalanced. + */ +static struct node *node_add(struct sparsebit *s, sparsebit_idx_t idx) +{ + struct node *nodep, *parentp, *prev; + + /* Allocate and initialize the new node. */ + nodep = calloc(1, sizeof(*nodep)); + if (!nodep) { + perror("calloc"); + abort(); + } + + nodep->idx = idx & -MASK_BITS; + + /* If no nodes, set it up as the root node. */ + if (!s->root) { + s->root = nodep; + return nodep; + } + + /* + * Find the parent where the new node should be attached + * and add the node there. + */ + parentp = s->root; + while (true) { + if (idx < parentp->idx) { + if (!parentp->left) { + parentp->left = nodep; + nodep->parent = parentp; + break; + } + parentp = parentp->left; + } else { + assert(idx > parentp->idx + MASK_BITS + parentp->num_after - 1); + if (!parentp->right) { + parentp->right = nodep; + nodep->parent = parentp; + break; + } + parentp = parentp->right; + } + } + + /* + * Does num_after bits of previous node overlap with the mask + * of the new node? If so set the bits in the new nodes mask + * and reduce the previous nodes num_after. + */ + prev = node_prev(s, nodep); + while (prev && prev->idx + MASK_BITS + prev->num_after - 1 >= nodep->idx) { + unsigned int n1 = (prev->idx + MASK_BITS + prev->num_after - 1) + - nodep->idx; + assert(prev->num_after > 0); + assert(n1 < MASK_BITS); + assert(!(nodep->mask & (1 << n1))); + nodep->mask |= (1 << n1); + prev->num_after--; + } + + return nodep; +} + +/* Returns whether all the bits in the sparsebit array are set. */ +bool sparsebit_all_set(struct sparsebit *s) +{ + /* + * If any nodes there must be at least one bit set. Only case + * where a bit is set and total num set is 0, is when all bits + * are set. + */ + return s->root && s->num_set == 0; +} + +/* Clears all bits described by the node pointed to by nodep, then + * removes the node. + */ +static void node_rm(struct sparsebit *s, struct node *nodep) +{ + struct node *tmp; + sparsebit_num_t num_set; + + num_set = node_num_set(nodep); + assert(s->num_set >= num_set || sparsebit_all_set(s)); + s->num_set -= node_num_set(nodep); + + /* Have both left and right child */ + if (nodep->left && nodep->right) { + /* + * Move left children to the leftmost leaf node + * of the right child. + */ + for (tmp = nodep->right; tmp->left; tmp = tmp->left) + ; + tmp->left = nodep->left; + nodep->left = NULL; + tmp->left->parent = tmp; + } + + /* Left only child */ + if (nodep->left) { + if (!nodep->parent) { + s->root = nodep->left; + nodep->left->parent = NULL; + } else { + nodep->left->parent = nodep->parent; + if (nodep == nodep->parent->left) + nodep->parent->left = nodep->left; + else { + assert(nodep == nodep->parent->right); + nodep->parent->right = nodep->left; + } + } + + nodep->parent = nodep->left = nodep->right = NULL; + free(nodep); + + return; + } + + + /* Right only child */ + if (nodep->right) { + if (!nodep->parent) { + s->root = nodep->right; + nodep->right->parent = NULL; + } else { + nodep->right->parent = nodep->parent; + if (nodep == nodep->parent->left) + nodep->parent->left = nodep->right; + else { + assert(nodep == nodep->parent->right); + nodep->parent->right = nodep->right; + } + } + + nodep->parent = nodep->left = nodep->right = NULL; + free(nodep); + + return; + } + + /* Leaf Node */ + if (!nodep->parent) { + s->root = NULL; + } else { + if (nodep->parent->left == nodep) + nodep->parent->left = NULL; + else { + assert(nodep == nodep->parent->right); + nodep->parent->right = NULL; + } + } + + nodep->parent = nodep->left = nodep->right = NULL; + free(nodep); + + return; +} + +/* Splits the node containing the bit at idx so that there is a node + * that starts at the specified index. If no such node exists, a new + * node at the specified index is created. Returns the new node. + * + * idx must start of a mask boundary. + */ +static struct node *node_split(struct sparsebit *s, sparsebit_idx_t idx) +{ + struct node *nodep1, *nodep2; + sparsebit_idx_t offset; + sparsebit_num_t orig_num_after; + + assert(!(idx % MASK_BITS)); + + /* + * Is there a node that describes the setting of idx? + * If not, add it. + */ + nodep1 = node_find(s, idx); + if (!nodep1) + return node_add(s, idx); + + /* + * All done if the starting index of the node is where the + * split should occur. + */ + if (nodep1->idx == idx) + return nodep1; + + /* + * Split point not at start of mask, so it must be part of + * bits described by num_after. + */ + + /* + * Calculate offset within num_after for where the split is + * to occur. + */ + offset = idx - (nodep1->idx + MASK_BITS); + orig_num_after = nodep1->num_after; + + /* + * Add a new node to describe the bits starting at + * the split point. + */ + nodep1->num_after = offset; + nodep2 = node_add(s, idx); + + /* Move bits after the split point into the new node */ + nodep2->num_after = orig_num_after - offset; + if (nodep2->num_after >= MASK_BITS) { + nodep2->mask = ~(mask_t) 0; + nodep2->num_after -= MASK_BITS; + } else { + nodep2->mask = (1 << nodep2->num_after) - 1; + nodep2->num_after = 0; + } + + return nodep2; +} + +/* Iteratively reduces the node pointed to by nodep and its adjacent + * nodes into a more compact form. For example, a node with a mask with + * all bits set adjacent to a previous node, will get combined into a + * single node with an increased num_after setting. + * + * After each reduction, a further check is made to see if additional + * reductions are possible with the new previous and next nodes. Note, + * a search for a reduction is only done across the nodes nearest nodep + * and those that became part of a reduction. Reductions beyond nodep + * and the adjacent nodes that are reduced are not discovered. It is the + * responsibility of the caller to pass a nodep that is within one node + * of each possible reduction. + * + * This function does not fix the temporary violation of all invariants. + * For example it does not fix the case where the bit settings described + * by two or more nodes overlap. Such a violation introduces the potential + * complication of a bit setting for a specific index having different settings + * in different nodes. This would then introduce the further complication + * of which node has the correct setting of the bit and thus such conditions + * are not allowed. + * + * This function is designed to fix invariant violations that are introduced + * by node_split() and by changes to the nodes mask or num_after members. + * For example, when setting a bit within a nodes mask, the function that + * sets the bit doesn't have to worry about whether the setting of that + * bit caused the mask to have leading only or trailing only bits set. + * Instead, the function can call node_reduce(), with nodep equal to the + * node address that it set a mask bit in, and node_reduce() will notice + * the cases of leading or trailing only bits and that there is an + * adjacent node that the bit settings could be merged into. + * + * This implementation specifically detects and corrects violation of the + * following invariants: + * + * + Node are only used to represent bits that are set. + * Nodes with a mask of 0 and num_after of 0 are not allowed. + * + * + The setting of at least one bit is always described in a nodes + * mask (mask >= 1). + * + * + A node with all mask bits set only occurs when the last bit + * described by the previous node is not equal to this nodes + * starting index - 1. All such occurences of this condition are + * avoided by moving the setting of the nodes mask bits into + * the previous nodes num_after setting. + */ +static void node_reduce(struct sparsebit *s, struct node *nodep) +{ + bool reduction_performed; + + do { + reduction_performed = false; + struct node *prev, *next, *tmp; + + /* 1) Potential reductions within the current node. */ + + /* Nodes with all bits cleared may be removed. */ + if (nodep->mask == 0 && nodep->num_after == 0) { + /* + * About to remove the node pointed to by + * nodep, which normally would cause a problem + * for the next pass through the reduction loop, + * because the node at the starting point no longer + * exists. This potential problem is handled + * by first remembering the location of the next + * or previous nodes. Doesn't matter which, because + * once the node at nodep is removed, there will be + * no other nodes between prev and next. + * + * Note, the checks performed on nodep against both + * both prev and next both check for an adjacent + * node that can be reduced into a single node. As + * such, after removing the node at nodep, doesn't + * matter whether the nodep for the next pass + * through the loop is equal to the previous pass + * prev or next node. Either way, on the next pass + * the one not selected will become either the + * prev or next node. + */ + tmp = node_next(s, nodep); + if (!tmp) + tmp = node_prev(s, nodep); + + node_rm(s, nodep); + nodep = NULL; + + nodep = tmp; + reduction_performed = true; + continue; + } + + /* + * When the mask is 0, can reduce the amount of num_after + * bits by moving the initial num_after bits into the mask. + */ + if (nodep->mask == 0) { + assert(nodep->num_after != 0); + assert(nodep->idx + MASK_BITS > nodep->idx); + + nodep->idx += MASK_BITS; + + if (nodep->num_after >= MASK_BITS) { + nodep->mask = ~0; + nodep->num_after -= MASK_BITS; + } else { + nodep->mask = (1u << nodep->num_after) - 1; + nodep->num_after = 0; + } + + reduction_performed = true; + continue; + } + + /* + * 2) Potential reductions between the current and + * previous nodes. + */ + prev = node_prev(s, nodep); + if (prev) { + sparsebit_idx_t prev_highest_bit; + + /* Nodes with no bits set can be removed. */ + if (prev->mask == 0 && prev->num_after == 0) { + node_rm(s, prev); + + reduction_performed = true; + continue; + } + + /* + * All mask bits set and previous node has + * adjacent index. + */ + if (nodep->mask + 1 == 0 && + prev->idx + MASK_BITS == nodep->idx) { + prev->num_after += MASK_BITS + nodep->num_after; + nodep->mask = 0; + nodep->num_after = 0; + + reduction_performed = true; + continue; + } + + /* + * Is node adjacent to previous node and the node + * contains a single contiguous range of bits + * starting from the beginning of the mask? + */ + prev_highest_bit = prev->idx + MASK_BITS - 1 + prev->num_after; + if (prev_highest_bit + 1 == nodep->idx && + (nodep->mask | (nodep->mask >> 1)) == nodep->mask) { + /* + * How many contiguous bits are there? + * Is equal to the total number of set + * bits, due to an earlier check that + * there is a single contiguous range of + * set bits. + */ + unsigned int num_contiguous + = __builtin_popcount(nodep->mask); + assert((num_contiguous > 0) && + ((1ULL << num_contiguous) - 1) == nodep->mask); + + prev->num_after += num_contiguous; + nodep->mask = 0; + + /* + * For predictable performance, handle special + * case where all mask bits are set and there + * is a non-zero num_after setting. This code + * is functionally correct without the following + * conditionalized statements, but without them + * the value of num_after is only reduced by + * the number of mask bits per pass. There are + * cases where num_after can be close to 2^64. + * Without this code it could take nearly + * (2^64) / 32 passes to perform the full + * reduction. + */ + if (num_contiguous == MASK_BITS) { + prev->num_after += nodep->num_after; + nodep->num_after = 0; + } + + reduction_performed = true; + continue; + } + } + + /* + * 3) Potential reductions between the current and + * next nodes. + */ + next = node_next(s, nodep); + if (next) { + /* Nodes with no bits set can be removed. */ + if (next->mask == 0 && next->num_after == 0) { + node_rm(s, next); + reduction_performed = true; + continue; + } + + /* + * Is next node index adjacent to current node + * and has a mask with all bits set? + */ + if (next->idx == nodep->idx + MASK_BITS + nodep->num_after && + next->mask == ~(mask_t) 0) { + nodep->num_after += MASK_BITS; + next->mask = 0; + nodep->num_after += next->num_after; + next->num_after = 0; + + node_rm(s, next); + next = NULL; + + reduction_performed = true; + continue; + } + } + } while (nodep && reduction_performed); +} + +/* Returns whether the bit at the index given by idx, within the + * sparsebit array is set or not. + */ +bool sparsebit_is_set(struct sparsebit *s, sparsebit_idx_t idx) +{ + struct node *nodep; + + /* Find the node that describes the setting of the bit at idx */ + for (nodep = s->root; nodep; + nodep = nodep->idx > idx ? nodep->left : nodep->right) + if (idx >= nodep->idx && + idx <= nodep->idx + MASK_BITS + nodep->num_after - 1) + goto have_node; + + return false; + +have_node: + /* Bit is set if it is any of the bits described by num_after */ + if (nodep->num_after && idx >= nodep->idx + MASK_BITS) + return true; + + /* Is the corresponding mask bit set */ + assert(idx >= nodep->idx && idx - nodep->idx < MASK_BITS); + return !!(nodep->mask & (1 << (idx - nodep->idx))); +} + +/* Within the sparsebit array pointed to by s, sets the bit + * at the index given by idx. + */ +static void bit_set(struct sparsebit *s, sparsebit_idx_t idx) +{ + struct node *nodep; + + /* Skip bits that are already set */ + if (sparsebit_is_set(s, idx)) + return; + + /* + * Get a node where the bit at idx is described by the mask. + * The node_split will also create a node, if there isn't + * already a node that describes the setting of bit. + */ + nodep = node_split(s, idx & -MASK_BITS); + + /* Set the bit within the nodes mask */ + assert(idx >= nodep->idx && idx <= nodep->idx + MASK_BITS - 1); + assert(!(nodep->mask & (1 << (idx - nodep->idx)))); + nodep->mask |= 1 << (idx - nodep->idx); + s->num_set++; + + node_reduce(s, nodep); +} + +/* Within the sparsebit array pointed to by s, clears the bit + * at the index given by idx. + */ +static void bit_clear(struct sparsebit *s, sparsebit_idx_t idx) +{ + struct node *nodep; + + /* Skip bits that are already cleared */ + if (!sparsebit_is_set(s, idx)) + return; + + /* Is there a node that describes the setting of this bit? */ + nodep = node_find(s, idx); + if (!nodep) + return; + + /* + * If a num_after bit, split the node, so that the bit is + * part of a node mask. + */ + if (idx >= nodep->idx + MASK_BITS) + nodep = node_split(s, idx & -MASK_BITS); + + /* + * After node_split above, bit at idx should be within the mask. + * Clear that bit. + */ + assert(idx >= nodep->idx && idx <= nodep->idx + MASK_BITS - 1); + assert(nodep->mask & (1 << (idx - nodep->idx))); + nodep->mask &= ~(1 << (idx - nodep->idx)); + assert(s->num_set > 0 || sparsebit_all_set(s)); + s->num_set--; + + node_reduce(s, nodep); +} + +/* Recursively dumps to the FILE stream given by stream the contents + * of the sub-tree of nodes pointed to by nodep. Each line of output + * is prefixed by the number of spaces given by indent. On each + * recursion, the indent amount is increased by 2. This causes nodes + * at each level deeper into the binary search tree to be displayed + * with a greater indent. + */ +static void dump_nodes(FILE *stream, struct node *nodep, + unsigned int indent) +{ + char *node_type; + + /* Dump contents of node */ + if (!nodep->parent) + node_type = "root"; + else if (nodep == nodep->parent->left) + node_type = "left"; + else { + assert(nodep == nodep->parent->right); + node_type = "right"; + } + fprintf(stream, "%*s---- %s nodep: %p\n", indent, "", node_type, nodep); + fprintf(stream, "%*s parent: %p left: %p right: %p\n", indent, "", + nodep->parent, nodep->left, nodep->right); + fprintf(stream, "%*s idx: 0x%lx mask: 0x%x num_after: 0x%lx\n", + indent, "", nodep->idx, nodep->mask, nodep->num_after); + + /* If present, dump contents of left child nodes */ + if (nodep->left) + dump_nodes(stream, nodep->left, indent + 2); + + /* If present, dump contents of right child nodes */ + if (nodep->right) + dump_nodes(stream, nodep->right, indent + 2); +} + +static inline sparsebit_idx_t node_first_set(struct node *nodep, int start) +{ + mask_t leading = (mask_t)1 << start; + int n1 = __builtin_ctz(nodep->mask & -leading); + + return nodep->idx + n1; +} + +static inline sparsebit_idx_t node_first_clear(struct node *nodep, int start) +{ + mask_t leading = (mask_t)1 << start; + int n1 = __builtin_ctz(~nodep->mask & -leading); + + return nodep->idx + n1; +} + +/* Dumps to the FILE stream specified by stream, the implementation dependent + * internal state of s. Each line of output is prefixed with the number + * of spaces given by indent. The output is completely implementation + * dependent and subject to change. Output from this function should only + * be used for diagnostic purposes. For example, this function can be + * used by test cases after they detect an unexpected condition, as a means + * to capture diagnostic information. + */ +static void sparsebit_dump_internal(FILE *stream, struct sparsebit *s, + unsigned int indent) +{ + /* Dump the contents of s */ + fprintf(stream, "%*sroot: %p\n", indent, "", s->root); + fprintf(stream, "%*snum_set: 0x%lx\n", indent, "", s->num_set); + + if (s->root) + dump_nodes(stream, s->root, indent); +} + +/* Allocates and returns a new sparsebit array. The initial state + * of the newly allocated sparsebit array has all bits cleared. + */ +struct sparsebit *sparsebit_alloc(void) +{ + struct sparsebit *s; + + /* Allocate top level structure. */ + s = calloc(1, sizeof(*s)); + if (!s) { + perror("calloc"); + abort(); + } + + return s; +} + +/* Frees the implementation dependent data for the sparsebit array + * pointed to by s and poisons the pointer to that data. + */ +void sparsebit_free(struct sparsebit **sbitp) +{ + struct sparsebit *s = *sbitp; + + if (!s) + return; + + sparsebit_clear_all(s); + free(s); + *sbitp = NULL; +} + +/* Makes a copy of the sparsebit array given by s, to the sparsebit + * array given by d. Note, d must have already been allocated via + * sparsebit_alloc(). It can though already have bits set, which + * if different from src will be cleared. + */ +void sparsebit_copy(struct sparsebit *d, struct sparsebit *s) +{ + /* First clear any bits already set in the destination */ + sparsebit_clear_all(d); + + if (s->root) { + d->root = node_copy_subtree(s->root); + d->num_set = s->num_set; + } +} + +/* Returns whether num consecutive bits starting at idx are all set. */ +bool sparsebit_is_set_num(struct sparsebit *s, + sparsebit_idx_t idx, sparsebit_num_t num) +{ + sparsebit_idx_t next_cleared; + + assert(num > 0); + assert(idx + num - 1 >= idx); + + /* With num > 0, the first bit must be set. */ + if (!sparsebit_is_set(s, idx)) + return false; + + /* Find the next cleared bit */ + next_cleared = sparsebit_next_clear(s, idx); + + /* + * If no cleared bits beyond idx, then there are at least num + * set bits. idx + num doesn't wrap. Otherwise check if + * there are enough set bits between idx and the next cleared bit. + */ + return next_cleared == 0 || next_cleared - idx >= num; +} + +/* Returns whether the bit at the index given by idx. */ +bool sparsebit_is_clear(struct sparsebit *s, + sparsebit_idx_t idx) +{ + return !sparsebit_is_set(s, idx); +} + +/* Returns whether num consecutive bits starting at idx are all cleared. */ +bool sparsebit_is_clear_num(struct sparsebit *s, + sparsebit_idx_t idx, sparsebit_num_t num) +{ + sparsebit_idx_t next_set; + + assert(num > 0); + assert(idx + num - 1 >= idx); + + /* With num > 0, the first bit must be cleared. */ + if (!sparsebit_is_clear(s, idx)) + return false; + + /* Find the next set bit */ + next_set = sparsebit_next_set(s, idx); + + /* + * If no set bits beyond idx, then there are at least num + * cleared bits. idx + num doesn't wrap. Otherwise check if + * there are enough cleared bits between idx and the next set bit. + */ + return next_set == 0 || next_set - idx >= num; +} + +/* Returns the total number of bits set. Note: 0 is also returned for + * the case of all bits set. This is because with all bits set, there + * is 1 additional bit set beyond what can be represented in the return + * value. Use sparsebit_any_set(), instead of sparsebit_num_set() > 0, + * to determine if the sparsebit array has any bits set. + */ +sparsebit_num_t sparsebit_num_set(struct sparsebit *s) +{ + return s->num_set; +} + +/* Returns whether any bit is set in the sparsebit array. */ +bool sparsebit_any_set(struct sparsebit *s) +{ + /* + * Nodes only describe set bits. If any nodes then there + * is at least 1 bit set. + */ + if (!s->root) + return false; + + /* + * Every node should have a non-zero mask. For now will + * just assure that the root node has a non-zero mask, + * which is a quick check that at least 1 bit is set. + */ + assert(s->root->mask != 0); + assert(s->num_set > 0 || + (s->root->num_after == ((sparsebit_num_t) 0) - MASK_BITS && + s->root->mask == ~(mask_t) 0)); + + return true; +} + +/* Returns whether all the bits in the sparsebit array are cleared. */ +bool sparsebit_all_clear(struct sparsebit *s) +{ + return !sparsebit_any_set(s); +} + +/* Returns whether all the bits in the sparsebit array are set. */ +bool sparsebit_any_clear(struct sparsebit *s) +{ + return !sparsebit_all_set(s); +} + +/* Returns the index of the first set bit. Abort if no bits are set. + */ +sparsebit_idx_t sparsebit_first_set(struct sparsebit *s) +{ + struct node *nodep; + + /* Validate at least 1 bit is set */ + assert(sparsebit_any_set(s)); + + nodep = node_first(s); + return node_first_set(nodep, 0); +} + +/* Returns the index of the first cleared bit. Abort if + * no bits are cleared. + */ +sparsebit_idx_t sparsebit_first_clear(struct sparsebit *s) +{ + struct node *nodep1, *nodep2; + + /* Validate at least 1 bit is cleared. */ + assert(sparsebit_any_clear(s)); + + /* If no nodes or first node index > 0 then lowest cleared is 0 */ + nodep1 = node_first(s); + if (!nodep1 || nodep1->idx > 0) + return 0; + + /* Does the mask in the first node contain any cleared bits. */ + if (nodep1->mask != ~(mask_t) 0) + return node_first_clear(nodep1, 0); + + /* + * All mask bits set in first node. If there isn't a second node + * then the first cleared bit is the first bit after the bits + * described by the first node. + */ + nodep2 = node_next(s, nodep1); + if (!nodep2) { + /* + * No second node. First cleared bit is first bit beyond + * bits described by first node. + */ + assert(nodep1->mask == ~(mask_t) 0); + assert(nodep1->idx + MASK_BITS + nodep1->num_after != (sparsebit_idx_t) 0); + return nodep1->idx + MASK_BITS + nodep1->num_after; + } + + /* + * There is a second node. + * If it is not adjacent to the first node, then there is a gap + * of cleared bits between the nodes, and the first cleared bit + * is the first bit within the gap. + */ + if (nodep1->idx + MASK_BITS + nodep1->num_after != nodep2->idx) + return nodep1->idx + MASK_BITS + nodep1->num_after; + + /* + * Second node is adjacent to the first node. + * Because it is adjacent, its mask should be non-zero. If all + * its mask bits are set, then with it being adjacent, it should + * have had the mask bits moved into the num_after setting of the + * previous node. + */ + return node_first_clear(nodep2, 0); +} + +/* Returns index of next bit set within s after the index given by prev. + * Returns 0 if there are no bits after prev that are set. + */ +sparsebit_idx_t sparsebit_next_set(struct sparsebit *s, + sparsebit_idx_t prev) +{ + sparsebit_idx_t lowest_possible = prev + 1; + sparsebit_idx_t start; + struct node *nodep; + + /* A bit after the highest index can't be set. */ + if (lowest_possible == 0) + return 0; + + /* + * Find the leftmost 'candidate' overlapping or to the right + * of lowest_possible. + */ + struct node *candidate = NULL; + + /* True iff lowest_possible is within candidate */ + bool contains = false; + + /* + * Find node that describes setting of bit at lowest_possible. + * If such a node doesn't exist, find the node with the lowest + * starting index that is > lowest_possible. + */ + for (nodep = s->root; nodep;) { + if ((nodep->idx + MASK_BITS + nodep->num_after - 1) + >= lowest_possible) { + candidate = nodep; + if (candidate->idx <= lowest_possible) { + contains = true; + break; + } + nodep = nodep->left; + } else { + nodep = nodep->right; + } + } + if (!candidate) + return 0; + + assert(candidate->mask != 0); + + /* Does the candidate node describe the setting of lowest_possible? */ + if (!contains) { + /* + * Candidate doesn't describe setting of bit at lowest_possible. + * Candidate points to the first node with a starting index + * > lowest_possible. + */ + assert(candidate->idx > lowest_possible); + + return node_first_set(candidate, 0); + } + + /* + * Candidate describes setting of bit at lowest_possible. + * Note: although the node describes the setting of the bit + * at lowest_possible, its possible that its setting and the + * setting of all latter bits described by this node are 0. + * For now, just handle the cases where this node describes + * a bit at or after an index of lowest_possible that is set. + */ + start = lowest_possible - candidate->idx; + + if (start < MASK_BITS && candidate->mask >= (1 << start)) + return node_first_set(candidate, start); + + if (candidate->num_after) { + sparsebit_idx_t first_num_after_idx = candidate->idx + MASK_BITS; + + return lowest_possible < first_num_after_idx + ? first_num_after_idx : lowest_possible; + } + + /* + * Although candidate node describes setting of bit at + * the index of lowest_possible, all bits at that index and + * latter that are described by candidate are cleared. With + * this, the next bit is the first bit in the next node, if + * such a node exists. If a next node doesn't exist, then + * there is no next set bit. + */ + candidate = node_next(s, candidate); + if (!candidate) + return 0; + + return node_first_set(candidate, 0); +} + +/* Returns index of next bit cleared within s after the index given by prev. + * Returns 0 if there are no bits after prev that are cleared. + */ +sparsebit_idx_t sparsebit_next_clear(struct sparsebit *s, + sparsebit_idx_t prev) +{ + sparsebit_idx_t lowest_possible = prev + 1; + sparsebit_idx_t idx; + struct node *nodep1, *nodep2; + + /* A bit after the highest index can't be set. */ + if (lowest_possible == 0) + return 0; + + /* + * Does a node describing the setting of lowest_possible exist? + * If not, the bit at lowest_possible is cleared. + */ + nodep1 = node_find(s, lowest_possible); + if (!nodep1) + return lowest_possible; + + /* Does a mask bit in node 1 describe the next cleared bit. */ + for (idx = lowest_possible - nodep1->idx; idx < MASK_BITS; idx++) + if (!(nodep1->mask & (1 << idx))) + return nodep1->idx + idx; + + /* + * Next cleared bit is not described by node 1. If there + * isn't a next node, then next cleared bit is described + * by bit after the bits described by the first node. + */ + nodep2 = node_next(s, nodep1); + if (!nodep2) + return nodep1->idx + MASK_BITS + nodep1->num_after; + + /* + * There is a second node. + * If it is not adjacent to the first node, then there is a gap + * of cleared bits between the nodes, and the next cleared bit + * is the first bit within the gap. + */ + if (nodep1->idx + MASK_BITS + nodep1->num_after != nodep2->idx) + return nodep1->idx + MASK_BITS + nodep1->num_after; + + /* + * Second node is adjacent to the first node. + * Because it is adjacent, its mask should be non-zero. If all + * its mask bits are set, then with it being adjacent, it should + * have had the mask bits moved into the num_after setting of the + * previous node. + */ + return node_first_clear(nodep2, 0); +} + +/* Starting with the index 1 greater than the index given by start, finds + * and returns the index of the first sequence of num consecutively set + * bits. Returns a value of 0 of no such sequence exists. + */ +sparsebit_idx_t sparsebit_next_set_num(struct sparsebit *s, + sparsebit_idx_t start, sparsebit_num_t num) +{ + sparsebit_idx_t idx; + + assert(num >= 1); + + for (idx = sparsebit_next_set(s, start); + idx != 0 && idx + num - 1 >= idx; + idx = sparsebit_next_set(s, idx)) { + assert(sparsebit_is_set(s, idx)); + + /* + * Does the sequence of bits starting at idx consist of + * num set bits? + */ + if (sparsebit_is_set_num(s, idx, num)) + return idx; + + /* + * Sequence of set bits at idx isn't large enough. + * Skip this entire sequence of set bits. + */ + idx = sparsebit_next_clear(s, idx); + if (idx == 0) + return 0; + } + + return 0; +} + +/* Starting with the index 1 greater than the index given by start, finds + * and returns the index of the first sequence of num consecutively cleared + * bits. Returns a value of 0 of no such sequence exists. + */ +sparsebit_idx_t sparsebit_next_clear_num(struct sparsebit *s, + sparsebit_idx_t start, sparsebit_num_t num) +{ + sparsebit_idx_t idx; + + assert(num >= 1); + + for (idx = sparsebit_next_clear(s, start); + idx != 0 && idx + num - 1 >= idx; + idx = sparsebit_next_clear(s, idx)) { + assert(sparsebit_is_clear(s, idx)); + + /* + * Does the sequence of bits starting at idx consist of + * num cleared bits? + */ + if (sparsebit_is_clear_num(s, idx, num)) + return idx; + + /* + * Sequence of cleared bits at idx isn't large enough. + * Skip this entire sequence of cleared bits. + */ + idx = sparsebit_next_set(s, idx); + if (idx == 0) + return 0; + } + + return 0; +} + +/* Sets the bits * in the inclusive range idx through idx + num - 1. */ +void sparsebit_set_num(struct sparsebit *s, + sparsebit_idx_t start, sparsebit_num_t num) +{ + struct node *nodep, *next; + unsigned int n1; + sparsebit_idx_t idx; + sparsebit_num_t n; + sparsebit_idx_t middle_start, middle_end; + + assert(num > 0); + assert(start + num - 1 >= start); + + /* + * Leading - bits before first mask boundary. + * + * TODO(lhuemill): With some effort it may be possible to + * replace the following loop with a sequential sequence + * of statements. High level sequence would be: + * + * 1. Use node_split() to force node that describes setting + * of idx to be within the mask portion of a node. + * 2. Form mask of bits to be set. + * 3. Determine number of mask bits already set in the node + * and store in a local variable named num_already_set. + * 4. Set the appropriate mask bits within the node. + * 5. Increment struct sparsebit_pvt num_set member + * by the number of bits that were actually set. + * Exclude from the counts bits that were already set. + * 6. Before returning to the caller, use node_reduce() to + * handle the multiple corner cases that this method + * introduces. + */ + for (idx = start, n = num; n > 0 && idx % MASK_BITS != 0; idx++, n--) + bit_set(s, idx); + + /* Middle - bits spanning one or more entire mask */ + middle_start = idx; + middle_end = middle_start + (n & -MASK_BITS) - 1; + if (n >= MASK_BITS) { + nodep = node_split(s, middle_start); + + /* + * As needed, split just after end of middle bits. + * No split needed if end of middle bits is at highest + * supported bit index. + */ + if (middle_end + 1 > middle_end) + (void) node_split(s, middle_end + 1); + + /* Delete nodes that only describe bits within the middle. */ + for (next = node_next(s, nodep); + next && (next->idx < middle_end); + next = node_next(s, nodep)) { + assert(next->idx + MASK_BITS + next->num_after - 1 <= middle_end); + node_rm(s, next); + next = NULL; + } + + /* As needed set each of the mask bits */ + for (n1 = 0; n1 < MASK_BITS; n1++) { + if (!(nodep->mask & (1 << n1))) { + nodep->mask |= 1 << n1; + s->num_set++; + } + } + + s->num_set -= nodep->num_after; + nodep->num_after = middle_end - middle_start + 1 - MASK_BITS; + s->num_set += nodep->num_after; + + node_reduce(s, nodep); + } + idx = middle_end + 1; + n -= middle_end - middle_start + 1; + + /* Trailing - bits at and beyond last mask boundary */ + assert(n < MASK_BITS); + for (; n > 0; idx++, n--) + bit_set(s, idx); +} + +/* Clears the bits * in the inclusive range idx through idx + num - 1. */ +void sparsebit_clear_num(struct sparsebit *s, + sparsebit_idx_t start, sparsebit_num_t num) +{ + struct node *nodep, *next; + unsigned int n1; + sparsebit_idx_t idx; + sparsebit_num_t n; + sparsebit_idx_t middle_start, middle_end; + + assert(num > 0); + assert(start + num - 1 >= start); + + /* Leading - bits before first mask boundary */ + for (idx = start, n = num; n > 0 && idx % MASK_BITS != 0; idx++, n--) + bit_clear(s, idx); + + /* Middle - bits spanning one or more entire mask */ + middle_start = idx; + middle_end = middle_start + (n & -MASK_BITS) - 1; + if (n >= MASK_BITS) { + nodep = node_split(s, middle_start); + + /* + * As needed, split just after end of middle bits. + * No split needed if end of middle bits is at highest + * supported bit index. + */ + if (middle_end + 1 > middle_end) + (void) node_split(s, middle_end + 1); + + /* Delete nodes that only describe bits within the middle. */ + for (next = node_next(s, nodep); + next && (next->idx < middle_end); + next = node_next(s, nodep)) { + assert(next->idx + MASK_BITS + next->num_after - 1 <= middle_end); + node_rm(s, next); + next = NULL; + } + + /* As needed clear each of the mask bits */ + for (n1 = 0; n1 < MASK_BITS; n1++) { + if (nodep->mask & (1 << n1)) { + nodep->mask &= ~(1 << n1); + s->num_set--; + } + } + + /* Clear any bits described by num_after */ + s->num_set -= nodep->num_after; + nodep->num_after = 0; + + /* + * Delete the node that describes the beginning of + * the middle bits and perform any allowed reductions + * with the nodes prev or next of nodep. + */ + node_reduce(s, nodep); + nodep = NULL; + } + idx = middle_end + 1; + n -= middle_end - middle_start + 1; + + /* Trailing - bits at and beyond last mask boundary */ + assert(n < MASK_BITS); + for (; n > 0; idx++, n--) + bit_clear(s, idx); +} + +/* Sets the bit at the index given by idx. */ +void sparsebit_set(struct sparsebit *s, sparsebit_idx_t idx) +{ + sparsebit_set_num(s, idx, 1); +} + +/* Clears the bit at the index given by idx. */ +void sparsebit_clear(struct sparsebit *s, sparsebit_idx_t idx) +{ + sparsebit_clear_num(s, idx, 1); +} + +/* Sets the bits in the entire addressable range of the sparsebit array. */ +void sparsebit_set_all(struct sparsebit *s) +{ + sparsebit_set(s, 0); + sparsebit_set_num(s, 1, ~(sparsebit_idx_t) 0); + assert(sparsebit_all_set(s)); +} + +/* Clears the bits in the entire addressable range of the sparsebit array. */ +void sparsebit_clear_all(struct sparsebit *s) +{ + sparsebit_clear(s, 0); + sparsebit_clear_num(s, 1, ~(sparsebit_idx_t) 0); + assert(!sparsebit_any_set(s)); +} + +static size_t display_range(FILE *stream, sparsebit_idx_t low, + sparsebit_idx_t high, bool prepend_comma_space) +{ + char *fmt_str; + size_t sz; + + /* Determine the printf format string */ + if (low == high) + fmt_str = prepend_comma_space ? ", 0x%lx" : "0x%lx"; + else + fmt_str = prepend_comma_space ? ", 0x%lx:0x%lx" : "0x%lx:0x%lx"; + + /* + * When stream is NULL, just determine the size of what would + * have been printed, else print the range. + */ + if (!stream) + sz = snprintf(NULL, 0, fmt_str, low, high); + else + sz = fprintf(stream, fmt_str, low, high); + + return sz; +} + + +/* Dumps to the FILE stream given by stream, the bit settings + * of s. Each line of output is prefixed with the number of + * spaces given by indent. The length of each line is implementation + * dependent and does not depend on the indent amount. The following + * is an example output of a sparsebit array that has bits: + * + * 0x5, 0x8, 0xa:0xe, 0x12 + * + * This corresponds to a sparsebit whose bits 5, 8, 10, 11, 12, 13, 14, 18 + * are set. Note that a ':', instead of a '-' is used to specify a range of + * contiguous bits. This is done because '-' is used to specify command-line + * options, and sometimes ranges are specified as command-line arguments. + */ +void sparsebit_dump(FILE *stream, struct sparsebit *s, + unsigned int indent) +{ + size_t current_line_len = 0; + size_t sz; + struct node *nodep; + + if (!sparsebit_any_set(s)) + return; + + /* Display initial indent */ + fprintf(stream, "%*s", indent, ""); + + /* For each node */ + for (nodep = node_first(s); nodep; nodep = node_next(s, nodep)) { + unsigned int n1; + sparsebit_idx_t low, high; + + /* For each group of bits in the mask */ + for (n1 = 0; n1 < MASK_BITS; n1++) { + if (nodep->mask & (1 << n1)) { + low = high = nodep->idx + n1; + + for (; n1 < MASK_BITS; n1++) { + if (nodep->mask & (1 << n1)) + high = nodep->idx + n1; + else + break; + } + + if ((n1 == MASK_BITS) && nodep->num_after) + high += nodep->num_after; + + /* + * How much room will it take to display + * this range. + */ + sz = display_range(NULL, low, high, + current_line_len != 0); + + /* + * If there is not enough room, display + * a newline plus the indent of the next + * line. + */ + if (current_line_len + sz > DUMP_LINE_MAX) { + fputs("\n", stream); + fprintf(stream, "%*s", indent, ""); + current_line_len = 0; + } + + /* Display the range */ + sz = display_range(stream, low, high, + current_line_len != 0); + current_line_len += sz; + } + } + + /* + * If num_after and most significant-bit of mask is not + * set, then still need to display a range for the bits + * described by num_after. + */ + if (!(nodep->mask & (1 << (MASK_BITS - 1))) && nodep->num_after) { + low = nodep->idx + MASK_BITS; + high = nodep->idx + MASK_BITS + nodep->num_after - 1; + + /* + * How much room will it take to display + * this range. + */ + sz = display_range(NULL, low, high, + current_line_len != 0); + + /* + * If there is not enough room, display + * a newline plus the indent of the next + * line. + */ + if (current_line_len + sz > DUMP_LINE_MAX) { + fputs("\n", stream); + fprintf(stream, "%*s", indent, ""); + current_line_len = 0; + } + + /* Display the range */ + sz = display_range(stream, low, high, + current_line_len != 0); + current_line_len += sz; + } + } + fputs("\n", stream); +} + +/* Validates the internal state of the sparsebit array given by + * s. On error, diagnostic information is printed to stderr and + * abort is called. + */ +void sparsebit_validate_internal(struct sparsebit *s) +{ + bool error_detected = false; + struct node *nodep, *prev = NULL; + sparsebit_num_t total_bits_set = 0; + unsigned int n1; + + /* For each node */ + for (nodep = node_first(s); nodep; + prev = nodep, nodep = node_next(s, nodep)) { + + /* + * Increase total bits set by the number of bits set + * in this node. + */ + for (n1 = 0; n1 < MASK_BITS; n1++) + if (nodep->mask & (1 << n1)) + total_bits_set++; + + total_bits_set += nodep->num_after; + + /* + * Arbitrary choice as to whether a mask of 0 is allowed + * or not. For diagnostic purposes it is beneficial to + * have only one valid means to represent a set of bits. + * To support this an arbitrary choice has been made + * to not allow a mask of zero. + */ + if (nodep->mask == 0) { + fprintf(stderr, "Node mask of zero, " + "nodep: %p nodep->mask: 0x%x", + nodep, nodep->mask); + error_detected = true; + break; + } + + /* + * Validate num_after is not greater than the max index + * - the number of mask bits. The num_after member + * uses 0-based indexing and thus has no value that + * represents all bits set. This limitation is handled + * by requiring a non-zero mask. With a non-zero mask, + * MASK_BITS worth of bits are described by the mask, + * which makes the largest needed num_after equal to: + * + * (~(sparsebit_num_t) 0) - MASK_BITS + 1 + */ + if (nodep->num_after + > (~(sparsebit_num_t) 0) - MASK_BITS + 1) { + fprintf(stderr, "num_after too large, " + "nodep: %p nodep->num_after: 0x%lx", + nodep, nodep->num_after); + error_detected = true; + break; + } + + /* Validate node index is divisible by the mask size */ + if (nodep->idx % MASK_BITS) { + fprintf(stderr, "Node index not divisible by " + "mask size,\n" + " nodep: %p nodep->idx: 0x%lx " + "MASK_BITS: %lu\n", + nodep, nodep->idx, MASK_BITS); + error_detected = true; + break; + } + + /* + * Validate bits described by node don't wrap beyond the + * highest supported index. + */ + if ((nodep->idx + MASK_BITS + nodep->num_after - 1) < nodep->idx) { + fprintf(stderr, "Bits described by node wrap " + "beyond highest supported index,\n" + " nodep: %p nodep->idx: 0x%lx\n" + " MASK_BITS: %lu nodep->num_after: 0x%lx", + nodep, nodep->idx, MASK_BITS, nodep->num_after); + error_detected = true; + break; + } + + /* Check parent pointers. */ + if (nodep->left) { + if (nodep->left->parent != nodep) { + fprintf(stderr, "Left child parent pointer " + "doesn't point to this node,\n" + " nodep: %p nodep->left: %p " + "nodep->left->parent: %p", + nodep, nodep->left, + nodep->left->parent); + error_detected = true; + break; + } + } + + if (nodep->right) { + if (nodep->right->parent != nodep) { + fprintf(stderr, "Right child parent pointer " + "doesn't point to this node,\n" + " nodep: %p nodep->right: %p " + "nodep->right->parent: %p", + nodep, nodep->right, + nodep->right->parent); + error_detected = true; + break; + } + } + + if (!nodep->parent) { + if (s->root != nodep) { + fprintf(stderr, "Unexpected root node, " + "s->root: %p nodep: %p", + s->root, nodep); + error_detected = true; + break; + } + } + + if (prev) { + /* + * Is index of previous node before index of + * current node? + */ + if (prev->idx >= nodep->idx) { + fprintf(stderr, "Previous node index " + ">= current node index,\n" + " prev: %p prev->idx: 0x%lx\n" + " nodep: %p nodep->idx: 0x%lx", + prev, prev->idx, nodep, nodep->idx); + error_detected = true; + break; + } + + /* + * Nodes occur in asscending order, based on each + * nodes starting index. + */ + if ((prev->idx + MASK_BITS + prev->num_after - 1) + >= nodep->idx) { + fprintf(stderr, "Previous node bit range " + "overlap with current node bit range,\n" + " prev: %p prev->idx: 0x%lx " + "prev->num_after: 0x%lx\n" + " nodep: %p nodep->idx: 0x%lx " + "nodep->num_after: 0x%lx\n" + " MASK_BITS: %lu", + prev, prev->idx, prev->num_after, + nodep, nodep->idx, nodep->num_after, + MASK_BITS); + error_detected = true; + break; + } + + /* + * When the node has all mask bits set, it shouldn't + * be adjacent to the last bit described by the + * previous node. + */ + if (nodep->mask == ~(mask_t) 0 && + prev->idx + MASK_BITS + prev->num_after == nodep->idx) { + fprintf(stderr, "Current node has mask with " + "all bits set and is adjacent to the " + "previous node,\n" + " prev: %p prev->idx: 0x%lx " + "prev->num_after: 0x%lx\n" + " nodep: %p nodep->idx: 0x%lx " + "nodep->num_after: 0x%lx\n" + " MASK_BITS: %lu", + prev, prev->idx, prev->num_after, + nodep, nodep->idx, nodep->num_after, + MASK_BITS); + + error_detected = true; + break; + } + } + } + + if (!error_detected) { + /* + * Is sum of bits set in each node equal to the count + * of total bits set. + */ + if (s->num_set != total_bits_set) { + fprintf(stderr, "Number of bits set missmatch,\n" + " s->num_set: 0x%lx total_bits_set: 0x%lx", + s->num_set, total_bits_set); + + error_detected = true; + } + } + + if (error_detected) { + fputs(" dump_internal:\n", stderr); + sparsebit_dump_internal(stderr, s, 4); + abort(); + } +} + + +#ifdef FUZZ +/* A simple but effective fuzzing driver. Look for bugs with the help + * of some invariants and of a trivial representation of sparsebit. + * Just use 512 bytes of /dev/zero and /dev/urandom as inputs, and let + * afl-fuzz do the magic. :) + */ + +#include +#include + +struct range { + sparsebit_idx_t first, last; + bool set; +}; + +struct sparsebit *s; +struct range ranges[1000]; +int num_ranges; + +static bool get_value(sparsebit_idx_t idx) +{ + int i; + + for (i = num_ranges; --i >= 0; ) + if (ranges[i].first <= idx && idx <= ranges[i].last) + return ranges[i].set; + + return false; +} + +static void operate(int code, sparsebit_idx_t first, sparsebit_idx_t last) +{ + sparsebit_num_t num; + sparsebit_idx_t next; + + if (first < last) { + num = last - first + 1; + } else { + num = first - last + 1; + first = last; + last = first + num - 1; + } + + switch (code) { + case 0: + sparsebit_set(s, first); + assert(sparsebit_is_set(s, first)); + assert(!sparsebit_is_clear(s, first)); + assert(sparsebit_any_set(s)); + assert(!sparsebit_all_clear(s)); + if (get_value(first)) + return; + if (num_ranges == 1000) + exit(0); + ranges[num_ranges++] = (struct range) + { .first = first, .last = first, .set = true }; + break; + case 1: + sparsebit_clear(s, first); + assert(!sparsebit_is_set(s, first)); + assert(sparsebit_is_clear(s, first)); + assert(sparsebit_any_clear(s)); + assert(!sparsebit_all_set(s)); + if (!get_value(first)) + return; + if (num_ranges == 1000) + exit(0); + ranges[num_ranges++] = (struct range) + { .first = first, .last = first, .set = false }; + break; + case 2: + assert(sparsebit_is_set(s, first) == get_value(first)); + assert(sparsebit_is_clear(s, first) == !get_value(first)); + break; + case 3: + if (sparsebit_any_set(s)) + assert(get_value(sparsebit_first_set(s))); + if (sparsebit_any_clear(s)) + assert(!get_value(sparsebit_first_clear(s))); + sparsebit_set_all(s); + assert(!sparsebit_any_clear(s)); + assert(sparsebit_all_set(s)); + num_ranges = 0; + ranges[num_ranges++] = (struct range) + { .first = 0, .last = ~(sparsebit_idx_t)0, .set = true }; + break; + case 4: + if (sparsebit_any_set(s)) + assert(get_value(sparsebit_first_set(s))); + if (sparsebit_any_clear(s)) + assert(!get_value(sparsebit_first_clear(s))); + sparsebit_clear_all(s); + assert(!sparsebit_any_set(s)); + assert(sparsebit_all_clear(s)); + num_ranges = 0; + break; + case 5: + next = sparsebit_next_set(s, first); + assert(next == 0 || next > first); + assert(next == 0 || get_value(next)); + break; + case 6: + next = sparsebit_next_clear(s, first); + assert(next == 0 || next > first); + assert(next == 0 || !get_value(next)); + break; + case 7: + next = sparsebit_next_clear(s, first); + if (sparsebit_is_set_num(s, first, num)) { + assert(next == 0 || next > last); + if (first) + next = sparsebit_next_set(s, first - 1); + else if (sparsebit_any_set(s)) + next = sparsebit_first_set(s); + else + return; + assert(next == first); + } else { + assert(sparsebit_is_clear(s, first) || next <= last); + } + break; + case 8: + next = sparsebit_next_set(s, first); + if (sparsebit_is_clear_num(s, first, num)) { + assert(next == 0 || next > last); + if (first) + next = sparsebit_next_clear(s, first - 1); + else if (sparsebit_any_clear(s)) + next = sparsebit_first_clear(s); + else + return; + assert(next == first); + } else { + assert(sparsebit_is_set(s, first) || next <= last); + } + break; + case 9: + sparsebit_set_num(s, first, num); + assert(sparsebit_is_set_num(s, first, num)); + assert(!sparsebit_is_clear_num(s, first, num)); + assert(sparsebit_any_set(s)); + assert(!sparsebit_all_clear(s)); + if (num_ranges == 1000) + exit(0); + ranges[num_ranges++] = (struct range) + { .first = first, .last = last, .set = true }; + break; + case 10: + sparsebit_clear_num(s, first, num); + assert(!sparsebit_is_set_num(s, first, num)); + assert(sparsebit_is_clear_num(s, first, num)); + assert(sparsebit_any_clear(s)); + assert(!sparsebit_all_set(s)); + if (num_ranges == 1000) + exit(0); + ranges[num_ranges++] = (struct range) + { .first = first, .last = last, .set = false }; + break; + case 11: + sparsebit_validate_internal(s); + break; + default: + break; + } +} + +unsigned char get8(void) +{ + int ch; + + ch = getchar(); + if (ch == EOF) + exit(0); + return ch; +} + +uint64_t get64(void) +{ + uint64_t x; + + x = get8(); + x = (x << 8) | get8(); + x = (x << 8) | get8(); + x = (x << 8) | get8(); + x = (x << 8) | get8(); + x = (x << 8) | get8(); + x = (x << 8) | get8(); + return (x << 8) | get8(); +} + +int main(void) +{ + s = sparsebit_alloc(); + for (;;) { + uint8_t op = get8() & 0xf; + uint64_t first = get64(); + uint64_t last = get64(); + + operate(op, first, last); + } +} +#endif diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c new file mode 100644 index 000000000..8e04c0b16 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * tools/testing/selftests/kvm/lib/test_util.c + * + * Copyright (C) 2020, Google LLC. + */ + +#include +#include +#include +#include +#include + +#include "test_util.h" + +/* + * Parses "[0-9]+[kmgt]?". + */ +size_t parse_size(const char *size) +{ + size_t base; + char *scale; + int shift = 0; + + TEST_ASSERT(size && isdigit(size[0]), "Need at least one digit in '%s'", size); + + base = strtoull(size, &scale, 0); + + TEST_ASSERT(base != ULLONG_MAX, "Overflow parsing size!"); + + switch (tolower(*scale)) { + case 't': + shift = 40; + break; + case 'g': + shift = 30; + break; + case 'm': + shift = 20; + break; + case 'k': + shift = 10; + break; + case 'b': + case '\0': + shift = 0; + break; + default: + TEST_ASSERT(false, "Unknown size letter %c", *scale); + } + + TEST_ASSERT((base << shift) >> shift == base, "Overflow scaling size!"); + + return base << shift; +} + +int64_t timespec_to_ns(struct timespec ts) +{ + return (int64_t)ts.tv_nsec + 1000000000LL * (int64_t)ts.tv_sec; +} + +struct timespec timespec_add_ns(struct timespec ts, int64_t ns) +{ + struct timespec res; + + res.tv_nsec = ts.tv_nsec + ns; + res.tv_sec = ts.tv_sec + res.tv_nsec / 1000000000LL; + res.tv_nsec %= 1000000000LL; + + return res; +} + +struct timespec timespec_add(struct timespec ts1, struct timespec ts2) +{ + int64_t ns1 = timespec_to_ns(ts1); + int64_t ns2 = timespec_to_ns(ts2); + return timespec_add_ns((struct timespec){0}, ns1 + ns2); +} + +struct timespec timespec_sub(struct timespec ts1, struct timespec ts2) +{ + int64_t ns1 = timespec_to_ns(ts1); + int64_t ns2 = timespec_to_ns(ts2); + return timespec_add_ns((struct timespec){0}, ns1 - ns2); +} + +struct timespec timespec_diff_now(struct timespec start) +{ + struct timespec end; + + clock_gettime(CLOCK_MONOTONIC, &end); + return timespec_sub(end, start); +} + +struct timespec timespec_div(struct timespec ts, int divisor) +{ + int64_t ns = timespec_to_ns(ts) / divisor; + + return timespec_add_ns((struct timespec){0}, ns); +} + +void print_skip(const char *fmt, ...) +{ + va_list ap; + + assert(fmt); + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); + puts(", skipping test"); +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/handlers.S b/tools/testing/selftests/kvm/lib/x86_64/handlers.S new file mode 100644 index 000000000..aaf7bc7d2 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86_64/handlers.S @@ -0,0 +1,81 @@ +handle_exception: + push %r15 + push %r14 + push %r13 + push %r12 + push %r11 + push %r10 + push %r9 + push %r8 + + push %rdi + push %rsi + push %rbp + push %rbx + push %rdx + push %rcx + push %rax + mov %rsp, %rdi + + call route_exception + + pop %rax + pop %rcx + pop %rdx + pop %rbx + pop %rbp + pop %rsi + pop %rdi + pop %r8 + pop %r9 + pop %r10 + pop %r11 + pop %r12 + pop %r13 + pop %r14 + pop %r15 + + /* Discard vector and error code. */ + add $16, %rsp + iretq + +/* + * Build the handle_exception wrappers which push the vector/error code on the + * stack and an array of pointers to those wrappers. + */ +.pushsection .rodata +.globl idt_handlers +idt_handlers: +.popsection + +.macro HANDLERS has_error from to + vector = \from + .rept \to - \from + 1 + .align 8 + + /* Fetch current address and append it to idt_handlers. */ + current_handler = . +.pushsection .rodata +.quad current_handler +.popsection + + .if ! \has_error + pushq $0 + .endif + pushq $vector + jmp handle_exception + vector = vector + 1 + .endr +.endm + +.global idt_handler_code +idt_handler_code: + HANDLERS has_error=0 from=0 to=7 + HANDLERS has_error=1 from=8 to=8 + HANDLERS has_error=0 from=9 to=9 + HANDLERS has_error=1 from=10 to=14 + HANDLERS has_error=0 from=15 to=16 + HANDLERS has_error=1 from=17 to=17 + HANDLERS has_error=0 from=18 to=255 + +.section .note.GNU-stack, "", %progbits diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c new file mode 100644 index 000000000..f5d2d27be --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -0,0 +1,1258 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * tools/testing/selftests/kvm/lib/x86_64/processor.c + * + * Copyright (C) 2018, Google LLC. + */ + +#define _GNU_SOURCE /* for program_invocation_name */ + +#include "test_util.h" +#include "kvm_util.h" +#include "../kvm_util_internal.h" +#include "processor.h" + +#ifndef NUM_INTERRUPTS +#define NUM_INTERRUPTS 256 +#endif + +#define DEFAULT_CODE_SELECTOR 0x8 +#define DEFAULT_DATA_SELECTOR 0x10 + +/* Minimum physical address used for virtual translation tables. */ +#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 + +vm_vaddr_t exception_handlers; + +/* Virtual translation table structure declarations */ +struct pageMapL4Entry { + uint64_t present:1; + uint64_t writable:1; + uint64_t user:1; + uint64_t write_through:1; + uint64_t cache_disable:1; + uint64_t accessed:1; + uint64_t ignored_06:1; + uint64_t page_size:1; + uint64_t ignored_11_08:4; + uint64_t address:40; + uint64_t ignored_62_52:11; + uint64_t execute_disable:1; +}; + +struct pageDirectoryPointerEntry { + uint64_t present:1; + uint64_t writable:1; + uint64_t user:1; + uint64_t write_through:1; + uint64_t cache_disable:1; + uint64_t accessed:1; + uint64_t ignored_06:1; + uint64_t page_size:1; + uint64_t ignored_11_08:4; + uint64_t address:40; + uint64_t ignored_62_52:11; + uint64_t execute_disable:1; +}; + +struct pageDirectoryEntry { + uint64_t present:1; + uint64_t writable:1; + uint64_t user:1; + uint64_t write_through:1; + uint64_t cache_disable:1; + uint64_t accessed:1; + uint64_t ignored_06:1; + uint64_t page_size:1; + uint64_t ignored_11_08:4; + uint64_t address:40; + uint64_t ignored_62_52:11; + uint64_t execute_disable:1; +}; + +struct pageTableEntry { + uint64_t present:1; + uint64_t writable:1; + uint64_t user:1; + uint64_t write_through:1; + uint64_t cache_disable:1; + uint64_t accessed:1; + uint64_t dirty:1; + uint64_t reserved_07:1; + uint64_t global:1; + uint64_t ignored_11_09:3; + uint64_t address:40; + uint64_t ignored_62_52:11; + uint64_t execute_disable:1; +}; + +void regs_dump(FILE *stream, struct kvm_regs *regs, + uint8_t indent) +{ + fprintf(stream, "%*srax: 0x%.16llx rbx: 0x%.16llx " + "rcx: 0x%.16llx rdx: 0x%.16llx\n", + indent, "", + regs->rax, regs->rbx, regs->rcx, regs->rdx); + fprintf(stream, "%*srsi: 0x%.16llx rdi: 0x%.16llx " + "rsp: 0x%.16llx rbp: 0x%.16llx\n", + indent, "", + regs->rsi, regs->rdi, regs->rsp, regs->rbp); + fprintf(stream, "%*sr8: 0x%.16llx r9: 0x%.16llx " + "r10: 0x%.16llx r11: 0x%.16llx\n", + indent, "", + regs->r8, regs->r9, regs->r10, regs->r11); + fprintf(stream, "%*sr12: 0x%.16llx r13: 0x%.16llx " + "r14: 0x%.16llx r15: 0x%.16llx\n", + indent, "", + regs->r12, regs->r13, regs->r14, regs->r15); + fprintf(stream, "%*srip: 0x%.16llx rfl: 0x%.16llx\n", + indent, "", + regs->rip, regs->rflags); +} + +/* + * Segment Dump + * + * Input Args: + * stream - Output FILE stream + * segment - KVM segment + * indent - Left margin indent amount + * + * Output Args: None + * + * Return: None + * + * Dumps the state of the KVM segment given by @segment, to the FILE stream + * given by @stream. + */ +static void segment_dump(FILE *stream, struct kvm_segment *segment, + uint8_t indent) +{ + fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.8x " + "selector: 0x%.4x type: 0x%.2x\n", + indent, "", segment->base, segment->limit, + segment->selector, segment->type); + fprintf(stream, "%*spresent: 0x%.2x dpl: 0x%.2x " + "db: 0x%.2x s: 0x%.2x l: 0x%.2x\n", + indent, "", segment->present, segment->dpl, + segment->db, segment->s, segment->l); + fprintf(stream, "%*sg: 0x%.2x avl: 0x%.2x " + "unusable: 0x%.2x padding: 0x%.2x\n", + indent, "", segment->g, segment->avl, + segment->unusable, segment->padding); +} + +/* + * dtable Dump + * + * Input Args: + * stream - Output FILE stream + * dtable - KVM dtable + * indent - Left margin indent amount + * + * Output Args: None + * + * Return: None + * + * Dumps the state of the KVM dtable given by @dtable, to the FILE stream + * given by @stream. + */ +static void dtable_dump(FILE *stream, struct kvm_dtable *dtable, + uint8_t indent) +{ + fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.4x " + "padding: 0x%.4x 0x%.4x 0x%.4x\n", + indent, "", dtable->base, dtable->limit, + dtable->padding[0], dtable->padding[1], dtable->padding[2]); +} + +void sregs_dump(FILE *stream, struct kvm_sregs *sregs, + uint8_t indent) +{ + unsigned int i; + + fprintf(stream, "%*scs:\n", indent, ""); + segment_dump(stream, &sregs->cs, indent + 2); + fprintf(stream, "%*sds:\n", indent, ""); + segment_dump(stream, &sregs->ds, indent + 2); + fprintf(stream, "%*ses:\n", indent, ""); + segment_dump(stream, &sregs->es, indent + 2); + fprintf(stream, "%*sfs:\n", indent, ""); + segment_dump(stream, &sregs->fs, indent + 2); + fprintf(stream, "%*sgs:\n", indent, ""); + segment_dump(stream, &sregs->gs, indent + 2); + fprintf(stream, "%*sss:\n", indent, ""); + segment_dump(stream, &sregs->ss, indent + 2); + fprintf(stream, "%*str:\n", indent, ""); + segment_dump(stream, &sregs->tr, indent + 2); + fprintf(stream, "%*sldt:\n", indent, ""); + segment_dump(stream, &sregs->ldt, indent + 2); + + fprintf(stream, "%*sgdt:\n", indent, ""); + dtable_dump(stream, &sregs->gdt, indent + 2); + fprintf(stream, "%*sidt:\n", indent, ""); + dtable_dump(stream, &sregs->idt, indent + 2); + + fprintf(stream, "%*scr0: 0x%.16llx cr2: 0x%.16llx " + "cr3: 0x%.16llx cr4: 0x%.16llx\n", + indent, "", + sregs->cr0, sregs->cr2, sregs->cr3, sregs->cr4); + fprintf(stream, "%*scr8: 0x%.16llx efer: 0x%.16llx " + "apic_base: 0x%.16llx\n", + indent, "", + sregs->cr8, sregs->efer, sregs->apic_base); + + fprintf(stream, "%*sinterrupt_bitmap:\n", indent, ""); + for (i = 0; i < (KVM_NR_INTERRUPTS + 63) / 64; i++) { + fprintf(stream, "%*s%.16llx\n", indent + 2, "", + sregs->interrupt_bitmap[i]); + } +} + +void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot) +{ + TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " + "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + + /* If needed, create page map l4 table. */ + if (!vm->pgd_created) { + vm_paddr_t paddr = vm_phy_page_alloc(vm, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); + vm->pgd = paddr; + vm->pgd_created = true; + } +} + +void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + uint32_t pgd_memslot) +{ + uint16_t index[4]; + struct pageMapL4Entry *pml4e; + + TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " + "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + + TEST_ASSERT((vaddr % vm->page_size) == 0, + "Virtual address not on page boundary,\n" + " vaddr: 0x%lx vm->page_size: 0x%x", + vaddr, vm->page_size); + TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, + (vaddr >> vm->page_shift)), + "Invalid virtual address, vaddr: 0x%lx", + vaddr); + TEST_ASSERT((paddr % vm->page_size) == 0, + "Physical address not on page boundary,\n" + " paddr: 0x%lx vm->page_size: 0x%x", + paddr, vm->page_size); + TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, + "Physical address beyond beyond maximum supported,\n" + " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", + paddr, vm->max_gfn, vm->page_size); + + index[0] = (vaddr >> 12) & 0x1ffu; + index[1] = (vaddr >> 21) & 0x1ffu; + index[2] = (vaddr >> 30) & 0x1ffu; + index[3] = (vaddr >> 39) & 0x1ffu; + + /* Allocate page directory pointer table if not present. */ + pml4e = addr_gpa2hva(vm, vm->pgd); + if (!pml4e[index[3]].present) { + pml4e[index[3]].address = vm_phy_page_alloc(vm, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot) + >> vm->page_shift; + pml4e[index[3]].writable = true; + pml4e[index[3]].present = true; + } + + /* Allocate page directory table if not present. */ + struct pageDirectoryPointerEntry *pdpe; + pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size); + if (!pdpe[index[2]].present) { + pdpe[index[2]].address = vm_phy_page_alloc(vm, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot) + >> vm->page_shift; + pdpe[index[2]].writable = true; + pdpe[index[2]].present = true; + } + + /* Allocate page table if not present. */ + struct pageDirectoryEntry *pde; + pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size); + if (!pde[index[1]].present) { + pde[index[1]].address = vm_phy_page_alloc(vm, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot) + >> vm->page_shift; + pde[index[1]].writable = true; + pde[index[1]].present = true; + } + + /* Fill in page table entry. */ + struct pageTableEntry *pte; + pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size); + pte[index[0]].address = paddr >> vm->page_shift; + pte[index[0]].writable = true; + pte[index[0]].present = 1; +} + +void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +{ + struct pageMapL4Entry *pml4e, *pml4e_start; + struct pageDirectoryPointerEntry *pdpe, *pdpe_start; + struct pageDirectoryEntry *pde, *pde_start; + struct pageTableEntry *pte, *pte_start; + + if (!vm->pgd_created) + return; + + fprintf(stream, "%*s " + " no\n", indent, ""); + fprintf(stream, "%*s index hvaddr gpaddr " + "addr w exec dirty\n", + indent, ""); + pml4e_start = (struct pageMapL4Entry *) addr_gpa2hva(vm, + vm->pgd); + for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) { + pml4e = &pml4e_start[n1]; + if (!pml4e->present) + continue; + fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10lx %u " + " %u\n", + indent, "", + pml4e - pml4e_start, pml4e, + addr_hva2gpa(vm, pml4e), (uint64_t) pml4e->address, + pml4e->writable, pml4e->execute_disable); + + pdpe_start = addr_gpa2hva(vm, pml4e->address + * vm->page_size); + for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) { + pdpe = &pdpe_start[n2]; + if (!pdpe->present) + continue; + fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10lx " + "%u %u\n", + indent, "", + pdpe - pdpe_start, pdpe, + addr_hva2gpa(vm, pdpe), + (uint64_t) pdpe->address, pdpe->writable, + pdpe->execute_disable); + + pde_start = addr_gpa2hva(vm, + pdpe->address * vm->page_size); + for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) { + pde = &pde_start[n3]; + if (!pde->present) + continue; + fprintf(stream, "%*spde 0x%-3zx %p " + "0x%-12lx 0x%-10lx %u %u\n", + indent, "", pde - pde_start, pde, + addr_hva2gpa(vm, pde), + (uint64_t) pde->address, pde->writable, + pde->execute_disable); + + pte_start = addr_gpa2hva(vm, + pde->address * vm->page_size); + for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) { + pte = &pte_start[n4]; + if (!pte->present) + continue; + fprintf(stream, "%*spte 0x%-3zx %p " + "0x%-12lx 0x%-10lx %u %u " + " %u 0x%-10lx\n", + indent, "", + pte - pte_start, pte, + addr_hva2gpa(vm, pte), + (uint64_t) pte->address, + pte->writable, + pte->execute_disable, + pte->dirty, + ((uint64_t) n1 << 27) + | ((uint64_t) n2 << 18) + | ((uint64_t) n3 << 9) + | ((uint64_t) n4)); + } + } + } + } +} + +/* + * Set Unusable Segment + * + * Input Args: None + * + * Output Args: + * segp - Pointer to segment register + * + * Return: None + * + * Sets the segment register pointed to by @segp to an unusable state. + */ +static void kvm_seg_set_unusable(struct kvm_segment *segp) +{ + memset(segp, 0, sizeof(*segp)); + segp->unusable = true; +} + +static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp) +{ + void *gdt = addr_gva2hva(vm, vm->gdt); + struct desc64 *desc = gdt + (segp->selector >> 3) * 8; + + desc->limit0 = segp->limit & 0xFFFF; + desc->base0 = segp->base & 0xFFFF; + desc->base1 = segp->base >> 16; + desc->type = segp->type; + desc->s = segp->s; + desc->dpl = segp->dpl; + desc->p = segp->present; + desc->limit1 = segp->limit >> 16; + desc->avl = segp->avl; + desc->l = segp->l; + desc->db = segp->db; + desc->g = segp->g; + desc->base2 = segp->base >> 24; + if (!segp->s) + desc->base3 = segp->base >> 32; +} + + +/* + * Set Long Mode Flat Kernel Code Segment + * + * Input Args: + * vm - VM whose GDT is being filled, or NULL to only write segp + * selector - selector value + * + * Output Args: + * segp - Pointer to KVM segment + * + * Return: None + * + * Sets up the KVM segment pointed to by @segp, to be a code segment + * with the selector value given by @selector. + */ +static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector, + struct kvm_segment *segp) +{ + memset(segp, 0, sizeof(*segp)); + segp->selector = selector; + segp->limit = 0xFFFFFFFFu; + segp->s = 0x1; /* kTypeCodeData */ + segp->type = 0x08 | 0x01 | 0x02; /* kFlagCode | kFlagCodeAccessed + * | kFlagCodeReadable + */ + segp->g = true; + segp->l = true; + segp->present = 1; + if (vm) + kvm_seg_fill_gdt_64bit(vm, segp); +} + +/* + * Set Long Mode Flat Kernel Data Segment + * + * Input Args: + * vm - VM whose GDT is being filled, or NULL to only write segp + * selector - selector value + * + * Output Args: + * segp - Pointer to KVM segment + * + * Return: None + * + * Sets up the KVM segment pointed to by @segp, to be a data segment + * with the selector value given by @selector. + */ +static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector, + struct kvm_segment *segp) +{ + memset(segp, 0, sizeof(*segp)); + segp->selector = selector; + segp->limit = 0xFFFFFFFFu; + segp->s = 0x1; /* kTypeCodeData */ + segp->type = 0x00 | 0x01 | 0x02; /* kFlagData | kFlagDataAccessed + * | kFlagDataWritable + */ + segp->g = true; + segp->present = true; + if (vm) + kvm_seg_fill_gdt_64bit(vm, segp); +} + +vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +{ + uint16_t index[4]; + struct pageMapL4Entry *pml4e; + struct pageDirectoryPointerEntry *pdpe; + struct pageDirectoryEntry *pde; + struct pageTableEntry *pte; + + TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " + "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + + index[0] = (gva >> 12) & 0x1ffu; + index[1] = (gva >> 21) & 0x1ffu; + index[2] = (gva >> 30) & 0x1ffu; + index[3] = (gva >> 39) & 0x1ffu; + + if (!vm->pgd_created) + goto unmapped_gva; + pml4e = addr_gpa2hva(vm, vm->pgd); + if (!pml4e[index[3]].present) + goto unmapped_gva; + + pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size); + if (!pdpe[index[2]].present) + goto unmapped_gva; + + pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size); + if (!pde[index[1]].present) + goto unmapped_gva; + + pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size); + if (!pte[index[0]].present) + goto unmapped_gva; + + return (pte[index[0]].address * vm->page_size) + (gva & 0xfffu); + +unmapped_gva: + TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva); + exit(EXIT_FAILURE); +} + +static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt, int gdt_memslot, + int pgd_memslot) +{ + if (!vm->gdt) + vm->gdt = vm_vaddr_alloc(vm, getpagesize(), + KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot); + + dt->base = vm->gdt; + dt->limit = getpagesize(); +} + +static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, + int selector, int gdt_memslot, + int pgd_memslot) +{ + if (!vm->tss) + vm->tss = vm_vaddr_alloc(vm, getpagesize(), + KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot); + + memset(segp, 0, sizeof(*segp)); + segp->base = vm->tss; + segp->limit = 0x67; + segp->selector = selector; + segp->type = 0xb; + segp->present = 1; + kvm_seg_fill_gdt_64bit(vm, segp); +} + +static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot) +{ + struct kvm_sregs sregs; + + /* Set mode specific system register values. */ + vcpu_sregs_get(vm, vcpuid, &sregs); + + sregs.idt.limit = 0; + + kvm_setup_gdt(vm, &sregs.gdt, gdt_memslot, pgd_memslot); + + switch (vm->mode) { + case VM_MODE_PXXV48_4K: + sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG; + sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR; + sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); + + kvm_seg_set_unusable(&sregs.ldt); + kvm_seg_set_kernel_code_64bit(vm, DEFAULT_CODE_SELECTOR, &sregs.cs); + kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.ds); + kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.es); + kvm_setup_tss_64bit(vm, &sregs.tr, 0x18, gdt_memslot, pgd_memslot); + break; + + default: + TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); + } + + sregs.cr3 = vm->pgd; + vcpu_sregs_set(vm, vcpuid, &sregs); +} + +void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) +{ + struct kvm_mp_state mp_state; + struct kvm_regs regs; + vm_vaddr_t stack_vaddr; + stack_vaddr = vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(), + DEFAULT_GUEST_STACK_VADDR_MIN, 0, 0); + + /* Create VCPU */ + vm_vcpu_add(vm, vcpuid); + vcpu_setup(vm, vcpuid, 0, 0); + + /* Setup guest general purpose registers */ + vcpu_regs_get(vm, vcpuid, ®s); + regs.rflags = regs.rflags | 0x2; + regs.rsp = stack_vaddr + (DEFAULT_STACK_PGS * getpagesize()); + regs.rip = (unsigned long) guest_code; + vcpu_regs_set(vm, vcpuid, ®s); + + /* Setup the MP state */ + mp_state.mp_state = 0; + vcpu_set_mp_state(vm, vcpuid, &mp_state); +} + +/* + * Allocate an instance of struct kvm_cpuid2 + * + * Input Args: None + * + * Output Args: None + * + * Return: A pointer to the allocated struct. The caller is responsible + * for freeing this struct. + * + * Since kvm_cpuid2 uses a 0-length array to allow a the size of the + * array to be decided at allocation time, allocation is slightly + * complicated. This function uses a reasonable default length for + * the array and performs the appropriate allocation. + */ +static struct kvm_cpuid2 *allocate_kvm_cpuid2(void) +{ + struct kvm_cpuid2 *cpuid; + int nent = 100; + size_t size; + + size = sizeof(*cpuid); + size += nent * sizeof(struct kvm_cpuid_entry2); + cpuid = malloc(size); + if (!cpuid) { + perror("malloc"); + abort(); + } + + cpuid->nent = nent; + + return cpuid; +} + +/* + * KVM Supported CPUID Get + * + * Input Args: None + * + * Output Args: + * + * Return: The supported KVM CPUID + * + * Get the guest CPUID supported by KVM. + */ +struct kvm_cpuid2 *kvm_get_supported_cpuid(void) +{ + static struct kvm_cpuid2 *cpuid; + int ret; + int kvm_fd; + + if (cpuid) + return cpuid; + + cpuid = allocate_kvm_cpuid2(); + kvm_fd = open(KVM_DEV_PATH, O_RDONLY); + if (kvm_fd < 0) + exit(KSFT_SKIP); + + ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid); + TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_CPUID failed %d %d\n", + ret, errno); + + close(kvm_fd); + return cpuid; +} + +/* + * Locate a cpuid entry. + * + * Input Args: + * function: The function of the cpuid entry to find. + * index: The index of the cpuid entry. + * + * Output Args: None + * + * Return: A pointer to the cpuid entry. Never returns NULL. + */ +struct kvm_cpuid_entry2 * +kvm_get_supported_cpuid_index(uint32_t function, uint32_t index) +{ + struct kvm_cpuid2 *cpuid; + struct kvm_cpuid_entry2 *entry = NULL; + int i; + + cpuid = kvm_get_supported_cpuid(); + for (i = 0; i < cpuid->nent; i++) { + if (cpuid->entries[i].function == function && + cpuid->entries[i].index == index) { + entry = &cpuid->entries[i]; + break; + } + } + + TEST_ASSERT(entry, "Guest CPUID entry not found: (EAX=%x, ECX=%x).", + function, index); + return entry; +} + +/* + * VM VCPU CPUID Set + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU id + * cpuid - The CPUID values to set. + * + * Output Args: None + * + * Return: void + * + * Set the VCPU's CPUID. + */ +void vcpu_set_cpuid(struct kvm_vm *vm, + uint32_t vcpuid, struct kvm_cpuid2 *cpuid) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int rc; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + rc = ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid); + TEST_ASSERT(rc == 0, "KVM_SET_CPUID2 failed, rc: %i errno: %i", + rc, errno); + +} + +struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, + void *guest_code) +{ + struct kvm_vm *vm; + /* + * For x86 the maximum page table size for a memory region + * will be when only 4K pages are used. In that case the + * total extra size for page tables (for extra N pages) will + * be: N/512+N/512^2+N/512^3+... which is definitely smaller + * than N/512*2. + */ + uint64_t extra_pg_pages = extra_mem_pages / 512 * 2; + + /* Create VM */ + vm = vm_create(VM_MODE_DEFAULT, + DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, + O_RDWR); + + /* Setup guest code */ + kvm_vm_elf_load(vm, program_invocation_name, 0, 0); + + /* Setup IRQ Chip */ + vm_create_irqchip(vm); + + /* Add the first vCPU. */ + vm_vcpu_add_default(vm, vcpuid, guest_code); + + return vm; +} + +/* + * VCPU Get MSR + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * msr_index - Index of MSR + * + * Output Args: None + * + * Return: On success, value of the MSR. On failure a TEST_ASSERT is produced. + * + * Get value of MSR for VCPU. + */ +uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + struct { + struct kvm_msrs header; + struct kvm_msr_entry entry; + } buffer = {}; + int r; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + buffer.header.nmsrs = 1; + buffer.entry.index = msr_index; + r = ioctl(vcpu->fd, KVM_GET_MSRS, &buffer.header); + TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n" + " rc: %i errno: %i", r, errno); + + return buffer.entry.data; +} + +/* + * _VCPU Set MSR + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * msr_index - Index of MSR + * msr_value - New value of MSR + * + * Output Args: None + * + * Return: The result of KVM_SET_MSRS. + * + * Sets the value of an MSR for the given VCPU. + */ +int _vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, + uint64_t msr_value) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + struct { + struct kvm_msrs header; + struct kvm_msr_entry entry; + } buffer = {}; + int r; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + memset(&buffer, 0, sizeof(buffer)); + buffer.header.nmsrs = 1; + buffer.entry.index = msr_index; + buffer.entry.data = msr_value; + r = ioctl(vcpu->fd, KVM_SET_MSRS, &buffer.header); + return r; +} + +/* + * VCPU Set MSR + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * msr_index - Index of MSR + * msr_value - New value of MSR + * + * Output Args: None + * + * Return: On success, nothing. On failure a TEST_ASSERT is produced. + * + * Set value of MSR for VCPU. + */ +void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, + uint64_t msr_value) +{ + int r; + + r = _vcpu_set_msr(vm, vcpuid, msr_index, msr_value); + TEST_ASSERT(r == 1, "KVM_SET_MSRS IOCTL failed,\n" + " rc: %i errno: %i", r, errno); +} + +void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) +{ + va_list ap; + struct kvm_regs regs; + + TEST_ASSERT(num >= 1 && num <= 6, "Unsupported number of args,\n" + " num: %u\n", + num); + + va_start(ap, num); + vcpu_regs_get(vm, vcpuid, ®s); + + if (num >= 1) + regs.rdi = va_arg(ap, uint64_t); + + if (num >= 2) + regs.rsi = va_arg(ap, uint64_t); + + if (num >= 3) + regs.rdx = va_arg(ap, uint64_t); + + if (num >= 4) + regs.rcx = va_arg(ap, uint64_t); + + if (num >= 5) + regs.r8 = va_arg(ap, uint64_t); + + if (num >= 6) + regs.r9 = va_arg(ap, uint64_t); + + vcpu_regs_set(vm, vcpuid, ®s); + va_end(ap); +} + +void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) +{ + struct kvm_regs regs; + struct kvm_sregs sregs; + + fprintf(stream, "%*scpuid: %u\n", indent, "", vcpuid); + + fprintf(stream, "%*sregs:\n", indent + 2, ""); + vcpu_regs_get(vm, vcpuid, ®s); + regs_dump(stream, ®s, indent + 4); + + fprintf(stream, "%*ssregs:\n", indent + 2, ""); + vcpu_sregs_get(vm, vcpuid, &sregs); + sregs_dump(stream, &sregs, indent + 4); +} + +struct kvm_x86_state { + struct kvm_vcpu_events events; + struct kvm_mp_state mp_state; + struct kvm_regs regs; + struct kvm_xsave xsave; + struct kvm_xcrs xcrs; + struct kvm_sregs sregs; + struct kvm_debugregs debugregs; + union { + struct kvm_nested_state nested; + char nested_[16384]; + }; + struct kvm_msrs msrs; +}; + +static int kvm_get_num_msrs_fd(int kvm_fd) +{ + struct kvm_msr_list nmsrs; + int r; + + nmsrs.nmsrs = 0; + r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs); + TEST_ASSERT(r == -1 && errno == E2BIG, "Unexpected result from KVM_GET_MSR_INDEX_LIST probe, r: %i", + r); + + return nmsrs.nmsrs; +} + +static int kvm_get_num_msrs(struct kvm_vm *vm) +{ + return kvm_get_num_msrs_fd(vm->kvm_fd); +} + +struct kvm_msr_list *kvm_get_msr_index_list(void) +{ + struct kvm_msr_list *list; + int nmsrs, r, kvm_fd; + + kvm_fd = open(KVM_DEV_PATH, O_RDONLY); + if (kvm_fd < 0) + exit(KSFT_SKIP); + + nmsrs = kvm_get_num_msrs_fd(kvm_fd); + list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); + list->nmsrs = nmsrs; + r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list); + close(kvm_fd); + + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i", + r); + + return list; +} + +struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + struct kvm_msr_list *list; + struct kvm_x86_state *state; + int nmsrs, r, i; + static int nested_size = -1; + + if (nested_size == -1) { + nested_size = kvm_check_cap(KVM_CAP_NESTED_STATE); + TEST_ASSERT(nested_size <= sizeof(state->nested_), + "Nested state size too big, %i > %zi", + nested_size, sizeof(state->nested_)); + } + + /* + * When KVM exits to userspace with KVM_EXIT_IO, KVM guarantees + * guest state is consistent only after userspace re-enters the + * kernel with KVM_RUN. Complete IO prior to migrating state + * to a new VM. + */ + vcpu_run_complete_io(vm, vcpuid); + + nmsrs = kvm_get_num_msrs(vm); + list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); + list->nmsrs = nmsrs; + r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i", + r); + + state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0])); + r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_GET_MP_STATE, &state->mp_state); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_GET_REGS, &state->regs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_GET_XSAVE, &state->xsave); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i", + r); + + if (kvm_check_cap(KVM_CAP_XCRS)) { + r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XCRS, r: %i", + r); + } + + r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i", + r); + + if (nested_size) { + state->nested.size = sizeof(state->nested_); + r = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, &state->nested); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_NESTED_STATE, r: %i", + r); + TEST_ASSERT(state->nested.size <= nested_size, + "Nested state size too big, %i (KVM_CHECK_CAP gave %i)", + state->nested.size, nested_size); + } else + state->nested.size = 0; + + state->msrs.nmsrs = nmsrs; + for (i = 0; i < nmsrs; i++) + state->msrs.entries[i].index = list->indices[i]; + r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs); + TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)", + r, r == nmsrs ? -1 : list->indices[r]); + + r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i", + r); + + free(list); + return state; +} + +void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int r; + + r = ioctl(vcpu->fd, KVM_SET_XSAVE, &state->xsave); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i", + r); + + if (kvm_check_cap(KVM_CAP_XCRS)) { + r = ioctl(vcpu->fd, KVM_SET_XCRS, &state->xcrs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XCRS, r: %i", + r); + } + + r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs); + TEST_ASSERT(r == state->msrs.nmsrs, "Unexpected result from KVM_SET_MSRS, r: %i (failed at %x)", + r, r == state->msrs.nmsrs ? -1 : state->msrs.entries[r].index); + + r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_SET_MP_STATE, &state->mp_state); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_SET_DEBUGREGS, &state->debugregs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_SET_REGS, &state->regs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i", + r); + + if (state->nested.size) { + r = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, &state->nested); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_NESTED_STATE, r: %i", + r); + } +} + +bool is_intel_cpu(void) +{ + int eax, ebx, ecx, edx; + const uint32_t *chunk; + const int leaf = 0; + + __asm__ __volatile__( + "cpuid" + : /* output */ "=a"(eax), "=b"(ebx), + "=c"(ecx), "=d"(edx) + : /* input */ "0"(leaf), "2"(0)); + + chunk = (const uint32_t *)("GenuineIntel"); + return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]); +} + +uint32_t kvm_get_cpuid_max_basic(void) +{ + return kvm_get_supported_cpuid_entry(0)->eax; +} + +uint32_t kvm_get_cpuid_max_extended(void) +{ + return kvm_get_supported_cpuid_entry(0x80000000)->eax; +} + +void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits) +{ + struct kvm_cpuid_entry2 *entry; + bool pae; + + /* SDM 4.1.4 */ + if (kvm_get_cpuid_max_extended() < 0x80000008) { + pae = kvm_get_supported_cpuid_entry(1)->edx & (1 << 6); + *pa_bits = pae ? 36 : 32; + *va_bits = 32; + } else { + entry = kvm_get_supported_cpuid_entry(0x80000008); + *pa_bits = entry->eax & 0xff; + *va_bits = (entry->eax >> 8) & 0xff; + } +} + +struct idt_entry { + uint16_t offset0; + uint16_t selector; + uint16_t ist : 3; + uint16_t : 5; + uint16_t type : 4; + uint16_t : 1; + uint16_t dpl : 2; + uint16_t p : 1; + uint16_t offset1; + uint32_t offset2; uint32_t reserved; +}; + +static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr, + int dpl, unsigned short selector) +{ + struct idt_entry *base = + (struct idt_entry *)addr_gva2hva(vm, vm->idt); + struct idt_entry *e = &base[vector]; + + memset(e, 0, sizeof(*e)); + e->offset0 = addr; + e->selector = selector; + e->ist = 0; + e->type = 14; + e->dpl = dpl; + e->p = 1; + e->offset1 = addr >> 16; + e->offset2 = addr >> 32; +} + +void kvm_exit_unexpected_vector(uint32_t value) +{ + outl(UNEXPECTED_VECTOR_PORT, value); +} + +void route_exception(struct ex_regs *regs) +{ + typedef void(*handler)(struct ex_regs *); + handler *handlers = (handler *)exception_handlers; + + if (handlers && handlers[regs->vector]) { + handlers[regs->vector](regs); + return; + } + + kvm_exit_unexpected_vector(regs->vector); +} + +void vm_init_descriptor_tables(struct kvm_vm *vm) +{ + extern void *idt_handlers; + int i; + + vm->idt = vm_vaddr_alloc(vm, getpagesize(), 0x2000, 0, 0); + vm->handlers = vm_vaddr_alloc(vm, 256 * sizeof(void *), 0x2000, 0, 0); + /* Handlers have the same address in both address spaces.*/ + for (i = 0; i < NUM_INTERRUPTS; i++) + set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, + DEFAULT_CODE_SELECTOR); +} + +void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct kvm_sregs sregs; + + vcpu_sregs_get(vm, vcpuid, &sregs); + sregs.idt.base = vm->idt; + sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1; + sregs.gdt.base = vm->gdt; + sregs.gdt.limit = getpagesize() - 1; + kvm_seg_set_kernel_data_64bit(NULL, DEFAULT_DATA_SELECTOR, &sregs.gs); + vcpu_sregs_set(vm, vcpuid, &sregs); + *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; +} + +void vm_handle_exception(struct kvm_vm *vm, int vector, + void (*handler)(struct ex_regs *)) +{ + vm_vaddr_t *handlers = (vm_vaddr_t *)addr_gva2hva(vm, vm->handlers); + + handlers[vector] = (vm_vaddr_t)handler; +} + +void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid) +{ + if (vcpu_state(vm, vcpuid)->exit_reason == KVM_EXIT_IO + && vcpu_state(vm, vcpuid)->io.port == UNEXPECTED_VECTOR_PORT + && vcpu_state(vm, vcpuid)->io.size == 4) { + /* Grab pointer to io data */ + uint32_t *data = (void *)vcpu_state(vm, vcpuid) + + vcpu_state(vm, vcpuid)->io.data_offset; + + TEST_ASSERT(false, + "Unexpected vectored event in guest (vector:0x%x)", + *data); + } +} + +bool set_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 *ent) +{ + int i; + + for (i = 0; i < cpuid->nent; i++) { + struct kvm_cpuid_entry2 *cur = &cpuid->entries[i]; + + if (cur->function != ent->function || cur->index != ent->index) + continue; + + memcpy(cur, ent, sizeof(struct kvm_cpuid_entry2)); + return true; + } + + return false; +} + +uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, + uint64_t a3) +{ + uint64_t r; + + asm volatile("vmcall" + : "=a"(r) + : "a"(nr), "b"(a0), "c"(a1), "d"(a2), "S"(a3)); + return r; +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/svm.c b/tools/testing/selftests/kvm/lib/x86_64/svm.c new file mode 100644 index 000000000..a58507a7b --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86_64/svm.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * tools/testing/selftests/kvm/lib/x86_64/svm.c + * Helpers used for nested SVM testing + * Largely inspired from KVM unit test svm.c + * + * Copyright (C) 2020, Red Hat, Inc. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "../kvm_util_internal.h" +#include "processor.h" +#include "svm_util.h" + +struct gpr64_regs guest_regs; +u64 rflags; + +/* Allocate memory regions for nested SVM tests. + * + * Input Args: + * vm - The VM to allocate guest-virtual addresses in. + * + * Output Args: + * p_svm_gva - The guest virtual address for the struct svm_test_data. + * + * Return: + * Pointer to structure with the addresses of the SVM areas. + */ +struct svm_test_data * +vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva) +{ + vm_vaddr_t svm_gva = vm_vaddr_alloc(vm, getpagesize(), + 0x10000, 0, 0); + struct svm_test_data *svm = addr_gva2hva(vm, svm_gva); + + svm->vmcb = (void *)vm_vaddr_alloc(vm, getpagesize(), + 0x10000, 0, 0); + svm->vmcb_hva = addr_gva2hva(vm, (uintptr_t)svm->vmcb); + svm->vmcb_gpa = addr_gva2gpa(vm, (uintptr_t)svm->vmcb); + + svm->save_area = (void *)vm_vaddr_alloc(vm, getpagesize(), + 0x10000, 0, 0); + svm->save_area_hva = addr_gva2hva(vm, (uintptr_t)svm->save_area); + svm->save_area_gpa = addr_gva2gpa(vm, (uintptr_t)svm->save_area); + + *p_svm_gva = svm_gva; + return svm; +} + +static void vmcb_set_seg(struct vmcb_seg *seg, u16 selector, + u64 base, u32 limit, u32 attr) +{ + seg->selector = selector; + seg->attrib = attr; + seg->limit = limit; + seg->base = base; +} + +/* + * Avoid using memset to clear the vmcb, since libc may not be + * available in L1 (and, even if it is, features that libc memset may + * want to use, like AVX, may not be enabled). + */ +static void clear_vmcb(struct vmcb *vmcb) +{ + int n = sizeof(*vmcb) / sizeof(u32); + + asm volatile ("rep stosl" : "+c"(n), "+D"(vmcb) : "a"(0) : "memory"); +} + +void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp) +{ + struct vmcb *vmcb = svm->vmcb; + uint64_t vmcb_gpa = svm->vmcb_gpa; + struct vmcb_save_area *save = &vmcb->save; + struct vmcb_control_area *ctrl = &vmcb->control; + u32 data_seg_attr = 3 | SVM_SELECTOR_S_MASK | SVM_SELECTOR_P_MASK + | SVM_SELECTOR_DB_MASK | SVM_SELECTOR_G_MASK; + u32 code_seg_attr = 9 | SVM_SELECTOR_S_MASK | SVM_SELECTOR_P_MASK + | SVM_SELECTOR_L_MASK | SVM_SELECTOR_G_MASK; + uint64_t efer; + + efer = rdmsr(MSR_EFER); + wrmsr(MSR_EFER, efer | EFER_SVME); + wrmsr(MSR_VM_HSAVE_PA, svm->save_area_gpa); + + clear_vmcb(vmcb); + asm volatile ("vmsave %0\n\t" : : "a" (vmcb_gpa) : "memory"); + vmcb_set_seg(&save->es, get_es(), 0, -1U, data_seg_attr); + vmcb_set_seg(&save->cs, get_cs(), 0, -1U, code_seg_attr); + vmcb_set_seg(&save->ss, get_ss(), 0, -1U, data_seg_attr); + vmcb_set_seg(&save->ds, get_ds(), 0, -1U, data_seg_attr); + vmcb_set_seg(&save->gdtr, 0, get_gdt().address, get_gdt().size, 0); + vmcb_set_seg(&save->idtr, 0, get_idt().address, get_idt().size, 0); + + ctrl->asid = 1; + save->cpl = 0; + save->efer = rdmsr(MSR_EFER); + asm volatile ("mov %%cr4, %0" : "=r"(save->cr4) : : "memory"); + asm volatile ("mov %%cr3, %0" : "=r"(save->cr3) : : "memory"); + asm volatile ("mov %%cr0, %0" : "=r"(save->cr0) : : "memory"); + asm volatile ("mov %%dr7, %0" : "=r"(save->dr7) : : "memory"); + asm volatile ("mov %%dr6, %0" : "=r"(save->dr6) : : "memory"); + asm volatile ("mov %%cr2, %0" : "=r"(save->cr2) : : "memory"); + save->g_pat = rdmsr(MSR_IA32_CR_PAT); + save->dbgctl = rdmsr(MSR_IA32_DEBUGCTLMSR); + ctrl->intercept = (1ULL << INTERCEPT_VMRUN) | + (1ULL << INTERCEPT_VMMCALL); + + vmcb->save.rip = (u64)guest_rip; + vmcb->save.rsp = (u64)guest_rsp; + guest_regs.rdi = (u64)svm; +} + +/* + * save/restore 64-bit general registers except rax, rip, rsp + * which are directly handed through the VMCB guest processor state + */ +#define SAVE_GPR_C \ + "xchg %%rbx, guest_regs+0x20\n\t" \ + "xchg %%rcx, guest_regs+0x10\n\t" \ + "xchg %%rdx, guest_regs+0x18\n\t" \ + "xchg %%rbp, guest_regs+0x30\n\t" \ + "xchg %%rsi, guest_regs+0x38\n\t" \ + "xchg %%rdi, guest_regs+0x40\n\t" \ + "xchg %%r8, guest_regs+0x48\n\t" \ + "xchg %%r9, guest_regs+0x50\n\t" \ + "xchg %%r10, guest_regs+0x58\n\t" \ + "xchg %%r11, guest_regs+0x60\n\t" \ + "xchg %%r12, guest_regs+0x68\n\t" \ + "xchg %%r13, guest_regs+0x70\n\t" \ + "xchg %%r14, guest_regs+0x78\n\t" \ + "xchg %%r15, guest_regs+0x80\n\t" + +#define LOAD_GPR_C SAVE_GPR_C + +/* + * selftests do not use interrupts so we dropped clgi/sti/cli/stgi + * for now. registers involved in LOAD/SAVE_GPR_C are eventually + * unmodified so they do not need to be in the clobber list. + */ +void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa) +{ + asm volatile ( + "vmload %[vmcb_gpa]\n\t" + "mov rflags, %%r15\n\t" // rflags + "mov %%r15, 0x170(%[vmcb])\n\t" + "mov guest_regs, %%r15\n\t" // rax + "mov %%r15, 0x1f8(%[vmcb])\n\t" + LOAD_GPR_C + "vmrun %[vmcb_gpa]\n\t" + SAVE_GPR_C + "mov 0x170(%[vmcb]), %%r15\n\t" // rflags + "mov %%r15, rflags\n\t" + "mov 0x1f8(%[vmcb]), %%r15\n\t" // rax + "mov %%r15, guest_regs\n\t" + "vmsave %[vmcb_gpa]\n\t" + : : [vmcb] "r" (vmcb), [vmcb_gpa] "a" (vmcb_gpa) + : "r15", "memory"); +} + +bool nested_svm_supported(void) +{ + struct kvm_cpuid_entry2 *entry = + kvm_get_supported_cpuid_entry(0x80000001); + + return entry->ecx & CPUID_SVM; +} + +void nested_svm_check_supported(void) +{ + if (!nested_svm_supported()) { + print_skip("nested SVM not enabled"); + exit(KSFT_SKIP); + } +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/ucall.c b/tools/testing/selftests/kvm/lib/x86_64/ucall.c new file mode 100644 index 000000000..a3489973e --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86_64/ucall.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ucall support. A ucall is a "hypercall to userspace". + * + * Copyright (C) 2018, Red Hat, Inc. + */ +#include "kvm_util.h" + +#define UCALL_PIO_PORT ((uint16_t)0x1000) + +void ucall_init(struct kvm_vm *vm, void *arg) +{ +} + +void ucall_uninit(struct kvm_vm *vm) +{ +} + +void ucall(uint64_t cmd, int nargs, ...) +{ + struct ucall uc = { + .cmd = cmd, + }; + va_list va; + int i; + + nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS; + + va_start(va, nargs); + for (i = 0; i < nargs; ++i) + uc.args[i] = va_arg(va, uint64_t); + va_end(va); + + asm volatile("in %[port], %%al" + : : [port] "d" (UCALL_PIO_PORT), "D" (&uc) : "rax", "memory"); +} + +uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) +{ + struct kvm_run *run = vcpu_state(vm, vcpu_id); + struct ucall ucall = {}; + + if (uc) + memset(uc, 0, sizeof(*uc)); + + if (run->exit_reason == KVM_EXIT_IO && run->io.port == UCALL_PIO_PORT) { + struct kvm_regs regs; + + vcpu_regs_get(vm, vcpu_id, ®s); + memcpy(&ucall, addr_gva2hva(vm, (vm_vaddr_t)regs.rdi), + sizeof(ucall)); + + vcpu_run_complete_io(vm, vcpu_id); + if (uc) + memcpy(uc, &ucall, sizeof(ucall)); + } + + return ucall.cmd; +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c new file mode 100644 index 000000000..2448b30e8 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -0,0 +1,553 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * tools/testing/selftests/kvm/lib/x86_64/vmx.c + * + * Copyright (C) 2018, Google LLC. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "../kvm_util_internal.h" +#include "processor.h" +#include "vmx.h" + +#define PAGE_SHIFT_4K 12 + +#define KVM_EPT_PAGE_TABLE_MIN_PADDR 0x1c0000 + +bool enable_evmcs; + +struct hv_enlightened_vmcs *current_evmcs; +struct hv_vp_assist_page *current_vp_assist; + +struct eptPageTableEntry { + uint64_t readable:1; + uint64_t writable:1; + uint64_t executable:1; + uint64_t memory_type:3; + uint64_t ignore_pat:1; + uint64_t page_size:1; + uint64_t accessed:1; + uint64_t dirty:1; + uint64_t ignored_11_10:2; + uint64_t address:40; + uint64_t ignored_62_52:11; + uint64_t suppress_ve:1; +}; + +struct eptPageTablePointer { + uint64_t memory_type:3; + uint64_t page_walk_length:3; + uint64_t ad_enabled:1; + uint64_t reserved_11_07:5; + uint64_t address:40; + uint64_t reserved_63_52:12; +}; +int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id) +{ + uint16_t evmcs_ver; + + struct kvm_enable_cap enable_evmcs_cap = { + .cap = KVM_CAP_HYPERV_ENLIGHTENED_VMCS, + .args[0] = (unsigned long)&evmcs_ver + }; + + vcpu_ioctl(vm, vcpu_id, KVM_ENABLE_CAP, &enable_evmcs_cap); + + /* KVM should return supported EVMCS version range */ + TEST_ASSERT(((evmcs_ver >> 8) >= (evmcs_ver & 0xff)) && + (evmcs_ver & 0xff) > 0, + "Incorrect EVMCS version range: %x:%x\n", + evmcs_ver & 0xff, evmcs_ver >> 8); + + return evmcs_ver; +} + +/* Allocate memory regions for nested VMX tests. + * + * Input Args: + * vm - The VM to allocate guest-virtual addresses in. + * + * Output Args: + * p_vmx_gva - The guest virtual address for the struct vmx_pages. + * + * Return: + * Pointer to structure with the addresses of the VMX areas. + */ +struct vmx_pages * +vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva) +{ + vm_vaddr_t vmx_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + struct vmx_pages *vmx = addr_gva2hva(vm, vmx_gva); + + /* Setup of a region of guest memory for the vmxon region. */ + vmx->vmxon = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmxon_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmxon); + vmx->vmxon_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmxon); + + /* Setup of a region of guest memory for a vmcs. */ + vmx->vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmcs); + vmx->vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmcs); + + /* Setup of a region of guest memory for the MSR bitmap. */ + vmx->msr = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->msr_hva = addr_gva2hva(vm, (uintptr_t)vmx->msr); + vmx->msr_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->msr); + memset(vmx->msr_hva, 0, getpagesize()); + + /* Setup of a region of guest memory for the shadow VMCS. */ + vmx->shadow_vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->shadow_vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->shadow_vmcs); + vmx->shadow_vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->shadow_vmcs); + + /* Setup of a region of guest memory for the VMREAD and VMWRITE bitmaps. */ + vmx->vmread = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmread_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmread); + vmx->vmread_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmread); + memset(vmx->vmread_hva, 0, getpagesize()); + + vmx->vmwrite = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmwrite_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmwrite); + vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite); + memset(vmx->vmwrite_hva, 0, getpagesize()); + + /* Setup of a region of guest memory for the VP Assist page. */ + vmx->vp_assist = (void *)vm_vaddr_alloc(vm, getpagesize(), + 0x10000, 0, 0); + vmx->vp_assist_hva = addr_gva2hva(vm, (uintptr_t)vmx->vp_assist); + vmx->vp_assist_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vp_assist); + + /* Setup of a region of guest memory for the enlightened VMCS. */ + vmx->enlightened_vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), + 0x10000, 0, 0); + vmx->enlightened_vmcs_hva = + addr_gva2hva(vm, (uintptr_t)vmx->enlightened_vmcs); + vmx->enlightened_vmcs_gpa = + addr_gva2gpa(vm, (uintptr_t)vmx->enlightened_vmcs); + + *p_vmx_gva = vmx_gva; + return vmx; +} + +bool prepare_for_vmx_operation(struct vmx_pages *vmx) +{ + uint64_t feature_control; + uint64_t required; + unsigned long cr0; + unsigned long cr4; + + /* + * Ensure bits in CR0 and CR4 are valid in VMX operation: + * - Bit X is 1 in _FIXED0: bit X is fixed to 1 in CRx. + * - Bit X is 0 in _FIXED1: bit X is fixed to 0 in CRx. + */ + __asm__ __volatile__("mov %%cr0, %0" : "=r"(cr0) : : "memory"); + cr0 &= rdmsr(MSR_IA32_VMX_CR0_FIXED1); + cr0 |= rdmsr(MSR_IA32_VMX_CR0_FIXED0); + __asm__ __volatile__("mov %0, %%cr0" : : "r"(cr0) : "memory"); + + __asm__ __volatile__("mov %%cr4, %0" : "=r"(cr4) : : "memory"); + cr4 &= rdmsr(MSR_IA32_VMX_CR4_FIXED1); + cr4 |= rdmsr(MSR_IA32_VMX_CR4_FIXED0); + /* Enable VMX operation */ + cr4 |= X86_CR4_VMXE; + __asm__ __volatile__("mov %0, %%cr4" : : "r"(cr4) : "memory"); + + /* + * Configure IA32_FEATURE_CONTROL MSR to allow VMXON: + * Bit 0: Lock bit. If clear, VMXON causes a #GP. + * Bit 2: Enables VMXON outside of SMX operation. If clear, VMXON + * outside of SMX causes a #GP. + */ + required = FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; + required |= FEAT_CTL_LOCKED; + feature_control = rdmsr(MSR_IA32_FEAT_CTL); + if ((feature_control & required) != required) + wrmsr(MSR_IA32_FEAT_CTL, feature_control | required); + + /* Enter VMX root operation. */ + *(uint32_t *)(vmx->vmxon) = vmcs_revision(); + if (vmxon(vmx->vmxon_gpa)) + return false; + + return true; +} + +bool load_vmcs(struct vmx_pages *vmx) +{ + if (!enable_evmcs) { + /* Load a VMCS. */ + *(uint32_t *)(vmx->vmcs) = vmcs_revision(); + if (vmclear(vmx->vmcs_gpa)) + return false; + + if (vmptrld(vmx->vmcs_gpa)) + return false; + + /* Setup shadow VMCS, do not load it yet. */ + *(uint32_t *)(vmx->shadow_vmcs) = + vmcs_revision() | 0x80000000ul; + if (vmclear(vmx->shadow_vmcs_gpa)) + return false; + } else { + if (evmcs_vmptrld(vmx->enlightened_vmcs_gpa, + vmx->enlightened_vmcs)) + return false; + current_evmcs->revision_id = EVMCS_VERSION; + } + + return true; +} + +/* + * Initialize the control fields to the most basic settings possible. + */ +static inline void init_vmcs_control_fields(struct vmx_pages *vmx) +{ + uint32_t sec_exec_ctl = 0; + + vmwrite(VIRTUAL_PROCESSOR_ID, 0); + vmwrite(POSTED_INTR_NV, 0); + + vmwrite(PIN_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PINBASED_CTLS)); + + if (vmx->eptp_gpa) { + uint64_t ept_paddr; + struct eptPageTablePointer eptp = { + .memory_type = VMX_BASIC_MEM_TYPE_WB, + .page_walk_length = 3, /* + 1 */ + .ad_enabled = !!(rdmsr(MSR_IA32_VMX_EPT_VPID_CAP) & VMX_EPT_VPID_CAP_AD_BITS), + .address = vmx->eptp_gpa >> PAGE_SHIFT_4K, + }; + + memcpy(&ept_paddr, &eptp, sizeof(ept_paddr)); + vmwrite(EPT_POINTER, ept_paddr); + sec_exec_ctl |= SECONDARY_EXEC_ENABLE_EPT; + } + + if (!vmwrite(SECONDARY_VM_EXEC_CONTROL, sec_exec_ctl)) + vmwrite(CPU_BASED_VM_EXEC_CONTROL, + rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS) | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); + else { + vmwrite(CPU_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS)); + GUEST_ASSERT(!sec_exec_ctl); + } + + vmwrite(EXCEPTION_BITMAP, 0); + vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0); + vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, -1); /* Never match */ + vmwrite(CR3_TARGET_COUNT, 0); + vmwrite(VM_EXIT_CONTROLS, rdmsr(MSR_IA32_VMX_EXIT_CTLS) | + VM_EXIT_HOST_ADDR_SPACE_SIZE); /* 64-bit host */ + vmwrite(VM_EXIT_MSR_STORE_COUNT, 0); + vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0); + vmwrite(VM_ENTRY_CONTROLS, rdmsr(MSR_IA32_VMX_ENTRY_CTLS) | + VM_ENTRY_IA32E_MODE); /* 64-bit guest */ + vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0); + vmwrite(VM_ENTRY_INTR_INFO_FIELD, 0); + vmwrite(TPR_THRESHOLD, 0); + + vmwrite(CR0_GUEST_HOST_MASK, 0); + vmwrite(CR4_GUEST_HOST_MASK, 0); + vmwrite(CR0_READ_SHADOW, get_cr0()); + vmwrite(CR4_READ_SHADOW, get_cr4()); + + vmwrite(MSR_BITMAP, vmx->msr_gpa); + vmwrite(VMREAD_BITMAP, vmx->vmread_gpa); + vmwrite(VMWRITE_BITMAP, vmx->vmwrite_gpa); +} + +/* + * Initialize the host state fields based on the current host state, with + * the exception of HOST_RSP and HOST_RIP, which should be set by vmlaunch + * or vmresume. + */ +static inline void init_vmcs_host_state(void) +{ + uint32_t exit_controls = vmreadz(VM_EXIT_CONTROLS); + + vmwrite(HOST_ES_SELECTOR, get_es()); + vmwrite(HOST_CS_SELECTOR, get_cs()); + vmwrite(HOST_SS_SELECTOR, get_ss()); + vmwrite(HOST_DS_SELECTOR, get_ds()); + vmwrite(HOST_FS_SELECTOR, get_fs()); + vmwrite(HOST_GS_SELECTOR, get_gs()); + vmwrite(HOST_TR_SELECTOR, get_tr()); + + if (exit_controls & VM_EXIT_LOAD_IA32_PAT) + vmwrite(HOST_IA32_PAT, rdmsr(MSR_IA32_CR_PAT)); + if (exit_controls & VM_EXIT_LOAD_IA32_EFER) + vmwrite(HOST_IA32_EFER, rdmsr(MSR_EFER)); + if (exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) + vmwrite(HOST_IA32_PERF_GLOBAL_CTRL, + rdmsr(MSR_CORE_PERF_GLOBAL_CTRL)); + + vmwrite(HOST_IA32_SYSENTER_CS, rdmsr(MSR_IA32_SYSENTER_CS)); + + vmwrite(HOST_CR0, get_cr0()); + vmwrite(HOST_CR3, get_cr3()); + vmwrite(HOST_CR4, get_cr4()); + vmwrite(HOST_FS_BASE, rdmsr(MSR_FS_BASE)); + vmwrite(HOST_GS_BASE, rdmsr(MSR_GS_BASE)); + vmwrite(HOST_TR_BASE, + get_desc64_base((struct desc64 *)(get_gdt().address + get_tr()))); + vmwrite(HOST_GDTR_BASE, get_gdt().address); + vmwrite(HOST_IDTR_BASE, get_idt().address); + vmwrite(HOST_IA32_SYSENTER_ESP, rdmsr(MSR_IA32_SYSENTER_ESP)); + vmwrite(HOST_IA32_SYSENTER_EIP, rdmsr(MSR_IA32_SYSENTER_EIP)); +} + +/* + * Initialize the guest state fields essentially as a clone of + * the host state fields. Some host state fields have fixed + * values, and we set the corresponding guest state fields accordingly. + */ +static inline void init_vmcs_guest_state(void *rip, void *rsp) +{ + vmwrite(GUEST_ES_SELECTOR, vmreadz(HOST_ES_SELECTOR)); + vmwrite(GUEST_CS_SELECTOR, vmreadz(HOST_CS_SELECTOR)); + vmwrite(GUEST_SS_SELECTOR, vmreadz(HOST_SS_SELECTOR)); + vmwrite(GUEST_DS_SELECTOR, vmreadz(HOST_DS_SELECTOR)); + vmwrite(GUEST_FS_SELECTOR, vmreadz(HOST_FS_SELECTOR)); + vmwrite(GUEST_GS_SELECTOR, vmreadz(HOST_GS_SELECTOR)); + vmwrite(GUEST_LDTR_SELECTOR, 0); + vmwrite(GUEST_TR_SELECTOR, vmreadz(HOST_TR_SELECTOR)); + vmwrite(GUEST_INTR_STATUS, 0); + vmwrite(GUEST_PML_INDEX, 0); + + vmwrite(VMCS_LINK_POINTER, -1ll); + vmwrite(GUEST_IA32_DEBUGCTL, 0); + vmwrite(GUEST_IA32_PAT, vmreadz(HOST_IA32_PAT)); + vmwrite(GUEST_IA32_EFER, vmreadz(HOST_IA32_EFER)); + vmwrite(GUEST_IA32_PERF_GLOBAL_CTRL, + vmreadz(HOST_IA32_PERF_GLOBAL_CTRL)); + + vmwrite(GUEST_ES_LIMIT, -1); + vmwrite(GUEST_CS_LIMIT, -1); + vmwrite(GUEST_SS_LIMIT, -1); + vmwrite(GUEST_DS_LIMIT, -1); + vmwrite(GUEST_FS_LIMIT, -1); + vmwrite(GUEST_GS_LIMIT, -1); + vmwrite(GUEST_LDTR_LIMIT, -1); + vmwrite(GUEST_TR_LIMIT, 0x67); + vmwrite(GUEST_GDTR_LIMIT, 0xffff); + vmwrite(GUEST_IDTR_LIMIT, 0xffff); + vmwrite(GUEST_ES_AR_BYTES, + vmreadz(GUEST_ES_SELECTOR) == 0 ? 0x10000 : 0xc093); + vmwrite(GUEST_CS_AR_BYTES, 0xa09b); + vmwrite(GUEST_SS_AR_BYTES, 0xc093); + vmwrite(GUEST_DS_AR_BYTES, + vmreadz(GUEST_DS_SELECTOR) == 0 ? 0x10000 : 0xc093); + vmwrite(GUEST_FS_AR_BYTES, + vmreadz(GUEST_FS_SELECTOR) == 0 ? 0x10000 : 0xc093); + vmwrite(GUEST_GS_AR_BYTES, + vmreadz(GUEST_GS_SELECTOR) == 0 ? 0x10000 : 0xc093); + vmwrite(GUEST_LDTR_AR_BYTES, 0x10000); + vmwrite(GUEST_TR_AR_BYTES, 0x8b); + vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0); + vmwrite(GUEST_ACTIVITY_STATE, 0); + vmwrite(GUEST_SYSENTER_CS, vmreadz(HOST_IA32_SYSENTER_CS)); + vmwrite(VMX_PREEMPTION_TIMER_VALUE, 0); + + vmwrite(GUEST_CR0, vmreadz(HOST_CR0)); + vmwrite(GUEST_CR3, vmreadz(HOST_CR3)); + vmwrite(GUEST_CR4, vmreadz(HOST_CR4)); + vmwrite(GUEST_ES_BASE, 0); + vmwrite(GUEST_CS_BASE, 0); + vmwrite(GUEST_SS_BASE, 0); + vmwrite(GUEST_DS_BASE, 0); + vmwrite(GUEST_FS_BASE, vmreadz(HOST_FS_BASE)); + vmwrite(GUEST_GS_BASE, vmreadz(HOST_GS_BASE)); + vmwrite(GUEST_LDTR_BASE, 0); + vmwrite(GUEST_TR_BASE, vmreadz(HOST_TR_BASE)); + vmwrite(GUEST_GDTR_BASE, vmreadz(HOST_GDTR_BASE)); + vmwrite(GUEST_IDTR_BASE, vmreadz(HOST_IDTR_BASE)); + vmwrite(GUEST_DR7, 0x400); + vmwrite(GUEST_RSP, (uint64_t)rsp); + vmwrite(GUEST_RIP, (uint64_t)rip); + vmwrite(GUEST_RFLAGS, 2); + vmwrite(GUEST_PENDING_DBG_EXCEPTIONS, 0); + vmwrite(GUEST_SYSENTER_ESP, vmreadz(HOST_IA32_SYSENTER_ESP)); + vmwrite(GUEST_SYSENTER_EIP, vmreadz(HOST_IA32_SYSENTER_EIP)); +} + +void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp) +{ + init_vmcs_control_fields(vmx); + init_vmcs_host_state(); + init_vmcs_guest_state(guest_rip, guest_rsp); +} + +bool nested_vmx_supported(void) +{ + struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1); + + return entry->ecx & CPUID_VMX; +} + +void nested_vmx_check_supported(void) +{ + if (!nested_vmx_supported()) { + print_skip("nested VMX not enabled"); + exit(KSFT_SKIP); + } +} + +void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint32_t eptp_memslot) +{ + uint16_t index[4]; + struct eptPageTableEntry *pml4e; + + TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " + "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + + TEST_ASSERT((nested_paddr % vm->page_size) == 0, + "Nested physical address not on page boundary,\n" + " nested_paddr: 0x%lx vm->page_size: 0x%x", + nested_paddr, vm->page_size); + TEST_ASSERT((nested_paddr >> vm->page_shift) <= vm->max_gfn, + "Physical address beyond beyond maximum supported,\n" + " nested_paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", + paddr, vm->max_gfn, vm->page_size); + TEST_ASSERT((paddr % vm->page_size) == 0, + "Physical address not on page boundary,\n" + " paddr: 0x%lx vm->page_size: 0x%x", + paddr, vm->page_size); + TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, + "Physical address beyond beyond maximum supported,\n" + " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", + paddr, vm->max_gfn, vm->page_size); + + index[0] = (nested_paddr >> 12) & 0x1ffu; + index[1] = (nested_paddr >> 21) & 0x1ffu; + index[2] = (nested_paddr >> 30) & 0x1ffu; + index[3] = (nested_paddr >> 39) & 0x1ffu; + + /* Allocate page directory pointer table if not present. */ + pml4e = vmx->eptp_hva; + if (!pml4e[index[3]].readable) { + pml4e[index[3]].address = vm_phy_page_alloc(vm, + KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) + >> vm->page_shift; + pml4e[index[3]].writable = true; + pml4e[index[3]].readable = true; + pml4e[index[3]].executable = true; + } + + /* Allocate page directory table if not present. */ + struct eptPageTableEntry *pdpe; + pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size); + if (!pdpe[index[2]].readable) { + pdpe[index[2]].address = vm_phy_page_alloc(vm, + KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) + >> vm->page_shift; + pdpe[index[2]].writable = true; + pdpe[index[2]].readable = true; + pdpe[index[2]].executable = true; + } + + /* Allocate page table if not present. */ + struct eptPageTableEntry *pde; + pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size); + if (!pde[index[1]].readable) { + pde[index[1]].address = vm_phy_page_alloc(vm, + KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) + >> vm->page_shift; + pde[index[1]].writable = true; + pde[index[1]].readable = true; + pde[index[1]].executable = true; + } + + /* Fill in page table entry. */ + struct eptPageTableEntry *pte; + pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size); + pte[index[0]].address = paddr >> vm->page_shift; + pte[index[0]].writable = true; + pte[index[0]].readable = true; + pte[index[0]].executable = true; + + /* + * For now mark these as accessed and dirty because the only + * testcase we have needs that. Can be reconsidered later. + */ + pte[index[0]].accessed = true; + pte[index[0]].dirty = true; +} + +/* + * Map a range of EPT guest physical addresses to the VM's physical address + * + * Input Args: + * vm - Virtual Machine + * nested_paddr - Nested guest physical address to map + * paddr - VM Physical Address + * size - The size of the range to map + * eptp_memslot - Memory region slot for new virtual translation tables + * + * Output Args: None + * + * Return: None + * + * Within the VM given by vm, creates a nested guest translation for the + * page range starting at nested_paddr to the page range starting at paddr. + */ +void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint64_t size, + uint32_t eptp_memslot) +{ + size_t page_size = vm->page_size; + size_t npages = size / page_size; + + TEST_ASSERT(nested_paddr + size > nested_paddr, "Vaddr overflow"); + TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); + + while (npages--) { + nested_pg_map(vmx, vm, nested_paddr, paddr, eptp_memslot); + nested_paddr += page_size; + paddr += page_size; + } +} + +/* Prepare an identity extended page table that maps all the + * physical pages in VM. + */ +void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, + uint32_t memslot, uint32_t eptp_memslot) +{ + sparsebit_idx_t i, last; + struct userspace_mem_region *region = + memslot2region(vm, memslot); + + i = (region->region.guest_phys_addr >> vm->page_shift) - 1; + last = i + (region->region.memory_size >> vm->page_shift); + for (;;) { + i = sparsebit_next_clear(region->unused_phy_pages, i); + if (i > last) + break; + + nested_map(vmx, vm, + (uint64_t)i << vm->page_shift, + (uint64_t)i << vm->page_shift, + 1 << vm->page_shift, + eptp_memslot); + } +} + +void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, + uint32_t eptp_memslot) +{ + vmx->eptp = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp); + vmx->eptp_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->eptp); +} + +void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm, + uint32_t eptp_memslot) +{ + vmx->apic_access = (void *)vm_vaddr_alloc(vm, getpagesize(), + 0x10000, 0, 0); + vmx->apic_access_hva = addr_gva2hva(vm, (uintptr_t)vmx->apic_access); + vmx->apic_access_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->apic_access); +} diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c new file mode 100644 index 000000000..9f49ead38 --- /dev/null +++ b/tools/testing/selftests/kvm/s390x/memop.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Test for s390x KVM_S390_MEM_OP + * + * Copyright (C) 2019, Red Hat, Inc. + */ + +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" + +#define VCPU_ID 1 + +static uint8_t mem1[65536]; +static uint8_t mem2[65536]; + +static void guest_code(void) +{ + int i; + + for (;;) { + for (i = 0; i < sizeof(mem2); i++) + mem2[i] = mem1[i]; + GUEST_SYNC(0); + } +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_run *run; + struct kvm_s390_mem_op ksmo; + int rv, i, maxsize; + + setbuf(stdout, NULL); /* Tell stdout not to buffer its content */ + + maxsize = kvm_check_cap(KVM_CAP_S390_MEM_OP); + if (!maxsize) { + print_skip("CAP_S390_MEM_OP not supported"); + exit(KSFT_SKIP); + } + if (maxsize > sizeof(mem1)) + maxsize = sizeof(mem1); + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + run = vcpu_state(vm, VCPU_ID); + + for (i = 0; i < sizeof(mem1); i++) + mem1[i] = i * i + i; + + /* Set the first array */ + ksmo.gaddr = addr_gva2gpa(vm, (uintptr_t)mem1); + ksmo.flags = 0; + ksmo.size = maxsize; + ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; + ksmo.buf = (uintptr_t)mem1; + ksmo.ar = 0; + vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); + + /* Let the guest code copy the first array to the second */ + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC, + "Unexpected exit reason: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + memset(mem2, 0xaa, sizeof(mem2)); + + /* Get the second array */ + ksmo.gaddr = (uintptr_t)mem2; + ksmo.flags = 0; + ksmo.size = maxsize; + ksmo.op = KVM_S390_MEMOP_LOGICAL_READ; + ksmo.buf = (uintptr_t)mem2; + ksmo.ar = 0; + vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); + + TEST_ASSERT(!memcmp(mem1, mem2, maxsize), + "Memory contents do not match!"); + + /* Check error conditions - first bad size: */ + ksmo.gaddr = (uintptr_t)mem1; + ksmo.flags = 0; + ksmo.size = -1; + ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; + ksmo.buf = (uintptr_t)mem1; + ksmo.ar = 0; + rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); + TEST_ASSERT(rv == -1 && errno == E2BIG, "ioctl allows insane sizes"); + + /* Zero size: */ + ksmo.gaddr = (uintptr_t)mem1; + ksmo.flags = 0; + ksmo.size = 0; + ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; + ksmo.buf = (uintptr_t)mem1; + ksmo.ar = 0; + rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); + TEST_ASSERT(rv == -1 && (errno == EINVAL || errno == ENOMEM), + "ioctl allows 0 as size"); + + /* Bad flags: */ + ksmo.gaddr = (uintptr_t)mem1; + ksmo.flags = -1; + ksmo.size = maxsize; + ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; + ksmo.buf = (uintptr_t)mem1; + ksmo.ar = 0; + rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); + TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows all flags"); + + /* Bad operation: */ + ksmo.gaddr = (uintptr_t)mem1; + ksmo.flags = 0; + ksmo.size = maxsize; + ksmo.op = -1; + ksmo.buf = (uintptr_t)mem1; + ksmo.ar = 0; + rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); + TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows bad operations"); + + /* Bad guest address: */ + ksmo.gaddr = ~0xfffUL; + ksmo.flags = KVM_S390_MEMOP_F_CHECK_ONLY; + ksmo.size = maxsize; + ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; + ksmo.buf = (uintptr_t)mem1; + ksmo.ar = 0; + rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); + TEST_ASSERT(rv > 0, "ioctl does not report bad guest memory access"); + + /* Bad host address: */ + ksmo.gaddr = (uintptr_t)mem1; + ksmo.flags = 0; + ksmo.size = maxsize; + ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; + ksmo.buf = 0; + ksmo.ar = 0; + rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); + TEST_ASSERT(rv == -1 && errno == EFAULT, + "ioctl does not report bad host memory address"); + + /* Bad access register: */ + run->psw_mask &= ~(3UL << (63 - 17)); + run->psw_mask |= 1UL << (63 - 17); /* Enable AR mode */ + vcpu_run(vm, VCPU_ID); /* To sync new state to SIE block */ + ksmo.gaddr = (uintptr_t)mem1; + ksmo.flags = 0; + ksmo.size = maxsize; + ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; + ksmo.buf = (uintptr_t)mem1; + ksmo.ar = 17; + rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); + TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows ARs > 15"); + run->psw_mask &= ~(3UL << (63 - 17)); /* Disable AR mode */ + vcpu_run(vm, VCPU_ID); /* Run to sync new state */ + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/s390x/resets.c b/tools/testing/selftests/kvm/s390x/resets.c new file mode 100644 index 000000000..b143db6d8 --- /dev/null +++ b/tools/testing/selftests/kvm/s390x/resets.c @@ -0,0 +1,279 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Test for s390x CPU resets + * + * Copyright (C) 2020, IBM + */ + +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" + +#define VCPU_ID 3 +#define LOCAL_IRQS 32 + +struct kvm_s390_irq buf[VCPU_ID + LOCAL_IRQS]; + +struct kvm_vm *vm; +struct kvm_run *run; +struct kvm_sync_regs *sync_regs; +static uint8_t regs_null[512]; + +static void guest_code_initial(void) +{ + /* set several CRs to "safe" value */ + unsigned long cr2_59 = 0x10; /* enable guarded storage */ + unsigned long cr8_63 = 0x1; /* monitor mask = 1 */ + unsigned long cr10 = 1; /* PER START */ + unsigned long cr11 = -1; /* PER END */ + + + /* Dirty registers */ + asm volatile ( + " lghi 2,0x11\n" /* Round toward 0 */ + " sfpc 2\n" /* set fpc to !=0 */ + " lctlg 2,2,%0\n" + " lctlg 8,8,%1\n" + " lctlg 10,10,%2\n" + " lctlg 11,11,%3\n" + /* now clobber some general purpose regs */ + " llihh 0,0xffff\n" + " llihl 1,0x5555\n" + " llilh 2,0xaaaa\n" + " llill 3,0x0000\n" + /* now clobber a floating point reg */ + " lghi 4,0x1\n" + " cdgbr 0,4\n" + /* now clobber an access reg */ + " sar 9,4\n" + /* We embed diag 501 here to control register content */ + " diag 0,0,0x501\n" + : + : "m" (cr2_59), "m" (cr8_63), "m" (cr10), "m" (cr11) + /* no clobber list as this should not return */ + ); +} + +static void test_one_reg(uint64_t id, uint64_t value) +{ + struct kvm_one_reg reg; + uint64_t eval_reg; + + reg.addr = (uintptr_t)&eval_reg; + reg.id = id; + vcpu_get_reg(vm, VCPU_ID, ®); + TEST_ASSERT(eval_reg == value, "value == 0x%lx", value); +} + +static void assert_noirq(void) +{ + struct kvm_s390_irq_state irq_state; + int irqs; + + irq_state.len = sizeof(buf); + irq_state.buf = (unsigned long)buf; + irqs = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_GET_IRQ_STATE, &irq_state); + /* + * irqs contains the number of retrieved interrupts. Any interrupt + * (notably, the emergency call interrupt we have injected) should + * be cleared by the resets, so this should be 0. + */ + TEST_ASSERT(irqs >= 0, "Could not fetch IRQs: errno %d\n", errno); + TEST_ASSERT(!irqs, "IRQ pending"); +} + +static void assert_clear(void) +{ + struct kvm_sregs sregs; + struct kvm_regs regs; + struct kvm_fpu fpu; + + vcpu_regs_get(vm, VCPU_ID, ®s); + TEST_ASSERT(!memcmp(®s.gprs, regs_null, sizeof(regs.gprs)), "grs == 0"); + + vcpu_sregs_get(vm, VCPU_ID, &sregs); + TEST_ASSERT(!memcmp(&sregs.acrs, regs_null, sizeof(sregs.acrs)), "acrs == 0"); + + vcpu_fpu_get(vm, VCPU_ID, &fpu); + TEST_ASSERT(!memcmp(&fpu.fprs, regs_null, sizeof(fpu.fprs)), "fprs == 0"); + + /* sync regs */ + TEST_ASSERT(!memcmp(sync_regs->gprs, regs_null, sizeof(sync_regs->gprs)), + "gprs0-15 == 0 (sync_regs)"); + + TEST_ASSERT(!memcmp(sync_regs->acrs, regs_null, sizeof(sync_regs->acrs)), + "acrs0-15 == 0 (sync_regs)"); + + TEST_ASSERT(!memcmp(sync_regs->vrs, regs_null, sizeof(sync_regs->vrs)), + "vrs0-15 == 0 (sync_regs)"); +} + +static void assert_initial_noclear(void) +{ + TEST_ASSERT(sync_regs->gprs[0] == 0xffff000000000000UL, + "gpr0 == 0xffff000000000000 (sync_regs)"); + TEST_ASSERT(sync_regs->gprs[1] == 0x0000555500000000UL, + "gpr1 == 0x0000555500000000 (sync_regs)"); + TEST_ASSERT(sync_regs->gprs[2] == 0x00000000aaaa0000UL, + "gpr2 == 0x00000000aaaa0000 (sync_regs)"); + TEST_ASSERT(sync_regs->gprs[3] == 0x0000000000000000UL, + "gpr3 == 0x0000000000000000 (sync_regs)"); + TEST_ASSERT(sync_regs->fprs[0] == 0x3ff0000000000000UL, + "fpr0 == 0f1 (sync_regs)"); + TEST_ASSERT(sync_regs->acrs[9] == 1, "ar9 == 1 (sync_regs)"); +} + +static void assert_initial(void) +{ + struct kvm_sregs sregs; + struct kvm_fpu fpu; + + /* KVM_GET_SREGS */ + vcpu_sregs_get(vm, VCPU_ID, &sregs); + TEST_ASSERT(sregs.crs[0] == 0xE0UL, "cr0 == 0xE0 (KVM_GET_SREGS)"); + TEST_ASSERT(sregs.crs[14] == 0xC2000000UL, + "cr14 == 0xC2000000 (KVM_GET_SREGS)"); + TEST_ASSERT(!memcmp(&sregs.crs[1], regs_null, sizeof(sregs.crs[1]) * 12), + "cr1-13 == 0 (KVM_GET_SREGS)"); + TEST_ASSERT(sregs.crs[15] == 0, "cr15 == 0 (KVM_GET_SREGS)"); + + /* sync regs */ + TEST_ASSERT(sync_regs->crs[0] == 0xE0UL, "cr0 == 0xE0 (sync_regs)"); + TEST_ASSERT(sync_regs->crs[14] == 0xC2000000UL, + "cr14 == 0xC2000000 (sync_regs)"); + TEST_ASSERT(!memcmp(&sync_regs->crs[1], regs_null, 8 * 12), + "cr1-13 == 0 (sync_regs)"); + TEST_ASSERT(sync_regs->crs[15] == 0, "cr15 == 0 (sync_regs)"); + TEST_ASSERT(sync_regs->fpc == 0, "fpc == 0 (sync_regs)"); + TEST_ASSERT(sync_regs->todpr == 0, "todpr == 0 (sync_regs)"); + TEST_ASSERT(sync_regs->cputm == 0, "cputm == 0 (sync_regs)"); + TEST_ASSERT(sync_regs->ckc == 0, "ckc == 0 (sync_regs)"); + TEST_ASSERT(sync_regs->pp == 0, "pp == 0 (sync_regs)"); + TEST_ASSERT(sync_regs->gbea == 1, "gbea == 1 (sync_regs)"); + + /* kvm_run */ + TEST_ASSERT(run->psw_addr == 0, "psw_addr == 0 (kvm_run)"); + TEST_ASSERT(run->psw_mask == 0, "psw_mask == 0 (kvm_run)"); + + vcpu_fpu_get(vm, VCPU_ID, &fpu); + TEST_ASSERT(!fpu.fpc, "fpc == 0"); + + test_one_reg(KVM_REG_S390_GBEA, 1); + test_one_reg(KVM_REG_S390_PP, 0); + test_one_reg(KVM_REG_S390_TODPR, 0); + test_one_reg(KVM_REG_S390_CPU_TIMER, 0); + test_one_reg(KVM_REG_S390_CLOCK_COMP, 0); +} + +static void assert_normal_noclear(void) +{ + TEST_ASSERT(sync_regs->crs[2] == 0x10, "cr2 == 10 (sync_regs)"); + TEST_ASSERT(sync_regs->crs[8] == 1, "cr10 == 1 (sync_regs)"); + TEST_ASSERT(sync_regs->crs[10] == 1, "cr10 == 1 (sync_regs)"); + TEST_ASSERT(sync_regs->crs[11] == -1, "cr11 == -1 (sync_regs)"); +} + +static void assert_normal(void) +{ + test_one_reg(KVM_REG_S390_PFTOKEN, KVM_S390_PFAULT_TOKEN_INVALID); + TEST_ASSERT(sync_regs->pft == KVM_S390_PFAULT_TOKEN_INVALID, + "pft == 0xff..... (sync_regs)"); + assert_noirq(); +} + +static void inject_irq(int cpu_id) +{ + struct kvm_s390_irq_state irq_state; + struct kvm_s390_irq *irq = &buf[0]; + int irqs; + + /* Inject IRQ */ + irq_state.len = sizeof(struct kvm_s390_irq); + irq_state.buf = (unsigned long)buf; + irq->type = KVM_S390_INT_EMERGENCY; + irq->u.emerg.code = cpu_id; + irqs = _vcpu_ioctl(vm, cpu_id, KVM_S390_SET_IRQ_STATE, &irq_state); + TEST_ASSERT(irqs >= 0, "Error injecting EMERGENCY IRQ errno %d\n", errno); +} + +static void test_normal(void) +{ + pr_info("Testing normal reset\n"); + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code_initial); + run = vcpu_state(vm, VCPU_ID); + sync_regs = &run->s.regs; + + vcpu_run(vm, VCPU_ID); + + inject_irq(VCPU_ID); + + vcpu_ioctl(vm, VCPU_ID, KVM_S390_NORMAL_RESET, 0); + + /* must clears */ + assert_normal(); + /* must not clears */ + assert_normal_noclear(); + assert_initial_noclear(); + + kvm_vm_free(vm); +} + +static void test_initial(void) +{ + pr_info("Testing initial reset\n"); + vm = vm_create_default(VCPU_ID, 0, guest_code_initial); + run = vcpu_state(vm, VCPU_ID); + sync_regs = &run->s.regs; + + vcpu_run(vm, VCPU_ID); + + inject_irq(VCPU_ID); + + vcpu_ioctl(vm, VCPU_ID, KVM_S390_INITIAL_RESET, 0); + + /* must clears */ + assert_normal(); + assert_initial(); + /* must not clears */ + assert_initial_noclear(); + + kvm_vm_free(vm); +} + +static void test_clear(void) +{ + pr_info("Testing clear reset\n"); + vm = vm_create_default(VCPU_ID, 0, guest_code_initial); + run = vcpu_state(vm, VCPU_ID); + sync_regs = &run->s.regs; + + vcpu_run(vm, VCPU_ID); + + inject_irq(VCPU_ID); + + vcpu_ioctl(vm, VCPU_ID, KVM_S390_CLEAR_RESET, 0); + + /* must clears */ + assert_normal(); + assert_initial(); + assert_clear(); + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + setbuf(stdout, NULL); /* Tell stdout not to buffer its content */ + + test_initial(); + if (kvm_check_cap(KVM_CAP_S390_VCPU_RESETS)) { + test_normal(); + test_clear(); + } + return 0; +} diff --git a/tools/testing/selftests/kvm/s390x/sync_regs_test.c b/tools/testing/selftests/kvm/s390x/sync_regs_test.c new file mode 100644 index 000000000..5731ccf34 --- /dev/null +++ b/tools/testing/selftests/kvm/s390x/sync_regs_test.c @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test for s390x KVM_CAP_SYNC_REGS + * + * Based on the same test for x86: + * Copyright (C) 2018, Google LLC. + * + * Adaptions for s390x: + * Copyright (C) 2019, Red Hat, Inc. + * + * Test expected behavior of the KVM_CAP_SYNC_REGS functionality. + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" + +#define VCPU_ID 5 + +static void guest_code(void) +{ + /* + * We embed diag 501 here instead of doing a ucall to avoid that + * the compiler has messed with r11 at the time of the ucall. + */ + asm volatile ( + "0: diag 0,0,0x501\n" + " ahi 11,1\n" + " j 0b\n" + ); +} + +#define REG_COMPARE(reg) \ + TEST_ASSERT(left->reg == right->reg, \ + "Register " #reg \ + " values did not match: 0x%llx, 0x%llx\n", \ + left->reg, right->reg) + +#define REG_COMPARE32(reg) \ + TEST_ASSERT(left->reg == right->reg, \ + "Register " #reg \ + " values did not match: 0x%x, 0x%x\n", \ + left->reg, right->reg) + + +static void compare_regs(struct kvm_regs *left, struct kvm_sync_regs *right) +{ + int i; + + for (i = 0; i < 16; i++) + REG_COMPARE(gprs[i]); +} + +static void compare_sregs(struct kvm_sregs *left, struct kvm_sync_regs *right) +{ + int i; + + for (i = 0; i < 16; i++) + REG_COMPARE32(acrs[i]); + + for (i = 0; i < 16; i++) + REG_COMPARE(crs[i]); +} + +#undef REG_COMPARE + +#define TEST_SYNC_FIELDS (KVM_SYNC_GPRS|KVM_SYNC_ACRS|KVM_SYNC_CRS) +#define INVALID_SYNC_FIELD 0x80000000 + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_run *run; + struct kvm_regs regs; + struct kvm_sregs sregs; + int rv, cap; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + cap = kvm_check_cap(KVM_CAP_SYNC_REGS); + if (!cap) { + print_skip("CAP_SYNC_REGS not supported"); + exit(KSFT_SKIP); + } + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + + run = vcpu_state(vm, VCPU_ID); + + /* Request reading invalid register set from VCPU. */ + run->kvm_valid_regs = INVALID_SYNC_FIELD; + rv = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(rv < 0 && errno == EINVAL, + "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", + rv); + vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0; + + run->kvm_valid_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; + rv = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(rv < 0 && errno == EINVAL, + "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", + rv); + vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0; + + /* Request setting invalid register set into VCPU. */ + run->kvm_dirty_regs = INVALID_SYNC_FIELD; + rv = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(rv < 0 && errno == EINVAL, + "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", + rv); + vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0; + + run->kvm_dirty_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; + rv = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(rv < 0 && errno == EINVAL, + "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", + rv); + vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0; + + /* Request and verify all valid register sets. */ + run->kvm_valid_regs = TEST_SYNC_FIELDS; + rv = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv); + TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC, + "Unexpected exit reason: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + TEST_ASSERT(run->s390_sieic.icptcode == 4 && + (run->s390_sieic.ipa >> 8) == 0x83 && + (run->s390_sieic.ipb >> 16) == 0x501, + "Unexpected interception code: ic=%u, ipa=0x%x, ipb=0x%x\n", + run->s390_sieic.icptcode, run->s390_sieic.ipa, + run->s390_sieic.ipb); + + vcpu_regs_get(vm, VCPU_ID, ®s); + compare_regs(®s, &run->s.regs); + + vcpu_sregs_get(vm, VCPU_ID, &sregs); + compare_sregs(&sregs, &run->s.regs); + + /* Set and verify various register values */ + run->s.regs.gprs[11] = 0xBAD1DEA; + run->s.regs.acrs[0] = 1 << 11; + + run->kvm_valid_regs = TEST_SYNC_FIELDS; + run->kvm_dirty_regs = KVM_SYNC_GPRS | KVM_SYNC_ACRS; + rv = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv); + TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC, + "Unexpected exit reason: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + TEST_ASSERT(run->s.regs.gprs[11] == 0xBAD1DEA + 1, + "r11 sync regs value incorrect 0x%llx.", + run->s.regs.gprs[11]); + TEST_ASSERT(run->s.regs.acrs[0] == 1 << 11, + "acr0 sync regs value incorrect 0x%x.", + run->s.regs.acrs[0]); + + vcpu_regs_get(vm, VCPU_ID, ®s); + compare_regs(®s, &run->s.regs); + + vcpu_sregs_get(vm, VCPU_ID, &sregs); + compare_sregs(&sregs, &run->s.regs); + + /* Clear kvm_dirty_regs bits, verify new s.regs values are + * overwritten with existing guest values. + */ + run->kvm_valid_regs = TEST_SYNC_FIELDS; + run->kvm_dirty_regs = 0; + run->s.regs.gprs[11] = 0xDEADBEEF; + rv = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv); + TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC, + "Unexpected exit reason: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + TEST_ASSERT(run->s.regs.gprs[11] != 0xDEADBEEF, + "r11 sync regs value incorrect 0x%llx.", + run->s.regs.gprs[11]); + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c new file mode 100644 index 000000000..6f441dd9f --- /dev/null +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -0,0 +1,417 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#define VCPU_ID 0 + +/* + * s390x needs at least 1MB alignment, and the x86_64 MOVE/DELETE tests need a + * 2MB sized and aligned region so that the initial region corresponds to + * exactly one large page. + */ +#define MEM_REGION_SIZE 0x200000 + +#ifdef __x86_64__ +/* + * Somewhat arbitrary location and slot, intended to not overlap anything. + */ +#define MEM_REGION_GPA 0xc0000000 +#define MEM_REGION_SLOT 10 + +static const uint64_t MMIO_VAL = 0xbeefull; + +extern const uint64_t final_rip_start; +extern const uint64_t final_rip_end; + +static sem_t vcpu_ready; + +static inline uint64_t guest_spin_on_val(uint64_t spin_val) +{ + uint64_t val; + + do { + val = READ_ONCE(*((uint64_t *)MEM_REGION_GPA)); + } while (val == spin_val); + + GUEST_SYNC(0); + return val; +} + +static void *vcpu_worker(void *data) +{ + struct kvm_vm *vm = data; + struct kvm_run *run; + struct ucall uc; + uint64_t cmd; + + /* + * Loop until the guest is done. Re-enter the guest on all MMIO exits, + * which will occur if the guest attempts to access a memslot after it + * has been deleted or while it is being moved . + */ + run = vcpu_state(vm, VCPU_ID); + + while (1) { + vcpu_run(vm, VCPU_ID); + + if (run->exit_reason == KVM_EXIT_IO) { + cmd = get_ucall(vm, VCPU_ID, &uc); + if (cmd != UCALL_SYNC) + break; + + sem_post(&vcpu_ready); + continue; + } + + if (run->exit_reason != KVM_EXIT_MMIO) + break; + + TEST_ASSERT(!run->mmio.is_write, "Unexpected exit mmio write"); + TEST_ASSERT(run->mmio.len == 8, + "Unexpected exit mmio size = %u", run->mmio.len); + + TEST_ASSERT(run->mmio.phys_addr == MEM_REGION_GPA, + "Unexpected exit mmio address = 0x%llx", + run->mmio.phys_addr); + memcpy(run->mmio.data, &MMIO_VAL, 8); + } + + if (run->exit_reason == KVM_EXIT_IO && cmd == UCALL_ABORT) + TEST_FAIL("%s at %s:%ld, val = %lu", (const char *)uc.args[0], + __FILE__, uc.args[1], uc.args[2]); + + return NULL; +} + +static void wait_for_vcpu(void) +{ + struct timespec ts; + + TEST_ASSERT(!clock_gettime(CLOCK_REALTIME, &ts), + "clock_gettime() failed: %d\n", errno); + + ts.tv_sec += 2; + TEST_ASSERT(!sem_timedwait(&vcpu_ready, &ts), + "sem_timedwait() failed: %d\n", errno); + + /* Wait for the vCPU thread to reenter the guest. */ + usleep(100000); +} + +static struct kvm_vm *spawn_vm(pthread_t *vcpu_thread, void *guest_code) +{ + struct kvm_vm *vm; + uint64_t *hva; + uint64_t gpa; + + vm = vm_create_default(VCPU_ID, 0, guest_code); + + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_THP, + MEM_REGION_GPA, MEM_REGION_SLOT, + MEM_REGION_SIZE / getpagesize(), 0); + + /* + * Allocate and map two pages so that the GPA accessed by guest_code() + * stays valid across the memslot move. + */ + gpa = vm_phy_pages_alloc(vm, 2, MEM_REGION_GPA, MEM_REGION_SLOT); + TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc\n"); + + virt_map(vm, MEM_REGION_GPA, MEM_REGION_GPA, 2, 0); + + /* Ditto for the host mapping so that both pages can be zeroed. */ + hva = addr_gpa2hva(vm, MEM_REGION_GPA); + memset(hva, 0, 2 * 4096); + + pthread_create(vcpu_thread, NULL, vcpu_worker, vm); + + /* Ensure the guest thread is spun up. */ + wait_for_vcpu(); + + return vm; +} + + +static void guest_code_move_memory_region(void) +{ + uint64_t val; + + GUEST_SYNC(0); + + /* + * Spin until the memory region starts getting moved to a + * misaligned address. + * Every region move may or may not trigger MMIO, as the + * window where the memslot is invalid is usually quite small. + */ + val = guest_spin_on_val(0); + GUEST_ASSERT_1(val == 1 || val == MMIO_VAL, val); + + /* Spin until the misaligning memory region move completes. */ + val = guest_spin_on_val(MMIO_VAL); + GUEST_ASSERT_1(val == 1 || val == 0, val); + + /* Spin until the memory region starts to get re-aligned. */ + val = guest_spin_on_val(0); + GUEST_ASSERT_1(val == 1 || val == MMIO_VAL, val); + + /* Spin until the re-aligning memory region move completes. */ + val = guest_spin_on_val(MMIO_VAL); + GUEST_ASSERT_1(val == 1, val); + + GUEST_DONE(); +} + +static void test_move_memory_region(void) +{ + pthread_t vcpu_thread; + struct kvm_vm *vm; + uint64_t *hva; + + vm = spawn_vm(&vcpu_thread, guest_code_move_memory_region); + + hva = addr_gpa2hva(vm, MEM_REGION_GPA); + + /* + * Shift the region's base GPA. The guest should not see "2" as the + * hva->gpa translation is misaligned, i.e. the guest is accessing a + * different host pfn. + */ + vm_mem_region_move(vm, MEM_REGION_SLOT, MEM_REGION_GPA - 4096); + WRITE_ONCE(*hva, 2); + + /* + * The guest _might_ see an invalid memslot and trigger MMIO, but it's + * a tiny window. Spin and defer the sync until the memslot is + * restored and guest behavior is once again deterministic. + */ + usleep(100000); + + /* + * Note, value in memory needs to be changed *before* restoring the + * memslot, else the guest could race the update and see "2". + */ + WRITE_ONCE(*hva, 1); + + /* Restore the original base, the guest should see "1". */ + vm_mem_region_move(vm, MEM_REGION_SLOT, MEM_REGION_GPA); + wait_for_vcpu(); + /* Defered sync from when the memslot was misaligned (above). */ + wait_for_vcpu(); + + pthread_join(vcpu_thread, NULL); + + kvm_vm_free(vm); +} + +static void guest_code_delete_memory_region(void) +{ + uint64_t val; + + GUEST_SYNC(0); + + /* Spin until the memory region is deleted. */ + val = guest_spin_on_val(0); + GUEST_ASSERT_1(val == MMIO_VAL, val); + + /* Spin until the memory region is recreated. */ + val = guest_spin_on_val(MMIO_VAL); + GUEST_ASSERT_1(val == 0, val); + + /* Spin until the memory region is deleted. */ + val = guest_spin_on_val(0); + GUEST_ASSERT_1(val == MMIO_VAL, val); + + asm("1:\n\t" + ".pushsection .rodata\n\t" + ".global final_rip_start\n\t" + "final_rip_start: .quad 1b\n\t" + ".popsection"); + + /* Spin indefinitely (until the code memslot is deleted). */ + guest_spin_on_val(MMIO_VAL); + + asm("1:\n\t" + ".pushsection .rodata\n\t" + ".global final_rip_end\n\t" + "final_rip_end: .quad 1b\n\t" + ".popsection"); + + GUEST_ASSERT_1(0, 0); +} + +static void test_delete_memory_region(void) +{ + pthread_t vcpu_thread; + struct kvm_regs regs; + struct kvm_run *run; + struct kvm_vm *vm; + + vm = spawn_vm(&vcpu_thread, guest_code_delete_memory_region); + + /* Delete the memory region, the guest should not die. */ + vm_mem_region_delete(vm, MEM_REGION_SLOT); + wait_for_vcpu(); + + /* Recreate the memory region. The guest should see "0". */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_THP, + MEM_REGION_GPA, MEM_REGION_SLOT, + MEM_REGION_SIZE / getpagesize(), 0); + wait_for_vcpu(); + + /* Delete the region again so that there's only one memslot left. */ + vm_mem_region_delete(vm, MEM_REGION_SLOT); + wait_for_vcpu(); + + /* + * Delete the primary memslot. This should cause an emulation error or + * shutdown due to the page tables getting nuked. + */ + vm_mem_region_delete(vm, 0); + + pthread_join(vcpu_thread, NULL); + + run = vcpu_state(vm, VCPU_ID); + + TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN || + run->exit_reason == KVM_EXIT_INTERNAL_ERROR, + "Unexpected exit reason = %d", run->exit_reason); + + vcpu_regs_get(vm, VCPU_ID, ®s); + + /* + * On AMD, after KVM_EXIT_SHUTDOWN the VMCB has been reinitialized already, + * so the instruction pointer would point to the reset vector. + */ + if (run->exit_reason == KVM_EXIT_INTERNAL_ERROR) + TEST_ASSERT(regs.rip >= final_rip_start && + regs.rip < final_rip_end, + "Bad rip, expected 0x%lx - 0x%lx, got 0x%llx\n", + final_rip_start, final_rip_end, regs.rip); + + kvm_vm_free(vm); +} + +static void test_zero_memory_regions(void) +{ + struct kvm_run *run; + struct kvm_vm *vm; + + pr_info("Testing KVM_RUN with zero added memory regions\n"); + + vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); + vm_vcpu_add(vm, VCPU_ID); + + TEST_ASSERT(!ioctl(vm_get_fd(vm), KVM_SET_NR_MMU_PAGES, 64), + "KVM_SET_NR_MMU_PAGES failed, errno = %d\n", errno); + vcpu_run(vm, VCPU_ID); + + run = vcpu_state(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR, + "Unexpected exit_reason = %u\n", run->exit_reason); + + kvm_vm_free(vm); +} +#endif /* __x86_64__ */ + +/* + * Test it can be added memory slots up to KVM_CAP_NR_MEMSLOTS, then any + * tentative to add further slots should fail. + */ +static void test_add_max_memory_regions(void) +{ + int ret; + struct kvm_vm *vm; + uint32_t max_mem_slots; + uint32_t slot; + uint64_t guest_addr = 0x0; + uint64_t mem_reg_npages; + void *mem; + + max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS); + TEST_ASSERT(max_mem_slots > 0, + "KVM_CAP_NR_MEMSLOTS should be greater than 0"); + pr_info("Allowed number of memory slots: %i\n", max_mem_slots); + + vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); + + mem_reg_npages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, MEM_REGION_SIZE); + + /* Check it can be added memory slots up to the maximum allowed */ + pr_info("Adding slots 0..%i, each memory region with %dK size\n", + (max_mem_slots - 1), MEM_REGION_SIZE >> 10); + for (slot = 0; slot < max_mem_slots; slot++) { + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + guest_addr, slot, mem_reg_npages, + 0); + guest_addr += MEM_REGION_SIZE; + } + + /* Check it cannot be added memory slots beyond the limit */ + mem = mmap(NULL, MEM_REGION_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + TEST_ASSERT(mem != MAP_FAILED, "Failed to mmap() host"); + + ret = ioctl(vm_get_fd(vm), KVM_SET_USER_MEMORY_REGION, + &(struct kvm_userspace_memory_region) {slot, 0, guest_addr, + MEM_REGION_SIZE, (uint64_t) mem}); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "Adding one more memory slot should fail with EINVAL"); + + munmap(mem, MEM_REGION_SIZE); + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ +#ifdef __x86_64__ + int i, loops; +#endif + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + +#ifdef __x86_64__ + /* + * FIXME: the zero-memslot test fails on aarch64 and s390x because + * KVM_RUN fails with ENOEXEC or EFAULT. + */ + test_zero_memory_regions(); +#endif + + test_add_max_memory_regions(); + +#ifdef __x86_64__ + if (argc > 1) + loops = atoi(argv[1]); + else + loops = 10; + + pr_info("Testing MOVE of in-use region, %d loops\n", loops); + for (i = 0; i < loops; i++) + test_move_memory_region(); + + pr_info("Testing DELETE of in-use region, %d loops\n", loops); + for (i = 0; i < loops; i++) + test_delete_memory_region(); +#endif + + return 0; +} diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c new file mode 100644 index 000000000..7daedee3e --- /dev/null +++ b/tools/testing/selftests/kvm/steal_time.c @@ -0,0 +1,352 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * steal/stolen time test + * + * Copyright (C) 2020, Red Hat, Inc. + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#define NR_VCPUS 4 +#define ST_GPA_BASE (1 << 30) +#define MIN_RUN_DELAY_NS 200000UL + +static void *st_gva[NR_VCPUS]; +static uint64_t guest_stolen_time[NR_VCPUS]; + +#if defined(__x86_64__) + +/* steal_time must have 64-byte alignment */ +#define STEAL_TIME_SIZE ((sizeof(struct kvm_steal_time) + 63) & ~63) + +static void check_status(struct kvm_steal_time *st) +{ + GUEST_ASSERT(!(READ_ONCE(st->version) & 1)); + GUEST_ASSERT(READ_ONCE(st->flags) == 0); + GUEST_ASSERT(READ_ONCE(st->preempted) == 0); +} + +static void guest_code(int cpu) +{ + struct kvm_steal_time *st = st_gva[cpu]; + uint32_t version; + + GUEST_ASSERT(rdmsr(MSR_KVM_STEAL_TIME) == ((uint64_t)st_gva[cpu] | KVM_MSR_ENABLED)); + + memset(st, 0, sizeof(*st)); + GUEST_SYNC(0); + + check_status(st); + WRITE_ONCE(guest_stolen_time[cpu], st->steal); + version = READ_ONCE(st->version); + check_status(st); + GUEST_SYNC(1); + + check_status(st); + GUEST_ASSERT(version < READ_ONCE(st->version)); + WRITE_ONCE(guest_stolen_time[cpu], st->steal); + check_status(st); + GUEST_DONE(); +} + +static void steal_time_init(struct kvm_vm *vm) +{ + int i; + + if (!(kvm_get_supported_cpuid_entry(KVM_CPUID_FEATURES)->eax & + KVM_FEATURE_STEAL_TIME)) { + print_skip("steal-time not supported"); + exit(KSFT_SKIP); + } + + for (i = 0; i < NR_VCPUS; ++i) { + int ret; + + vcpu_set_cpuid(vm, i, kvm_get_supported_cpuid()); + + /* ST_GPA_BASE is identity mapped */ + st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE); + sync_global_to_guest(vm, st_gva[i]); + + ret = _vcpu_set_msr(vm, i, MSR_KVM_STEAL_TIME, (ulong)st_gva[i] | KVM_STEAL_RESERVED_MASK); + TEST_ASSERT(ret == 0, "Bad GPA didn't fail"); + + vcpu_set_msr(vm, i, MSR_KVM_STEAL_TIME, (ulong)st_gva[i] | KVM_MSR_ENABLED); + } +} + +static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct kvm_steal_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpuid]); + int i; + + pr_info("VCPU%d:\n", vcpuid); + pr_info(" steal: %lld\n", st->steal); + pr_info(" version: %d\n", st->version); + pr_info(" flags: %d\n", st->flags); + pr_info(" preempted: %d\n", st->preempted); + pr_info(" u8_pad: "); + for (i = 0; i < 3; ++i) + pr_info("%d", st->u8_pad[i]); + pr_info("\n pad: "); + for (i = 0; i < 11; ++i) + pr_info("%d", st->pad[i]); + pr_info("\n"); +} + +#elif defined(__aarch64__) + +/* PV_TIME_ST must have 64-byte alignment */ +#define STEAL_TIME_SIZE ((sizeof(struct st_time) + 63) & ~63) + +#define SMCCC_ARCH_FEATURES 0x80000001 +#define PV_TIME_FEATURES 0xc5000020 +#define PV_TIME_ST 0xc5000021 + +struct st_time { + uint32_t rev; + uint32_t attr; + uint64_t st_time; +}; + +static int64_t smccc(uint32_t func, uint64_t arg) +{ + unsigned long ret; + + asm volatile( + "mov w0, %w1\n" + "mov x1, %2\n" + "hvc #0\n" + "mov %0, x0\n" + : "=r" (ret) : "r" (func), "r" (arg) : + "x0", "x1", "x2", "x3"); + + return ret; +} + +static void check_status(struct st_time *st) +{ + GUEST_ASSERT(READ_ONCE(st->rev) == 0); + GUEST_ASSERT(READ_ONCE(st->attr) == 0); +} + +static void guest_code(int cpu) +{ + struct st_time *st; + int64_t status; + + status = smccc(SMCCC_ARCH_FEATURES, PV_TIME_FEATURES); + GUEST_ASSERT(status == 0); + status = smccc(PV_TIME_FEATURES, PV_TIME_FEATURES); + GUEST_ASSERT(status == 0); + status = smccc(PV_TIME_FEATURES, PV_TIME_ST); + GUEST_ASSERT(status == 0); + + status = smccc(PV_TIME_ST, 0); + GUEST_ASSERT(status != -1); + GUEST_ASSERT(status == (ulong)st_gva[cpu]); + + st = (struct st_time *)status; + GUEST_SYNC(0); + + check_status(st); + WRITE_ONCE(guest_stolen_time[cpu], st->st_time); + GUEST_SYNC(1); + + check_status(st); + WRITE_ONCE(guest_stolen_time[cpu], st->st_time); + GUEST_DONE(); +} + +static void steal_time_init(struct kvm_vm *vm) +{ + struct kvm_device_attr dev = { + .group = KVM_ARM_VCPU_PVTIME_CTRL, + .attr = KVM_ARM_VCPU_PVTIME_IPA, + }; + int i, ret; + + ret = _vcpu_ioctl(vm, 0, KVM_HAS_DEVICE_ATTR, &dev); + if (ret != 0 && errno == ENXIO) { + print_skip("steal-time not supported"); + exit(KSFT_SKIP); + } + + for (i = 0; i < NR_VCPUS; ++i) { + uint64_t st_ipa; + + vcpu_ioctl(vm, i, KVM_HAS_DEVICE_ATTR, &dev); + + dev.addr = (uint64_t)&st_ipa; + + /* ST_GPA_BASE is identity mapped */ + st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE); + sync_global_to_guest(vm, st_gva[i]); + + st_ipa = (ulong)st_gva[i] | 1; + ret = _vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev); + TEST_ASSERT(ret == -1 && errno == EINVAL, "Bad IPA didn't report EINVAL"); + + st_ipa = (ulong)st_gva[i]; + vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev); + + ret = _vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev); + TEST_ASSERT(ret == -1 && errno == EEXIST, "Set IPA twice without EEXIST"); + + } +} + +static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct st_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpuid]); + + pr_info("VCPU%d:\n", vcpuid); + pr_info(" rev: %d\n", st->rev); + pr_info(" attr: %d\n", st->attr); + pr_info(" st_time: %ld\n", st->st_time); +} + +#endif + +static long get_run_delay(void) +{ + char path[64]; + long val[2]; + FILE *fp; + + sprintf(path, "/proc/%ld/schedstat", syscall(SYS_gettid)); + fp = fopen(path, "r"); + fscanf(fp, "%ld %ld ", &val[0], &val[1]); + fclose(fp); + + return val[1]; +} + +static void *do_steal_time(void *arg) +{ + struct timespec ts, stop; + + clock_gettime(CLOCK_MONOTONIC, &ts); + stop = timespec_add_ns(ts, MIN_RUN_DELAY_NS); + + while (1) { + clock_gettime(CLOCK_MONOTONIC, &ts); + if (timespec_to_ns(timespec_sub(ts, stop)) >= 0) + break; + } + + return NULL; +} + +static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct ucall uc; + + vcpu_args_set(vm, vcpuid, 1, vcpuid); + + vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL); + + switch (get_ucall(vm, vcpuid, &uc)) { + case UCALL_SYNC: + case UCALL_DONE: + break; + case UCALL_ABORT: + TEST_ASSERT(false, "%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); + default: + TEST_ASSERT(false, "Unexpected exit: %s", + exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason)); + } +} + +int main(int ac, char **av) +{ + struct kvm_vm *vm; + pthread_attr_t attr; + pthread_t thread; + cpu_set_t cpuset; + unsigned int gpages; + long stolen_time; + long run_delay; + bool verbose; + int i; + + verbose = ac > 1 && (!strncmp(av[1], "-v", 3) || !strncmp(av[1], "--verbose", 10)); + + /* Set CPU affinity so we can force preemption of the VCPU */ + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + pthread_attr_init(&attr); + pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset); + pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); + + /* Create a one VCPU guest and an identity mapped memslot for the steal time structure */ + vm = vm_create_default(0, 0, guest_code); + gpages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, STEAL_TIME_SIZE * NR_VCPUS); + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, gpages, 0); + virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, gpages, 0); + ucall_init(vm, NULL); + + /* Add the rest of the VCPUs */ + for (i = 1; i < NR_VCPUS; ++i) + vm_vcpu_add_default(vm, i, guest_code); + + steal_time_init(vm); + + /* Run test on each VCPU */ + for (i = 0; i < NR_VCPUS; ++i) { + /* First VCPU run initializes steal-time */ + run_vcpu(vm, i); + + /* Second VCPU run, expect guest stolen time to be <= run_delay */ + run_vcpu(vm, i); + sync_global_from_guest(vm, guest_stolen_time[i]); + stolen_time = guest_stolen_time[i]; + run_delay = get_run_delay(); + TEST_ASSERT(stolen_time <= run_delay, + "Expected stolen time <= %ld, got %ld", + run_delay, stolen_time); + + /* Steal time from the VCPU. The steal time thread has the same CPU affinity as the VCPUs. */ + run_delay = get_run_delay(); + pthread_create(&thread, &attr, do_steal_time, NULL); + do + pthread_yield(); + while (get_run_delay() - run_delay < MIN_RUN_DELAY_NS); + pthread_join(thread, NULL); + run_delay = get_run_delay() - run_delay; + TEST_ASSERT(run_delay >= MIN_RUN_DELAY_NS, + "Expected run_delay >= %ld, got %ld", + MIN_RUN_DELAY_NS, run_delay); + + /* Run VCPU again to confirm stolen time is consistent with run_delay */ + run_vcpu(vm, i); + sync_global_from_guest(vm, guest_stolen_time[i]); + stolen_time = guest_stolen_time[i] - stolen_time; + TEST_ASSERT(stolen_time >= run_delay, + "Expected stolen time >= %ld, got %ld", + run_delay, stolen_time); + + if (verbose) { + pr_info("VCPU%d: total-stolen-time=%ld test-stolen-time=%ld", i, + guest_stolen_time[i], stolen_time); + if (stolen_time == run_delay) + pr_info(" (BONUS: guest test-stolen-time even exactly matches test-run_delay)"); + pr_info("\n"); + steal_time_dump(vm, i); + } + } + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c new file mode 100644 index 000000000..140e91901 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * CR4 and CPUID sync test + * + * Copyright 2018, Red Hat, Inc. and/or its affiliates. + * + * Author: + * Wei Huang + */ + +#include +#include +#include +#include +#include + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" + +#define X86_FEATURE_XSAVE (1<<26) +#define X86_FEATURE_OSXSAVE (1<<27) +#define VCPU_ID 1 + +static inline bool cr4_cpuid_is_sync(void) +{ + int func, subfunc; + uint32_t eax, ebx, ecx, edx; + uint64_t cr4; + + func = 0x1; + subfunc = 0x0; + __asm__ __volatile__("cpuid" + : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) + : "a"(func), "c"(subfunc)); + + cr4 = get_cr4(); + + return (!!(ecx & X86_FEATURE_OSXSAVE)) == (!!(cr4 & X86_CR4_OSXSAVE)); +} + +static void guest_code(void) +{ + uint64_t cr4; + + /* turn on CR4.OSXSAVE */ + cr4 = get_cr4(); + cr4 |= X86_CR4_OSXSAVE; + set_cr4(cr4); + + /* verify CR4.OSXSAVE == CPUID.OSXSAVE */ + GUEST_ASSERT(cr4_cpuid_is_sync()); + + /* notify hypervisor to change CR4 */ + GUEST_SYNC(0); + + /* check again */ + GUEST_ASSERT(cr4_cpuid_is_sync()); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + struct kvm_run *run; + struct kvm_vm *vm; + struct kvm_sregs sregs; + struct kvm_cpuid_entry2 *entry; + struct ucall uc; + int rc; + + entry = kvm_get_supported_cpuid_entry(1); + if (!(entry->ecx & X86_FEATURE_XSAVE)) { + print_skip("XSAVE feature not supported"); + return 0; + } + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + run = vcpu_state(vm, VCPU_ID); + + while (1) { + rc = _vcpu_run(vm, VCPU_ID); + + TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Unexpected exit reason: %u (%s),\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_SYNC: + /* emulate hypervisor clearing CR4.OSXSAVE */ + vcpu_sregs_get(vm, VCPU_ID, &sregs); + sregs.cr4 &= ~X86_CR4_OSXSAVE; + vcpu_sregs_set(vm, VCPU_ID, &sregs); + break; + case UCALL_ABORT: + TEST_FAIL("Guest CR4 bit (OSXSAVE) unsynchronized with CPUID bit."); + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + + kvm_vm_free(vm); + +done: + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/debug_regs.c b/tools/testing/selftests/kvm/x86_64/debug_regs.c new file mode 100644 index 000000000..2fc6b3af8 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/debug_regs.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM guest debug register tests + * + * Copyright (C) 2020, Red Hat, Inc. + */ +#include +#include +#include "kvm_util.h" +#include "processor.h" + +#define VCPU_ID 0 + +#define DR6_BD (1 << 13) +#define DR7_GD (1 << 13) + +/* For testing data access debug BP */ +uint32_t guest_value; + +extern unsigned char sw_bp, hw_bp, write_data, ss_start, bd_start; + +static void guest_code(void) +{ + /* + * Software BP tests. + * + * NOTE: sw_bp need to be before the cmd here, because int3 is an + * exception rather than a normal trap for KVM_SET_GUEST_DEBUG (we + * capture it using the vcpu exception bitmap). + */ + asm volatile("sw_bp: int3"); + + /* Hardware instruction BP test */ + asm volatile("hw_bp: nop"); + + /* Hardware data BP test */ + asm volatile("mov $1234,%%rax;\n\t" + "mov %%rax,%0;\n\t write_data:" + : "=m" (guest_value) : : "rax"); + + /* Single step test, covers 2 basic instructions and 2 emulated */ + asm volatile("ss_start: " + "xor %%eax,%%eax\n\t" + "cpuid\n\t" + "movl $0x1a0,%%ecx\n\t" + "rdmsr\n\t" + : : : "eax", "ebx", "ecx", "edx"); + + /* DR6.BD test */ + asm volatile("bd_start: mov %%dr0, %%rax" : : : "rax"); + GUEST_DONE(); +} + +#define CLEAR_DEBUG() memset(&debug, 0, sizeof(debug)) +#define APPLY_DEBUG() vcpu_set_guest_debug(vm, VCPU_ID, &debug) +#define CAST_TO_RIP(v) ((unsigned long long)&(v)) +#define SET_RIP(v) do { \ + vcpu_regs_get(vm, VCPU_ID, ®s); \ + regs.rip = (v); \ + vcpu_regs_set(vm, VCPU_ID, ®s); \ + } while (0) +#define MOVE_RIP(v) SET_RIP(regs.rip + (v)); + +int main(void) +{ + struct kvm_guest_debug debug; + unsigned long long target_dr6, target_rip; + struct kvm_regs regs; + struct kvm_run *run; + struct kvm_vm *vm; + struct ucall uc; + uint64_t cmd; + int i; + /* Instruction lengths starting at ss_start */ + int ss_size[4] = { + 2, /* xor */ + 2, /* cpuid */ + 5, /* mov */ + 2, /* rdmsr */ + }; + + if (!kvm_check_cap(KVM_CAP_SET_GUEST_DEBUG)) { + print_skip("KVM_CAP_SET_GUEST_DEBUG not supported"); + return 0; + } + + vm = vm_create_default(VCPU_ID, 0, guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + run = vcpu_state(vm, VCPU_ID); + + /* Test software BPs - int3 */ + CLEAR_DEBUG(); + debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP; + APPLY_DEBUG(); + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && + run->debug.arch.exception == BP_VECTOR && + run->debug.arch.pc == CAST_TO_RIP(sw_bp), + "INT3: exit %d exception %d rip 0x%llx (should be 0x%llx)", + run->exit_reason, run->debug.arch.exception, + run->debug.arch.pc, CAST_TO_RIP(sw_bp)); + MOVE_RIP(1); + + /* Test instruction HW BP over DR[0-3] */ + for (i = 0; i < 4; i++) { + CLEAR_DEBUG(); + debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; + debug.arch.debugreg[i] = CAST_TO_RIP(hw_bp); + debug.arch.debugreg[7] = 0x400 | (1UL << (2*i+1)); + APPLY_DEBUG(); + vcpu_run(vm, VCPU_ID); + target_dr6 = 0xffff0ff0 | (1UL << i); + TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && + run->debug.arch.exception == DB_VECTOR && + run->debug.arch.pc == CAST_TO_RIP(hw_bp) && + run->debug.arch.dr6 == target_dr6, + "INS_HW_BP (DR%d): exit %d exception %d rip 0x%llx " + "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)", + i, run->exit_reason, run->debug.arch.exception, + run->debug.arch.pc, CAST_TO_RIP(hw_bp), + run->debug.arch.dr6, target_dr6); + } + /* Skip "nop" */ + MOVE_RIP(1); + + /* Test data access HW BP over DR[0-3] */ + for (i = 0; i < 4; i++) { + CLEAR_DEBUG(); + debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; + debug.arch.debugreg[i] = CAST_TO_RIP(guest_value); + debug.arch.debugreg[7] = 0x00000400 | (1UL << (2*i+1)) | + (0x000d0000UL << (4*i)); + APPLY_DEBUG(); + vcpu_run(vm, VCPU_ID); + target_dr6 = 0xffff0ff0 | (1UL << i); + TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && + run->debug.arch.exception == DB_VECTOR && + run->debug.arch.pc == CAST_TO_RIP(write_data) && + run->debug.arch.dr6 == target_dr6, + "DATA_HW_BP (DR%d): exit %d exception %d rip 0x%llx " + "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)", + i, run->exit_reason, run->debug.arch.exception, + run->debug.arch.pc, CAST_TO_RIP(write_data), + run->debug.arch.dr6, target_dr6); + /* Rollback the 4-bytes "mov" */ + MOVE_RIP(-7); + } + /* Skip the 4-bytes "mov" */ + MOVE_RIP(7); + + /* Test single step */ + target_rip = CAST_TO_RIP(ss_start); + target_dr6 = 0xffff4ff0ULL; + vcpu_regs_get(vm, VCPU_ID, ®s); + for (i = 0; i < (sizeof(ss_size) / sizeof(ss_size[0])); i++) { + target_rip += ss_size[i]; + CLEAR_DEBUG(); + debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP; + debug.arch.debugreg[7] = 0x00000400; + APPLY_DEBUG(); + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && + run->debug.arch.exception == DB_VECTOR && + run->debug.arch.pc == target_rip && + run->debug.arch.dr6 == target_dr6, + "SINGLE_STEP[%d]: exit %d exception %d rip 0x%llx " + "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)", + i, run->exit_reason, run->debug.arch.exception, + run->debug.arch.pc, target_rip, run->debug.arch.dr6, + target_dr6); + } + + /* Finally test global disable */ + CLEAR_DEBUG(); + debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; + debug.arch.debugreg[7] = 0x400 | DR7_GD; + APPLY_DEBUG(); + vcpu_run(vm, VCPU_ID); + target_dr6 = 0xffff0ff0 | DR6_BD; + TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && + run->debug.arch.exception == DB_VECTOR && + run->debug.arch.pc == CAST_TO_RIP(bd_start) && + run->debug.arch.dr6 == target_dr6, + "DR7.GD: exit %d exception %d rip 0x%llx " + "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)", + run->exit_reason, run->debug.arch.exception, + run->debug.arch.pc, target_rip, run->debug.arch.dr6, + target_dr6); + + /* Disable all debug controls, run to the end */ + CLEAR_DEBUG(); + APPLY_DEBUG(); + + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "KVM_EXIT_IO"); + cmd = get_ucall(vm, VCPU_ID, &uc); + TEST_ASSERT(cmd == UCALL_DONE, "UCALL_DONE"); + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c new file mode 100644 index 000000000..757928199 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2018, Red Hat, Inc. + * + * Tests for Enlightened VMCS, including nested guest state. + */ +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include +#include +#include +#include + +#include "test_util.h" + +#include "kvm_util.h" + +#include "vmx.h" + +#define VCPU_ID 5 + +void l2_guest_code(void) +{ + GUEST_SYNC(7); + + GUEST_SYNC(8); + + /* Done, exit to L1 and never come back. */ + vmcall(); +} + +void l1_guest_code(struct vmx_pages *vmx_pages) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + enable_vp_assist(vmx_pages->vp_assist_gpa, vmx_pages->vp_assist); + + GUEST_ASSERT(vmx_pages->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_SYNC(3); + GUEST_ASSERT(load_vmcs(vmx_pages)); + GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); + + GUEST_SYNC(4); + GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); + + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_SYNC(5); + GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); + current_evmcs->revision_id = -1u; + GUEST_ASSERT(vmlaunch()); + current_evmcs->revision_id = EVMCS_VERSION; + GUEST_SYNC(6); + + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); + GUEST_SYNC(9); + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + GUEST_SYNC(10); +} + +void guest_code(struct vmx_pages *vmx_pages) +{ + GUEST_SYNC(1); + GUEST_SYNC(2); + + if (vmx_pages) + l1_guest_code(vmx_pages); + + GUEST_DONE(); + + /* Try enlightened vmptrld with an incorrect GPA */ + evmcs_vmptrld(0xdeadbeef, vmx_pages->enlightened_vmcs); + GUEST_ASSERT(vmlaunch()); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva = 0; + + struct kvm_regs regs1, regs2; + struct kvm_vm *vm; + struct kvm_run *run; + struct kvm_x86_state *state; + struct ucall uc; + int stage; + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + + if (!nested_vmx_supported() || + !kvm_check_cap(KVM_CAP_NESTED_STATE) || + !kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { + print_skip("Enlightened VMCS is unsupported"); + exit(KSFT_SKIP); + } + + vcpu_enable_evmcs(vm, VCPU_ID); + + run = vcpu_state(vm, VCPU_ID); + + vcpu_regs_get(vm, VCPU_ID, ®s1); + + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + + for (stage = 1;; stage++) { + _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Stage %d: unexpected exit reason: %u (%s),\n", + stage, run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto part1_done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + /* UCALL_SYNC is handled here. */ + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", + stage, (ulong)uc.args[1]); + + state = vcpu_save_state(vm, VCPU_ID); + memset(®s1, 0, sizeof(regs1)); + vcpu_regs_get(vm, VCPU_ID, ®s1); + + kvm_vm_release(vm); + + /* Restore state in a new VM. */ + kvm_vm_restart(vm, O_RDWR); + vm_vcpu_add(vm, VCPU_ID); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vcpu_enable_evmcs(vm, VCPU_ID); + vcpu_load_state(vm, VCPU_ID, state); + run = vcpu_state(vm, VCPU_ID); + free(state); + + memset(®s2, 0, sizeof(regs2)); + vcpu_regs_get(vm, VCPU_ID, ®s2); + TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), + "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", + (ulong) regs2.rdi, (ulong) regs2.rsi); + } + +part1_done: + _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN, + "Unexpected successful VMEnter with invalid eVMCS pointer!"); + + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c new file mode 100644 index 000000000..745b708c2 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c @@ -0,0 +1,190 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test for x86 KVM_CAP_HYPERV_CPUID + * + * Copyright (C) 2018, Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#define VCPU_ID 0 + +static void guest_code(void) +{ +} + +static bool smt_possible(void) +{ + char buf[16]; + FILE *f; + bool res = true; + + f = fopen("/sys/devices/system/cpu/smt/control", "r"); + if (f) { + if (fread(buf, sizeof(*buf), sizeof(buf), f) > 0) { + if (!strncmp(buf, "forceoff", 8) || + !strncmp(buf, "notsupported", 12)) + res = false; + } + fclose(f); + } + + return res; +} + +static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, + bool evmcs_enabled) +{ + int i; + int nent = 9; + u32 test_val; + + if (evmcs_enabled) + nent += 1; /* 0x4000000A */ + + TEST_ASSERT(hv_cpuid_entries->nent == nent, + "KVM_GET_SUPPORTED_HV_CPUID should return %d entries" + " with evmcs=%d (returned %d)", + nent, evmcs_enabled, hv_cpuid_entries->nent); + + for (i = 0; i < hv_cpuid_entries->nent; i++) { + struct kvm_cpuid_entry2 *entry = &hv_cpuid_entries->entries[i]; + + TEST_ASSERT((entry->function >= 0x40000000) && + (entry->function <= 0x40000082), + "function %x is our of supported range", + entry->function); + + TEST_ASSERT(evmcs_enabled || (entry->function != 0x4000000A), + "0x4000000A leaf should not be reported"); + + TEST_ASSERT(entry->index == 0, + ".index field should be zero"); + + TEST_ASSERT(entry->flags == 0, + ".flags field should be zero"); + + TEST_ASSERT(!entry->padding[0] && !entry->padding[1] && + !entry->padding[2], "padding should be zero"); + + switch (entry->function) { + case 0x40000000: + test_val = 0x40000082; + + TEST_ASSERT(entry->eax == test_val, + "Wrong max leaf report in 0x40000000.EAX: %x" + " (evmcs=%d)", + entry->eax, evmcs_enabled + ); + break; + case 0x40000004: + test_val = entry->eax & (1UL << 18); + + TEST_ASSERT(!!test_val == !smt_possible(), + "NoNonArchitecturalCoreSharing bit" + " doesn't reflect SMT setting"); + break; + } + + /* + * If needed for debug: + * fprintf(stdout, + * "CPUID%lx EAX=0x%lx EBX=0x%lx ECX=0x%lx EDX=0x%lx\n", + * entry->function, entry->eax, entry->ebx, entry->ecx, + * entry->edx); + */ + } + +} + +void test_hv_cpuid_e2big(struct kvm_vm *vm) +{ + static struct kvm_cpuid2 cpuid = {.nent = 0}; + int ret; + + ret = _vcpu_ioctl(vm, VCPU_ID, KVM_GET_SUPPORTED_HV_CPUID, &cpuid); + + TEST_ASSERT(ret == -1 && errno == E2BIG, + "KVM_GET_SUPPORTED_HV_CPUID didn't fail with -E2BIG when" + " it should have: %d %d", ret, errno); +} + + +struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(struct kvm_vm *vm) +{ + int nent = 20; /* should be enough */ + static struct kvm_cpuid2 *cpuid; + + cpuid = malloc(sizeof(*cpuid) + nent * sizeof(struct kvm_cpuid_entry2)); + + if (!cpuid) { + perror("malloc"); + abort(); + } + + cpuid->nent = nent; + + vcpu_ioctl(vm, VCPU_ID, KVM_GET_SUPPORTED_HV_CPUID, cpuid); + + return cpuid; +} + + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + int rv, stage; + struct kvm_cpuid2 *hv_cpuid_entries; + bool evmcs_enabled; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + rv = kvm_check_cap(KVM_CAP_HYPERV_CPUID); + if (!rv) { + print_skip("KVM_CAP_HYPERV_CPUID not supported"); + exit(KSFT_SKIP); + } + + for (stage = 0; stage < 3; stage++) { + evmcs_enabled = false; + + vm = vm_create_default(VCPU_ID, 0, guest_code); + switch (stage) { + case 0: + test_hv_cpuid_e2big(vm); + continue; + case 1: + break; + case 2: + if (!nested_vmx_supported() || + !kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { + print_skip("Enlightened VMCS is unsupported"); + continue; + } + vcpu_enable_evmcs(vm, VCPU_ID); + evmcs_enabled = true; + break; + } + + hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm); + test_hv_cpuid(hv_cpuid_entries, evmcs_enabled); + free(hv_cpuid_entries); + kvm_vm_free(vm); + } + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c new file mode 100644 index 000000000..b10a27485 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020, Google LLC. + * + * Tests for KVM paravirtual feature disablement + */ +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +extern unsigned char rdmsr_start; +extern unsigned char rdmsr_end; + +static u64 do_rdmsr(u32 idx) +{ + u32 lo, hi; + + asm volatile("rdmsr_start: rdmsr;" + "rdmsr_end:" + : "=a"(lo), "=c"(hi) + : "c"(idx)); + + return (((u64) hi) << 32) | lo; +} + +extern unsigned char wrmsr_start; +extern unsigned char wrmsr_end; + +static void do_wrmsr(u32 idx, u64 val) +{ + u32 lo, hi; + + lo = val; + hi = val >> 32; + + asm volatile("wrmsr_start: wrmsr;" + "wrmsr_end:" + : : "a"(lo), "c"(idx), "d"(hi)); +} + +static int nr_gp; + +static void guest_gp_handler(struct ex_regs *regs) +{ + unsigned char *rip = (unsigned char *)regs->rip; + bool r, w; + + r = rip == &rdmsr_start; + w = rip == &wrmsr_start; + GUEST_ASSERT(r || w); + + nr_gp++; + + if (r) + regs->rip = (uint64_t)&rdmsr_end; + else + regs->rip = (uint64_t)&wrmsr_end; +} + +struct msr_data { + uint32_t idx; + const char *name; +}; + +#define TEST_MSR(msr) { .idx = msr, .name = #msr } +#define UCALL_PR_MSR 0xdeadbeef +#define PR_MSR(msr) ucall(UCALL_PR_MSR, 1, msr) + +/* + * KVM paravirtual msrs to test. Expect a #GP if any of these msrs are read or + * written, as the KVM_CPUID_FEATURES leaf is cleared. + */ +static struct msr_data msrs_to_test[] = { + TEST_MSR(MSR_KVM_SYSTEM_TIME), + TEST_MSR(MSR_KVM_SYSTEM_TIME_NEW), + TEST_MSR(MSR_KVM_WALL_CLOCK), + TEST_MSR(MSR_KVM_WALL_CLOCK_NEW), + TEST_MSR(MSR_KVM_ASYNC_PF_EN), + TEST_MSR(MSR_KVM_STEAL_TIME), + TEST_MSR(MSR_KVM_PV_EOI_EN), + TEST_MSR(MSR_KVM_POLL_CONTROL), + TEST_MSR(MSR_KVM_ASYNC_PF_INT), + TEST_MSR(MSR_KVM_ASYNC_PF_ACK), +}; + +static void test_msr(struct msr_data *msr) +{ + PR_MSR(msr); + do_rdmsr(msr->idx); + GUEST_ASSERT(READ_ONCE(nr_gp) == 1); + + nr_gp = 0; + do_wrmsr(msr->idx, 0); + GUEST_ASSERT(READ_ONCE(nr_gp) == 1); + nr_gp = 0; +} + +struct hcall_data { + uint64_t nr; + const char *name; +}; + +#define TEST_HCALL(hc) { .nr = hc, .name = #hc } +#define UCALL_PR_HCALL 0xdeadc0de +#define PR_HCALL(hc) ucall(UCALL_PR_HCALL, 1, hc) + +/* + * KVM hypercalls to test. Expect -KVM_ENOSYS when called, as the corresponding + * features have been cleared in KVM_CPUID_FEATURES. + */ +static struct hcall_data hcalls_to_test[] = { + TEST_HCALL(KVM_HC_KICK_CPU), + TEST_HCALL(KVM_HC_SEND_IPI), + TEST_HCALL(KVM_HC_SCHED_YIELD), +}; + +static void test_hcall(struct hcall_data *hc) +{ + uint64_t r; + + PR_HCALL(hc); + r = kvm_hypercall(hc->nr, 0, 0, 0, 0); + GUEST_ASSERT(r == -KVM_ENOSYS); +} + +static void guest_main(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(msrs_to_test); i++) { + test_msr(&msrs_to_test[i]); + } + + for (i = 0; i < ARRAY_SIZE(hcalls_to_test); i++) { + test_hcall(&hcalls_to_test[i]); + } + + GUEST_DONE(); +} + +static void clear_kvm_cpuid_features(struct kvm_cpuid2 *cpuid) +{ + struct kvm_cpuid_entry2 ent = {0}; + + ent.function = KVM_CPUID_FEATURES; + TEST_ASSERT(set_cpuid(cpuid, &ent), + "failed to clear KVM_CPUID_FEATURES leaf"); +} + +static void pr_msr(struct ucall *uc) +{ + struct msr_data *msr = (struct msr_data *)uc->args[0]; + + pr_info("testing msr: %s (%#x)\n", msr->name, msr->idx); +} + +static void pr_hcall(struct ucall *uc) +{ + struct hcall_data *hc = (struct hcall_data *)uc->args[0]; + + pr_info("testing hcall: %s (%lu)\n", hc->name, hc->nr); +} + +static void handle_abort(struct ucall *uc) +{ + TEST_FAIL("%s at %s:%ld", (const char *)uc->args[0], + __FILE__, uc->args[1]); +} + +#define VCPU_ID 0 + +static void enter_guest(struct kvm_vm *vm) +{ + struct kvm_run *run; + struct ucall uc; + int r; + + run = vcpu_state(vm, VCPU_ID); + + while (true) { + r = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(!r, "vcpu_run failed: %d\n", r); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "unexpected exit reason: %u (%s)", + run->exit_reason, exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_PR_MSR: + pr_msr(&uc); + break; + case UCALL_PR_HCALL: + pr_hcall(&uc); + break; + case UCALL_ABORT: + handle_abort(&uc); + return; + case UCALL_DONE: + return; + } + } +} + +int main(void) +{ + struct kvm_enable_cap cap = {0}; + struct kvm_cpuid2 *best; + struct kvm_vm *vm; + + if (!kvm_check_cap(KVM_CAP_ENFORCE_PV_FEATURE_CPUID)) { + pr_info("will skip kvm paravirt restriction tests.\n"); + return 0; + } + + vm = vm_create_default(VCPU_ID, 0, guest_main); + + cap.cap = KVM_CAP_ENFORCE_PV_FEATURE_CPUID; + cap.args[0] = 1; + vcpu_enable_cap(vm, VCPU_ID, &cap); + + best = kvm_get_supported_cpuid(); + clear_kvm_cpuid_features(best); + vcpu_set_cpuid(vm, VCPU_ID, best); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + vm_handle_exception(vm, GP_VECTOR, guest_gp_handler); + + enter_guest(vm); + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c new file mode 100644 index 000000000..9f55ccd16 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c @@ -0,0 +1,127 @@ +/* + * mmio_warning_test + * + * Copyright (C) 2019, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Test that we don't get a kernel warning when we call KVM_RUN after a + * triple fault occurs. To get the triple fault to occur we call KVM_RUN + * on a VCPU that hasn't been properly setup. + * + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NTHREAD 4 +#define NPROCESS 5 + +struct thread_context { + int kvmcpu; + struct kvm_run *run; +}; + +void *thr(void *arg) +{ + struct thread_context *tc = (struct thread_context *)arg; + int res; + int kvmcpu = tc->kvmcpu; + struct kvm_run *run = tc->run; + + res = ioctl(kvmcpu, KVM_RUN, 0); + pr_info("ret1=%d exit_reason=%d suberror=%d\n", + res, run->exit_reason, run->internal.suberror); + + return 0; +} + +void test(void) +{ + int i, kvm, kvmvm, kvmcpu; + pthread_t th[NTHREAD]; + struct kvm_run *run; + struct thread_context tc; + + kvm = open("/dev/kvm", O_RDWR); + TEST_ASSERT(kvm != -1, "failed to open /dev/kvm"); + kvmvm = ioctl(kvm, KVM_CREATE_VM, 0); + TEST_ASSERT(kvmvm != -1, "KVM_CREATE_VM failed"); + kvmcpu = ioctl(kvmvm, KVM_CREATE_VCPU, 0); + TEST_ASSERT(kvmcpu != -1, "KVM_CREATE_VCPU failed"); + run = (struct kvm_run *)mmap(0, 4096, PROT_READ|PROT_WRITE, MAP_SHARED, + kvmcpu, 0); + tc.kvmcpu = kvmcpu; + tc.run = run; + srand(getpid()); + for (i = 0; i < NTHREAD; i++) { + pthread_create(&th[i], NULL, thr, (void *)(uintptr_t)&tc); + usleep(rand() % 10000); + } + for (i = 0; i < NTHREAD; i++) + pthread_join(th[i], NULL); +} + +int get_warnings_count(void) +{ + int warnings; + FILE *f; + + f = popen("dmesg | grep \"WARNING:\" | wc -l", "r"); + if (fscanf(f, "%d", &warnings) < 1) + warnings = 0; + pclose(f); + + return warnings; +} + +int main(void) +{ + int warnings_before, warnings_after; + + if (!is_intel_cpu()) { + print_skip("Must be run on an Intel CPU"); + exit(KSFT_SKIP); + } + + if (vm_is_unrestricted_guest(NULL)) { + print_skip("Unrestricted guest must be disabled"); + exit(KSFT_SKIP); + } + + warnings_before = get_warnings_count(); + + for (int i = 0; i < NPROCESS; ++i) { + int status; + int pid = fork(); + + if (pid < 0) + exit(1); + if (pid == 0) { + test(); + exit(0); + } + while (waitpid(pid, &status, __WALL) != pid) + ; + } + + warnings_after = get_warnings_count(); + TEST_ASSERT(warnings_before == warnings_after, + "Warnings found in kernel. Run 'dmesg' to inspect them."); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c new file mode 100644 index 000000000..1e89688cb --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/platform_info_test.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test for x86 KVM_CAP_MSR_PLATFORM_INFO + * + * Copyright (C) 2018, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Verifies expected behavior of controlling guest access to + * MSR_PLATFORM_INFO. + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#define VCPU_ID 0 +#define MSR_PLATFORM_INFO_MAX_TURBO_RATIO 0xff00 + +static void guest_code(void) +{ + uint64_t msr_platform_info; + + for (;;) { + msr_platform_info = rdmsr(MSR_PLATFORM_INFO); + GUEST_SYNC(msr_platform_info); + asm volatile ("inc %r11"); + } +} + +static void set_msr_platform_info_enabled(struct kvm_vm *vm, bool enable) +{ + struct kvm_enable_cap cap = {}; + + cap.cap = KVM_CAP_MSR_PLATFORM_INFO; + cap.flags = 0; + cap.args[0] = (int)enable; + vm_enable_cap(vm, &cap); +} + +static void test_msr_platform_info_enabled(struct kvm_vm *vm) +{ + struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + set_msr_platform_info_enabled(vm, true); + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Exit_reason other than KVM_EXIT_IO: %u (%s),\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + get_ucall(vm, VCPU_ID, &uc); + TEST_ASSERT(uc.cmd == UCALL_SYNC, + "Received ucall other than UCALL_SYNC: %lu\n", uc.cmd); + TEST_ASSERT((uc.args[1] & MSR_PLATFORM_INFO_MAX_TURBO_RATIO) == + MSR_PLATFORM_INFO_MAX_TURBO_RATIO, + "Expected MSR_PLATFORM_INFO to have max turbo ratio mask: %i.", + MSR_PLATFORM_INFO_MAX_TURBO_RATIO); +} + +static void test_msr_platform_info_disabled(struct kvm_vm *vm) +{ + struct kvm_run *run = vcpu_state(vm, VCPU_ID); + + set_msr_platform_info_enabled(vm, false); + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN, + "Exit_reason other than KVM_EXIT_SHUTDOWN: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + int rv; + uint64_t msr_platform_info; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + rv = kvm_check_cap(KVM_CAP_MSR_PLATFORM_INFO); + if (!rv) { + print_skip("KVM_CAP_MSR_PLATFORM_INFO not supported"); + exit(KSFT_SKIP); + } + + vm = vm_create_default(VCPU_ID, 0, guest_code); + + msr_platform_info = vcpu_get_msr(vm, VCPU_ID, MSR_PLATFORM_INFO); + vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO, + msr_platform_info | MSR_PLATFORM_INFO_MAX_TURBO_RATIO); + test_msr_platform_info_enabled(vm); + test_msr_platform_info_disabled(vm); + vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO, msr_platform_info); + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c new file mode 100644 index 000000000..9f7656184 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM_SET_SREGS tests + * + * Copyright (C) 2018, Google LLC. + * + * This is a regression test for the bug fixed by the following commit: + * d3802286fa0f ("kvm: x86: Disallow illegal IA32_APIC_BASE MSR values") + * + * That bug allowed a user-mode program that called the KVM_SET_SREGS + * ioctl to put a VCPU's local APIC into an invalid state. + */ +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include +#include +#include +#include + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" + +#define VCPU_ID 5 + +int main(int argc, char *argv[]) +{ + struct kvm_sregs sregs; + struct kvm_vm *vm; + int rc; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, NULL); + + vcpu_sregs_get(vm, VCPU_ID, &sregs); + sregs.apic_base = 1 << 10; + rc = _vcpu_sregs_set(vm, VCPU_ID, &sregs); + TEST_ASSERT(rc, "Set IA32_APIC_BASE to %llx (invalid)", + sregs.apic_base); + sregs.apic_base = 1 << 11; + rc = _vcpu_sregs_set(vm, VCPU_ID, &sregs); + TEST_ASSERT(!rc, "Couldn't set IA32_APIC_BASE to %llx (valid)", + sregs.apic_base); + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c new file mode 100644 index 000000000..ae39a2206 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/smm_test.c @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2018, Red Hat, Inc. + * + * Tests for SMM. + */ +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include +#include +#include +#include +#include + +#include "test_util.h" + +#include "kvm_util.h" + +#include "vmx.h" +#include "svm_util.h" + +#define VCPU_ID 1 + +#define PAGE_SIZE 4096 + +#define SMRAM_SIZE 65536 +#define SMRAM_MEMSLOT ((1 << 16) | 1) +#define SMRAM_PAGES (SMRAM_SIZE / PAGE_SIZE) +#define SMRAM_GPA 0x1000000 +#define SMRAM_STAGE 0xfe + +#define STR(x) #x +#define XSTR(s) STR(s) + +#define SYNC_PORT 0xe +#define DONE 0xff + +/* + * This is compiled as normal 64-bit code, however, SMI handler is executed + * in real-address mode. To stay simple we're limiting ourselves to a mode + * independent subset of asm here. + * SMI handler always report back fixed stage SMRAM_STAGE. + */ +uint8_t smi_handler[] = { + 0xb0, SMRAM_STAGE, /* mov $SMRAM_STAGE, %al */ + 0xe4, SYNC_PORT, /* in $SYNC_PORT, %al */ + 0x0f, 0xaa, /* rsm */ +}; + +static inline void sync_with_host(uint64_t phase) +{ + asm volatile("in $" XSTR(SYNC_PORT)", %%al \n" + : "+a" (phase)); +} + +void self_smi(void) +{ + wrmsr(APIC_BASE_MSR + (APIC_ICR >> 4), + APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_SMI); +} + +void guest_code(void *arg) +{ + uint64_t apicbase = rdmsr(MSR_IA32_APICBASE); + + sync_with_host(1); + + wrmsr(MSR_IA32_APICBASE, apicbase | X2APIC_ENABLE); + + sync_with_host(2); + + self_smi(); + + sync_with_host(4); + + if (arg) { + if (cpu_has_svm()) + generic_svm_setup(arg, NULL, NULL); + else + GUEST_ASSERT(prepare_for_vmx_operation(arg)); + + sync_with_host(5); + + self_smi(); + + sync_with_host(7); + } + + sync_with_host(DONE); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t nested_gva = 0; + + struct kvm_regs regs; + struct kvm_vm *vm; + struct kvm_run *run; + struct kvm_x86_state *state; + int stage, stage_reported; + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + + run = vcpu_state(vm, VCPU_ID); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, SMRAM_GPA, + SMRAM_MEMSLOT, SMRAM_PAGES, 0); + TEST_ASSERT(vm_phy_pages_alloc(vm, SMRAM_PAGES, SMRAM_GPA, SMRAM_MEMSLOT) + == SMRAM_GPA, "could not allocate guest physical addresses?"); + + memset(addr_gpa2hva(vm, SMRAM_GPA), 0x0, SMRAM_SIZE); + memcpy(addr_gpa2hva(vm, SMRAM_GPA) + 0x8000, smi_handler, + sizeof(smi_handler)); + + vcpu_set_msr(vm, VCPU_ID, MSR_IA32_SMBASE, SMRAM_GPA); + + if (kvm_check_cap(KVM_CAP_NESTED_STATE)) { + if (nested_svm_supported()) + vcpu_alloc_svm(vm, &nested_gva); + else if (nested_vmx_supported()) + vcpu_alloc_vmx(vm, &nested_gva); + } + + if (!nested_gva) + pr_info("will skip SMM test with VMX enabled\n"); + + vcpu_args_set(vm, VCPU_ID, 1, nested_gva); + + for (stage = 1;; stage++) { + _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Stage %d: unexpected exit reason: %u (%s),\n", + stage, run->exit_reason, + exit_reason_str(run->exit_reason)); + + memset(®s, 0, sizeof(regs)); + vcpu_regs_get(vm, VCPU_ID, ®s); + + stage_reported = regs.rax & 0xff; + + if (stage_reported == DONE) + goto done; + + TEST_ASSERT(stage_reported == stage || + stage_reported == SMRAM_STAGE, + "Unexpected stage: #%x, got %x", + stage, stage_reported); + + state = vcpu_save_state(vm, VCPU_ID); + kvm_vm_release(vm); + kvm_vm_restart(vm, O_RDWR); + vm_vcpu_add(vm, VCPU_ID); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vcpu_load_state(vm, VCPU_ID, state); + run = vcpu_state(vm, VCPU_ID); + free(state); + } + +done: + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c new file mode 100644 index 000000000..f6c8b9042 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/state_test.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM_GET/SET_* tests + * + * Copyright (C) 2018, Red Hat, Inc. + * + * Tests for vCPU state save/restore, including nested guest state. + */ +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include +#include +#include +#include + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" +#include "svm_util.h" + +#define VCPU_ID 5 +#define L2_GUEST_STACK_SIZE 256 + +void svm_l2_guest_code(void) +{ + GUEST_SYNC(4); + /* Exit to L1 */ + vmcall(); + GUEST_SYNC(6); + /* Done, exit to L1 and never come back. */ + vmcall(); +} + +static void svm_l1_guest_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + + GUEST_ASSERT(svm->vmcb_gpa); + /* Prepare for L2 execution. */ + generic_svm_setup(svm, svm_l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_SYNC(3); + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_SYNC(5); + vmcb->save.rip += 3; + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_SYNC(7); +} + +void vmx_l2_guest_code(void) +{ + GUEST_SYNC(6); + + /* Exit to L1 */ + vmcall(); + + /* L1 has now set up a shadow VMCS for us. */ + GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee); + GUEST_SYNC(10); + GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee); + GUEST_ASSERT(!vmwrite(GUEST_RIP, 0xc0fffee)); + GUEST_SYNC(11); + GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0fffee); + GUEST_ASSERT(!vmwrite(GUEST_RIP, 0xc0ffffee)); + GUEST_SYNC(12); + + /* Done, exit to L1 and never come back. */ + vmcall(); +} + +static void vmx_l1_guest_code(struct vmx_pages *vmx_pages) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + GUEST_ASSERT(vmx_pages->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_SYNC(3); + GUEST_ASSERT(load_vmcs(vmx_pages)); + GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); + + GUEST_SYNC(4); + GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); + + prepare_vmcs(vmx_pages, vmx_l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_SYNC(5); + GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + /* Check that the launched state is preserved. */ + GUEST_ASSERT(vmlaunch()); + + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + GUEST_SYNC(7); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + 3); + + vmwrite(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); + vmwrite(VMCS_LINK_POINTER, vmx_pages->shadow_vmcs_gpa); + + GUEST_ASSERT(!vmptrld(vmx_pages->shadow_vmcs_gpa)); + GUEST_ASSERT(vmlaunch()); + GUEST_SYNC(8); + GUEST_ASSERT(vmlaunch()); + GUEST_ASSERT(vmresume()); + + vmwrite(GUEST_RIP, 0xc0ffee); + GUEST_SYNC(9); + GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee); + + GUEST_ASSERT(!vmptrld(vmx_pages->vmcs_gpa)); + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + GUEST_ASSERT(!vmptrld(vmx_pages->shadow_vmcs_gpa)); + GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffffee); + GUEST_ASSERT(vmlaunch()); + GUEST_ASSERT(vmresume()); + GUEST_SYNC(13); + GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffffee); + GUEST_ASSERT(vmlaunch()); + GUEST_ASSERT(vmresume()); +} + +static void __attribute__((__flatten__)) guest_code(void *arg) +{ + GUEST_SYNC(1); + GUEST_SYNC(2); + + if (arg) { + if (cpu_has_svm()) + svm_l1_guest_code(arg); + else + vmx_l1_guest_code(arg); + } + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t nested_gva = 0; + + struct kvm_regs regs1, regs2; + struct kvm_vm *vm; + struct kvm_run *run; + struct kvm_x86_state *state; + struct ucall uc; + int stage; + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + run = vcpu_state(vm, VCPU_ID); + + vcpu_regs_get(vm, VCPU_ID, ®s1); + + if (kvm_check_cap(KVM_CAP_NESTED_STATE)) { + if (nested_svm_supported()) + vcpu_alloc_svm(vm, &nested_gva); + else if (nested_vmx_supported()) + vcpu_alloc_vmx(vm, &nested_gva); + } + + if (!nested_gva) + pr_info("will skip nested state checks\n"); + + vcpu_args_set(vm, VCPU_ID, 1, nested_gva); + + for (stage = 1;; stage++) { + _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Stage %d: unexpected exit reason: %u (%s),\n", + stage, run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + /* UCALL_SYNC is handled here. */ + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", + stage, (ulong)uc.args[1]); + + state = vcpu_save_state(vm, VCPU_ID); + memset(®s1, 0, sizeof(regs1)); + vcpu_regs_get(vm, VCPU_ID, ®s1); + + kvm_vm_release(vm); + + /* Restore state in a new VM. */ + kvm_vm_restart(vm, O_RDWR); + vm_vcpu_add(vm, VCPU_ID); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vcpu_load_state(vm, VCPU_ID, state); + run = vcpu_state(vm, VCPU_ID); + free(state); + + memset(®s2, 0, sizeof(regs2)); + vcpu_regs_get(vm, VCPU_ID, ®s2); + TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), + "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", + (ulong) regs2.rdi, (ulong) regs2.rsi); + } + +done: + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c new file mode 100644 index 000000000..0e1adb4e3 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * svm_vmcall_test + * + * Copyright (C) 2020, Red Hat, Inc. + * + * Nested SVM testing: VMCALL + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" + +#define VCPU_ID 5 + +static struct kvm_vm *vm; + +static void l2_guest_code(struct svm_test_data *svm) +{ + __asm__ __volatile__("vmcall"); +} + +static void l1_guest_code(struct svm_test_data *svm) +{ + #define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + + /* Prepare for L2 execution. */ + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + run_guest(vmcb, svm->vmcb_gpa); + + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t svm_gva; + + nested_svm_check_supported(); + + vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + + vcpu_alloc_svm(vm, &svm_gva); + vcpu_args_set(vm, VCPU_ID, 1, svm_gva); + + for (;;) { + volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s", (const char *)uc.args[0]); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + } + } +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c new file mode 100644 index 000000000..d672f0a47 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c @@ -0,0 +1,243 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test for x86 KVM_CAP_SYNC_REGS + * + * Copyright (C) 2018, Google LLC. + * + * Verifies expected behavior of x86 KVM_CAP_SYNC_REGS functionality, + * including requesting an invalid register set, updates to/from values + * in kvm_run.s.regs when kvm_valid_regs and kvm_dirty_regs are toggled. + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#define VCPU_ID 5 + +#define UCALL_PIO_PORT ((uint16_t)0x1000) + +/* + * ucall is embedded here to protect against compiler reshuffling registers + * before calling a function. In this test we only need to get KVM_EXIT_IO + * vmexit and preserve RBX, no additional information is needed. + */ +void guest_code(void) +{ + asm volatile("1: in %[port], %%al\n" + "add $0x1, %%rbx\n" + "jmp 1b" + : : [port] "d" (UCALL_PIO_PORT) : "rax", "rbx"); +} + +static void compare_regs(struct kvm_regs *left, struct kvm_regs *right) +{ +#define REG_COMPARE(reg) \ + TEST_ASSERT(left->reg == right->reg, \ + "Register " #reg \ + " values did not match: 0x%llx, 0x%llx\n", \ + left->reg, right->reg) + REG_COMPARE(rax); + REG_COMPARE(rbx); + REG_COMPARE(rcx); + REG_COMPARE(rdx); + REG_COMPARE(rsi); + REG_COMPARE(rdi); + REG_COMPARE(rsp); + REG_COMPARE(rbp); + REG_COMPARE(r8); + REG_COMPARE(r9); + REG_COMPARE(r10); + REG_COMPARE(r11); + REG_COMPARE(r12); + REG_COMPARE(r13); + REG_COMPARE(r14); + REG_COMPARE(r15); + REG_COMPARE(rip); + REG_COMPARE(rflags); +#undef REG_COMPARE +} + +static void compare_sregs(struct kvm_sregs *left, struct kvm_sregs *right) +{ +} + +static void compare_vcpu_events(struct kvm_vcpu_events *left, + struct kvm_vcpu_events *right) +{ +} + +#define TEST_SYNC_FIELDS (KVM_SYNC_X86_REGS|KVM_SYNC_X86_SREGS|KVM_SYNC_X86_EVENTS) +#define INVALID_SYNC_FIELD 0x80000000 + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_run *run; + struct kvm_regs regs; + struct kvm_sregs sregs; + struct kvm_vcpu_events events; + int rv, cap; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + cap = kvm_check_cap(KVM_CAP_SYNC_REGS); + if ((cap & TEST_SYNC_FIELDS) != TEST_SYNC_FIELDS) { + print_skip("KVM_CAP_SYNC_REGS not supported"); + exit(KSFT_SKIP); + } + if ((cap & INVALID_SYNC_FIELD) != 0) { + print_skip("The \"invalid\" field is not invalid"); + exit(KSFT_SKIP); + } + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + + run = vcpu_state(vm, VCPU_ID); + + /* Request reading invalid register set from VCPU. */ + run->kvm_valid_regs = INVALID_SYNC_FIELD; + rv = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(rv < 0 && errno == EINVAL, + "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", + rv); + vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0; + + run->kvm_valid_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; + rv = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(rv < 0 && errno == EINVAL, + "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", + rv); + vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0; + + /* Request setting invalid register set into VCPU. */ + run->kvm_dirty_regs = INVALID_SYNC_FIELD; + rv = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(rv < 0 && errno == EINVAL, + "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", + rv); + vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0; + + run->kvm_dirty_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; + rv = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(rv < 0 && errno == EINVAL, + "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", + rv); + vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0; + + /* Request and verify all valid register sets. */ + /* TODO: BUILD TIME CHECK: TEST_ASSERT(KVM_SYNC_X86_NUM_FIELDS != 3); */ + run->kvm_valid_regs = TEST_SYNC_FIELDS; + rv = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Unexpected exit reason: %u (%s),\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + vcpu_regs_get(vm, VCPU_ID, ®s); + compare_regs(®s, &run->s.regs.regs); + + vcpu_sregs_get(vm, VCPU_ID, &sregs); + compare_sregs(&sregs, &run->s.regs.sregs); + + vcpu_events_get(vm, VCPU_ID, &events); + compare_vcpu_events(&events, &run->s.regs.events); + + /* Set and verify various register values. */ + run->s.regs.regs.rbx = 0xBAD1DEA; + run->s.regs.sregs.apic_base = 1 << 11; + /* TODO run->s.regs.events.XYZ = ABC; */ + + run->kvm_valid_regs = TEST_SYNC_FIELDS; + run->kvm_dirty_regs = KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS; + rv = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Unexpected exit reason: %u (%s),\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + TEST_ASSERT(run->s.regs.regs.rbx == 0xBAD1DEA + 1, + "rbx sync regs value incorrect 0x%llx.", + run->s.regs.regs.rbx); + TEST_ASSERT(run->s.regs.sregs.apic_base == 1 << 11, + "apic_base sync regs value incorrect 0x%llx.", + run->s.regs.sregs.apic_base); + + vcpu_regs_get(vm, VCPU_ID, ®s); + compare_regs(®s, &run->s.regs.regs); + + vcpu_sregs_get(vm, VCPU_ID, &sregs); + compare_sregs(&sregs, &run->s.regs.sregs); + + vcpu_events_get(vm, VCPU_ID, &events); + compare_vcpu_events(&events, &run->s.regs.events); + + /* Clear kvm_dirty_regs bits, verify new s.regs values are + * overwritten with existing guest values. + */ + run->kvm_valid_regs = TEST_SYNC_FIELDS; + run->kvm_dirty_regs = 0; + run->s.regs.regs.rbx = 0xDEADBEEF; + rv = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Unexpected exit reason: %u (%s),\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + TEST_ASSERT(run->s.regs.regs.rbx != 0xDEADBEEF, + "rbx sync regs value incorrect 0x%llx.", + run->s.regs.regs.rbx); + + /* Clear kvm_valid_regs bits and kvm_dirty_bits. + * Verify s.regs values are not overwritten with existing guest values + * and that guest values are not overwritten with kvm_sync_regs values. + */ + run->kvm_valid_regs = 0; + run->kvm_dirty_regs = 0; + run->s.regs.regs.rbx = 0xAAAA; + regs.rbx = 0xBAC0; + vcpu_regs_set(vm, VCPU_ID, ®s); + rv = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Unexpected exit reason: %u (%s),\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + TEST_ASSERT(run->s.regs.regs.rbx == 0xAAAA, + "rbx sync regs value incorrect 0x%llx.", + run->s.regs.regs.rbx); + vcpu_regs_get(vm, VCPU_ID, ®s); + TEST_ASSERT(regs.rbx == 0xBAC0 + 1, + "rbx guest value incorrect 0x%llx.", + regs.rbx); + + /* Clear kvm_valid_regs bits. Verify s.regs values are not overwritten + * with existing guest values but that guest values are overwritten + * with kvm_sync_regs values. + */ + run->kvm_valid_regs = 0; + run->kvm_dirty_regs = TEST_SYNC_FIELDS; + run->s.regs.regs.rbx = 0xBBBB; + rv = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Unexpected exit reason: %u (%s),\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + TEST_ASSERT(run->s.regs.regs.rbx == 0xBBBB, + "rbx sync regs value incorrect 0x%llx.", + run->s.regs.regs.rbx); + vcpu_regs_get(vm, VCPU_ID, ®s); + TEST_ASSERT(regs.rbx == 0xBBBB + 1, + "rbx guest value incorrect 0x%llx.", + regs.rbx); + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c new file mode 100644 index 000000000..f8e761149 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Tests for MSR_IA32_TSC and MSR_IA32_TSC_ADJUST. + * + * Copyright (C) 2020, Red Hat, Inc. + */ +#include +#include +#include "kvm_util.h" +#include "processor.h" + +#define VCPU_ID 0 + +#define UNITY (1ull << 30) +#define HOST_ADJUST (UNITY * 64) +#define GUEST_STEP (UNITY * 4) +#define ROUND(x) ((x + UNITY / 2) & -UNITY) +#define rounded_rdmsr(x) ROUND(rdmsr(x)) +#define rounded_host_rdmsr(x) ROUND(vcpu_get_msr(vm, 0, x)) + +#define GUEST_ASSERT_EQ(a, b) do { \ + __typeof(a) _a = (a); \ + __typeof(b) _b = (b); \ + if (_a != _b) \ + ucall(UCALL_ABORT, 4, \ + "Failed guest assert: " \ + #a " == " #b, __LINE__, _a, _b); \ + } while(0) + +static void guest_code(void) +{ + u64 val = 0; + + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* Guest: writes to MSR_IA32_TSC affect both MSRs. */ + val = 1ull * GUEST_STEP; + wrmsr(MSR_IA32_TSC, val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* Guest: writes to MSR_IA32_TSC_ADJUST affect both MSRs. */ + GUEST_SYNC(2); + val = 2ull * GUEST_STEP; + wrmsr(MSR_IA32_TSC_ADJUST, val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* Host: setting the TSC offset. */ + GUEST_SYNC(3); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* + * Guest: writes to MSR_IA32_TSC_ADJUST do not destroy the + * host-side offset and affect both MSRs. + */ + GUEST_SYNC(4); + val = 3ull * GUEST_STEP; + wrmsr(MSR_IA32_TSC_ADJUST, val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* + * Guest: writes to MSR_IA32_TSC affect both MSRs, so the host-side + * offset is now visible in MSR_IA32_TSC_ADJUST. + */ + GUEST_SYNC(5); + val = 4ull * GUEST_STEP; + wrmsr(MSR_IA32_TSC, val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val - HOST_ADJUST); + + GUEST_DONE(); +} + +static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage) +{ + struct ucall uc; + + vcpu_args_set(vm, vcpuid, 1, vcpuid); + + vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL); + + switch (get_ucall(vm, vcpuid, &uc)) { + case UCALL_SYNC: + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx", + stage + 1, (ulong)uc.args[1]); + return; + case UCALL_DONE: + return; + case UCALL_ABORT: + TEST_ASSERT(false, "%s at %s:%ld\n" \ + "\tvalues: %#lx, %#lx", (const char *)uc.args[0], + __FILE__, uc.args[1], uc.args[2], uc.args[3]); + default: + TEST_ASSERT(false, "Unexpected exit: %s", + exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason)); + } +} + +int main(void) +{ + struct kvm_vm *vm; + uint64_t val; + + vm = vm_create_default(VCPU_ID, 0, guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + + val = 0; + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* Guest: writes to MSR_IA32_TSC affect both MSRs. */ + run_vcpu(vm, VCPU_ID, 1); + val = 1ull * GUEST_STEP; + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* Guest: writes to MSR_IA32_TSC_ADJUST affect both MSRs. */ + run_vcpu(vm, VCPU_ID, 2); + val = 2ull * GUEST_STEP; + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* + * Host: writes to MSR_IA32_TSC set the host-side offset + * and therefore do not change MSR_IA32_TSC_ADJUST. + */ + vcpu_set_msr(vm, 0, MSR_IA32_TSC, HOST_ADJUST + val); + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + run_vcpu(vm, VCPU_ID, 3); + + /* Host: writes to MSR_IA32_TSC_ADJUST do not modify the TSC. */ + vcpu_set_msr(vm, 0, MSR_IA32_TSC_ADJUST, UNITY * 123456); + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + ASSERT_EQ(vcpu_get_msr(vm, 0, MSR_IA32_TSC_ADJUST), UNITY * 123456); + + /* Restore previous value. */ + vcpu_set_msr(vm, 0, MSR_IA32_TSC_ADJUST, val); + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* + * Guest: writes to MSR_IA32_TSC_ADJUST do not destroy the + * host-side offset and affect both MSRs. + */ + run_vcpu(vm, VCPU_ID, 4); + val = 3ull * GUEST_STEP; + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* + * Guest: writes to MSR_IA32_TSC affect both MSRs, so the host-side + * offset is now visible in MSR_IA32_TSC_ADJUST. + */ + run_vcpu(vm, VCPU_ID, 5); + val = 4ull * GUEST_STEP; + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val - HOST_ADJUST); + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/user_msr_test.c b/tools/testing/selftests/kvm/x86_64/user_msr_test.c new file mode 100644 index 000000000..cbe1b0889 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/user_msr_test.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * tests for KVM_CAP_X86_USER_SPACE_MSR and KVM_X86_SET_MSR_FILTER + * + * Copyright (C) 2020, Amazon Inc. + * + * This is a functional test to verify that we can deflect MSR events + * into user space. + */ +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include +#include +#include +#include + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" + +#define VCPU_ID 5 + +static u32 msr_reads, msr_writes; + +static u8 bitmap_00000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; +static u8 bitmap_00000000_write[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; +static u8 bitmap_40000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; +static u8 bitmap_c0000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; +static u8 bitmap_c0000000_read[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; +static u8 bitmap_deadbeef[1] = { 0x1 }; + +static void deny_msr(uint8_t *bitmap, u32 msr) +{ + u32 idx = msr & (KVM_MSR_FILTER_MAX_BITMAP_SIZE - 1); + + bitmap[idx / 8] &= ~(1 << (idx % 8)); +} + +static void prepare_bitmaps(void) +{ + memset(bitmap_00000000, 0xff, sizeof(bitmap_00000000)); + memset(bitmap_00000000_write, 0xff, sizeof(bitmap_00000000_write)); + memset(bitmap_40000000, 0xff, sizeof(bitmap_40000000)); + memset(bitmap_c0000000, 0xff, sizeof(bitmap_c0000000)); + memset(bitmap_c0000000_read, 0xff, sizeof(bitmap_c0000000_read)); + + deny_msr(bitmap_00000000_write, MSR_IA32_POWER_CTL); + deny_msr(bitmap_c0000000_read, MSR_SYSCALL_MASK); + deny_msr(bitmap_c0000000_read, MSR_GS_BASE); +} + +struct kvm_msr_filter filter = { + .flags = KVM_MSR_FILTER_DEFAULT_DENY, + .ranges = { + { + .flags = KVM_MSR_FILTER_READ, + .base = 0x00000000, + .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, + .bitmap = bitmap_00000000, + }, { + .flags = KVM_MSR_FILTER_WRITE, + .base = 0x00000000, + .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, + .bitmap = bitmap_00000000_write, + }, { + .flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE, + .base = 0x40000000, + .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, + .bitmap = bitmap_40000000, + }, { + .flags = KVM_MSR_FILTER_READ, + .base = 0xc0000000, + .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, + .bitmap = bitmap_c0000000_read, + }, { + .flags = KVM_MSR_FILTER_WRITE, + .base = 0xc0000000, + .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, + .bitmap = bitmap_c0000000, + }, { + .flags = KVM_MSR_FILTER_WRITE | KVM_MSR_FILTER_READ, + .base = 0xdeadbeef, + .nmsrs = 1, + .bitmap = bitmap_deadbeef, + }, + }, +}; + +struct kvm_msr_filter no_filter = { + .flags = KVM_MSR_FILTER_DEFAULT_ALLOW, +}; + +static void guest_msr_calls(bool trapped) +{ + /* This goes into the in-kernel emulation */ + wrmsr(MSR_SYSCALL_MASK, 0); + + if (trapped) { + /* This goes into user space emulation */ + GUEST_ASSERT(rdmsr(MSR_SYSCALL_MASK) == MSR_SYSCALL_MASK); + GUEST_ASSERT(rdmsr(MSR_GS_BASE) == MSR_GS_BASE); + } else { + GUEST_ASSERT(rdmsr(MSR_SYSCALL_MASK) != MSR_SYSCALL_MASK); + GUEST_ASSERT(rdmsr(MSR_GS_BASE) != MSR_GS_BASE); + } + + /* If trapped == true, this goes into user space emulation */ + wrmsr(MSR_IA32_POWER_CTL, 0x1234); + + /* This goes into the in-kernel emulation */ + rdmsr(MSR_IA32_POWER_CTL); + + /* Invalid MSR, should always be handled by user space exit */ + GUEST_ASSERT(rdmsr(0xdeadbeef) == 0xdeadbeef); + wrmsr(0xdeadbeef, 0x1234); +} + +static void guest_code(void) +{ + guest_msr_calls(true); + + /* + * Disable msr filtering, so that the kernel + * handles everything in the next round + */ + GUEST_SYNC(0); + + guest_msr_calls(false); + + GUEST_DONE(); +} + +static int handle_ucall(struct kvm_vm *vm) +{ + struct ucall uc; + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("Guest assertion not met"); + break; + case UCALL_SYNC: + vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &no_filter); + break; + case UCALL_DONE: + return 1; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + return 0; +} + +static void handle_rdmsr(struct kvm_run *run) +{ + run->msr.data = run->msr.index; + msr_reads++; + + if (run->msr.index == MSR_SYSCALL_MASK || + run->msr.index == MSR_GS_BASE) { + TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER, + "MSR read trap w/o access fault"); + } + + if (run->msr.index == 0xdeadbeef) { + TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_UNKNOWN, + "MSR deadbeef read trap w/o inval fault"); + } +} + +static void handle_wrmsr(struct kvm_run *run) +{ + /* ignore */ + msr_writes++; + + if (run->msr.index == MSR_IA32_POWER_CTL) { + TEST_ASSERT(run->msr.data == 0x1234, + "MSR data for MSR_IA32_POWER_CTL incorrect"); + TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER, + "MSR_IA32_POWER_CTL trap w/o access fault"); + } + + if (run->msr.index == 0xdeadbeef) { + TEST_ASSERT(run->msr.data == 0x1234, + "MSR data for deadbeef incorrect"); + TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_UNKNOWN, + "deadbeef trap w/o inval fault"); + } +} + +int main(int argc, char *argv[]) +{ + struct kvm_enable_cap cap = { + .cap = KVM_CAP_X86_USER_SPACE_MSR, + .args[0] = KVM_MSR_EXIT_REASON_INVAL | + KVM_MSR_EXIT_REASON_UNKNOWN | + KVM_MSR_EXIT_REASON_FILTER, + }; + struct kvm_vm *vm; + struct kvm_run *run; + int rc; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + run = vcpu_state(vm, VCPU_ID); + + rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); + TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); + vm_enable_cap(vm, &cap); + + rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER); + TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available"); + + prepare_bitmaps(); + vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter); + + while (1) { + rc = _vcpu_run(vm, VCPU_ID); + + TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc); + + switch (run->exit_reason) { + case KVM_EXIT_X86_RDMSR: + handle_rdmsr(run); + break; + case KVM_EXIT_X86_WRMSR: + handle_wrmsr(run); + break; + case KVM_EXIT_IO: + if (handle_ucall(vm)) + goto done; + break; + } + + } + +done: + TEST_ASSERT(msr_reads == 4, "Handled 4 rdmsr in user space"); + TEST_ASSERT(msr_writes == 3, "Handled 3 wrmsr in user space"); + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c new file mode 100644 index 000000000..1f65342d6 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vmx_apic_access_test + * + * Copyright (C) 2020, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * The first subtest simply checks to see that an L2 guest can be + * launched with a valid APIC-access address that is backed by a + * page of L1 physical memory. + * + * The second subtest sets the APIC-access address to a (valid) L1 + * physical address that is not backed by memory. KVM can't handle + * this situation, so resuming L2 should result in a KVM exit for + * internal error (emulation). This is not an architectural + * requirement. It is just a shortcoming of KVM. The internal error + * is unfortunate, but it's better than what used to happen! + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#include +#include + +#include "kselftest.h" + +#define VCPU_ID 0 + +/* The virtual machine object. */ +static struct kvm_vm *vm; + +static void l2_guest_code(void) +{ + /* Exit to L1 */ + __asm__ __volatile__("vmcall"); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages, unsigned long high_gpa) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uint32_t control; + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + /* Prepare the VMCS for L2 execution. */ + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); + control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); + control = vmreadz(SECONDARY_VM_EXEC_CONTROL); + control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + vmwrite(SECONDARY_VM_EXEC_CONTROL, control); + vmwrite(APIC_ACCESS_ADDR, vmx_pages->apic_access_gpa); + + /* Try to launch L2 with the memory-backed APIC-access address. */ + GUEST_SYNC(vmreadz(APIC_ACCESS_ADDR)); + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + vmwrite(APIC_ACCESS_ADDR, high_gpa); + + /* Try to resume L2 with the unbacked APIC-access address. */ + GUEST_SYNC(vmreadz(APIC_ACCESS_ADDR)); + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + unsigned long apic_access_addr = ~0ul; + unsigned int paddr_width; + unsigned int vaddr_width; + vm_vaddr_t vmx_pages_gva; + unsigned long high_gpa; + struct vmx_pages *vmx; + bool done = false; + + nested_vmx_check_supported(); + + vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + + kvm_get_cpu_address_width(&paddr_width, &vaddr_width); + high_gpa = (1ul << paddr_width) - getpagesize(); + if ((unsigned long)DEFAULT_GUEST_PHY_PAGES * getpagesize() > high_gpa) { + print_skip("No unbacked physical page available"); + exit(KSFT_SKIP); + } + + vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); + prepare_virtualize_apic_accesses(vmx, vm, 0); + vcpu_args_set(vm, VCPU_ID, 2, vmx_pages_gva, high_gpa); + + while (!done) { + volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + vcpu_run(vm, VCPU_ID); + if (apic_access_addr == high_gpa) { + TEST_ASSERT(run->exit_reason == + KVM_EXIT_INTERNAL_ERROR, + "Got exit reason other than KVM_EXIT_INTERNAL_ERROR: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + TEST_ASSERT(run->internal.suberror == + KVM_INTERNAL_ERROR_EMULATION, + "Got internal suberror other than KVM_INTERNAL_ERROR_EMULATION: %u\n", + run->internal.suberror); + break; + } + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); + /* NOT REACHED */ + case UCALL_SYNC: + apic_access_addr = uc.args[1]; + break; + case UCALL_DONE: + done = true; + break; + default: + TEST_ASSERT(false, "Unknown ucall %lu", uc.cmd); + } + } + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c new file mode 100644 index 000000000..fe40ade06 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vmx_close_while_nested + * + * Copyright (C) 2019, Red Hat, Inc. + * + * Verify that nothing bad happens if a KVM user exits with open + * file descriptors while executing a nested guest. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#include +#include + +#include "kselftest.h" + +#define VCPU_ID 5 + +enum { + PORT_L0_EXIT = 0x2000, +}; + +/* The virtual machine object. */ +static struct kvm_vm *vm; + +static void l2_guest_code(void) +{ + /* Exit to L0 */ + asm volatile("inb %%dx, %%al" + : : [port] "d" (PORT_L0_EXIT) : "rax"); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + /* Prepare the VMCS for L2 execution. */ + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(0); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva; + + nested_vmx_check_supported(); + + vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + + /* Allocate VMX pages and shared descriptors (vmx_pages). */ + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + + for (;;) { + volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + if (run->io.port == PORT_L0_EXIT) + break; + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s", (const char *)uc.args[0]); + /* NOT REACHED */ + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } +} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c new file mode 100644 index 000000000..e894a638a --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM dirty page logging test + * + * Copyright (C) 2018, Red Hat, Inc. + */ + +#define _GNU_SOURCE /* for program_invocation_name */ + +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#define VCPU_ID 1 + +/* The memory slot index to track dirty pages */ +#define TEST_MEM_SLOT_INDEX 1 +#define TEST_MEM_PAGES 3 + +/* L1 guest test virtual memory offset */ +#define GUEST_TEST_MEM 0xc0000000 + +/* L2 guest test virtual memory offset */ +#define NESTED_TEST_MEM1 0xc0001000 +#define NESTED_TEST_MEM2 0xc0002000 + +static void l2_guest_code(void) +{ + *(volatile uint64_t *)NESTED_TEST_MEM1; + *(volatile uint64_t *)NESTED_TEST_MEM1 = 1; + GUEST_SYNC(true); + GUEST_SYNC(false); + + *(volatile uint64_t *)NESTED_TEST_MEM2 = 1; + GUEST_SYNC(true); + *(volatile uint64_t *)NESTED_TEST_MEM2 = 1; + GUEST_SYNC(true); + GUEST_SYNC(false); + + /* Exit to L1 and never come back. */ + vmcall(); +} + +void l1_guest_code(struct vmx_pages *vmx) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + GUEST_ASSERT(vmx->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx)); + GUEST_ASSERT(load_vmcs(vmx)); + + prepare_vmcs(vmx, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_SYNC(false); + GUEST_ASSERT(!vmlaunch()); + GUEST_SYNC(false); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva = 0; + struct vmx_pages *vmx; + unsigned long *bmap; + uint64_t *host_test_mem; + + struct kvm_vm *vm; + struct kvm_run *run; + struct ucall uc; + bool done = false; + + nested_vmx_check_supported(); + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, l1_guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + run = vcpu_state(vm, VCPU_ID); + + /* Add an extra memory slot for testing dirty logging */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + GUEST_TEST_MEM, + TEST_MEM_SLOT_INDEX, + TEST_MEM_PAGES, + KVM_MEM_LOG_DIRTY_PAGES); + + /* + * Add an identity map for GVA range [0xc0000000, 0xc0002000). This + * affects both L1 and L2. However... + */ + virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES, 0); + + /* + * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to + * 0xc0000000. + * + * Note that prepare_eptp should be called only L1's GPA map is done, + * meaning after the last call to virt_map. + */ + prepare_eptp(vmx, vm, 0); + nested_map_memslot(vmx, vm, 0, 0); + nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096, 0); + nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096, 0); + + bmap = bitmap_alloc(TEST_MEM_PAGES); + host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); + + while (!done) { + memset(host_test_mem, 0xaa, TEST_MEM_PAGES * 4096); + _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Unexpected exit reason: %u (%s),\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); + /* NOT REACHED */ + case UCALL_SYNC: + /* + * The nested guest wrote at offset 0x1000 in the memslot, but the + * dirty bitmap must be filled in according to L1 GPA, not L2. + */ + kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); + if (uc.args[1]) { + TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean\n"); + TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest\n"); + } else { + TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty\n"); + TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest\n"); + } + + TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty\n"); + TEST_ASSERT(host_test_mem[4096 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest\n"); + TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty\n"); + TEST_ASSERT(host_test_mem[8192 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest\n"); + break; + case UCALL_DONE: + done = true; + break; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } +} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c new file mode 100644 index 000000000..a7737af12 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c @@ -0,0 +1,259 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VMX-preemption timer test + * + * Copyright (C) 2020, Google, LLC. + * + * Test to ensure the VM-Enter after migration doesn't + * incorrectly restarts the timer with the full timer + * value instead of partially decayed timer value + * + */ +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include +#include +#include +#include + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#define VCPU_ID 5 +#define PREEMPTION_TIMER_VALUE 100000000ull +#define PREEMPTION_TIMER_VALUE_THRESHOLD1 80000000ull + +u32 vmx_pt_rate; +bool l2_save_restore_done; +static u64 l2_vmx_pt_start; +volatile u64 l2_vmx_pt_finish; + +union vmx_basic basic; +union vmx_ctrl_msr ctrl_pin_rev; +union vmx_ctrl_msr ctrl_exit_rev; + +void l2_guest_code(void) +{ + u64 vmx_pt_delta; + + vmcall(); + l2_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate; + + /* + * Wait until the 1st threshold has passed + */ + do { + l2_vmx_pt_finish = rdtsc(); + vmx_pt_delta = (l2_vmx_pt_finish - l2_vmx_pt_start) >> + vmx_pt_rate; + } while (vmx_pt_delta < PREEMPTION_TIMER_VALUE_THRESHOLD1); + + /* + * Force L2 through Save and Restore cycle + */ + GUEST_SYNC(1); + + l2_save_restore_done = 1; + + /* + * Now wait for the preemption timer to fire and + * exit to L1 + */ + while ((l2_vmx_pt_finish = rdtsc())) + ; +} + +void l1_guest_code(struct vmx_pages *vmx_pages) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + u64 l1_vmx_pt_start; + u64 l1_vmx_pt_finish; + u64 l1_tsc_deadline, l2_tsc_deadline; + + GUEST_ASSERT(vmx_pages->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); + + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* + * Check for Preemption timer support + */ + basic.val = rdmsr(MSR_IA32_VMX_BASIC); + ctrl_pin_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_PINBASED_CTLS + : MSR_IA32_VMX_PINBASED_CTLS); + ctrl_exit_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_EXIT_CTLS + : MSR_IA32_VMX_EXIT_CTLS); + + if (!(ctrl_pin_rev.clr & PIN_BASED_VMX_PREEMPTION_TIMER) || + !(ctrl_exit_rev.clr & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) + return; + + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + vmreadz(VM_EXIT_INSTRUCTION_LEN)); + + /* + * Turn on PIN control and resume the guest + */ + GUEST_ASSERT(!vmwrite(PIN_BASED_VM_EXEC_CONTROL, + vmreadz(PIN_BASED_VM_EXEC_CONTROL) | + PIN_BASED_VMX_PREEMPTION_TIMER)); + + GUEST_ASSERT(!vmwrite(VMX_PREEMPTION_TIMER_VALUE, + PREEMPTION_TIMER_VALUE)); + + vmx_pt_rate = rdmsr(MSR_IA32_VMX_MISC) & 0x1F; + + l2_save_restore_done = 0; + + l1_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate; + + GUEST_ASSERT(!vmresume()); + + l1_vmx_pt_finish = rdtsc(); + + /* + * Ensure exit from L2 happens after L2 goes through + * save and restore + */ + GUEST_ASSERT(l2_save_restore_done); + + /* + * Ensure the exit from L2 is due to preemption timer expiry + */ + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_PREEMPTION_TIMER); + + l1_tsc_deadline = l1_vmx_pt_start + + (PREEMPTION_TIMER_VALUE << vmx_pt_rate); + + l2_tsc_deadline = l2_vmx_pt_start + + (PREEMPTION_TIMER_VALUE << vmx_pt_rate); + + /* + * Sync with the host and pass the l1|l2 pt_expiry_finish times and + * tsc deadlines so that host can verify they are as expected + */ + GUEST_SYNC_ARGS(2, l1_vmx_pt_finish, l1_tsc_deadline, + l2_vmx_pt_finish, l2_tsc_deadline); +} + +void guest_code(struct vmx_pages *vmx_pages) +{ + if (vmx_pages) + l1_guest_code(vmx_pages); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva = 0; + + struct kvm_regs regs1, regs2; + struct kvm_vm *vm; + struct kvm_run *run; + struct kvm_x86_state *state; + struct ucall uc; + int stage; + + /* + * AMD currently does not implement any VMX features, so for now we + * just early out. + */ + nested_vmx_check_supported(); + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + run = vcpu_state(vm, VCPU_ID); + + vcpu_regs_get(vm, VCPU_ID, ®s1); + + if (kvm_check_cap(KVM_CAP_NESTED_STATE)) { + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + } else { + pr_info("will skip vmx preemption timer checks\n"); + goto done; + } + + for (stage = 1;; stage++) { + _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Stage %d: unexpected exit reason: %u (%s),\n", + stage, run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + /* UCALL_SYNC is handled here. */ + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", + stage, (ulong)uc.args[1]); + /* + * If this stage 2 then we should verify the vmx pt expiry + * is as expected. + * From L1's perspective verify Preemption timer hasn't + * expired too early. + * From L2's perspective verify Preemption timer hasn't + * expired too late. + */ + if (stage == 2) { + + pr_info("Stage %d: L1 PT expiry TSC (%lu) , L1 TSC deadline (%lu)\n", + stage, uc.args[2], uc.args[3]); + + pr_info("Stage %d: L2 PT expiry TSC (%lu) , L2 TSC deadline (%lu)\n", + stage, uc.args[4], uc.args[5]); + + TEST_ASSERT(uc.args[2] >= uc.args[3], + "Stage %d: L1 PT expiry TSC (%lu) < L1 TSC deadline (%lu)", + stage, uc.args[2], uc.args[3]); + + TEST_ASSERT(uc.args[4] < uc.args[5], + "Stage %d: L2 PT expiry TSC (%lu) > L2 TSC deadline (%lu)", + stage, uc.args[4], uc.args[5]); + } + + state = vcpu_save_state(vm, VCPU_ID); + memset(®s1, 0, sizeof(regs1)); + vcpu_regs_get(vm, VCPU_ID, ®s1); + + kvm_vm_release(vm); + + /* Restore state in a new VM. */ + kvm_vm_restart(vm, O_RDWR); + vm_vcpu_add(vm, VCPU_ID); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vcpu_load_state(vm, VCPU_ID, state); + run = vcpu_state(vm, VCPU_ID); + free(state); + + memset(®s2, 0, sizeof(regs2)); + vcpu_regs_get(vm, VCPU_ID, ®s2); + TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), + "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", + (ulong) regs2.rdi, (ulong) regs2.rsi); + } + +done: + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c new file mode 100644 index 000000000..d59f3eb67 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c @@ -0,0 +1,298 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vmx_set_nested_state_test + * + * Copyright (C) 2019, Google LLC. + * + * This test verifies the integrity of calling the ioctl KVM_SET_NESTED_STATE. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#include +#include +#include +#include +#include + +/* + * Mirror of VMCS12_REVISION in arch/x86/kvm/vmx/vmcs12.h. If that value + * changes this should be updated. + */ +#define VMCS12_REVISION 0x11e57ed0 +#define VCPU_ID 5 + +bool have_evmcs; + +void test_nested_state(struct kvm_vm *vm, struct kvm_nested_state *state) +{ + vcpu_nested_state_set(vm, VCPU_ID, state, false); +} + +void test_nested_state_expect_errno(struct kvm_vm *vm, + struct kvm_nested_state *state, + int expected_errno) +{ + int rv; + + rv = vcpu_nested_state_set(vm, VCPU_ID, state, true); + TEST_ASSERT(rv == -1 && errno == expected_errno, + "Expected %s (%d) from vcpu_nested_state_set but got rv: %i errno: %s (%d)", + strerror(expected_errno), expected_errno, rv, strerror(errno), + errno); +} + +void test_nested_state_expect_einval(struct kvm_vm *vm, + struct kvm_nested_state *state) +{ + test_nested_state_expect_errno(vm, state, EINVAL); +} + +void test_nested_state_expect_efault(struct kvm_vm *vm, + struct kvm_nested_state *state) +{ + test_nested_state_expect_errno(vm, state, EFAULT); +} + +void set_revision_id_for_vmcs12(struct kvm_nested_state *state, + u32 vmcs12_revision) +{ + /* Set revision_id in vmcs12 to vmcs12_revision. */ + memcpy(&state->data, &vmcs12_revision, sizeof(u32)); +} + +void set_default_state(struct kvm_nested_state *state) +{ + memset(state, 0, sizeof(*state)); + state->flags = KVM_STATE_NESTED_RUN_PENDING | + KVM_STATE_NESTED_GUEST_MODE; + state->format = 0; + state->size = sizeof(*state); +} + +void set_default_vmx_state(struct kvm_nested_state *state, int size) +{ + memset(state, 0, size); + if (have_evmcs) + state->flags = KVM_STATE_NESTED_EVMCS; + state->format = 0; + state->size = size; + state->hdr.vmx.vmxon_pa = 0x1000; + state->hdr.vmx.vmcs12_pa = 0x2000; + state->hdr.vmx.smm.flags = 0; + set_revision_id_for_vmcs12(state, VMCS12_REVISION); +} + +void test_vmx_nested_state(struct kvm_vm *vm) +{ + /* Add a page for VMCS12. */ + const int state_sz = sizeof(struct kvm_nested_state) + getpagesize(); + struct kvm_nested_state *state = + (struct kvm_nested_state *)malloc(state_sz); + + /* The format must be set to 0. 0 for VMX, 1 for SVM. */ + set_default_vmx_state(state, state_sz); + state->format = 1; + test_nested_state_expect_einval(vm, state); + + /* + * We cannot virtualize anything if the guest does not have VMX + * enabled. + */ + set_default_vmx_state(state, state_sz); + test_nested_state_expect_einval(vm, state); + + /* + * We cannot virtualize anything if the guest does not have VMX + * enabled. We expect KVM_SET_NESTED_STATE to return 0 if vmxon_pa + * is set to -1ull, but the flags must be zero. + */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = -1ull; + test_nested_state_expect_einval(vm, state); + + state->hdr.vmx.vmcs12_pa = -1ull; + state->flags = KVM_STATE_NESTED_EVMCS; + test_nested_state_expect_einval(vm, state); + + state->flags = 0; + test_nested_state(vm, state); + + /* Enable VMX in the guest CPUID. */ + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + + /* + * Setting vmxon_pa == -1ull and vmcs_pa == -1ull exits early without + * setting the nested state but flags other than eVMCS must be clear. + * The eVMCS flag can be set if the enlightened VMCS capability has + * been enabled. + */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = -1ull; + state->hdr.vmx.vmcs12_pa = -1ull; + test_nested_state_expect_einval(vm, state); + + state->flags &= KVM_STATE_NESTED_EVMCS; + if (have_evmcs) { + test_nested_state_expect_einval(vm, state); + vcpu_enable_evmcs(vm, VCPU_ID); + } + test_nested_state(vm, state); + + /* It is invalid to have vmxon_pa == -1ull and SMM flags non-zero. */ + state->hdr.vmx.smm.flags = 1; + test_nested_state_expect_einval(vm, state); + + /* Invalid flags are rejected. */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.flags = ~0; + test_nested_state_expect_einval(vm, state); + + /* It is invalid to have vmxon_pa == -1ull and vmcs_pa != -1ull. */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = -1ull; + state->flags = 0; + test_nested_state_expect_einval(vm, state); + + /* It is invalid to have vmxon_pa set to a non-page aligned address. */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = 1; + test_nested_state_expect_einval(vm, state); + + /* + * It is invalid to have KVM_STATE_NESTED_SMM_GUEST_MODE and + * KVM_STATE_NESTED_GUEST_MODE set together. + */ + set_default_vmx_state(state, state_sz); + state->flags = KVM_STATE_NESTED_GUEST_MODE | + KVM_STATE_NESTED_RUN_PENDING; + state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE; + test_nested_state_expect_einval(vm, state); + + /* + * It is invalid to have any of the SMM flags set besides: + * KVM_STATE_NESTED_SMM_GUEST_MODE + * KVM_STATE_NESTED_SMM_VMXON + */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.smm.flags = ~(KVM_STATE_NESTED_SMM_GUEST_MODE | + KVM_STATE_NESTED_SMM_VMXON); + test_nested_state_expect_einval(vm, state); + + /* Outside SMM, SMM flags must be zero. */ + set_default_vmx_state(state, state_sz); + state->flags = 0; + state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE; + test_nested_state_expect_einval(vm, state); + + /* + * Size must be large enough to fit kvm_nested_state and vmcs12 + * if VMCS12 physical address is set + */ + set_default_vmx_state(state, state_sz); + state->size = sizeof(*state); + state->flags = 0; + test_nested_state_expect_einval(vm, state); + + set_default_vmx_state(state, state_sz); + state->size = sizeof(*state); + state->flags = 0; + state->hdr.vmx.vmcs12_pa = -1; + test_nested_state(vm, state); + + /* + * KVM_SET_NESTED_STATE succeeds with invalid VMCS + * contents but L2 not running. + */ + set_default_vmx_state(state, state_sz); + state->flags = 0; + test_nested_state(vm, state); + + /* Invalid flags are rejected, even if no VMCS loaded. */ + set_default_vmx_state(state, state_sz); + state->size = sizeof(*state); + state->flags = 0; + state->hdr.vmx.vmcs12_pa = -1; + state->hdr.vmx.flags = ~0; + test_nested_state_expect_einval(vm, state); + + /* vmxon_pa cannot be the same address as vmcs_pa. */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = 0; + state->hdr.vmx.vmcs12_pa = 0; + test_nested_state_expect_einval(vm, state); + + /* + * Test that if we leave nesting the state reflects that when we get + * it again. + */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = -1ull; + state->hdr.vmx.vmcs12_pa = -1ull; + state->flags = 0; + test_nested_state(vm, state); + vcpu_nested_state_get(vm, VCPU_ID, state); + TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz, + "Size must be between %ld and %d. The size returned was %d.", + sizeof(*state), state_sz, state->size); + TEST_ASSERT(state->hdr.vmx.vmxon_pa == -1ull, "vmxon_pa must be -1ull."); + TEST_ASSERT(state->hdr.vmx.vmcs12_pa == -1ull, "vmcs_pa must be -1ull."); + + free(state); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_nested_state state; + + have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS); + + if (!kvm_check_cap(KVM_CAP_NESTED_STATE)) { + print_skip("KVM_CAP_NESTED_STATE not available"); + exit(KSFT_SKIP); + } + + /* + * AMD currently does not implement set_nested_state, so for now we + * just early out. + */ + nested_vmx_check_supported(); + + vm = vm_create_default(VCPU_ID, 0, 0); + + /* Passing a NULL kvm_nested_state causes a EFAULT. */ + test_nested_state_expect_efault(vm, NULL); + + /* 'size' cannot be smaller than sizeof(kvm_nested_state). */ + set_default_state(&state); + state.size = 0; + test_nested_state_expect_einval(vm, &state); + + /* + * Setting the flags 0xf fails the flags check. The only flags that + * can be used are: + * KVM_STATE_NESTED_GUEST_MODE + * KVM_STATE_NESTED_RUN_PENDING + * KVM_STATE_NESTED_EVMCS + */ + set_default_state(&state); + state.flags = 0xf; + test_nested_state_expect_einval(vm, &state); + + /* + * If KVM_STATE_NESTED_RUN_PENDING is set then + * KVM_STATE_NESTED_GUEST_MODE has to be set as well. + */ + set_default_state(&state); + state.flags = KVM_STATE_NESTED_RUN_PENDING; + test_nested_state_expect_einval(vm, &state); + + test_vmx_nested_state(vm); + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c new file mode 100644 index 000000000..fbe8417cb --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vmx_tsc_adjust_test + * + * Copyright (C) 2018, Google LLC. + * + * IA32_TSC_ADJUST test + * + * According to the SDM, "if an execution of WRMSR to the + * IA32_TIME_STAMP_COUNTER MSR adds (or subtracts) value X from the TSC, + * the logical processor also adds (or subtracts) value X from the + * IA32_TSC_ADJUST MSR. + * + * Note that when L1 doesn't intercept writes to IA32_TSC, a + * WRMSR(IA32_TSC) from L2 sets L1's TSC value, not L2's perceived TSC + * value. + * + * This test verifies that this unusual case is handled correctly. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#include +#include + +#include "kselftest.h" + +#ifndef MSR_IA32_TSC_ADJUST +#define MSR_IA32_TSC_ADJUST 0x3b +#endif + +#define PAGE_SIZE 4096 +#define VCPU_ID 5 + +#define TSC_ADJUST_VALUE (1ll << 32) +#define TSC_OFFSET_VALUE -(1ll << 48) + +enum { + PORT_ABORT = 0x1000, + PORT_REPORT, + PORT_DONE, +}; + +enum { + VMXON_PAGE = 0, + VMCS_PAGE, + MSR_BITMAP_PAGE, + + NUM_VMX_PAGES, +}; + +struct kvm_single_msr { + struct kvm_msrs header; + struct kvm_msr_entry entry; +} __attribute__((packed)); + +/* The virtual machine object. */ +static struct kvm_vm *vm; + +static void check_ia32_tsc_adjust(int64_t max) +{ + int64_t adjust; + + adjust = rdmsr(MSR_IA32_TSC_ADJUST); + GUEST_SYNC(adjust); + GUEST_ASSERT(adjust <= max); +} + +static void l2_guest_code(void) +{ + uint64_t l1_tsc = rdtsc() - TSC_OFFSET_VALUE; + + wrmsr(MSR_IA32_TSC, l1_tsc - TSC_ADJUST_VALUE); + check_ia32_tsc_adjust(-2 * TSC_ADJUST_VALUE); + + /* Exit to L1 */ + __asm__ __volatile__("vmcall"); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uint32_t control; + uintptr_t save_cr3; + + GUEST_ASSERT(rdtsc() < TSC_ADJUST_VALUE); + wrmsr(MSR_IA32_TSC, rdtsc() - TSC_ADJUST_VALUE); + check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE); + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + /* Prepare the VMCS for L2 execution. */ + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); + control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING; + vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); + vmwrite(TSC_OFFSET, TSC_OFFSET_VALUE); + + /* Jump into L2. First, test failure to load guest CR3. */ + save_cr3 = vmreadz(GUEST_CR3); + vmwrite(GUEST_CR3, -1ull); + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == + (EXIT_REASON_FAILED_VMENTRY | EXIT_REASON_INVALID_STATE)); + check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE); + vmwrite(GUEST_CR3, save_cr3); + + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + check_ia32_tsc_adjust(-2 * TSC_ADJUST_VALUE); + + GUEST_DONE(); +} + +static void report(int64_t val) +{ + pr_info("IA32_TSC_ADJUST is %ld (%lld * TSC_ADJUST_VALUE + %lld).\n", + val, val / TSC_ADJUST_VALUE, val % TSC_ADJUST_VALUE); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva; + + nested_vmx_check_supported(); + + vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + + /* Allocate VMX pages and shared descriptors (vmx_pages). */ + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + + for (;;) { + volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s", (const char *)uc.args[0]); + /* NOT REACHED */ + case UCALL_SYNC: + report(uc.args[1]); + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + + kvm_vm_free(vm); +done: + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c new file mode 100644 index 000000000..352937674 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019, Google LLC. + * + * Tests for the IA32_XSS MSR. + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "vmx.h" + +#define VCPU_ID 1 +#define MSR_BITS 64 + +#define X86_FEATURE_XSAVES (1<<3) + +bool is_supported_msr(u32 msr_index) +{ + struct kvm_msr_list *list; + bool found = false; + int i; + + list = kvm_get_msr_index_list(); + for (i = 0; i < list->nmsrs; ++i) { + if (list->indices[i] == msr_index) { + found = true; + break; + } + } + + free(list); + return found; +} + +int main(int argc, char *argv[]) +{ + struct kvm_cpuid_entry2 *entry; + bool xss_supported = false; + struct kvm_vm *vm; + uint64_t xss_val; + int i, r; + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, 0); + + if (kvm_get_cpuid_max_basic() >= 0xd) { + entry = kvm_get_supported_cpuid_index(0xd, 1); + xss_supported = entry && !!(entry->eax & X86_FEATURE_XSAVES); + } + if (!xss_supported) { + print_skip("IA32_XSS is not supported by the vCPU"); + exit(KSFT_SKIP); + } + + xss_val = vcpu_get_msr(vm, VCPU_ID, MSR_IA32_XSS); + TEST_ASSERT(xss_val == 0, + "MSR_IA32_XSS should be initialized to zero\n"); + + vcpu_set_msr(vm, VCPU_ID, MSR_IA32_XSS, xss_val); + /* + * At present, KVM only supports a guest IA32_XSS value of 0. Verify + * that trying to set the guest IA32_XSS to an unsupported value fails. + * Also, in the future when a non-zero value succeeds check that + * IA32_XSS is in the KVM_GET_MSR_INDEX_LIST. + */ + for (i = 0; i < MSR_BITS; ++i) { + r = _vcpu_set_msr(vm, VCPU_ID, MSR_IA32_XSS, 1ull << i); + TEST_ASSERT(r == 0 || is_supported_msr(MSR_IA32_XSS), + "IA32_XSS was able to be set, but was not found in KVM_GET_MSR_INDEX_LIST.\n"); + } + + kvm_vm_free(vm); +} -- cgit v1.2.3