Diffstat:
-rw-r--r--  tools/testing/selftests/kvm/.gitignore | 33
-rw-r--r--  tools/testing/selftests/kvm/Makefile | 144
-rw-r--r--  tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c | 3
-rw-r--r--  tools/testing/selftests/kvm/aarch64/get-reg-list.c | 841
-rw-r--r--  tools/testing/selftests/kvm/config | 3
-rw-r--r--  tools/testing/selftests/kvm/demand_paging_test.c | 498
-rw-r--r--  tools/testing/selftests/kvm/dirty_log_perf_test.c | 376
-rw-r--r--  tools/testing/selftests/kvm/dirty_log_test.c | 639
-rw-r--r--  tools/testing/selftests/kvm/include/aarch64/processor.h | 59
-rw-r--r--  tools/testing/selftests/kvm/include/evmcs.h | 1102
-rw-r--r--  tools/testing/selftests/kvm/include/kvm_util.h | 348
-rw-r--r--  tools/testing/selftests/kvm/include/perf_test_util.h | 198
-rw-r--r--  tools/testing/selftests/kvm/include/s390x/processor.h | 22
-rw-r--r--  tools/testing/selftests/kvm/include/sparsebit.h | 73
-rw-r--r--  tools/testing/selftests/kvm/include/test_util.h | 70
-rw-r--r--  tools/testing/selftests/kvm/include/x86_64/processor.h | 422
-rw-r--r--  tools/testing/selftests/kvm/include/x86_64/svm.h | 297
-rw-r--r--  tools/testing/selftests/kvm/include/x86_64/svm_util.h | 49
-rw-r--r--  tools/testing/selftests/kvm/include/x86_64/vmx.h | 625
-rw-r--r--  tools/testing/selftests/kvm/kvm_create_max_vcpus.c | 96
-rw-r--r--  tools/testing/selftests/kvm/lib/aarch64/processor.c | 356
-rw-r--r--  tools/testing/selftests/kvm/lib/aarch64/ucall.c | 114
-rw-r--r--  tools/testing/selftests/kvm/lib/assert.c | 93
-rw-r--r--  tools/testing/selftests/kvm/lib/elf.c | 196
-rw-r--r--  tools/testing/selftests/kvm/lib/io.c | 157
-rw-r--r--  tools/testing/selftests/kvm/lib/kvm_util.c | 1865
-rw-r--r--  tools/testing/selftests/kvm/lib/kvm_util_internal.h | 113
-rw-r--r--  tools/testing/selftests/kvm/lib/s390x/processor.c | 247
-rw-r--r--  tools/testing/selftests/kvm/lib/s390x/ucall.c | 59
-rw-r--r--  tools/testing/selftests/kvm/lib/sparsebit.c | 2086
-rw-r--r--  tools/testing/selftests/kvm/lib/test_util.c | 111
-rw-r--r--  tools/testing/selftests/kvm/lib/x86_64/handlers.S | 81
-rw-r--r--  tools/testing/selftests/kvm/lib/x86_64/processor.c | 1258
-rw-r--r--  tools/testing/selftests/kvm/lib/x86_64/svm.c | 177
-rw-r--r--  tools/testing/selftests/kvm/lib/x86_64/ucall.c | 59
-rw-r--r--  tools/testing/selftests/kvm/lib/x86_64/vmx.c | 553
-rw-r--r--  tools/testing/selftests/kvm/s390x/memop.c | 166
-rw-r--r--  tools/testing/selftests/kvm/s390x/resets.c | 279
-rw-r--r--  tools/testing/selftests/kvm/s390x/sync_regs_test.c | 193
-rw-r--r--  tools/testing/selftests/kvm/set_memory_region_test.c | 417
-rw-r--r--  tools/testing/selftests/kvm/steal_time.c | 352
-rw-r--r--  tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c | 117
-rw-r--r--  tools/testing/selftests/kvm/x86_64/debug_regs.c | 202
-rw-r--r--  tools/testing/selftests/kvm/x86_64/evmcs_test.c | 166
-rw-r--r--  tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c | 190
-rw-r--r--  tools/testing/selftests/kvm/x86_64/kvm_pv_test.c | 234
-rw-r--r--  tools/testing/selftests/kvm/x86_64/mmio_warning_test.c | 127
-rw-r--r--  tools/testing/selftests/kvm/x86_64/platform_info_test.c | 107
-rw-r--r--  tools/testing/selftests/kvm/x86_64/set_sregs_test.c | 52
-rw-r--r--  tools/testing/selftests/kvm/x86_64/smm_test.c | 164
-rw-r--r--  tools/testing/selftests/kvm/x86_64/state_test.c | 233
-rw-r--r--  tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c | 77
-rw-r--r--  tools/testing/selftests/kvm/x86_64/sync_regs_test.c | 243
-rw-r--r--  tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c | 168
-rw-r--r--  tools/testing/selftests/kvm/x86_64/user_msr_test.c | 248
-rw-r--r--  tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c | 142
-rw-r--r--  tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c | 87
-rw-r--r--  tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c | 157
-rw-r--r--  tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c | 259
-rw-r--r--  tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c | 298
-rw-r--r--  tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c | 168
-rw-r--r--  tools/testing/selftests/kvm/x86_64/xss_msr_test.c | 76
62 files changed, 18345 insertions, 0 deletions
diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
new file mode 100644
index 000000000..7a2c242b7
--- /dev/null
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/aarch64/get-reg-list
+/aarch64/get-reg-list-sve
+/s390x/memop
+/s390x/resets
+/s390x/sync_regs_test
+/x86_64/cr4_cpuid_sync_test
+/x86_64/debug_regs
+/x86_64/evmcs_test
+/x86_64/kvm_pv_test
+/x86_64/hyperv_cpuid
+/x86_64/mmio_warning_test
+/x86_64/platform_info_test
+/x86_64/set_sregs_test
+/x86_64/smm_test
+/x86_64/state_test
+/x86_64/user_msr_test
+/x86_64/vmx_preemption_timer_test
+/x86_64/svm_vmcall_test
+/x86_64/sync_regs_test
+/x86_64/vmx_apic_access_test
+/x86_64/vmx_close_while_nested_test
+/x86_64/vmx_dirty_log_test
+/x86_64/vmx_set_nested_state_test
+/x86_64/vmx_tsc_adjust_test
+/x86_64/xss_msr_test
+/clear_dirty_log_test
+/demand_paging_test
+/dirty_log_test
+/dirty_log_perf_test
+/kvm_create_max_vcpus
+/set_memory_region_test
+/steal_time
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
new file mode 100644
index 000000000..3d14ef777
--- /dev/null
+++ b/tools/testing/selftests/kvm/Makefile
@@ -0,0 +1,144 @@
+# SPDX-License-Identifier: GPL-2.0-only
+include ../../../../scripts/Kbuild.include
+
+all:
+
+top_srcdir = ../../../..
+KSFT_KHDR_INSTALL := 1
+
+# For cross-builds to work, UNAME_M has to map to ARCH and arch specific
+# directories and targets in this Makefile. "uname -m" doesn't map to
+# arch specific sub-directory names.
+#
+# The UNAME_M variable is used to run the compiles pointing to the right arch
+# directories and build the right targets for these supported architectures.
+#
+# TEST_GEN_PROGS and LIBKVM are set using UNAME_M variable.
+# LINUX_TOOL_ARCH_INCLUDE is set using ARCH variable.
+#
+# x86_64 targets are named to include x86_64 as a suffix and directories
+# for includes are in x86_64 sub-directory. s390x and aarch64 follow the
+# same convention. "uname -m" doesn't result in the correct mapping for
+# s390x and aarch64.
+#
+# No change necessary for x86_64
+UNAME_M := $(shell uname -m)
+
+# Set UNAME_M for arm64 compile/install to work
+ifeq ($(ARCH),arm64)
+ UNAME_M := aarch64
+endif
+# Set UNAME_M for s390x compile/install to work
+ifeq ($(ARCH),s390)
+ UNAME_M := s390x
+endif
+
+LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c lib/test_util.c
+LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c lib/x86_64/handlers.S
+LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c
+LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c
+
+TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test
+TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
+TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
+TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
+TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
+TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
+TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
+TEST_GEN_PROGS_x86_64 += x86_64/smm_test
+TEST_GEN_PROGS_x86_64 += x86_64/state_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test
+TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test
+TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
+TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test
+TEST_GEN_PROGS_x86_64 += x86_64/debug_regs
+TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test
+TEST_GEN_PROGS_x86_64 += x86_64/user_msr_test
+TEST_GEN_PROGS_x86_64 += demand_paging_test
+TEST_GEN_PROGS_x86_64 += dirty_log_test
+TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
+TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_x86_64 += set_memory_region_test
+TEST_GEN_PROGS_x86_64 += steal_time
+
+TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list
+TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list-sve
+TEST_GEN_PROGS_aarch64 += demand_paging_test
+TEST_GEN_PROGS_aarch64 += dirty_log_test
+TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_aarch64 += set_memory_region_test
+TEST_GEN_PROGS_aarch64 += steal_time
+
+TEST_GEN_PROGS_s390x = s390x/memop
+TEST_GEN_PROGS_s390x += s390x/resets
+TEST_GEN_PROGS_s390x += s390x/sync_regs_test
+TEST_GEN_PROGS_s390x += demand_paging_test
+TEST_GEN_PROGS_s390x += dirty_log_test
+TEST_GEN_PROGS_s390x += kvm_create_max_vcpus
+TEST_GEN_PROGS_s390x += set_memory_region_test
+
+TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
+LIBKVM += $(LIBKVM_$(UNAME_M))
+
+INSTALL_HDR_PATH = $(top_srcdir)/usr
+LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/
+LINUX_TOOL_INCLUDE = $(top_srcdir)/tools/include
+ifeq ($(ARCH),x86_64)
+LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/x86/include
+else
+LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include
+endif
+CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \
+ -fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \
+ -I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \
+ -I$(<D) -Iinclude/$(UNAME_M) -I..
+
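+# Probe whether the toolchain accepts -no-pie; newer compilers default to
+# PIE, which the tests are not built for (see -fno-PIE in CFLAGS above).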
+no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \
+ $(CC) -Werror -no-pie -x c - -o "$$TMP", -no-pie)
+
+# On s390, build the testcases KVM-enabled
+pgste-option = $(call try-run, echo 'int main() { return 0; }' | \
+ $(CC) -Werror -Wl$(comma)--s390-pgste -x c - -o "$$TMP",-Wl$(comma)--s390-pgste)
+
+
+LDFLAGS += -pthread $(no-pie-option) $(pgste-option)
+
+# After inclusion, $(OUTPUT) is defined and
+# $(TEST_GEN_PROGS) starts with $(OUTPUT)/
+include ../lib.mk
+
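+# Build the libkvm sources into $(OUTPUT) and archive them; every test
+# program links against the resulting libkvm.a.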
+STATIC_LIBS := $(OUTPUT)/libkvm.a
+LIBKVM_C := $(filter %.c,$(LIBKVM))
+LIBKVM_S := $(filter %.S,$(LIBKVM))
+LIBKVM_C_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_C))
+LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S))
+EXTRA_CLEAN += $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(STATIC_LIBS) cscope.*
+
+x := $(shell mkdir -p $(sort $(dir $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ))))
+$(LIBKVM_C_OBJ): $(OUTPUT)/%.o: %.c
+ $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
+
+$(LIBKVM_S_OBJ): $(OUTPUT)/%.o: %.S
+ $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
+
+LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ)
+$(OUTPUT)/libkvm.a: $(LIBKVM_OBJS)
+ $(AR) crs $@ $^
+
+x := $(shell mkdir -p $(sort $(dir $(TEST_GEN_PROGS))))
+all: $(STATIC_LIBS)
+$(TEST_GEN_PROGS): $(STATIC_LIBS)
+
+cscope: include_paths = $(LINUX_TOOL_INCLUDE) $(LINUX_HDR_PATH) include lib ..
+cscope:
+ $(RM) cscope.*
+ (find $(include_paths) -name '*.h' \
+ -exec realpath --relative-base=$(PWD) {} \;; \
+ find . -name '*.c' \
+ -exec realpath --relative-base=$(PWD) {} \;) | sort -u > cscope.files
+ cscope -b
diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c b/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c
new file mode 100644
index 000000000..efba76682
--- /dev/null
+++ b/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c
@@ -0,0 +1,3 @@
+// SPDX-License-Identifier: GPL-2.0
+#define REG_LIST_SVE
+#include "get-reg-list.c"
diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c
new file mode 100644
index 000000000..33218a395
--- /dev/null
+++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c
@@ -0,0 +1,841 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Check for KVM_GET_REG_LIST regressions.
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ *
+ * When attempting to migrate from a host with an older kernel to a host
+ * with a newer kernel we allow the newer kernel on the destination to
+ * list new registers with get-reg-list. We assume they'll be unused, at
+ * least until the guest reboots, and so they're relatively harmless.
+ * However, if the destination host with the newer kernel is missing
+ * registers which the source host with the older kernel has, then that's
+ * a regression in get-reg-list. This test checks for that regression by
+ * checking the current list against a blessed list. We should never have
+ * missing registers, but if new ones appear then they can probably be
+ * added to the blessed list. A completely new blessed list can be created
+ * by running the test with the --list command line argument.
+ *
+ * Note, the blessed list should be created from the oldest possible
+ * kernel. We can't go older than v4.15, though, because that's the first
+ * release to expose the ID system registers in KVM_GET_REG_LIST, see
+ * commit 93390c0a1b20 ("arm64: KVM: Hide unsupported AArch64 CPU features
+ * from guests"). Also, one must use the --core-reg-fixup command line
+ * option when running on an older kernel that doesn't include df205b5c6328
+ * ("KVM: arm64: Filter out invalid core register IDs in KVM_GET_REG_LIST")
+ */
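+/*
+ * Illustrative invocations (binary paths depend on the build output dir):
+ *   ./aarch64/get-reg-list --list             # print a fresh blessed list
+ *   ./aarch64/get-reg-list --core-reg-fixup   # when running on older kernels
+ */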
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "kvm_util.h"
+#include "test_util.h"
+#include "processor.h"
+
+#ifdef REG_LIST_SVE
+#define reg_list_sve() (true)
+#else
+#define reg_list_sve() (false)
+#endif
+
+#define REG_MASK (KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_COPROC_MASK)
+
+#define for_each_reg(i) \
+ for ((i) = 0; (i) < reg_list->n; ++(i))
+
+#define for_each_missing_reg(i) \
+ for ((i) = 0; (i) < blessed_n; ++(i)) \
+ if (!find_reg(reg_list->reg, reg_list->n, blessed_reg[i]))
+
+#define for_each_new_reg(i) \
+ for ((i) = 0; (i) < reg_list->n; ++(i)) \
+ if (!find_reg(blessed_reg, blessed_n, reg_list->reg[i]))
+
+
+static struct kvm_reg_list *reg_list;
+
+static __u64 base_regs[], vregs[], sve_regs[], rejects_set[];
+static __u64 base_regs_n, vregs_n, sve_regs_n, rejects_set_n;
+static __u64 *blessed_reg, blessed_n;
+
+static bool find_reg(__u64 regs[], __u64 nr_regs, __u64 reg)
+{
+ int i;
+
+ for (i = 0; i < nr_regs; ++i)
+ if (reg == regs[i])
+ return true;
+ return false;
+}
+
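+/*
+ * Return a heap-allocated copy of @template with the "##" placeholder
+ * replaced by @index, e.g. "regs[##]" with index 3 becomes "regs[3]".
+ */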
+static const char *str_with_index(const char *template, __u64 index)
+{
+ char *str, *p;
+ int n;
+
+ str = strdup(template);
+ p = strstr(str, "##");
+ n = sprintf(p, "%lld", index);
+ strcat(p + n, strstr(template, "##") + 2);
+
+ return (const char *)str;
+}
+
+#define CORE_REGS_XX_NR_WORDS 2
+#define CORE_SPSR_XX_NR_WORDS 2
+#define CORE_FPREGS_XX_NR_WORDS 4
+
+static const char *core_id_to_str(__u64 id)
+{
+ __u64 core_off = id & ~REG_MASK, idx;
+
+ /*
+ * core_off is the offset into struct kvm_regs
+ */
+ switch (core_off) {
+ case KVM_REG_ARM_CORE_REG(regs.regs[0]) ...
+ KVM_REG_ARM_CORE_REG(regs.regs[30]):
+ idx = (core_off - KVM_REG_ARM_CORE_REG(regs.regs[0])) / CORE_REGS_XX_NR_WORDS;
+ TEST_ASSERT(idx < 31, "Unexpected regs.regs index: %lld", idx);
+ return str_with_index("KVM_REG_ARM_CORE_REG(regs.regs[##])", idx);
+ case KVM_REG_ARM_CORE_REG(regs.sp):
+ return "KVM_REG_ARM_CORE_REG(regs.sp)";
+ case KVM_REG_ARM_CORE_REG(regs.pc):
+ return "KVM_REG_ARM_CORE_REG(regs.pc)";
+ case KVM_REG_ARM_CORE_REG(regs.pstate):
+ return "KVM_REG_ARM_CORE_REG(regs.pstate)";
+ case KVM_REG_ARM_CORE_REG(sp_el1):
+ return "KVM_REG_ARM_CORE_REG(sp_el1)";
+ case KVM_REG_ARM_CORE_REG(elr_el1):
+ return "KVM_REG_ARM_CORE_REG(elr_el1)";
+ case KVM_REG_ARM_CORE_REG(spsr[0]) ...
+ KVM_REG_ARM_CORE_REG(spsr[KVM_NR_SPSR - 1]):
+ idx = (core_off - KVM_REG_ARM_CORE_REG(spsr[0])) / CORE_SPSR_XX_NR_WORDS;
+ TEST_ASSERT(idx < KVM_NR_SPSR, "Unexpected spsr index: %lld", idx);
+ return str_with_index("KVM_REG_ARM_CORE_REG(spsr[##])", idx);
+ case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ...
+ KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]):
+ idx = (core_off - KVM_REG_ARM_CORE_REG(fp_regs.vregs[0])) / CORE_FPREGS_XX_NR_WORDS;
+ TEST_ASSERT(idx < 32, "Unexpected fp_regs.vregs index: %lld", idx);
+ return str_with_index("KVM_REG_ARM_CORE_REG(fp_regs.vregs[##])", idx);
+ case KVM_REG_ARM_CORE_REG(fp_regs.fpsr):
+ return "KVM_REG_ARM_CORE_REG(fp_regs.fpsr)";
+ case KVM_REG_ARM_CORE_REG(fp_regs.fpcr):
+ return "KVM_REG_ARM_CORE_REG(fp_regs.fpcr)";
+ }
+
+ TEST_FAIL("Unknown core reg id: 0x%llx", id);
+ return NULL;
+}
+
+static const char *sve_id_to_str(__u64 id)
+{
+ __u64 sve_off, n, i;
+
+ if (id == KVM_REG_ARM64_SVE_VLS)
+ return "KVM_REG_ARM64_SVE_VLS";
+
+ sve_off = id & ~(REG_MASK | ((1ULL << 5) - 1));
+ i = id & (KVM_ARM64_SVE_MAX_SLICES - 1);
+
+ TEST_ASSERT(i == 0, "Currently we don't expect slice > 0, reg id 0x%llx", id);
+
+ switch (sve_off) {
+ case KVM_REG_ARM64_SVE_ZREG_BASE ...
+ KVM_REG_ARM64_SVE_ZREG_BASE + (1ULL << 5) * KVM_ARM64_SVE_NUM_ZREGS - 1:
+ n = (id >> 5) & (KVM_ARM64_SVE_NUM_ZREGS - 1);
+ TEST_ASSERT(id == KVM_REG_ARM64_SVE_ZREG(n, 0),
+ "Unexpected bits set in SVE ZREG id: 0x%llx", id);
+ return str_with_index("KVM_REG_ARM64_SVE_ZREG(##, 0)", n);
+ case KVM_REG_ARM64_SVE_PREG_BASE ...
+ KVM_REG_ARM64_SVE_PREG_BASE + (1ULL << 5) * KVM_ARM64_SVE_NUM_PREGS - 1:
+ n = (id >> 5) & (KVM_ARM64_SVE_NUM_PREGS - 1);
+ TEST_ASSERT(id == KVM_REG_ARM64_SVE_PREG(n, 0),
+ "Unexpected bits set in SVE PREG id: 0x%llx", id);
+ return str_with_index("KVM_REG_ARM64_SVE_PREG(##, 0)", n);
+ case KVM_REG_ARM64_SVE_FFR_BASE:
+ TEST_ASSERT(id == KVM_REG_ARM64_SVE_FFR(0),
+ "Unexpected bits set in SVE FFR id: 0x%llx", id);
+ return "KVM_REG_ARM64_SVE_FFR(0)";
+ }
+
+ return NULL;
+}
+
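+/*
+ * Print a register id as the source-level expression (size, coproc and
+ * index macros) used in the blessed lists at the bottom of this file.
+ */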
+static void print_reg(__u64 id)
+{
+ unsigned op0, op1, crn, crm, op2;
+ const char *reg_size = NULL;
+
+ TEST_ASSERT((id & KVM_REG_ARCH_MASK) == KVM_REG_ARM64,
+ "KVM_REG_ARM64 missing in reg id: 0x%llx", id);
+
+ switch (id & KVM_REG_SIZE_MASK) {
+ case KVM_REG_SIZE_U8:
+ reg_size = "KVM_REG_SIZE_U8";
+ break;
+ case KVM_REG_SIZE_U16:
+ reg_size = "KVM_REG_SIZE_U16";
+ break;
+ case KVM_REG_SIZE_U32:
+ reg_size = "KVM_REG_SIZE_U32";
+ break;
+ case KVM_REG_SIZE_U64:
+ reg_size = "KVM_REG_SIZE_U64";
+ break;
+ case KVM_REG_SIZE_U128:
+ reg_size = "KVM_REG_SIZE_U128";
+ break;
+ case KVM_REG_SIZE_U256:
+ reg_size = "KVM_REG_SIZE_U256";
+ break;
+ case KVM_REG_SIZE_U512:
+ reg_size = "KVM_REG_SIZE_U512";
+ break;
+ case KVM_REG_SIZE_U1024:
+ reg_size = "KVM_REG_SIZE_U1024";
+ break;
+ case KVM_REG_SIZE_U2048:
+ reg_size = "KVM_REG_SIZE_U2048";
+ break;
+ default:
+ TEST_FAIL("Unexpected reg size: 0x%llx in reg id: 0x%llx",
+ (id & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT, id);
+ }
+
+ switch (id & KVM_REG_ARM_COPROC_MASK) {
+ case KVM_REG_ARM_CORE:
+ printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_CORE | %s,\n", reg_size, core_id_to_str(id));
+ break;
+ case KVM_REG_ARM_DEMUX:
+ TEST_ASSERT(!(id & ~(REG_MASK | KVM_REG_ARM_DEMUX_ID_MASK | KVM_REG_ARM_DEMUX_VAL_MASK)),
+ "Unexpected bits set in DEMUX reg id: 0x%llx", id);
+ printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | %lld,\n",
+ reg_size, id & KVM_REG_ARM_DEMUX_VAL_MASK);
+ break;
+ case KVM_REG_ARM64_SYSREG:
+ op0 = (id & KVM_REG_ARM64_SYSREG_OP0_MASK) >> KVM_REG_ARM64_SYSREG_OP0_SHIFT;
+ op1 = (id & KVM_REG_ARM64_SYSREG_OP1_MASK) >> KVM_REG_ARM64_SYSREG_OP1_SHIFT;
+ crn = (id & KVM_REG_ARM64_SYSREG_CRN_MASK) >> KVM_REG_ARM64_SYSREG_CRN_SHIFT;
+ crm = (id & KVM_REG_ARM64_SYSREG_CRM_MASK) >> KVM_REG_ARM64_SYSREG_CRM_SHIFT;
+ op2 = (id & KVM_REG_ARM64_SYSREG_OP2_MASK) >> KVM_REG_ARM64_SYSREG_OP2_SHIFT;
+ TEST_ASSERT(id == ARM64_SYS_REG(op0, op1, crn, crm, op2),
+ "Unexpected bits set in SYSREG reg id: 0x%llx", id);
+ printf("\tARM64_SYS_REG(%d, %d, %d, %d, %d),\n", op0, op1, crn, crm, op2);
+ break;
+ case KVM_REG_ARM_FW:
+ TEST_ASSERT(id == KVM_REG_ARM_FW_REG(id & 0xffff),
+ "Unexpected bits set in FW reg id: 0x%llx", id);
+ printf("\tKVM_REG_ARM_FW_REG(%lld),\n", id & 0xffff);
+ break;
+ case KVM_REG_ARM64_SVE:
+ if (reg_list_sve())
+ printf("\t%s,\n", sve_id_to_str(id));
+ else
+ TEST_FAIL("KVM_REG_ARM64_SVE is an unexpected coproc type in reg id: 0x%llx", id);
+ break;
+ default:
+ TEST_FAIL("Unexpected coproc type: 0x%llx in reg id: 0x%llx",
+ (id & KVM_REG_ARM_COPROC_MASK) >> KVM_REG_ARM_COPROC_SHIFT, id);
+ }
+}
+
+/*
+ * Older kernels listed each 32-bit word of CORE registers separately.
+ * For 64 and 128-bit registers we need to ignore the extra words. We
+ * also need to fixup the sizes, because the older kernels stated all
+ * registers were 64-bit, even when they weren't.
+ */
+static void core_reg_fixup(void)
+{
+ struct kvm_reg_list *tmp;
+ __u64 id, core_off;
+ int i;
+
+ tmp = calloc(1, sizeof(*tmp) + reg_list->n * sizeof(__u64));
+
+ for (i = 0; i < reg_list->n; ++i) {
+ id = reg_list->reg[i];
+
+ if ((id & KVM_REG_ARM_COPROC_MASK) != KVM_REG_ARM_CORE) {
+ tmp->reg[tmp->n++] = id;
+ continue;
+ }
+
+ core_off = id & ~REG_MASK;
+
+ switch (core_off) {
+ case 0x52: case 0xd2: case 0xd6:
+ /*
+ * These offsets are pointing at padding.
+ * We need to ignore them too.
+ */
+ continue;
+ case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ...
+ KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]):
+ if (core_off & 3)
+ continue;
+ id &= ~KVM_REG_SIZE_MASK;
+ id |= KVM_REG_SIZE_U128;
+ tmp->reg[tmp->n++] = id;
+ continue;
+ case KVM_REG_ARM_CORE_REG(fp_regs.fpsr):
+ case KVM_REG_ARM_CORE_REG(fp_regs.fpcr):
+ id &= ~KVM_REG_SIZE_MASK;
+ id |= KVM_REG_SIZE_U32;
+ tmp->reg[tmp->n++] = id;
+ continue;
+ default:
+ if (core_off & 1)
+ continue;
+ tmp->reg[tmp->n++] = id;
+ break;
+ }
+ }
+
+ free(reg_list);
+ reg_list = tmp;
+}
+
+static void prepare_vcpu_init(struct kvm_vcpu_init *init)
+{
+ if (reg_list_sve())
+ init->features[0] |= 1 << KVM_ARM_VCPU_SVE;
+}
+
+static void finalize_vcpu(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ int feature;
+
+ if (reg_list_sve()) {
+ feature = KVM_ARM_VCPU_SVE;
+ vcpu_ioctl(vm, vcpuid, KVM_ARM_VCPU_FINALIZE, &feature);
+ }
+}
+
+static void check_supported(void)
+{
+ if (reg_list_sve() && !kvm_check_cap(KVM_CAP_ARM_SVE)) {
+ fprintf(stderr, "SVE not available, skipping tests\n");
+ exit(KSFT_SKIP);
+ }
+}
+
+int main(int ac, char **av)
+{
+ struct kvm_vcpu_init init = { .target = -1, };
+ int new_regs = 0, missing_regs = 0, i;
+ int failed_get = 0, failed_set = 0, failed_reject = 0;
+ bool print_list = false, fixup_core_regs = false;
+ struct kvm_vm *vm;
+ __u64 *vec_regs;
+
+ check_supported();
+
+ for (i = 1; i < ac; ++i) {
+ if (strcmp(av[i], "--core-reg-fixup") == 0)
+ fixup_core_regs = true;
+ else if (strcmp(av[i], "--list") == 0)
+ print_list = true;
+ else
+ fprintf(stderr, "Ignoring unknown option: %s\n", av[i]);
+ }
+
+ vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR);
+ prepare_vcpu_init(&init);
+ aarch64_vcpu_add_default(vm, 0, &init, NULL);
+ finalize_vcpu(vm, 0);
+
+ reg_list = vcpu_get_reg_list(vm, 0);
+
+ if (fixup_core_regs)
+ core_reg_fixup();
+
+ if (print_list) {
+ putchar('\n');
+ for_each_reg(i)
+ print_reg(reg_list->reg[i]);
+ putchar('\n');
+ return 0;
+ }
+
+ /*
+ * We only test that we can get the register and then write back the
+ * same value. Some registers may allow other values to be written
+ * back, but others only allow some bits to be changed, and at least
+ * for ID registers set will fail if the value does not exactly match
+ * what was returned by get. If registers that allow other values to
+ * be written need to have the other values tested, then we should
+ * create a new set of tests for those in a new independent test
+ * executable.
+ */
+ for_each_reg(i) {
+ uint8_t addr[2048 / 8];
+ struct kvm_one_reg reg = {
+ .id = reg_list->reg[i],
+ .addr = (__u64)&addr,
+ };
+ int ret;
+
+ ret = _vcpu_ioctl(vm, 0, KVM_GET_ONE_REG, &reg);
+ if (ret) {
+ puts("Failed to get ");
+ print_reg(reg.id);
+ putchar('\n');
+ ++failed_get;
+ }
+
+ /* rejects_set registers are rejected after KVM_ARM_VCPU_FINALIZE */
+ if (find_reg(rejects_set, rejects_set_n, reg.id)) {
+ ret = _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, &reg);
+ if (ret != -1 || errno != EPERM) {
+ printf("Failed to reject (ret=%d, errno=%d) ", ret, errno);
+ print_reg(reg.id);
+ putchar('\n');
+ ++failed_reject;
+ }
+ continue;
+ }
+
+ ret = _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, &reg);
+ if (ret) {
+ puts("Failed to set ");
+ print_reg(reg.id);
+ putchar('\n');
+ ++failed_set;
+ }
+ }
+
+ if (reg_list_sve()) {
+ blessed_n = base_regs_n + sve_regs_n;
+ vec_regs = sve_regs;
+ } else {
+ blessed_n = base_regs_n + vregs_n;
+ vec_regs = vregs;
+ }
+
+ blessed_reg = calloc(blessed_n, sizeof(__u64));
+ for (i = 0; i < base_regs_n; ++i)
+ blessed_reg[i] = base_regs[i];
+ for (i = 0; i < blessed_n - base_regs_n; ++i)
+ blessed_reg[base_regs_n + i] = vec_regs[i];
+
+ for_each_new_reg(i)
+ ++new_regs;
+
+ for_each_missing_reg(i)
+ ++missing_regs;
+
+ if (new_regs || missing_regs) {
+ printf("Number blessed registers: %5lld\n", blessed_n);
+ printf("Number registers: %5lld\n", reg_list->n);
+ }
+
+ if (new_regs) {
+ printf("\nThere are %d new registers.\n"
+ "Consider adding them to the blessed reg "
+ "list with the following lines:\n\n", new_regs);
+ for_each_new_reg(i)
+ print_reg(reg_list->reg[i]);
+ putchar('\n');
+ }
+
+ if (missing_regs) {
+ printf("\nThere are %d missing registers.\n"
+ "The following lines are missing registers:\n\n", missing_regs);
+ for_each_missing_reg(i)
+ print_reg(blessed_reg[i]);
+ putchar('\n');
+ }
+
+ TEST_ASSERT(!missing_regs && !failed_get && !failed_set && !failed_reject,
+ "There are %d missing registers; "
+ "%d registers failed get; %d registers failed set; %d registers failed reject",
+ missing_regs, failed_get, failed_set, failed_reject);
+
+ return 0;
+}
+
+/*
+ * The current blessed list was primed with the output of kernel version
+ * v4.15 with --core-reg-fixup and then later updated with new registers.
+ */
+static __u64 base_regs[] = {
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[0]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[1]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[2]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[3]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[4]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[5]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[6]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[7]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[8]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[9]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[10]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[11]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[12]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[13]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[14]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[15]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[16]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[17]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[18]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[19]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[20]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[21]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[22]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[23]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[24]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[25]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[26]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[27]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[28]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[29]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[30]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.sp),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.pc),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.pstate),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(sp_el1),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(elr_el1),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[0]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[1]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[2]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[3]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[4]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.fpsr),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.fpcr),
+ KVM_REG_ARM_FW_REG(0),
+ KVM_REG_ARM_FW_REG(1),
+ KVM_REG_ARM_FW_REG(2),
+ ARM64_SYS_REG(3, 3, 14, 3, 1), /* CNTV_CTL_EL0 */
+ ARM64_SYS_REG(3, 3, 14, 3, 2), /* CNTV_CVAL_EL0 */
+ ARM64_SYS_REG(3, 3, 14, 0, 2),
+ ARM64_SYS_REG(3, 0, 0, 0, 0), /* MIDR_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 0, 6), /* REVIDR_EL1 */
+ ARM64_SYS_REG(3, 1, 0, 0, 1), /* CLIDR_EL1 */
+ ARM64_SYS_REG(3, 1, 0, 0, 7), /* AIDR_EL1 */
+ ARM64_SYS_REG(3, 3, 0, 0, 1), /* CTR_EL0 */
+ ARM64_SYS_REG(2, 0, 0, 0, 4),
+ ARM64_SYS_REG(2, 0, 0, 0, 5),
+ ARM64_SYS_REG(2, 0, 0, 0, 6),
+ ARM64_SYS_REG(2, 0, 0, 0, 7),
+ ARM64_SYS_REG(2, 0, 0, 1, 4),
+ ARM64_SYS_REG(2, 0, 0, 1, 5),
+ ARM64_SYS_REG(2, 0, 0, 1, 6),
+ ARM64_SYS_REG(2, 0, 0, 1, 7),
+ ARM64_SYS_REG(2, 0, 0, 2, 0), /* MDCCINT_EL1 */
+ ARM64_SYS_REG(2, 0, 0, 2, 2), /* MDSCR_EL1 */
+ ARM64_SYS_REG(2, 0, 0, 2, 4),
+ ARM64_SYS_REG(2, 0, 0, 2, 5),
+ ARM64_SYS_REG(2, 0, 0, 2, 6),
+ ARM64_SYS_REG(2, 0, 0, 2, 7),
+ ARM64_SYS_REG(2, 0, 0, 3, 4),
+ ARM64_SYS_REG(2, 0, 0, 3, 5),
+ ARM64_SYS_REG(2, 0, 0, 3, 6),
+ ARM64_SYS_REG(2, 0, 0, 3, 7),
+ ARM64_SYS_REG(2, 0, 0, 4, 4),
+ ARM64_SYS_REG(2, 0, 0, 4, 5),
+ ARM64_SYS_REG(2, 0, 0, 4, 6),
+ ARM64_SYS_REG(2, 0, 0, 4, 7),
+ ARM64_SYS_REG(2, 0, 0, 5, 4),
+ ARM64_SYS_REG(2, 0, 0, 5, 5),
+ ARM64_SYS_REG(2, 0, 0, 5, 6),
+ ARM64_SYS_REG(2, 0, 0, 5, 7),
+ ARM64_SYS_REG(2, 0, 0, 6, 4),
+ ARM64_SYS_REG(2, 0, 0, 6, 5),
+ ARM64_SYS_REG(2, 0, 0, 6, 6),
+ ARM64_SYS_REG(2, 0, 0, 6, 7),
+ ARM64_SYS_REG(2, 0, 0, 7, 4),
+ ARM64_SYS_REG(2, 0, 0, 7, 5),
+ ARM64_SYS_REG(2, 0, 0, 7, 6),
+ ARM64_SYS_REG(2, 0, 0, 7, 7),
+ ARM64_SYS_REG(2, 0, 0, 8, 4),
+ ARM64_SYS_REG(2, 0, 0, 8, 5),
+ ARM64_SYS_REG(2, 0, 0, 8, 6),
+ ARM64_SYS_REG(2, 0, 0, 8, 7),
+ ARM64_SYS_REG(2, 0, 0, 9, 4),
+ ARM64_SYS_REG(2, 0, 0, 9, 5),
+ ARM64_SYS_REG(2, 0, 0, 9, 6),
+ ARM64_SYS_REG(2, 0, 0, 9, 7),
+ ARM64_SYS_REG(2, 0, 0, 10, 4),
+ ARM64_SYS_REG(2, 0, 0, 10, 5),
+ ARM64_SYS_REG(2, 0, 0, 10, 6),
+ ARM64_SYS_REG(2, 0, 0, 10, 7),
+ ARM64_SYS_REG(2, 0, 0, 11, 4),
+ ARM64_SYS_REG(2, 0, 0, 11, 5),
+ ARM64_SYS_REG(2, 0, 0, 11, 6),
+ ARM64_SYS_REG(2, 0, 0, 11, 7),
+ ARM64_SYS_REG(2, 0, 0, 12, 4),
+ ARM64_SYS_REG(2, 0, 0, 12, 5),
+ ARM64_SYS_REG(2, 0, 0, 12, 6),
+ ARM64_SYS_REG(2, 0, 0, 12, 7),
+ ARM64_SYS_REG(2, 0, 0, 13, 4),
+ ARM64_SYS_REG(2, 0, 0, 13, 5),
+ ARM64_SYS_REG(2, 0, 0, 13, 6),
+ ARM64_SYS_REG(2, 0, 0, 13, 7),
+ ARM64_SYS_REG(2, 0, 0, 14, 4),
+ ARM64_SYS_REG(2, 0, 0, 14, 5),
+ ARM64_SYS_REG(2, 0, 0, 14, 6),
+ ARM64_SYS_REG(2, 0, 0, 14, 7),
+ ARM64_SYS_REG(2, 0, 0, 15, 4),
+ ARM64_SYS_REG(2, 0, 0, 15, 5),
+ ARM64_SYS_REG(2, 0, 0, 15, 6),
+ ARM64_SYS_REG(2, 0, 0, 15, 7),
+ ARM64_SYS_REG(2, 4, 0, 7, 0), /* DBGVCR32_EL2 */
+ ARM64_SYS_REG(3, 0, 0, 0, 5), /* MPIDR_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 1, 0), /* ID_PFR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 1, 1), /* ID_PFR1_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 1, 2), /* ID_DFR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 1, 3), /* ID_AFR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 1, 4), /* ID_MMFR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 1, 5), /* ID_MMFR1_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 1, 6), /* ID_MMFR2_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 1, 7), /* ID_MMFR3_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 2, 0), /* ID_ISAR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 2, 1), /* ID_ISAR1_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 2, 2), /* ID_ISAR2_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 2, 3), /* ID_ISAR3_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 2, 4), /* ID_ISAR4_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 2, 5), /* ID_ISAR5_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 2, 6), /* ID_MMFR4_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 2, 7), /* ID_ISAR6_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 3, 0), /* MVFR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 3, 1), /* MVFR1_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 3, 2), /* MVFR2_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 3, 3),
+ ARM64_SYS_REG(3, 0, 0, 3, 4), /* ID_PFR2_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 3, 5), /* ID_DFR1_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 3, 6), /* ID_MMFR5_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 3, 7),
+ ARM64_SYS_REG(3, 0, 0, 4, 0), /* ID_AA64PFR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 4, 1), /* ID_AA64PFR1_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 4, 2),
+ ARM64_SYS_REG(3, 0, 0, 4, 3),
+ ARM64_SYS_REG(3, 0, 0, 4, 4), /* ID_AA64ZFR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 4, 5),
+ ARM64_SYS_REG(3, 0, 0, 4, 6),
+ ARM64_SYS_REG(3, 0, 0, 4, 7),
+ ARM64_SYS_REG(3, 0, 0, 5, 0), /* ID_AA64DFR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 5, 1), /* ID_AA64DFR1_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 5, 2),
+ ARM64_SYS_REG(3, 0, 0, 5, 3),
+ ARM64_SYS_REG(3, 0, 0, 5, 4), /* ID_AA64AFR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 5, 5), /* ID_AA64AFR1_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 5, 6),
+ ARM64_SYS_REG(3, 0, 0, 5, 7),
+ ARM64_SYS_REG(3, 0, 0, 6, 0), /* ID_AA64ISAR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 6, 1), /* ID_AA64ISAR1_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 6, 2),
+ ARM64_SYS_REG(3, 0, 0, 6, 3),
+ ARM64_SYS_REG(3, 0, 0, 6, 4),
+ ARM64_SYS_REG(3, 0, 0, 6, 5),
+ ARM64_SYS_REG(3, 0, 0, 6, 6),
+ ARM64_SYS_REG(3, 0, 0, 6, 7),
+ ARM64_SYS_REG(3, 0, 0, 7, 0), /* ID_AA64MMFR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 7, 1), /* ID_AA64MMFR1_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 7, 2), /* ID_AA64MMFR2_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 7, 3),
+ ARM64_SYS_REG(3, 0, 0, 7, 4),
+ ARM64_SYS_REG(3, 0, 0, 7, 5),
+ ARM64_SYS_REG(3, 0, 0, 7, 6),
+ ARM64_SYS_REG(3, 0, 0, 7, 7),
+ ARM64_SYS_REG(3, 0, 1, 0, 0), /* SCTLR_EL1 */
+ ARM64_SYS_REG(3, 0, 1, 0, 1), /* ACTLR_EL1 */
+ ARM64_SYS_REG(3, 0, 1, 0, 2), /* CPACR_EL1 */
+ ARM64_SYS_REG(3, 0, 2, 0, 0), /* TTBR0_EL1 */
+ ARM64_SYS_REG(3, 0, 2, 0, 1), /* TTBR1_EL1 */
+ ARM64_SYS_REG(3, 0, 2, 0, 2), /* TCR_EL1 */
+ ARM64_SYS_REG(3, 0, 5, 1, 0), /* AFSR0_EL1 */
+ ARM64_SYS_REG(3, 0, 5, 1, 1), /* AFSR1_EL1 */
+ ARM64_SYS_REG(3, 0, 5, 2, 0), /* ESR_EL1 */
+ ARM64_SYS_REG(3, 0, 6, 0, 0), /* FAR_EL1 */
+ ARM64_SYS_REG(3, 0, 7, 4, 0), /* PAR_EL1 */
+ ARM64_SYS_REG(3, 0, 9, 14, 1), /* PMINTENSET_EL1 */
+ ARM64_SYS_REG(3, 0, 9, 14, 2), /* PMINTENCLR_EL1 */
+ ARM64_SYS_REG(3, 0, 10, 2, 0), /* MAIR_EL1 */
+ ARM64_SYS_REG(3, 0, 10, 3, 0), /* AMAIR_EL1 */
+ ARM64_SYS_REG(3, 0, 12, 0, 0), /* VBAR_EL1 */
+ ARM64_SYS_REG(3, 0, 12, 1, 1), /* DISR_EL1 */
+ ARM64_SYS_REG(3, 0, 13, 0, 1), /* CONTEXTIDR_EL1 */
+ ARM64_SYS_REG(3, 0, 13, 0, 4), /* TPIDR_EL1 */
+ ARM64_SYS_REG(3, 0, 14, 1, 0), /* CNTKCTL_EL1 */
+ ARM64_SYS_REG(3, 2, 0, 0, 0), /* CSSELR_EL1 */
+ ARM64_SYS_REG(3, 3, 9, 12, 0), /* PMCR_EL0 */
+ ARM64_SYS_REG(3, 3, 9, 12, 1), /* PMCNTENSET_EL0 */
+ ARM64_SYS_REG(3, 3, 9, 12, 2), /* PMCNTENCLR_EL0 */
+ ARM64_SYS_REG(3, 3, 9, 12, 3), /* PMOVSCLR_EL0 */
+ ARM64_SYS_REG(3, 3, 9, 12, 4), /* PMSWINC_EL0 */
+ ARM64_SYS_REG(3, 3, 9, 12, 5), /* PMSELR_EL0 */
+ ARM64_SYS_REG(3, 3, 9, 13, 0), /* PMCCNTR_EL0 */
+ ARM64_SYS_REG(3, 3, 9, 14, 0), /* PMUSERENR_EL0 */
+ ARM64_SYS_REG(3, 3, 9, 14, 3), /* PMOVSSET_EL0 */
+ ARM64_SYS_REG(3, 3, 13, 0, 2), /* TPIDR_EL0 */
+ ARM64_SYS_REG(3, 3, 13, 0, 3), /* TPIDRRO_EL0 */
+ ARM64_SYS_REG(3, 3, 14, 8, 0),
+ ARM64_SYS_REG(3, 3, 14, 8, 1),
+ ARM64_SYS_REG(3, 3, 14, 8, 2),
+ ARM64_SYS_REG(3, 3, 14, 8, 3),
+ ARM64_SYS_REG(3, 3, 14, 8, 4),
+ ARM64_SYS_REG(3, 3, 14, 8, 5),
+ ARM64_SYS_REG(3, 3, 14, 8, 6),
+ ARM64_SYS_REG(3, 3, 14, 8, 7),
+ ARM64_SYS_REG(3, 3, 14, 9, 0),
+ ARM64_SYS_REG(3, 3, 14, 9, 1),
+ ARM64_SYS_REG(3, 3, 14, 9, 2),
+ ARM64_SYS_REG(3, 3, 14, 9, 3),
+ ARM64_SYS_REG(3, 3, 14, 9, 4),
+ ARM64_SYS_REG(3, 3, 14, 9, 5),
+ ARM64_SYS_REG(3, 3, 14, 9, 6),
+ ARM64_SYS_REG(3, 3, 14, 9, 7),
+ ARM64_SYS_REG(3, 3, 14, 10, 0),
+ ARM64_SYS_REG(3, 3, 14, 10, 1),
+ ARM64_SYS_REG(3, 3, 14, 10, 2),
+ ARM64_SYS_REG(3, 3, 14, 10, 3),
+ ARM64_SYS_REG(3, 3, 14, 10, 4),
+ ARM64_SYS_REG(3, 3, 14, 10, 5),
+ ARM64_SYS_REG(3, 3, 14, 10, 6),
+ ARM64_SYS_REG(3, 3, 14, 10, 7),
+ ARM64_SYS_REG(3, 3, 14, 11, 0),
+ ARM64_SYS_REG(3, 3, 14, 11, 1),
+ ARM64_SYS_REG(3, 3, 14, 11, 2),
+ ARM64_SYS_REG(3, 3, 14, 11, 3),
+ ARM64_SYS_REG(3, 3, 14, 11, 4),
+ ARM64_SYS_REG(3, 3, 14, 11, 5),
+ ARM64_SYS_REG(3, 3, 14, 11, 6),
+ ARM64_SYS_REG(3, 3, 14, 12, 0),
+ ARM64_SYS_REG(3, 3, 14, 12, 1),
+ ARM64_SYS_REG(3, 3, 14, 12, 2),
+ ARM64_SYS_REG(3, 3, 14, 12, 3),
+ ARM64_SYS_REG(3, 3, 14, 12, 4),
+ ARM64_SYS_REG(3, 3, 14, 12, 5),
+ ARM64_SYS_REG(3, 3, 14, 12, 6),
+ ARM64_SYS_REG(3, 3, 14, 12, 7),
+ ARM64_SYS_REG(3, 3, 14, 13, 0),
+ ARM64_SYS_REG(3, 3, 14, 13, 1),
+ ARM64_SYS_REG(3, 3, 14, 13, 2),
+ ARM64_SYS_REG(3, 3, 14, 13, 3),
+ ARM64_SYS_REG(3, 3, 14, 13, 4),
+ ARM64_SYS_REG(3, 3, 14, 13, 5),
+ ARM64_SYS_REG(3, 3, 14, 13, 6),
+ ARM64_SYS_REG(3, 3, 14, 13, 7),
+ ARM64_SYS_REG(3, 3, 14, 14, 0),
+ ARM64_SYS_REG(3, 3, 14, 14, 1),
+ ARM64_SYS_REG(3, 3, 14, 14, 2),
+ ARM64_SYS_REG(3, 3, 14, 14, 3),
+ ARM64_SYS_REG(3, 3, 14, 14, 4),
+ ARM64_SYS_REG(3, 3, 14, 14, 5),
+ ARM64_SYS_REG(3, 3, 14, 14, 6),
+ ARM64_SYS_REG(3, 3, 14, 14, 7),
+ ARM64_SYS_REG(3, 3, 14, 15, 0),
+ ARM64_SYS_REG(3, 3, 14, 15, 1),
+ ARM64_SYS_REG(3, 3, 14, 15, 2),
+ ARM64_SYS_REG(3, 3, 14, 15, 3),
+ ARM64_SYS_REG(3, 3, 14, 15, 4),
+ ARM64_SYS_REG(3, 3, 14, 15, 5),
+ ARM64_SYS_REG(3, 3, 14, 15, 6),
+ ARM64_SYS_REG(3, 3, 14, 15, 7), /* PMCCFILTR_EL0 */
+ ARM64_SYS_REG(3, 4, 3, 0, 0), /* DACR32_EL2 */
+ ARM64_SYS_REG(3, 4, 5, 0, 1), /* IFSR32_EL2 */
+ ARM64_SYS_REG(3, 4, 5, 3, 0), /* FPEXC32_EL2 */
+ KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | 0,
+ KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | 1,
+ KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | 2,
+};
+static __u64 base_regs_n = ARRAY_SIZE(base_regs);
+
+static __u64 vregs[] = {
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[1]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[2]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[3]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[4]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[5]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[6]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[7]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[8]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[9]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[10]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[11]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[12]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[13]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[14]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[15]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[16]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[17]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[18]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[19]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[20]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[21]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[22]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[23]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[24]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[25]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[26]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[27]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[28]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[29]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[30]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]),
+};
+static __u64 vregs_n = ARRAY_SIZE(vregs);
+
+static __u64 sve_regs[] = {
+ KVM_REG_ARM64_SVE_VLS,
+ KVM_REG_ARM64_SVE_ZREG(0, 0),
+ KVM_REG_ARM64_SVE_ZREG(1, 0),
+ KVM_REG_ARM64_SVE_ZREG(2, 0),
+ KVM_REG_ARM64_SVE_ZREG(3, 0),
+ KVM_REG_ARM64_SVE_ZREG(4, 0),
+ KVM_REG_ARM64_SVE_ZREG(5, 0),
+ KVM_REG_ARM64_SVE_ZREG(6, 0),
+ KVM_REG_ARM64_SVE_ZREG(7, 0),
+ KVM_REG_ARM64_SVE_ZREG(8, 0),
+ KVM_REG_ARM64_SVE_ZREG(9, 0),
+ KVM_REG_ARM64_SVE_ZREG(10, 0),
+ KVM_REG_ARM64_SVE_ZREG(11, 0),
+ KVM_REG_ARM64_SVE_ZREG(12, 0),
+ KVM_REG_ARM64_SVE_ZREG(13, 0),
+ KVM_REG_ARM64_SVE_ZREG(14, 0),
+ KVM_REG_ARM64_SVE_ZREG(15, 0),
+ KVM_REG_ARM64_SVE_ZREG(16, 0),
+ KVM_REG_ARM64_SVE_ZREG(17, 0),
+ KVM_REG_ARM64_SVE_ZREG(18, 0),
+ KVM_REG_ARM64_SVE_ZREG(19, 0),
+ KVM_REG_ARM64_SVE_ZREG(20, 0),
+ KVM_REG_ARM64_SVE_ZREG(21, 0),
+ KVM_REG_ARM64_SVE_ZREG(22, 0),
+ KVM_REG_ARM64_SVE_ZREG(23, 0),
+ KVM_REG_ARM64_SVE_ZREG(24, 0),
+ KVM_REG_ARM64_SVE_ZREG(25, 0),
+ KVM_REG_ARM64_SVE_ZREG(26, 0),
+ KVM_REG_ARM64_SVE_ZREG(27, 0),
+ KVM_REG_ARM64_SVE_ZREG(28, 0),
+ KVM_REG_ARM64_SVE_ZREG(29, 0),
+ KVM_REG_ARM64_SVE_ZREG(30, 0),
+ KVM_REG_ARM64_SVE_ZREG(31, 0),
+ KVM_REG_ARM64_SVE_PREG(0, 0),
+ KVM_REG_ARM64_SVE_PREG(1, 0),
+ KVM_REG_ARM64_SVE_PREG(2, 0),
+ KVM_REG_ARM64_SVE_PREG(3, 0),
+ KVM_REG_ARM64_SVE_PREG(4, 0),
+ KVM_REG_ARM64_SVE_PREG(5, 0),
+ KVM_REG_ARM64_SVE_PREG(6, 0),
+ KVM_REG_ARM64_SVE_PREG(7, 0),
+ KVM_REG_ARM64_SVE_PREG(8, 0),
+ KVM_REG_ARM64_SVE_PREG(9, 0),
+ KVM_REG_ARM64_SVE_PREG(10, 0),
+ KVM_REG_ARM64_SVE_PREG(11, 0),
+ KVM_REG_ARM64_SVE_PREG(12, 0),
+ KVM_REG_ARM64_SVE_PREG(13, 0),
+ KVM_REG_ARM64_SVE_PREG(14, 0),
+ KVM_REG_ARM64_SVE_PREG(15, 0),
+ KVM_REG_ARM64_SVE_FFR(0),
+ ARM64_SYS_REG(3, 0, 1, 2, 0), /* ZCR_EL1 */
+};
+static __u64 sve_regs_n = ARRAY_SIZE(sve_regs);
+
+static __u64 rejects_set[] = {
+#ifdef REG_LIST_SVE
+ KVM_REG_ARM64_SVE_VLS,
+#endif
+};
+static __u64 rejects_set_n = ARRAY_SIZE(rejects_set);
diff --git a/tools/testing/selftests/kvm/config b/tools/testing/selftests/kvm/config
new file mode 100644
index 000000000..63ed533f7
--- /dev/null
+++ b/tools/testing/selftests/kvm/config
@@ -0,0 +1,3 @@
+CONFIG_KVM=y
+CONFIG_KVM_INTEL=y
+CONFIG_KVM_AMD=y
diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c
new file mode 100644
index 000000000..3d96a7bfa
--- /dev/null
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@ -0,0 +1,498 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM demand paging test
+ * Adapted from dirty_log_test.c
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ * Copyright (C) 2019, Google, Inc.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <asm/unistd.h>
+#include <time.h>
+#include <poll.h>
+#include <pthread.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/userfaultfd.h>
+
+#include "perf_test_util.h"
+#include "processor.h"
+#include "test_util.h"
+
+#ifdef __NR_userfaultfd
+
+#ifdef PRINT_PER_PAGE_UPDATES
+#define PER_PAGE_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define PER_PAGE_DEBUG(...) _no_printf(__VA_ARGS__)
+#endif
+
+#ifdef PRINT_PER_VCPU_UPDATES
+#define PER_VCPU_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define PER_VCPU_DEBUG(...) _no_printf(__VA_ARGS__)
+#endif
+
+static char *guest_data_prototype;
+
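+/*
+ * vCPU worker thread: run the guest once so it touches all of its test
+ * memory, then report how long the demand-paged run took.
+ */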
+static void *vcpu_worker(void *data)
+{
+ int ret;
+ struct vcpu_args *vcpu_args = (struct vcpu_args *)data;
+ int vcpu_id = vcpu_args->vcpu_id;
+ struct kvm_vm *vm = perf_test_args.vm;
+ struct kvm_run *run;
+ struct timespec start;
+ struct timespec ts_diff;
+
+ vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
+ run = vcpu_state(vm, vcpu_id);
+
+ clock_gettime(CLOCK_MONOTONIC, &start);
+
+ /* Let the guest access its memory */
+ ret = _vcpu_run(vm, vcpu_id);
+ TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
+ if (get_ucall(vm, vcpu_id, NULL) != UCALL_SYNC) {
+ TEST_ASSERT(false,
+ "Invalid guest sync status: exit_reason=%s\n",
+ exit_reason_str(run->exit_reason));
+ }
+
+ ts_diff = timespec_diff_now(start);
+ PER_VCPU_DEBUG("vCPU %d execution time: %ld.%.9lds\n", vcpu_id,
+ ts_diff.tv_sec, ts_diff.tv_nsec);
+
+ return NULL;
+}
+
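+/*
+ * Resolve one userfaultfd page fault by copying the prototype data
+ * pattern into the faulting host page with UFFDIO_COPY.
+ */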
+static int handle_uffd_page_request(int uffd, uint64_t addr)
+{
+ pid_t tid;
+ struct timespec start;
+ struct timespec ts_diff;
+ struct uffdio_copy copy;
+ int r;
+
+ tid = syscall(__NR_gettid);
+
+ copy.src = (uint64_t)guest_data_prototype;
+ copy.dst = addr;
+ copy.len = perf_test_args.host_page_size;
+ copy.mode = 0;
+
+ clock_gettime(CLOCK_MONOTONIC, &start);
+
+ r = ioctl(uffd, UFFDIO_COPY, &copy);
+ if (r == -1) {
+ pr_info("Failed Paged in 0x%lx from thread %d with errno: %d\n",
+ addr, tid, errno);
+ return r;
+ }
+
+ ts_diff = timespec_diff_now(start);
+
+ PER_PAGE_DEBUG("UFFDIO_COPY %d \t%ld ns\n", tid,
+ timespec_to_ns(ts_diff));
+ PER_PAGE_DEBUG("Paged in %ld bytes at 0x%lx from thread %d\n",
+ perf_test_args.host_page_size, addr, tid);
+
+ return 0;
+}
+
+bool quit_uffd_thread;
+
+struct uffd_handler_args {
+ int uffd;
+ int pipefd;
+ useconds_t delay;
+};
+
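+/*
+ * Handler thread: poll the userfaultfd and the quit pipe, servicing
+ * MISSING faults (optionally after an artificial delay) until told to exit.
+ */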
+static void *uffd_handler_thread_fn(void *arg)
+{
+ struct uffd_handler_args *uffd_args = (struct uffd_handler_args *)arg;
+ int uffd = uffd_args->uffd;
+ int pipefd = uffd_args->pipefd;
+ useconds_t delay = uffd_args->delay;
+ int64_t pages = 0;
+ struct timespec start;
+ struct timespec ts_diff;
+
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ while (!quit_uffd_thread) {
+ struct uffd_msg msg;
+ struct pollfd pollfd[2];
+ char tmp_chr;
+ int r;
+ uint64_t addr;
+
+ pollfd[0].fd = uffd;
+ pollfd[0].events = POLLIN;
+ pollfd[1].fd = pipefd;
+ pollfd[1].events = POLLIN;
+
+ r = poll(pollfd, 2, -1);
+ switch (r) {
+ case -1:
+ pr_info("poll err");
+ continue;
+ case 0:
+ continue;
+ case 1:
+ break;
+ default:
+ pr_info("Polling uffd returned %d", r);
+ return NULL;
+ }
+
+ if (pollfd[0].revents & POLLERR) {
+ pr_info("uffd revents has POLLERR");
+ return NULL;
+ }
+
+ if (pollfd[1].revents & POLLIN) {
+ r = read(pollfd[1].fd, &tmp_chr, 1);
+ TEST_ASSERT(r == 1,
+ "Error reading pipefd in UFFD thread\n");
+ return NULL;
+ }
+
+ if (!(pollfd[0].revents & POLLIN))
+ continue;
+
+ r = read(uffd, &msg, sizeof(msg));
+ if (r == -1) {
+ if (errno == EAGAIN)
+ continue;
+ pr_info("Read of uffd gor errno %d", errno);
+ return NULL;
+ }
+
+ if (r != sizeof(msg)) {
+ pr_info("Read on uffd returned unexpected size: %d bytes", r);
+ return NULL;
+ }
+
+ if (!(msg.event & UFFD_EVENT_PAGEFAULT))
+ continue;
+
+ if (delay)
+ usleep(delay);
+ addr = msg.arg.pagefault.address;
+ r = handle_uffd_page_request(uffd, addr);
+ if (r < 0)
+ return NULL;
+ pages++;
+ }
+
+ ts_diff = timespec_diff_now(start);
+ PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n",
+ pages, ts_diff.tv_sec, ts_diff.tv_nsec,
+ pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 1000000000.0));
+
+ return NULL;
+}
+
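+/*
+ * Create a userfaultfd, register the given HVA range for MISSING faults
+ * and start a handler thread for it.
+ */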
+static int setup_demand_paging(struct kvm_vm *vm,
+ pthread_t *uffd_handler_thread, int pipefd,
+ useconds_t uffd_delay,
+ struct uffd_handler_args *uffd_args,
+ void *hva, uint64_t len)
+{
+ int uffd;
+ struct uffdio_api uffdio_api;
+ struct uffdio_register uffdio_register;
+
+ uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+ if (uffd == -1) {
+ pr_info("uffd creation failed\n");
+ return -1;
+ }
+
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = 0;
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) {
+ pr_info("ioctl uffdio_api failed\n");
+ return -1;
+ }
+
+ uffdio_register.range.start = (uint64_t)hva;
+ uffdio_register.range.len = len;
+ uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) {
+ pr_info("ioctl uffdio_register failed\n");
+ return -1;
+ }
+
+ if ((uffdio_register.ioctls & UFFD_API_RANGE_IOCTLS) !=
+ UFFD_API_RANGE_IOCTLS) {
+ pr_info("unexpected userfaultfd ioctl set\n");
+ return -1;
+ }
+
+ uffd_args->uffd = uffd;
+ uffd_args->pipefd = pipefd;
+ uffd_args->delay = uffd_delay;
+ pthread_create(uffd_handler_thread, NULL, uffd_handler_thread_fn,
+ uffd_args);
+
+ PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n",
+ hva, hva + len);
+
+ return 0;
+}
+
+static void run_test(enum vm_guest_mode mode, bool use_uffd,
+ useconds_t uffd_delay)
+{
+ pthread_t *vcpu_threads;
+ pthread_t *uffd_handler_threads = NULL;
+ struct uffd_handler_args *uffd_args = NULL;
+ struct timespec start;
+ struct timespec ts_diff;
+ int *pipefds = NULL;
+ struct kvm_vm *vm;
+ int vcpu_id;
+ int r;
+
+ vm = create_vm(mode, nr_vcpus, guest_percpu_mem_size);
+
+ perf_test_args.wr_fract = 1;
+
+ guest_data_prototype = malloc(perf_test_args.host_page_size);
+ TEST_ASSERT(guest_data_prototype,
+ "Failed to allocate buffer for guest data pattern");
+ memset(guest_data_prototype, 0xAB, perf_test_args.host_page_size);
+
+ vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads));
+ TEST_ASSERT(vcpu_threads, "Memory allocation failed");
+
+ add_vcpus(vm, nr_vcpus, guest_percpu_mem_size);
+
+ if (use_uffd) {
+ uffd_handler_threads =
+ malloc(nr_vcpus * sizeof(*uffd_handler_threads));
+ TEST_ASSERT(uffd_handler_threads, "Memory allocation failed");
+
+ uffd_args = malloc(nr_vcpus * sizeof(*uffd_args));
+ TEST_ASSERT(uffd_args, "Memory allocation failed");
+
+ pipefds = malloc(sizeof(int) * nr_vcpus * 2);
+ TEST_ASSERT(pipefds, "Unable to allocate memory for pipefd");
+
+ for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+ vm_paddr_t vcpu_gpa;
+ void *vcpu_hva;
+
+ vcpu_gpa = guest_test_phys_mem + (vcpu_id * guest_percpu_mem_size);
+ PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n",
+ vcpu_id, vcpu_gpa, vcpu_gpa + guest_percpu_mem_size);
+
+ /* Cache the HVA pointer of the region */
+ vcpu_hva = addr_gpa2hva(vm, vcpu_gpa);
+
+ /*
+ * Set up user fault fd to handle demand paging
+ * requests.
+ */
+ r = pipe2(&pipefds[vcpu_id * 2],
+ O_CLOEXEC | O_NONBLOCK);
+ TEST_ASSERT(!r, "Failed to set up pipefd");
+
+ r = setup_demand_paging(vm,
+ &uffd_handler_threads[vcpu_id],
+ pipefds[vcpu_id * 2],
+ uffd_delay, &uffd_args[vcpu_id],
+ vcpu_hva, guest_percpu_mem_size);
+ if (r < 0)
+ exit(-r);
+ }
+ }
+
+ /* Export the shared variables to the guest */
+ sync_global_to_guest(vm, perf_test_args);
+
+ pr_info("Finished creating vCPUs and starting uffd threads\n");
+
+ clock_gettime(CLOCK_MONOTONIC, &start);
+
+ for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+ pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker,
+ &perf_test_args.vcpu_args[vcpu_id]);
+ }
+
+ pr_info("Started all vCPUs\n");
+
+ /* Wait for the vcpu threads to quit */
+ for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+ pthread_join(vcpu_threads[vcpu_id], NULL);
+ PER_VCPU_DEBUG("Joined thread for vCPU %d\n", vcpu_id);
+ }
+
+ ts_diff = timespec_diff_now(start);
+
+ pr_info("All vCPU threads joined\n");
+
+ if (use_uffd) {
+ char c;
+
+ /* Tell the user fault fd handler threads to quit */
+ for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+ r = write(pipefds[vcpu_id * 2 + 1], &c, 1);
+ TEST_ASSERT(r == 1, "Unable to write to pipefd");
+
+ pthread_join(uffd_handler_threads[vcpu_id], NULL);
+ }
+ }
+
+ pr_info("Total guest execution time: %ld.%.9lds\n",
+ ts_diff.tv_sec, ts_diff.tv_nsec);
+ pr_info("Overall demand paging rate: %f pgs/sec\n",
+ perf_test_args.vcpu_args[0].pages * nr_vcpus /
+ ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 1000000000.0));
+
+ ucall_uninit(vm);
+ kvm_vm_free(vm);
+
+ free(guest_data_prototype);
+ free(vcpu_threads);
+ if (use_uffd) {
+ free(uffd_handler_threads);
+ free(uffd_args);
+ free(pipefds);
+ }
+}
+
+struct guest_mode {
+ bool supported;
+ bool enabled;
+};
+static struct guest_mode guest_modes[NUM_VM_MODES];
+
+#define guest_mode_init(mode, supported, enabled) ({ \
+ guest_modes[mode] = (struct guest_mode){ supported, enabled }; \
+})
+
+static void help(char *name)
+{
+ int i;
+
+ puts("");
+ printf("usage: %s [-h] [-m mode] [-u] [-d uffd_delay_usec]\n"
+ " [-b memory] [-v vcpus]\n", name);
+ printf(" -m: specify the guest mode ID to test\n"
+ " (default: test all supported modes)\n"
+ " This option may be used multiple times.\n"
+ " Guest mode IDs:\n");
+ for (i = 0; i < NUM_VM_MODES; ++i) {
+ printf(" %d: %s%s\n", i, vm_guest_mode_string(i),
+ guest_modes[i].supported ? " (supported)" : "");
+ }
+ printf(" -u: use User Fault FD to handle vCPU page\n"
+ " faults.\n");
+ printf(" -d: add a delay in usec to the User Fault\n"
+ " FD handler to simulate demand paging\n"
+ " overheads. Ignored without -u.\n");
+ printf(" -b: specify the size of the memory region which should be\n"
+ " demand paged by each vCPU. e.g. 10M or 3G.\n"
+ " Default: 1G\n");
+ printf(" -v: specify the number of vCPUs to run.\n");
+ puts("");
+ exit(0);
+}
+
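+/*
+ * Illustrative invocation: 4 vCPUs, 512M of demand-paged memory each,
+ * userfaultfd handling with a 10us injected delay per fault:
+ *   ./demand_paging_test -u -d 10 -b 512M -v 4
+ */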
+int main(int argc, char *argv[])
+{
+ int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
+ bool mode_selected = false;
+ unsigned int mode;
+ int opt, i;
+ bool use_uffd = false;
+ useconds_t uffd_delay = 0;
+
+#ifdef __x86_64__
+ guest_mode_init(VM_MODE_PXXV48_4K, true, true);
+#endif
+#ifdef __aarch64__
+ guest_mode_init(VM_MODE_P40V48_4K, true, true);
+ guest_mode_init(VM_MODE_P40V48_64K, true, true);
+ {
+ unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE);
+
+ if (limit >= 52)
+ guest_mode_init(VM_MODE_P52V48_64K, true, true);
+ if (limit >= 48) {
+ guest_mode_init(VM_MODE_P48V48_4K, true, true);
+ guest_mode_init(VM_MODE_P48V48_64K, true, true);
+ }
+ }
+#endif
+#ifdef __s390x__
+ guest_mode_init(VM_MODE_P40V48_4K, true, true);
+#endif
+
+ while ((opt = getopt(argc, argv, "hm:ud:b:v:")) != -1) {
+ switch (opt) {
+ case 'm':
+ if (!mode_selected) {
+ for (i = 0; i < NUM_VM_MODES; ++i)
+ guest_modes[i].enabled = false;
+ mode_selected = true;
+ }
+ mode = strtoul(optarg, NULL, 10);
+ TEST_ASSERT(mode < NUM_VM_MODES,
+ "Guest mode ID %d too big", mode);
+ guest_modes[mode].enabled = true;
+ break;
+ case 'u':
+ use_uffd = true;
+ break;
+ case 'd':
+ uffd_delay = strtoul(optarg, NULL, 0);
+ TEST_ASSERT(uffd_delay >= 0,
+ "A negative UFFD delay is not supported.");
+ break;
+ case 'b':
+ guest_percpu_mem_size = parse_size(optarg);
+ break;
+ case 'v':
+ nr_vcpus = atoi(optarg);
+ TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus,
+ "Invalid number of vcpus, must be between 1 and %d", max_vcpus);
+ break;
+ case 'h':
+ default:
+ help(argv[0]);
+ break;
+ }
+ }
+
+ for (i = 0; i < NUM_VM_MODES; ++i) {
+ if (!guest_modes[i].enabled)
+ continue;
+ TEST_ASSERT(guest_modes[i].supported,
+ "Guest mode ID %d (%s) not supported.",
+ i, vm_guest_mode_string(i));
+ run_test(i, use_uffd, uffd_delay);
+ }
+
+ return 0;
+}
+
+#else /* __NR_userfaultfd */
+
+#warning "missing __NR_userfaultfd definition"
+
+int main(void)
+{
+ print_skip("__NR_userfaultfd must be present for userfaultfd test");
+ return KSFT_SKIP;
+}
+
+#endif /* __NR_userfaultfd */
diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c
new file mode 100644
index 000000000..85c9b8f73
--- /dev/null
+++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c
@@ -0,0 +1,376 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM dirty page logging performance test
+ *
+ * Based on dirty_log_test.c
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ * Copyright (C) 2020, Google, Inc.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <pthread.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+
+#include "kvm_util.h"
+#include "perf_test_util.h"
+#include "processor.h"
+#include "test_util.h"
+
+/* How many host loops to run by default (one KVM_GET_DIRTY_LOG for each loop) */
+#define TEST_HOST_LOOP_N 2UL
+
+/* Host variables */
+static bool host_quit;
+static uint64_t iteration;
+static uint64_t vcpu_last_completed_iteration[MAX_VCPUS];
+
+static void *vcpu_worker(void *data)
+{
+ int ret;
+ struct kvm_vm *vm = perf_test_args.vm;
+ uint64_t pages_count = 0;
+ struct kvm_run *run;
+ struct timespec start;
+ struct timespec ts_diff;
+ struct timespec total = (struct timespec){0};
+ struct timespec avg;
+ struct vcpu_args *vcpu_args = (struct vcpu_args *)data;
+ int vcpu_id = vcpu_args->vcpu_id;
+
+ vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
+ run = vcpu_state(vm, vcpu_id);
+
+ while (!READ_ONCE(host_quit)) {
+ uint64_t current_iteration = READ_ONCE(iteration);
+
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ ret = _vcpu_run(vm, vcpu_id);
+ ts_diff = timespec_diff_now(start);
+
+ TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
+ TEST_ASSERT(get_ucall(vm, vcpu_id, NULL) == UCALL_SYNC,
+ "Invalid guest sync status: exit_reason=%s\n",
+ exit_reason_str(run->exit_reason));
+
+ pr_debug("Got sync event from vCPU %d\n", vcpu_id);
+ vcpu_last_completed_iteration[vcpu_id] = current_iteration;
+ pr_debug("vCPU %d updated last completed iteration to %lu\n",
+ vcpu_id, vcpu_last_completed_iteration[vcpu_id]);
+
+ if (current_iteration) {
+ pages_count += vcpu_args->pages;
+ total = timespec_add(total, ts_diff);
+ pr_debug("vCPU %d iteration %lu dirty memory time: %ld.%.9lds\n",
+ vcpu_id, current_iteration, ts_diff.tv_sec,
+ ts_diff.tv_nsec);
+ } else {
+ pr_debug("vCPU %d iteration %lu populate memory time: %ld.%.9lds\n",
+ vcpu_id, current_iteration, ts_diff.tv_sec,
+ ts_diff.tv_nsec);
+ }
+
+ while (current_iteration == READ_ONCE(iteration) &&
+ !READ_ONCE(host_quit)) {}
+ }
+
+ avg = timespec_div(total, vcpu_last_completed_iteration[vcpu_id]);
+ pr_debug("\nvCPU %d dirtied 0x%lx pages over %lu iterations in %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n",
+ vcpu_id, pages_count, vcpu_last_completed_iteration[vcpu_id],
+ total.tv_sec, total.tv_nsec, avg.tv_sec, avg.tv_nsec);
+
+ return NULL;
+}
+
+#ifdef USE_CLEAR_DIRTY_LOG
+static u64 dirty_log_manual_caps;
+#endif
+
+static void run_test(enum vm_guest_mode mode, unsigned long iterations,
+ uint64_t phys_offset, int wr_fract)
+{
+ pthread_t *vcpu_threads;
+ struct kvm_vm *vm;
+ unsigned long *bmap;
+ uint64_t guest_num_pages;
+ uint64_t host_num_pages;
+ int vcpu_id;
+ struct timespec start;
+ struct timespec ts_diff;
+ struct timespec get_dirty_log_total = (struct timespec){0};
+ struct timespec vcpu_dirty_total = (struct timespec){0};
+ struct timespec avg;
+#ifdef USE_CLEAR_DIRTY_LOG
+ struct kvm_enable_cap cap = {};
+ struct timespec clear_dirty_log_total = (struct timespec){0};
+#endif
+
+ vm = create_vm(mode, nr_vcpus, guest_percpu_mem_size);
+
+ perf_test_args.wr_fract = wr_fract;
+
+ guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm_get_page_shift(vm);
+ guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages);
+ host_num_pages = vm_num_host_pages(mode, guest_num_pages);
+ bmap = bitmap_alloc(host_num_pages);
+
+#ifdef USE_CLEAR_DIRTY_LOG
+ cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2;
+ cap.args[0] = dirty_log_manual_caps;
+ vm_enable_cap(vm, &cap);
+#endif
+
+ vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads));
+ TEST_ASSERT(vcpu_threads, "Memory allocation failed");
+
+ add_vcpus(vm, nr_vcpus, guest_percpu_mem_size);
+
+ sync_global_to_guest(vm, perf_test_args);
+
+ /* Start the iterations */
+ iteration = 0;
+ host_quit = false;
+
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+ pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker,
+ &perf_test_args.vcpu_args[vcpu_id]);
+ }
+
+	/* Allow the vCPUs to populate memory */
+	pr_debug("Starting iteration %lu - Populating\n", iteration);
+	for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+		while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) !=
+		       iteration)
+			pr_debug("Waiting for vCPU %d vcpu_last_completed_iteration == %lu\n",
+				 vcpu_id, iteration);
+	}
+
+ ts_diff = timespec_diff_now(start);
+ pr_info("Populate memory time: %ld.%.9lds\n",
+ ts_diff.tv_sec, ts_diff.tv_nsec);
+
+ /* Enable dirty logging */
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX,
+ KVM_MEM_LOG_DIRTY_PAGES);
+ ts_diff = timespec_diff_now(start);
+ pr_info("Enabling dirty logging time: %ld.%.9lds\n\n",
+ ts_diff.tv_sec, ts_diff.tv_nsec);
+
+ while (iteration < iterations) {
+ /*
+ * Incrementing the iteration number will start the vCPUs
+ * dirtying memory again.
+ */
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ iteration++;
+
+ pr_debug("Starting iteration %lu\n", iteration);
+ for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+ while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) != iteration)
+ pr_debug("Waiting for vCPU %d vcpu_last_completed_iteration == %lu\n",
+ vcpu_id, iteration);
+ }
+
+ ts_diff = timespec_diff_now(start);
+ vcpu_dirty_total = timespec_add(vcpu_dirty_total, ts_diff);
+ pr_info("Iteration %lu dirty memory time: %ld.%.9lds\n",
+ iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
+
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap);
+
+ ts_diff = timespec_diff_now(start);
+ get_dirty_log_total = timespec_add(get_dirty_log_total,
+ ts_diff);
+ pr_info("Iteration %lu get dirty log time: %ld.%.9lds\n",
+ iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
+
+#ifdef USE_CLEAR_DIRTY_LOG
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ kvm_vm_clear_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap, 0,
+ host_num_pages);
+
+ ts_diff = timespec_diff_now(start);
+ clear_dirty_log_total = timespec_add(clear_dirty_log_total,
+ ts_diff);
+ pr_info("Iteration %lu clear dirty log time: %ld.%.9lds\n",
+ iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
+#endif
+ }
+
+ /* Tell the vcpu thread to quit */
+ host_quit = true;
+ for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++)
+ pthread_join(vcpu_threads[vcpu_id], NULL);
+
+ /* Disable dirty logging */
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX, 0);
+ ts_diff = timespec_diff_now(start);
+ pr_info("Disabling dirty logging time: %ld.%.9lds\n",
+ ts_diff.tv_sec, ts_diff.tv_nsec);
+
+ avg = timespec_div(get_dirty_log_total, iterations);
+ pr_info("Get dirty log over %lu iterations took %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n",
+ iterations, get_dirty_log_total.tv_sec,
+ get_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec);
+
+#ifdef USE_CLEAR_DIRTY_LOG
+ avg = timespec_div(clear_dirty_log_total, iterations);
+ pr_info("Clear dirty log over %lu iterations took %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n",
+ iterations, clear_dirty_log_total.tv_sec,
+ clear_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec);
+#endif
+
+ free(bmap);
+ free(vcpu_threads);
+ ucall_uninit(vm);
+ kvm_vm_free(vm);
+}
+
+struct guest_mode {
+ bool supported;
+ bool enabled;
+};
+static struct guest_mode guest_modes[NUM_VM_MODES];
+
+#define guest_mode_init(mode, supported, enabled) ({ \
+ guest_modes[mode] = (struct guest_mode){ supported, enabled }; \
+})
+
+static void help(char *name)
+{
+ int i;
+
+ puts("");
+ printf("usage: %s [-h] [-i iterations] [-p offset] "
+	       "[-m mode] [-b vcpu bytes] [-f wr_fract] [-v vcpus]\n", name);
+ puts("");
+ printf(" -i: specify iteration counts (default: %"PRIu64")\n",
+ TEST_HOST_LOOP_N);
+ printf(" -p: specify guest physical test memory offset\n"
+ " Warning: a low offset can conflict with the loaded test code.\n");
+ printf(" -m: specify the guest mode ID to test "
+ "(default: test all supported modes)\n"
+ " This option may be used multiple times.\n"
+ " Guest mode IDs:\n");
+ for (i = 0; i < NUM_VM_MODES; ++i) {
+ printf(" %d: %s%s\n", i, vm_guest_mode_string(i),
+ guest_modes[i].supported ? " (supported)" : "");
+ }
+ printf(" -b: specify the size of the memory region which should be\n"
+ " dirtied by each vCPU. e.g. 10M or 3G.\n"
+ " (default: 1G)\n");
+ printf(" -f: specify the fraction of pages which should be written to\n"
+ " as opposed to simply read, in the form\n"
+ " 1/<fraction of pages to write>.\n"
+ " (default: 1 i.e. all pages are written to.)\n");
+ printf(" -v: specify the number of vCPUs to run.\n");
+ puts("");
+ exit(0);
+}
+
+int main(int argc, char *argv[])
+{
+ unsigned long iterations = TEST_HOST_LOOP_N;
+ bool mode_selected = false;
+ uint64_t phys_offset = 0;
+ unsigned int mode;
+ int opt, i;
+ int wr_fract = 1;
+
+#ifdef USE_CLEAR_DIRTY_LOG
+ dirty_log_manual_caps =
+ kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
+ if (!dirty_log_manual_caps) {
+ print_skip("KVM_CLEAR_DIRTY_LOG not available");
+ exit(KSFT_SKIP);
+ }
+ dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
+ KVM_DIRTY_LOG_INITIALLY_SET);
+#endif
+
+#ifdef __x86_64__
+ guest_mode_init(VM_MODE_PXXV48_4K, true, true);
+#endif
+#ifdef __aarch64__
+ guest_mode_init(VM_MODE_P40V48_4K, true, true);
+ guest_mode_init(VM_MODE_P40V48_64K, true, true);
+
+ {
+ unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE);
+
+ if (limit >= 52)
+ guest_mode_init(VM_MODE_P52V48_64K, true, true);
+ if (limit >= 48) {
+ guest_mode_init(VM_MODE_P48V48_4K, true, true);
+ guest_mode_init(VM_MODE_P48V48_64K, true, true);
+ }
+ }
+#endif
+#ifdef __s390x__
+ guest_mode_init(VM_MODE_P40V48_4K, true, true);
+#endif
+
+ while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:")) != -1) {
+ switch (opt) {
+ case 'i':
+ iterations = strtol(optarg, NULL, 10);
+ break;
+ case 'p':
+ phys_offset = strtoull(optarg, NULL, 0);
+ break;
+ case 'm':
+ if (!mode_selected) {
+ for (i = 0; i < NUM_VM_MODES; ++i)
+ guest_modes[i].enabled = false;
+ mode_selected = true;
+ }
+ mode = strtoul(optarg, NULL, 10);
+ TEST_ASSERT(mode < NUM_VM_MODES,
+ "Guest mode ID %d too big", mode);
+ guest_modes[mode].enabled = true;
+ break;
+ case 'b':
+ guest_percpu_mem_size = parse_size(optarg);
+ break;
+ case 'f':
+ wr_fract = atoi(optarg);
+ TEST_ASSERT(wr_fract >= 1,
+ "Write fraction cannot be less than one");
+ break;
+ case 'v':
+ nr_vcpus = atoi(optarg);
+ TEST_ASSERT(nr_vcpus > 0,
+ "Must have a positive number of vCPUs");
+ TEST_ASSERT(nr_vcpus <= MAX_VCPUS,
+ "This test does not currently support\n"
+ "more than %d vCPUs.", MAX_VCPUS);
+ break;
+ case 'h':
+ default:
+ help(argv[0]);
+ break;
+ }
+ }
+
+ TEST_ASSERT(iterations >= 2, "The test should have at least two iterations");
+
+ pr_info("Test iterations: %"PRIu64"\n", iterations);
+
+ for (i = 0; i < NUM_VM_MODES; ++i) {
+ if (!guest_modes[i].enabled)
+ continue;
+ TEST_ASSERT(guest_modes[i].supported,
+ "Guest mode ID %d (%s) not supported.",
+ i, vm_guest_mode_string(i));
+ run_test(i, iterations, phys_offset, wr_fract);
+ }
+
+ return 0;
+}
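Editor's note: dirty_log_perf_test accumulates per-iteration times with timespec_add() and reports a per-iteration average with timespec_div(). The standalone sketch below re-implements equivalent helpers purely to illustrate the arithmetic (carry the nanosecond overflow, then divide total nanoseconds); the names are illustrative and this is not the selftest library code.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000L

/* Illustrative stand-ins for the selftest's timespec helpers. */
static struct timespec ts_add(struct timespec a, struct timespec b)
{
	struct timespec r = {
		.tv_sec  = a.tv_sec + b.tv_sec,
		.tv_nsec = a.tv_nsec + b.tv_nsec,
	};

	if (r.tv_nsec >= NSEC_PER_SEC) {
		r.tv_nsec -= NSEC_PER_SEC;
		r.tv_sec++;
	}
	return r;
}

static struct timespec ts_div(struct timespec ts, uint64_t divisor)
{
	uint64_t total_ns = (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
	uint64_t each_ns = total_ns / divisor;

	return (struct timespec){
		.tv_sec  = each_ns / NSEC_PER_SEC,
		.tv_nsec = each_ns % NSEC_PER_SEC,
	};
}

int main(void)
{
	struct timespec samples[2] = {
		{ .tv_sec = 0, .tv_nsec = 750000000 },	/* 0.75 s */
		{ .tv_sec = 1, .tv_nsec = 500000000 },	/* 1.50 s */
	};
	struct timespec total = {0};
	struct timespec avg;
	int i;

	for (i = 0; i < 2; i++)
		total = ts_add(total, samples[i]);
	avg = ts_div(total, 2);	/* 2.25 s / 2 == 1.125 s */

	printf("Avg %ld.%.9lds/iteration\n", (long)avg.tv_sec, avg.tv_nsec);
	return 0;
}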
diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c
new file mode 100644
index 000000000..54da9cc20
--- /dev/null
+++ b/tools/testing/selftests/kvm/dirty_log_test.c
@@ -0,0 +1,639 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM dirty page logging test
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <pthread.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 1
+
+/* The memory slot index to track dirty pages */
+#define TEST_MEM_SLOT_INDEX 1
+
+/* Default guest test virtual memory offset */
+#define DEFAULT_GUEST_TEST_MEM 0xc0000000
+
+/* How many pages to dirty for each guest loop */
+#define TEST_PAGES_PER_LOOP 1024
+
+/* How many host loops to run (one KVM_GET_DIRTY_LOG for each loop) */
+#define TEST_HOST_LOOP_N 32UL
+
+/* Interval for each host loop (ms) */
+#define TEST_HOST_LOOP_INTERVAL 10UL
+
+/* Dirty bitmaps are always little endian, so we need to swap on big endian */
+#if defined(__s390x__)
+# define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7)
+# define test_bit_le(nr, addr) \
+ test_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
+# define set_bit_le(nr, addr) \
+ set_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
+# define clear_bit_le(nr, addr) \
+ clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
+# define test_and_set_bit_le(nr, addr) \
+ test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
+# define test_and_clear_bit_le(nr, addr) \
+ test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
+#else
+# define test_bit_le test_bit
+# define set_bit_le set_bit
+# define clear_bit_le clear_bit
+# define test_and_set_bit_le test_and_set_bit
+# define test_and_clear_bit_le test_and_clear_bit
+#endif
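+
+/*
+ * Worked example: with 64-bit longs, BITOP_LE_SWIZZLE is 56, so
+ * little-endian bit 0 is addressed as bit 0 ^ 56 = 56 of the same long
+ * on the big-endian host (bit 0 of byte 7), and bit 8 becomes
+ * 8 ^ 56 = 48 (bit 0 of byte 6). Only the byte order within each long
+ * is reversed; the bit position inside a byte is unchanged.
+ */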
+
+/*
+ * Guest/Host shared variables. Ensure addr_gva2hva() and/or
+ * sync_global_to/from_guest() are used when accessing from
+ * the host. READ/WRITE_ONCE() should also be used with anything
+ * that may change.
+ */
+static uint64_t host_page_size;
+static uint64_t guest_page_size;
+static uint64_t guest_num_pages;
+static uint64_t random_array[TEST_PAGES_PER_LOOP];
+static uint64_t iteration;
+
+/*
+ * Guest physical memory offset of the testing memory slot.
+ * This will be set to the topmost valid physical address minus
+ * the test memory size.
+ */
+static uint64_t guest_test_phys_mem;
+
+/*
+ * Guest virtual memory offset of the testing memory slot.
+ * Must not conflict with identity mapped test code.
+ */
+static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM;
+
+/*
+ * Continuously write to the first 8 bytes of random pages within
+ * the testing memory region.
+ */
+static void guest_code(void)
+{
+ uint64_t addr;
+ int i;
+
+ /*
+ * On s390x, all pages of a 1M segment are initially marked as dirty
+ * when a page of the segment is written to for the very first time.
+	 * To compensate for this peculiarity in this test, we need to touch
+	 * all
+ * pages during the first iteration.
+ */
+ for (i = 0; i < guest_num_pages; i++) {
+ addr = guest_test_virt_mem + i * guest_page_size;
+ *(uint64_t *)addr = READ_ONCE(iteration);
+ }
+
+ while (true) {
+ for (i = 0; i < TEST_PAGES_PER_LOOP; i++) {
+ addr = guest_test_virt_mem;
+ addr += (READ_ONCE(random_array[i]) % guest_num_pages)
+ * guest_page_size;
+ addr &= ~(host_page_size - 1);
+ *(uint64_t *)addr = READ_ONCE(iteration);
+ }
+
+ /* Tell the host that we need more random numbers */
+ GUEST_SYNC(1);
+ }
+}
+
+/* Host variables */
+static bool host_quit;
+
+/* Points to the test VM memory region on which we track dirty logs */
+static void *host_test_mem;
+static uint64_t host_num_pages;
+
+/* For statistics only */
+static uint64_t host_dirty_count;
+static uint64_t host_clear_count;
+static uint64_t host_track_next_count;
+
+enum log_mode_t {
+ /* Only use KVM_GET_DIRTY_LOG for logging */
+ LOG_MODE_DIRTY_LOG = 0,
+
+ /* Use both KVM_[GET|CLEAR]_DIRTY_LOG for logging */
+ LOG_MODE_CLEAR_LOG = 1,
+
+ LOG_MODE_NUM,
+
+ /* Run all supported modes */
+ LOG_MODE_ALL = LOG_MODE_NUM,
+};
+
+/* Mode of logging to test. Default is to run all supported modes */
+static enum log_mode_t host_log_mode_option = LOG_MODE_ALL;
+/* Logging mode for current run */
+static enum log_mode_t host_log_mode;
+
+static bool clear_log_supported(void)
+{
+ return kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
+}
+
+static void clear_log_create_vm_done(struct kvm_vm *vm)
+{
+ struct kvm_enable_cap cap = {};
+ u64 manual_caps;
+
+ manual_caps = kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
+ TEST_ASSERT(manual_caps, "MANUAL_CAPS is zero!");
+ manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
+ KVM_DIRTY_LOG_INITIALLY_SET);
+ cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2;
+ cap.args[0] = manual_caps;
+ vm_enable_cap(vm, &cap);
+}
+
+static void dirty_log_collect_dirty_pages(struct kvm_vm *vm, int slot,
+ void *bitmap, uint32_t num_pages)
+{
+ kvm_vm_get_dirty_log(vm, slot, bitmap);
+}
+
+static void clear_log_collect_dirty_pages(struct kvm_vm *vm, int slot,
+ void *bitmap, uint32_t num_pages)
+{
+ kvm_vm_get_dirty_log(vm, slot, bitmap);
+ kvm_vm_clear_dirty_log(vm, slot, bitmap, 0, num_pages);
+}
+
+struct log_mode {
+ const char *name;
+ /* Return true if this mode is supported, otherwise false */
+ bool (*supported)(void);
+ /* Hook when the vm creation is done (before vcpu creation) */
+ void (*create_vm_done)(struct kvm_vm *vm);
+ /* Hook to collect the dirty pages into the bitmap provided */
+ void (*collect_dirty_pages) (struct kvm_vm *vm, int slot,
+ void *bitmap, uint32_t num_pages);
+} log_modes[LOG_MODE_NUM] = {
+ {
+ .name = "dirty-log",
+ .collect_dirty_pages = dirty_log_collect_dirty_pages,
+ },
+ {
+ .name = "clear-log",
+ .supported = clear_log_supported,
+ .create_vm_done = clear_log_create_vm_done,
+ .collect_dirty_pages = clear_log_collect_dirty_pages,
+ },
+};
+
+/*
+ * We use this bitmap to track pages that should have their dirty bit
+ * set in the _next_ iteration. For example, if we detect that a page's
+ * value changed to the current iteration, but at the same time its
+ * bit is cleared in the latest bitmap, then the system must
+ * report that write in the next get dirty log call.
+ */
+static unsigned long *host_bmap_track;
+
+static void log_modes_dump(void)
+{
+ int i;
+
+ printf("all");
+ for (i = 0; i < LOG_MODE_NUM; i++)
+ printf(", %s", log_modes[i].name);
+ printf("\n");
+}
+
+static bool log_mode_supported(void)
+{
+ struct log_mode *mode = &log_modes[host_log_mode];
+
+ if (mode->supported)
+ return mode->supported();
+
+ return true;
+}
+
+static void log_mode_create_vm_done(struct kvm_vm *vm)
+{
+ struct log_mode *mode = &log_modes[host_log_mode];
+
+ if (mode->create_vm_done)
+ mode->create_vm_done(vm);
+}
+
+static void log_mode_collect_dirty_pages(struct kvm_vm *vm, int slot,
+ void *bitmap, uint32_t num_pages)
+{
+ struct log_mode *mode = &log_modes[host_log_mode];
+
+ TEST_ASSERT(mode->collect_dirty_pages != NULL,
+ "collect_dirty_pages() is required for any log mode!");
+ mode->collect_dirty_pages(vm, slot, bitmap, num_pages);
+}
+
+static void generate_random_array(uint64_t *guest_array, uint64_t size)
+{
+ uint64_t i;
+
+ for (i = 0; i < size; i++)
+ guest_array[i] = random();
+}
+
+static void *vcpu_worker(void *data)
+{
+ int ret;
+ struct kvm_vm *vm = data;
+ uint64_t *guest_array;
+ uint64_t pages_count = 0;
+ struct kvm_run *run;
+
+ run = vcpu_state(vm, VCPU_ID);
+
+ guest_array = addr_gva2hva(vm, (vm_vaddr_t)random_array);
+ generate_random_array(guest_array, TEST_PAGES_PER_LOOP);
+
+ while (!READ_ONCE(host_quit)) {
+ /* Let the guest dirty the random pages */
+ ret = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
+ if (get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC) {
+ pages_count += TEST_PAGES_PER_LOOP;
+ generate_random_array(guest_array, TEST_PAGES_PER_LOOP);
+ } else {
+ TEST_FAIL("Invalid guest sync status: "
+ "exit_reason=%s\n",
+ exit_reason_str(run->exit_reason));
+ }
+ }
+
+ pr_info("Dirtied %"PRIu64" pages\n", pages_count);
+
+ return NULL;
+}
+
+static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap)
+{
+ uint64_t step = vm_num_host_pages(mode, 1);
+ uint64_t page;
+ uint64_t *value_ptr;
+
+ for (page = 0; page < host_num_pages; page += step) {
+ value_ptr = host_test_mem + page * host_page_size;
+
+ /* If this is a special page that we were tracking... */
+ if (test_and_clear_bit_le(page, host_bmap_track)) {
+ host_track_next_count++;
+ TEST_ASSERT(test_bit_le(page, bmap),
+ "Page %"PRIu64" should have its dirty bit "
+ "set in this iteration but it is missing",
+ page);
+ }
+
+ if (test_and_clear_bit_le(page, bmap)) {
+ host_dirty_count++;
+ /*
+ * If the bit is set, the value written onto
+ * the corresponding page should be either the
+ * previous iteration number or the current one.
+ */
+ TEST_ASSERT(*value_ptr == iteration ||
+ *value_ptr == iteration - 1,
+ "Set page %"PRIu64" value %"PRIu64
+ " incorrect (iteration=%"PRIu64")",
+ page, *value_ptr, iteration);
+ } else {
+ host_clear_count++;
+ /*
+			 * value smaller than or equal to the iteration
+ * value smaller or equals to the iteration
+ * number. Note that the value can be exactly
+ * (iteration-1) if that write can happen
+ * like this:
+ *
+ * (1) increase loop count to "iteration-1"
+ * (2) write to page P happens (with value
+ * "iteration-1")
+ * (3) get dirty log for "iteration-1"; we'll
+ * see that page P bit is set (dirtied),
+ * and not set the bit in host_bmap_track
+ * (4) increase loop count to "iteration"
+ * (which is current iteration)
+ * (5) get dirty log for current iteration,
+ * we'll see that page P is cleared, with
+ * value "iteration-1".
+ */
+ TEST_ASSERT(*value_ptr <= iteration,
+ "Clear page %"PRIu64" value %"PRIu64
+ " incorrect (iteration=%"PRIu64")",
+ page, *value_ptr, iteration);
+ if (*value_ptr == iteration) {
+ /*
+				 * This page was _just_ modified; it
+				 * should report its dirtiness in the
+				 * next run.
+ */
+ set_bit_le(page, host_bmap_track);
+ }
+ }
+ }
+}
+
+static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid,
+ uint64_t extra_mem_pages, void *guest_code)
+{
+ struct kvm_vm *vm;
+ uint64_t extra_pg_pages = extra_mem_pages / 512 * 2;
+
+ pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
+
+ vm = vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR);
+ kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
+#ifdef __x86_64__
+ vm_create_irqchip(vm);
+#endif
+ log_mode_create_vm_done(vm);
+ vm_vcpu_add_default(vm, vcpuid, guest_code);
+ return vm;
+}
+
+#define DIRTY_MEM_BITS 30 /* 1G */
+#define PAGE_SHIFT_4K 12
+
+static void run_test(enum vm_guest_mode mode, unsigned long iterations,
+ unsigned long interval, uint64_t phys_offset)
+{
+ pthread_t vcpu_thread;
+ struct kvm_vm *vm;
+ unsigned long *bmap;
+
+ if (!log_mode_supported()) {
+ print_skip("Log mode '%s' not supported",
+ log_modes[host_log_mode].name);
+ return;
+ }
+
+ /*
+	 * We reserve page table memory for twice the amount of extra
+	 * dirty memory, which will definitely cover the original (1G+)
+	 * test range.  The calculation is done with the 4K page size,
+	 * the smallest, so the page count is sufficient for all archs
+	 * (e.g., a 64K page size guest will need even less memory for
+	 * page tables).
+ */
+ vm = create_vm(mode, VCPU_ID,
+ 2ul << (DIRTY_MEM_BITS - PAGE_SHIFT_4K),
+ guest_code);
+
+ guest_page_size = vm_get_page_size(vm);
+ /*
+	 * A little more than 1G of guest-page-sized pages.  Cover the
+ * case where the size is not aligned to 64 pages.
+ */
+ guest_num_pages = (1ul << (DIRTY_MEM_BITS -
+ vm_get_page_shift(vm))) + 3;
+ guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages);
+
+ host_page_size = getpagesize();
+ host_num_pages = vm_num_host_pages(mode, guest_num_pages);
+
+ if (!phys_offset) {
+ guest_test_phys_mem = (vm_get_max_gfn(vm) -
+ guest_num_pages) * guest_page_size;
+ guest_test_phys_mem &= ~(host_page_size - 1);
+ } else {
+ guest_test_phys_mem = phys_offset;
+ }
+
+#ifdef __s390x__
+ /* Align to 1M (segment size) */
+ guest_test_phys_mem &= ~((1 << 20) - 1);
+#endif
+
+ pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem);
+
+ bmap = bitmap_alloc(host_num_pages);
+ host_bmap_track = bitmap_alloc(host_num_pages);
+
+ /* Add an extra memory slot for testing dirty logging */
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ guest_test_phys_mem,
+ TEST_MEM_SLOT_INDEX,
+ guest_num_pages,
+ KVM_MEM_LOG_DIRTY_PAGES);
+
+ /* Do mapping for the dirty track memory slot */
+ virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0);
+
+ /* Cache the HVA pointer of the region */
+ host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem);
+
+#ifdef __x86_64__
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+#endif
+ ucall_init(vm, NULL);
+
+ /* Export the shared variables to the guest */
+ sync_global_to_guest(vm, host_page_size);
+ sync_global_to_guest(vm, guest_page_size);
+ sync_global_to_guest(vm, guest_test_virt_mem);
+ sync_global_to_guest(vm, guest_num_pages);
+
+ /* Start the iterations */
+ iteration = 1;
+ sync_global_to_guest(vm, iteration);
+ host_quit = false;
+ host_dirty_count = 0;
+ host_clear_count = 0;
+ host_track_next_count = 0;
+
+ pthread_create(&vcpu_thread, NULL, vcpu_worker, vm);
+
+ while (iteration < iterations) {
+ /* Give the vcpu thread some time to dirty some pages */
+ usleep(interval * 1000);
+ log_mode_collect_dirty_pages(vm, TEST_MEM_SLOT_INDEX,
+ bmap, host_num_pages);
+ vm_dirty_log_verify(mode, bmap);
+ iteration++;
+ sync_global_to_guest(vm, iteration);
+ }
+
+ /* Tell the vcpu thread to quit */
+ host_quit = true;
+ pthread_join(vcpu_thread, NULL);
+
+ pr_info("Total bits checked: dirty (%"PRIu64"), clear (%"PRIu64"), "
+ "track_next (%"PRIu64")\n", host_dirty_count, host_clear_count,
+ host_track_next_count);
+
+ free(bmap);
+ free(host_bmap_track);
+ ucall_uninit(vm);
+ kvm_vm_free(vm);
+}
+
+struct guest_mode {
+ bool supported;
+ bool enabled;
+};
+static struct guest_mode guest_modes[NUM_VM_MODES];
+
+#define guest_mode_init(mode, supported, enabled) ({ \
+ guest_modes[mode] = (struct guest_mode){ supported, enabled }; \
+})
+
+static void help(char *name)
+{
+ int i;
+
+ puts("");
+ printf("usage: %s [-h] [-i iterations] [-I interval] "
+	       "[-p offset] [-m mode] [-M log_mode]\n", name);
+ puts("");
+ printf(" -i: specify iteration counts (default: %"PRIu64")\n",
+ TEST_HOST_LOOP_N);
+ printf(" -I: specify interval in ms (default: %"PRIu64" ms)\n",
+ TEST_HOST_LOOP_INTERVAL);
+ printf(" -p: specify guest physical test memory offset\n"
+ " Warning: a low offset can conflict with the loaded test code.\n");
+ printf(" -M: specify the host logging mode "
+ "(default: run all log modes). Supported modes: \n\t");
+ log_modes_dump();
+ printf(" -m: specify the guest mode ID to test "
+ "(default: test all supported modes)\n"
+ " This option may be used multiple times.\n"
+ " Guest mode IDs:\n");
+ for (i = 0; i < NUM_VM_MODES; ++i) {
+ printf(" %d: %s%s\n", i, vm_guest_mode_string(i),
+ guest_modes[i].supported ? " (supported)" : "");
+ }
+ puts("");
+ exit(0);
+}
+
+int main(int argc, char *argv[])
+{
+ unsigned long iterations = TEST_HOST_LOOP_N;
+ unsigned long interval = TEST_HOST_LOOP_INTERVAL;
+ bool mode_selected = false;
+ uint64_t phys_offset = 0;
+ unsigned int mode;
+ int opt, i, j;
+
+#ifdef __x86_64__
+ guest_mode_init(VM_MODE_PXXV48_4K, true, true);
+#endif
+#ifdef __aarch64__
+ guest_mode_init(VM_MODE_P40V48_4K, true, true);
+ guest_mode_init(VM_MODE_P40V48_64K, true, true);
+
+ {
+ unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE);
+
+ if (limit >= 52)
+ guest_mode_init(VM_MODE_P52V48_64K, true, true);
+ if (limit >= 48) {
+ guest_mode_init(VM_MODE_P48V48_4K, true, true);
+ guest_mode_init(VM_MODE_P48V48_64K, true, true);
+ }
+ }
+#endif
+#ifdef __s390x__
+ guest_mode_init(VM_MODE_P40V48_4K, true, true);
+#endif
+
+ while ((opt = getopt(argc, argv, "hi:I:p:m:M:")) != -1) {
+ switch (opt) {
+ case 'i':
+ iterations = strtol(optarg, NULL, 10);
+ break;
+ case 'I':
+ interval = strtol(optarg, NULL, 10);
+ break;
+ case 'p':
+ phys_offset = strtoull(optarg, NULL, 0);
+ break;
+ case 'm':
+ if (!mode_selected) {
+ for (i = 0; i < NUM_VM_MODES; ++i)
+ guest_modes[i].enabled = false;
+ mode_selected = true;
+ }
+ mode = strtoul(optarg, NULL, 10);
+ TEST_ASSERT(mode < NUM_VM_MODES,
+ "Guest mode ID %d too big", mode);
+ guest_modes[mode].enabled = true;
+ break;
+ case 'M':
+ if (!strcmp(optarg, "all")) {
+ host_log_mode_option = LOG_MODE_ALL;
+ break;
+ }
+ for (i = 0; i < LOG_MODE_NUM; i++) {
+ if (!strcmp(optarg, log_modes[i].name)) {
+ pr_info("Setting log mode to: '%s'\n",
+ optarg);
+ host_log_mode_option = i;
+ break;
+ }
+ }
+ if (i == LOG_MODE_NUM) {
+ printf("Log mode '%s' invalid. Please choose "
+ "from: ", optarg);
+ log_modes_dump();
+ exit(1);
+ }
+ break;
+ case 'h':
+ default:
+ help(argv[0]);
+ break;
+ }
+ }
+
+ TEST_ASSERT(iterations > 2, "Iterations must be greater than two");
+ TEST_ASSERT(interval > 0, "Interval must be greater than zero");
+
+ pr_info("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n",
+ iterations, interval);
+
+ srandom(time(0));
+
+ for (i = 0; i < NUM_VM_MODES; ++i) {
+ if (!guest_modes[i].enabled)
+ continue;
+ TEST_ASSERT(guest_modes[i].supported,
+ "Guest mode ID %d (%s) not supported.",
+ i, vm_guest_mode_string(i));
+ if (host_log_mode_option == LOG_MODE_ALL) {
+ /* Run each log mode */
+ for (j = 0; j < LOG_MODE_NUM; j++) {
+ pr_info("Testing Log Mode '%s'\n",
+ log_modes[j].name);
+ host_log_mode = j;
+ run_test(i, iterations, interval, phys_offset);
+ }
+ } else {
+ host_log_mode = host_log_mode_option;
+ run_test(i, iterations, interval, phys_offset);
+ }
+ }
+
+ return 0;
+}
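Editor's note: the verification in vm_dirty_log_verify() reduces to one decision per page, driven by whether the dirty bit was reported and which iteration value the guest last wrote. The standalone sketch below (illustrative names, a single page, iteration >= 1 assumed) only mirrors that logic; it is not the selftest code.

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Decide what a reported (or missing) dirty bit means for one page.
 * Returns true if the page must be reported dirty in the *next*
 * dirty log collection (the "track next" case).
 */
static bool check_page(bool dirty_bit_set, uint64_t value, uint64_t iteration)
{
	if (dirty_bit_set) {
		/* A set bit allows the previous or the current value. */
		if (value != iteration && value != iteration - 1)
			printf("BUG: set page has value %" PRIu64
			       " at iteration %" PRIu64 "\n", value, iteration);
		return false;
	}

	/* A clear bit allows any value up to the current iteration... */
	if (value > iteration)
		printf("BUG: clear page has value %" PRIu64
		       " at iteration %" PRIu64 "\n", value, iteration);

	/*
	 * ...but if the value already matches the current iteration, the
	 * write raced with the collection and must show up next time.
	 */
	return value == iteration;
}

int main(void)
{
	/* Clean bit but already carrying iteration 3's value: track it. */
	printf("track next: %d\n", check_page(false, 3, 3));	/* 1 */
	/* Dirty bit with the previous iteration's value: fine as is. */
	printf("track next: %d\n", check_page(true, 2, 3));	/* 0 */
	return 0;
}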
diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h
new file mode 100644
index 000000000..b7fa0c855
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/aarch64/processor.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * AArch64 processor specific defines
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ */
+#ifndef SELFTEST_KVM_PROCESSOR_H
+#define SELFTEST_KVM_PROCESSOR_H
+
+#include "kvm_util.h"
+
+
+#define ARM64_CORE_REG(x) (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \
+ KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x))
+
+#define CPACR_EL1 3, 0, 1, 0, 2
+#define TCR_EL1 3, 0, 2, 0, 2
+#define MAIR_EL1 3, 0, 10, 2, 0
+#define TTBR0_EL1 3, 0, 2, 0, 0
+#define SCTLR_EL1 3, 0, 1, 0, 0
+
+/*
+ * Default MAIR
+ * index attribute
+ * DEVICE_nGnRnE 0 0000:0000
+ * DEVICE_nGnRE 1 0000:0100
+ * DEVICE_GRE 2 0000:1100
+ * NORMAL_NC 3 0100:0100
+ * NORMAL 4 1111:1111
+ * NORMAL_WT 5 1011:1011
+ */
+#define DEFAULT_MAIR_EL1 ((0x00ul << (0 * 8)) | \
+ (0x04ul << (1 * 8)) | \
+ (0x0cul << (2 * 8)) | \
+ (0x44ul << (3 * 8)) | \
+ (0xfful << (4 * 8)) | \
+ (0xbbul << (5 * 8)))
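+/*
+ * Each attribute index i above occupies MAIR_EL1 bits [8*i + 7 : 8*i];
+ * e.g. index 4 (NORMAL, 0xff) lands in bits 39:32 of the register.
+ */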
+
+static inline void get_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, uint64_t *addr)
+{
+ struct kvm_one_reg reg;
+ reg.id = id;
+ reg.addr = (uint64_t)addr;
+ vcpu_ioctl(vm, vcpuid, KVM_GET_ONE_REG, &reg);
+}
+
+static inline void set_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, uint64_t val)
+{
+ struct kvm_one_reg reg;
+ reg.id = id;
+ reg.addr = (uint64_t)&val;
+ vcpu_ioctl(vm, vcpuid, KVM_SET_ONE_REG, &reg);
+}
+
+void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *init);
+void aarch64_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_vcpu_init *init, void *guest_code);
+
+#endif /* SELFTEST_KVM_PROCESSOR_H */
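Editor's note: the get_reg()/set_reg() wrappers above take register ids built from ARM64_CORE_REG() or from the sysreg encodings listed in this header. As a usage sketch (a fragment, not standalone code: it assumes a vm handle, a VCPU_ID and a guest_code entry function already set up by the selftest harness), pointing a vCPU's program counter at its entry point and reading it back might look like:

	uint64_t pc;

	set_reg(vm, VCPU_ID, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code);
	get_reg(vm, VCPU_ID, ARM64_CORE_REG(regs.pc), &pc);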
diff --git a/tools/testing/selftests/kvm/include/evmcs.h b/tools/testing/selftests/kvm/include/evmcs.h
new file mode 100644
index 000000000..a034438b6
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/evmcs.h
@@ -0,0 +1,1102 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * tools/testing/selftests/kvm/include/evmcs.h
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ *
+ */
+
+#ifndef SELFTEST_KVM_EVMCS_H
+#define SELFTEST_KVM_EVMCS_H
+
+#include <stdint.h>
+#include "vmx.h"
+
+#define u16 uint16_t
+#define u32 uint32_t
+#define u64 uint64_t
+
+#define EVMCS_VERSION 1
+
+extern bool enable_evmcs;
+
+struct hv_vp_assist_page {
+ __u32 apic_assist;
+ __u32 reserved;
+ __u64 vtl_control[2];
+ __u64 nested_enlightenments_control[2];
+ __u32 enlighten_vmentry;
+ __u64 current_nested_vmcs;
+};
+
+struct hv_enlightened_vmcs {
+ u32 revision_id;
+ u32 abort;
+
+ u16 host_es_selector;
+ u16 host_cs_selector;
+ u16 host_ss_selector;
+ u16 host_ds_selector;
+ u16 host_fs_selector;
+ u16 host_gs_selector;
+ u16 host_tr_selector;
+
+ u64 host_ia32_pat;
+ u64 host_ia32_efer;
+
+ u64 host_cr0;
+ u64 host_cr3;
+ u64 host_cr4;
+
+ u64 host_ia32_sysenter_esp;
+ u64 host_ia32_sysenter_eip;
+ u64 host_rip;
+ u32 host_ia32_sysenter_cs;
+
+ u32 pin_based_vm_exec_control;
+ u32 vm_exit_controls;
+ u32 secondary_vm_exec_control;
+
+ u64 io_bitmap_a;
+ u64 io_bitmap_b;
+ u64 msr_bitmap;
+
+ u16 guest_es_selector;
+ u16 guest_cs_selector;
+ u16 guest_ss_selector;
+ u16 guest_ds_selector;
+ u16 guest_fs_selector;
+ u16 guest_gs_selector;
+ u16 guest_ldtr_selector;
+ u16 guest_tr_selector;
+
+ u32 guest_es_limit;
+ u32 guest_cs_limit;
+ u32 guest_ss_limit;
+ u32 guest_ds_limit;
+ u32 guest_fs_limit;
+ u32 guest_gs_limit;
+ u32 guest_ldtr_limit;
+ u32 guest_tr_limit;
+ u32 guest_gdtr_limit;
+ u32 guest_idtr_limit;
+
+ u32 guest_es_ar_bytes;
+ u32 guest_cs_ar_bytes;
+ u32 guest_ss_ar_bytes;
+ u32 guest_ds_ar_bytes;
+ u32 guest_fs_ar_bytes;
+ u32 guest_gs_ar_bytes;
+ u32 guest_ldtr_ar_bytes;
+ u32 guest_tr_ar_bytes;
+
+ u64 guest_es_base;
+ u64 guest_cs_base;
+ u64 guest_ss_base;
+ u64 guest_ds_base;
+ u64 guest_fs_base;
+ u64 guest_gs_base;
+ u64 guest_ldtr_base;
+ u64 guest_tr_base;
+ u64 guest_gdtr_base;
+ u64 guest_idtr_base;
+
+ u64 padding64_1[3];
+
+ u64 vm_exit_msr_store_addr;
+ u64 vm_exit_msr_load_addr;
+ u64 vm_entry_msr_load_addr;
+
+ u64 cr3_target_value0;
+ u64 cr3_target_value1;
+ u64 cr3_target_value2;
+ u64 cr3_target_value3;
+
+ u32 page_fault_error_code_mask;
+ u32 page_fault_error_code_match;
+
+ u32 cr3_target_count;
+ u32 vm_exit_msr_store_count;
+ u32 vm_exit_msr_load_count;
+ u32 vm_entry_msr_load_count;
+
+ u64 tsc_offset;
+ u64 virtual_apic_page_addr;
+ u64 vmcs_link_pointer;
+
+ u64 guest_ia32_debugctl;
+ u64 guest_ia32_pat;
+ u64 guest_ia32_efer;
+
+ u64 guest_pdptr0;
+ u64 guest_pdptr1;
+ u64 guest_pdptr2;
+ u64 guest_pdptr3;
+
+ u64 guest_pending_dbg_exceptions;
+ u64 guest_sysenter_esp;
+ u64 guest_sysenter_eip;
+
+ u32 guest_activity_state;
+ u32 guest_sysenter_cs;
+
+ u64 cr0_guest_host_mask;
+ u64 cr4_guest_host_mask;
+ u64 cr0_read_shadow;
+ u64 cr4_read_shadow;
+ u64 guest_cr0;
+ u64 guest_cr3;
+ u64 guest_cr4;
+ u64 guest_dr7;
+
+ u64 host_fs_base;
+ u64 host_gs_base;
+ u64 host_tr_base;
+ u64 host_gdtr_base;
+ u64 host_idtr_base;
+ u64 host_rsp;
+
+ u64 ept_pointer;
+
+ u16 virtual_processor_id;
+ u16 padding16[3];
+
+ u64 padding64_2[5];
+ u64 guest_physical_address;
+
+ u32 vm_instruction_error;
+ u32 vm_exit_reason;
+ u32 vm_exit_intr_info;
+ u32 vm_exit_intr_error_code;
+ u32 idt_vectoring_info_field;
+ u32 idt_vectoring_error_code;
+ u32 vm_exit_instruction_len;
+ u32 vmx_instruction_info;
+
+ u64 exit_qualification;
+ u64 exit_io_instruction_ecx;
+ u64 exit_io_instruction_esi;
+ u64 exit_io_instruction_edi;
+ u64 exit_io_instruction_eip;
+
+ u64 guest_linear_address;
+ u64 guest_rsp;
+ u64 guest_rflags;
+
+ u32 guest_interruptibility_info;
+ u32 cpu_based_vm_exec_control;
+ u32 exception_bitmap;
+ u32 vm_entry_controls;
+ u32 vm_entry_intr_info_field;
+ u32 vm_entry_exception_error_code;
+ u32 vm_entry_instruction_len;
+ u32 tpr_threshold;
+
+ u64 guest_rip;
+
+ u32 hv_clean_fields;
+ u32 hv_padding_32;
+ u32 hv_synthetic_controls;
+ struct {
+ u32 nested_flush_hypercall:1;
+ u32 msr_bitmap:1;
+ u32 reserved:30;
+ } hv_enlightenments_control;
+ u32 hv_vp_id;
+
+ u64 hv_vm_id;
+ u64 partition_assist_page;
+ u64 padding64_4[4];
+ u64 guest_bndcfgs;
+ u64 padding64_5[7];
+ u64 xss_exit_bitmap;
+ u64 padding64_6[7];
+};
+
+#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073
+#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001
+#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12
+#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \
+ (~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
+
+extern struct hv_enlightened_vmcs *current_evmcs;
+extern struct hv_vp_assist_page *current_vp_assist;
+
+int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id);
+
+static inline int enable_vp_assist(uint64_t vp_assist_pa, void *vp_assist)
+{
+ u64 val = (vp_assist_pa & HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK) |
+ HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
+
+ wrmsr(HV_X64_MSR_VP_ASSIST_PAGE, val);
+
+ current_vp_assist = vp_assist;
+
+ enable_evmcs = true;
+
+ return 0;
+}
+
+static inline int evmcs_vmptrld(uint64_t vmcs_pa, void *vmcs)
+{
+ current_vp_assist->current_nested_vmcs = vmcs_pa;
+ current_vp_assist->enlighten_vmentry = 1;
+
+ current_evmcs = vmcs;
+
+ return 0;
+}
+
+static inline int evmcs_vmptrst(uint64_t *value)
+{
+ *value = current_vp_assist->current_nested_vmcs &
+ ~HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
+
+ return 0;
+}
+
+static inline int evmcs_vmread(uint64_t encoding, uint64_t *value)
+{
+ switch (encoding) {
+ case GUEST_RIP:
+ *value = current_evmcs->guest_rip;
+ break;
+ case GUEST_RSP:
+ *value = current_evmcs->guest_rsp;
+ break;
+ case GUEST_RFLAGS:
+ *value = current_evmcs->guest_rflags;
+ break;
+ case HOST_IA32_PAT:
+ *value = current_evmcs->host_ia32_pat;
+ break;
+ case HOST_IA32_EFER:
+ *value = current_evmcs->host_ia32_efer;
+ break;
+ case HOST_CR0:
+ *value = current_evmcs->host_cr0;
+ break;
+ case HOST_CR3:
+ *value = current_evmcs->host_cr3;
+ break;
+ case HOST_CR4:
+ *value = current_evmcs->host_cr4;
+ break;
+ case HOST_IA32_SYSENTER_ESP:
+ *value = current_evmcs->host_ia32_sysenter_esp;
+ break;
+ case HOST_IA32_SYSENTER_EIP:
+ *value = current_evmcs->host_ia32_sysenter_eip;
+ break;
+ case HOST_RIP:
+ *value = current_evmcs->host_rip;
+ break;
+ case IO_BITMAP_A:
+ *value = current_evmcs->io_bitmap_a;
+ break;
+ case IO_BITMAP_B:
+ *value = current_evmcs->io_bitmap_b;
+ break;
+ case MSR_BITMAP:
+ *value = current_evmcs->msr_bitmap;
+ break;
+ case GUEST_ES_BASE:
+ *value = current_evmcs->guest_es_base;
+ break;
+ case GUEST_CS_BASE:
+ *value = current_evmcs->guest_cs_base;
+ break;
+ case GUEST_SS_BASE:
+ *value = current_evmcs->guest_ss_base;
+ break;
+ case GUEST_DS_BASE:
+ *value = current_evmcs->guest_ds_base;
+ break;
+ case GUEST_FS_BASE:
+ *value = current_evmcs->guest_fs_base;
+ break;
+ case GUEST_GS_BASE:
+ *value = current_evmcs->guest_gs_base;
+ break;
+ case GUEST_LDTR_BASE:
+ *value = current_evmcs->guest_ldtr_base;
+ break;
+ case GUEST_TR_BASE:
+ *value = current_evmcs->guest_tr_base;
+ break;
+ case GUEST_GDTR_BASE:
+ *value = current_evmcs->guest_gdtr_base;
+ break;
+ case GUEST_IDTR_BASE:
+ *value = current_evmcs->guest_idtr_base;
+ break;
+ case TSC_OFFSET:
+ *value = current_evmcs->tsc_offset;
+ break;
+ case VIRTUAL_APIC_PAGE_ADDR:
+ *value = current_evmcs->virtual_apic_page_addr;
+ break;
+ case VMCS_LINK_POINTER:
+ *value = current_evmcs->vmcs_link_pointer;
+ break;
+ case GUEST_IA32_DEBUGCTL:
+ *value = current_evmcs->guest_ia32_debugctl;
+ break;
+ case GUEST_IA32_PAT:
+ *value = current_evmcs->guest_ia32_pat;
+ break;
+ case GUEST_IA32_EFER:
+ *value = current_evmcs->guest_ia32_efer;
+ break;
+ case GUEST_PDPTR0:
+ *value = current_evmcs->guest_pdptr0;
+ break;
+ case GUEST_PDPTR1:
+ *value = current_evmcs->guest_pdptr1;
+ break;
+ case GUEST_PDPTR2:
+ *value = current_evmcs->guest_pdptr2;
+ break;
+ case GUEST_PDPTR3:
+ *value = current_evmcs->guest_pdptr3;
+ break;
+ case GUEST_PENDING_DBG_EXCEPTIONS:
+ *value = current_evmcs->guest_pending_dbg_exceptions;
+ break;
+ case GUEST_SYSENTER_ESP:
+ *value = current_evmcs->guest_sysenter_esp;
+ break;
+ case GUEST_SYSENTER_EIP:
+ *value = current_evmcs->guest_sysenter_eip;
+ break;
+ case CR0_GUEST_HOST_MASK:
+ *value = current_evmcs->cr0_guest_host_mask;
+ break;
+ case CR4_GUEST_HOST_MASK:
+ *value = current_evmcs->cr4_guest_host_mask;
+ break;
+ case CR0_READ_SHADOW:
+ *value = current_evmcs->cr0_read_shadow;
+ break;
+ case CR4_READ_SHADOW:
+ *value = current_evmcs->cr4_read_shadow;
+ break;
+ case GUEST_CR0:
+ *value = current_evmcs->guest_cr0;
+ break;
+ case GUEST_CR3:
+ *value = current_evmcs->guest_cr3;
+ break;
+ case GUEST_CR4:
+ *value = current_evmcs->guest_cr4;
+ break;
+ case GUEST_DR7:
+ *value = current_evmcs->guest_dr7;
+ break;
+ case HOST_FS_BASE:
+ *value = current_evmcs->host_fs_base;
+ break;
+ case HOST_GS_BASE:
+ *value = current_evmcs->host_gs_base;
+ break;
+ case HOST_TR_BASE:
+ *value = current_evmcs->host_tr_base;
+ break;
+ case HOST_GDTR_BASE:
+ *value = current_evmcs->host_gdtr_base;
+ break;
+ case HOST_IDTR_BASE:
+ *value = current_evmcs->host_idtr_base;
+ break;
+ case HOST_RSP:
+ *value = current_evmcs->host_rsp;
+ break;
+ case EPT_POINTER:
+ *value = current_evmcs->ept_pointer;
+ break;
+ case GUEST_BNDCFGS:
+ *value = current_evmcs->guest_bndcfgs;
+ break;
+ case XSS_EXIT_BITMAP:
+ *value = current_evmcs->xss_exit_bitmap;
+ break;
+ case GUEST_PHYSICAL_ADDRESS:
+ *value = current_evmcs->guest_physical_address;
+ break;
+ case EXIT_QUALIFICATION:
+ *value = current_evmcs->exit_qualification;
+ break;
+ case GUEST_LINEAR_ADDRESS:
+ *value = current_evmcs->guest_linear_address;
+ break;
+ case VM_EXIT_MSR_STORE_ADDR:
+ *value = current_evmcs->vm_exit_msr_store_addr;
+ break;
+ case VM_EXIT_MSR_LOAD_ADDR:
+ *value = current_evmcs->vm_exit_msr_load_addr;
+ break;
+ case VM_ENTRY_MSR_LOAD_ADDR:
+ *value = current_evmcs->vm_entry_msr_load_addr;
+ break;
+ case CR3_TARGET_VALUE0:
+ *value = current_evmcs->cr3_target_value0;
+ break;
+ case CR3_TARGET_VALUE1:
+ *value = current_evmcs->cr3_target_value1;
+ break;
+ case CR3_TARGET_VALUE2:
+ *value = current_evmcs->cr3_target_value2;
+ break;
+ case CR3_TARGET_VALUE3:
+ *value = current_evmcs->cr3_target_value3;
+ break;
+ case TPR_THRESHOLD:
+ *value = current_evmcs->tpr_threshold;
+ break;
+ case GUEST_INTERRUPTIBILITY_INFO:
+ *value = current_evmcs->guest_interruptibility_info;
+ break;
+ case CPU_BASED_VM_EXEC_CONTROL:
+ *value = current_evmcs->cpu_based_vm_exec_control;
+ break;
+ case EXCEPTION_BITMAP:
+ *value = current_evmcs->exception_bitmap;
+ break;
+ case VM_ENTRY_CONTROLS:
+ *value = current_evmcs->vm_entry_controls;
+ break;
+ case VM_ENTRY_INTR_INFO_FIELD:
+ *value = current_evmcs->vm_entry_intr_info_field;
+ break;
+ case VM_ENTRY_EXCEPTION_ERROR_CODE:
+ *value = current_evmcs->vm_entry_exception_error_code;
+ break;
+ case VM_ENTRY_INSTRUCTION_LEN:
+ *value = current_evmcs->vm_entry_instruction_len;
+ break;
+ case HOST_IA32_SYSENTER_CS:
+ *value = current_evmcs->host_ia32_sysenter_cs;
+ break;
+ case PIN_BASED_VM_EXEC_CONTROL:
+ *value = current_evmcs->pin_based_vm_exec_control;
+ break;
+ case VM_EXIT_CONTROLS:
+ *value = current_evmcs->vm_exit_controls;
+ break;
+ case SECONDARY_VM_EXEC_CONTROL:
+ *value = current_evmcs->secondary_vm_exec_control;
+ break;
+ case GUEST_ES_LIMIT:
+ *value = current_evmcs->guest_es_limit;
+ break;
+ case GUEST_CS_LIMIT:
+ *value = current_evmcs->guest_cs_limit;
+ break;
+ case GUEST_SS_LIMIT:
+ *value = current_evmcs->guest_ss_limit;
+ break;
+ case GUEST_DS_LIMIT:
+ *value = current_evmcs->guest_ds_limit;
+ break;
+ case GUEST_FS_LIMIT:
+ *value = current_evmcs->guest_fs_limit;
+ break;
+ case GUEST_GS_LIMIT:
+ *value = current_evmcs->guest_gs_limit;
+ break;
+ case GUEST_LDTR_LIMIT:
+ *value = current_evmcs->guest_ldtr_limit;
+ break;
+ case GUEST_TR_LIMIT:
+ *value = current_evmcs->guest_tr_limit;
+ break;
+ case GUEST_GDTR_LIMIT:
+ *value = current_evmcs->guest_gdtr_limit;
+ break;
+ case GUEST_IDTR_LIMIT:
+ *value = current_evmcs->guest_idtr_limit;
+ break;
+ case GUEST_ES_AR_BYTES:
+ *value = current_evmcs->guest_es_ar_bytes;
+ break;
+ case GUEST_CS_AR_BYTES:
+ *value = current_evmcs->guest_cs_ar_bytes;
+ break;
+ case GUEST_SS_AR_BYTES:
+ *value = current_evmcs->guest_ss_ar_bytes;
+ break;
+ case GUEST_DS_AR_BYTES:
+ *value = current_evmcs->guest_ds_ar_bytes;
+ break;
+ case GUEST_FS_AR_BYTES:
+ *value = current_evmcs->guest_fs_ar_bytes;
+ break;
+ case GUEST_GS_AR_BYTES:
+ *value = current_evmcs->guest_gs_ar_bytes;
+ break;
+ case GUEST_LDTR_AR_BYTES:
+ *value = current_evmcs->guest_ldtr_ar_bytes;
+ break;
+ case GUEST_TR_AR_BYTES:
+ *value = current_evmcs->guest_tr_ar_bytes;
+ break;
+ case GUEST_ACTIVITY_STATE:
+ *value = current_evmcs->guest_activity_state;
+ break;
+ case GUEST_SYSENTER_CS:
+ *value = current_evmcs->guest_sysenter_cs;
+ break;
+ case VM_INSTRUCTION_ERROR:
+ *value = current_evmcs->vm_instruction_error;
+ break;
+ case VM_EXIT_REASON:
+ *value = current_evmcs->vm_exit_reason;
+ break;
+ case VM_EXIT_INTR_INFO:
+ *value = current_evmcs->vm_exit_intr_info;
+ break;
+ case VM_EXIT_INTR_ERROR_CODE:
+ *value = current_evmcs->vm_exit_intr_error_code;
+ break;
+ case IDT_VECTORING_INFO_FIELD:
+ *value = current_evmcs->idt_vectoring_info_field;
+ break;
+ case IDT_VECTORING_ERROR_CODE:
+ *value = current_evmcs->idt_vectoring_error_code;
+ break;
+ case VM_EXIT_INSTRUCTION_LEN:
+ *value = current_evmcs->vm_exit_instruction_len;
+ break;
+ case VMX_INSTRUCTION_INFO:
+ *value = current_evmcs->vmx_instruction_info;
+ break;
+ case PAGE_FAULT_ERROR_CODE_MASK:
+ *value = current_evmcs->page_fault_error_code_mask;
+ break;
+ case PAGE_FAULT_ERROR_CODE_MATCH:
+ *value = current_evmcs->page_fault_error_code_match;
+ break;
+ case CR3_TARGET_COUNT:
+ *value = current_evmcs->cr3_target_count;
+ break;
+ case VM_EXIT_MSR_STORE_COUNT:
+ *value = current_evmcs->vm_exit_msr_store_count;
+ break;
+ case VM_EXIT_MSR_LOAD_COUNT:
+ *value = current_evmcs->vm_exit_msr_load_count;
+ break;
+ case VM_ENTRY_MSR_LOAD_COUNT:
+ *value = current_evmcs->vm_entry_msr_load_count;
+ break;
+ case HOST_ES_SELECTOR:
+ *value = current_evmcs->host_es_selector;
+ break;
+ case HOST_CS_SELECTOR:
+ *value = current_evmcs->host_cs_selector;
+ break;
+ case HOST_SS_SELECTOR:
+ *value = current_evmcs->host_ss_selector;
+ break;
+ case HOST_DS_SELECTOR:
+ *value = current_evmcs->host_ds_selector;
+ break;
+ case HOST_FS_SELECTOR:
+ *value = current_evmcs->host_fs_selector;
+ break;
+ case HOST_GS_SELECTOR:
+ *value = current_evmcs->host_gs_selector;
+ break;
+ case HOST_TR_SELECTOR:
+ *value = current_evmcs->host_tr_selector;
+ break;
+ case GUEST_ES_SELECTOR:
+ *value = current_evmcs->guest_es_selector;
+ break;
+ case GUEST_CS_SELECTOR:
+ *value = current_evmcs->guest_cs_selector;
+ break;
+ case GUEST_SS_SELECTOR:
+ *value = current_evmcs->guest_ss_selector;
+ break;
+ case GUEST_DS_SELECTOR:
+ *value = current_evmcs->guest_ds_selector;
+ break;
+ case GUEST_FS_SELECTOR:
+ *value = current_evmcs->guest_fs_selector;
+ break;
+ case GUEST_GS_SELECTOR:
+ *value = current_evmcs->guest_gs_selector;
+ break;
+ case GUEST_LDTR_SELECTOR:
+ *value = current_evmcs->guest_ldtr_selector;
+ break;
+ case GUEST_TR_SELECTOR:
+ *value = current_evmcs->guest_tr_selector;
+ break;
+ case VIRTUAL_PROCESSOR_ID:
+ *value = current_evmcs->virtual_processor_id;
+ break;
+ default: return 1;
+ }
+
+ return 0;
+}
+
+static inline int evmcs_vmwrite(uint64_t encoding, uint64_t value)
+{
+ switch (encoding) {
+ case GUEST_RIP:
+ current_evmcs->guest_rip = value;
+ break;
+ case GUEST_RSP:
+ current_evmcs->guest_rsp = value;
+ break;
+ case GUEST_RFLAGS:
+ current_evmcs->guest_rflags = value;
+ break;
+ case HOST_IA32_PAT:
+ current_evmcs->host_ia32_pat = value;
+ break;
+ case HOST_IA32_EFER:
+ current_evmcs->host_ia32_efer = value;
+ break;
+ case HOST_CR0:
+ current_evmcs->host_cr0 = value;
+ break;
+ case HOST_CR3:
+ current_evmcs->host_cr3 = value;
+ break;
+ case HOST_CR4:
+ current_evmcs->host_cr4 = value;
+ break;
+ case HOST_IA32_SYSENTER_ESP:
+ current_evmcs->host_ia32_sysenter_esp = value;
+ break;
+ case HOST_IA32_SYSENTER_EIP:
+ current_evmcs->host_ia32_sysenter_eip = value;
+ break;
+ case HOST_RIP:
+ current_evmcs->host_rip = value;
+ break;
+ case IO_BITMAP_A:
+ current_evmcs->io_bitmap_a = value;
+ break;
+ case IO_BITMAP_B:
+ current_evmcs->io_bitmap_b = value;
+ break;
+ case MSR_BITMAP:
+ current_evmcs->msr_bitmap = value;
+ break;
+ case GUEST_ES_BASE:
+ current_evmcs->guest_es_base = value;
+ break;
+ case GUEST_CS_BASE:
+ current_evmcs->guest_cs_base = value;
+ break;
+ case GUEST_SS_BASE:
+ current_evmcs->guest_ss_base = value;
+ break;
+ case GUEST_DS_BASE:
+ current_evmcs->guest_ds_base = value;
+ break;
+ case GUEST_FS_BASE:
+ current_evmcs->guest_fs_base = value;
+ break;
+ case GUEST_GS_BASE:
+ current_evmcs->guest_gs_base = value;
+ break;
+ case GUEST_LDTR_BASE:
+ current_evmcs->guest_ldtr_base = value;
+ break;
+ case GUEST_TR_BASE:
+ current_evmcs->guest_tr_base = value;
+ break;
+ case GUEST_GDTR_BASE:
+ current_evmcs->guest_gdtr_base = value;
+ break;
+ case GUEST_IDTR_BASE:
+ current_evmcs->guest_idtr_base = value;
+ break;
+ case TSC_OFFSET:
+ current_evmcs->tsc_offset = value;
+ break;
+ case VIRTUAL_APIC_PAGE_ADDR:
+ current_evmcs->virtual_apic_page_addr = value;
+ break;
+ case VMCS_LINK_POINTER:
+ current_evmcs->vmcs_link_pointer = value;
+ break;
+ case GUEST_IA32_DEBUGCTL:
+ current_evmcs->guest_ia32_debugctl = value;
+ break;
+ case GUEST_IA32_PAT:
+ current_evmcs->guest_ia32_pat = value;
+ break;
+ case GUEST_IA32_EFER:
+ current_evmcs->guest_ia32_efer = value;
+ break;
+ case GUEST_PDPTR0:
+ current_evmcs->guest_pdptr0 = value;
+ break;
+ case GUEST_PDPTR1:
+ current_evmcs->guest_pdptr1 = value;
+ break;
+ case GUEST_PDPTR2:
+ current_evmcs->guest_pdptr2 = value;
+ break;
+ case GUEST_PDPTR3:
+ current_evmcs->guest_pdptr3 = value;
+ break;
+ case GUEST_PENDING_DBG_EXCEPTIONS:
+ current_evmcs->guest_pending_dbg_exceptions = value;
+ break;
+ case GUEST_SYSENTER_ESP:
+ current_evmcs->guest_sysenter_esp = value;
+ break;
+ case GUEST_SYSENTER_EIP:
+ current_evmcs->guest_sysenter_eip = value;
+ break;
+ case CR0_GUEST_HOST_MASK:
+ current_evmcs->cr0_guest_host_mask = value;
+ break;
+ case CR4_GUEST_HOST_MASK:
+ current_evmcs->cr4_guest_host_mask = value;
+ break;
+ case CR0_READ_SHADOW:
+ current_evmcs->cr0_read_shadow = value;
+ break;
+ case CR4_READ_SHADOW:
+ current_evmcs->cr4_read_shadow = value;
+ break;
+ case GUEST_CR0:
+ current_evmcs->guest_cr0 = value;
+ break;
+ case GUEST_CR3:
+ current_evmcs->guest_cr3 = value;
+ break;
+ case GUEST_CR4:
+ current_evmcs->guest_cr4 = value;
+ break;
+ case GUEST_DR7:
+ current_evmcs->guest_dr7 = value;
+ break;
+ case HOST_FS_BASE:
+ current_evmcs->host_fs_base = value;
+ break;
+ case HOST_GS_BASE:
+ current_evmcs->host_gs_base = value;
+ break;
+ case HOST_TR_BASE:
+ current_evmcs->host_tr_base = value;
+ break;
+ case HOST_GDTR_BASE:
+ current_evmcs->host_gdtr_base = value;
+ break;
+ case HOST_IDTR_BASE:
+ current_evmcs->host_idtr_base = value;
+ break;
+ case HOST_RSP:
+ current_evmcs->host_rsp = value;
+ break;
+ case EPT_POINTER:
+ current_evmcs->ept_pointer = value;
+ break;
+ case GUEST_BNDCFGS:
+ current_evmcs->guest_bndcfgs = value;
+ break;
+ case XSS_EXIT_BITMAP:
+ current_evmcs->xss_exit_bitmap = value;
+ break;
+ case GUEST_PHYSICAL_ADDRESS:
+ current_evmcs->guest_physical_address = value;
+ break;
+ case EXIT_QUALIFICATION:
+ current_evmcs->exit_qualification = value;
+ break;
+ case GUEST_LINEAR_ADDRESS:
+ current_evmcs->guest_linear_address = value;
+ break;
+ case VM_EXIT_MSR_STORE_ADDR:
+ current_evmcs->vm_exit_msr_store_addr = value;
+ break;
+ case VM_EXIT_MSR_LOAD_ADDR:
+ current_evmcs->vm_exit_msr_load_addr = value;
+ break;
+ case VM_ENTRY_MSR_LOAD_ADDR:
+ current_evmcs->vm_entry_msr_load_addr = value;
+ break;
+ case CR3_TARGET_VALUE0:
+ current_evmcs->cr3_target_value0 = value;
+ break;
+ case CR3_TARGET_VALUE1:
+ current_evmcs->cr3_target_value1 = value;
+ break;
+ case CR3_TARGET_VALUE2:
+ current_evmcs->cr3_target_value2 = value;
+ break;
+ case CR3_TARGET_VALUE3:
+ current_evmcs->cr3_target_value3 = value;
+ break;
+ case TPR_THRESHOLD:
+ current_evmcs->tpr_threshold = value;
+ break;
+ case GUEST_INTERRUPTIBILITY_INFO:
+ current_evmcs->guest_interruptibility_info = value;
+ break;
+ case CPU_BASED_VM_EXEC_CONTROL:
+ current_evmcs->cpu_based_vm_exec_control = value;
+ break;
+ case EXCEPTION_BITMAP:
+ current_evmcs->exception_bitmap = value;
+ break;
+ case VM_ENTRY_CONTROLS:
+ current_evmcs->vm_entry_controls = value;
+ break;
+ case VM_ENTRY_INTR_INFO_FIELD:
+ current_evmcs->vm_entry_intr_info_field = value;
+ break;
+ case VM_ENTRY_EXCEPTION_ERROR_CODE:
+ current_evmcs->vm_entry_exception_error_code = value;
+ break;
+ case VM_ENTRY_INSTRUCTION_LEN:
+ current_evmcs->vm_entry_instruction_len = value;
+ break;
+ case HOST_IA32_SYSENTER_CS:
+ current_evmcs->host_ia32_sysenter_cs = value;
+ break;
+ case PIN_BASED_VM_EXEC_CONTROL:
+ current_evmcs->pin_based_vm_exec_control = value;
+ break;
+ case VM_EXIT_CONTROLS:
+ current_evmcs->vm_exit_controls = value;
+ break;
+ case SECONDARY_VM_EXEC_CONTROL:
+ current_evmcs->secondary_vm_exec_control = value;
+ break;
+ case GUEST_ES_LIMIT:
+ current_evmcs->guest_es_limit = value;
+ break;
+ case GUEST_CS_LIMIT:
+ current_evmcs->guest_cs_limit = value;
+ break;
+ case GUEST_SS_LIMIT:
+ current_evmcs->guest_ss_limit = value;
+ break;
+ case GUEST_DS_LIMIT:
+ current_evmcs->guest_ds_limit = value;
+ break;
+ case GUEST_FS_LIMIT:
+ current_evmcs->guest_fs_limit = value;
+ break;
+ case GUEST_GS_LIMIT:
+ current_evmcs->guest_gs_limit = value;
+ break;
+ case GUEST_LDTR_LIMIT:
+ current_evmcs->guest_ldtr_limit = value;
+ break;
+ case GUEST_TR_LIMIT:
+ current_evmcs->guest_tr_limit = value;
+ break;
+ case GUEST_GDTR_LIMIT:
+ current_evmcs->guest_gdtr_limit = value;
+ break;
+ case GUEST_IDTR_LIMIT:
+ current_evmcs->guest_idtr_limit = value;
+ break;
+ case GUEST_ES_AR_BYTES:
+ current_evmcs->guest_es_ar_bytes = value;
+ break;
+ case GUEST_CS_AR_BYTES:
+ current_evmcs->guest_cs_ar_bytes = value;
+ break;
+ case GUEST_SS_AR_BYTES:
+ current_evmcs->guest_ss_ar_bytes = value;
+ break;
+ case GUEST_DS_AR_BYTES:
+ current_evmcs->guest_ds_ar_bytes = value;
+ break;
+ case GUEST_FS_AR_BYTES:
+ current_evmcs->guest_fs_ar_bytes = value;
+ break;
+ case GUEST_GS_AR_BYTES:
+ current_evmcs->guest_gs_ar_bytes = value;
+ break;
+ case GUEST_LDTR_AR_BYTES:
+ current_evmcs->guest_ldtr_ar_bytes = value;
+ break;
+ case GUEST_TR_AR_BYTES:
+ current_evmcs->guest_tr_ar_bytes = value;
+ break;
+ case GUEST_ACTIVITY_STATE:
+ current_evmcs->guest_activity_state = value;
+ break;
+ case GUEST_SYSENTER_CS:
+ current_evmcs->guest_sysenter_cs = value;
+ break;
+ case VM_INSTRUCTION_ERROR:
+ current_evmcs->vm_instruction_error = value;
+ break;
+ case VM_EXIT_REASON:
+ current_evmcs->vm_exit_reason = value;
+ break;
+ case VM_EXIT_INTR_INFO:
+ current_evmcs->vm_exit_intr_info = value;
+ break;
+ case VM_EXIT_INTR_ERROR_CODE:
+ current_evmcs->vm_exit_intr_error_code = value;
+ break;
+ case IDT_VECTORING_INFO_FIELD:
+ current_evmcs->idt_vectoring_info_field = value;
+ break;
+ case IDT_VECTORING_ERROR_CODE:
+ current_evmcs->idt_vectoring_error_code = value;
+ break;
+ case VM_EXIT_INSTRUCTION_LEN:
+ current_evmcs->vm_exit_instruction_len = value;
+ break;
+ case VMX_INSTRUCTION_INFO:
+ current_evmcs->vmx_instruction_info = value;
+ break;
+ case PAGE_FAULT_ERROR_CODE_MASK:
+ current_evmcs->page_fault_error_code_mask = value;
+ break;
+ case PAGE_FAULT_ERROR_CODE_MATCH:
+ current_evmcs->page_fault_error_code_match = value;
+ break;
+ case CR3_TARGET_COUNT:
+ current_evmcs->cr3_target_count = value;
+ break;
+ case VM_EXIT_MSR_STORE_COUNT:
+ current_evmcs->vm_exit_msr_store_count = value;
+ break;
+ case VM_EXIT_MSR_LOAD_COUNT:
+ current_evmcs->vm_exit_msr_load_count = value;
+ break;
+ case VM_ENTRY_MSR_LOAD_COUNT:
+ current_evmcs->vm_entry_msr_load_count = value;
+ break;
+ case HOST_ES_SELECTOR:
+ current_evmcs->host_es_selector = value;
+ break;
+ case HOST_CS_SELECTOR:
+ current_evmcs->host_cs_selector = value;
+ break;
+ case HOST_SS_SELECTOR:
+ current_evmcs->host_ss_selector = value;
+ break;
+ case HOST_DS_SELECTOR:
+ current_evmcs->host_ds_selector = value;
+ break;
+ case HOST_FS_SELECTOR:
+ current_evmcs->host_fs_selector = value;
+ break;
+ case HOST_GS_SELECTOR:
+ current_evmcs->host_gs_selector = value;
+ break;
+ case HOST_TR_SELECTOR:
+ current_evmcs->host_tr_selector = value;
+ break;
+ case GUEST_ES_SELECTOR:
+ current_evmcs->guest_es_selector = value;
+ break;
+ case GUEST_CS_SELECTOR:
+ current_evmcs->guest_cs_selector = value;
+ break;
+ case GUEST_SS_SELECTOR:
+ current_evmcs->guest_ss_selector = value;
+ break;
+ case GUEST_DS_SELECTOR:
+ current_evmcs->guest_ds_selector = value;
+ break;
+ case GUEST_FS_SELECTOR:
+ current_evmcs->guest_fs_selector = value;
+ break;
+ case GUEST_GS_SELECTOR:
+ current_evmcs->guest_gs_selector = value;
+ break;
+ case GUEST_LDTR_SELECTOR:
+ current_evmcs->guest_ldtr_selector = value;
+ break;
+ case GUEST_TR_SELECTOR:
+ current_evmcs->guest_tr_selector = value;
+ break;
+ case VIRTUAL_PROCESSOR_ID:
+ current_evmcs->virtual_processor_id = value;
+ break;
+ default: return 1;
+ }
+
+ return 0;
+}
+
+static inline int evmcs_vmlaunch(void)
+{
+ int ret;
+
+ current_evmcs->hv_clean_fields = 0;
+
+ __asm__ __volatile__("push %%rbp;"
+ "push %%rcx;"
+ "push %%rdx;"
+ "push %%rsi;"
+ "push %%rdi;"
+ "push $0;"
+ "mov %%rsp, (%[host_rsp]);"
+ "lea 1f(%%rip), %%rax;"
+ "mov %%rax, (%[host_rip]);"
+ "vmlaunch;"
+ "incq (%%rsp);"
+ "1: pop %%rax;"
+ "pop %%rdi;"
+ "pop %%rsi;"
+ "pop %%rdx;"
+ "pop %%rcx;"
+ "pop %%rbp;"
+ : [ret]"=&a"(ret)
+ : [host_rsp]"r"
+ ((uint64_t)&current_evmcs->host_rsp),
+ [host_rip]"r"
+ ((uint64_t)&current_evmcs->host_rip)
+ : "memory", "cc", "rbx", "r8", "r9", "r10",
+ "r11", "r12", "r13", "r14", "r15");
+ return ret;
+}
+
+/*
+ * No guest state (e.g. GPRs) is established by this vmresume.
+ */
+static inline int evmcs_vmresume(void)
+{
+ int ret;
+
+ current_evmcs->hv_clean_fields = 0;
+
+ __asm__ __volatile__("push %%rbp;"
+ "push %%rcx;"
+ "push %%rdx;"
+ "push %%rsi;"
+ "push %%rdi;"
+ "push $0;"
+ "mov %%rsp, (%[host_rsp]);"
+ "lea 1f(%%rip), %%rax;"
+ "mov %%rax, (%[host_rip]);"
+ "vmresume;"
+ "incq (%%rsp);"
+ "1: pop %%rax;"
+ "pop %%rdi;"
+ "pop %%rsi;"
+ "pop %%rdx;"
+ "pop %%rcx;"
+ "pop %%rbp;"
+ : [ret]"=&a"(ret)
+ : [host_rsp]"r"
+ ((uint64_t)&current_evmcs->host_rsp),
+ [host_rip]"r"
+ ((uint64_t)&current_evmcs->host_rip)
+ : "memory", "cc", "rbx", "r8", "r9", "r10",
+ "r11", "r12", "r13", "r14", "r15");
+ return ret;
+}
+
+#endif /* !SELFTEST_KVM_EVMCS_H */
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
new file mode 100644
index 000000000..7d29aa786
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -0,0 +1,348 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * tools/testing/selftests/kvm/include/kvm_util.h
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+#ifndef SELFTEST_KVM_UTIL_H
+#define SELFTEST_KVM_UTIL_H
+
+#include "test_util.h"
+
+#include "asm/kvm.h"
+#include "linux/list.h"
+#include "linux/kvm.h"
+#include <sys/ioctl.h>
+
+#include "sparsebit.h"
+
+
+/*
+ * Callers of kvm_util only have an incomplete/opaque description of the
+ * structure kvm_util is using to maintain the state of a VM.
+ */
+struct kvm_vm;
+
+typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */
+typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */
+
+/* Minimum allocated guest virtual and physical addresses */
+#define KVM_UTIL_MIN_VADDR 0x2000
+
+#define DEFAULT_GUEST_PHY_PAGES 512
+#define DEFAULT_GUEST_STACK_VADDR_MIN 0xab6000
+#define DEFAULT_STACK_PGS 5
+
+enum vm_guest_mode {
+ VM_MODE_P52V48_4K,
+ VM_MODE_P52V48_64K,
+ VM_MODE_P48V48_4K,
+ VM_MODE_P48V48_64K,
+ VM_MODE_P40V48_4K,
+ VM_MODE_P40V48_64K,
+	VM_MODE_PXXV48_4K,	/* For 48-bit VA, any supported PA width */
+ NUM_VM_MODES,
+};
+
+#if defined(__aarch64__)
+#define VM_MODE_DEFAULT VM_MODE_P40V48_4K
+#elif defined(__x86_64__)
+#define VM_MODE_DEFAULT VM_MODE_PXXV48_4K
+#else
+#define VM_MODE_DEFAULT VM_MODE_P52V48_4K
+#endif
+
+#define vm_guest_mode_string(m) vm_guest_mode_string[m]
+extern const char * const vm_guest_mode_string[];
+
+enum vm_mem_backing_src_type {
+ VM_MEM_SRC_ANONYMOUS,
+ VM_MEM_SRC_ANONYMOUS_THP,
+ VM_MEM_SRC_ANONYMOUS_HUGETLB,
+};
+
+int kvm_check_cap(long cap);
+int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap);
+int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id,
+ struct kvm_enable_cap *cap);
+void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size);
+
+struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm);
+void kvm_vm_free(struct kvm_vm *vmp);
+void kvm_vm_restart(struct kvm_vm *vmp, int perm);
+void kvm_vm_release(struct kvm_vm *vmp);
+void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log);
+void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log,
+ uint64_t first_page, uint32_t num_pages);
+
+int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva,
+ size_t len);
+
+void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename,
+ uint32_t data_memslot, uint32_t pgd_memslot);
+
+void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent);
+
+/*
+ * VM VCPU Dump
+ *
+ * Input Args:
+ * stream - Output FILE stream
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ * indent - Left margin indent amount
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Dumps the current state of the VCPU specified by @vcpuid, within the VM
+ * given by @vm, to the FILE stream given by @stream.
+ */
+void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid,
+ uint8_t indent);
+
+void vm_create_irqchip(struct kvm_vm *vm);
+
+void vm_userspace_mem_region_add(struct kvm_vm *vm,
+ enum vm_mem_backing_src_type src_type,
+ uint64_t guest_paddr, uint32_t slot, uint64_t npages,
+ uint32_t flags);
+
+void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl,
+ void *arg);
+int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl,
+ void *arg);
+void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg);
+void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags);
+void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa);
+void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot);
+void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid);
+vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
+ uint32_t data_memslot, uint32_t pgd_memslot);
+void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
+ unsigned int npages, uint32_t pgd_memslot);
+void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa);
+void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva);
+vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva);
+
+/*
+ * Address Guest Virtual to Guest Physical
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * gva - VM virtual address
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Equivalent VM physical address
+ *
+ * Returns the VM physical address of the translated VM virtual
+ * address given by @gva.
+ */
+vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva);
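A caller usually chains these translations to touch guest memory from the host. The sketch below is illustrative only and assumes vm and gva were set up elsewhere (e.g. via vm_create_default() and vm_vaddr_alloc()):

	vm_paddr_t gpa = addr_gva2gpa(vm, gva);
	uint64_t *hva = addr_gpa2hva(vm, gpa);	/* equivalent to addr_gva2hva(vm, gva) */

	*hva = 0;				/* host writes directly into guest memory */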
+
+struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid);
+void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid);
+int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid);
+void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid);
+void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_guest_debug *debug);
+void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_mp_state *mp_state);
+struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid);
+void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs);
+void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs);
+
+/*
+ * VM VCPU Args Set
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ * num - number of arguments
+ * ... - arguments, each of type uint64_t
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Sets the first @num function input registers of the VCPU with @vcpuid,
+ * per the C calling convention of the architecture, to the values given
+ * as variable args. Each of the variable args is expected to be of type
+ * uint64_t. The maximum supported @num is architecture-specific.
+ */
+void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...);
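A minimal sketch of typical usage on an x86 guest; VCPU_ID and guest_main() are illustrative names, not part of this patch:

#define VCPU_ID 0

static void guest_main(uint64_t arg0, uint64_t arg1)
{
	GUEST_ASSERT(arg0 == 1 && arg1 == 2);
	GUEST_DONE();
}

	/* host side, after the vCPU has been added: */
	vcpu_args_set(vm, VCPU_ID, 2, (uint64_t)1, (uint64_t)2);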
+
+void vcpu_sregs_get(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_sregs *sregs);
+void vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_sregs *sregs);
+int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_sregs *sregs);
+void vcpu_fpu_get(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_fpu *fpu);
+void vcpu_fpu_set(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_fpu *fpu);
+void vcpu_get_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg);
+void vcpu_set_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg);
+#ifdef __KVM_HAVE_VCPU_EVENTS
+void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_vcpu_events *events);
+void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_vcpu_events *events);
+#endif
+#ifdef __x86_64__
+void vcpu_nested_state_get(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_nested_state *state);
+int vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_nested_state *state, bool ignore_error);
+#endif
+
+const char *exit_reason_str(unsigned int exit_reason);
+
+void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot);
+
+/*
+ * VM Virtual Page Map
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vaddr - VM Virtual Address
+ * paddr - VM Physical Address
+ * memslot - Memory region slot for new virtual translation tables
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Within @vm, creates a virtual translation for the page starting
+ * at @vaddr to the page starting at @paddr.
+ */
+void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
+ uint32_t memslot);
+
+vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min,
+ uint32_t memslot);
+vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num,
+ vm_paddr_t paddr_min, uint32_t memslot);
+
+/*
+ * Create a VM with reasonable defaults
+ *
+ * Input Args:
+ * vcpuid - The id of the single VCPU to add to the VM.
+ * extra_mem_pages - The number of extra pages to add (this will
+ * decide how much extra space we will need to
+ *                     set up the page tables using memslot 0)
+ * guest_code - The vCPU's entry point
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Pointer to opaque structure that describes the created VM.
+ */
+struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
+ void *guest_code);
+
+/*
+ * Adds a vCPU with reasonable defaults (e.g. a stack)
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - The id of the VCPU to add to the VM.
+ * guest_code - The vCPU's entry point
+ */
+void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code);
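Put together, the usual setup is roughly the following sketch (the vCPU IDs and guest_code() are assumptions, not part of this patch):

	struct kvm_vm *vm;

	vm = vm_create_default(0, 0, guest_code);	/* vCPU 0, no extra pages */
	vm_vcpu_add_default(vm, 1, guest_code);		/* optional second vCPU */

	vcpu_run(vm, 0);
	kvm_vm_free(vm);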
+
+bool vm_is_unrestricted_guest(struct kvm_vm *vm);
+
+unsigned int vm_get_page_size(struct kvm_vm *vm);
+unsigned int vm_get_page_shift(struct kvm_vm *vm);
+unsigned int vm_get_max_gfn(struct kvm_vm *vm);
+int vm_get_fd(struct kvm_vm *vm);
+
+unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size);
+unsigned int vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages);
+unsigned int vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages);
+static inline unsigned int
+vm_adjust_num_guest_pages(enum vm_guest_mode mode, unsigned int num_guest_pages)
+{
+ unsigned int n;
+ n = vm_num_guest_pages(mode, vm_num_host_pages(mode, num_guest_pages));
+#ifdef __s390x__
+ /* s390 requires 1M aligned guest sizes */
+ n = (n + 255) & ~255;
+#endif
+ return n;
+}
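As a worked example of the s390x rounding above (the page count is arbitrary): with 4K pages, 256 pages equal 1M, so a request for 300 guest pages comes back rounded up to 512.

	/* (300 + 255) & ~255 == 512 on s390x; unchanged on other archs */
	unsigned int n = vm_adjust_num_guest_pages(VM_MODE_DEFAULT, 300);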
+
+struct kvm_userspace_memory_region *
+kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
+ uint64_t end);
+
+struct kvm_dirty_log *
+allocate_kvm_dirty_log(struct kvm_userspace_memory_region *region);
+
+int vm_create_device(struct kvm_vm *vm, struct kvm_create_device *cd);
+
+#define sync_global_to_guest(vm, g) ({ \
+ typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \
+ memcpy(_p, &(g), sizeof(g)); \
+})
+
+#define sync_global_from_guest(vm, g) ({ \
+ typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \
+ memcpy(&(g), _p, sizeof(g)); \
+})
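These macros only work for globals that live in the test binary loaded into the guest (via kvm_vm_elf_load()). A typical use looks like the sketch below, where "iterations" is a made-up test global:

static uint64_t iterations;

	iterations = 100;
	sync_global_to_guest(vm, iterations);	/* guest now reads 100 */
	/* ... run the vCPU; the guest may update the global ... */
	sync_global_from_guest(vm, iterations);	/* pull the guest's value back */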
+
+void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid);
+
+/* Common ucalls */
+enum {
+ UCALL_NONE,
+ UCALL_SYNC,
+ UCALL_ABORT,
+ UCALL_DONE,
+};
+
+#define UCALL_MAX_ARGS 6
+
+struct ucall {
+ uint64_t cmd;
+ uint64_t args[UCALL_MAX_ARGS];
+};
+
+void ucall_init(struct kvm_vm *vm, void *arg);
+void ucall_uninit(struct kvm_vm *vm);
+void ucall(uint64_t cmd, int nargs, ...);
+uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc);
+
+#define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4) \
+ ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4)
+#define GUEST_SYNC(stage) ucall(UCALL_SYNC, 2, "hello", stage)
+#define GUEST_DONE() ucall(UCALL_DONE, 0)
+#define __GUEST_ASSERT(_condition, _nargs, _args...) do { \
+ if (!(_condition)) \
+ ucall(UCALL_ABORT, 2 + _nargs, \
+ "Failed guest assert: " \
+ #_condition, __LINE__, _args); \
+} while (0)
+
+#define GUEST_ASSERT(_condition) \
+ __GUEST_ASSERT((_condition), 0, 0)
+
+#define GUEST_ASSERT_1(_condition, arg1) \
+ __GUEST_ASSERT((_condition), 1, (arg1))
+
+#define GUEST_ASSERT_2(_condition, arg1, arg2) \
+ __GUEST_ASSERT((_condition), 2, (arg1), (arg2))
+
+#define GUEST_ASSERT_3(_condition, arg1, arg2, arg3) \
+ __GUEST_ASSERT((_condition), 3, (arg1), (arg2), (arg3))
+
+#define GUEST_ASSERT_4(_condition, arg1, arg2, arg3, arg4) \
+ __GUEST_ASSERT((_condition), 4, (arg1), (arg2), (arg3), (arg4))
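The host side of this protocol is typically a run loop like the excerpt below (VCPU_ID and the stage handling are purely illustrative). Note that the stage passed to GUEST_SYNC() lands in uc.args[1], because uc.args[0] carries the "hello" marker:

	struct ucall uc;

	for (;;) {
		vcpu_run(vm, VCPU_ID);
		switch (get_ucall(vm, VCPU_ID, &uc)) {
		case UCALL_SYNC:
			pr_info("guest reached stage %ld\n", uc.args[1]);
			break;
		case UCALL_ABORT:
			TEST_FAIL("%s at line %ld", (const char *)uc.args[0],
				  uc.args[1]);
			/* not reached */
		case UCALL_DONE:
			return;
		}
	}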
+
+#endif /* SELFTEST_KVM_UTIL_H */
diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h
new file mode 100644
index 000000000..261805205
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/perf_test_util.h
@@ -0,0 +1,198 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * tools/testing/selftests/kvm/include/perf_test_util.h
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#ifndef SELFTEST_KVM_PERF_TEST_UTIL_H
+#define SELFTEST_KVM_PERF_TEST_UTIL_H
+
+#include "kvm_util.h"
+#include "processor.h"
+
+#define MAX_VCPUS 512
+
+#define PAGE_SHIFT_4K 12
+#define PTES_PER_4K_PT 512
+
+#define TEST_MEM_SLOT_INDEX 1
+
+/* Default guest test virtual memory offset */
+#define DEFAULT_GUEST_TEST_MEM 0xc0000000
+
+#define DEFAULT_PER_VCPU_MEM_SIZE (1 << 30) /* 1G */
+
+/*
+ * Guest physical memory offset of the testing memory slot.
+ * This will be set to the topmost valid physical address minus
+ * the test memory size.
+ */
+static uint64_t guest_test_phys_mem;
+
+/*
+ * Guest virtual memory offset of the testing memory slot.
+ * Must not conflict with identity mapped test code.
+ */
+static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM;
+static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
+
+/* Number of VCPUs for the test */
+static int nr_vcpus = 1;
+
+struct vcpu_args {
+ uint64_t gva;
+ uint64_t pages;
+
+ /* Only used by the host userspace part of the vCPU thread */
+ int vcpu_id;
+};
+
+struct perf_test_args {
+ struct kvm_vm *vm;
+ uint64_t host_page_size;
+ uint64_t guest_page_size;
+ int wr_fract;
+
+ struct vcpu_args vcpu_args[MAX_VCPUS];
+};
+
+static struct perf_test_args perf_test_args;
+
+/*
+ * Continuously write to the first 8 bytes of each page in the
+ * specified region.
+ */
+static void guest_code(uint32_t vcpu_id)
+{
+ struct vcpu_args *vcpu_args = &perf_test_args.vcpu_args[vcpu_id];
+ uint64_t gva;
+ uint64_t pages;
+ int i;
+
+ /* Make sure vCPU args data structure is not corrupt. */
+ GUEST_ASSERT(vcpu_args->vcpu_id == vcpu_id);
+
+ gva = vcpu_args->gva;
+ pages = vcpu_args->pages;
+
+ while (true) {
+ for (i = 0; i < pages; i++) {
+ uint64_t addr = gva + (i * perf_test_args.guest_page_size);
+
+ if (i % perf_test_args.wr_fract == 0)
+ *(uint64_t *)addr = 0x0123456789ABCDEF;
+ else
+ READ_ONCE(*(uint64_t *)addr);
+ }
+
+ GUEST_SYNC(1);
+ }
+}
+
+static struct kvm_vm *create_vm(enum vm_guest_mode mode, int vcpus,
+ uint64_t vcpu_memory_bytes)
+{
+ struct kvm_vm *vm;
+ uint64_t pages = DEFAULT_GUEST_PHY_PAGES;
+ uint64_t guest_num_pages;
+
+ /* Account for a few pages per-vCPU for stacks */
+ pages += DEFAULT_STACK_PGS * vcpus;
+
+ /*
+	 * Reserve twice the amount of memory needed to map the test region and
+	 * the page table / stacks region, at 4k, for page tables. Do the
+	 * calculation with the 4K page size, the smallest of all archs (e.g. a
+	 * 64K page size guest will need even less memory for page tables).
+ */
+ pages += (2 * pages) / PTES_PER_4K_PT;
+ pages += ((2 * vcpus * vcpu_memory_bytes) >> PAGE_SHIFT_4K) /
+ PTES_PER_4K_PT;
+ pages = vm_adjust_num_guest_pages(mode, pages);
+
+ pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
+
+ vm = vm_create(mode, pages, O_RDWR);
+ kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
+#ifdef __x86_64__
+ vm_create_irqchip(vm);
+#endif
+
+ perf_test_args.vm = vm;
+ perf_test_args.guest_page_size = vm_get_page_size(vm);
+ perf_test_args.host_page_size = getpagesize();
+
+ TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0,
+ "Guest memory size is not guest page size aligned.");
+
+ guest_num_pages = (vcpus * vcpu_memory_bytes) /
+ perf_test_args.guest_page_size;
+ guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages);
+
+ /*
+	 * If the guest test region needs more memory than the guest address
+	 * space can hold, it will definitely cause problems.
+ */
+ TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm),
+ "Requested more guest memory than address space allows.\n"
+		    "    guest pages: %lx max gfn: %x vcpus: %d wss: %lx\n",
+ guest_num_pages, vm_get_max_gfn(vm), vcpus,
+ vcpu_memory_bytes);
+
+ TEST_ASSERT(vcpu_memory_bytes % perf_test_args.host_page_size == 0,
+ "Guest memory size is not host page size aligned.");
+
+ guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) *
+ perf_test_args.guest_page_size;
+ guest_test_phys_mem &= ~(perf_test_args.host_page_size - 1);
+
+#ifdef __s390x__
+ /* Align to 1M (segment size) */
+ guest_test_phys_mem &= ~((1 << 20) - 1);
+#endif
+
+ pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem);
+
+ /* Add an extra memory slot for testing */
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ guest_test_phys_mem,
+ TEST_MEM_SLOT_INDEX,
+ guest_num_pages, 0);
+
+ /* Do mapping for the demand paging memory slot */
+ virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0);
+
+ ucall_init(vm, NULL);
+
+ return vm;
+}
+
+static void add_vcpus(struct kvm_vm *vm, int vcpus, uint64_t vcpu_memory_bytes)
+{
+ vm_paddr_t vcpu_gpa;
+ struct vcpu_args *vcpu_args;
+ int vcpu_id;
+
+ for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) {
+ vcpu_args = &perf_test_args.vcpu_args[vcpu_id];
+
+ vm_vcpu_add_default(vm, vcpu_id, guest_code);
+
+#ifdef __x86_64__
+ vcpu_set_cpuid(vm, vcpu_id, kvm_get_supported_cpuid());
+#endif
+
+ vcpu_args->vcpu_id = vcpu_id;
+ vcpu_args->gva = guest_test_virt_mem +
+ (vcpu_id * vcpu_memory_bytes);
+ vcpu_args->pages = vcpu_memory_bytes /
+ perf_test_args.guest_page_size;
+
+ vcpu_gpa = guest_test_phys_mem + (vcpu_id * vcpu_memory_bytes);
+ pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n",
+ vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_memory_bytes);
+ }
+}
+
+#endif /* SELFTEST_KVM_PERF_TEST_UTIL_H */
diff --git a/tools/testing/selftests/kvm/include/s390x/processor.h b/tools/testing/selftests/kvm/include/s390x/processor.h
new file mode 100644
index 000000000..e0e96a5f6
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/s390x/processor.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * s390x processor specific defines
+ */
+#ifndef SELFTEST_KVM_PROCESSOR_H
+#define SELFTEST_KVM_PROCESSOR_H
+
+/* Bits in the region/segment table entry */
+#define REGION_ENTRY_ORIGIN ~0xfffUL /* region/segment table origin */
+#define REGION_ENTRY_PROTECT 0x200 /* region protection bit */
+#define REGION_ENTRY_NOEXEC 0x100 /* region no-execute bit */
+#define REGION_ENTRY_OFFSET 0xc0 /* region table offset */
+#define REGION_ENTRY_INVALID 0x20 /* invalid region table entry */
+#define REGION_ENTRY_TYPE 0x0c /* region/segment table type mask */
+#define REGION_ENTRY_LENGTH 0x03 /* region third length */
+
+/* Bits in the page table entry */
+#define PAGE_INVALID 0x400 /* HW invalid bit */
+#define PAGE_PROTECT 0x200 /* HW read-only bit */
+#define PAGE_NOEXEC 0x100 /* HW no-execute bit */
+
+#endif
diff --git a/tools/testing/selftests/kvm/include/sparsebit.h b/tools/testing/selftests/kvm/include/sparsebit.h
new file mode 100644
index 000000000..12a9a4b9c
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/sparsebit.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * tools/testing/selftests/kvm/include/sparsebit.h
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * Header file that describes API to the sparsebit library.
+ * This library provides a memory efficient means of storing
+ * the settings of bits indexed via a uint64_t. Memory usage
+ * is reasonable, significantly less than (2^64 / 8) bytes, as
+ * long as bits that are mostly set or mostly cleared are close
+ * to each other. This library is efficient in memory usage
+ * even in the case where most bits are set.
+ */
+
+#ifndef SELFTEST_KVM_SPARSEBIT_H
+#define SELFTEST_KVM_SPARSEBIT_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct sparsebit;
+typedef uint64_t sparsebit_idx_t;
+typedef uint64_t sparsebit_num_t;
+
+struct sparsebit *sparsebit_alloc(void);
+void sparsebit_free(struct sparsebit **sbitp);
+void sparsebit_copy(struct sparsebit *dstp, struct sparsebit *src);
+
+bool sparsebit_is_set(struct sparsebit *sbit, sparsebit_idx_t idx);
+bool sparsebit_is_set_num(struct sparsebit *sbit,
+ sparsebit_idx_t idx, sparsebit_num_t num);
+bool sparsebit_is_clear(struct sparsebit *sbit, sparsebit_idx_t idx);
+bool sparsebit_is_clear_num(struct sparsebit *sbit,
+ sparsebit_idx_t idx, sparsebit_num_t num);
+sparsebit_num_t sparsebit_num_set(struct sparsebit *sbit);
+bool sparsebit_any_set(struct sparsebit *sbit);
+bool sparsebit_any_clear(struct sparsebit *sbit);
+bool sparsebit_all_set(struct sparsebit *sbit);
+bool sparsebit_all_clear(struct sparsebit *sbit);
+sparsebit_idx_t sparsebit_first_set(struct sparsebit *sbit);
+sparsebit_idx_t sparsebit_first_clear(struct sparsebit *sbit);
+sparsebit_idx_t sparsebit_next_set(struct sparsebit *sbit, sparsebit_idx_t prev);
+sparsebit_idx_t sparsebit_next_clear(struct sparsebit *sbit, sparsebit_idx_t prev);
+sparsebit_idx_t sparsebit_next_set_num(struct sparsebit *sbit,
+ sparsebit_idx_t start, sparsebit_num_t num);
+sparsebit_idx_t sparsebit_next_clear_num(struct sparsebit *sbit,
+ sparsebit_idx_t start, sparsebit_num_t num);
+
+void sparsebit_set(struct sparsebit *sbitp, sparsebit_idx_t idx);
+void sparsebit_set_num(struct sparsebit *sbitp, sparsebit_idx_t start,
+ sparsebit_num_t num);
+void sparsebit_set_all(struct sparsebit *sbitp);
+
+void sparsebit_clear(struct sparsebit *sbitp, sparsebit_idx_t idx);
+void sparsebit_clear_num(struct sparsebit *sbitp,
+ sparsebit_idx_t start, sparsebit_num_t num);
+void sparsebit_clear_all(struct sparsebit *sbitp);
+
+void sparsebit_dump(FILE *stream, struct sparsebit *sbit,
+ unsigned int indent);
+void sparsebit_validate_internal(struct sparsebit *sbit);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SELFTEST_KVM_SPARSEBIT_H */
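For reference, typical standalone usage of the API above (the indices are arbitrary):

	struct sparsebit *sb = sparsebit_alloc();

	sparsebit_set_num(sb, 0x1000, 16);	/* set 16 consecutive bits */
	if (sparsebit_is_set(sb, 0x100f))
		sparsebit_clear(sb, 0x100f);
	sparsebit_free(&sb);			/* frees and NULLs the pointer */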
diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
new file mode 100644
index 000000000..ffffa5604
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * tools/testing/selftests/kvm/include/test_util.h
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+
+#ifndef SELFTEST_KVM_TEST_UTIL_H
+#define SELFTEST_KVM_TEST_UTIL_H
+
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include "kselftest.h"
+
+static inline int _no_printf(const char *format, ...) { return 0; }
+
+#ifdef DEBUG
+#define pr_debug(...) printf(__VA_ARGS__)
+#else
+#define pr_debug(...) _no_printf(__VA_ARGS__)
+#endif
+#ifndef QUIET
+#define pr_info(...) printf(__VA_ARGS__)
+#else
+#define pr_info(...) _no_printf(__VA_ARGS__)
+#endif
+
+void print_skip(const char *fmt, ...) __attribute__((format(printf, 1, 2)));
+
+ssize_t test_write(int fd, const void *buf, size_t count);
+ssize_t test_read(int fd, void *buf, size_t count);
+int test_seq_read(const char *path, char **bufp, size_t *sizep);
+
+void test_assert(bool exp, const char *exp_str,
+ const char *file, unsigned int line, const char *fmt, ...)
+ __attribute__((format(printf, 5, 6)));
+
+#define TEST_ASSERT(e, fmt, ...) \
+ test_assert((e), #e, __FILE__, __LINE__, fmt, ##__VA_ARGS__)
+
+#define ASSERT_EQ(a, b) do { \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ TEST_ASSERT(__a == __b, \
+ "ASSERT_EQ(%s, %s) failed.\n" \
+ "\t%s is %#lx\n" \
+ "\t%s is %#lx", \
+ #a, #b, #a, (unsigned long) __a, #b, (unsigned long) __b); \
+} while (0)
+
+#define TEST_FAIL(fmt, ...) \
+ TEST_ASSERT(false, fmt, ##__VA_ARGS__)
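For reference, a couple of host-side uses; the page-size check assumes a 4K-page VM:

	int ret = kvm_check_cap(KVM_CAP_USER_MEMORY);

	TEST_ASSERT(ret, "KVM_CAP_USER_MEMORY not available, ret: %d", ret);
	ASSERT_EQ(vm_get_page_size(vm), 0x1000);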
+
+size_t parse_size(const char *size);
+
+int64_t timespec_to_ns(struct timespec ts);
+struct timespec timespec_add_ns(struct timespec ts, int64_t ns);
+struct timespec timespec_add(struct timespec ts1, struct timespec ts2);
+struct timespec timespec_sub(struct timespec ts1, struct timespec ts2);
+struct timespec timespec_diff_now(struct timespec start);
+struct timespec timespec_div(struct timespec ts, int divisor);
+
+#endif /* SELFTEST_KVM_TEST_UTIL_H */
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
new file mode 100644
index 000000000..8e61340b3
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -0,0 +1,422 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * tools/testing/selftests/kvm/include/x86_64/processor.h
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+
+#ifndef SELFTEST_KVM_PROCESSOR_H
+#define SELFTEST_KVM_PROCESSOR_H
+
+#include <assert.h>
+#include <stdint.h>
+
+#include <asm/msr-index.h>
+
+#define X86_EFLAGS_FIXED (1u << 1)
+
+#define X86_CR4_VME (1ul << 0)
+#define X86_CR4_PVI (1ul << 1)
+#define X86_CR4_TSD (1ul << 2)
+#define X86_CR4_DE (1ul << 3)
+#define X86_CR4_PSE (1ul << 4)
+#define X86_CR4_PAE (1ul << 5)
+#define X86_CR4_MCE (1ul << 6)
+#define X86_CR4_PGE (1ul << 7)
+#define X86_CR4_PCE (1ul << 8)
+#define X86_CR4_OSFXSR (1ul << 9)
+#define X86_CR4_OSXMMEXCPT (1ul << 10)
+#define X86_CR4_UMIP (1ul << 11)
+#define X86_CR4_VMXE (1ul << 13)
+#define X86_CR4_SMXE (1ul << 14)
+#define X86_CR4_FSGSBASE (1ul << 16)
+#define X86_CR4_PCIDE (1ul << 17)
+#define X86_CR4_OSXSAVE (1ul << 18)
+#define X86_CR4_SMEP (1ul << 20)
+#define X86_CR4_SMAP (1ul << 21)
+#define X86_CR4_PKE (1ul << 22)
+
+#define UNEXPECTED_VECTOR_PORT 0xfff0u
+
+/* General Registers in 64-Bit Mode */
+struct gpr64_regs {
+ u64 rax;
+ u64 rcx;
+ u64 rdx;
+ u64 rbx;
+ u64 rsp;
+ u64 rbp;
+ u64 rsi;
+ u64 rdi;
+ u64 r8;
+ u64 r9;
+ u64 r10;
+ u64 r11;
+ u64 r12;
+ u64 r13;
+ u64 r14;
+ u64 r15;
+};
+
+struct desc64 {
+ uint16_t limit0;
+ uint16_t base0;
+ unsigned base1:8, type:4, s:1, dpl:2, p:1;
+ unsigned limit1:4, avl:1, l:1, db:1, g:1, base2:8;
+ uint32_t base3;
+ uint32_t zero1;
+} __attribute__((packed));
+
+struct desc_ptr {
+ uint16_t size;
+ uint64_t address;
+} __attribute__((packed));
+
+static inline uint64_t get_desc64_base(const struct desc64 *desc)
+{
+ return ((uint64_t)desc->base3 << 32) |
+ (desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
+}
+
+static inline uint64_t rdtsc(void)
+{
+ uint32_t eax, edx;
+ uint64_t tsc_val;
+ /*
+ * The lfence is to wait (on Intel CPUs) until all previous
+ * instructions have been executed. If software requires RDTSC to be
+ * executed prior to execution of any subsequent instruction, it can
+	 * execute LFENCE immediately after RDTSC.
+ */
+ __asm__ __volatile__("lfence; rdtsc; lfence" : "=a"(eax), "=d"(edx));
+ tsc_val = ((uint64_t)edx) << 32 | eax;
+ return tsc_val;
+}
+
+static inline uint64_t rdtscp(uint32_t *aux)
+{
+ uint32_t eax, edx;
+
+ __asm__ __volatile__("rdtscp" : "=a"(eax), "=d"(edx), "=c"(*aux));
+ return ((uint64_t)edx) << 32 | eax;
+}
+
+static inline uint64_t rdmsr(uint32_t msr)
+{
+ uint32_t a, d;
+
+ __asm__ __volatile__("rdmsr" : "=a"(a), "=d"(d) : "c"(msr) : "memory");
+
+ return a | ((uint64_t) d << 32);
+}
+
+static inline void wrmsr(uint32_t msr, uint64_t value)
+{
+ uint32_t a = value;
+ uint32_t d = value >> 32;
+
+ __asm__ __volatile__("wrmsr" :: "a"(a), "d"(d), "c"(msr) : "memory");
+}
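These are guest-side helpers. A sketch of guest code using them, assuming a 64-bit guest and that the test also includes kvm_util.h for GUEST_ASSERT():

	uint64_t efer = rdmsr(MSR_EFER);

	GUEST_ASSERT(efer & EFER_LMA);	/* long mode active in a 64-bit guest */
	wrmsr(MSR_IA32_TSC, 0);		/* e.g. rewind the guest TSC */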
+
+
+static inline uint16_t inw(uint16_t port)
+{
+ uint16_t tmp;
+
+ __asm__ __volatile__("in %%dx, %%ax"
+ : /* output */ "=a" (tmp)
+ : /* input */ "d" (port));
+
+ return tmp;
+}
+
+static inline uint16_t get_es(void)
+{
+ uint16_t es;
+
+ __asm__ __volatile__("mov %%es, %[es]"
+ : /* output */ [es]"=rm"(es));
+ return es;
+}
+
+static inline uint16_t get_cs(void)
+{
+ uint16_t cs;
+
+ __asm__ __volatile__("mov %%cs, %[cs]"
+ : /* output */ [cs]"=rm"(cs));
+ return cs;
+}
+
+static inline uint16_t get_ss(void)
+{
+ uint16_t ss;
+
+ __asm__ __volatile__("mov %%ss, %[ss]"
+ : /* output */ [ss]"=rm"(ss));
+ return ss;
+}
+
+static inline uint16_t get_ds(void)
+{
+ uint16_t ds;
+
+ __asm__ __volatile__("mov %%ds, %[ds]"
+ : /* output */ [ds]"=rm"(ds));
+ return ds;
+}
+
+static inline uint16_t get_fs(void)
+{
+ uint16_t fs;
+
+ __asm__ __volatile__("mov %%fs, %[fs]"
+ : /* output */ [fs]"=rm"(fs));
+ return fs;
+}
+
+static inline uint16_t get_gs(void)
+{
+ uint16_t gs;
+
+ __asm__ __volatile__("mov %%gs, %[gs]"
+ : /* output */ [gs]"=rm"(gs));
+ return gs;
+}
+
+static inline uint16_t get_tr(void)
+{
+ uint16_t tr;
+
+ __asm__ __volatile__("str %[tr]"
+ : /* output */ [tr]"=rm"(tr));
+ return tr;
+}
+
+static inline uint64_t get_cr0(void)
+{
+ uint64_t cr0;
+
+ __asm__ __volatile__("mov %%cr0, %[cr0]"
+ : /* output */ [cr0]"=r"(cr0));
+ return cr0;
+}
+
+static inline uint64_t get_cr3(void)
+{
+ uint64_t cr3;
+
+ __asm__ __volatile__("mov %%cr3, %[cr3]"
+ : /* output */ [cr3]"=r"(cr3));
+ return cr3;
+}
+
+static inline uint64_t get_cr4(void)
+{
+ uint64_t cr4;
+
+ __asm__ __volatile__("mov %%cr4, %[cr4]"
+ : /* output */ [cr4]"=r"(cr4));
+ return cr4;
+}
+
+static inline void set_cr4(uint64_t val)
+{
+ __asm__ __volatile__("mov %0, %%cr4" : : "r" (val) : "memory");
+}
+
+static inline struct desc_ptr get_gdt(void)
+{
+ struct desc_ptr gdt;
+ __asm__ __volatile__("sgdt %[gdt]"
+ : /* output */ [gdt]"=m"(gdt));
+ return gdt;
+}
+
+static inline struct desc_ptr get_idt(void)
+{
+ struct desc_ptr idt;
+ __asm__ __volatile__("sidt %[idt]"
+ : /* output */ [idt]"=m"(idt));
+ return idt;
+}
+
+static inline void outl(uint16_t port, uint32_t value)
+{
+ __asm__ __volatile__("outl %%eax, %%dx" : : "d"(port), "a"(value));
+}
+
+#define SET_XMM(__var, __xmm) \
+ asm volatile("movq %0, %%"#__xmm : : "r"(__var) : #__xmm)
+
+static inline void set_xmm(int n, unsigned long val)
+{
+ switch (n) {
+ case 0:
+ SET_XMM(val, xmm0);
+ break;
+ case 1:
+ SET_XMM(val, xmm1);
+ break;
+ case 2:
+ SET_XMM(val, xmm2);
+ break;
+ case 3:
+ SET_XMM(val, xmm3);
+ break;
+ case 4:
+ SET_XMM(val, xmm4);
+ break;
+ case 5:
+ SET_XMM(val, xmm5);
+ break;
+ case 6:
+ SET_XMM(val, xmm6);
+ break;
+ case 7:
+ SET_XMM(val, xmm7);
+ break;
+ }
+}
+
+typedef unsigned long v1di __attribute__ ((vector_size (8)));
+static inline unsigned long get_xmm(int n)
+{
+ assert(n >= 0 && n <= 7);
+
+ register v1di xmm0 __asm__("%xmm0");
+ register v1di xmm1 __asm__("%xmm1");
+ register v1di xmm2 __asm__("%xmm2");
+ register v1di xmm3 __asm__("%xmm3");
+ register v1di xmm4 __asm__("%xmm4");
+ register v1di xmm5 __asm__("%xmm5");
+ register v1di xmm6 __asm__("%xmm6");
+ register v1di xmm7 __asm__("%xmm7");
+ switch (n) {
+ case 0:
+ return (unsigned long)xmm0;
+ case 1:
+ return (unsigned long)xmm1;
+ case 2:
+ return (unsigned long)xmm2;
+ case 3:
+ return (unsigned long)xmm3;
+ case 4:
+ return (unsigned long)xmm4;
+ case 5:
+ return (unsigned long)xmm5;
+ case 6:
+ return (unsigned long)xmm6;
+ case 7:
+ return (unsigned long)xmm7;
+ }
+ return 0;
+}
+
+bool is_intel_cpu(void);
+
+struct kvm_x86_state;
+struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid);
+void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_x86_state *state);
+
+struct kvm_msr_list *kvm_get_msr_index_list(void);
+
+struct kvm_cpuid2 *kvm_get_supported_cpuid(void);
+void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_cpuid2 *cpuid);
+
+struct kvm_cpuid_entry2 *
+kvm_get_supported_cpuid_index(uint32_t function, uint32_t index);
+
+static inline struct kvm_cpuid_entry2 *
+kvm_get_supported_cpuid_entry(uint32_t function)
+{
+ return kvm_get_supported_cpuid_index(function, 0);
+}
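A common pattern is to skip a test when KVM does not expose a feature; for example (CPUID.1:ECX bit 5 is VMX):

	struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1);

	if (!(entry->ecx & (1u << 5))) {
		print_skip("Nested VMX not supported");
		exit(KSFT_SKIP);
	}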
+
+uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index);
+int _vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
+ uint64_t msr_value);
+void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
+ uint64_t msr_value);
+
+uint32_t kvm_get_cpuid_max_basic(void);
+uint32_t kvm_get_cpuid_max_extended(void);
+void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits);
+
+struct ex_regs {
+ uint64_t rax, rcx, rdx, rbx;
+ uint64_t rbp, rsi, rdi;
+ uint64_t r8, r9, r10, r11;
+ uint64_t r12, r13, r14, r15;
+ uint64_t vector;
+ uint64_t error_code;
+ uint64_t rip;
+ uint64_t cs;
+ uint64_t rflags;
+};
+
+void vm_init_descriptor_tables(struct kvm_vm *vm);
+void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid);
+void vm_handle_exception(struct kvm_vm *vm, int vector,
+ void (*handler)(struct ex_regs *));
+
+/*
+ * set_cpuid() - overwrites a matching cpuid entry with the provided value.
+ *		 Matches based on ent->function and ent->index. Returns true
+ *		 if a match was found and successfully overwritten.
+ * @cpuid: the kvm cpuid list to modify.
+ * @ent: cpuid entry to insert
+ */
+bool set_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *ent);
+
+uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
+ uint64_t a3);
+
+/*
+ * Basic CPU control in CR0
+ */
+#define X86_CR0_PE (1UL<<0) /* Protection Enable */
+#define X86_CR0_MP (1UL<<1) /* Monitor Coprocessor */
+#define X86_CR0_EM (1UL<<2) /* Emulation */
+#define X86_CR0_TS (1UL<<3) /* Task Switched */
+#define X86_CR0_ET (1UL<<4) /* Extension Type */
+#define X86_CR0_NE (1UL<<5) /* Numeric Error */
+#define X86_CR0_WP (1UL<<16) /* Write Protect */
+#define X86_CR0_AM (1UL<<18) /* Alignment Mask */
+#define X86_CR0_NW (1UL<<29) /* Not Write-through */
+#define X86_CR0_CD (1UL<<30) /* Cache Disable */
+#define X86_CR0_PG (1UL<<31) /* Paging */
+
+#define APIC_BASE_MSR 0x800
+#define X2APIC_ENABLE (1UL << 10)
+#define APIC_ICR 0x300
+#define APIC_DEST_SELF 0x40000
+#define APIC_DEST_ALLINC 0x80000
+#define APIC_DEST_ALLBUT 0xC0000
+#define APIC_ICR_RR_MASK 0x30000
+#define APIC_ICR_RR_INVALID 0x00000
+#define APIC_ICR_RR_INPROG 0x10000
+#define APIC_ICR_RR_VALID 0x20000
+#define APIC_INT_LEVELTRIG 0x08000
+#define APIC_INT_ASSERT 0x04000
+#define APIC_ICR_BUSY 0x01000
+#define APIC_DEST_LOGICAL 0x00800
+#define APIC_DEST_PHYSICAL 0x00000
+#define APIC_DM_FIXED 0x00000
+#define APIC_DM_FIXED_MASK 0x00700
+#define APIC_DM_LOWEST 0x00100
+#define APIC_DM_SMI 0x00200
+#define APIC_DM_REMRD 0x00300
+#define APIC_DM_NMI 0x00400
+#define APIC_DM_INIT 0x00500
+#define APIC_DM_STARTUP 0x00600
+#define APIC_DM_EXTINT 0x00700
+#define APIC_VECTOR_MASK 0x000FF
+#define APIC_ICR2 0x310
+
+/* VMX_EPT_VPID_CAP bits */
+#define VMX_EPT_VPID_CAP_AD_BITS (1ULL << 21)
+
+#endif /* SELFTEST_KVM_PROCESSOR_H */
diff --git a/tools/testing/selftests/kvm/include/x86_64/svm.h b/tools/testing/selftests/kvm/include/x86_64/svm.h
new file mode 100644
index 000000000..f4ea2355d
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/x86_64/svm.h
@@ -0,0 +1,297 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * tools/testing/selftests/kvm/include/x86_64/svm.h
+ * This is a copy of arch/x86/include/asm/svm.h
+ *
+ */
+
+#ifndef SELFTEST_KVM_SVM_H
+#define SELFTEST_KVM_SVM_H
+
+enum {
+ INTERCEPT_INTR,
+ INTERCEPT_NMI,
+ INTERCEPT_SMI,
+ INTERCEPT_INIT,
+ INTERCEPT_VINTR,
+ INTERCEPT_SELECTIVE_CR0,
+ INTERCEPT_STORE_IDTR,
+ INTERCEPT_STORE_GDTR,
+ INTERCEPT_STORE_LDTR,
+ INTERCEPT_STORE_TR,
+ INTERCEPT_LOAD_IDTR,
+ INTERCEPT_LOAD_GDTR,
+ INTERCEPT_LOAD_LDTR,
+ INTERCEPT_LOAD_TR,
+ INTERCEPT_RDTSC,
+ INTERCEPT_RDPMC,
+ INTERCEPT_PUSHF,
+ INTERCEPT_POPF,
+ INTERCEPT_CPUID,
+ INTERCEPT_RSM,
+ INTERCEPT_IRET,
+ INTERCEPT_INTn,
+ INTERCEPT_INVD,
+ INTERCEPT_PAUSE,
+ INTERCEPT_HLT,
+ INTERCEPT_INVLPG,
+ INTERCEPT_INVLPGA,
+ INTERCEPT_IOIO_PROT,
+ INTERCEPT_MSR_PROT,
+ INTERCEPT_TASK_SWITCH,
+ INTERCEPT_FERR_FREEZE,
+ INTERCEPT_SHUTDOWN,
+ INTERCEPT_VMRUN,
+ INTERCEPT_VMMCALL,
+ INTERCEPT_VMLOAD,
+ INTERCEPT_VMSAVE,
+ INTERCEPT_STGI,
+ INTERCEPT_CLGI,
+ INTERCEPT_SKINIT,
+ INTERCEPT_RDTSCP,
+ INTERCEPT_ICEBP,
+ INTERCEPT_WBINVD,
+ INTERCEPT_MONITOR,
+ INTERCEPT_MWAIT,
+ INTERCEPT_MWAIT_COND,
+ INTERCEPT_XSETBV,
+ INTERCEPT_RDPRU,
+};
+
+
+struct __attribute__ ((__packed__)) vmcb_control_area {
+ u32 intercept_cr;
+ u32 intercept_dr;
+ u32 intercept_exceptions;
+ u64 intercept;
+ u8 reserved_1[40];
+ u16 pause_filter_thresh;
+ u16 pause_filter_count;
+ u64 iopm_base_pa;
+ u64 msrpm_base_pa;
+ u64 tsc_offset;
+ u32 asid;
+ u8 tlb_ctl;
+ u8 reserved_2[3];
+ u32 int_ctl;
+ u32 int_vector;
+ u32 int_state;
+ u8 reserved_3[4];
+ u32 exit_code;
+ u32 exit_code_hi;
+ u64 exit_info_1;
+ u64 exit_info_2;
+ u32 exit_int_info;
+ u32 exit_int_info_err;
+ u64 nested_ctl;
+ u64 avic_vapic_bar;
+ u8 reserved_4[8];
+ u32 event_inj;
+ u32 event_inj_err;
+ u64 nested_cr3;
+ u64 virt_ext;
+ u32 clean;
+ u32 reserved_5;
+ u64 next_rip;
+ u8 insn_len;
+ u8 insn_bytes[15];
+ u64 avic_backing_page; /* Offset 0xe0 */
+ u8 reserved_6[8]; /* Offset 0xe8 */
+ u64 avic_logical_id; /* Offset 0xf0 */
+ u64 avic_physical_id; /* Offset 0xf8 */
+ u8 reserved_7[768];
+};
+
+
+#define TLB_CONTROL_DO_NOTHING 0
+#define TLB_CONTROL_FLUSH_ALL_ASID 1
+#define TLB_CONTROL_FLUSH_ASID 3
+#define TLB_CONTROL_FLUSH_ASID_LOCAL 7
+
+#define V_TPR_MASK 0x0f
+
+#define V_IRQ_SHIFT 8
+#define V_IRQ_MASK (1 << V_IRQ_SHIFT)
+
+#define V_GIF_SHIFT 9
+#define V_GIF_MASK (1 << V_GIF_SHIFT)
+
+#define V_INTR_PRIO_SHIFT 16
+#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
+
+#define V_IGN_TPR_SHIFT 20
+#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT)
+
+#define V_INTR_MASKING_SHIFT 24
+#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT)
+
+#define V_GIF_ENABLE_SHIFT 25
+#define V_GIF_ENABLE_MASK (1 << V_GIF_ENABLE_SHIFT)
+
+#define AVIC_ENABLE_SHIFT 31
+#define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT)
+
+#define LBR_CTL_ENABLE_MASK BIT_ULL(0)
+#define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1)
+
+#define SVM_INTERRUPT_SHADOW_MASK 1
+
+#define SVM_IOIO_STR_SHIFT 2
+#define SVM_IOIO_REP_SHIFT 3
+#define SVM_IOIO_SIZE_SHIFT 4
+#define SVM_IOIO_ASIZE_SHIFT 7
+
+#define SVM_IOIO_TYPE_MASK 1
+#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT)
+#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT)
+#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
+#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
+
+#define SVM_VM_CR_VALID_MASK 0x001fULL
+#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL
+#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL
+
+#define SVM_NESTED_CTL_NP_ENABLE BIT(0)
+#define SVM_NESTED_CTL_SEV_ENABLE BIT(1)
+
+struct __attribute__ ((__packed__)) vmcb_seg {
+ u16 selector;
+ u16 attrib;
+ u32 limit;
+ u64 base;
+};
+
+struct __attribute__ ((__packed__)) vmcb_save_area {
+ struct vmcb_seg es;
+ struct vmcb_seg cs;
+ struct vmcb_seg ss;
+ struct vmcb_seg ds;
+ struct vmcb_seg fs;
+ struct vmcb_seg gs;
+ struct vmcb_seg gdtr;
+ struct vmcb_seg ldtr;
+ struct vmcb_seg idtr;
+ struct vmcb_seg tr;
+ u8 reserved_1[43];
+ u8 cpl;
+ u8 reserved_2[4];
+ u64 efer;
+ u8 reserved_3[112];
+ u64 cr4;
+ u64 cr3;
+ u64 cr0;
+ u64 dr7;
+ u64 dr6;
+ u64 rflags;
+ u64 rip;
+ u8 reserved_4[88];
+ u64 rsp;
+ u8 reserved_5[24];
+ u64 rax;
+ u64 star;
+ u64 lstar;
+ u64 cstar;
+ u64 sfmask;
+ u64 kernel_gs_base;
+ u64 sysenter_cs;
+ u64 sysenter_esp;
+ u64 sysenter_eip;
+ u64 cr2;
+ u8 reserved_6[32];
+ u64 g_pat;
+ u64 dbgctl;
+ u64 br_from;
+ u64 br_to;
+ u64 last_excp_from;
+ u64 last_excp_to;
+};
+
+struct __attribute__ ((__packed__)) vmcb {
+ struct vmcb_control_area control;
+ struct vmcb_save_area save;
+};
+
+#define SVM_CPUID_FUNC 0x8000000a
+
+#define SVM_VM_CR_SVM_DISABLE 4
+
+#define SVM_SELECTOR_S_SHIFT 4
+#define SVM_SELECTOR_DPL_SHIFT 5
+#define SVM_SELECTOR_P_SHIFT 7
+#define SVM_SELECTOR_AVL_SHIFT 8
+#define SVM_SELECTOR_L_SHIFT 9
+#define SVM_SELECTOR_DB_SHIFT 10
+#define SVM_SELECTOR_G_SHIFT 11
+
+#define SVM_SELECTOR_TYPE_MASK (0xf)
+#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT)
+#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT)
+#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT)
+#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT)
+#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT)
+#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT)
+#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT)
+
+#define SVM_SELECTOR_WRITE_MASK (1 << 1)
+#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
+#define SVM_SELECTOR_CODE_MASK (1 << 3)
+
+#define INTERCEPT_CR0_READ 0
+#define INTERCEPT_CR3_READ 3
+#define INTERCEPT_CR4_READ 4
+#define INTERCEPT_CR8_READ 8
+#define INTERCEPT_CR0_WRITE (16 + 0)
+#define INTERCEPT_CR3_WRITE (16 + 3)
+#define INTERCEPT_CR4_WRITE (16 + 4)
+#define INTERCEPT_CR8_WRITE (16 + 8)
+
+#define INTERCEPT_DR0_READ 0
+#define INTERCEPT_DR1_READ 1
+#define INTERCEPT_DR2_READ 2
+#define INTERCEPT_DR3_READ 3
+#define INTERCEPT_DR4_READ 4
+#define INTERCEPT_DR5_READ 5
+#define INTERCEPT_DR6_READ 6
+#define INTERCEPT_DR7_READ 7
+#define INTERCEPT_DR0_WRITE (16 + 0)
+#define INTERCEPT_DR1_WRITE (16 + 1)
+#define INTERCEPT_DR2_WRITE (16 + 2)
+#define INTERCEPT_DR3_WRITE (16 + 3)
+#define INTERCEPT_DR4_WRITE (16 + 4)
+#define INTERCEPT_DR5_WRITE (16 + 5)
+#define INTERCEPT_DR6_WRITE (16 + 6)
+#define INTERCEPT_DR7_WRITE (16 + 7)
+
+#define SVM_EVTINJ_VEC_MASK 0xff
+
+#define SVM_EVTINJ_TYPE_SHIFT 8
+#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT)
+
+#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT)
+#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT)
+#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT)
+#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT)
+
+#define SVM_EVTINJ_VALID (1 << 31)
+#define SVM_EVTINJ_VALID_ERR (1 << 11)
+
+#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
+#define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK
+
+#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
+#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
+#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT
+#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT
+
+#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID
+#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR
+
+#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36
+#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
+#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
+
+#define SVM_EXITINFO_REG_MASK 0x0F
+
+#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)
+
+#endif /* SELFTEST_KVM_SVM_H */
diff --git a/tools/testing/selftests/kvm/include/x86_64/svm_util.h b/tools/testing/selftests/kvm/include/x86_64/svm_util.h
new file mode 100644
index 000000000..b7531c83b
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/x86_64/svm_util.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * tools/testing/selftests/kvm/include/x86_64/svm_util.h
+ * Header for nested SVM testing
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+
+#ifndef SELFTEST_KVM_SVM_UTILS_H
+#define SELFTEST_KVM_SVM_UTILS_H
+
+#include <stdint.h>
+#include "svm.h"
+#include "processor.h"
+
+#define CPUID_SVM_BIT 2
+#define CPUID_SVM BIT_ULL(CPUID_SVM_BIT)
+
+#define SVM_EXIT_VMMCALL 0x081
+
+struct svm_test_data {
+ /* VMCB */
+ struct vmcb *vmcb; /* gva */
+ void *vmcb_hva;
+ uint64_t vmcb_gpa;
+
+ /* host state-save area */
+ struct vmcb_save_area *save_area; /* gva */
+ void *save_area_hva;
+ uint64_t save_area_gpa;
+};
+
+struct svm_test_data *vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva);
+void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp);
+void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa);
+bool nested_svm_supported(void);
+void nested_svm_check_supported(void);
+
+static inline bool cpu_has_svm(void)
+{
+ u32 eax = 0x80000001, ecx;
+
+ asm("cpuid" :
+ "=a" (eax), "=c" (ecx) : "0" (eax) : "ebx", "edx");
+
+ return ecx & CPUID_SVM;
+}
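For reference, host-side tests usually gate on nested_svm_check_supported() (declared above), which bails out when SVM is unavailable, while guest code can use cpu_has_svm() directly; both test CPUID 0x80000001 ECX bit 2. An illustrative sketch:

	/* host side */
	nested_svm_check_supported();

	/* guest side */
	GUEST_ASSERT(cpu_has_svm());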
+
+#endif /* SELFTEST_KVM_SVM_UTILS_H */
diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h
new file mode 100644
index 000000000..e78d7e26b
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h
@@ -0,0 +1,625 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * tools/testing/selftests/kvm/include/x86_64/vmx.h
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+
+#ifndef SELFTEST_KVM_VMX_H
+#define SELFTEST_KVM_VMX_H
+
+#include <stdint.h>
+#include "processor.h"
+
+#define CPUID_VMX_BIT 5
+
+#define CPUID_VMX (1 << 5)
+
+/*
+ * Definitions of Primary Processor-Based VM-Execution Controls.
+ */
+#define CPU_BASED_INTR_WINDOW_EXITING 0x00000004
+#define CPU_BASED_USE_TSC_OFFSETTING 0x00000008
+#define CPU_BASED_HLT_EXITING 0x00000080
+#define CPU_BASED_INVLPG_EXITING 0x00000200
+#define CPU_BASED_MWAIT_EXITING 0x00000400
+#define CPU_BASED_RDPMC_EXITING 0x00000800
+#define CPU_BASED_RDTSC_EXITING 0x00001000
+#define CPU_BASED_CR3_LOAD_EXITING 0x00008000
+#define CPU_BASED_CR3_STORE_EXITING 0x00010000
+#define CPU_BASED_CR8_LOAD_EXITING 0x00080000
+#define CPU_BASED_CR8_STORE_EXITING 0x00100000
+#define CPU_BASED_TPR_SHADOW 0x00200000
+#define CPU_BASED_NMI_WINDOW_EXITING 0x00400000
+#define CPU_BASED_MOV_DR_EXITING 0x00800000
+#define CPU_BASED_UNCOND_IO_EXITING 0x01000000
+#define CPU_BASED_USE_IO_BITMAPS 0x02000000
+#define CPU_BASED_MONITOR_TRAP 0x08000000
+#define CPU_BASED_USE_MSR_BITMAPS 0x10000000
+#define CPU_BASED_MONITOR_EXITING 0x20000000
+#define CPU_BASED_PAUSE_EXITING 0x40000000
+#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000
+
+#define CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x0401e172
+
+/*
+ * Definitions of Secondary Processor-Based VM-Execution Controls.
+ */
+#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
+#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
+#define SECONDARY_EXEC_DESC 0x00000004
+#define SECONDARY_EXEC_ENABLE_RDTSCP 0x00000008
+#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010
+#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
+#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
+#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
+#define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100
+#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200
+#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
+#define SECONDARY_EXEC_RDRAND_EXITING 0x00000800
+#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
+#define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000
+#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
+#define SECONDARY_EXEC_RDSEED_EXITING 0x00010000
+#define SECONDARY_EXEC_ENABLE_PML 0x00020000
+#define SECONDARY_EPT_VE 0x00040000
+#define SECONDARY_ENABLE_XSAV_RESTORE 0x00100000
+#define SECONDARY_EXEC_TSC_SCALING 0x02000000
+
+#define PIN_BASED_EXT_INTR_MASK 0x00000001
+#define PIN_BASED_NMI_EXITING 0x00000008
+#define PIN_BASED_VIRTUAL_NMIS 0x00000020
+#define PIN_BASED_VMX_PREEMPTION_TIMER 0x00000040
+#define PIN_BASED_POSTED_INTR 0x00000080
+
+#define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x00000016
+
+#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000004
+#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
+#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000
+#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
+#define VM_EXIT_SAVE_IA32_PAT 0x00040000
+#define VM_EXIT_LOAD_IA32_PAT 0x00080000
+#define VM_EXIT_SAVE_IA32_EFER 0x00100000
+#define VM_EXIT_LOAD_IA32_EFER 0x00200000
+#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000
+
+#define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff
+
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000004
+#define VM_ENTRY_IA32E_MODE 0x00000200
+#define VM_ENTRY_SMM 0x00000400
+#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
+#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000
+#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
+#define VM_ENTRY_LOAD_IA32_EFER 0x00008000
+
+#define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff
+
+#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f
+#define VMX_MISC_SAVE_EFER_LMA 0x00000020
+
+#define EXIT_REASON_FAILED_VMENTRY 0x80000000
+#define EXIT_REASON_EXCEPTION_NMI 0
+#define EXIT_REASON_EXTERNAL_INTERRUPT 1
+#define EXIT_REASON_TRIPLE_FAULT 2
+#define EXIT_REASON_INTERRUPT_WINDOW 7
+#define EXIT_REASON_NMI_WINDOW 8
+#define EXIT_REASON_TASK_SWITCH 9
+#define EXIT_REASON_CPUID 10
+#define EXIT_REASON_HLT 12
+#define EXIT_REASON_INVD 13
+#define EXIT_REASON_INVLPG 14
+#define EXIT_REASON_RDPMC 15
+#define EXIT_REASON_RDTSC 16
+#define EXIT_REASON_VMCALL 18
+#define EXIT_REASON_VMCLEAR 19
+#define EXIT_REASON_VMLAUNCH 20
+#define EXIT_REASON_VMPTRLD 21
+#define EXIT_REASON_VMPTRST 22
+#define EXIT_REASON_VMREAD 23
+#define EXIT_REASON_VMRESUME 24
+#define EXIT_REASON_VMWRITE 25
+#define EXIT_REASON_VMOFF 26
+#define EXIT_REASON_VMON 27
+#define EXIT_REASON_CR_ACCESS 28
+#define EXIT_REASON_DR_ACCESS 29
+#define EXIT_REASON_IO_INSTRUCTION 30
+#define EXIT_REASON_MSR_READ 31
+#define EXIT_REASON_MSR_WRITE 32
+#define EXIT_REASON_INVALID_STATE 33
+#define EXIT_REASON_MWAIT_INSTRUCTION 36
+#define EXIT_REASON_MONITOR_INSTRUCTION 39
+#define EXIT_REASON_PAUSE_INSTRUCTION 40
+#define EXIT_REASON_MCE_DURING_VMENTRY 41
+#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
+#define EXIT_REASON_APIC_ACCESS 44
+#define EXIT_REASON_EOI_INDUCED 45
+#define EXIT_REASON_EPT_VIOLATION 48
+#define EXIT_REASON_EPT_MISCONFIG 49
+#define EXIT_REASON_INVEPT 50
+#define EXIT_REASON_RDTSCP 51
+#define EXIT_REASON_PREEMPTION_TIMER 52
+#define EXIT_REASON_INVVPID 53
+#define EXIT_REASON_WBINVD 54
+#define EXIT_REASON_XSETBV 55
+#define EXIT_REASON_APIC_WRITE 56
+#define EXIT_REASON_INVPCID 58
+#define EXIT_REASON_PML_FULL 62
+#define EXIT_REASON_XSAVES 63
+#define EXIT_REASON_XRSTORS 64
+#define LAST_EXIT_REASON 64
+
+enum vmcs_field {
+ VIRTUAL_PROCESSOR_ID = 0x00000000,
+ POSTED_INTR_NV = 0x00000002,
+ GUEST_ES_SELECTOR = 0x00000800,
+ GUEST_CS_SELECTOR = 0x00000802,
+ GUEST_SS_SELECTOR = 0x00000804,
+ GUEST_DS_SELECTOR = 0x00000806,
+ GUEST_FS_SELECTOR = 0x00000808,
+ GUEST_GS_SELECTOR = 0x0000080a,
+ GUEST_LDTR_SELECTOR = 0x0000080c,
+ GUEST_TR_SELECTOR = 0x0000080e,
+ GUEST_INTR_STATUS = 0x00000810,
+ GUEST_PML_INDEX = 0x00000812,
+ HOST_ES_SELECTOR = 0x00000c00,
+ HOST_CS_SELECTOR = 0x00000c02,
+ HOST_SS_SELECTOR = 0x00000c04,
+ HOST_DS_SELECTOR = 0x00000c06,
+ HOST_FS_SELECTOR = 0x00000c08,
+ HOST_GS_SELECTOR = 0x00000c0a,
+ HOST_TR_SELECTOR = 0x00000c0c,
+ IO_BITMAP_A = 0x00002000,
+ IO_BITMAP_A_HIGH = 0x00002001,
+ IO_BITMAP_B = 0x00002002,
+ IO_BITMAP_B_HIGH = 0x00002003,
+ MSR_BITMAP = 0x00002004,
+ MSR_BITMAP_HIGH = 0x00002005,
+ VM_EXIT_MSR_STORE_ADDR = 0x00002006,
+ VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007,
+ VM_EXIT_MSR_LOAD_ADDR = 0x00002008,
+ VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009,
+ VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a,
+ VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b,
+ PML_ADDRESS = 0x0000200e,
+ PML_ADDRESS_HIGH = 0x0000200f,
+ TSC_OFFSET = 0x00002010,
+ TSC_OFFSET_HIGH = 0x00002011,
+ VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
+ VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
+ APIC_ACCESS_ADDR = 0x00002014,
+ APIC_ACCESS_ADDR_HIGH = 0x00002015,
+ POSTED_INTR_DESC_ADDR = 0x00002016,
+ POSTED_INTR_DESC_ADDR_HIGH = 0x00002017,
+ EPT_POINTER = 0x0000201a,
+ EPT_POINTER_HIGH = 0x0000201b,
+ EOI_EXIT_BITMAP0 = 0x0000201c,
+ EOI_EXIT_BITMAP0_HIGH = 0x0000201d,
+ EOI_EXIT_BITMAP1 = 0x0000201e,
+ EOI_EXIT_BITMAP1_HIGH = 0x0000201f,
+ EOI_EXIT_BITMAP2 = 0x00002020,
+ EOI_EXIT_BITMAP2_HIGH = 0x00002021,
+ EOI_EXIT_BITMAP3 = 0x00002022,
+ EOI_EXIT_BITMAP3_HIGH = 0x00002023,
+ VMREAD_BITMAP = 0x00002026,
+ VMREAD_BITMAP_HIGH = 0x00002027,
+ VMWRITE_BITMAP = 0x00002028,
+ VMWRITE_BITMAP_HIGH = 0x00002029,
+ XSS_EXIT_BITMAP = 0x0000202C,
+ XSS_EXIT_BITMAP_HIGH = 0x0000202D,
+ TSC_MULTIPLIER = 0x00002032,
+ TSC_MULTIPLIER_HIGH = 0x00002033,
+ GUEST_PHYSICAL_ADDRESS = 0x00002400,
+ GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
+ VMCS_LINK_POINTER = 0x00002800,
+ VMCS_LINK_POINTER_HIGH = 0x00002801,
+ GUEST_IA32_DEBUGCTL = 0x00002802,
+ GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
+ GUEST_IA32_PAT = 0x00002804,
+ GUEST_IA32_PAT_HIGH = 0x00002805,
+ GUEST_IA32_EFER = 0x00002806,
+ GUEST_IA32_EFER_HIGH = 0x00002807,
+ GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
+ GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809,
+ GUEST_PDPTR0 = 0x0000280a,
+ GUEST_PDPTR0_HIGH = 0x0000280b,
+ GUEST_PDPTR1 = 0x0000280c,
+ GUEST_PDPTR1_HIGH = 0x0000280d,
+ GUEST_PDPTR2 = 0x0000280e,
+ GUEST_PDPTR2_HIGH = 0x0000280f,
+ GUEST_PDPTR3 = 0x00002810,
+ GUEST_PDPTR3_HIGH = 0x00002811,
+ GUEST_BNDCFGS = 0x00002812,
+ GUEST_BNDCFGS_HIGH = 0x00002813,
+ HOST_IA32_PAT = 0x00002c00,
+ HOST_IA32_PAT_HIGH = 0x00002c01,
+ HOST_IA32_EFER = 0x00002c02,
+ HOST_IA32_EFER_HIGH = 0x00002c03,
+ HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
+ HOST_IA32_PERF_GLOBAL_CTRL_HIGH = 0x00002c05,
+ PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
+ CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
+ EXCEPTION_BITMAP = 0x00004004,
+ PAGE_FAULT_ERROR_CODE_MASK = 0x00004006,
+ PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008,
+ CR3_TARGET_COUNT = 0x0000400a,
+ VM_EXIT_CONTROLS = 0x0000400c,
+ VM_EXIT_MSR_STORE_COUNT = 0x0000400e,
+ VM_EXIT_MSR_LOAD_COUNT = 0x00004010,
+ VM_ENTRY_CONTROLS = 0x00004012,
+ VM_ENTRY_MSR_LOAD_COUNT = 0x00004014,
+ VM_ENTRY_INTR_INFO_FIELD = 0x00004016,
+ VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018,
+ VM_ENTRY_INSTRUCTION_LEN = 0x0000401a,
+ TPR_THRESHOLD = 0x0000401c,
+ SECONDARY_VM_EXEC_CONTROL = 0x0000401e,
+ PLE_GAP = 0x00004020,
+ PLE_WINDOW = 0x00004022,
+ VM_INSTRUCTION_ERROR = 0x00004400,
+ VM_EXIT_REASON = 0x00004402,
+ VM_EXIT_INTR_INFO = 0x00004404,
+ VM_EXIT_INTR_ERROR_CODE = 0x00004406,
+ IDT_VECTORING_INFO_FIELD = 0x00004408,
+ IDT_VECTORING_ERROR_CODE = 0x0000440a,
+ VM_EXIT_INSTRUCTION_LEN = 0x0000440c,
+ VMX_INSTRUCTION_INFO = 0x0000440e,
+ GUEST_ES_LIMIT = 0x00004800,
+ GUEST_CS_LIMIT = 0x00004802,
+ GUEST_SS_LIMIT = 0x00004804,
+ GUEST_DS_LIMIT = 0x00004806,
+ GUEST_FS_LIMIT = 0x00004808,
+ GUEST_GS_LIMIT = 0x0000480a,
+ GUEST_LDTR_LIMIT = 0x0000480c,
+ GUEST_TR_LIMIT = 0x0000480e,
+ GUEST_GDTR_LIMIT = 0x00004810,
+ GUEST_IDTR_LIMIT = 0x00004812,
+ GUEST_ES_AR_BYTES = 0x00004814,
+ GUEST_CS_AR_BYTES = 0x00004816,
+ GUEST_SS_AR_BYTES = 0x00004818,
+ GUEST_DS_AR_BYTES = 0x0000481a,
+ GUEST_FS_AR_BYTES = 0x0000481c,
+ GUEST_GS_AR_BYTES = 0x0000481e,
+ GUEST_LDTR_AR_BYTES = 0x00004820,
+ GUEST_TR_AR_BYTES = 0x00004822,
+ GUEST_INTERRUPTIBILITY_INFO = 0x00004824,
+ GUEST_ACTIVITY_STATE = 0X00004826,
+ GUEST_SYSENTER_CS = 0x0000482A,
+ VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
+ HOST_IA32_SYSENTER_CS = 0x00004c00,
+ CR0_GUEST_HOST_MASK = 0x00006000,
+ CR4_GUEST_HOST_MASK = 0x00006002,
+ CR0_READ_SHADOW = 0x00006004,
+ CR4_READ_SHADOW = 0x00006006,
+ CR3_TARGET_VALUE0 = 0x00006008,
+ CR3_TARGET_VALUE1 = 0x0000600a,
+ CR3_TARGET_VALUE2 = 0x0000600c,
+ CR3_TARGET_VALUE3 = 0x0000600e,
+ EXIT_QUALIFICATION = 0x00006400,
+ GUEST_LINEAR_ADDRESS = 0x0000640a,
+ GUEST_CR0 = 0x00006800,
+ GUEST_CR3 = 0x00006802,
+ GUEST_CR4 = 0x00006804,
+ GUEST_ES_BASE = 0x00006806,
+ GUEST_CS_BASE = 0x00006808,
+ GUEST_SS_BASE = 0x0000680a,
+ GUEST_DS_BASE = 0x0000680c,
+ GUEST_FS_BASE = 0x0000680e,
+ GUEST_GS_BASE = 0x00006810,
+ GUEST_LDTR_BASE = 0x00006812,
+ GUEST_TR_BASE = 0x00006814,
+ GUEST_GDTR_BASE = 0x00006816,
+ GUEST_IDTR_BASE = 0x00006818,
+ GUEST_DR7 = 0x0000681a,
+ GUEST_RSP = 0x0000681c,
+ GUEST_RIP = 0x0000681e,
+ GUEST_RFLAGS = 0x00006820,
+ GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822,
+ GUEST_SYSENTER_ESP = 0x00006824,
+ GUEST_SYSENTER_EIP = 0x00006826,
+ HOST_CR0 = 0x00006c00,
+ HOST_CR3 = 0x00006c02,
+ HOST_CR4 = 0x00006c04,
+ HOST_FS_BASE = 0x00006c06,
+ HOST_GS_BASE = 0x00006c08,
+ HOST_TR_BASE = 0x00006c0a,
+ HOST_GDTR_BASE = 0x00006c0c,
+ HOST_IDTR_BASE = 0x00006c0e,
+ HOST_IA32_SYSENTER_ESP = 0x00006c10,
+ HOST_IA32_SYSENTER_EIP = 0x00006c12,
+ HOST_RSP = 0x00006c14,
+ HOST_RIP = 0x00006c16,
+};
+
+struct vmx_msr_entry {
+ uint32_t index;
+ uint32_t reserved;
+ uint64_t value;
+} __attribute__ ((aligned(16)));
+
+#include "evmcs.h"
+
+static inline int vmxon(uint64_t phys)
+{
+ uint8_t ret;
+
+ __asm__ __volatile__ ("vmxon %[pa]; setna %[ret]"
+ : [ret]"=rm"(ret)
+ : [pa]"m"(phys)
+ : "cc", "memory");
+
+ return ret;
+}
+
+static inline void vmxoff(void)
+{
+ __asm__ __volatile__("vmxoff");
+}
+
+static inline int vmclear(uint64_t vmcs_pa)
+{
+ uint8_t ret;
+
+ __asm__ __volatile__ ("vmclear %[pa]; setna %[ret]"
+ : [ret]"=rm"(ret)
+ : [pa]"m"(vmcs_pa)
+ : "cc", "memory");
+
+ return ret;
+}
+
+static inline int vmptrld(uint64_t vmcs_pa)
+{
+ uint8_t ret;
+
+ if (enable_evmcs)
+ return -1;
+
+ __asm__ __volatile__ ("vmptrld %[pa]; setna %[ret]"
+ : [ret]"=rm"(ret)
+ : [pa]"m"(vmcs_pa)
+ : "cc", "memory");
+
+ return ret;
+}
+
+static inline int vmptrst(uint64_t *value)
+{
+ uint64_t tmp;
+ uint8_t ret;
+
+ if (enable_evmcs)
+ return evmcs_vmptrst(value);
+
+ __asm__ __volatile__("vmptrst %[value]; setna %[ret]"
+ : [value]"=m"(tmp), [ret]"=rm"(ret)
+ : : "cc", "memory");
+
+ *value = tmp;
+ return ret;
+}
+
+/*
+ * A wrapper around vmptrst that ignores errors and returns zero if the
+ * vmptrst instruction fails.
+ */
+static inline uint64_t vmptrstz(void)
+{
+ uint64_t value = 0;
+ vmptrst(&value);
+ return value;
+}
+
+/*
+ * No guest state (e.g. GPRs) is established by this vmlaunch.
+ */
+static inline int vmlaunch(void)
+{
+ int ret;
+
+ if (enable_evmcs)
+ return evmcs_vmlaunch();
+
+ __asm__ __volatile__("push %%rbp;"
+ "push %%rcx;"
+ "push %%rdx;"
+ "push %%rsi;"
+ "push %%rdi;"
+ "push $0;"
+ "vmwrite %%rsp, %[host_rsp];"
+ "lea 1f(%%rip), %%rax;"
+ "vmwrite %%rax, %[host_rip];"
+ "vmlaunch;"
+ "incq (%%rsp);"
+ "1: pop %%rax;"
+ "pop %%rdi;"
+ "pop %%rsi;"
+ "pop %%rdx;"
+ "pop %%rcx;"
+ "pop %%rbp;"
+ : [ret]"=&a"(ret)
+ : [host_rsp]"r"((uint64_t)HOST_RSP),
+ [host_rip]"r"((uint64_t)HOST_RIP)
+ : "memory", "cc", "rbx", "r8", "r9", "r10",
+ "r11", "r12", "r13", "r14", "r15");
+ return ret;
+}
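+
+/*
+ * Illustrative sketch (editor's example, not part of this patch): a typical
+ * L1 guest uses the helpers declared later in this header to enter an L2
+ * guest and then checks the exit reason. GUEST_ASSERT and EXIT_REASON_VMCALL
+ * come from other selftest headers; l2_guest_code and l2_stack_top are
+ * hypothetical test symbols.
+ *
+ *	GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+ *	GUEST_ASSERT(load_vmcs(vmx_pages));
+ *	prepare_vmcs(vmx_pages, l2_guest_code, l2_stack_top);
+ *	GUEST_ASSERT(!vmlaunch());
+ *	GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+ */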
+
+/*
+ * No guest state (e.g. GPRs) is established by this vmresume.
+ */
+static inline int vmresume(void)
+{
+ int ret;
+
+ if (enable_evmcs)
+ return evmcs_vmresume();
+
+ __asm__ __volatile__("push %%rbp;"
+ "push %%rcx;"
+ "push %%rdx;"
+ "push %%rsi;"
+ "push %%rdi;"
+ "push $0;"
+ "vmwrite %%rsp, %[host_rsp];"
+ "lea 1f(%%rip), %%rax;"
+ "vmwrite %%rax, %[host_rip];"
+ "vmresume;"
+ "incq (%%rsp);"
+ "1: pop %%rax;"
+ "pop %%rdi;"
+ "pop %%rsi;"
+ "pop %%rdx;"
+ "pop %%rcx;"
+ "pop %%rbp;"
+ : [ret]"=&a"(ret)
+ : [host_rsp]"r"((uint64_t)HOST_RSP),
+ [host_rip]"r"((uint64_t)HOST_RIP)
+ : "memory", "cc", "rbx", "r8", "r9", "r10",
+ "r11", "r12", "r13", "r14", "r15");
+ return ret;
+}
+
+static inline void vmcall(void)
+{
+ /* Currently, L1 destroys our GPRs during vmexits. */
+ __asm__ __volatile__("push %%rbp; vmcall; pop %%rbp" : : :
+ "rax", "rbx", "rcx", "rdx",
+ "rsi", "rdi", "r8", "r9", "r10", "r11", "r12",
+ "r13", "r14", "r15");
+}
+
+static inline int vmread(uint64_t encoding, uint64_t *value)
+{
+ uint64_t tmp;
+ uint8_t ret;
+
+ if (enable_evmcs)
+ return evmcs_vmread(encoding, value);
+
+ __asm__ __volatile__("vmread %[encoding], %[value]; setna %[ret]"
+ : [value]"=rm"(tmp), [ret]"=rm"(ret)
+ : [encoding]"r"(encoding)
+ : "cc", "memory");
+
+ *value = tmp;
+ return ret;
+}
+
+/*
+ * A wrapper around vmread that ignores errors and returns zero if the
+ * vmread instruction fails.
+ */
+static inline uint64_t vmreadz(uint64_t encoding)
+{
+ uint64_t value = 0;
+ vmread(encoding, &value);
+ return value;
+}
+
+static inline int vmwrite(uint64_t encoding, uint64_t value)
+{
+ uint8_t ret;
+
+ if (enable_evmcs)
+ return evmcs_vmwrite(encoding, value);
+
+ __asm__ __volatile__ ("vmwrite %[value], %[encoding]; setna %[ret]"
+ : [ret]"=rm"(ret)
+ : [value]"rm"(value), [encoding]"r"(encoding)
+ : "cc", "memory");
+
+ return ret;
+}
+
+static inline uint32_t vmcs_revision(void)
+{
+ return rdmsr(MSR_IA32_VMX_BASIC);
+}
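+
+/*
+ * Illustrative sketch (editor's example, not part of this patch): individual
+ * VMCS fields are accessed by passing one of the encodings from the enum
+ * above to the vmwrite()/vmreadz() wrappers, e.g.:
+ *
+ *	vmwrite(GUEST_RIP, (uint64_t)l2_entry_point);
+ *	vmwrite(GUEST_RSP, (uint64_t)l2_stack_top);
+ *	uint64_t reason = vmreadz(VM_EXIT_REASON);
+ *
+ * l2_entry_point and l2_stack_top are hypothetical names used only for the
+ * example.
+ */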
+
+struct vmx_pages {
+ void *vmxon_hva;
+ uint64_t vmxon_gpa;
+ void *vmxon;
+
+ void *vmcs_hva;
+ uint64_t vmcs_gpa;
+ void *vmcs;
+
+ void *msr_hva;
+ uint64_t msr_gpa;
+ void *msr;
+
+ void *shadow_vmcs_hva;
+ uint64_t shadow_vmcs_gpa;
+ void *shadow_vmcs;
+
+ void *vmread_hva;
+ uint64_t vmread_gpa;
+ void *vmread;
+
+ void *vmwrite_hva;
+ uint64_t vmwrite_gpa;
+ void *vmwrite;
+
+ void *vp_assist_hva;
+ uint64_t vp_assist_gpa;
+ void *vp_assist;
+
+ void *enlightened_vmcs_hva;
+ uint64_t enlightened_vmcs_gpa;
+ void *enlightened_vmcs;
+
+ void *eptp_hva;
+ uint64_t eptp_gpa;
+ void *eptp;
+
+ void *apic_access_hva;
+ uint64_t apic_access_gpa;
+ void *apic_access;
+};
+
+union vmx_basic {
+ u64 val;
+ struct {
+ u32 revision;
+ u32 size:13,
+ reserved1:3,
+ width:1,
+ dual:1,
+ type:4,
+ insouts:1,
+ ctrl:1,
+ vm_entry_exception_ctrl:1,
+ reserved2:7;
+ };
+};
+
+union vmx_ctrl_msr {
+ u64 val;
+ struct {
+ u32 set, clr;
+ };
+};
+
+struct vmx_pages *vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva);
+bool prepare_for_vmx_operation(struct vmx_pages *vmx);
+void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp);
+bool load_vmcs(struct vmx_pages *vmx);
+
+bool nested_vmx_supported(void);
+void nested_vmx_check_supported(void);
+
+void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint64_t nested_paddr, uint64_t paddr, uint32_t eptp_memslot);
+void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint64_t nested_paddr, uint64_t paddr, uint64_t size,
+ uint32_t eptp_memslot);
+void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint32_t memslot, uint32_t eptp_memslot);
+void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint32_t eptp_memslot);
+void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint32_t eptp_memslot);
+
+#endif /* SELFTEST_KVM_VMX_H */
diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c
new file mode 100644
index 000000000..aa3795cd7
--- /dev/null
+++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * kvm_create_max_vcpus
+ *
+ * Copyright (C) 2019, Google LLC.
+ *
+ * Test for KVM_CAP_MAX_VCPUS and KVM_CAP_MAX_VCPU_ID.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/resource.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "asm/kvm.h"
+#include "linux/kvm.h"
+
+void test_vcpu_creation(int first_vcpu_id, int num_vcpus)
+{
+ struct kvm_vm *vm;
+ int i;
+
+ pr_info("Testing creating %d vCPUs, with IDs %d...%d.\n",
+ num_vcpus, first_vcpu_id, first_vcpu_id + num_vcpus - 1);
+
+ vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR);
+
+ for (i = first_vcpu_id; i < first_vcpu_id + num_vcpus; i++)
+ /* This asserts that the vCPU was created. */
+ vm_vcpu_add(vm, i);
+
+ kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+ int kvm_max_vcpu_id = kvm_check_cap(KVM_CAP_MAX_VCPU_ID);
+ int kvm_max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
+ /*
+	 * Number of file descriptors required: KVM_CAP_MAX_VCPUS for vCPU fds +
+ * an arbitrary number for everything else.
+ */
+ int nr_fds_wanted = kvm_max_vcpus + 100;
+ struct rlimit rl;
+
+ pr_info("KVM_CAP_MAX_VCPU_ID: %d\n", kvm_max_vcpu_id);
+ pr_info("KVM_CAP_MAX_VCPUS: %d\n", kvm_max_vcpus);
+
+ /*
+ * Check that we're allowed to open nr_fds_wanted file descriptors and
+ * try raising the limits if needed.
+ */
+ TEST_ASSERT(!getrlimit(RLIMIT_NOFILE, &rl), "getrlimit() failed!");
+
+ if (rl.rlim_cur < nr_fds_wanted) {
+ rl.rlim_cur = nr_fds_wanted;
+ if (rl.rlim_max < nr_fds_wanted) {
+ int old_rlim_max = rl.rlim_max;
+ rl.rlim_max = nr_fds_wanted;
+
+ int r = setrlimit(RLIMIT_NOFILE, &rl);
+ if (r < 0) {
+ printf("RLIMIT_NOFILE hard limit is too low (%d, wanted %d)\n",
+ old_rlim_max, nr_fds_wanted);
+ exit(KSFT_SKIP);
+ }
+ } else {
+ TEST_ASSERT(!setrlimit(RLIMIT_NOFILE, &rl), "setrlimit() failed!");
+ }
+ }
+
+ /*
+ * Upstream KVM prior to 4.8 does not support KVM_CAP_MAX_VCPU_ID.
+ * Userspace is supposed to use KVM_CAP_MAX_VCPUS as the maximum ID
+ * in this case.
+ */
+ if (!kvm_max_vcpu_id)
+ kvm_max_vcpu_id = kvm_max_vcpus;
+
+ TEST_ASSERT(kvm_max_vcpu_id >= kvm_max_vcpus,
+ "KVM_MAX_VCPU_ID (%d) must be at least as large as KVM_MAX_VCPUS (%d).",
+ kvm_max_vcpu_id, kvm_max_vcpus);
+
+ test_vcpu_creation(0, kvm_max_vcpus);
+
+ if (kvm_max_vcpu_id > kvm_max_vcpus)
+ test_vcpu_creation(
+ kvm_max_vcpu_id - kvm_max_vcpus, kvm_max_vcpus);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c
new file mode 100644
index 000000000..d6c32c328
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c
@@ -0,0 +1,356 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AArch64 code
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include <linux/compiler.h>
+
+#include "kvm_util.h"
+#include "../kvm_util_internal.h"
+#include "processor.h"
+
+#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000
+#define DEFAULT_ARM64_GUEST_STACK_VADDR_MIN 0xac0000
+
+static uint64_t page_align(struct kvm_vm *vm, uint64_t v)
+{
+ return (v + vm->page_size) & ~(vm->page_size - 1);
+}
+
+static uint64_t pgd_index(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+ unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift;
+ uint64_t mask = (1UL << (vm->va_bits - shift)) - 1;
+
+ return (gva >> shift) & mask;
+}
+
+static uint64_t pud_index(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+ unsigned int shift = 2 * (vm->page_shift - 3) + vm->page_shift;
+ uint64_t mask = (1UL << (vm->page_shift - 3)) - 1;
+
+ TEST_ASSERT(vm->pgtable_levels == 4,
+ "Mode %d does not have 4 page table levels", vm->mode);
+
+ return (gva >> shift) & mask;
+}
+
+static uint64_t pmd_index(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+ unsigned int shift = (vm->page_shift - 3) + vm->page_shift;
+ uint64_t mask = (1UL << (vm->page_shift - 3)) - 1;
+
+ TEST_ASSERT(vm->pgtable_levels >= 3,
+ "Mode %d does not have >= 3 page table levels", vm->mode);
+
+ return (gva >> shift) & mask;
+}
+
+static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+ uint64_t mask = (1UL << (vm->page_shift - 3)) - 1;
+ return (gva >> vm->page_shift) & mask;
+}
+
+static uint64_t pte_addr(struct kvm_vm *vm, uint64_t entry)
+{
+ uint64_t mask = ((1UL << (vm->va_bits - vm->page_shift)) - 1) << vm->page_shift;
+ return entry & mask;
+}
+
+static uint64_t ptrs_per_pgd(struct kvm_vm *vm)
+{
+ unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift;
+ return 1 << (vm->va_bits - shift);
+}
+
+static uint64_t __maybe_unused ptrs_per_pte(struct kvm_vm *vm)
+{
+ return 1 << (vm->page_shift - 3);
+}
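+
+/*
+ * Worked example (editor's note): with 4K pages (page_shift = 12) each table
+ * level resolves page_shift - 3 = 9 bits, i.e. 512 entries per table. For a
+ * 4-level, 48-bit VA configuration the indices above come from:
+ *
+ *	pte_index: VA bits [20:12]
+ *	pmd_index: VA bits [29:21]	(shift = 9 + 12 = 21)
+ *	pud_index: VA bits [38:30]	(shift = 2 * 9 + 12 = 30)
+ *	pgd_index: VA bits [47:39]	(shift = 3 * 9 + 12 = 39)
+ */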
+
+void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot)
+{
+ if (!vm->pgd_created) {
+ vm_paddr_t paddr = vm_phy_pages_alloc(vm,
+ page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size,
+ KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot);
+ vm->pgd = paddr;
+ vm->pgd_created = true;
+ }
+}
+
+void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
+ uint32_t pgd_memslot, uint64_t flags)
+{
+ uint8_t attr_idx = flags & 7;
+ uint64_t *ptep;
+
+ TEST_ASSERT((vaddr % vm->page_size) == 0,
+ "Virtual address not on page boundary,\n"
+ " vaddr: 0x%lx vm->page_size: 0x%x", vaddr, vm->page_size);
+ TEST_ASSERT(sparsebit_is_set(vm->vpages_valid,
+ (vaddr >> vm->page_shift)),
+ "Invalid virtual address, vaddr: 0x%lx", vaddr);
+ TEST_ASSERT((paddr % vm->page_size) == 0,
+ "Physical address not on page boundary,\n"
+ " paddr: 0x%lx vm->page_size: 0x%x", paddr, vm->page_size);
+ TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn,
+		    "Physical address beyond maximum supported,\n"
+ " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
+ paddr, vm->max_gfn, vm->page_size);
+
+ ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, vaddr) * 8;
+ if (!*ptep) {
+ *ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot);
+ *ptep |= 3;
+ }
+
+ switch (vm->pgtable_levels) {
+ case 4:
+ ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, vaddr) * 8;
+ if (!*ptep) {
+ *ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot);
+ *ptep |= 3;
+ }
+ /* fall through */
+ case 3:
+ ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pmd_index(vm, vaddr) * 8;
+ if (!*ptep) {
+ *ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot);
+ *ptep |= 3;
+ }
+ /* fall through */
+ case 2:
+ ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pte_index(vm, vaddr) * 8;
+ break;
+ default:
+ TEST_FAIL("Page table levels must be 2, 3, or 4");
+ }
+
+ *ptep = paddr | 3;
+ *ptep |= (attr_idx << 2) | (1 << 10) /* Access Flag */;
+}
+
+void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
+ uint32_t pgd_memslot)
+{
+ uint64_t attr_idx = 4; /* NORMAL (See DEFAULT_MAIR_EL1) */
+
+ _virt_pg_map(vm, vaddr, paddr, pgd_memslot, attr_idx);
+}
+
+vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+ uint64_t *ptep;
+
+ if (!vm->pgd_created)
+ goto unmapped_gva;
+
+ ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, gva) * 8;
+ if (!ptep)
+ goto unmapped_gva;
+
+ switch (vm->pgtable_levels) {
+ case 4:
+ ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, gva) * 8;
+ if (!ptep)
+ goto unmapped_gva;
+ /* fall through */
+ case 3:
+ ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pmd_index(vm, gva) * 8;
+ if (!ptep)
+ goto unmapped_gva;
+ /* fall through */
+ case 2:
+ ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pte_index(vm, gva) * 8;
+ if (!ptep)
+ goto unmapped_gva;
+ break;
+ default:
+ TEST_FAIL("Page table levels must be 2, 3, or 4");
+ }
+
+ return pte_addr(vm, *ptep) + (gva & (vm->page_size - 1));
+
+unmapped_gva:
+ TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva);
+ exit(1);
+}
+
+static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, uint64_t page, int level)
+{
+#ifdef DEBUG
+ static const char * const type[] = { "", "pud", "pmd", "pte" };
+ uint64_t pte, *ptep;
+
+ if (level == 4)
+ return;
+
+ for (pte = page; pte < page + ptrs_per_pte(vm) * 8; pte += 8) {
+ ptep = addr_gpa2hva(vm, pte);
+ if (!*ptep)
+ continue;
+ fprintf(stream, "%*s%s: %lx: %lx at %p\n", indent, "", type[level], pte, *ptep, ptep);
+ pte_dump(stream, vm, indent + 1, pte_addr(vm, *ptep), level + 1);
+ }
+#endif
+}
+
+void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
+{
+ int level = 4 - (vm->pgtable_levels - 1);
+ uint64_t pgd, *ptep;
+
+ if (!vm->pgd_created)
+ return;
+
+ for (pgd = vm->pgd; pgd < vm->pgd + ptrs_per_pgd(vm) * 8; pgd += 8) {
+ ptep = addr_gpa2hva(vm, pgd);
+ if (!*ptep)
+ continue;
+ fprintf(stream, "%*spgd: %lx: %lx at %p\n", indent, "", pgd, *ptep, ptep);
+ pte_dump(stream, vm, indent + 1, pte_addr(vm, *ptep), level);
+ }
+}
+
+struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
+ void *guest_code)
+{
+ uint64_t ptrs_per_4k_pte = 512;
+ uint64_t extra_pg_pages = (extra_mem_pages / ptrs_per_4k_pte) * 2;
+ struct kvm_vm *vm;
+
+ vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR);
+
+ kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
+ vm_vcpu_add_default(vm, vcpuid, guest_code);
+
+ return vm;
+}
+
+void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *init)
+{
+ struct kvm_vcpu_init default_init = { .target = -1, };
+ uint64_t sctlr_el1, tcr_el1;
+
+ if (!init)
+ init = &default_init;
+
+ if (init->target == -1) {
+ struct kvm_vcpu_init preferred;
+ vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &preferred);
+ init->target = preferred.target;
+ }
+
+ vcpu_ioctl(vm, vcpuid, KVM_ARM_VCPU_INIT, init);
+
+ /*
+ * Enable FP/ASIMD to avoid trapping when accessing Q0-Q15
+ * registers, which the variable argument list macros do.
+ */
+ set_reg(vm, vcpuid, ARM64_SYS_REG(CPACR_EL1), 3 << 20);
+
+ get_reg(vm, vcpuid, ARM64_SYS_REG(SCTLR_EL1), &sctlr_el1);
+ get_reg(vm, vcpuid, ARM64_SYS_REG(TCR_EL1), &tcr_el1);
+
+ switch (vm->mode) {
+ case VM_MODE_P52V48_4K:
+ TEST_FAIL("AArch64 does not support 4K sized pages "
+ "with 52-bit physical address ranges");
+ case VM_MODE_PXXV48_4K:
+ TEST_FAIL("AArch64 does not support 4K sized pages "
+ "with ANY-bit physical address ranges");
+ case VM_MODE_P52V48_64K:
+ tcr_el1 |= 1ul << 14; /* TG0 = 64KB */
+ tcr_el1 |= 6ul << 32; /* IPS = 52 bits */
+ break;
+ case VM_MODE_P48V48_4K:
+ tcr_el1 |= 0ul << 14; /* TG0 = 4KB */
+ tcr_el1 |= 5ul << 32; /* IPS = 48 bits */
+ break;
+ case VM_MODE_P48V48_64K:
+ tcr_el1 |= 1ul << 14; /* TG0 = 64KB */
+ tcr_el1 |= 5ul << 32; /* IPS = 48 bits */
+ break;
+ case VM_MODE_P40V48_4K:
+ tcr_el1 |= 0ul << 14; /* TG0 = 4KB */
+ tcr_el1 |= 2ul << 32; /* IPS = 40 bits */
+ break;
+ case VM_MODE_P40V48_64K:
+ tcr_el1 |= 1ul << 14; /* TG0 = 64KB */
+ tcr_el1 |= 2ul << 32; /* IPS = 40 bits */
+ break;
+ default:
+ TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode);
+ }
+
+ sctlr_el1 |= (1 << 0) | (1 << 2) | (1 << 12) /* M | C | I */;
+ /* TCR_EL1 |= IRGN0:WBWA | ORGN0:WBWA | SH0:Inner-Shareable */;
+ tcr_el1 |= (1 << 8) | (1 << 10) | (3 << 12);
+ tcr_el1 |= (64 - vm->va_bits) /* T0SZ */;
+
+ set_reg(vm, vcpuid, ARM64_SYS_REG(SCTLR_EL1), sctlr_el1);
+ set_reg(vm, vcpuid, ARM64_SYS_REG(TCR_EL1), tcr_el1);
+ set_reg(vm, vcpuid, ARM64_SYS_REG(MAIR_EL1), DEFAULT_MAIR_EL1);
+ set_reg(vm, vcpuid, ARM64_SYS_REG(TTBR0_EL1), vm->pgd);
+}
+
+void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent)
+{
+ uint64_t pstate, pc;
+
+ get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pstate), &pstate);
+ get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), &pc);
+
+ fprintf(stream, "%*spstate: 0x%.16lx pc: 0x%.16lx\n",
+ indent, "", pstate, pc);
+}
+
+void aarch64_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_vcpu_init *init, void *guest_code)
+{
+ size_t stack_size = vm->page_size == 4096 ?
+ DEFAULT_STACK_PGS * vm->page_size :
+ vm->page_size;
+ uint64_t stack_vaddr = vm_vaddr_alloc(vm, stack_size,
+ DEFAULT_ARM64_GUEST_STACK_VADDR_MIN, 0, 0);
+
+ vm_vcpu_add(vm, vcpuid);
+ aarch64_vcpu_setup(vm, vcpuid, init);
+
+ set_reg(vm, vcpuid, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size);
+ set_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code);
+}
+
+void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
+{
+ aarch64_vcpu_add_default(vm, vcpuid, NULL, guest_code);
+}
+
+void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...)
+{
+ va_list ap;
+ int i;
+
+ TEST_ASSERT(num >= 1 && num <= 8, "Unsupported number of args,\n"
+ " num: %u\n", num);
+
+ va_start(ap, num);
+
+ for (i = 0; i < num; i++) {
+ set_reg(vm, vcpuid, ARM64_CORE_REG(regs.regs[i]),
+ va_arg(ap, uint64_t));
+ }
+
+ va_end(ap);
+}
+
+void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid)
+{
+}
diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c
new file mode 100644
index 000000000..f600311fd
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/aarch64/ucall.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ucall support. A ucall is a "hypercall to userspace".
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ */
+#include "kvm_util.h"
+#include "../kvm_util_internal.h"
+
+static vm_vaddr_t *ucall_exit_mmio_addr;
+
+static bool ucall_mmio_init(struct kvm_vm *vm, vm_paddr_t gpa)
+{
+ if (kvm_userspace_memory_region_find(vm, gpa, gpa + 1))
+ return false;
+
+ virt_pg_map(vm, gpa, gpa, 0);
+
+ ucall_exit_mmio_addr = (vm_vaddr_t *)gpa;
+ sync_global_to_guest(vm, ucall_exit_mmio_addr);
+
+ return true;
+}
+
+void ucall_init(struct kvm_vm *vm, void *arg)
+{
+ vm_paddr_t gpa, start, end, step, offset;
+ unsigned int bits;
+ bool ret;
+
+ if (arg) {
+ gpa = (vm_paddr_t)arg;
+ ret = ucall_mmio_init(vm, gpa);
+ TEST_ASSERT(ret, "Can't set ucall mmio address to %lx", gpa);
+ return;
+ }
+
+ /*
+ * Find an address within the allowed physical and virtual address
+ * spaces, that does _not_ have a KVM memory region associated with
+ * it. Identity mapping an address like this allows the guest to
+ * access it, but as KVM doesn't know what to do with it, it
+ * will assume it's something userspace handles and exit with
+ * KVM_EXIT_MMIO. Well, at least that's how it works for AArch64.
+ * Here we start with a guess that the addresses around 5/8th
+ * of the allowed space are unmapped and then work both down and
+ * up from there in 1/16th allowed space sized steps.
+ *
+ * Note, we need to use VA-bits - 1 when calculating the allowed
+ * virtual address space for an identity mapping because the upper
+ * half of the virtual address space is the two's complement of the
+ * lower and won't match physical addresses.
+ */
+ bits = vm->va_bits - 1;
+ bits = vm->pa_bits < bits ? vm->pa_bits : bits;
+ end = 1ul << bits;
+ start = end * 5 / 8;
+ step = end / 16;
+ for (offset = 0; offset < end - start; offset += step) {
+ if (ucall_mmio_init(vm, start - offset))
+ return;
+ if (ucall_mmio_init(vm, start + offset))
+ return;
+ }
+ TEST_FAIL("Can't find a ucall mmio address");
+}
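+
+/*
+ * Worked example (editor's note): for a VM with 40 PA bits and 48 VA bits,
+ * bits = min(40, 47) = 40, so end = 2^40 (1 TiB), start = 5/8 * 2^40
+ * (640 GiB) and step = 2^40 / 16 (64 GiB). The loop above therefore probes
+ * 640 GiB, then 576/704 GiB, 512/768 GiB, and so on, until it finds an
+ * address with no memslot behind it.
+ */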
+
+void ucall_uninit(struct kvm_vm *vm)
+{
+ ucall_exit_mmio_addr = 0;
+ sync_global_to_guest(vm, ucall_exit_mmio_addr);
+}
+
+void ucall(uint64_t cmd, int nargs, ...)
+{
+ struct ucall uc = {};
+ va_list va;
+ int i;
+
+ WRITE_ONCE(uc.cmd, cmd);
+ nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS;
+
+ va_start(va, nargs);
+ for (i = 0; i < nargs; ++i)
+ WRITE_ONCE(uc.args[i], va_arg(va, uint64_t));
+ va_end(va);
+
+ WRITE_ONCE(*ucall_exit_mmio_addr, (vm_vaddr_t)&uc);
+}
+
+uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc)
+{
+ struct kvm_run *run = vcpu_state(vm, vcpu_id);
+ struct ucall ucall = {};
+
+ if (uc)
+ memset(uc, 0, sizeof(*uc));
+
+ if (run->exit_reason == KVM_EXIT_MMIO &&
+ run->mmio.phys_addr == (uint64_t)ucall_exit_mmio_addr) {
+ vm_vaddr_t gva;
+
+ TEST_ASSERT(run->mmio.is_write && run->mmio.len == 8,
+ "Unexpected ucall exit mmio address access");
+ memcpy(&gva, run->mmio.data, sizeof(gva));
+ memcpy(&ucall, addr_gva2hva(vm, gva), sizeof(ucall));
+
+ vcpu_run_complete_io(vm, vcpu_id);
+ if (uc)
+ memcpy(uc, &ucall, sizeof(ucall));
+ }
+
+ return ucall.cmd;
+}
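+
+/*
+ * Illustrative sketch (editor's example, not part of this patch): the host
+ * side of a test typically runs the vCPU and dispatches on the ucall
+ * command, e.g.:
+ *
+ *	struct ucall uc;
+ *
+ *	vcpu_run(vm, VCPU_ID);
+ *	switch (get_ucall(vm, VCPU_ID, &uc)) {
+ *	case UCALL_SYNC:
+ *		break;		// uc.args[] carry the guest's GUEST_SYNC() args
+ *	case UCALL_ABORT:
+ *		TEST_FAIL("%s", (const char *)uc.args[0]);
+ *	case UCALL_DONE:
+ *		return;
+ *	}
+ *
+ * VCPU_ID is a hypothetical, test-local vCPU id; vcpu_run(), the UCALL_*
+ * commands and GUEST_SYNC() come from the selftest headers.
+ */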
diff --git a/tools/testing/selftests/kvm/lib/assert.c b/tools/testing/selftests/kvm/lib/assert.c
new file mode 100644
index 000000000..5ebbd0d6b
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/assert.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tools/testing/selftests/kvm/lib/assert.c
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+
+#define _GNU_SOURCE /* for getline(3) and strchrnul(3)*/
+
+#include "test_util.h"
+
+#include <execinfo.h>
+#include <sys/syscall.h>
+
+#include "kselftest.h"
+
+/* Dumps the current stack trace to stderr. */
+static void __attribute__((noinline)) test_dump_stack(void);
+static void test_dump_stack(void)
+{
+ /*
+ * Build and run this command:
+ *
+ * addr2line -s -e /proc/$PPID/exe -fpai {backtrace addresses} | \
+ * grep -v test_dump_stack | cat -n 1>&2
+ *
+ * Note that the spacing is different and there's no newline.
+ */
+ size_t i;
+ size_t n = 20;
+ void *stack[n];
+ const char *addr2line = "addr2line -s -e /proc/$PPID/exe -fpai";
+ const char *pipeline = "|cat -n 1>&2";
+ char cmd[strlen(addr2line) + strlen(pipeline) +
+ /* N bytes per addr * 2 digits per byte + 1 space per addr: */
+ n * (((sizeof(void *)) * 2) + 1) +
+ /* Null terminator: */
+ 1];
+ char *c;
+
+ n = backtrace(stack, n);
+ c = &cmd[0];
+ c += sprintf(c, "%s", addr2line);
+ /*
+ * Skip the first 3 frames: backtrace, test_dump_stack, and
+ * test_assert. We hope that backtrace isn't inlined and the other two
+ * we've declared noinline.
+ */
+ for (i = 2; i < n; i++)
+ c += sprintf(c, " %lx", ((unsigned long) stack[i]) - 1);
+ c += sprintf(c, "%s", pipeline);
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-result"
+ system(cmd);
+#pragma GCC diagnostic pop
+}
+
+static pid_t _gettid(void)
+{
+ return syscall(SYS_gettid);
+}
+
+void __attribute__((noinline))
+test_assert(bool exp, const char *exp_str,
+ const char *file, unsigned int line, const char *fmt, ...)
+{
+ va_list ap;
+
+ if (!(exp)) {
+ va_start(ap, fmt);
+
+ fprintf(stderr, "==== Test Assertion Failure ====\n"
+ " %s:%u: %s\n"
+ " pid=%d tid=%d - %s\n",
+ file, line, exp_str, getpid(), _gettid(),
+ strerror(errno));
+ test_dump_stack();
+ if (fmt) {
+ fputs(" ", stderr);
+ vfprintf(stderr, fmt, ap);
+ fputs("\n", stderr);
+ }
+ va_end(ap);
+
+ if (errno == EACCES) {
+ print_skip("Access denied - Exiting");
+ exit(KSFT_SKIP);
+ }
+ exit(254);
+ }
+
+ return;
+}
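+
+/*
+ * Illustrative sketch (editor's example, not part of this patch): tests
+ * normally reach test_assert() through the TEST_ASSERT() macro from
+ * test_util.h rather than calling it directly, e.g.:
+ *
+ *	int fd = open(KVM_DEV_PATH, O_RDONLY);
+ *	TEST_ASSERT(fd >= 0, "open /dev/kvm failed, errno: %i", errno);
+ */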
diff --git a/tools/testing/selftests/kvm/lib/elf.c b/tools/testing/selftests/kvm/lib/elf.c
new file mode 100644
index 000000000..bc75a91e0
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/elf.c
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tools/testing/selftests/kvm/lib/elf.c
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+
+#include "test_util.h"
+
+#include <bits/endian.h>
+#include <linux/elf.h>
+
+#include "kvm_util.h"
+#include "kvm_util_internal.h"
+
+static void elfhdr_get(const char *filename, Elf64_Ehdr *hdrp)
+{
+ off_t offset_rv;
+
+ /* Open the ELF file. */
+ int fd;
+ fd = open(filename, O_RDONLY);
+ TEST_ASSERT(fd >= 0, "Failed to open ELF file,\n"
+ " filename: %s\n"
+ " rv: %i errno: %i", filename, fd, errno);
+
+ /* Read in and validate ELF Identification Record.
+ * The ELF Identification record is the first 16 (EI_NIDENT) bytes
+ * of the ELF header, which is at the beginning of the ELF file.
+ * For now it is only safe to read the first EI_NIDENT bytes. Once
+ * read and validated, the value of e_ehsize can be used to determine
+ * the real size of the ELF header.
+ */
+ unsigned char ident[EI_NIDENT];
+ test_read(fd, ident, sizeof(ident));
+ TEST_ASSERT((ident[EI_MAG0] == ELFMAG0) && (ident[EI_MAG1] == ELFMAG1)
+ && (ident[EI_MAG2] == ELFMAG2) && (ident[EI_MAG3] == ELFMAG3),
+ "ELF MAGIC Mismatch,\n"
+ " filename: %s\n"
+ " ident[EI_MAG0 - EI_MAG3]: %02x %02x %02x %02x\n"
+ " Expected: %02x %02x %02x %02x",
+ filename,
+ ident[EI_MAG0], ident[EI_MAG1], ident[EI_MAG2], ident[EI_MAG3],
+ ELFMAG0, ELFMAG1, ELFMAG2, ELFMAG3);
+ TEST_ASSERT(ident[EI_CLASS] == ELFCLASS64,
+ "Current implementation only able to handle ELFCLASS64,\n"
+ " filename: %s\n"
+ " ident[EI_CLASS]: %02x\n"
+ " expected: %02x",
+ filename,
+ ident[EI_CLASS], ELFCLASS64);
+ TEST_ASSERT(((BYTE_ORDER == LITTLE_ENDIAN)
+ && (ident[EI_DATA] == ELFDATA2LSB))
+ || ((BYTE_ORDER == BIG_ENDIAN)
+ && (ident[EI_DATA] == ELFDATA2MSB)), "Current "
+ "implementation only able to handle\n"
+ "cases where the host and ELF file endianness\n"
+ "is the same:\n"
+ " host BYTE_ORDER: %u\n"
+ " host LITTLE_ENDIAN: %u\n"
+ " host BIG_ENDIAN: %u\n"
+ " ident[EI_DATA]: %u\n"
+ " ELFDATA2LSB: %u\n"
+ " ELFDATA2MSB: %u",
+ BYTE_ORDER, LITTLE_ENDIAN, BIG_ENDIAN,
+ ident[EI_DATA], ELFDATA2LSB, ELFDATA2MSB);
+ TEST_ASSERT(ident[EI_VERSION] == EV_CURRENT,
+ "Current implementation only able to handle current "
+ "ELF version,\n"
+ " filename: %s\n"
+ " ident[EI_VERSION]: %02x\n"
+ " expected: %02x",
+ filename, ident[EI_VERSION], EV_CURRENT);
+
+ /* Read in the ELF header.
+ * With the ELF Identification portion of the ELF header
+ * validated, especially that the value at EI_VERSION is
+ * as expected, it is now safe to read the entire ELF header.
+ */
+ offset_rv = lseek(fd, 0, SEEK_SET);
+ TEST_ASSERT(offset_rv == 0, "Seek to ELF header failed,\n"
+ " rv: %zi expected: %i", offset_rv, 0);
+ test_read(fd, hdrp, sizeof(*hdrp));
+ TEST_ASSERT(hdrp->e_phentsize == sizeof(Elf64_Phdr),
+		"Unexpected program header size,\n"
+ " hdrp->e_phentsize: %x\n"
+ " expected: %zx",
+ hdrp->e_phentsize, sizeof(Elf64_Phdr));
+ TEST_ASSERT(hdrp->e_shentsize == sizeof(Elf64_Shdr),
+ "Unexpected section header size,\n"
+ " hdrp->e_shentsize: %x\n"
+ " expected: %zx",
+ hdrp->e_shentsize, sizeof(Elf64_Shdr));
+}
+
+/* VM ELF Load
+ *
+ * Input Args:
+ * filename - Path to ELF file
+ *
+ * Output Args: None
+ *
+ * Input/Output Args:
+ * vm - Pointer to opaque type that describes the VM.
+ *
+ * Return: None, TEST_ASSERT failures for all error conditions
+ *
+ * Loads the program image of the ELF file specified by filename into the
+ * virtual address space of the VM pointed to by vm. On entry, the VM must
+ * not already be using any of the virtual address range occupied by the
+ * image, and it must have sufficient available physical pages to back the
+ * virtual pages used to load the image.
+ */
+void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename,
+ uint32_t data_memslot, uint32_t pgd_memslot)
+{
+ off_t offset, offset_rv;
+ Elf64_Ehdr hdr;
+
+ /* Open the ELF file. */
+ int fd;
+ fd = open(filename, O_RDONLY);
+ TEST_ASSERT(fd >= 0, "Failed to open ELF file,\n"
+ " filename: %s\n"
+ " rv: %i errno: %i", filename, fd, errno);
+
+ /* Read in the ELF header. */
+ elfhdr_get(filename, &hdr);
+
+ /* For each program header.
+ * The following ELF header members specify the location
+ * and size of the program headers:
+ *
+ * e_phoff - File offset to start of program headers
+ * e_phentsize - Size of each program header
+ * e_phnum - Number of program header entries
+ */
+ for (unsigned int n1 = 0; n1 < hdr.e_phnum; n1++) {
+ /* Seek to the beginning of the program header. */
+ offset = hdr.e_phoff + (n1 * hdr.e_phentsize);
+ offset_rv = lseek(fd, offset, SEEK_SET);
+ TEST_ASSERT(offset_rv == offset,
+			"Failed to seek to beginning of program header %u,\n"
+ " filename: %s\n"
+ " rv: %jd errno: %i",
+ n1, filename, (intmax_t) offset_rv, errno);
+
+ /* Read in the program header. */
+ Elf64_Phdr phdr;
+ test_read(fd, &phdr, sizeof(phdr));
+
+ /* Skip if this header doesn't describe a loadable segment. */
+ if (phdr.p_type != PT_LOAD)
+ continue;
+
+ /* Allocate memory for this segment within the VM. */
+ TEST_ASSERT(phdr.p_memsz > 0, "Unexpected loadable segment "
+ "memsize of 0,\n"
+ " phdr index: %u p_memsz: 0x%" PRIx64,
+ n1, (uint64_t) phdr.p_memsz);
+ vm_vaddr_t seg_vstart = phdr.p_vaddr;
+ seg_vstart &= ~(vm_vaddr_t)(vm->page_size - 1);
+ vm_vaddr_t seg_vend = phdr.p_vaddr + phdr.p_memsz - 1;
+ seg_vend |= vm->page_size - 1;
+ size_t seg_size = seg_vend - seg_vstart + 1;
+
+ vm_vaddr_t vaddr = vm_vaddr_alloc(vm, seg_size, seg_vstart,
+ data_memslot, pgd_memslot);
+ TEST_ASSERT(vaddr == seg_vstart, "Unable to allocate "
+ "virtual memory for segment at requested min addr,\n"
+ " segment idx: %u\n"
+ " seg_vstart: 0x%lx\n"
+ " vaddr: 0x%lx",
+ n1, seg_vstart, vaddr);
+ memset(addr_gva2hva(vm, vaddr), 0, seg_size);
+ /* TODO(lhuemill): Set permissions of each memory segment
+ * based on the least-significant 3 bits of phdr.p_flags.
+ */
+
+ /* Load portion of initial state that is contained within
+ * the ELF file.
+ */
+ if (phdr.p_filesz) {
+ offset_rv = lseek(fd, phdr.p_offset, SEEK_SET);
+ TEST_ASSERT(offset_rv == phdr.p_offset,
+ "Seek to program segment offset failed,\n"
+ " program header idx: %u errno: %i\n"
+ " offset_rv: 0x%jx\n"
+ " expected: 0x%jx\n",
+ n1, errno, (intmax_t) offset_rv,
+ (intmax_t) phdr.p_offset);
+ test_read(fd, addr_gva2hva(vm, phdr.p_vaddr),
+ phdr.p_filesz);
+ }
+ }
+}
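+
+/*
+ * Illustrative sketch (editor's example, not part of this patch): tests
+ * normally load their own binary into a freshly created VM so that guest
+ * code runs at the same virtual addresses it has in the host process, e.g.:
+ *
+ *	kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
+ */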
diff --git a/tools/testing/selftests/kvm/lib/io.c b/tools/testing/selftests/kvm/lib/io.c
new file mode 100644
index 000000000..fedb2a741
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/io.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tools/testing/selftests/kvm/lib/io.c
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+
+#include "test_util.h"
+
+/* Test Write
+ *
+ * A wrapper for write(2), that automatically handles the following
+ * special conditions:
+ *
+ * + Interrupted system call (EINTR)
+ * + Write of less than requested amount
+ * + Non-block return (EAGAIN)
+ *
+ * For each of the above, an additional write is performed to automatically
+ * continue writing the requested data.
+ * There are also many cases where write(2) can return an unexpected
+ * error (e.g. EIO). Such errors cause a TEST_ASSERT failure.
+ *
+ * Note, for function signature compatibility with write(2), this function
+ * returns the number of bytes written, but that value will always be equal
+ * to the number of requested bytes. All other conditions in this and
+ * future enhancements to this function either automatically issue another
+ * write(2) or cause a TEST_ASSERT failure.
+ *
+ * Args:
+ *  fd - Opened file descriptor to file to be written.
+ *  buf - Starting address of data to be written.
+ *  count - Number of bytes to write.
+ *
+ * Return:
+ * On success, number of bytes written.
+ * On failure, a TEST_ASSERT failure is caused.
+ */
+ssize_t test_write(int fd, const void *buf, size_t count)
+{
+ ssize_t rc;
+ ssize_t num_written = 0;
+ size_t num_left = count;
+ const char *ptr = buf;
+
+ /* Note: Count of zero is allowed (see "RETURN VALUE" portion of
+	 * write(2) manpage for details).
+ */
+ TEST_ASSERT(count >= 0, "Unexpected count, count: %li", count);
+
+ do {
+ rc = write(fd, ptr, num_left);
+
+ switch (rc) {
+ case -1:
+ TEST_ASSERT(errno == EAGAIN || errno == EINTR,
+ "Unexpected write failure,\n"
+ " rc: %zi errno: %i", rc, errno);
+ continue;
+
+ case 0:
+ TEST_FAIL("Unexpected EOF,\n"
+ " rc: %zi num_written: %zi num_left: %zu",
+ rc, num_written, num_left);
+ break;
+
+ default:
+ TEST_ASSERT(rc >= 0, "Unexpected ret from write,\n"
+ " rc: %zi errno: %i", rc, errno);
+ num_written += rc;
+ num_left -= rc;
+ ptr += rc;
+ break;
+ }
+ } while (num_written < count);
+
+ return num_written;
+}
+
+/* Test Read
+ *
+ * A wrapper for read(2), that automatically handles the following
+ * special conditions:
+ *
+ * + Interrupted system call (EINTR)
+ * + Read of less than requested amount
+ * + Non-block return (EAGAIN)
+ *
+ * For each of the above, an additional read is performed to automatically
+ * continue reading the requested data.
+ * There are also many cases where read(2) can return an unexpected
+ * error (e.g. EIO). Such errors cause a TEST_ASSERT failure. Note,
+ * it is expected that the file opened by fd at the current file position
+ * contains at least the number of requested bytes to be read. A TEST_ASSERT
+ * failure is produced if an End-Of-File condition occurs before all the
+ * data is read. It is the caller's responsibility to ensure that sufficient
+ * data exists.
+ *
+ * Note, for function signature compatibility with read(2), this function
+ * returns the number of bytes read, but that value will always be equal
+ * to the number of requested bytes. All other conditions in this and
+ * future enhancements to this function either automatically issue another
+ * read(2) or cause a TEST_ASSERT failure.
+ *
+ * Args:
+ * fd - Opened file descriptor to file to be read.
+ * count - Number of bytes to read.
+ *
+ * Output:
+ * buf - Starting address of where to write the bytes read.
+ *
+ * Return:
+ * On success, number of bytes read.
+ * On failure, a TEST_ASSERT failure is caused.
+ */
+ssize_t test_read(int fd, void *buf, size_t count)
+{
+ ssize_t rc;
+ ssize_t num_read = 0;
+ size_t num_left = count;
+ char *ptr = buf;
+
+ /* Note: Count of zero is allowed (see "If count is zero" portion of
+	 * read(2) manpage for details).
+ */
+ TEST_ASSERT(count >= 0, "Unexpected count, count: %li", count);
+
+ do {
+ rc = read(fd, ptr, num_left);
+
+ switch (rc) {
+ case -1:
+ TEST_ASSERT(errno == EAGAIN || errno == EINTR,
+ "Unexpected read failure,\n"
+ " rc: %zi errno: %i", rc, errno);
+ break;
+
+ case 0:
+ TEST_FAIL("Unexpected EOF,\n"
+ " rc: %zi num_read: %zi num_left: %zu",
+ rc, num_read, num_left);
+ break;
+
+ default:
+ TEST_ASSERT(rc > 0, "Unexpected ret from read,\n"
+ " rc: %zi errno: %i", rc, errno);
+ num_read += rc;
+ num_left -= rc;
+ ptr += rc;
+ break;
+ }
+ } while (num_read < count);
+
+ return num_read;
+}
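+
+/*
+ * Illustrative sketch (editor's example, not part of this patch): both
+ * wrappers behave like their libc counterparts but retry short transfers,
+ * so callers may assume the full amount was transferred, e.g.:
+ *
+ *	test_read(fd, &hdr, sizeof(hdr));	// reads exactly sizeof(hdr) bytes
+ *	test_write(fd, buf, len);		// writes exactly len bytes
+ *
+ * fd, hdr, buf and len are hypothetical names used only for the example.
+ */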
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
new file mode 100644
index 000000000..49805fd16
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -0,0 +1,1865 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tools/testing/selftests/kvm/lib/kvm_util.c
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "kvm_util_internal.h"
+#include "processor.h"
+
+#include <assert.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <linux/kernel.h>
+
+#define KVM_UTIL_PGS_PER_HUGEPG 512
+#define KVM_UTIL_MIN_PFN 2
+
+/* Aligns x up to the next multiple of size. Size must be a power of 2. */
+static void *align(void *x, size_t size)
+{
+ size_t mask = size - 1;
+ TEST_ASSERT(size != 0 && !(size & (size - 1)),
+ "size not a power of 2: %lu", size);
+ return (void *) (((size_t) x + mask) & ~mask);
+}
+
+/*
+ * Capability
+ *
+ * Input Args:
+ * cap - Capability
+ *
+ * Output Args: None
+ *
+ * Return:
+ * On success, the value corresponding to the capability (KVM_CAP_*)
+ * specified by the value of cap. On failure, a TEST_ASSERT failure
+ * is produced.
+ *
+ * Looks up and returns the value corresponding to the capability
+ * (KVM_CAP_*) given by cap.
+ */
+int kvm_check_cap(long cap)
+{
+ int ret;
+ int kvm_fd;
+
+ kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
+ if (kvm_fd < 0)
+ exit(KSFT_SKIP);
+
+ ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap);
+ TEST_ASSERT(ret >= 0, "KVM_CHECK_EXTENSION IOCTL failed,\n"
+ " rc: %i errno: %i", ret, errno);
+
+ close(kvm_fd);
+
+ return ret;
+}
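+
+/*
+ * Illustrative sketch (editor's example, not part of this patch): tests
+ * typically use kvm_check_cap() to skip gracefully when a capability is
+ * unavailable, e.g.:
+ *
+ *	if (!kvm_check_cap(KVM_CAP_NESTED_STATE)) {
+ *		print_skip("KVM_CAP_NESTED_STATE not supported");
+ *		exit(KSFT_SKIP);
+ *	}
+ */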
+
+/* VM Enable Capability
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * cap - Capability
+ *
+ * Output Args: None
+ *
+ * Return: On success, 0. On failure a TEST_ASSERT failure is produced.
+ *
+ * Enables a capability (KVM_CAP_*) on the VM.
+ */
+int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap)
+{
+ int ret;
+
+ ret = ioctl(vm->fd, KVM_ENABLE_CAP, cap);
+ TEST_ASSERT(ret == 0, "KVM_ENABLE_CAP IOCTL failed,\n"
+ " rc: %i errno: %i", ret, errno);
+
+ return ret;
+}
+
+/* VCPU Enable Capability
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpu_id - VCPU
+ * cap - Capability
+ *
+ * Output Args: None
+ *
+ * Return: On success, 0. On failure a TEST_ASSERT failure is produced.
+ *
+ * Enables a capability (KVM_CAP_*) on the VCPU.
+ */
+int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id,
+ struct kvm_enable_cap *cap)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpu_id);
+ int r;
+
+ TEST_ASSERT(vcpu, "cannot find vcpu %d", vcpu_id);
+
+ r = ioctl(vcpu->fd, KVM_ENABLE_CAP, cap);
+ TEST_ASSERT(!r, "KVM_ENABLE_CAP vCPU ioctl failed,\n"
+ " rc: %i, errno: %i", r, errno);
+
+ return r;
+}
+
+static void vm_open(struct kvm_vm *vm, int perm)
+{
+ vm->kvm_fd = open(KVM_DEV_PATH, perm);
+ if (vm->kvm_fd < 0)
+ exit(KSFT_SKIP);
+
+ if (!kvm_check_cap(KVM_CAP_IMMEDIATE_EXIT)) {
+ print_skip("immediate_exit not available");
+ exit(KSFT_SKIP);
+ }
+
+ vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, vm->type);
+ TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, "
+ "rc: %i errno: %i", vm->fd, errno);
+}
+
+const char * const vm_guest_mode_string[] = {
+ "PA-bits:52, VA-bits:48, 4K pages",
+ "PA-bits:52, VA-bits:48, 64K pages",
+ "PA-bits:48, VA-bits:48, 4K pages",
+ "PA-bits:48, VA-bits:48, 64K pages",
+ "PA-bits:40, VA-bits:48, 4K pages",
+ "PA-bits:40, VA-bits:48, 64K pages",
+ "PA-bits:ANY, VA-bits:48, 4K pages",
+};
+_Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES,
+ "Missing new mode strings?");
+
+struct vm_guest_mode_params {
+ unsigned int pa_bits;
+ unsigned int va_bits;
+ unsigned int page_size;
+ unsigned int page_shift;
+};
+
+static const struct vm_guest_mode_params vm_guest_mode_params[] = {
+ { 52, 48, 0x1000, 12 },
+ { 52, 48, 0x10000, 16 },
+ { 48, 48, 0x1000, 12 },
+ { 48, 48, 0x10000, 16 },
+ { 40, 48, 0x1000, 12 },
+ { 40, 48, 0x10000, 16 },
+ { 0, 0, 0x1000, 12 },
+};
+_Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES,
+ "Missing new mode params?");
+
+/*
+ * VM Create
+ *
+ * Input Args:
+ * mode - VM Mode (e.g. VM_MODE_P52V48_4K)
+ * phy_pages - Physical memory pages
+ * perm - permission
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Pointer to opaque structure that describes the created VM.
+ *
+ * Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K).
+ * When phy_pages is non-zero, a memory region of phy_pages physical pages
+ * is created and mapped starting at guest physical address 0. The file
+ * descriptor to control the created VM is created with the permissions
+ * given by perm (e.g. O_RDWR).
+ */
+struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
+{
+ struct kvm_vm *vm;
+
+ pr_debug("%s: mode='%s' pages='%ld' perm='%d'\n", __func__,
+ vm_guest_mode_string(mode), phy_pages, perm);
+
+ vm = calloc(1, sizeof(*vm));
+ TEST_ASSERT(vm != NULL, "Insufficient Memory");
+
+ INIT_LIST_HEAD(&vm->vcpus);
+ INIT_LIST_HEAD(&vm->userspace_mem_regions);
+
+ vm->mode = mode;
+ vm->type = 0;
+
+ vm->pa_bits = vm_guest_mode_params[mode].pa_bits;
+ vm->va_bits = vm_guest_mode_params[mode].va_bits;
+ vm->page_size = vm_guest_mode_params[mode].page_size;
+ vm->page_shift = vm_guest_mode_params[mode].page_shift;
+
+ /* Setup mode specific traits. */
+ switch (vm->mode) {
+ case VM_MODE_P52V48_4K:
+ vm->pgtable_levels = 4;
+ break;
+ case VM_MODE_P52V48_64K:
+ vm->pgtable_levels = 3;
+ break;
+ case VM_MODE_P48V48_4K:
+ vm->pgtable_levels = 4;
+ break;
+ case VM_MODE_P48V48_64K:
+ vm->pgtable_levels = 3;
+ break;
+ case VM_MODE_P40V48_4K:
+ vm->pgtable_levels = 4;
+ break;
+ case VM_MODE_P40V48_64K:
+ vm->pgtable_levels = 3;
+ break;
+ case VM_MODE_PXXV48_4K:
+#ifdef __x86_64__
+ kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits);
+ /*
+ * Ignore KVM support for 5-level paging (vm->va_bits == 57),
+		 * as it doesn't take effect unless CR4.LA57 is set, which it
+ * isn't for this VM_MODE.
+ */
+ TEST_ASSERT(vm->va_bits == 48 || vm->va_bits == 57,
+ "Linear address width (%d bits) not supported",
+ vm->va_bits);
+ pr_debug("Guest physical address width detected: %d\n",
+ vm->pa_bits);
+ vm->pgtable_levels = 4;
+ vm->va_bits = 48;
+#else
+ TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms");
+#endif
+ break;
+ default:
+ TEST_FAIL("Unknown guest mode, mode: 0x%x", mode);
+ }
+
+#ifdef __aarch64__
+ if (vm->pa_bits != 40)
+ vm->type = KVM_VM_TYPE_ARM_IPA_SIZE(vm->pa_bits);
+#endif
+
+ vm_open(vm, perm);
+
+ /* Limit to VA-bit canonical virtual addresses. */
+ vm->vpages_valid = sparsebit_alloc();
+ sparsebit_set_num(vm->vpages_valid,
+ 0, (1ULL << (vm->va_bits - 1)) >> vm->page_shift);
+ sparsebit_set_num(vm->vpages_valid,
+ (~((1ULL << (vm->va_bits - 1)) - 1)) >> vm->page_shift,
+ (1ULL << (vm->va_bits - 1)) >> vm->page_shift);
+
+ /* Limit physical addresses to PA-bits. */
+ vm->max_gfn = ((1ULL << vm->pa_bits) >> vm->page_shift) - 1;
+
+ /* Allocate and setup memory for guest. */
+ vm->vpages_mapped = sparsebit_alloc();
+ if (phy_pages != 0)
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ 0, 0, phy_pages, 0);
+
+ return vm;
+}
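+
+/*
+ * Illustrative sketch (editor's example, not part of this patch): a minimal
+ * test creates a VM, adds a vCPU running guest_code, and frees everything at
+ * the end, e.g.:
+ *
+ *	vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR);
+ *	kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
+ *	vm_vcpu_add_default(vm, 0, guest_code);
+ *	... run the vCPU, check ucalls ...
+ *	kvm_vm_free(vm);
+ *
+ * guest_code is a hypothetical guest entry point; most tests use the
+ * architecture's vm_create_default() wrapper, which performs these steps.
+ */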
+
+/*
+ * VM Restart
+ *
+ * Input Args:
+ * vm - VM that has been released before
+ * perm - permission
+ *
+ * Output Args: None
+ *
+ * Reopens the file descriptors associated to the VM and reinstates the
+ * global state, such as the irqchip and the memory regions that are mapped
+ * into the guest.
+ */
+void kvm_vm_restart(struct kvm_vm *vmp, int perm)
+{
+ struct userspace_mem_region *region;
+
+ vm_open(vmp, perm);
+ if (vmp->has_irqchip)
+ vm_create_irqchip(vmp);
+
+ list_for_each_entry(region, &vmp->userspace_mem_regions, list) {
+ int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, &region->region);
+ TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
+ " rc: %i errno: %i\n"
+ " slot: %u flags: 0x%x\n"
+ " guest_phys_addr: 0x%llx size: 0x%llx",
+ ret, errno, region->region.slot,
+ region->region.flags,
+ region->region.guest_phys_addr,
+ region->region.memory_size);
+ }
+}
+
+void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log)
+{
+ struct kvm_dirty_log args = { .dirty_bitmap = log, .slot = slot };
+ int ret;
+
+ ret = ioctl(vm->fd, KVM_GET_DIRTY_LOG, &args);
+ TEST_ASSERT(ret == 0, "%s: KVM_GET_DIRTY_LOG failed: %s",
+ __func__, strerror(-ret));
+}
+
+void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log,
+ uint64_t first_page, uint32_t num_pages)
+{
+ struct kvm_clear_dirty_log args = { .dirty_bitmap = log, .slot = slot,
+ .first_page = first_page,
+ .num_pages = num_pages };
+ int ret;
+
+ ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args);
+ TEST_ASSERT(ret == 0, "%s: KVM_CLEAR_DIRTY_LOG failed: %s",
+ __func__, strerror(-ret));
+}
+
+/*
+ * Userspace Memory Region Find
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * start - Starting VM physical address
+ * end - Ending VM physical address, inclusive.
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Pointer to overlapping region, NULL if no such region.
+ *
+ * Searches for a region with any physical memory that overlaps with
+ * any portion of the guest physical addresses from start to end
+ * inclusive. If multiple overlapping regions exist, a pointer to any
+ * of the regions is returned. Null is returned only when no overlapping
+ * region exists.
+ */
+static struct userspace_mem_region *
+userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end)
+{
+ struct userspace_mem_region *region;
+
+ list_for_each_entry(region, &vm->userspace_mem_regions, list) {
+ uint64_t existing_start = region->region.guest_phys_addr;
+ uint64_t existing_end = region->region.guest_phys_addr
+ + region->region.memory_size - 1;
+ if (start <= existing_end && end >= existing_start)
+ return region;
+ }
+
+ return NULL;
+}
+
+/*
+ * KVM Userspace Memory Region Find
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * start - Starting VM physical address
+ * end - Ending VM physical address, inclusive.
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Pointer to overlapping region, NULL if no such region.
+ *
+ * Public interface to userspace_mem_region_find. Allows tests to look up
+ * the memslot datastructure for a given range of guest physical memory.
+ */
+struct kvm_userspace_memory_region *
+kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
+ uint64_t end)
+{
+ struct userspace_mem_region *region;
+
+ region = userspace_mem_region_find(vm, start, end);
+ if (!region)
+ return NULL;
+
+ return &region->region;
+}
+
+/*
+ * VCPU Find
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Pointer to VCPU structure
+ *
+ * Locates a vcpu structure that describes the VCPU specified by vcpuid and
+ * returns a pointer to it. Returns NULL if the VM doesn't contain a VCPU
+ * for the specified vcpuid.
+ */
+struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ struct vcpu *vcpu;
+
+ list_for_each_entry(vcpu, &vm->vcpus, list) {
+ if (vcpu->id == vcpuid)
+ return vcpu;
+ }
+
+ return NULL;
+}
+
+/*
+ * VM VCPU Remove
+ *
+ * Input Args:
+ * vcpu - VCPU to remove
+ *
+ * Output Args: None
+ *
+ * Return: None, TEST_ASSERT failures for all error conditions
+ *
+ * Removes a vCPU from a VM and frees its resources.
+ */
+static void vm_vcpu_rm(struct vcpu *vcpu)
+{
+ int ret;
+
+ ret = munmap(vcpu->state, sizeof(*vcpu->state));
+ TEST_ASSERT(ret == 0, "munmap of VCPU fd failed, rc: %i "
+ "errno: %i", ret, errno);
+	ret = close(vcpu->fd);
+ TEST_ASSERT(ret == 0, "Close of VCPU fd failed, rc: %i "
+ "errno: %i", ret, errno);
+
+ list_del(&vcpu->list);
+ free(vcpu);
+}
+
+void kvm_vm_release(struct kvm_vm *vmp)
+{
+ struct vcpu *vcpu, *tmp;
+ int ret;
+
+ list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list)
+ vm_vcpu_rm(vcpu);
+
+ ret = close(vmp->fd);
+ TEST_ASSERT(ret == 0, "Close of vm fd failed,\n"
+ " vmp->fd: %i rc: %i errno: %i", vmp->fd, ret, errno);
+
+	ret = close(vmp->kvm_fd);
+ TEST_ASSERT(ret == 0, "Close of /dev/kvm fd failed,\n"
+ " vmp->kvm_fd: %i rc: %i errno: %i", vmp->kvm_fd, ret, errno);
+}
+
+static void __vm_mem_region_delete(struct kvm_vm *vm,
+ struct userspace_mem_region *region)
+{
+ int ret;
+
+ list_del(&region->list);
+
+ region->region.memory_size = 0;
+ ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region->region);
+ TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed, "
+ "rc: %i errno: %i", ret, errno);
+
+ sparsebit_free(&region->unused_phy_pages);
+ ret = munmap(region->mmap_start, region->mmap_size);
+ TEST_ASSERT(ret == 0, "munmap failed, rc: %i errno: %i", ret, errno);
+
+ free(region);
+}
+
+/*
+ * Destroys and frees the VM pointed to by vmp.
+ */
+void kvm_vm_free(struct kvm_vm *vmp)
+{
+ struct userspace_mem_region *region, *tmp;
+
+ if (vmp == NULL)
+ return;
+
+ /* Free userspace_mem_regions. */
+ list_for_each_entry_safe(region, tmp, &vmp->userspace_mem_regions, list)
+ __vm_mem_region_delete(vmp, region);
+
+ /* Free sparsebit arrays. */
+ sparsebit_free(&vmp->vpages_valid);
+ sparsebit_free(&vmp->vpages_mapped);
+
+ kvm_vm_release(vmp);
+
+ /* Free the structure describing the VM. */
+ free(vmp);
+}
+
+/*
+ * Memory Compare, host virtual to guest virtual
+ *
+ * Input Args:
+ * hva - Starting host virtual address
+ * vm - Virtual Machine
+ * gva - Starting guest virtual address
+ * len - number of bytes to compare
+ *
+ * Output Args: None
+ *
+ * Input/Output Args: None
+ *
+ * Return:
+ * Returns 0 if the bytes starting at hva for a length of len
+ * are equal to the guest virtual bytes starting at gva. Returns
+ * a value < 0 if the bytes at hva are less than those at gva.
+ * Otherwise a value > 0 is returned.
+ *
+ * Compares the bytes starting at the host virtual address hva, for
+ * a length of len, to the guest bytes starting at the guest virtual
+ * address given by gva.
+ */
+int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, vm_vaddr_t gva, size_t len)
+{
+ size_t amt;
+
+ /*
+ * Compare a batch of bytes until either a match is found
+ * or all the bytes have been compared.
+ */
+ for (uintptr_t offset = 0; offset < len; offset += amt) {
+ uintptr_t ptr1 = (uintptr_t)hva + offset;
+
+ /*
+ * Determine host address for guest virtual address
+ * at offset.
+ */
+ uintptr_t ptr2 = (uintptr_t)addr_gva2hva(vm, gva + offset);
+
+ /*
+ * Determine amount to compare on this pass.
+		 * Don't allow the comparison to cross a page boundary.
+ */
+ amt = len - offset;
+ if ((ptr1 >> vm->page_shift) != ((ptr1 + amt) >> vm->page_shift))
+ amt = vm->page_size - (ptr1 % vm->page_size);
+ if ((ptr2 >> vm->page_shift) != ((ptr2 + amt) >> vm->page_shift))
+ amt = vm->page_size - (ptr2 % vm->page_size);
+
+ assert((ptr1 >> vm->page_shift) == ((ptr1 + amt - 1) >> vm->page_shift));
+ assert((ptr2 >> vm->page_shift) == ((ptr2 + amt - 1) >> vm->page_shift));
+
+ /*
+ * Perform the comparison. If there is a difference
+		 * return that result to the caller, otherwise continue
+		 * looking for a mismatch.
+ */
+ int ret = memcmp((void *)ptr1, (void *)ptr2, amt);
+ if (ret != 0)
+ return ret;
+ }
+
+ /*
+ * No mismatch found. Let the caller know the two memory
+ * areas are equal.
+ */
+ return 0;
+}
+
+/*
+ * VM Userspace Memory Region Add
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * src_type - Storage source for this region (e.g. VM_MEM_SRC_ANONYMOUS)
+ * guest_paddr - Starting guest physical address
+ * slot - KVM region slot
+ * npages - Number of physical pages
+ * flags - KVM memory region flags (e.g. KVM_MEM_LOG_DIRTY_PAGES)
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Allocates a memory area of the number of pages specified by npages
+ * and maps it to the VM specified by vm, at a starting physical address
+ * given by guest_paddr. The region is created with a KVM region slot
+ * given by slot, which must be unique and < KVM_MEM_SLOTS_NUM. The
+ * region is created with the flags given by flags.
+ */
+void vm_userspace_mem_region_add(struct kvm_vm *vm,
+ enum vm_mem_backing_src_type src_type,
+ uint64_t guest_paddr, uint32_t slot, uint64_t npages,
+ uint32_t flags)
+{
+ int ret;
+ struct userspace_mem_region *region;
+ size_t huge_page_size = KVM_UTIL_PGS_PER_HUGEPG * vm->page_size;
+ size_t alignment;
+
+ TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages,
+ "Number of guest pages is not compatible with the host. "
+ "Try npages=%d", vm_adjust_num_guest_pages(vm->mode, npages));
+
+ TEST_ASSERT((guest_paddr % vm->page_size) == 0, "Guest physical "
+ "address not on a page boundary.\n"
+ " guest_paddr: 0x%lx vm->page_size: 0x%x",
+ guest_paddr, vm->page_size);
+ TEST_ASSERT((((guest_paddr >> vm->page_shift) + npages) - 1)
+ <= vm->max_gfn, "Physical range beyond maximum "
+ "supported physical address,\n"
+ " guest_paddr: 0x%lx npages: 0x%lx\n"
+ " vm->max_gfn: 0x%lx vm->page_size: 0x%x",
+ guest_paddr, npages, vm->max_gfn, vm->page_size);
+
+ /*
+ * Confirm a mem region with an overlapping address doesn't
+ * already exist.
+ */
+ region = (struct userspace_mem_region *) userspace_mem_region_find(
+ vm, guest_paddr, (guest_paddr + npages * vm->page_size) - 1);
+ if (region != NULL)
+ TEST_FAIL("overlapping userspace_mem_region already "
+ "exists\n"
+ " requested guest_paddr: 0x%lx npages: 0x%lx "
+ "page_size: 0x%x\n"
+ " existing guest_paddr: 0x%lx size: 0x%lx",
+ guest_paddr, npages, vm->page_size,
+ (uint64_t) region->region.guest_phys_addr,
+ (uint64_t) region->region.memory_size);
+
+ /* Confirm no region with the requested slot already exists. */
+ list_for_each_entry(region, &vm->userspace_mem_regions, list) {
+ if (region->region.slot != slot)
+ continue;
+
+ TEST_FAIL("A mem region with the requested slot "
+ "already exists.\n"
+ " requested slot: %u paddr: 0x%lx npages: 0x%lx\n"
+ " existing slot: %u paddr: 0x%lx size: 0x%lx",
+ slot, guest_paddr, npages,
+ region->region.slot,
+ (uint64_t) region->region.guest_phys_addr,
+ (uint64_t) region->region.memory_size);
+ }
+
+ /* Allocate and initialize new mem region structure. */
+ region = calloc(1, sizeof(*region));
+ TEST_ASSERT(region != NULL, "Insufficient Memory");
+ region->mmap_size = npages * vm->page_size;
+
+#ifdef __s390x__
+ /* On s390x, the host address must be aligned to 1M (due to PGSTEs) */
+ alignment = 0x100000;
+#else
+ alignment = 1;
+#endif
+
+ if (src_type == VM_MEM_SRC_ANONYMOUS_THP)
+ alignment = max(huge_page_size, alignment);
+
+ /* Add enough memory to align up if necessary */
+ if (alignment > 1)
+ region->mmap_size += alignment;
+
+ region->mmap_start = mmap(NULL, region->mmap_size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS
+ | (src_type == VM_MEM_SRC_ANONYMOUS_HUGETLB ? MAP_HUGETLB : 0),
+ -1, 0);
+ TEST_ASSERT(region->mmap_start != MAP_FAILED,
+		    "mmap failed, mmap_start: %p errno: %i",
+ region->mmap_start, errno);
+
+ /* Align host address */
+ region->host_mem = align(region->mmap_start, alignment);
+
+ /* As needed perform madvise */
+ if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == VM_MEM_SRC_ANONYMOUS_THP) {
+ struct stat statbuf;
+
+ ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
+ TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
+ "stat /sys/kernel/mm/transparent_hugepage");
+
+ TEST_ASSERT(ret == 0 || src_type != VM_MEM_SRC_ANONYMOUS_THP,
+ "VM_MEM_SRC_ANONYMOUS_THP requires THP to be configured in the host kernel");
+
+ if (ret == 0) {
+ ret = madvise(region->host_mem, npages * vm->page_size,
+ src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE);
+ TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %x",
+ region->host_mem, npages * vm->page_size, src_type);
+ }
+ }
+
+ region->unused_phy_pages = sparsebit_alloc();
+ sparsebit_set_num(region->unused_phy_pages,
+ guest_paddr >> vm->page_shift, npages);
+ region->region.slot = slot;
+ region->region.flags = flags;
+ region->region.guest_phys_addr = guest_paddr;
+ region->region.memory_size = npages * vm->page_size;
+ region->region.userspace_addr = (uintptr_t) region->host_mem;
+ ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region->region);
+ TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
+ " rc: %i errno: %i\n"
+ " slot: %u flags: 0x%x\n"
+ " guest_phys_addr: 0x%lx size: 0x%lx",
+ ret, errno, slot, flags,
+ guest_paddr, (uint64_t) region->region.memory_size);
+
+ /* Add to linked-list of memory regions. */
+ list_add(&region->list, &vm->userspace_mem_regions);
+}
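+
+/*
+ * Illustrative sketch (editor's example, not part of this patch): an
+ * additional memslot with dirty logging enabled could be added as follows
+ * (gpa 0x10000000, slot 1 and 512 pages are made-up example values):
+ *
+ *	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ *				    0x10000000, 1, 512,
+ *				    KVM_MEM_LOG_DIRTY_PAGES);
+ */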
+
+/*
+ * Memslot to region
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * memslot - KVM memory slot ID
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Pointer to memory region structure that describe memory region
+ * using kvm memory slot ID given by memslot. TEST_ASSERT failure
+ * on error (e.g. currently no memory region using memslot as a KVM
+ * memory slot ID).
+ */
+struct userspace_mem_region *
+memslot2region(struct kvm_vm *vm, uint32_t memslot)
+{
+ struct userspace_mem_region *region;
+
+ list_for_each_entry(region, &vm->userspace_mem_regions, list) {
+ if (region->region.slot == memslot)
+ return region;
+ }
+
+ fprintf(stderr, "No mem region with the requested slot found,\n"
+ " requested slot: %u\n", memslot);
+ fputs("---- vm dump ----\n", stderr);
+ vm_dump(stderr, vm, 2);
+ TEST_FAIL("Mem region not found");
+ return NULL;
+}
+
+/*
+ * VM Memory Region Flags Set
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * slot - Slot of the memory region to be modified
+ * flags - KVM memory region flags (e.g. KVM_MEM_LOG_DIRTY_PAGES)
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Sets the flags of the memory region specified by slot to the
+ * values given by flags.
+ */
+void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags)
+{
+ int ret;
+ struct userspace_mem_region *region;
+
+ region = memslot2region(vm, slot);
+
+ region->region.flags = flags;
+
+ ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region->region);
+
+ TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
+ " rc: %i errno: %i slot: %u flags: 0x%x",
+ ret, errno, slot, flags);
+}
+
+/*
+ * VM Memory Region Move
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * slot - Slot of the memory region to move
+ * new_gpa - Starting guest physical address
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Change the gpa of a memory region.
+ */
+void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa)
+{
+ struct userspace_mem_region *region;
+ int ret;
+
+ region = memslot2region(vm, slot);
+
+ region->region.guest_phys_addr = new_gpa;
+
+ ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region->region);
+
+ TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION failed\n"
+ "ret: %i errno: %i slot: %u new_gpa: 0x%lx",
+ ret, errno, slot, new_gpa);
+}
+
+/*
+ * VM Memory Region Delete
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * slot - Slot of the memory region to delete
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Delete a memory region.
+ */
+void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot)
+{
+ __vm_mem_region_delete(vm, memslot2region(vm, slot));
+}
+
+/*
+ * VCPU mmap Size
+ *
+ * Input Args: None
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Size of VCPU state
+ *
+ * Returns the size of the structure pointed to by the return value
+ * of vcpu_state().
+ */
+static int vcpu_mmap_sz(void)
+{
+ int dev_fd, ret;
+
+ dev_fd = open(KVM_DEV_PATH, O_RDONLY);
+ if (dev_fd < 0)
+ exit(KSFT_SKIP);
+
+ ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL);
+ TEST_ASSERT(ret >= sizeof(struct kvm_run),
+ "%s KVM_GET_VCPU_MMAP_SIZE ioctl failed, rc: %i errno: %i",
+ __func__, ret, errno);
+
+ close(dev_fd);
+
+ return ret;
+}
+
+/*
+ * VM VCPU Add
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Adds a virtual CPU to the VM specified by vm with the ID given by vcpuid.
+ * No additional VCPU setup is done.
+ */
+void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ struct vcpu *vcpu;
+
+ /* Confirm a vcpu with the specified id doesn't already exist. */
+ vcpu = vcpu_find(vm, vcpuid);
+ if (vcpu != NULL)
+ TEST_FAIL("vcpu with the specified id "
+ "already exists,\n"
+ " requested vcpuid: %u\n"
+ " existing vcpuid: %u state: %p",
+ vcpuid, vcpu->id, vcpu->state);
+
+ /* Allocate and initialize new vcpu structure. */
+ vcpu = calloc(1, sizeof(*vcpu));
+ TEST_ASSERT(vcpu != NULL, "Insufficient Memory");
+ vcpu->id = vcpuid;
+ vcpu->fd = ioctl(vm->fd, KVM_CREATE_VCPU, vcpuid);
+ TEST_ASSERT(vcpu->fd >= 0, "KVM_CREATE_VCPU failed, rc: %i errno: %i",
+ vcpu->fd, errno);
+
+ TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->state), "vcpu mmap size "
+ "smaller than expected, vcpu_mmap_sz: %i expected_min: %zi",
+ vcpu_mmap_sz(), sizeof(*vcpu->state));
+ vcpu->state = (struct kvm_run *) mmap(NULL, sizeof(*vcpu->state),
+ PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, 0);
+ TEST_ASSERT(vcpu->state != MAP_FAILED, "mmap vcpu_state failed, "
+ "vcpu id: %u errno: %i", vcpuid, errno);
+
+ /* Add to linked-list of VCPUs. */
+ list_add(&vcpu->list, &vm->vcpus);
+}
+
+/*
+ * VM Virtual Address Unused Gap
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * sz - Size (bytes)
+ * vaddr_min - Minimum Virtual Address
+ *
+ * Output Args: None
+ *
+ * Return:
+ *   Lowest virtual address at or above vaddr_min, with at least
+ * sz unused bytes. TEST_ASSERT failure if no area of at least
+ * size sz is available.
+ *
+ * Within the VM specified by vm, locates the lowest starting virtual
+ * address >= vaddr_min, that has at least sz unallocated bytes. A
+ * TEST_ASSERT failure occurs for invalid input or no area of at least
+ * sz unallocated bytes >= vaddr_min is available.
+ */
+static vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz,
+ vm_vaddr_t vaddr_min)
+{
+ uint64_t pages = (sz + vm->page_size - 1) >> vm->page_shift;
+
+ /* Determine lowest permitted virtual page index. */
+ uint64_t pgidx_start = (vaddr_min + vm->page_size - 1) >> vm->page_shift;
+ if ((pgidx_start * vm->page_size) < vaddr_min)
+ goto no_va_found;
+
+ /* Loop over section with enough valid virtual page indexes. */
+ if (!sparsebit_is_set_num(vm->vpages_valid,
+ pgidx_start, pages))
+ pgidx_start = sparsebit_next_set_num(vm->vpages_valid,
+ pgidx_start, pages);
+ do {
+ /*
+ * Are there enough unused virtual pages available at
+ * the currently proposed starting virtual page index.
+ * If not, adjust proposed starting index to next
+ * possible.
+ */
+ if (sparsebit_is_clear_num(vm->vpages_mapped,
+ pgidx_start, pages))
+ goto va_found;
+ pgidx_start = sparsebit_next_clear_num(vm->vpages_mapped,
+ pgidx_start, pages);
+ if (pgidx_start == 0)
+ goto no_va_found;
+
+ /*
+ * If needed, adjust proposed starting virtual address,
+ * to next range of valid virtual addresses.
+ */
+ if (!sparsebit_is_set_num(vm->vpages_valid,
+ pgidx_start, pages)) {
+ pgidx_start = sparsebit_next_set_num(
+ vm->vpages_valid, pgidx_start, pages);
+ if (pgidx_start == 0)
+ goto no_va_found;
+ }
+ } while (pgidx_start != 0);
+
+no_va_found:
+ TEST_FAIL("No vaddr of specified pages available, pages: 0x%lx", pages);
+
+ /* NOT REACHED */
+ return -1;
+
+va_found:
+ TEST_ASSERT(sparsebit_is_set_num(vm->vpages_valid,
+ pgidx_start, pages),
+ "Unexpected, invalid virtual page index range,\n"
+ " pgidx_start: 0x%lx\n"
+ " pages: 0x%lx",
+ pgidx_start, pages);
+ TEST_ASSERT(sparsebit_is_clear_num(vm->vpages_mapped,
+ pgidx_start, pages),
+ "Unexpected, pages already mapped,\n"
+ " pgidx_start: 0x%lx\n"
+ " pages: 0x%lx",
+ pgidx_start, pages);
+
+ return pgidx_start * vm->page_size;
+}
+
+/*
+ * VM Virtual Address Allocate
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * sz - Size in bytes
+ * vaddr_min - Minimum starting virtual address
+ * data_memslot - Memory region slot for data pages
+ * pgd_memslot - Memory region slot for new virtual translation tables
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Starting guest virtual address
+ *
+ * Allocates at least sz bytes within the virtual address space of the vm
+ * given by vm. The allocated bytes are mapped to a virtual address >=
+ * the address given by vaddr_min.  Note that each allocation uses a
+ * unique set of pages, with the minimum real allocation being at least
+ * a page.
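+ *
+ * A minimal usage sketch (illustrative only; KVM_UTIL_MIN_VADDR is just a
+ * convenient minimum, any suitable page-aligned value works):
+ *
+ *	vm_vaddr_t gva = vm_vaddr_alloc(vm, getpagesize(),
+ *					KVM_UTIL_MIN_VADDR, 0, 0);
+ *
+ *	memset(addr_gva2hva(vm, gva), 0, getpagesize());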
+ */
+vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
+ uint32_t data_memslot, uint32_t pgd_memslot)
+{
+ uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0);
+
+ virt_pgd_alloc(vm, pgd_memslot);
+
+ /*
+ * Find an unused range of virtual page addresses of at least
+ * pages in length.
+ */
+ vm_vaddr_t vaddr_start = vm_vaddr_unused_gap(vm, sz, vaddr_min);
+
+ /* Map the virtual pages. */
+ for (vm_vaddr_t vaddr = vaddr_start; pages > 0;
+ pages--, vaddr += vm->page_size) {
+ vm_paddr_t paddr;
+
+ paddr = vm_phy_page_alloc(vm,
+ KVM_UTIL_MIN_PFN * vm->page_size, data_memslot);
+
+ virt_pg_map(vm, vaddr, paddr, pgd_memslot);
+
+ sparsebit_set(vm->vpages_mapped,
+ vaddr >> vm->page_shift);
+ }
+
+ return vaddr_start;
+}
+
+/*
+ * Map a range of VM virtual addresses to the VM's physical addresses
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ *   vaddr - Virtual address to map
+ * paddr - VM Physical Address
+ * npages - The number of pages to map
+ * pgd_memslot - Memory region slot for new virtual translation tables
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Within the VM given by @vm, creates a virtual translation for
+ * @npages starting at @vaddr to the page range starting at @paddr.
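+ *
+ * Tests commonly use this to identity-map guest memory, e.g.
+ * (illustrative): virt_map(vm, gpa, gpa, npages, 0);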
+ */
+void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
+ unsigned int npages, uint32_t pgd_memslot)
+{
+ size_t page_size = vm->page_size;
+ size_t size = npages * page_size;
+
+ TEST_ASSERT(vaddr + size > vaddr, "Vaddr overflow");
+ TEST_ASSERT(paddr + size > paddr, "Paddr overflow");
+
+ while (npages--) {
+ virt_pg_map(vm, vaddr, paddr, pgd_memslot);
+ vaddr += page_size;
+ paddr += page_size;
+ }
+}
+
+/*
+ * Address VM Physical to Host Virtual
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * gpa - VM physical address
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Equivalent host virtual address
+ *
+ * Locates the memory region containing the VM physical address given
+ * by gpa, within the VM given by vm. When found, the host virtual
+ * address providing the memory to the vm physical address is returned.
+ * A TEST_ASSERT failure occurs if no region containing gpa exists.
+ */
+void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa)
+{
+ struct userspace_mem_region *region;
+
+ list_for_each_entry(region, &vm->userspace_mem_regions, list) {
+ if ((gpa >= region->region.guest_phys_addr)
+ && (gpa <= (region->region.guest_phys_addr
+ + region->region.memory_size - 1)))
+ return (void *) ((uintptr_t) region->host_mem
+ + (gpa - region->region.guest_phys_addr));
+ }
+
+ TEST_FAIL("No vm physical memory at 0x%lx", gpa);
+ return NULL;
+}
+
+/*
+ * Address Host Virtual to VM Physical
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * hva - Host virtual address
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Equivalent VM physical address
+ *
+ * Locates the memory region containing the host virtual address given
+ * by hva, within the VM given by vm. When found, the equivalent
+ * VM physical address is returned. A TEST_ASSERT failure occurs if no
+ * region containing hva exists.
+ */
+vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva)
+{
+ struct userspace_mem_region *region;
+
+ list_for_each_entry(region, &vm->userspace_mem_regions, list) {
+ if ((hva >= region->host_mem)
+ && (hva <= (region->host_mem
+ + region->region.memory_size - 1)))
+ return (vm_paddr_t) ((uintptr_t)
+ region->region.guest_phys_addr
+ + (hva - (uintptr_t) region->host_mem));
+ }
+
+ TEST_FAIL("No mapping to a guest physical address, hva: %p", hva);
+ return -1;
+}
+
+/*
+ * VM Create IRQ Chip
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Creates an interrupt controller chip for the VM specified by vm.
+ */
+void vm_create_irqchip(struct kvm_vm *vm)
+{
+ int ret;
+
+ ret = ioctl(vm->fd, KVM_CREATE_IRQCHIP, 0);
+ TEST_ASSERT(ret == 0, "KVM_CREATE_IRQCHIP IOCTL failed, "
+ "rc: %i errno: %i", ret, errno);
+
+ vm->has_irqchip = true;
+}
+
+/*
+ * VM VCPU State
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Pointer to structure that describes the state of the VCPU.
+ *
+ * Locates and returns a pointer to a structure that describes the
+ * state of the VCPU with the given vcpuid.
+ */
+struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ return vcpu->state;
+}
+
+/*
+ * VM VCPU Run
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Switch to executing the code for the VCPU given by vcpuid, within the VM
+ * given by vm.
+ */
+void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ int ret = _vcpu_run(vm, vcpuid);
+ TEST_ASSERT(ret == 0, "KVM_RUN IOCTL failed, "
+ "rc: %i errno: %i", ret, errno);
+}
+
+int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int rc;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+ do {
+ rc = ioctl(vcpu->fd, KVM_RUN, NULL);
+ } while (rc == -1 && errno == EINTR);
+
+ assert_on_unhandled_exception(vm, vcpuid);
+
+ return rc;
+}
+
+void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int ret;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ vcpu->state->immediate_exit = 1;
+ ret = ioctl(vcpu->fd, KVM_RUN, NULL);
+ vcpu->state->immediate_exit = 0;
+
+ TEST_ASSERT(ret == -1 && errno == EINTR,
+ "KVM_RUN IOCTL didn't exit immediately, rc: %i, errno: %i",
+ ret, errno);
+}
+
+void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_guest_debug *debug)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int ret = ioctl(vcpu->fd, KVM_SET_GUEST_DEBUG, debug);
+
+ TEST_ASSERT(ret == 0, "KVM_SET_GUEST_DEBUG failed: %d", ret);
+}
+
+/*
+ * VM VCPU Set MP State
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ * mp_state - mp_state to be set
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Sets the MP state of the VCPU given by vcpuid, to the state given
+ * by mp_state.
+ */
+void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_mp_state *mp_state)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int ret;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ ret = ioctl(vcpu->fd, KVM_SET_MP_STATE, mp_state);
+ TEST_ASSERT(ret == 0, "KVM_SET_MP_STATE IOCTL failed, "
+ "rc: %i errno: %i", ret, errno);
+}
+
+/*
+ * VM VCPU Get Reg List
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ *
+ * Output Args:
+ * None
+ *
+ * Return:
+ * A pointer to an allocated struct kvm_reg_list
+ *
+ * Get the list of guest registers which are supported for
+ * KVM_GET_ONE_REG/KVM_SET_ONE_REG calls
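+ *
+ * The caller owns the returned list and is expected to free() it.  A
+ * minimal usage sketch (illustrative only; VCPU_ID is a test-defined id):
+ *
+ *	struct kvm_reg_list *list = vcpu_get_reg_list(vm, VCPU_ID);
+ *
+ *	for (__u64 i = 0; i < list->n; i++)
+ *		printf("reg: 0x%llx\n", (unsigned long long)list->reg[i]);
+ *	free(list);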
+ */
+struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ struct kvm_reg_list reg_list_n = { .n = 0 }, *reg_list;
+ int ret;
+
+ ret = _vcpu_ioctl(vm, vcpuid, KVM_GET_REG_LIST, &reg_list_n);
+ TEST_ASSERT(ret == -1 && errno == E2BIG, "KVM_GET_REG_LIST n=0");
+ reg_list = calloc(1, sizeof(*reg_list) + reg_list_n.n * sizeof(__u64));
+ reg_list->n = reg_list_n.n;
+ vcpu_ioctl(vm, vcpuid, KVM_GET_REG_LIST, reg_list);
+ return reg_list;
+}
+
+/*
+ * VM VCPU Regs Get
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ *
+ * Output Args:
+ * regs - current state of VCPU regs
+ *
+ * Return: None
+ *
+ * Obtains the current register state for the VCPU specified by vcpuid
+ * and stores it at the location given by regs.
+ */
+void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int ret;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ ret = ioctl(vcpu->fd, KVM_GET_REGS, regs);
+ TEST_ASSERT(ret == 0, "KVM_GET_REGS failed, rc: %i errno: %i",
+ ret, errno);
+}
+
+/*
+ * VM VCPU Regs Set
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ * regs - Values to set VCPU regs to
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Sets the regs of the VCPU specified by vcpuid to the values
+ * given by regs.
+ */
+void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int ret;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ ret = ioctl(vcpu->fd, KVM_SET_REGS, regs);
+ TEST_ASSERT(ret == 0, "KVM_SET_REGS failed, rc: %i errno: %i",
+ ret, errno);
+}
+
+#ifdef __KVM_HAVE_VCPU_EVENTS
+void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_vcpu_events *events)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int ret;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ ret = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, events);
+ TEST_ASSERT(ret == 0, "KVM_GET_VCPU_EVENTS, failed, rc: %i errno: %i",
+ ret, errno);
+}
+
+void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_vcpu_events *events)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int ret;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ ret = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, events);
+ TEST_ASSERT(ret == 0, "KVM_SET_VCPU_EVENTS, failed, rc: %i errno: %i",
+ ret, errno);
+}
+#endif
+
+#ifdef __x86_64__
+void vcpu_nested_state_get(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_nested_state *state)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int ret;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ ret = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, state);
+ TEST_ASSERT(ret == 0,
+		    "KVM_GET_NESTED_STATE failed, ret: %i errno: %i",
+ ret, errno);
+}
+
+int vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_nested_state *state, bool ignore_error)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int ret;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ ret = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, state);
+ if (!ignore_error) {
+ TEST_ASSERT(ret == 0,
+ "KVM_SET_NESTED_STATE failed, ret: %i errno: %i",
+ ret, errno);
+ }
+
+ return ret;
+}
+#endif
+
+/*
+ * VM VCPU System Regs Get
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ *
+ * Output Args:
+ * sregs - current state of VCPU system regs
+ *
+ * Return: None
+ *
+ * Obtains the current system register state for the VCPU specified by
+ * vcpuid and stores it at the location given by sregs.
+ */
+void vcpu_sregs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int ret;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ ret = ioctl(vcpu->fd, KVM_GET_SREGS, sregs);
+ TEST_ASSERT(ret == 0, "KVM_GET_SREGS failed, rc: %i errno: %i",
+ ret, errno);
+}
+
+/*
+ * VM VCPU System Regs Set
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ * sregs - Values to set VCPU system regs to
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Sets the system regs of the VCPU specified by vcpuid to the values
+ * given by sregs.
+ */
+void vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs)
+{
+ int ret = _vcpu_sregs_set(vm, vcpuid, sregs);
+	TEST_ASSERT(ret == 0, "KVM_SET_SREGS IOCTL failed, "
+ "rc: %i errno: %i", ret, errno);
+}
+
+int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ return ioctl(vcpu->fd, KVM_SET_SREGS, sregs);
+}
+
+void vcpu_fpu_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_fpu *fpu)
+{
+ int ret;
+
+ ret = _vcpu_ioctl(vm, vcpuid, KVM_GET_FPU, fpu);
+ TEST_ASSERT(ret == 0, "KVM_GET_FPU failed, rc: %i errno: %i (%s)",
+ ret, errno, strerror(errno));
+}
+
+void vcpu_fpu_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_fpu *fpu)
+{
+ int ret;
+
+ ret = _vcpu_ioctl(vm, vcpuid, KVM_SET_FPU, fpu);
+ TEST_ASSERT(ret == 0, "KVM_SET_FPU failed, rc: %i errno: %i (%s)",
+ ret, errno, strerror(errno));
+}
+
+void vcpu_get_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg)
+{
+ int ret;
+
+ ret = _vcpu_ioctl(vm, vcpuid, KVM_GET_ONE_REG, reg);
+ TEST_ASSERT(ret == 0, "KVM_GET_ONE_REG failed, rc: %i errno: %i (%s)",
+ ret, errno, strerror(errno));
+}
+
+void vcpu_set_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg)
+{
+ int ret;
+
+ ret = _vcpu_ioctl(vm, vcpuid, KVM_SET_ONE_REG, reg);
+ TEST_ASSERT(ret == 0, "KVM_SET_ONE_REG failed, rc: %i errno: %i (%s)",
+ ret, errno, strerror(errno));
+}
+
+/*
+ * VCPU Ioctl
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ * cmd - Ioctl number
+ * arg - Argument to pass to the ioctl
+ *
+ * Return: None
+ *
+ * Issues an arbitrary ioctl on a VCPU fd.
+ */
+void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid,
+ unsigned long cmd, void *arg)
+{
+ int ret;
+
+ ret = _vcpu_ioctl(vm, vcpuid, cmd, arg);
+ TEST_ASSERT(ret == 0, "vcpu ioctl %lu failed, rc: %i errno: %i (%s)",
+ cmd, ret, errno, strerror(errno));
+}
+
+int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid,
+ unsigned long cmd, void *arg)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int ret;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ ret = ioctl(vcpu->fd, cmd, arg);
+
+ return ret;
+}
+
+/*
+ * VM Ioctl
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * cmd - Ioctl number
+ * arg - Argument to pass to the ioctl
+ *
+ * Return: None
+ *
+ * Issues an arbitrary ioctl on a VM fd.
+ */
+void vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg)
+{
+ int ret;
+
+ ret = ioctl(vm->fd, cmd, arg);
+ TEST_ASSERT(ret == 0, "vm ioctl %lu failed, rc: %i errno: %i (%s)",
+ cmd, ret, errno, strerror(errno));
+}
+
+/*
+ * VM Dump
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * indent - Left margin indent amount
+ *
+ * Output Args:
+ * stream - Output FILE stream
+ *
+ * Return: None
+ *
+ * Dumps the current state of the VM given by vm, to the FILE stream
+ * given by stream.
+ */
+void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
+{
+ struct userspace_mem_region *region;
+ struct vcpu *vcpu;
+
+ fprintf(stream, "%*smode: 0x%x\n", indent, "", vm->mode);
+ fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd);
+ fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size);
+ fprintf(stream, "%*sMem Regions:\n", indent, "");
+ list_for_each_entry(region, &vm->userspace_mem_regions, list) {
+ fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx "
+ "host_virt: %p\n", indent + 2, "",
+ (uint64_t) region->region.guest_phys_addr,
+ (uint64_t) region->region.memory_size,
+ region->host_mem);
+ fprintf(stream, "%*sunused_phy_pages: ", indent + 2, "");
+ sparsebit_dump(stream, region->unused_phy_pages, 0);
+ }
+ fprintf(stream, "%*sMapped Virtual Pages:\n", indent, "");
+ sparsebit_dump(stream, vm->vpages_mapped, indent + 2);
+ fprintf(stream, "%*spgd_created: %u\n", indent, "",
+ vm->pgd_created);
+ if (vm->pgd_created) {
+ fprintf(stream, "%*sVirtual Translation Tables:\n",
+ indent + 2, "");
+ virt_dump(stream, vm, indent + 4);
+ }
+ fprintf(stream, "%*sVCPUs:\n", indent, "");
+ list_for_each_entry(vcpu, &vm->vcpus, list)
+ vcpu_dump(stream, vm, vcpu->id, indent + 2);
+}
+
+/* Known KVM exit reasons */
+static struct exit_reason {
+ unsigned int reason;
+ const char *name;
+} exit_reasons_known[] = {
+ {KVM_EXIT_UNKNOWN, "UNKNOWN"},
+ {KVM_EXIT_EXCEPTION, "EXCEPTION"},
+ {KVM_EXIT_IO, "IO"},
+ {KVM_EXIT_HYPERCALL, "HYPERCALL"},
+ {KVM_EXIT_DEBUG, "DEBUG"},
+ {KVM_EXIT_HLT, "HLT"},
+ {KVM_EXIT_MMIO, "MMIO"},
+ {KVM_EXIT_IRQ_WINDOW_OPEN, "IRQ_WINDOW_OPEN"},
+ {KVM_EXIT_SHUTDOWN, "SHUTDOWN"},
+ {KVM_EXIT_FAIL_ENTRY, "FAIL_ENTRY"},
+ {KVM_EXIT_INTR, "INTR"},
+ {KVM_EXIT_SET_TPR, "SET_TPR"},
+ {KVM_EXIT_TPR_ACCESS, "TPR_ACCESS"},
+ {KVM_EXIT_S390_SIEIC, "S390_SIEIC"},
+ {KVM_EXIT_S390_RESET, "S390_RESET"},
+ {KVM_EXIT_DCR, "DCR"},
+ {KVM_EXIT_NMI, "NMI"},
+ {KVM_EXIT_INTERNAL_ERROR, "INTERNAL_ERROR"},
+ {KVM_EXIT_OSI, "OSI"},
+ {KVM_EXIT_PAPR_HCALL, "PAPR_HCALL"},
+#ifdef KVM_EXIT_MEMORY_NOT_PRESENT
+ {KVM_EXIT_MEMORY_NOT_PRESENT, "MEMORY_NOT_PRESENT"},
+#endif
+};
+
+/*
+ * Exit Reason String
+ *
+ * Input Args:
+ * exit_reason - Exit reason
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Constant string pointer describing the exit reason.
+ *
+ * Locates and returns a constant string that describes the KVM exit
+ * reason given by exit_reason. If no such string is found, a constant
+ * string of "Unknown" is returned.
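+ *
+ * Typically used when reporting unexpected exits, e.g. (illustrative;
+ * VCPU_ID is a test-defined id):
+ *
+ *	struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+ *
+ *	TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ *		    "Unexpected exit reason: %u (%s)",
+ *		    run->exit_reason, exit_reason_str(run->exit_reason));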
+ */
+const char *exit_reason_str(unsigned int exit_reason)
+{
+ unsigned int n1;
+
+ for (n1 = 0; n1 < ARRAY_SIZE(exit_reasons_known); n1++) {
+ if (exit_reason == exit_reasons_known[n1].reason)
+ return exit_reasons_known[n1].name;
+ }
+
+ return "Unknown";
+}
+
+/*
+ * Physical Contiguous Page Allocator
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * num - number of pages
+ * paddr_min - Physical address minimum
+ * memslot - Memory region to allocate page from
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Starting physical address
+ *
+ * Within the VM specified by vm, locates a range of available physical
+ * pages at or above paddr_min. If found, the pages are marked as in use
+ * and their base address is returned. A TEST_ASSERT failure occurs if
+ * not enough pages are available at or above paddr_min.
+ */
+vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num,
+ vm_paddr_t paddr_min, uint32_t memslot)
+{
+ struct userspace_mem_region *region;
+ sparsebit_idx_t pg, base;
+
+ TEST_ASSERT(num > 0, "Must allocate at least one page");
+
+ TEST_ASSERT((paddr_min % vm->page_size) == 0, "Min physical address "
+ "not divisible by page size.\n"
+ " paddr_min: 0x%lx page_size: 0x%x",
+ paddr_min, vm->page_size);
+
+ region = memslot2region(vm, memslot);
+ base = pg = paddr_min >> vm->page_shift;
+
+ do {
+ for (; pg < base + num; ++pg) {
+ if (!sparsebit_is_set(region->unused_phy_pages, pg)) {
+ base = pg = sparsebit_next_set(region->unused_phy_pages, pg);
+ break;
+ }
+ }
+ } while (pg && pg != base + num);
+
+ if (pg == 0) {
+ fprintf(stderr, "No guest physical page available, "
+ "paddr_min: 0x%lx page_size: 0x%x memslot: %u\n",
+ paddr_min, vm->page_size, memslot);
+ fputs("---- vm dump ----\n", stderr);
+ vm_dump(stderr, vm, 2);
+ abort();
+ }
+
+ for (pg = base; pg < base + num; ++pg)
+ sparsebit_clear(region->unused_phy_pages, pg);
+
+ return base * vm->page_size;
+}
+
+vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min,
+ uint32_t memslot)
+{
+ return vm_phy_pages_alloc(vm, 1, paddr_min, memslot);
+}
+
+/*
+ * Address Guest Virtual to Host Virtual
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * gva - VM virtual address
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Equivalent host virtual address
+ */
+void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+ return addr_gpa2hva(vm, addr_gva2gpa(vm, gva));
+}
+
+/*
+ * Is Unrestricted Guest
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ *
+ * Output Args: None
+ *
+ * Return: True if the kvm_intel unrestricted_guest module parameter is set
+ * to 'Y', otherwise false.
+ *
+ * Check if the unrestricted guest flag is enabled.
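+ *
+ * A typical (illustrative) use is to skip tests that depend on it; the
+ * print_skip() helper is assumed here, any skip message works:
+ *
+ *	if (!vm_is_unrestricted_guest(NULL)) {
+ *		print_skip("unrestricted guest not enabled");
+ *		exit(KSFT_SKIP);
+ *	}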
+ */
+bool vm_is_unrestricted_guest(struct kvm_vm *vm)
+{
+ char val = 'N';
+ size_t count;
+ FILE *f;
+
+ if (vm == NULL) {
+ /* Ensure that the KVM vendor-specific module is loaded. */
+ f = fopen(KVM_DEV_PATH, "r");
+ TEST_ASSERT(f != NULL, "Error in opening KVM dev file: %d",
+ errno);
+ fclose(f);
+ }
+
+ f = fopen("/sys/module/kvm_intel/parameters/unrestricted_guest", "r");
+ if (f) {
+ count = fread(&val, sizeof(char), 1, f);
+ TEST_ASSERT(count == 1, "Unable to read from param file.");
+ fclose(f);
+ }
+
+ return val == 'Y';
+}
+
+unsigned int vm_get_page_size(struct kvm_vm *vm)
+{
+ return vm->page_size;
+}
+
+unsigned int vm_get_page_shift(struct kvm_vm *vm)
+{
+ return vm->page_shift;
+}
+
+unsigned int vm_get_max_gfn(struct kvm_vm *vm)
+{
+ return vm->max_gfn;
+}
+
+int vm_get_fd(struct kvm_vm *vm)
+{
+ return vm->fd;
+}
+
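+/*
+ * Convert a number of pages from one page size to another.
+ *
+ * A worked example (illustrative): converting 2 guest pages with a 64K
+ * page size (page_shift == 16) to 4K host pages (new_page_shift == 12)
+ * yields 2 * (1 << (16 - 12)) == 32 pages.  Going the other direction
+ * divides instead, rounding up only when ceil is true.
+ */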
+static unsigned int vm_calc_num_pages(unsigned int num_pages,
+ unsigned int page_shift,
+ unsigned int new_page_shift,
+ bool ceil)
+{
+	unsigned int n;
+
+	if (page_shift >= new_page_shift)
+		return num_pages * (1 << (page_shift - new_page_shift));
+
+	/* Only compute the divisor when the shift difference is non-negative. */
+	n = 1 << (new_page_shift - page_shift);
+	return num_pages / n + !!(ceil && num_pages % n);
+}
+
+static inline int getpageshift(void)
+{
+ return __builtin_ffs(getpagesize()) - 1;
+}
+
+unsigned int
+vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages)
+{
+ return vm_calc_num_pages(num_guest_pages,
+ vm_guest_mode_params[mode].page_shift,
+ getpageshift(), true);
+}
+
+unsigned int
+vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages)
+{
+ return vm_calc_num_pages(num_host_pages, getpageshift(),
+ vm_guest_mode_params[mode].page_shift, false);
+}
+
+unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size)
+{
+ unsigned int n;
+ n = DIV_ROUND_UP(size, vm_guest_mode_params[mode].page_size);
+ return vm_adjust_num_guest_pages(mode, n);
+}
diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h
new file mode 100644
index 000000000..f07d383d0
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * tools/testing/selftests/kvm/lib/kvm_util_internal.h
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+
+#ifndef SELFTEST_KVM_UTIL_INTERNAL_H
+#define SELFTEST_KVM_UTIL_INTERNAL_H
+
+#include "sparsebit.h"
+
+#define KVM_DEV_PATH "/dev/kvm"
+
+struct userspace_mem_region {
+ struct kvm_userspace_memory_region region;
+ struct sparsebit *unused_phy_pages;
+ int fd;
+ off_t offset;
+ void *host_mem;
+ void *mmap_start;
+ size_t mmap_size;
+ struct list_head list;
+};
+
+struct vcpu {
+ struct list_head list;
+ uint32_t id;
+ int fd;
+ struct kvm_run *state;
+};
+
+struct kvm_vm {
+ int mode;
+ unsigned long type;
+ int kvm_fd;
+ int fd;
+ unsigned int pgtable_levels;
+ unsigned int page_size;
+ unsigned int page_shift;
+ unsigned int pa_bits;
+ unsigned int va_bits;
+ uint64_t max_gfn;
+ struct list_head vcpus;
+ struct list_head userspace_mem_regions;
+ struct sparsebit *vpages_valid;
+ struct sparsebit *vpages_mapped;
+ bool has_irqchip;
+ bool pgd_created;
+ vm_paddr_t pgd;
+ vm_vaddr_t gdt;
+ vm_vaddr_t tss;
+ vm_vaddr_t idt;
+ vm_vaddr_t handlers;
+};
+
+struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid);
+
+/*
+ * Virtual Translation Tables Dump
+ *
+ * Input Args:
+ * stream - Output FILE stream
+ * vm - Virtual Machine
+ * indent - Left margin indent amount
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Dumps to the FILE stream given by @stream, the contents of all the
+ * virtual translation tables for the VM given by @vm.
+ */
+void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent);
+
+/*
+ * Register Dump
+ *
+ * Input Args:
+ * stream - Output FILE stream
+ * regs - Registers
+ * indent - Left margin indent amount
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Dumps the state of the registers given by @regs, to the FILE stream
+ * given by @stream.
+ */
+void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent);
+
+/*
+ * System Register Dump
+ *
+ * Input Args:
+ * stream - Output FILE stream
+ * sregs - System registers
+ * indent - Left margin indent amount
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Dumps the state of the system registers given by @sregs, to the FILE stream
+ * given by @stream.
+ */
+void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent);
+
+struct userspace_mem_region *
+memslot2region(struct kvm_vm *vm, uint32_t memslot);
+
+#endif /* SELFTEST_KVM_UTIL_INTERNAL_H */
diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c
new file mode 100644
index 000000000..7349bb2e1
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/s390x/processor.c
@@ -0,0 +1,247 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM selftest s390x library code - CPU-related functions (page tables...)
+ *
+ * Copyright (C) 2019, Red Hat, Inc.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include "processor.h"
+#include "kvm_util.h"
+#include "../kvm_util_internal.h"
+
+#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000
+
+#define PAGES_PER_REGION 4
+
+void virt_pgd_alloc(struct kvm_vm *vm, uint32_t memslot)
+{
+ vm_paddr_t paddr;
+
+ TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x",
+ vm->page_size);
+
+ if (vm->pgd_created)
+ return;
+
+ paddr = vm_phy_pages_alloc(vm, PAGES_PER_REGION,
+ KVM_GUEST_PAGE_TABLE_MIN_PADDR, memslot);
+ memset(addr_gpa2hva(vm, paddr), 0xff, PAGES_PER_REGION * vm->page_size);
+
+ vm->pgd = paddr;
+ vm->pgd_created = true;
+}
+
+/*
+ * Allocate 4 pages for a region/segment table (ri < 4), or one page for
+ * a page table (ri == 4). Returns a suitable region/segment table entry
+ * which points to the freshly allocated pages.
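+ *
+ * For example (illustrative): with ri == 3 and the new segment table
+ * allocated at guest physical 0x180000, the returned region-third-table
+ * entry is 0x180000 | ((4 - 3) << 2) | (PAGES_PER_REGION - 1), i.e.
+ * 0x180007: the table origin with the type and length fields filled in.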
+ */
+static uint64_t virt_alloc_region(struct kvm_vm *vm, int ri, uint32_t memslot)
+{
+ uint64_t taddr;
+
+ taddr = vm_phy_pages_alloc(vm, ri < 4 ? PAGES_PER_REGION : 1,
+ KVM_GUEST_PAGE_TABLE_MIN_PADDR, memslot);
+	memset(addr_gpa2hva(vm, taddr), 0xff,
+	       (ri < 4 ? PAGES_PER_REGION : 1) * vm->page_size);
+
+ return (taddr & REGION_ENTRY_ORIGIN)
+ | (((4 - ri) << 2) & REGION_ENTRY_TYPE)
+ | ((ri < 4 ? (PAGES_PER_REGION - 1) : 0) & REGION_ENTRY_LENGTH);
+}
+
+void virt_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa,
+ uint32_t memslot)
+{
+ int ri, idx;
+ uint64_t *entry;
+
+ TEST_ASSERT((gva % vm->page_size) == 0,
+ "Virtual address not on page boundary,\n"
+ " vaddr: 0x%lx vm->page_size: 0x%x",
+ gva, vm->page_size);
+ TEST_ASSERT(sparsebit_is_set(vm->vpages_valid,
+ (gva >> vm->page_shift)),
+ "Invalid virtual address, vaddr: 0x%lx",
+ gva);
+	TEST_ASSERT((gpa % vm->page_size) == 0,
+		"Physical address not on page boundary,\n"
+		"  paddr: 0x%lx vm->page_size: 0x%x",
+		gpa, vm->page_size);
+	TEST_ASSERT((gpa >> vm->page_shift) <= vm->max_gfn,
+		"Physical address beyond maximum supported,\n"
+		"  paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
+		gpa, vm->max_gfn, vm->page_size);
+
+ /* Walk through region and segment tables */
+ entry = addr_gpa2hva(vm, vm->pgd);
+ for (ri = 1; ri <= 4; ri++) {
+ idx = (gva >> (64 - 11 * ri)) & 0x7ffu;
+ if (entry[idx] & REGION_ENTRY_INVALID)
+ entry[idx] = virt_alloc_region(vm, ri, memslot);
+ entry = addr_gpa2hva(vm, entry[idx] & REGION_ENTRY_ORIGIN);
+ }
+
+ /* Fill in page table entry */
+ idx = (gva >> 12) & 0x0ffu; /* page index */
+ if (!(entry[idx] & PAGE_INVALID))
+ fprintf(stderr,
+ "WARNING: PTE for gpa=0x%"PRIx64" already set!\n", gpa);
+ entry[idx] = gpa;
+}
+
+vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+ int ri, idx;
+ uint64_t *entry;
+
+ TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x",
+ vm->page_size);
+
+ entry = addr_gpa2hva(vm, vm->pgd);
+ for (ri = 1; ri <= 4; ri++) {
+ idx = (gva >> (64 - 11 * ri)) & 0x7ffu;
+ TEST_ASSERT(!(entry[idx] & REGION_ENTRY_INVALID),
+ "No region mapping for vm virtual address 0x%lx",
+ gva);
+ entry = addr_gpa2hva(vm, entry[idx] & REGION_ENTRY_ORIGIN);
+ }
+
+ idx = (gva >> 12) & 0x0ffu; /* page index */
+
+ TEST_ASSERT(!(entry[idx] & PAGE_INVALID),
+ "No page mapping for vm virtual address 0x%lx", gva);
+
+ return (entry[idx] & ~0xffful) + (gva & 0xffful);
+}
+
+static void virt_dump_ptes(FILE *stream, struct kvm_vm *vm, uint8_t indent,
+ uint64_t ptea_start)
+{
+ uint64_t *pte, ptea;
+
+ for (ptea = ptea_start; ptea < ptea_start + 0x100 * 8; ptea += 8) {
+ pte = addr_gpa2hva(vm, ptea);
+ if (*pte & PAGE_INVALID)
+ continue;
+ fprintf(stream, "%*spte @ 0x%lx: 0x%016lx\n",
+ indent, "", ptea, *pte);
+ }
+}
+
+static void virt_dump_region(FILE *stream, struct kvm_vm *vm, uint8_t indent,
+ uint64_t reg_tab_addr)
+{
+ uint64_t addr, *entry;
+
+ for (addr = reg_tab_addr; addr < reg_tab_addr + 0x400 * 8; addr += 8) {
+ entry = addr_gpa2hva(vm, addr);
+ if (*entry & REGION_ENTRY_INVALID)
+ continue;
+ fprintf(stream, "%*srt%lde @ 0x%lx: 0x%016lx\n",
+ indent, "", 4 - ((*entry & REGION_ENTRY_TYPE) >> 2),
+ addr, *entry);
+ if (*entry & REGION_ENTRY_TYPE) {
+ virt_dump_region(stream, vm, indent + 2,
+ *entry & REGION_ENTRY_ORIGIN);
+ } else {
+ virt_dump_ptes(stream, vm, indent + 2,
+ *entry & REGION_ENTRY_ORIGIN);
+ }
+ }
+}
+
+void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
+{
+ if (!vm->pgd_created)
+ return;
+
+ virt_dump_region(stream, vm, indent, vm->pgd);
+}
+
+struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
+ void *guest_code)
+{
+ /*
+ * The additional amount of pages required for the page tables is:
+ * 1 * n / 256 + 4 * (n / 256) / 2048 + 4 * (n / 256) / 2048^2 + ...
+ * which is definitely smaller than (n / 256) * 2.
+ */
+ uint64_t extra_pg_pages = extra_mem_pages / 256 * 2;
+ struct kvm_vm *vm;
+
+ vm = vm_create(VM_MODE_DEFAULT,
+ DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR);
+
+ kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
+ vm_vcpu_add_default(vm, vcpuid, guest_code);
+
+ return vm;
+}
+
+void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
+{
+ size_t stack_size = DEFAULT_STACK_PGS * getpagesize();
+ uint64_t stack_vaddr;
+ struct kvm_regs regs;
+ struct kvm_sregs sregs;
+ struct kvm_run *run;
+
+ TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x",
+ vm->page_size);
+
+ stack_vaddr = vm_vaddr_alloc(vm, stack_size,
+ DEFAULT_GUEST_STACK_VADDR_MIN, 0, 0);
+
+ vm_vcpu_add(vm, vcpuid);
+
+ /* Setup guest registers */
+ vcpu_regs_get(vm, vcpuid, &regs);
+ regs.gprs[15] = stack_vaddr + (DEFAULT_STACK_PGS * getpagesize()) - 160;
+ vcpu_regs_set(vm, vcpuid, &regs);
+
+ vcpu_sregs_get(vm, vcpuid, &sregs);
+ sregs.crs[0] |= 0x00040000; /* Enable floating point regs */
+ sregs.crs[1] = vm->pgd | 0xf; /* Primary region table */
+ vcpu_sregs_set(vm, vcpuid, &sregs);
+
+ run = vcpu_state(vm, vcpuid);
+ run->psw_mask = 0x0400000180000000ULL; /* DAT enabled + 64 bit mode */
+ run->psw_addr = (uintptr_t)guest_code;
+}
+
+void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...)
+{
+ va_list ap;
+ struct kvm_regs regs;
+ int i;
+
+ TEST_ASSERT(num >= 1 && num <= 5, "Unsupported number of args,\n"
+ " num: %u\n",
+ num);
+
+ va_start(ap, num);
+ vcpu_regs_get(vm, vcpuid, &regs);
+
+ for (i = 0; i < num; i++)
+ regs.gprs[i + 2] = va_arg(ap, uint64_t);
+
+ vcpu_regs_set(vm, vcpuid, &regs);
+ va_end(ap);
+}
+
+void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+
+ if (!vcpu)
+ return;
+
+ fprintf(stream, "%*spstate: psw: 0x%.16llx:0x%.16llx\n",
+ indent, "", vcpu->state->psw_mask, vcpu->state->psw_addr);
+}
+
+void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid)
+{
+}
diff --git a/tools/testing/selftests/kvm/lib/s390x/ucall.c b/tools/testing/selftests/kvm/lib/s390x/ucall.c
new file mode 100644
index 000000000..9d3b0f152
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/s390x/ucall.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ucall support. A ucall is a "hypercall to userspace".
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ */
+#include "kvm_util.h"
+
+void ucall_init(struct kvm_vm *vm, void *arg)
+{
+}
+
+void ucall_uninit(struct kvm_vm *vm)
+{
+}
+
+void ucall(uint64_t cmd, int nargs, ...)
+{
+ struct ucall uc = {
+ .cmd = cmd,
+ };
+ va_list va;
+ int i;
+
+ nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS;
+
+ va_start(va, nargs);
+ for (i = 0; i < nargs; ++i)
+ uc.args[i] = va_arg(va, uint64_t);
+ va_end(va);
+
+ /* Exit via DIAGNOSE 0x501 (normally used for breakpoints) */
+ asm volatile ("diag 0,%0,0x501" : : "a"(&uc) : "memory");
+}
+
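+/*
+ * A typical (illustrative) host-side pattern for consuming ucalls; VCPU_ID
+ * is a test-defined id and the case bodies depend on the individual test:
+ *
+ *	struct ucall uc;
+ *
+ *	vcpu_run(vm, VCPU_ID);
+ *	switch (get_ucall(vm, VCPU_ID, &uc)) {
+ *	case UCALL_SYNC:
+ *		// handle a GUEST_SYNC() from the guest
+ *		break;
+ *	case UCALL_ABORT:
+ *		TEST_FAIL("%s", (const char *)uc.args[0]);
+ *	case UCALL_DONE:
+ *		break;
+ *	}
+ */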
+uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc)
+{
+ struct kvm_run *run = vcpu_state(vm, vcpu_id);
+ struct ucall ucall = {};
+
+ if (uc)
+ memset(uc, 0, sizeof(*uc));
+
+ if (run->exit_reason == KVM_EXIT_S390_SIEIC &&
+ run->s390_sieic.icptcode == 4 &&
+ (run->s390_sieic.ipa >> 8) == 0x83 && /* 0x83 means DIAGNOSE */
+ (run->s390_sieic.ipb >> 16) == 0x501) {
+ int reg = run->s390_sieic.ipa & 0xf;
+
+ memcpy(&ucall, addr_gva2hva(vm, run->s.regs.gprs[reg]),
+ sizeof(ucall));
+
+ vcpu_run_complete_io(vm, vcpu_id);
+ if (uc)
+ memcpy(uc, &ucall, sizeof(ucall));
+ }
+
+ return ucall.cmd;
+}
diff --git a/tools/testing/selftests/kvm/lib/sparsebit.c b/tools/testing/selftests/kvm/lib/sparsebit.c
new file mode 100644
index 000000000..031ba3c93
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/sparsebit.c
@@ -0,0 +1,2086 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Sparse bit array
+ *
+ * Copyright (C) 2018, Google LLC.
+ * Copyright (C) 2018, Red Hat, Inc. (code style cleanup and fuzzing driver)
+ *
+ * This library provides functions to support a memory efficient bit array,
+ * with an index size of 2^64. A sparsebit array is allocated through
+ * the use sparsebit_alloc() and free'd via sparsebit_free(),
+ * such as in the following:
+ *
+ * struct sparsebit *s;
+ * s = sparsebit_alloc();
+ * sparsebit_free(&s);
+ *
+ * Note that sparsebit_free() takes a pointer to the caller's struct
+ * sparsebit pointer.  This is so that sparsebit_free() is able to poison
+ * that pointer (e.g. set it to NULL) before returning to the caller,
+ * which prevents further use of the freed array through that pointer.
+ *
+ * Between the return of sparsebit_alloc() and the call of
+ * sparsebit_free(), there are multiple query and modifying operations
+ * that can be performed on the allocated sparsebit array. All of
+ * these operations take as a parameter the value returned from
+ * sparsebit_alloc() and most also take a bit index. Frequently
+ * used routines include:
+ *
+ * ---- Query Operations
+ * sparsebit_is_set(s, idx)
+ * sparsebit_is_clear(s, idx)
+ * sparsebit_any_set(s)
+ * sparsebit_first_set(s)
+ * sparsebit_next_set(s, prev_idx)
+ *
+ * ---- Modifying Operations
+ * sparsebit_set(s, idx)
+ * sparsebit_clear(s, idx)
+ * sparsebit_set_num(s, idx, num);
+ * sparsebit_clear_num(s, idx, num);
+ *
+ * A common operation is to iterate over all the bits set in a
+ * sparsebit array.  This can be done via code with the following structure:
+ *
+ * sparsebit_idx_t idx;
+ * if (sparsebit_any_set(s)) {
+ * idx = sparsebit_first_set(s);
+ * do {
+ * ...
+ * idx = sparsebit_next_set(s, idx);
+ * } while (idx != 0);
+ * }
+ *
+ * The index of the first set bit needs to be obtained via
+ * sparsebit_first_set(), because sparsebit_next_set() needs
+ * the index of the previously set bit.  The sparsebit_idx_t type is
+ * unsigned, so there is no index available before 0.
+ * Also, the call to sparsebit_first_set() is only made when there
+ * is at least 1 bit set in the array, because sparsebit_first_set()
+ * aborts if it is called with no bits set.
+ * It is the caller's responsibility to ensure that the
+ * sparsebit array has at least a single bit set before calling
+ * sparsebit_first_set().
+ *
+ * ==== Implementation Overview ====
+ * For the most part the internal implementation of sparsebit is
+ * opaque to the caller. One important implementation detail that the
+ * caller may need to be aware of is the spatial complexity of the
+ * implementation. This implementation of a sparsebit array is not
+ * only sparse, in that it uses memory proportional to the number of bits
+ * set. It is also efficient in memory usage when most of the bits are
+ * set.
+ *
+ * At a high-level the state of the bit settings are maintained through
+ * the use of a binary-search tree, where each node contains at least
+ * the following members:
+ *
+ * typedef uint64_t sparsebit_idx_t;
+ * typedef uint64_t sparsebit_num_t;
+ *
+ * sparsebit_idx_t idx;
+ * uint32_t mask;
+ * sparsebit_num_t num_after;
+ *
+ * The idx member contains the bit index of the first bit described by this
+ * node, while the mask member stores the setting of the first 32-bits.
+ * The setting of the bit at idx + n, where 0 <= n < 32, is located in the
+ * mask member at 1 << n.
+ *
+ * Nodes are sorted by idx and the bits described by two nodes will never
+ * overlap. The idx member is always aligned to the mask size, i.e. a
+ * multiple of 32.
+ *
+ * Beyond a typical implementation, the nodes in this implementation also
+ * contain a member named num_after.  The num_after member holds the
+ * number of bits immediately after the mask bits that are contiguously set.
+ * The use of the num_after member allows this implementation to efficiently
+ * represent cases where most bits are set. For example, the case of all
+ * but the last two bits set, is represented by the following two nodes:
+ *
+ * node 0 - idx: 0x0 mask: 0xffffffff num_after: 0xffffffffffffffc0
+ * node 1 - idx: 0xffffffffffffffe0 mask: 0x3fffffff num_after: 0
+ *
+ * ==== Invariants ====
+ * This implementation uses the following invariants:
+ *
+ * + Nodes are only used to represent bits that are set.
+ * Nodes with a mask of 0 and num_after of 0 are not allowed.
+ *
+ * + Sum of bits set in all the nodes is equal to the value of
+ *   the struct sparsebit num_set member.
+ *
+ * + The setting of at least one bit is always described in a node's
+ * mask (mask >= 1).
+ *
+ * + A node with all mask bits set only occurs when the last bit
+ *   described by the previous node is not equal to this node's
+ *   starting index - 1.  All such occurrences of this condition are
+ *   avoided by moving the setting of the node's mask bits into
+ *   the previous node's num_after setting.
+ *
+ * + Node starting index is evenly divisible by the number of bits
+ *   within a node's mask member.
+ *
+ * + Nodes never represent a range of bits that wrap around the
+ * highest supported index.
+ *
+ *     (idx + MASK_BITS + num_after - 1) <= ((sparsebit_idx_t) 0) - 1
+ *
+ * As a consequence of the above, the num_after member of a node
+ * will always be <=:
+ *
+ * maximum_index - nodes_starting_index - number_of_mask_bits
+ *
+ * + Nodes within the binary search tree are sorted based on each
+ *   node's starting index.
+ *
+ * + The range of bits described by any two nodes do not overlap. The
+ * range of bits described by a single node is:
+ *
+ * start: node->idx
+ * end (inclusive): node->idx + MASK_BITS + node->num_after - 1;
+ *
+ * Note, at times these invariants are temporarily violated for a
+ * specific portion of the code. For example, when setting a mask
+ * bit, there is a small delay between when the mask bit is set and the
+ * value in the struct sparsebit num_set member is updated.  Other
+ * temporary violations occur when node_split() is called with a specified
+ * index and ensures that a node whose mask represents the bit
+ * at the specified index exists. At times to do this node_split()
+ * must split an existing node into two nodes or create a node that
+ * has no bits set. Such temporary violations must be corrected before
+ * returning to the caller. These corrections are typically performed
+ * by the local function node_reduce().
+ */
+
+#include "test_util.h"
+#include "sparsebit.h"
+#include <limits.h>
+#include <assert.h>
+
+#define DUMP_LINE_MAX 100 /* Does not include indent amount */
+
+typedef uint32_t mask_t;
+#define MASK_BITS (sizeof(mask_t) * CHAR_BIT)
+
+struct node {
+ struct node *parent;
+ struct node *left;
+ struct node *right;
+ sparsebit_idx_t idx; /* index of least-significant bit in mask */
+ sparsebit_num_t num_after; /* num contiguously set after mask */
+ mask_t mask;
+};
+
+struct sparsebit {
+ /*
+ * Points to root node of the binary search
+ * tree. Equal to NULL when no bits are set in
+ * the entire sparsebit array.
+ */
+ struct node *root;
+
+ /*
+ * A redundant count of the total number of bits set. Used for
+ * diagnostic purposes and to change the time complexity of
+ * sparsebit_num_set() from O(n) to O(1).
+ * Note: Due to overflow, a value of 0 means none or all set.
+ */
+ sparsebit_num_t num_set;
+};
+
+/* Returns the number of set bits described by the settings
+ * of the node pointed to by nodep.
+ */
+static sparsebit_num_t node_num_set(struct node *nodep)
+{
+ return nodep->num_after + __builtin_popcount(nodep->mask);
+}
+
+/* Returns a pointer to the node that describes the
+ * lowest bit index.
+ */
+static struct node *node_first(struct sparsebit *s)
+{
+ struct node *nodep;
+
+ for (nodep = s->root; nodep && nodep->left; nodep = nodep->left)
+ ;
+
+ return nodep;
+}
+
+/* Returns a pointer to the node that describes the
+ * lowest bit index > the index of the node pointed to by np.
+ * Returns NULL if no node with a higher index exists.
+ */
+static struct node *node_next(struct sparsebit *s, struct node *np)
+{
+ struct node *nodep = np;
+
+ /*
+ * If current node has a right child, next node is the left-most
+ * of the right child.
+ */
+ if (nodep->right) {
+ for (nodep = nodep->right; nodep->left; nodep = nodep->left)
+ ;
+ return nodep;
+ }
+
+ /*
+ * No right child. Go up until node is left child of a parent.
+ * That parent is then the next node.
+ */
+ while (nodep->parent && nodep == nodep->parent->right)
+ nodep = nodep->parent;
+
+ return nodep->parent;
+}
+
+/* Searches for and returns a pointer to the node that describes the
+ * highest index < the index of the node pointed to by np.
+ * Returns NULL if no node with a lower index exists.
+ */
+static struct node *node_prev(struct sparsebit *s, struct node *np)
+{
+ struct node *nodep = np;
+
+ /*
+	 * If current node has a left child, previous node is the right-most
+ * of the left child.
+ */
+ if (nodep->left) {
+ for (nodep = nodep->left; nodep->right; nodep = nodep->right)
+ ;
+ return (struct node *) nodep;
+ }
+
+ /*
+ * No left child. Go up until node is right child of a parent.
+	 * That parent is then the previous node.
+ */
+ while (nodep->parent && nodep == nodep->parent->left)
+ nodep = nodep->parent;
+
+ return (struct node *) nodep->parent;
+}
+
+
+/* Allocates space to hold a copy of the node sub-tree pointed to by
+ * subtree and duplicates the bit settings to the newly allocated nodes.
+ * Returns the newly allocated copy of subtree.
+ */
+static struct node *node_copy_subtree(struct node *subtree)
+{
+ struct node *root;
+
+ /* Duplicate the node at the root of the subtree */
+ root = calloc(1, sizeof(*root));
+ if (!root) {
+ perror("calloc");
+ abort();
+ }
+
+ root->idx = subtree->idx;
+ root->mask = subtree->mask;
+ root->num_after = subtree->num_after;
+
+ /* As needed, recursively duplicate the left and right subtrees */
+ if (subtree->left) {
+ root->left = node_copy_subtree(subtree->left);
+ root->left->parent = root;
+ }
+
+ if (subtree->right) {
+ root->right = node_copy_subtree(subtree->right);
+ root->right->parent = root;
+ }
+
+ return root;
+}
+
+/* Searches for and returns a pointer to the node that describes the setting
+ * of the bit given by idx. A node describes the setting of a bit if its
+ * index is within the bits described by the mask bits or the number of
+ * contiguous bits set after the mask. Returns NULL if there is no such node.
+ */
+static struct node *node_find(struct sparsebit *s, sparsebit_idx_t idx)
+{
+ struct node *nodep;
+
+ /* Find the node that describes the setting of the bit at idx */
+ for (nodep = s->root; nodep;
+ nodep = nodep->idx > idx ? nodep->left : nodep->right) {
+ if (idx >= nodep->idx &&
+ idx <= nodep->idx + MASK_BITS + nodep->num_after - 1)
+ break;
+ }
+
+ return nodep;
+}
+
+/* Entry Requirements:
+ * + A node that describes the setting of idx is not already present.
+ *
+ * Adds a new node to describe the setting of the bit at the index given
+ * by idx. Returns a pointer to the newly added node.
+ *
+ * TODO(lhuemill): Degenerate cases causes the tree to get unbalanced.
+ */
+static struct node *node_add(struct sparsebit *s, sparsebit_idx_t idx)
+{
+ struct node *nodep, *parentp, *prev;
+
+ /* Allocate and initialize the new node. */
+ nodep = calloc(1, sizeof(*nodep));
+ if (!nodep) {
+ perror("calloc");
+ abort();
+ }
+
+ nodep->idx = idx & -MASK_BITS;
+
+ /* If no nodes, set it up as the root node. */
+ if (!s->root) {
+ s->root = nodep;
+ return nodep;
+ }
+
+ /*
+ * Find the parent where the new node should be attached
+ * and add the node there.
+ */
+ parentp = s->root;
+ while (true) {
+ if (idx < parentp->idx) {
+ if (!parentp->left) {
+ parentp->left = nodep;
+ nodep->parent = parentp;
+ break;
+ }
+ parentp = parentp->left;
+ } else {
+ assert(idx > parentp->idx + MASK_BITS + parentp->num_after - 1);
+ if (!parentp->right) {
+ parentp->right = nodep;
+ nodep->parent = parentp;
+ break;
+ }
+ parentp = parentp->right;
+ }
+ }
+
+ /*
+ * Does num_after bits of previous node overlap with the mask
+ * of the new node? If so set the bits in the new nodes mask
+ * and reduce the previous nodes num_after.
+ */
+ prev = node_prev(s, nodep);
+ while (prev && prev->idx + MASK_BITS + prev->num_after - 1 >= nodep->idx) {
+ unsigned int n1 = (prev->idx + MASK_BITS + prev->num_after - 1)
+ - nodep->idx;
+ assert(prev->num_after > 0);
+ assert(n1 < MASK_BITS);
+ assert(!(nodep->mask & (1 << n1)));
+ nodep->mask |= (1 << n1);
+ prev->num_after--;
+ }
+
+ return nodep;
+}
+
+/* Returns whether all the bits in the sparsebit array are set. */
+bool sparsebit_all_set(struct sparsebit *s)
+{
+ /*
+	 * If any nodes exist, there must be at least one bit set.  The
+	 * only case where a bit is set and the total num set is 0 is
+	 * when all bits are set.
+ */
+ return s->root && s->num_set == 0;
+}
+
+/* Clears all bits described by the node pointed to by nodep, then
+ * removes the node.
+ */
+static void node_rm(struct sparsebit *s, struct node *nodep)
+{
+ struct node *tmp;
+ sparsebit_num_t num_set;
+
+ num_set = node_num_set(nodep);
+ assert(s->num_set >= num_set || sparsebit_all_set(s));
+ s->num_set -= node_num_set(nodep);
+
+ /* Have both left and right child */
+ if (nodep->left && nodep->right) {
+ /*
+ * Move left children to the leftmost leaf node
+ * of the right child.
+ */
+ for (tmp = nodep->right; tmp->left; tmp = tmp->left)
+ ;
+ tmp->left = nodep->left;
+ nodep->left = NULL;
+ tmp->left->parent = tmp;
+ }
+
+ /* Left only child */
+ if (nodep->left) {
+ if (!nodep->parent) {
+ s->root = nodep->left;
+ nodep->left->parent = NULL;
+ } else {
+ nodep->left->parent = nodep->parent;
+ if (nodep == nodep->parent->left)
+ nodep->parent->left = nodep->left;
+ else {
+ assert(nodep == nodep->parent->right);
+ nodep->parent->right = nodep->left;
+ }
+ }
+
+ nodep->parent = nodep->left = nodep->right = NULL;
+ free(nodep);
+
+ return;
+ }
+
+
+ /* Right only child */
+ if (nodep->right) {
+ if (!nodep->parent) {
+ s->root = nodep->right;
+ nodep->right->parent = NULL;
+ } else {
+ nodep->right->parent = nodep->parent;
+ if (nodep == nodep->parent->left)
+ nodep->parent->left = nodep->right;
+ else {
+ assert(nodep == nodep->parent->right);
+ nodep->parent->right = nodep->right;
+ }
+ }
+
+ nodep->parent = nodep->left = nodep->right = NULL;
+ free(nodep);
+
+ return;
+ }
+
+ /* Leaf Node */
+ if (!nodep->parent) {
+ s->root = NULL;
+ } else {
+ if (nodep->parent->left == nodep)
+ nodep->parent->left = NULL;
+ else {
+ assert(nodep == nodep->parent->right);
+ nodep->parent->right = NULL;
+ }
+ }
+
+ nodep->parent = nodep->left = nodep->right = NULL;
+ free(nodep);
+
+ return;
+}
+
+/* Splits the node containing the bit at idx so that there is a node
+ * that starts at the specified index. If no such node exists, a new
+ * node at the specified index is created. Returns the new node.
+ *
+ * idx must be aligned to a mask boundary (a multiple of MASK_BITS).
+ */
+static struct node *node_split(struct sparsebit *s, sparsebit_idx_t idx)
+{
+ struct node *nodep1, *nodep2;
+ sparsebit_idx_t offset;
+ sparsebit_num_t orig_num_after;
+
+ assert(!(idx % MASK_BITS));
+
+ /*
+ * Is there a node that describes the setting of idx?
+ * If not, add it.
+ */
+ nodep1 = node_find(s, idx);
+ if (!nodep1)
+ return node_add(s, idx);
+
+ /*
+ * All done if the starting index of the node is where the
+ * split should occur.
+ */
+ if (nodep1->idx == idx)
+ return nodep1;
+
+ /*
+ * Split point not at start of mask, so it must be part of
+ * bits described by num_after.
+ */
+
+ /*
+ * Calculate offset within num_after for where the split is
+ * to occur.
+ */
+ offset = idx - (nodep1->idx + MASK_BITS);
+ orig_num_after = nodep1->num_after;
+
+ /*
+ * Add a new node to describe the bits starting at
+ * the split point.
+ */
+ nodep1->num_after = offset;
+ nodep2 = node_add(s, idx);
+
+ /* Move bits after the split point into the new node */
+ nodep2->num_after = orig_num_after - offset;
+ if (nodep2->num_after >= MASK_BITS) {
+ nodep2->mask = ~(mask_t) 0;
+ nodep2->num_after -= MASK_BITS;
+ } else {
+ nodep2->mask = (1 << nodep2->num_after) - 1;
+ nodep2->num_after = 0;
+ }
+
+ return nodep2;
+}
+
+/* Iteratively reduces the node pointed to by nodep and its adjacent
+ * nodes into a more compact form. For example, a node with a mask with
+ * all bits set adjacent to a previous node, will get combined into a
+ * single node with an increased num_after setting.
+ *
+ * After each reduction, a further check is made to see if additional
+ * reductions are possible with the new previous and next nodes. Note,
+ * a search for a reduction is only done across the nodes nearest nodep
+ * and those that became part of a reduction. Reductions beyond nodep
+ * and the adjacent nodes that are reduced are not discovered. It is the
+ * responsibility of the caller to pass a nodep that is within one node
+ * of each possible reduction.
+ *
+ * This function does not fix the temporary violation of all invariants.
+ * For example it does not fix the case where the bit settings described
+ * by two or more nodes overlap. Such a violation introduces the potential
+ * complication of a bit setting for a specific index having different settings
+ * in different nodes. This would then introduce the further complication
+ * of which node has the correct setting of the bit and thus such conditions
+ * are not allowed.
+ *
+ * This function is designed to fix invariant violations that are introduced
+ * by node_split() and by changes to the node's mask or num_after members.
+ * For example, when setting a bit within a node's mask, the function that
+ * sets the bit doesn't have to worry about whether the setting of that
+ * bit caused the mask to have leading only or trailing only bits set.
+ * Instead, the function can call node_reduce(), with nodep equal to the
+ * node address that it set a mask bit in, and node_reduce() will notice
+ * the cases of leading or trailing only bits and that there is an
+ * adjacent node that the bit settings could be merged into.
+ *
+ * This implementation specifically detects and corrects violation of the
+ * following invariants:
+ *
+ * + Nodes are only used to represent bits that are set.
+ * Nodes with a mask of 0 and num_after of 0 are not allowed.
+ *
+ * + The setting of at least one bit is always described in a node's
+ * mask (mask >= 1).
+ *
+ * + A node with all mask bits set only occurs when the last bit
+ * described by the previous node is not equal to this node's
+ * starting index - 1. All such occurrences of this condition are
+ * avoided by moving the setting of the node's mask bits into
+ * the previous node's num_after setting.
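+ *
+ * For example, with MASK_BITS == 32, a node at idx 0 with mask ~0
+ * followed by an adjacent node at idx 32 with mask ~0 is reduced to a
+ * single node at idx 0 with mask ~0 and num_after 32, which describes
+ * bits 0:63.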
+ */
+static void node_reduce(struct sparsebit *s, struct node *nodep)
+{
+ bool reduction_performed;
+
+ do {
+ reduction_performed = false;
+ struct node *prev, *next, *tmp;
+
+ /* 1) Potential reductions within the current node. */
+
+ /* Nodes with all bits cleared may be removed. */
+ if (nodep->mask == 0 && nodep->num_after == 0) {
+ /*
+ * About to remove the node pointed to by
+ * nodep, which normally would cause a problem
+ * for the next pass through the reduction loop,
+ * because the node at the starting point no longer
+ * exists. This potential problem is handled
+ * by first remembering the location of the next
+ * or previous nodes. Doesn't matter which, because
+ * once the node at nodep is removed, there will be
+ * no other nodes between prev and next.
+ *
+ * Note, the checks performed on nodep against
+ * both prev and next look for an adjacent
+ * node that can be reduced into a single node.
+ * As such, after removing the node at nodep, it
+ * doesn't matter whether the nodep for the next
+ * pass through the loop is the previous pass's
+ * prev or next node. Either way, on the next
+ * pass the one not selected will become either
+ * the prev or next node.
+ */
+ tmp = node_next(s, nodep);
+ if (!tmp)
+ tmp = node_prev(s, nodep);
+
+ node_rm(s, nodep);
+ nodep = NULL;
+
+ nodep = tmp;
+ reduction_performed = true;
+ continue;
+ }
+
+ /*
+ * When the mask is 0, can reduce the amount of num_after
+ * bits by moving the initial num_after bits into the mask.
+ */
+ if (nodep->mask == 0) {
+ assert(nodep->num_after != 0);
+ assert(nodep->idx + MASK_BITS > nodep->idx);
+
+ nodep->idx += MASK_BITS;
+
+ if (nodep->num_after >= MASK_BITS) {
+ nodep->mask = ~0;
+ nodep->num_after -= MASK_BITS;
+ } else {
+ nodep->mask = (1u << nodep->num_after) - 1;
+ nodep->num_after = 0;
+ }
+
+ reduction_performed = true;
+ continue;
+ }
+
+ /*
+ * 2) Potential reductions between the current and
+ * previous nodes.
+ */
+ prev = node_prev(s, nodep);
+ if (prev) {
+ sparsebit_idx_t prev_highest_bit;
+
+ /* Nodes with no bits set can be removed. */
+ if (prev->mask == 0 && prev->num_after == 0) {
+ node_rm(s, prev);
+
+ reduction_performed = true;
+ continue;
+ }
+
+ /*
+ * All mask bits set and previous node has
+ * adjacent index.
+ */
+ if (nodep->mask + 1 == 0 &&
+ prev->idx + MASK_BITS == nodep->idx) {
+ prev->num_after += MASK_BITS + nodep->num_after;
+ nodep->mask = 0;
+ nodep->num_after = 0;
+
+ reduction_performed = true;
+ continue;
+ }
+
+ /*
+ * Is node adjacent to previous node and the node
+ * contains a single contiguous range of bits
+ * starting from the beginning of the mask?
+ */
+ prev_highest_bit = prev->idx + MASK_BITS - 1 + prev->num_after;
+ if (prev_highest_bit + 1 == nodep->idx &&
+ (nodep->mask | (nodep->mask >> 1)) == nodep->mask) {
+ /*
+ * How many contiguous bits are there?
+ * It equals the total number of set
+ * bits, due to the earlier check that
+ * there is a single contiguous range of
+ * set bits.
+ */
+ unsigned int num_contiguous
+ = __builtin_popcount(nodep->mask);
+ assert((num_contiguous > 0) &&
+ ((1ULL << num_contiguous) - 1) == nodep->mask);
+
+ prev->num_after += num_contiguous;
+ nodep->mask = 0;
+
+ /*
+ * For predictable performance, handle special
+ * case where all mask bits are set and there
+ * is a non-zero num_after setting. This code
+ * is functionally correct without the following
+ * conditionalized statements, but without them
+ * the value of num_after is only reduced by
+ * the number of mask bits per pass. There are
+ * cases where num_after can be close to 2^64.
+ * Without this code it could take nearly
+ * (2^64) / 32 passes to perform the full
+ * reduction.
+ */
+ if (num_contiguous == MASK_BITS) {
+ prev->num_after += nodep->num_after;
+ nodep->num_after = 0;
+ }
+
+ reduction_performed = true;
+ continue;
+ }
+ }
+
+ /*
+ * 3) Potential reductions between the current and
+ * next nodes.
+ */
+ next = node_next(s, nodep);
+ if (next) {
+ /* Nodes with no bits set can be removed. */
+ if (next->mask == 0 && next->num_after == 0) {
+ node_rm(s, next);
+ reduction_performed = true;
+ continue;
+ }
+
+ /*
+ * Is next node index adjacent to current node
+ * and has a mask with all bits set?
+ */
+ if (next->idx == nodep->idx + MASK_BITS + nodep->num_after &&
+ next->mask == ~(mask_t) 0) {
+ nodep->num_after += MASK_BITS;
+ next->mask = 0;
+ nodep->num_after += next->num_after;
+ next->num_after = 0;
+
+ node_rm(s, next);
+ next = NULL;
+
+ reduction_performed = true;
+ continue;
+ }
+ }
+ } while (nodep && reduction_performed);
+}
+
+/* Returns whether the bit at the index given by idx within the
+ * sparsebit array is set.
+ */
+bool sparsebit_is_set(struct sparsebit *s, sparsebit_idx_t idx)
+{
+ struct node *nodep;
+
+ /* Find the node that describes the setting of the bit at idx */
+ for (nodep = s->root; nodep;
+ nodep = nodep->idx > idx ? nodep->left : nodep->right)
+ if (idx >= nodep->idx &&
+ idx <= nodep->idx + MASK_BITS + nodep->num_after - 1)
+ goto have_node;
+
+ return false;
+
+have_node:
+ /* Bit is set if it is any of the bits described by num_after */
+ if (nodep->num_after && idx >= nodep->idx + MASK_BITS)
+ return true;
+
+ /* Is the corresponding mask bit set */
+ assert(idx >= nodep->idx && idx - nodep->idx < MASK_BITS);
+ return !!(nodep->mask & (1 << (idx - nodep->idx)));
+}
+
+/* Within the sparsebit array pointed to by s, sets the bit
+ * at the index given by idx.
+ */
+static void bit_set(struct sparsebit *s, sparsebit_idx_t idx)
+{
+ struct node *nodep;
+
+ /* Skip bits that are already set */
+ if (sparsebit_is_set(s, idx))
+ return;
+
+ /*
+ * Get a node where the bit at idx is described by the mask.
+ * node_split() will also create a node, if there isn't
+ * already a node that describes the setting of the bit.
+ */
+ nodep = node_split(s, idx & -MASK_BITS);
+
+ /* Set the bit within the nodes mask */
+ assert(idx >= nodep->idx && idx <= nodep->idx + MASK_BITS - 1);
+ assert(!(nodep->mask & (1 << (idx - nodep->idx))));
+ nodep->mask |= 1 << (idx - nodep->idx);
+ s->num_set++;
+
+ node_reduce(s, nodep);
+}
+
+/* Within the sparsebit array pointed to by s, clears the bit
+ * at the index given by idx.
+ */
+static void bit_clear(struct sparsebit *s, sparsebit_idx_t idx)
+{
+ struct node *nodep;
+
+ /* Skip bits that are already cleared */
+ if (!sparsebit_is_set(s, idx))
+ return;
+
+ /* Is there a node that describes the setting of this bit? */
+ nodep = node_find(s, idx);
+ if (!nodep)
+ return;
+
+ /*
+ * If idx is one of the num_after bits, split the node so that
+ * the bit is part of a node's mask.
+ */
+ if (idx >= nodep->idx + MASK_BITS)
+ nodep = node_split(s, idx & -MASK_BITS);
+
+ /*
+ * After node_split above, bit at idx should be within the mask.
+ * Clear that bit.
+ */
+ assert(idx >= nodep->idx && idx <= nodep->idx + MASK_BITS - 1);
+ assert(nodep->mask & (1 << (idx - nodep->idx)));
+ nodep->mask &= ~(1 << (idx - nodep->idx));
+ assert(s->num_set > 0 || sparsebit_all_set(s));
+ s->num_set--;
+
+ node_reduce(s, nodep);
+}
+
+/* Recursively dumps to the FILE stream given by stream the contents
+ * of the sub-tree of nodes pointed to by nodep. Each line of output
+ * is prefixed by the number of spaces given by indent. On each
+ * recursion, the indent amount is increased by 2. This causes nodes
+ * at each level deeper into the binary search tree to be displayed
+ * with a greater indent.
+ */
+static void dump_nodes(FILE *stream, struct node *nodep,
+ unsigned int indent)
+{
+ char *node_type;
+
+ /* Dump contents of node */
+ if (!nodep->parent)
+ node_type = "root";
+ else if (nodep == nodep->parent->left)
+ node_type = "left";
+ else {
+ assert(nodep == nodep->parent->right);
+ node_type = "right";
+ }
+ fprintf(stream, "%*s---- %s nodep: %p\n", indent, "", node_type, nodep);
+ fprintf(stream, "%*s parent: %p left: %p right: %p\n", indent, "",
+ nodep->parent, nodep->left, nodep->right);
+ fprintf(stream, "%*s idx: 0x%lx mask: 0x%x num_after: 0x%lx\n",
+ indent, "", nodep->idx, nodep->mask, nodep->num_after);
+
+ /* If present, dump contents of left child nodes */
+ if (nodep->left)
+ dump_nodes(stream, nodep->left, indent + 2);
+
+ /* If present, dump contents of right child nodes */
+ if (nodep->right)
+ dump_nodes(stream, nodep->right, indent + 2);
+}
+
+static inline sparsebit_idx_t node_first_set(struct node *nodep, int start)
+{
+ mask_t leading = (mask_t)1 << start;
+ int n1 = __builtin_ctz(nodep->mask & -leading);
+
+ return nodep->idx + n1;
+}
+
+static inline sparsebit_idx_t node_first_clear(struct node *nodep, int start)
+{
+ mask_t leading = (mask_t)1 << start;
+ int n1 = __builtin_ctz(~nodep->mask & -leading);
+
+ return nodep->idx + n1;
+}
+
+/* Dumps to the FILE stream specified by stream, the implementation dependent
+ * internal state of s. Each line of output is prefixed with the number
+ * of spaces given by indent. The output is completely implementation
+ * dependent and subject to change. Output from this function should only
+ * be used for diagnostic purposes. For example, this function can be
+ * used by test cases after they detect an unexpected condition, as a means
+ * to capture diagnostic information.
+ */
+static void sparsebit_dump_internal(FILE *stream, struct sparsebit *s,
+ unsigned int indent)
+{
+ /* Dump the contents of s */
+ fprintf(stream, "%*sroot: %p\n", indent, "", s->root);
+ fprintf(stream, "%*snum_set: 0x%lx\n", indent, "", s->num_set);
+
+ if (s->root)
+ dump_nodes(stream, s->root, indent);
+}
+
+/* Allocates and returns a new sparsebit array. The initial state
+ * of the newly allocated sparsebit array has all bits cleared.
+ */
+struct sparsebit *sparsebit_alloc(void)
+{
+ struct sparsebit *s;
+
+ /* Allocate top level structure. */
+ s = calloc(1, sizeof(*s));
+ if (!s) {
+ perror("calloc");
+ abort();
+ }
+
+ return s;
+}
+
+/* Frees the implementation dependent data for the sparsebit array
+ * pointed to by s and poisons the pointer to that data.
+ */
+void sparsebit_free(struct sparsebit **sbitp)
+{
+ struct sparsebit *s = *sbitp;
+
+ if (!s)
+ return;
+
+ sparsebit_clear_all(s);
+ free(s);
+ *sbitp = NULL;
+}
+
+/* Makes a copy of the sparsebit array given by s, to the sparsebit
+ * array given by d. Note, d must have already been allocated via
+ * sparsebit_alloc(). It may already have bits set, in which case
+ * they are cleared before the copy.
+ */
+void sparsebit_copy(struct sparsebit *d, struct sparsebit *s)
+{
+ /* First clear any bits already set in the destination */
+ sparsebit_clear_all(d);
+
+ if (s->root) {
+ d->root = node_copy_subtree(s->root);
+ d->num_set = s->num_set;
+ }
+}
+
+/* Returns whether num consecutive bits starting at idx are all set. */
+bool sparsebit_is_set_num(struct sparsebit *s,
+ sparsebit_idx_t idx, sparsebit_num_t num)
+{
+ sparsebit_idx_t next_cleared;
+
+ assert(num > 0);
+ assert(idx + num - 1 >= idx);
+
+ /* With num > 0, the first bit must be set. */
+ if (!sparsebit_is_set(s, idx))
+ return false;
+
+ /* Find the next cleared bit */
+ next_cleared = sparsebit_next_clear(s, idx);
+
+ /*
+ * If no cleared bits beyond idx, then there are at least num
+ * set bits. idx + num doesn't wrap. Otherwise check if
+ * there are enough set bits between idx and the next cleared bit.
+ */
+ return next_cleared == 0 || next_cleared - idx >= num;
+}
+
+/* Returns whether the bit at the index given by idx is cleared. */
+bool sparsebit_is_clear(struct sparsebit *s,
+ sparsebit_idx_t idx)
+{
+ return !sparsebit_is_set(s, idx);
+}
+
+/* Returns whether num consecutive bits starting at idx are all cleared. */
+bool sparsebit_is_clear_num(struct sparsebit *s,
+ sparsebit_idx_t idx, sparsebit_num_t num)
+{
+ sparsebit_idx_t next_set;
+
+ assert(num > 0);
+ assert(idx + num - 1 >= idx);
+
+ /* With num > 0, the first bit must be cleared. */
+ if (!sparsebit_is_clear(s, idx))
+ return false;
+
+ /* Find the next set bit */
+ next_set = sparsebit_next_set(s, idx);
+
+ /*
+ * If no set bits beyond idx, then there are at least num
+ * cleared bits. idx + num doesn't wrap. Otherwise check if
+ * there are enough cleared bits between idx and the next set bit.
+ */
+ return next_set == 0 || next_set - idx >= num;
+}
+
+/* Returns the total number of bits set. Note: 0 is also returned for
+ * the case of all bits set. This is because with all bits set, there
+ * is 1 additional bit set beyond what can be represented in the return
+ * value. Use sparsebit_any_set(), instead of sparsebit_num_set() > 0,
+ * to determine if the sparsebit array has any bits set.
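+ *
+ * For example, after sparsebit_set_all() this function returns 0, even
+ * though every bit is set, because the true count (2^64) cannot be
+ * represented in a sparsebit_num_t.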
+ */
+sparsebit_num_t sparsebit_num_set(struct sparsebit *s)
+{
+ return s->num_set;
+}
+
+/* Returns whether any bit is set in the sparsebit array. */
+bool sparsebit_any_set(struct sparsebit *s)
+{
+ /*
+ * Nodes only describe set bits. If any nodes then there
+ * is at least 1 bit set.
+ */
+ if (!s->root)
+ return false;
+
+ /*
+ * Every node should have a non-zero mask. For now will
+ * just assure that the root node has a non-zero mask,
+ * which is a quick check that at least 1 bit is set.
+ */
+ assert(s->root->mask != 0);
+ assert(s->num_set > 0 ||
+ (s->root->num_after == ((sparsebit_num_t) 0) - MASK_BITS &&
+ s->root->mask == ~(mask_t) 0));
+
+ return true;
+}
+
+/* Returns whether all the bits in the sparsebit array are cleared. */
+bool sparsebit_all_clear(struct sparsebit *s)
+{
+ return !sparsebit_any_set(s);
+}
+
+/* Returns whether any of the bits in the sparsebit array are cleared. */
+bool sparsebit_any_clear(struct sparsebit *s)
+{
+ return !sparsebit_all_set(s);
+}
+
+/* Returns the index of the first set bit. Aborts if no bits are set.
+ */
+sparsebit_idx_t sparsebit_first_set(struct sparsebit *s)
+{
+ struct node *nodep;
+
+ /* Validate at least 1 bit is set */
+ assert(sparsebit_any_set(s));
+
+ nodep = node_first(s);
+ return node_first_set(nodep, 0);
+}
+
+/* Returns the index of the first cleared bit. Aborts if
+ * no bits are cleared.
+ */
+sparsebit_idx_t sparsebit_first_clear(struct sparsebit *s)
+{
+ struct node *nodep1, *nodep2;
+
+ /* Validate at least 1 bit is cleared. */
+ assert(sparsebit_any_clear(s));
+
+ /* If no nodes or first node index > 0 then lowest cleared is 0 */
+ nodep1 = node_first(s);
+ if (!nodep1 || nodep1->idx > 0)
+ return 0;
+
+ /* Does the mask in the first node contain any cleared bits? */
+ if (nodep1->mask != ~(mask_t) 0)
+ return node_first_clear(nodep1, 0);
+
+ /*
+ * All mask bits set in first node. If there isn't a second node
+ * then the first cleared bit is the first bit after the bits
+ * described by the first node.
+ */
+ nodep2 = node_next(s, nodep1);
+ if (!nodep2) {
+ /*
+ * No second node. First cleared bit is first bit beyond
+ * bits described by first node.
+ */
+ assert(nodep1->mask == ~(mask_t) 0);
+ assert(nodep1->idx + MASK_BITS + nodep1->num_after != (sparsebit_idx_t) 0);
+ return nodep1->idx + MASK_BITS + nodep1->num_after;
+ }
+
+ /*
+ * There is a second node.
+ * If it is not adjacent to the first node, then there is a gap
+ * of cleared bits between the nodes, and the first cleared bit
+ * is the first bit within the gap.
+ */
+ if (nodep1->idx + MASK_BITS + nodep1->num_after != nodep2->idx)
+ return nodep1->idx + MASK_BITS + nodep1->num_after;
+
+ /*
+ * Second node is adjacent to the first node.
+ * Because it is adjacent, its mask should be non-zero. If all
+ * its mask bits are set, then with it being adjacent, it should
+ * have had the mask bits moved into the num_after setting of the
+ * previous node.
+ */
+ return node_first_clear(nodep2, 0);
+}
+
+/* Returns index of next bit set within s after the index given by prev.
+ * Returns 0 if there are no bits after prev that are set.
+ */
+sparsebit_idx_t sparsebit_next_set(struct sparsebit *s,
+ sparsebit_idx_t prev)
+{
+ sparsebit_idx_t lowest_possible = prev + 1;
+ sparsebit_idx_t start;
+ struct node *nodep;
+
+ /* A bit after the highest index can't be set. */
+ if (lowest_possible == 0)
+ return 0;
+
+ /*
+ * Find the leftmost 'candidate' overlapping or to the right
+ * of lowest_possible.
+ */
+ struct node *candidate = NULL;
+
+ /* True iff lowest_possible is within candidate */
+ bool contains = false;
+
+ /*
+ * Find node that describes setting of bit at lowest_possible.
+ * If such a node doesn't exist, find the node with the lowest
+ * starting index that is > lowest_possible.
+ */
+ for (nodep = s->root; nodep;) {
+ if ((nodep->idx + MASK_BITS + nodep->num_after - 1)
+ >= lowest_possible) {
+ candidate = nodep;
+ if (candidate->idx <= lowest_possible) {
+ contains = true;
+ break;
+ }
+ nodep = nodep->left;
+ } else {
+ nodep = nodep->right;
+ }
+ }
+ if (!candidate)
+ return 0;
+
+ assert(candidate->mask != 0);
+
+ /* Does the candidate node describe the setting of lowest_possible? */
+ if (!contains) {
+ /*
+ * Candidate doesn't describe setting of bit at lowest_possible.
+ * Candidate points to the first node with a starting index
+ * > lowest_possible.
+ */
+ assert(candidate->idx > lowest_possible);
+
+ return node_first_set(candidate, 0);
+ }
+
+ /*
+ * Candidate describes setting of bit at lowest_possible.
+ * Note: although the node describes the setting of the bit
+ * at lowest_possible, it's possible that its setting and the
+ * setting of all later bits described by this node are 0.
+ * For now, just handle the cases where this node describes
+ * a bit at or after an index of lowest_possible that is set.
+ */
+ start = lowest_possible - candidate->idx;
+
+ if (start < MASK_BITS && candidate->mask >= (1 << start))
+ return node_first_set(candidate, start);
+
+ if (candidate->num_after) {
+ sparsebit_idx_t first_num_after_idx = candidate->idx + MASK_BITS;
+
+ return lowest_possible < first_num_after_idx
+ ? first_num_after_idx : lowest_possible;
+ }
+
+ /*
+ * Although candidate node describes setting of bit at
+ * the index of lowest_possible, all bits at that index and
+ * later that are described by candidate are cleared. With
+ * this, the next bit is the first bit in the next node, if
+ * such a node exists. If a next node doesn't exist, then
+ * there is no next set bit.
+ */
+ candidate = node_next(s, candidate);
+ if (!candidate)
+ return 0;
+
+ return node_first_set(candidate, 0);
+}
+
+/* Returns index of next bit cleared within s after the index given by prev.
+ * Returns 0 if there are no bits after prev that are cleared.
+ */
+sparsebit_idx_t sparsebit_next_clear(struct sparsebit *s,
+ sparsebit_idx_t prev)
+{
+ sparsebit_idx_t lowest_possible = prev + 1;
+ sparsebit_idx_t idx;
+ struct node *nodep1, *nodep2;
+
+ /* A bit after the highest index can't be set. */
+ if (lowest_possible == 0)
+ return 0;
+
+ /*
+ * Does a node describing the setting of lowest_possible exist?
+ * If not, the bit at lowest_possible is cleared.
+ */
+ nodep1 = node_find(s, lowest_possible);
+ if (!nodep1)
+ return lowest_possible;
+
+ /* Does a mask bit in node 1 describe the next cleared bit? */
+ for (idx = lowest_possible - nodep1->idx; idx < MASK_BITS; idx++)
+ if (!(nodep1->mask & (1 << idx)))
+ return nodep1->idx + idx;
+
+ /*
+ * Next cleared bit is not described by node 1. If there
+ * isn't a next node, then next cleared bit is described
+ * by bit after the bits described by the first node.
+ */
+ nodep2 = node_next(s, nodep1);
+ if (!nodep2)
+ return nodep1->idx + MASK_BITS + nodep1->num_after;
+
+ /*
+ * There is a second node.
+ * If it is not adjacent to the first node, then there is a gap
+ * of cleared bits between the nodes, and the next cleared bit
+ * is the first bit within the gap.
+ */
+ if (nodep1->idx + MASK_BITS + nodep1->num_after != nodep2->idx)
+ return nodep1->idx + MASK_BITS + nodep1->num_after;
+
+ /*
+ * Second node is adjacent to the first node.
+ * Because it is adjacent, its mask should be non-zero. If all
+ * its mask bits are set, then with it being adjacent, it should
+ * have had the mask bits moved into the num_after setting of the
+ * previous node.
+ */
+ return node_first_clear(nodep2, 0);
+}
+
+/* Starting with the index 1 greater than the index given by start, finds
+ * and returns the index of the first sequence of num consecutively set
+ * bits. Returns a value of 0 if no such sequence exists.
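+ *
+ * For example, a caller searching for a run of 16 consecutive set bits
+ * somewhere after index 0x1000 might do
+ *
+ *	idx = sparsebit_next_set_num(s, 0x1000, 16);
+ *
+ * and treat a return value of 0 as "no such run exists".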
+ */
+sparsebit_idx_t sparsebit_next_set_num(struct sparsebit *s,
+ sparsebit_idx_t start, sparsebit_num_t num)
+{
+ sparsebit_idx_t idx;
+
+ assert(num >= 1);
+
+ for (idx = sparsebit_next_set(s, start);
+ idx != 0 && idx + num - 1 >= idx;
+ idx = sparsebit_next_set(s, idx)) {
+ assert(sparsebit_is_set(s, idx));
+
+ /*
+ * Does the sequence of bits starting at idx consist of
+ * num set bits?
+ */
+ if (sparsebit_is_set_num(s, idx, num))
+ return idx;
+
+ /*
+ * Sequence of set bits at idx isn't large enough.
+ * Skip this entire sequence of set bits.
+ */
+ idx = sparsebit_next_clear(s, idx);
+ if (idx == 0)
+ return 0;
+ }
+
+ return 0;
+}
+
+/* Starting with the index 1 greater than the index given by start, finds
+ * and returns the index of the first sequence of num consecutively cleared
+ * bits. Returns a value of 0 if no such sequence exists.
+ */
+sparsebit_idx_t sparsebit_next_clear_num(struct sparsebit *s,
+ sparsebit_idx_t start, sparsebit_num_t num)
+{
+ sparsebit_idx_t idx;
+
+ assert(num >= 1);
+
+ for (idx = sparsebit_next_clear(s, start);
+ idx != 0 && idx + num - 1 >= idx;
+ idx = sparsebit_next_clear(s, idx)) {
+ assert(sparsebit_is_clear(s, idx));
+
+ /*
+ * Does the sequence of bits starting at idx consist of
+ * num cleared bits?
+ */
+ if (sparsebit_is_clear_num(s, idx, num))
+ return idx;
+
+ /*
+ * Sequence of cleared bits at idx isn't large enough.
+ * Skip this entire sequence of cleared bits.
+ */
+ idx = sparsebit_next_set(s, idx);
+ if (idx == 0)
+ return 0;
+ }
+
+ return 0;
+}
+
+/* Sets the bits in the inclusive range start through start + num - 1. */
+void sparsebit_set_num(struct sparsebit *s,
+ sparsebit_idx_t start, sparsebit_num_t num)
+{
+ struct node *nodep, *next;
+ unsigned int n1;
+ sparsebit_idx_t idx;
+ sparsebit_num_t n;
+ sparsebit_idx_t middle_start, middle_end;
+
+ assert(num > 0);
+ assert(start + num - 1 >= start);
+
+ /*
+ * Leading - bits before first mask boundary.
+ *
+ * TODO(lhuemill): With some effort it may be possible to
+ * replace the following loop with a sequential sequence
+ * of statements. High level sequence would be:
+ *
+ * 1. Use node_split() to force node that describes setting
+ * of idx to be within the mask portion of a node.
+ * 2. Form mask of bits to be set.
+ * 3. Determine number of mask bits already set in the node
+ * and store in a local variable named num_already_set.
+ * 4. Set the appropriate mask bits within the node.
+ * 5. Increment struct sparsebit_pvt num_set member
+ * by the number of bits that were actually set.
+ * Exclude from the counts bits that were already set.
+ * 6. Before returning to the caller, use node_reduce() to
+ * handle the multiple corner cases that this method
+ * introduces.
+ */
+ for (idx = start, n = num; n > 0 && idx % MASK_BITS != 0; idx++, n--)
+ bit_set(s, idx);
+
+ /* Middle - bits spanning one or more entire mask */
+ middle_start = idx;
+ middle_end = middle_start + (n & -MASK_BITS) - 1;
+ if (n >= MASK_BITS) {
+ nodep = node_split(s, middle_start);
+
+ /*
+ * As needed, split just after end of middle bits.
+ * No split needed if end of middle bits is at highest
+ * supported bit index.
+ */
+ if (middle_end + 1 > middle_end)
+ (void) node_split(s, middle_end + 1);
+
+ /* Delete nodes that only describe bits within the middle. */
+ for (next = node_next(s, nodep);
+ next && (next->idx < middle_end);
+ next = node_next(s, nodep)) {
+ assert(next->idx + MASK_BITS + next->num_after - 1 <= middle_end);
+ node_rm(s, next);
+ next = NULL;
+ }
+
+ /* As needed set each of the mask bits */
+ for (n1 = 0; n1 < MASK_BITS; n1++) {
+ if (!(nodep->mask & (1 << n1))) {
+ nodep->mask |= 1 << n1;
+ s->num_set++;
+ }
+ }
+
+ s->num_set -= nodep->num_after;
+ nodep->num_after = middle_end - middle_start + 1 - MASK_BITS;
+ s->num_set += nodep->num_after;
+
+ node_reduce(s, nodep);
+ }
+ idx = middle_end + 1;
+ n -= middle_end - middle_start + 1;
+
+ /* Trailing - bits at and beyond last mask boundary */
+ assert(n < MASK_BITS);
+ for (; n > 0; idx++, n--)
+ bit_set(s, idx);
+}
+
+/* Clears the bits in the inclusive range start through start + num - 1. */
+void sparsebit_clear_num(struct sparsebit *s,
+ sparsebit_idx_t start, sparsebit_num_t num)
+{
+ struct node *nodep, *next;
+ unsigned int n1;
+ sparsebit_idx_t idx;
+ sparsebit_num_t n;
+ sparsebit_idx_t middle_start, middle_end;
+
+ assert(num > 0);
+ assert(start + num - 1 >= start);
+
+ /* Leading - bits before first mask boundary */
+ for (idx = start, n = num; n > 0 && idx % MASK_BITS != 0; idx++, n--)
+ bit_clear(s, idx);
+
+ /* Middle - bits spanning one or more entire mask */
+ middle_start = idx;
+ middle_end = middle_start + (n & -MASK_BITS) - 1;
+ if (n >= MASK_BITS) {
+ nodep = node_split(s, middle_start);
+
+ /*
+ * As needed, split just after end of middle bits.
+ * No split needed if end of middle bits is at highest
+ * supported bit index.
+ */
+ if (middle_end + 1 > middle_end)
+ (void) node_split(s, middle_end + 1);
+
+ /* Delete nodes that only describe bits within the middle. */
+ for (next = node_next(s, nodep);
+ next && (next->idx < middle_end);
+ next = node_next(s, nodep)) {
+ assert(next->idx + MASK_BITS + next->num_after - 1 <= middle_end);
+ node_rm(s, next);
+ next = NULL;
+ }
+
+ /* As needed clear each of the mask bits */
+ for (n1 = 0; n1 < MASK_BITS; n1++) {
+ if (nodep->mask & (1 << n1)) {
+ nodep->mask &= ~(1 << n1);
+ s->num_set--;
+ }
+ }
+
+ /* Clear any bits described by num_after */
+ s->num_set -= nodep->num_after;
+ nodep->num_after = 0;
+
+ /*
+ * Delete the node that describes the beginning of
+ * the middle bits and perform any allowed reductions
+ * with the nodes prev or next of nodep.
+ */
+ node_reduce(s, nodep);
+ nodep = NULL;
+ }
+ idx = middle_end + 1;
+ n -= middle_end - middle_start + 1;
+
+ /* Trailing - bits at and beyond last mask boundary */
+ assert(n < MASK_BITS);
+ for (; n > 0; idx++, n--)
+ bit_clear(s, idx);
+}
+
+/* Sets the bit at the index given by idx. */
+void sparsebit_set(struct sparsebit *s, sparsebit_idx_t idx)
+{
+ sparsebit_set_num(s, idx, 1);
+}
+
+/* Clears the bit at the index given by idx. */
+void sparsebit_clear(struct sparsebit *s, sparsebit_idx_t idx)
+{
+ sparsebit_clear_num(s, idx, 1);
+}
+
+/* Sets the bits in the entire addressable range of the sparsebit array. */
+void sparsebit_set_all(struct sparsebit *s)
+{
+ sparsebit_set(s, 0);
+ sparsebit_set_num(s, 1, ~(sparsebit_idx_t) 0);
+ assert(sparsebit_all_set(s));
+}
+
+/* Clears the bits in the entire addressable range of the sparsebit array. */
+void sparsebit_clear_all(struct sparsebit *s)
+{
+ sparsebit_clear(s, 0);
+ sparsebit_clear_num(s, 1, ~(sparsebit_idx_t) 0);
+ assert(!sparsebit_any_set(s));
+}
+
+static size_t display_range(FILE *stream, sparsebit_idx_t low,
+ sparsebit_idx_t high, bool prepend_comma_space)
+{
+ char *fmt_str;
+ size_t sz;
+
+ /* Determine the printf format string */
+ if (low == high)
+ fmt_str = prepend_comma_space ? ", 0x%lx" : "0x%lx";
+ else
+ fmt_str = prepend_comma_space ? ", 0x%lx:0x%lx" : "0x%lx:0x%lx";
+
+ /*
+ * When stream is NULL, just determine the size of what would
+ * have been printed, else print the range.
+ */
+ if (!stream)
+ sz = snprintf(NULL, 0, fmt_str, low, high);
+ else
+ sz = fprintf(stream, fmt_str, low, high);
+
+ return sz;
+}
+
+
+/* Dumps to the FILE stream given by stream, the bit settings
+ * of s. Each line of output is prefixed with the number of
+ * spaces given by indent. The length of each line is implementation
+ * dependent and does not depend on the indent amount. The following
+ * is an example output of a sparsebit array that has bits:
+ *
+ * 0x5, 0x8, 0xa:0xe, 0x12
+ *
+ * This corresponds to a sparsebit whose bits 5, 8, 10, 11, 12, 13, 14, 18
+ * are set. Note that a ':', instead of a '-' is used to specify a range of
+ * contiguous bits. This is done because '-' is used to specify command-line
+ * options, and sometimes ranges are specified as command-line arguments.
+ */
+void sparsebit_dump(FILE *stream, struct sparsebit *s,
+ unsigned int indent)
+{
+ size_t current_line_len = 0;
+ size_t sz;
+ struct node *nodep;
+
+ if (!sparsebit_any_set(s))
+ return;
+
+ /* Display initial indent */
+ fprintf(stream, "%*s", indent, "");
+
+ /* For each node */
+ for (nodep = node_first(s); nodep; nodep = node_next(s, nodep)) {
+ unsigned int n1;
+ sparsebit_idx_t low, high;
+
+ /* For each group of bits in the mask */
+ for (n1 = 0; n1 < MASK_BITS; n1++) {
+ if (nodep->mask & (1 << n1)) {
+ low = high = nodep->idx + n1;
+
+ for (; n1 < MASK_BITS; n1++) {
+ if (nodep->mask & (1 << n1))
+ high = nodep->idx + n1;
+ else
+ break;
+ }
+
+ if ((n1 == MASK_BITS) && nodep->num_after)
+ high += nodep->num_after;
+
+ /*
+ * How much room will it take to display
+ * this range?
+ */
+ sz = display_range(NULL, low, high,
+ current_line_len != 0);
+
+ /*
+ * If there is not enough room, display
+ * a newline plus the indent of the next
+ * line.
+ */
+ if (current_line_len + sz > DUMP_LINE_MAX) {
+ fputs("\n", stream);
+ fprintf(stream, "%*s", indent, "");
+ current_line_len = 0;
+ }
+
+ /* Display the range */
+ sz = display_range(stream, low, high,
+ current_line_len != 0);
+ current_line_len += sz;
+ }
+ }
+
+ /*
+ * If num_after is non-zero and the most significant bit of the
+ * mask is not set, then we still need to display a range for the
+ * bits described by num_after.
+ */
+ if (!(nodep->mask & (1 << (MASK_BITS - 1))) && nodep->num_after) {
+ low = nodep->idx + MASK_BITS;
+ high = nodep->idx + MASK_BITS + nodep->num_after - 1;
+
+ /*
+ * How much room will it take to display
+ * this range?
+ */
+ sz = display_range(NULL, low, high,
+ current_line_len != 0);
+
+ /*
+ * If there is not enough room, display
+ * a newline plus the indent of the next
+ * line.
+ */
+ if (current_line_len + sz > DUMP_LINE_MAX) {
+ fputs("\n", stream);
+ fprintf(stream, "%*s", indent, "");
+ current_line_len = 0;
+ }
+
+ /* Display the range */
+ sz = display_range(stream, low, high,
+ current_line_len != 0);
+ current_line_len += sz;
+ }
+ }
+ fputs("\n", stream);
+}
+
+/* Validates the internal state of the sparsebit array given by
+ * s. On error, diagnostic information is printed to stderr and
+ * abort is called.
+ */
+void sparsebit_validate_internal(struct sparsebit *s)
+{
+ bool error_detected = false;
+ struct node *nodep, *prev = NULL;
+ sparsebit_num_t total_bits_set = 0;
+ unsigned int n1;
+
+ /* For each node */
+ for (nodep = node_first(s); nodep;
+ prev = nodep, nodep = node_next(s, nodep)) {
+
+ /*
+ * Increase total bits set by the number of bits set
+ * in this node.
+ */
+ for (n1 = 0; n1 < MASK_BITS; n1++)
+ if (nodep->mask & (1 << n1))
+ total_bits_set++;
+
+ total_bits_set += nodep->num_after;
+
+ /*
+ * Whether a mask of 0 is allowed or not is an arbitrary choice.
+ * For diagnostic purposes it is beneficial to have only one
+ * valid means to represent a particular set of bits, so the
+ * choice here is to not allow a mask of zero.
+ */
+ if (nodep->mask == 0) {
+ fprintf(stderr, "Node mask of zero, "
+ "nodep: %p nodep->mask: 0x%x",
+ nodep, nodep->mask);
+ error_detected = true;
+ break;
+ }
+
+ /*
+ * Validate num_after is not greater than the max index
+ * - the number of mask bits. The num_after member
+ * uses 0-based indexing and thus has no value that
+ * represents all bits set. This limitation is handled
+ * by requiring a non-zero mask. With a non-zero mask,
+ * MASK_BITS worth of bits are described by the mask,
+ * which makes the largest needed num_after equal to:
+ *
+ * (~(sparsebit_num_t) 0) - MASK_BITS + 1
+ */
+ if (nodep->num_after
+ > (~(sparsebit_num_t) 0) - MASK_BITS + 1) {
+ fprintf(stderr, "num_after too large, "
+ "nodep: %p nodep->num_after: 0x%lx",
+ nodep, nodep->num_after);
+ error_detected = true;
+ break;
+ }
+
+ /* Validate node index is divisible by the mask size */
+ if (nodep->idx % MASK_BITS) {
+ fprintf(stderr, "Node index not divisible by "
+ "mask size,\n"
+ " nodep: %p nodep->idx: 0x%lx "
+ "MASK_BITS: %lu\n",
+ nodep, nodep->idx, MASK_BITS);
+ error_detected = true;
+ break;
+ }
+
+ /*
+ * Validate bits described by node don't wrap beyond the
+ * highest supported index.
+ */
+ if ((nodep->idx + MASK_BITS + nodep->num_after - 1) < nodep->idx) {
+ fprintf(stderr, "Bits described by node wrap "
+ "beyond highest supported index,\n"
+ " nodep: %p nodep->idx: 0x%lx\n"
+ " MASK_BITS: %lu nodep->num_after: 0x%lx",
+ nodep, nodep->idx, MASK_BITS, nodep->num_after);
+ error_detected = true;
+ break;
+ }
+
+ /* Check parent pointers. */
+ if (nodep->left) {
+ if (nodep->left->parent != nodep) {
+ fprintf(stderr, "Left child parent pointer "
+ "doesn't point to this node,\n"
+ " nodep: %p nodep->left: %p "
+ "nodep->left->parent: %p",
+ nodep, nodep->left,
+ nodep->left->parent);
+ error_detected = true;
+ break;
+ }
+ }
+
+ if (nodep->right) {
+ if (nodep->right->parent != nodep) {
+ fprintf(stderr, "Right child parent pointer "
+ "doesn't point to this node,\n"
+ " nodep: %p nodep->right: %p "
+ "nodep->right->parent: %p",
+ nodep, nodep->right,
+ nodep->right->parent);
+ error_detected = true;
+ break;
+ }
+ }
+
+ if (!nodep->parent) {
+ if (s->root != nodep) {
+ fprintf(stderr, "Unexpected root node, "
+ "s->root: %p nodep: %p",
+ s->root, nodep);
+ error_detected = true;
+ break;
+ }
+ }
+
+ if (prev) {
+ /*
+ * Is index of previous node before index of
+ * current node?
+ */
+ if (prev->idx >= nodep->idx) {
+ fprintf(stderr, "Previous node index "
+ ">= current node index,\n"
+ " prev: %p prev->idx: 0x%lx\n"
+ " nodep: %p nodep->idx: 0x%lx",
+ prev, prev->idx, nodep, nodep->idx);
+ error_detected = true;
+ break;
+ }
+
+ /*
+ * Nodes occur in ascending order, based on each
+ * node's starting index.
+ */
+ if ((prev->idx + MASK_BITS + prev->num_after - 1)
+ >= nodep->idx) {
+ fprintf(stderr, "Previous node bit range "
+ "overlap with current node bit range,\n"
+ " prev: %p prev->idx: 0x%lx "
+ "prev->num_after: 0x%lx\n"
+ " nodep: %p nodep->idx: 0x%lx "
+ "nodep->num_after: 0x%lx\n"
+ " MASK_BITS: %lu",
+ prev, prev->idx, prev->num_after,
+ nodep, nodep->idx, nodep->num_after,
+ MASK_BITS);
+ error_detected = true;
+ break;
+ }
+
+ /*
+ * When the node has all mask bits set, it shouldn't
+ * be adjacent to the last bit described by the
+ * previous node.
+ */
+ if (nodep->mask == ~(mask_t) 0 &&
+ prev->idx + MASK_BITS + prev->num_after == nodep->idx) {
+ fprintf(stderr, "Current node has mask with "
+ "all bits set and is adjacent to the "
+ "previous node,\n"
+ " prev: %p prev->idx: 0x%lx "
+ "prev->num_after: 0x%lx\n"
+ " nodep: %p nodep->idx: 0x%lx "
+ "nodep->num_after: 0x%lx\n"
+ " MASK_BITS: %lu",
+ prev, prev->idx, prev->num_after,
+ nodep, nodep->idx, nodep->num_after,
+ MASK_BITS);
+
+ error_detected = true;
+ break;
+ }
+ }
+ }
+
+ if (!error_detected) {
+ /*
+ * Is the sum of bits set in each node equal to the count
+ * of total bits set?
+ */
+ if (s->num_set != total_bits_set) {
+ fprintf(stderr, "Number of bits set mismatch,\n"
+ " s->num_set: 0x%lx total_bits_set: 0x%lx",
+ s->num_set, total_bits_set);
+
+ error_detected = true;
+ }
+ }
+
+ if (error_detected) {
+ fputs(" dump_internal:\n", stderr);
+ sparsebit_dump_internal(stderr, s, 4);
+ abort();
+ }
+}
+
+
+#ifdef FUZZ
+/* A simple but effective fuzzing driver. It looks for bugs with the help
+ * of some invariants and a trivial reference representation of the sparsebit.
+ * Just use 512 bytes of /dev/zero and /dev/urandom as inputs, and let
+ * afl-fuzz do the magic. :)
+ */
+
+#include <stdlib.h>
+#include <assert.h>
+
+struct range {
+ sparsebit_idx_t first, last;
+ bool set;
+};
+
+struct sparsebit *s;
+struct range ranges[1000];
+int num_ranges;
+
+static bool get_value(sparsebit_idx_t idx)
+{
+ int i;
+
+ for (i = num_ranges; --i >= 0; )
+ if (ranges[i].first <= idx && idx <= ranges[i].last)
+ return ranges[i].set;
+
+ return false;
+}
+
+static void operate(int code, sparsebit_idx_t first, sparsebit_idx_t last)
+{
+ sparsebit_num_t num;
+ sparsebit_idx_t next;
+
+ if (first < last) {
+ num = last - first + 1;
+ } else {
+ num = first - last + 1;
+ first = last;
+ last = first + num - 1;
+ }
+
+ switch (code) {
+ case 0:
+ sparsebit_set(s, first);
+ assert(sparsebit_is_set(s, first));
+ assert(!sparsebit_is_clear(s, first));
+ assert(sparsebit_any_set(s));
+ assert(!sparsebit_all_clear(s));
+ if (get_value(first))
+ return;
+ if (num_ranges == 1000)
+ exit(0);
+ ranges[num_ranges++] = (struct range)
+ { .first = first, .last = first, .set = true };
+ break;
+ case 1:
+ sparsebit_clear(s, first);
+ assert(!sparsebit_is_set(s, first));
+ assert(sparsebit_is_clear(s, first));
+ assert(sparsebit_any_clear(s));
+ assert(!sparsebit_all_set(s));
+ if (!get_value(first))
+ return;
+ if (num_ranges == 1000)
+ exit(0);
+ ranges[num_ranges++] = (struct range)
+ { .first = first, .last = first, .set = false };
+ break;
+ case 2:
+ assert(sparsebit_is_set(s, first) == get_value(first));
+ assert(sparsebit_is_clear(s, first) == !get_value(first));
+ break;
+ case 3:
+ if (sparsebit_any_set(s))
+ assert(get_value(sparsebit_first_set(s)));
+ if (sparsebit_any_clear(s))
+ assert(!get_value(sparsebit_first_clear(s)));
+ sparsebit_set_all(s);
+ assert(!sparsebit_any_clear(s));
+ assert(sparsebit_all_set(s));
+ num_ranges = 0;
+ ranges[num_ranges++] = (struct range)
+ { .first = 0, .last = ~(sparsebit_idx_t)0, .set = true };
+ break;
+ case 4:
+ if (sparsebit_any_set(s))
+ assert(get_value(sparsebit_first_set(s)));
+ if (sparsebit_any_clear(s))
+ assert(!get_value(sparsebit_first_clear(s)));
+ sparsebit_clear_all(s);
+ assert(!sparsebit_any_set(s));
+ assert(sparsebit_all_clear(s));
+ num_ranges = 0;
+ break;
+ case 5:
+ next = sparsebit_next_set(s, first);
+ assert(next == 0 || next > first);
+ assert(next == 0 || get_value(next));
+ break;
+ case 6:
+ next = sparsebit_next_clear(s, first);
+ assert(next == 0 || next > first);
+ assert(next == 0 || !get_value(next));
+ break;
+ case 7:
+ next = sparsebit_next_clear(s, first);
+ if (sparsebit_is_set_num(s, first, num)) {
+ assert(next == 0 || next > last);
+ if (first)
+ next = sparsebit_next_set(s, first - 1);
+ else if (sparsebit_any_set(s))
+ next = sparsebit_first_set(s);
+ else
+ return;
+ assert(next == first);
+ } else {
+ assert(sparsebit_is_clear(s, first) || next <= last);
+ }
+ break;
+ case 8:
+ next = sparsebit_next_set(s, first);
+ if (sparsebit_is_clear_num(s, first, num)) {
+ assert(next == 0 || next > last);
+ if (first)
+ next = sparsebit_next_clear(s, first - 1);
+ else if (sparsebit_any_clear(s))
+ next = sparsebit_first_clear(s);
+ else
+ return;
+ assert(next == first);
+ } else {
+ assert(sparsebit_is_set(s, first) || next <= last);
+ }
+ break;
+ case 9:
+ sparsebit_set_num(s, first, num);
+ assert(sparsebit_is_set_num(s, first, num));
+ assert(!sparsebit_is_clear_num(s, first, num));
+ assert(sparsebit_any_set(s));
+ assert(!sparsebit_all_clear(s));
+ if (num_ranges == 1000)
+ exit(0);
+ ranges[num_ranges++] = (struct range)
+ { .first = first, .last = last, .set = true };
+ break;
+ case 10:
+ sparsebit_clear_num(s, first, num);
+ assert(!sparsebit_is_set_num(s, first, num));
+ assert(sparsebit_is_clear_num(s, first, num));
+ assert(sparsebit_any_clear(s));
+ assert(!sparsebit_all_set(s));
+ if (num_ranges == 1000)
+ exit(0);
+ ranges[num_ranges++] = (struct range)
+ { .first = first, .last = last, .set = false };
+ break;
+ case 11:
+ sparsebit_validate_internal(s);
+ break;
+ default:
+ break;
+ }
+}
+
+unsigned char get8(void)
+{
+ int ch;
+
+ ch = getchar();
+ if (ch == EOF)
+ exit(0);
+ return ch;
+}
+
+uint64_t get64(void)
+{
+ uint64_t x;
+
+ x = get8();
+ x = (x << 8) | get8();
+ x = (x << 8) | get8();
+ x = (x << 8) | get8();
+ x = (x << 8) | get8();
+ x = (x << 8) | get8();
+ x = (x << 8) | get8();
+ return (x << 8) | get8();
+}
+
+int main(void)
+{
+ s = sparsebit_alloc();
+ for (;;) {
+ uint8_t op = get8() & 0xf;
+ uint64_t first = get64();
+ uint64_t last = get64();
+
+ operate(op, first, last);
+ }
+}
+#endif
diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c
new file mode 100644
index 000000000..8e04c0b16
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tools/testing/selftests/kvm/lib/test_util.c
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "test_util.h"
+
+/*
+ * Parses "[0-9]+[kmgtb]?".
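+ *
+ * For example:
+ *
+ *	parse_size("512") == 512
+ *	parse_size("4k")  == 4096
+ *	parse_size("1g")  == 1 << 30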
+ */
+size_t parse_size(const char *size)
+{
+ size_t base;
+ char *scale;
+ int shift = 0;
+
+ TEST_ASSERT(size && isdigit(size[0]), "Need at least one digit in '%s'", size);
+
+ base = strtoull(size, &scale, 0);
+
+ TEST_ASSERT(base != ULLONG_MAX, "Overflow parsing size!");
+
+ switch (tolower(*scale)) {
+ case 't':
+ shift = 40;
+ break;
+ case 'g':
+ shift = 30;
+ break;
+ case 'm':
+ shift = 20;
+ break;
+ case 'k':
+ shift = 10;
+ break;
+ case 'b':
+ case '\0':
+ shift = 0;
+ break;
+ default:
+ TEST_ASSERT(false, "Unknown size letter %c", *scale);
+ }
+
+ TEST_ASSERT((base << shift) >> shift == base, "Overflow scaling size!");
+
+ return base << shift;
+}
+
+int64_t timespec_to_ns(struct timespec ts)
+{
+ return (int64_t)ts.tv_nsec + 1000000000LL * (int64_t)ts.tv_sec;
+}
+
+struct timespec timespec_add_ns(struct timespec ts, int64_t ns)
+{
+ struct timespec res;
+
+ res.tv_nsec = ts.tv_nsec + ns;
+ res.tv_sec = ts.tv_sec + res.tv_nsec / 1000000000LL;
+ res.tv_nsec %= 1000000000LL;
+
+ return res;
+}
+
+struct timespec timespec_add(struct timespec ts1, struct timespec ts2)
+{
+ int64_t ns1 = timespec_to_ns(ts1);
+ int64_t ns2 = timespec_to_ns(ts2);
+ return timespec_add_ns((struct timespec){0}, ns1 + ns2);
+}
+
+struct timespec timespec_sub(struct timespec ts1, struct timespec ts2)
+{
+ int64_t ns1 = timespec_to_ns(ts1);
+ int64_t ns2 = timespec_to_ns(ts2);
+ return timespec_add_ns((struct timespec){0}, ns1 - ns2);
+}
+
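+/*
+ * Returns the time elapsed since start. For example, a test can time a
+ * section of code with
+ *
+ *	struct timespec start, delta;
+ *
+ *	clock_gettime(CLOCK_MONOTONIC, &start);
+ *	...
+ *	delta = timespec_diff_now(start);
+ *
+ * and then report delta.tv_sec and delta.tv_nsec.
+ */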
+struct timespec timespec_diff_now(struct timespec start)
+{
+ struct timespec end;
+
+ clock_gettime(CLOCK_MONOTONIC, &end);
+ return timespec_sub(end, start);
+}
+
+struct timespec timespec_div(struct timespec ts, int divisor)
+{
+ int64_t ns = timespec_to_ns(ts) / divisor;
+
+ return timespec_add_ns((struct timespec){0}, ns);
+}
+
+void print_skip(const char *fmt, ...)
+{
+ va_list ap;
+
+ assert(fmt);
+ va_start(ap, fmt);
+ vprintf(fmt, ap);
+ va_end(ap);
+ puts(", skipping test");
+}
diff --git a/tools/testing/selftests/kvm/lib/x86_64/handlers.S b/tools/testing/selftests/kvm/lib/x86_64/handlers.S
new file mode 100644
index 000000000..aaf7bc7d2
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/x86_64/handlers.S
@@ -0,0 +1,81 @@
+handle_exception:
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+ push %r11
+ push %r10
+ push %r9
+ push %r8
+
+ push %rdi
+ push %rsi
+ push %rbp
+ push %rbx
+ push %rdx
+ push %rcx
+ push %rax
+ mov %rsp, %rdi
+
+ call route_exception
+
+ pop %rax
+ pop %rcx
+ pop %rdx
+ pop %rbx
+ pop %rbp
+ pop %rsi
+ pop %rdi
+ pop %r8
+ pop %r9
+ pop %r10
+ pop %r11
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+
+ /* Discard vector and error code. */
+ add $16, %rsp
+ iretq
+
+/*
+ * Build the handle_exception wrappers, which push the vector/error code
+ * onto the stack, and an array of pointers to those wrappers.
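+ *
+ * For example, the wrapper generated for a vector without an error code
+ * pushes a dummy error code of 0 followed by the vector number:
+ *
+ *	pushq	$0
+ *	pushq	$vector
+ *	jmp	handle_exception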
+ */
+.pushsection .rodata
+.globl idt_handlers
+idt_handlers:
+.popsection
+
+.macro HANDLERS has_error from to
+ vector = \from
+ .rept \to - \from + 1
+ .align 8
+
+ /* Fetch current address and append it to idt_handlers. */
+ current_handler = .
+.pushsection .rodata
+.quad current_handler
+.popsection
+
+ .if ! \has_error
+ pushq $0
+ .endif
+ pushq $vector
+ jmp handle_exception
+ vector = vector + 1
+ .endr
+.endm
+
+.global idt_handler_code
+idt_handler_code:
+ HANDLERS has_error=0 from=0 to=7
+ HANDLERS has_error=1 from=8 to=8
+ HANDLERS has_error=0 from=9 to=9
+ HANDLERS has_error=1 from=10 to=14
+ HANDLERS has_error=0 from=15 to=16
+ HANDLERS has_error=1 from=17 to=17
+ HANDLERS has_error=0 from=18 to=255
+
+.section .note.GNU-stack, "", %progbits
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
new file mode 100644
index 000000000..f5d2d27be
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -0,0 +1,1258 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tools/testing/selftests/kvm/lib/x86_64/processor.c
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "../kvm_util_internal.h"
+#include "processor.h"
+
+#ifndef NUM_INTERRUPTS
+#define NUM_INTERRUPTS 256
+#endif
+
+#define DEFAULT_CODE_SELECTOR 0x8
+#define DEFAULT_DATA_SELECTOR 0x10
+
+/* Minimum physical address used for virtual translation tables. */
+#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000
+
+vm_vaddr_t exception_handlers;
+
+/* Virtual translation table structure declarations */
+struct pageMapL4Entry {
+ uint64_t present:1;
+ uint64_t writable:1;
+ uint64_t user:1;
+ uint64_t write_through:1;
+ uint64_t cache_disable:1;
+ uint64_t accessed:1;
+ uint64_t ignored_06:1;
+ uint64_t page_size:1;
+ uint64_t ignored_11_08:4;
+ uint64_t address:40;
+ uint64_t ignored_62_52:11;
+ uint64_t execute_disable:1;
+};
+
+struct pageDirectoryPointerEntry {
+ uint64_t present:1;
+ uint64_t writable:1;
+ uint64_t user:1;
+ uint64_t write_through:1;
+ uint64_t cache_disable:1;
+ uint64_t accessed:1;
+ uint64_t ignored_06:1;
+ uint64_t page_size:1;
+ uint64_t ignored_11_08:4;
+ uint64_t address:40;
+ uint64_t ignored_62_52:11;
+ uint64_t execute_disable:1;
+};
+
+struct pageDirectoryEntry {
+ uint64_t present:1;
+ uint64_t writable:1;
+ uint64_t user:1;
+ uint64_t write_through:1;
+ uint64_t cache_disable:1;
+ uint64_t accessed:1;
+ uint64_t ignored_06:1;
+ uint64_t page_size:1;
+ uint64_t ignored_11_08:4;
+ uint64_t address:40;
+ uint64_t ignored_62_52:11;
+ uint64_t execute_disable:1;
+};
+
+struct pageTableEntry {
+ uint64_t present:1;
+ uint64_t writable:1;
+ uint64_t user:1;
+ uint64_t write_through:1;
+ uint64_t cache_disable:1;
+ uint64_t accessed:1;
+ uint64_t dirty:1;
+ uint64_t reserved_07:1;
+ uint64_t global:1;
+ uint64_t ignored_11_09:3;
+ uint64_t address:40;
+ uint64_t ignored_62_52:11;
+ uint64_t execute_disable:1;
+};
+
+void regs_dump(FILE *stream, struct kvm_regs *regs,
+ uint8_t indent)
+{
+ fprintf(stream, "%*srax: 0x%.16llx rbx: 0x%.16llx "
+ "rcx: 0x%.16llx rdx: 0x%.16llx\n",
+ indent, "",
+ regs->rax, regs->rbx, regs->rcx, regs->rdx);
+ fprintf(stream, "%*srsi: 0x%.16llx rdi: 0x%.16llx "
+ "rsp: 0x%.16llx rbp: 0x%.16llx\n",
+ indent, "",
+ regs->rsi, regs->rdi, regs->rsp, regs->rbp);
+ fprintf(stream, "%*sr8: 0x%.16llx r9: 0x%.16llx "
+ "r10: 0x%.16llx r11: 0x%.16llx\n",
+ indent, "",
+ regs->r8, regs->r9, regs->r10, regs->r11);
+ fprintf(stream, "%*sr12: 0x%.16llx r13: 0x%.16llx "
+ "r14: 0x%.16llx r15: 0x%.16llx\n",
+ indent, "",
+ regs->r12, regs->r13, regs->r14, regs->r15);
+ fprintf(stream, "%*srip: 0x%.16llx rfl: 0x%.16llx\n",
+ indent, "",
+ regs->rip, regs->rflags);
+}
+
+/*
+ * Segment Dump
+ *
+ * Input Args:
+ * stream - Output FILE stream
+ * segment - KVM segment
+ * indent - Left margin indent amount
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Dumps the state of the KVM segment given by @segment, to the FILE stream
+ * given by @stream.
+ */
+static void segment_dump(FILE *stream, struct kvm_segment *segment,
+ uint8_t indent)
+{
+ fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.8x "
+ "selector: 0x%.4x type: 0x%.2x\n",
+ indent, "", segment->base, segment->limit,
+ segment->selector, segment->type);
+ fprintf(stream, "%*spresent: 0x%.2x dpl: 0x%.2x "
+ "db: 0x%.2x s: 0x%.2x l: 0x%.2x\n",
+ indent, "", segment->present, segment->dpl,
+ segment->db, segment->s, segment->l);
+ fprintf(stream, "%*sg: 0x%.2x avl: 0x%.2x "
+ "unusable: 0x%.2x padding: 0x%.2x\n",
+ indent, "", segment->g, segment->avl,
+ segment->unusable, segment->padding);
+}
+
+/*
+ * dtable Dump
+ *
+ * Input Args:
+ * stream - Output FILE stream
+ * dtable - KVM dtable
+ * indent - Left margin indent amount
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Dumps the state of the KVM dtable given by @dtable, to the FILE stream
+ * given by @stream.
+ */
+static void dtable_dump(FILE *stream, struct kvm_dtable *dtable,
+ uint8_t indent)
+{
+ fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.4x "
+ "padding: 0x%.4x 0x%.4x 0x%.4x\n",
+ indent, "", dtable->base, dtable->limit,
+ dtable->padding[0], dtable->padding[1], dtable->padding[2]);
+}
+
+void sregs_dump(FILE *stream, struct kvm_sregs *sregs,
+ uint8_t indent)
+{
+ unsigned int i;
+
+ fprintf(stream, "%*scs:\n", indent, "");
+ segment_dump(stream, &sregs->cs, indent + 2);
+ fprintf(stream, "%*sds:\n", indent, "");
+ segment_dump(stream, &sregs->ds, indent + 2);
+ fprintf(stream, "%*ses:\n", indent, "");
+ segment_dump(stream, &sregs->es, indent + 2);
+ fprintf(stream, "%*sfs:\n", indent, "");
+ segment_dump(stream, &sregs->fs, indent + 2);
+ fprintf(stream, "%*sgs:\n", indent, "");
+ segment_dump(stream, &sregs->gs, indent + 2);
+ fprintf(stream, "%*sss:\n", indent, "");
+ segment_dump(stream, &sregs->ss, indent + 2);
+ fprintf(stream, "%*str:\n", indent, "");
+ segment_dump(stream, &sregs->tr, indent + 2);
+ fprintf(stream, "%*sldt:\n", indent, "");
+ segment_dump(stream, &sregs->ldt, indent + 2);
+
+ fprintf(stream, "%*sgdt:\n", indent, "");
+ dtable_dump(stream, &sregs->gdt, indent + 2);
+ fprintf(stream, "%*sidt:\n", indent, "");
+ dtable_dump(stream, &sregs->idt, indent + 2);
+
+ fprintf(stream, "%*scr0: 0x%.16llx cr2: 0x%.16llx "
+ "cr3: 0x%.16llx cr4: 0x%.16llx\n",
+ indent, "",
+ sregs->cr0, sregs->cr2, sregs->cr3, sregs->cr4);
+ fprintf(stream, "%*scr8: 0x%.16llx efer: 0x%.16llx "
+ "apic_base: 0x%.16llx\n",
+ indent, "",
+ sregs->cr8, sregs->efer, sregs->apic_base);
+
+ fprintf(stream, "%*sinterrupt_bitmap:\n", indent, "");
+ for (i = 0; i < (KVM_NR_INTERRUPTS + 63) / 64; i++) {
+ fprintf(stream, "%*s%.16llx\n", indent + 2, "",
+ sregs->interrupt_bitmap[i]);
+ }
+}
+
+void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot)
+{
+ TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
+ "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
+
+ /* If needed, create page map l4 table. */
+ if (!vm->pgd_created) {
+ vm_paddr_t paddr = vm_phy_page_alloc(vm,
+ KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot);
+ vm->pgd = paddr;
+ vm->pgd_created = true;
+ }
+}
+
+void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
+ uint32_t pgd_memslot)
+{
+ uint16_t index[4];
+ struct pageMapL4Entry *pml4e;
+
+ TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
+ "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
+
+ TEST_ASSERT((vaddr % vm->page_size) == 0,
+ "Virtual address not on page boundary,\n"
+ " vaddr: 0x%lx vm->page_size: 0x%x",
+ vaddr, vm->page_size);
+ TEST_ASSERT(sparsebit_is_set(vm->vpages_valid,
+ (vaddr >> vm->page_shift)),
+ "Invalid virtual address, vaddr: 0x%lx",
+ vaddr);
+ TEST_ASSERT((paddr % vm->page_size) == 0,
+ "Physical address not on page boundary,\n"
+ " paddr: 0x%lx vm->page_size: 0x%x",
+ paddr, vm->page_size);
+ TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn,
+ "Physical address beyond maximum supported,\n"
+ " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
+ paddr, vm->max_gfn, vm->page_size);
+
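+ /*
+ * With 4-level, 4K paging, index[3] selects the page map level 4
+ * entry (vaddr bits 47:39), index[2] the page directory pointer
+ * entry (bits 38:30), index[1] the page directory entry
+ * (bits 29:21) and index[0] the page table entry (bits 20:12).
+ */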
+ index[0] = (vaddr >> 12) & 0x1ffu;
+ index[1] = (vaddr >> 21) & 0x1ffu;
+ index[2] = (vaddr >> 30) & 0x1ffu;
+ index[3] = (vaddr >> 39) & 0x1ffu;
+
+ /* Allocate page directory pointer table if not present. */
+ pml4e = addr_gpa2hva(vm, vm->pgd);
+ if (!pml4e[index[3]].present) {
+ pml4e[index[3]].address = vm_phy_page_alloc(vm,
+ KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot)
+ >> vm->page_shift;
+ pml4e[index[3]].writable = true;
+ pml4e[index[3]].present = true;
+ }
+
+ /* Allocate page directory table if not present. */
+ struct pageDirectoryPointerEntry *pdpe;
+ pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size);
+ if (!pdpe[index[2]].present) {
+ pdpe[index[2]].address = vm_phy_page_alloc(vm,
+ KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot)
+ >> vm->page_shift;
+ pdpe[index[2]].writable = true;
+ pdpe[index[2]].present = true;
+ }
+
+ /* Allocate page table if not present. */
+ struct pageDirectoryEntry *pde;
+ pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size);
+ if (!pde[index[1]].present) {
+ pde[index[1]].address = vm_phy_page_alloc(vm,
+ KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot)
+ >> vm->page_shift;
+ pde[index[1]].writable = true;
+ pde[index[1]].present = true;
+ }
+
+ /* Fill in page table entry. */
+ struct pageTableEntry *pte;
+ pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size);
+ pte[index[0]].address = paddr >> vm->page_shift;
+ pte[index[0]].writable = true;
+ pte[index[0]].present = 1;
+}
+
+void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
+{
+ struct pageMapL4Entry *pml4e, *pml4e_start;
+ struct pageDirectoryPointerEntry *pdpe, *pdpe_start;
+ struct pageDirectoryEntry *pde, *pde_start;
+ struct pageTableEntry *pte, *pte_start;
+
+ if (!vm->pgd_created)
+ return;
+
+ fprintf(stream, "%*s "
+ " no\n", indent, "");
+ fprintf(stream, "%*s index hvaddr gpaddr "
+ "addr w exec dirty\n",
+ indent, "");
+ pml4e_start = (struct pageMapL4Entry *) addr_gpa2hva(vm,
+ vm->pgd);
+ for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) {
+ pml4e = &pml4e_start[n1];
+ if (!pml4e->present)
+ continue;
+ fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10lx %u "
+ " %u\n",
+ indent, "",
+ pml4e - pml4e_start, pml4e,
+ addr_hva2gpa(vm, pml4e), (uint64_t) pml4e->address,
+ pml4e->writable, pml4e->execute_disable);
+
+ pdpe_start = addr_gpa2hva(vm, pml4e->address
+ * vm->page_size);
+ for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) {
+ pdpe = &pdpe_start[n2];
+ if (!pdpe->present)
+ continue;
+ fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10lx "
+ "%u %u\n",
+ indent, "",
+ pdpe - pdpe_start, pdpe,
+ addr_hva2gpa(vm, pdpe),
+ (uint64_t) pdpe->address, pdpe->writable,
+ pdpe->execute_disable);
+
+ pde_start = addr_gpa2hva(vm,
+ pdpe->address * vm->page_size);
+ for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) {
+ pde = &pde_start[n3];
+ if (!pde->present)
+ continue;
+ fprintf(stream, "%*spde 0x%-3zx %p "
+ "0x%-12lx 0x%-10lx %u %u\n",
+ indent, "", pde - pde_start, pde,
+ addr_hva2gpa(vm, pde),
+ (uint64_t) pde->address, pde->writable,
+ pde->execute_disable);
+
+ pte_start = addr_gpa2hva(vm,
+ pde->address * vm->page_size);
+ for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) {
+ pte = &pte_start[n4];
+ if (!pte->present)
+ continue;
+ fprintf(stream, "%*spte 0x%-3zx %p "
+ "0x%-12lx 0x%-10lx %u %u "
+ " %u 0x%-10lx\n",
+ indent, "",
+ pte - pte_start, pte,
+ addr_hva2gpa(vm, pte),
+ (uint64_t) pte->address,
+ pte->writable,
+ pte->execute_disable,
+ pte->dirty,
+ ((uint64_t) n1 << 27)
+ | ((uint64_t) n2 << 18)
+ | ((uint64_t) n3 << 9)
+ | ((uint64_t) n4));
+ }
+ }
+ }
+ }
+}
+
+/*
+ * Set Unusable Segment
+ *
+ * Input Args: None
+ *
+ * Output Args:
+ * segp - Pointer to segment register
+ *
+ * Return: None
+ *
+ * Sets the segment register pointed to by @segp to an unusable state.
+ */
+static void kvm_seg_set_unusable(struct kvm_segment *segp)
+{
+ memset(segp, 0, sizeof(*segp));
+ segp->unusable = true;
+}
+
+static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp)
+{
+ void *gdt = addr_gva2hva(vm, vm->gdt);
+ struct desc64 *desc = gdt + (segp->selector >> 3) * 8;
+
+ desc->limit0 = segp->limit & 0xFFFF;
+ desc->base0 = segp->base & 0xFFFF;
+ desc->base1 = segp->base >> 16;
+ desc->type = segp->type;
+ desc->s = segp->s;
+ desc->dpl = segp->dpl;
+ desc->p = segp->present;
+ desc->limit1 = segp->limit >> 16;
+ desc->avl = segp->avl;
+ desc->l = segp->l;
+ desc->db = segp->db;
+ desc->g = segp->g;
+ desc->base2 = segp->base >> 24;
+ if (!segp->s)
+ desc->base3 = segp->base >> 32;
+}
+
+
+/*
+ * Set Long Mode Flat Kernel Code Segment
+ *
+ * Input Args:
+ * vm - VM whose GDT is being filled, or NULL to only write segp
+ * selector - selector value
+ *
+ * Output Args:
+ * segp - Pointer to KVM segment
+ *
+ * Return: None
+ *
+ * Sets up the KVM segment pointed to by @segp, to be a code segment
+ * with the selector value given by @selector.
+ */
+static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector,
+ struct kvm_segment *segp)
+{
+ memset(segp, 0, sizeof(*segp));
+ segp->selector = selector;
+ segp->limit = 0xFFFFFFFFu;
+ segp->s = 0x1; /* kTypeCodeData */
+ segp->type = 0x08 | 0x01 | 0x02; /* kFlagCode | kFlagCodeAccessed
+ * | kFlagCodeReadable
+ */
+ segp->g = true;
+ segp->l = true;
+ segp->present = 1;
+ if (vm)
+ kvm_seg_fill_gdt_64bit(vm, segp);
+}
+
+/*
+ * Set Long Mode Flat Kernel Data Segment
+ *
+ * Input Args:
+ * vm - VM whose GDT is being filled, or NULL to only write segp
+ * selector - selector value
+ *
+ * Output Args:
+ * segp - Pointer to KVM segment
+ *
+ * Return: None
+ *
+ * Sets up the KVM segment pointed to by @segp, to be a data segment
+ * with the selector value given by @selector.
+ */
+static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector,
+ struct kvm_segment *segp)
+{
+ memset(segp, 0, sizeof(*segp));
+ segp->selector = selector;
+ segp->limit = 0xFFFFFFFFu;
+ segp->s = 0x1; /* kTypeCodeData */
+ segp->type = 0x00 | 0x01 | 0x02; /* kFlagData | kFlagDataAccessed
+ * | kFlagDataWritable
+ */
+ segp->g = true;
+ segp->present = true;
+ if (vm)
+ kvm_seg_fill_gdt_64bit(vm, segp);
+}
+
+vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+ uint16_t index[4];
+ struct pageMapL4Entry *pml4e;
+ struct pageDirectoryPointerEntry *pdpe;
+ struct pageDirectoryEntry *pde;
+ struct pageTableEntry *pte;
+
+ TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
+ "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
+
+ index[0] = (gva >> 12) & 0x1ffu;
+ index[1] = (gva >> 21) & 0x1ffu;
+ index[2] = (gva >> 30) & 0x1ffu;
+ index[3] = (gva >> 39) & 0x1ffu;
+
+ if (!vm->pgd_created)
+ goto unmapped_gva;
+ pml4e = addr_gpa2hva(vm, vm->pgd);
+ if (!pml4e[index[3]].present)
+ goto unmapped_gva;
+
+ pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size);
+ if (!pdpe[index[2]].present)
+ goto unmapped_gva;
+
+ pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size);
+ if (!pde[index[1]].present)
+ goto unmapped_gva;
+
+ pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size);
+ if (!pte[index[0]].present)
+ goto unmapped_gva;
+
+ return (pte[index[0]].address * vm->page_size) + (gva & 0xfffu);
+
+unmapped_gva:
+ TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva);
+ exit(EXIT_FAILURE);
+}
+
+static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt, int gdt_memslot,
+ int pgd_memslot)
+{
+ if (!vm->gdt)
+ vm->gdt = vm_vaddr_alloc(vm, getpagesize(),
+ KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot);
+
+ dt->base = vm->gdt;
+ dt->limit = getpagesize();
+}
+
+static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp,
+ int selector, int gdt_memslot,
+ int pgd_memslot)
+{
+ if (!vm->tss)
+ vm->tss = vm_vaddr_alloc(vm, getpagesize(),
+ KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot);
+
+ memset(segp, 0, sizeof(*segp));
+ segp->base = vm->tss;
+ segp->limit = 0x67;
+ segp->selector = selector;
+ segp->type = 0xb;
+ segp->present = 1;
+ kvm_seg_fill_gdt_64bit(vm, segp);
+}
+
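+/*
+ * Set mode-specific system registers for a vCPU: the GDT, segment
+ * registers, CR0/CR4/EFER for the selected guest mode, and CR3 pointing
+ * at the VM's page tables.
+ */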
+static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot)
+{
+ struct kvm_sregs sregs;
+
+ /* Set mode specific system register values. */
+ vcpu_sregs_get(vm, vcpuid, &sregs);
+
+ sregs.idt.limit = 0;
+
+ kvm_setup_gdt(vm, &sregs.gdt, gdt_memslot, pgd_memslot);
+
+ switch (vm->mode) {
+ case VM_MODE_PXXV48_4K:
+ sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG;
+ sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR;
+ sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX);
+
+ kvm_seg_set_unusable(&sregs.ldt);
+ kvm_seg_set_kernel_code_64bit(vm, DEFAULT_CODE_SELECTOR, &sregs.cs);
+ kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.ds);
+ kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.es);
+ kvm_setup_tss_64bit(vm, &sregs.tr, 0x18, gdt_memslot, pgd_memslot);
+ break;
+
+ default:
+ TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode);
+ }
+
+ sregs.cr3 = vm->pgd;
+ vcpu_sregs_set(vm, vcpuid, &sregs);
+}
+
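+/*
+ * Add a vCPU with reasonable defaults: a dedicated guest stack, system
+ * registers configured by vcpu_setup(), RIP pointing at @guest_code and
+ * the MP state set to runnable.
+ */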
+void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
+{
+ struct kvm_mp_state mp_state;
+ struct kvm_regs regs;
+ vm_vaddr_t stack_vaddr;
+ stack_vaddr = vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(),
+ DEFAULT_GUEST_STACK_VADDR_MIN, 0, 0);
+
+ /* Create VCPU */
+ vm_vcpu_add(vm, vcpuid);
+ vcpu_setup(vm, vcpuid, 0, 0);
+
+ /* Setup guest general purpose registers */
+ vcpu_regs_get(vm, vcpuid, &regs);
+ regs.rflags = regs.rflags | 0x2;
+ regs.rsp = stack_vaddr + (DEFAULT_STACK_PGS * getpagesize());
+ regs.rip = (unsigned long) guest_code;
+ vcpu_regs_set(vm, vcpuid, &regs);
+
+ /* Setup the MP state */
+ mp_state.mp_state = 0;
+ vcpu_set_mp_state(vm, vcpuid, &mp_state);
+}
+
+/*
+ * Allocate an instance of struct kvm_cpuid2
+ *
+ * Input Args: None
+ *
+ * Output Args: None
+ *
+ * Return: A pointer to the allocated struct. The caller is responsible
+ * for freeing this struct.
+ *
+ * Since kvm_cpuid2 uses a 0-length array to allow the size of the
+ * array to be decided at allocation time, allocation is slightly
+ * complicated. This function uses a reasonable default length for
+ * the array and performs the appropriate allocation.
+ */
+static struct kvm_cpuid2 *allocate_kvm_cpuid2(void)
+{
+ struct kvm_cpuid2 *cpuid;
+ int nent = 100;
+ size_t size;
+
+ size = sizeof(*cpuid);
+ size += nent * sizeof(struct kvm_cpuid_entry2);
+ cpuid = malloc(size);
+ if (!cpuid) {
+ perror("malloc");
+ abort();
+ }
+
+ cpuid->nent = nent;
+
+ return cpuid;
+}
+
+/*
+ * KVM Supported CPUID Get
+ *
+ * Input Args: None
+ *
+ * Output Args:
+ *
+ * Return: The supported KVM CPUID
+ *
+ * Get the guest CPUID supported by KVM.
+ */
+struct kvm_cpuid2 *kvm_get_supported_cpuid(void)
+{
+ static struct kvm_cpuid2 *cpuid;
+ int ret;
+ int kvm_fd;
+
+ if (cpuid)
+ return cpuid;
+
+ cpuid = allocate_kvm_cpuid2();
+ kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
+ if (kvm_fd < 0)
+ exit(KSFT_SKIP);
+
+ ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid);
+ TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_CPUID failed %d %d\n",
+ ret, errno);
+
+ close(kvm_fd);
+ return cpuid;
+}
+
+/*
+ * Locate a cpuid entry.
+ *
+ * Input Args:
+ * function: The function of the cpuid entry to find.
+ * index: The index of the cpuid entry.
+ *
+ * Output Args: None
+ *
+ * Return: A pointer to the cpuid entry. Never returns NULL.
+ */
+struct kvm_cpuid_entry2 *
+kvm_get_supported_cpuid_index(uint32_t function, uint32_t index)
+{
+ struct kvm_cpuid2 *cpuid;
+ struct kvm_cpuid_entry2 *entry = NULL;
+ int i;
+
+ cpuid = kvm_get_supported_cpuid();
+ for (i = 0; i < cpuid->nent; i++) {
+ if (cpuid->entries[i].function == function &&
+ cpuid->entries[i].index == index) {
+ entry = &cpuid->entries[i];
+ break;
+ }
+ }
+
+ TEST_ASSERT(entry, "Guest CPUID entry not found: (EAX=%x, ECX=%x).",
+ function, index);
+ return entry;
+}
+
+/*
+ * VM VCPU CPUID Set
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU id
+ * cpuid - The CPUID values to set.
+ *
+ * Output Args: None
+ *
+ * Return: void
+ *
+ * Set the VCPU's CPUID.
+ */
+void vcpu_set_cpuid(struct kvm_vm *vm,
+ uint32_t vcpuid, struct kvm_cpuid2 *cpuid)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int rc;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+ rc = ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid);
+ TEST_ASSERT(rc == 0, "KVM_SET_CPUID2 failed, rc: %i errno: %i",
+ rc, errno);
+}
+
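+/*
+ * Create a VM with a single vCPU running @guest_code, with enough extra
+ * pages reserved to back the page tables for @extra_mem_pages and with an
+ * in-kernel IRQ chip created.
+ */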
+struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
+ void *guest_code)
+{
+ struct kvm_vm *vm;
+ /*
+ * For x86, the maximum page table size for a memory region
+ * occurs when only 4K pages are used. In that case the
+ * total extra size for page tables (for extra N pages) will
+ * be: N/512+N/512^2+N/512^3+... which is definitely smaller
+ * than N/512*2.
+ */
+ uint64_t extra_pg_pages = extra_mem_pages / 512 * 2;
+
+ /* Create VM */
+ vm = vm_create(VM_MODE_DEFAULT,
+ DEFAULT_GUEST_PHY_PAGES + extra_pg_pages,
+ O_RDWR);
+
+ /* Setup guest code */
+ kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
+
+ /* Setup IRQ Chip */
+ vm_create_irqchip(vm);
+
+ /* Add the first vCPU. */
+ vm_vcpu_add_default(vm, vcpuid, guest_code);
+
+ return vm;
+}
+
+/*
+ * VCPU Get MSR
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ * msr_index - Index of MSR
+ *
+ * Output Args: None
+ *
+ * Return: On success, value of the MSR. On failure a TEST_ASSERT is produced.
+ *
+ * Get value of MSR for VCPU.
+ */
+uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ struct {
+ struct kvm_msrs header;
+ struct kvm_msr_entry entry;
+ } buffer = {};
+ int r;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+ buffer.header.nmsrs = 1;
+ buffer.entry.index = msr_index;
+ r = ioctl(vcpu->fd, KVM_GET_MSRS, &buffer.header);
+ TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n"
+ " rc: %i errno: %i", r, errno);
+
+ return buffer.entry.data;
+}
+
+/*
+ * _VCPU Set MSR
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ * msr_index - Index of MSR
+ * msr_value - New value of MSR
+ *
+ * Output Args: None
+ *
+ * Return: The result of KVM_SET_MSRS.
+ *
+ * Sets the value of an MSR for the given VCPU.
+ */
+int _vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
+ uint64_t msr_value)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ struct {
+ struct kvm_msrs header;
+ struct kvm_msr_entry entry;
+ } buffer = {};
+ int r;
+
+ TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+ memset(&buffer, 0, sizeof(buffer));
+ buffer.header.nmsrs = 1;
+ buffer.entry.index = msr_index;
+ buffer.entry.data = msr_value;
+ r = ioctl(vcpu->fd, KVM_SET_MSRS, &buffer.header);
+ return r;
+}
+
+/*
+ * VCPU Set MSR
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ * msr_index - Index of MSR
+ * msr_value - New value of MSR
+ *
+ * Output Args: None
+ *
+ * Return: On success, nothing. On failure a TEST_ASSERT is produced.
+ *
+ * Set value of MSR for VCPU.
+ */
+void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
+ uint64_t msr_value)
+{
+ int r;
+
+ r = _vcpu_set_msr(vm, vcpuid, msr_index, msr_value);
+ TEST_ASSERT(r == 1, "KVM_SET_MSRS IOCTL failed,\n"
+ " rc: %i errno: %i", r, errno);
+}
+
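+/*
+ * Pass up to six arguments to the guest function via the registers of the
+ * x86_64 SysV calling convention (RDI, RSI, RDX, RCX, R8, R9), e.g.
+ * vcpu_args_set(vm, vcpuid, 1, some_value) to pass a single argument.
+ */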
+void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...)
+{
+ va_list ap;
+ struct kvm_regs regs;
+
+ TEST_ASSERT(num >= 1 && num <= 6, "Unsupported number of args,\n"
+ " num: %u\n",
+ num);
+
+ va_start(ap, num);
+ vcpu_regs_get(vm, vcpuid, &regs);
+
+ if (num >= 1)
+ regs.rdi = va_arg(ap, uint64_t);
+
+ if (num >= 2)
+ regs.rsi = va_arg(ap, uint64_t);
+
+ if (num >= 3)
+ regs.rdx = va_arg(ap, uint64_t);
+
+ if (num >= 4)
+ regs.rcx = va_arg(ap, uint64_t);
+
+ if (num >= 5)
+ regs.r8 = va_arg(ap, uint64_t);
+
+ if (num >= 6)
+ regs.r9 = va_arg(ap, uint64_t);
+
+ vcpu_regs_set(vm, vcpuid, &regs);
+ va_end(ap);
+}
+
+void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent)
+{
+ struct kvm_regs regs;
+ struct kvm_sregs sregs;
+
+ fprintf(stream, "%*scpuid: %u\n", indent, "", vcpuid);
+
+ fprintf(stream, "%*sregs:\n", indent + 2, "");
+ vcpu_regs_get(vm, vcpuid, &regs);
+ regs_dump(stream, &regs, indent + 4);
+
+ fprintf(stream, "%*ssregs:\n", indent + 2, "");
+ vcpu_sregs_get(vm, vcpuid, &sregs);
+ sregs_dump(stream, &sregs, indent + 4);
+}
+
+struct kvm_x86_state {
+ struct kvm_vcpu_events events;
+ struct kvm_mp_state mp_state;
+ struct kvm_regs regs;
+ struct kvm_xsave xsave;
+ struct kvm_xcrs xcrs;
+ struct kvm_sregs sregs;
+ struct kvm_debugregs debugregs;
+ union {
+ struct kvm_nested_state nested;
+ char nested_[16384];
+ };
+ struct kvm_msrs msrs;
+};
+
+static int kvm_get_num_msrs_fd(int kvm_fd)
+{
+ struct kvm_msr_list nmsrs;
+ int r;
+
+ nmsrs.nmsrs = 0;
+ r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs);
+ TEST_ASSERT(r == -1 && errno == E2BIG, "Unexpected result from KVM_GET_MSR_INDEX_LIST probe, r: %i",
+ r);
+
+ return nmsrs.nmsrs;
+}
+
+static int kvm_get_num_msrs(struct kvm_vm *vm)
+{
+ return kvm_get_num_msrs_fd(vm->kvm_fd);
+}
+
+struct kvm_msr_list *kvm_get_msr_index_list(void)
+{
+ struct kvm_msr_list *list;
+ int nmsrs, r, kvm_fd;
+
+ kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
+ if (kvm_fd < 0)
+ exit(KSFT_SKIP);
+
+ nmsrs = kvm_get_num_msrs_fd(kvm_fd);
+ list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0]));
+ list->nmsrs = nmsrs;
+ r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
+ close(kvm_fd);
+
+ TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i",
+ r);
+
+ return list;
+}
+
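+/*
+ * Save the vCPU's state (events, MP state, registers, XSAVE, XCRs, sregs,
+ * nested state, MSRs and debug registers) into a newly allocated
+ * kvm_x86_state; the caller is responsible for freeing it.
+ */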
+struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ struct kvm_msr_list *list;
+ struct kvm_x86_state *state;
+ int nmsrs, r, i;
+ static int nested_size = -1;
+
+ if (nested_size == -1) {
+ nested_size = kvm_check_cap(KVM_CAP_NESTED_STATE);
+ TEST_ASSERT(nested_size <= sizeof(state->nested_),
+ "Nested state size too big, %i > %zi",
+ nested_size, sizeof(state->nested_));
+ }
+
+ /*
+ * When KVM exits to userspace with KVM_EXIT_IO, KVM guarantees
+ * guest state is consistent only after userspace re-enters the
+ * kernel with KVM_RUN. Complete IO prior to migrating state
+ * to a new VM.
+ */
+ vcpu_run_complete_io(vm, vcpuid);
+
+ nmsrs = kvm_get_num_msrs(vm);
+ list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0]));
+ list->nmsrs = nmsrs;
+ r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
+ TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i",
+ r);
+
+ state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0]));
+ r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events);
+ TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i",
+ r);
+
+ r = ioctl(vcpu->fd, KVM_GET_MP_STATE, &state->mp_state);
+ TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i",
+ r);
+
+ r = ioctl(vcpu->fd, KVM_GET_REGS, &state->regs);
+ TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i",
+ r);
+
+ r = ioctl(vcpu->fd, KVM_GET_XSAVE, &state->xsave);
+ TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i",
+ r);
+
+ if (kvm_check_cap(KVM_CAP_XCRS)) {
+ r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs);
+ TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XCRS, r: %i",
+ r);
+ }
+
+ r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs);
+ TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i",
+ r);
+
+ if (nested_size) {
+ state->nested.size = sizeof(state->nested_);
+ r = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, &state->nested);
+ TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_NESTED_STATE, r: %i",
+ r);
+ TEST_ASSERT(state->nested.size <= nested_size,
+ "Nested state size too big, %i (KVM_CHECK_CAP gave %i)",
+ state->nested.size, nested_size);
+ } else
+ state->nested.size = 0;
+
+ state->msrs.nmsrs = nmsrs;
+ for (i = 0; i < nmsrs; i++)
+ state->msrs.entries[i].index = list->indices[i];
+ r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs);
+ TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)",
+ r, r == nmsrs ? -1 : list->indices[r]);
+
+ r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs);
+ TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i",
+ r);
+
+ free(list);
+ return state;
+}
+
+void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+ int r;
+
+ r = ioctl(vcpu->fd, KVM_SET_XSAVE, &state->xsave);
+ TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i",
+ r);
+
+ if (kvm_check_cap(KVM_CAP_XCRS)) {
+ r = ioctl(vcpu->fd, KVM_SET_XCRS, &state->xcrs);
+ TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XCRS, r: %i",
+ r);
+ }
+
+ r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs);
+ TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i",
+ r);
+
+ r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs);
+ TEST_ASSERT(r == state->msrs.nmsrs, "Unexpected result from KVM_SET_MSRS, r: %i (failed at %x)",
+ r, r == state->msrs.nmsrs ? -1 : state->msrs.entries[r].index);
+
+ r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events);
+ TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i",
+ r);
+
+ r = ioctl(vcpu->fd, KVM_SET_MP_STATE, &state->mp_state);
+ TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i",
+ r);
+
+ r = ioctl(vcpu->fd, KVM_SET_DEBUGREGS, &state->debugregs);
+ TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i",
+ r);
+
+ r = ioctl(vcpu->fd, KVM_SET_REGS, &state->regs);
+ TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i",
+ r);
+
+ if (state->nested.size) {
+ r = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, &state->nested);
+ TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_NESTED_STATE, r: %i",
+ r);
+ }
+}
+
+bool is_intel_cpu(void)
+{
+ int eax, ebx, ecx, edx;
+ const uint32_t *chunk;
+ const int leaf = 0;
+
+ __asm__ __volatile__(
+ "cpuid"
+ : /* output */ "=a"(eax), "=b"(ebx),
+ "=c"(ecx), "=d"(edx)
+ : /* input */ "0"(leaf), "2"(0));
+
+ chunk = (const uint32_t *)("GenuineIntel");
+ return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]);
+}
+
+uint32_t kvm_get_cpuid_max_basic(void)
+{
+ return kvm_get_supported_cpuid_entry(0)->eax;
+}
+
+uint32_t kvm_get_cpuid_max_extended(void)
+{
+ return kvm_get_supported_cpuid_entry(0x80000000)->eax;
+}
+
+void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits)
+{
+ struct kvm_cpuid_entry2 *entry;
+ bool pae;
+
+ /* SDM 4.1.4 */
+ if (kvm_get_cpuid_max_extended() < 0x80000008) {
+ pae = kvm_get_supported_cpuid_entry(1)->edx & (1 << 6);
+ *pa_bits = pae ? 36 : 32;
+ *va_bits = 32;
+ } else {
+ entry = kvm_get_supported_cpuid_entry(0x80000008);
+ *pa_bits = entry->eax & 0xff;
+ *va_bits = (entry->eax >> 8) & 0xff;
+ }
+}
+
+struct idt_entry {
+ uint16_t offset0;
+ uint16_t selector;
+ uint16_t ist : 3;
+ uint16_t : 5;
+ uint16_t type : 4;
+ uint16_t : 1;
+ uint16_t dpl : 2;
+ uint16_t p : 1;
+ uint16_t offset1;
+ uint32_t offset2;
+ uint32_t reserved;
+};
+
+static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr,
+ int dpl, unsigned short selector)
+{
+ struct idt_entry *base =
+ (struct idt_entry *)addr_gva2hva(vm, vm->idt);
+ struct idt_entry *e = &base[vector];
+
+ memset(e, 0, sizeof(*e));
+ e->offset0 = addr;
+ e->selector = selector;
+ e->ist = 0;
+ e->type = 14;
+ e->dpl = dpl;
+ e->p = 1;
+ e->offset1 = addr >> 16;
+ e->offset2 = addr >> 32;
+}
+
+void kvm_exit_unexpected_vector(uint32_t value)
+{
+ outl(UNEXPECTED_VECTOR_PORT, value);
+}
+
+void route_exception(struct ex_regs *regs)
+{
+ typedef void(*handler)(struct ex_regs *);
+ handler *handlers = (handler *)exception_handlers;
+
+ if (handlers && handlers[regs->vector]) {
+ handlers[regs->vector](regs);
+ return;
+ }
+
+ kvm_exit_unexpected_vector(regs->vector);
+}
+
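+/*
+ * Allocate the guest IDT and the exception handler table, and point each
+ * IDT entry at the corresponding stub in idt_handlers (handlers.S).
+ */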
+void vm_init_descriptor_tables(struct kvm_vm *vm)
+{
+ extern void *idt_handlers;
+ int i;
+
+ vm->idt = vm_vaddr_alloc(vm, getpagesize(), 0x2000, 0, 0);
+ vm->handlers = vm_vaddr_alloc(vm, 256 * sizeof(void *), 0x2000, 0, 0);
+ /* Handlers have the same address in both address spaces. */
+ for (i = 0; i < NUM_INTERRUPTS; i++)
+ set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0,
+ DEFAULT_CODE_SELECTOR);
+}
+
+void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ struct kvm_sregs sregs;
+
+ vcpu_sregs_get(vm, vcpuid, &sregs);
+ sregs.idt.base = vm->idt;
+ sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1;
+ sregs.gdt.base = vm->gdt;
+ sregs.gdt.limit = getpagesize() - 1;
+ kvm_seg_set_kernel_data_64bit(NULL, DEFAULT_DATA_SELECTOR, &sregs.gs);
+ vcpu_sregs_set(vm, vcpuid, &sregs);
+ *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers;
+}
+
+void vm_handle_exception(struct kvm_vm *vm, int vector,
+ void (*handler)(struct ex_regs *))
+{
+ vm_vaddr_t *handlers = (vm_vaddr_t *)addr_gva2hva(vm, vm->handlers);
+
+ handlers[vector] = (vm_vaddr_t)handler;
+}
+
+void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ if (vcpu_state(vm, vcpuid)->exit_reason == KVM_EXIT_IO
+ && vcpu_state(vm, vcpuid)->io.port == UNEXPECTED_VECTOR_PORT
+ && vcpu_state(vm, vcpuid)->io.size == 4) {
+ /* Grab pointer to io data */
+ uint32_t *data = (void *)vcpu_state(vm, vcpuid)
+ + vcpu_state(vm, vcpuid)->io.data_offset;
+
+ TEST_ASSERT(false,
+ "Unexpected vectored event in guest (vector:0x%x)",
+ *data);
+ }
+}
+
+bool set_cpuid(struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 *ent)
+{
+ int i;
+
+ for (i = 0; i < cpuid->nent; i++) {
+ struct kvm_cpuid_entry2 *cur = &cpuid->entries[i];
+
+ if (cur->function != ent->function || cur->index != ent->index)
+ continue;
+
+ memcpy(cur, ent, sizeof(struct kvm_cpuid_entry2));
+ return true;
+ }
+
+ return false;
+}
+
+uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
+ uint64_t a3)
+{
+ uint64_t r;
+
+ asm volatile("vmcall"
+ : "=a"(r)
+ : "a"(nr), "b"(a0), "c"(a1), "d"(a2), "S"(a3));
+ return r;
+}
diff --git a/tools/testing/selftests/kvm/lib/x86_64/svm.c b/tools/testing/selftests/kvm/lib/x86_64/svm.c
new file mode 100644
index 000000000..a58507a7b
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/x86_64/svm.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tools/testing/selftests/kvm/lib/x86_64/svm.c
+ * Helpers used for nested SVM testing
+ * Largely inspired from KVM unit test svm.c
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "../kvm_util_internal.h"
+#include "processor.h"
+#include "svm_util.h"
+
+struct gpr64_regs guest_regs;
+u64 rflags;
+
+/* Allocate memory regions for nested SVM tests.
+ *
+ * Input Args:
+ * vm - The VM to allocate guest-virtual addresses in.
+ *
+ * Output Args:
+ * p_svm_gva - The guest virtual address for the struct svm_test_data.
+ *
+ * Return:
+ * Pointer to structure with the addresses of the SVM areas.
+ */
+struct svm_test_data *
+vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva)
+{
+ vm_vaddr_t svm_gva = vm_vaddr_alloc(vm, getpagesize(),
+ 0x10000, 0, 0);
+ struct svm_test_data *svm = addr_gva2hva(vm, svm_gva);
+
+ svm->vmcb = (void *)vm_vaddr_alloc(vm, getpagesize(),
+ 0x10000, 0, 0);
+ svm->vmcb_hva = addr_gva2hva(vm, (uintptr_t)svm->vmcb);
+ svm->vmcb_gpa = addr_gva2gpa(vm, (uintptr_t)svm->vmcb);
+
+ svm->save_area = (void *)vm_vaddr_alloc(vm, getpagesize(),
+ 0x10000, 0, 0);
+ svm->save_area_hva = addr_gva2hva(vm, (uintptr_t)svm->save_area);
+ svm->save_area_gpa = addr_gva2gpa(vm, (uintptr_t)svm->save_area);
+
+ *p_svm_gva = svm_gva;
+ return svm;
+}
+
+static void vmcb_set_seg(struct vmcb_seg *seg, u16 selector,
+ u64 base, u32 limit, u32 attr)
+{
+ seg->selector = selector;
+ seg->attrib = attr;
+ seg->limit = limit;
+ seg->base = base;
+}
+
+/*
+ * Avoid using memset to clear the vmcb, since libc may not be
+ * available in L1 (and, even if it is, features that libc memset may
+ * want to use, like AVX, may not be enabled).
+ */
+static void clear_vmcb(struct vmcb *vmcb)
+{
+ int n = sizeof(*vmcb) / sizeof(u32);
+
+ asm volatile ("rep stosl" : "+c"(n), "+D"(vmcb) : "a"(0) : "memory");
+}
+
+void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp)
+{
+ struct vmcb *vmcb = svm->vmcb;
+ uint64_t vmcb_gpa = svm->vmcb_gpa;
+ struct vmcb_save_area *save = &vmcb->save;
+ struct vmcb_control_area *ctrl = &vmcb->control;
+ u32 data_seg_attr = 3 | SVM_SELECTOR_S_MASK | SVM_SELECTOR_P_MASK
+ | SVM_SELECTOR_DB_MASK | SVM_SELECTOR_G_MASK;
+ u32 code_seg_attr = 9 | SVM_SELECTOR_S_MASK | SVM_SELECTOR_P_MASK
+ | SVM_SELECTOR_L_MASK | SVM_SELECTOR_G_MASK;
+ uint64_t efer;
+
+ efer = rdmsr(MSR_EFER);
+ wrmsr(MSR_EFER, efer | EFER_SVME);
+ wrmsr(MSR_VM_HSAVE_PA, svm->save_area_gpa);
+
+ clear_vmcb(vmcb);
+ asm volatile ("vmsave %0\n\t" : : "a" (vmcb_gpa) : "memory");
+ vmcb_set_seg(&save->es, get_es(), 0, -1U, data_seg_attr);
+ vmcb_set_seg(&save->cs, get_cs(), 0, -1U, code_seg_attr);
+ vmcb_set_seg(&save->ss, get_ss(), 0, -1U, data_seg_attr);
+ vmcb_set_seg(&save->ds, get_ds(), 0, -1U, data_seg_attr);
+ vmcb_set_seg(&save->gdtr, 0, get_gdt().address, get_gdt().size, 0);
+ vmcb_set_seg(&save->idtr, 0, get_idt().address, get_idt().size, 0);
+
+ ctrl->asid = 1;
+ save->cpl = 0;
+ save->efer = rdmsr(MSR_EFER);
+ asm volatile ("mov %%cr4, %0" : "=r"(save->cr4) : : "memory");
+ asm volatile ("mov %%cr3, %0" : "=r"(save->cr3) : : "memory");
+ asm volatile ("mov %%cr0, %0" : "=r"(save->cr0) : : "memory");
+ asm volatile ("mov %%dr7, %0" : "=r"(save->dr7) : : "memory");
+ asm volatile ("mov %%dr6, %0" : "=r"(save->dr6) : : "memory");
+ asm volatile ("mov %%cr2, %0" : "=r"(save->cr2) : : "memory");
+ save->g_pat = rdmsr(MSR_IA32_CR_PAT);
+ save->dbgctl = rdmsr(MSR_IA32_DEBUGCTLMSR);
+ ctrl->intercept = (1ULL << INTERCEPT_VMRUN) |
+ (1ULL << INTERCEPT_VMMCALL);
+
+ vmcb->save.rip = (u64)guest_rip;
+ vmcb->save.rsp = (u64)guest_rsp;
+ guest_regs.rdi = (u64)svm;
+}
+
+/*
+ * save/restore 64-bit general registers except rax, rip, rsp
+ * which are directly handed through the VMCB guest processor state
+ */
+#define SAVE_GPR_C \
+ "xchg %%rbx, guest_regs+0x20\n\t" \
+ "xchg %%rcx, guest_regs+0x10\n\t" \
+ "xchg %%rdx, guest_regs+0x18\n\t" \
+ "xchg %%rbp, guest_regs+0x30\n\t" \
+ "xchg %%rsi, guest_regs+0x38\n\t" \
+ "xchg %%rdi, guest_regs+0x40\n\t" \
+ "xchg %%r8, guest_regs+0x48\n\t" \
+ "xchg %%r9, guest_regs+0x50\n\t" \
+ "xchg %%r10, guest_regs+0x58\n\t" \
+ "xchg %%r11, guest_regs+0x60\n\t" \
+ "xchg %%r12, guest_regs+0x68\n\t" \
+ "xchg %%r13, guest_regs+0x70\n\t" \
+ "xchg %%r14, guest_regs+0x78\n\t" \
+ "xchg %%r15, guest_regs+0x80\n\t"
+
+#define LOAD_GPR_C SAVE_GPR_C
+
+/*
+ * Selftests do not use interrupts, so clgi/sti/cli/stgi are omitted for
+ * now. The registers involved in LOAD/SAVE_GPR_C are restored before the
+ * asm statement ends, so they do not need to be in the clobber list.
+ */
+void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa)
+{
+ asm volatile (
+ "vmload %[vmcb_gpa]\n\t"
+ "mov rflags, %%r15\n\t" // rflags
+ "mov %%r15, 0x170(%[vmcb])\n\t"
+ "mov guest_regs, %%r15\n\t" // rax
+ "mov %%r15, 0x1f8(%[vmcb])\n\t"
+ LOAD_GPR_C
+ "vmrun %[vmcb_gpa]\n\t"
+ SAVE_GPR_C
+ "mov 0x170(%[vmcb]), %%r15\n\t" // rflags
+ "mov %%r15, rflags\n\t"
+ "mov 0x1f8(%[vmcb]), %%r15\n\t" // rax
+ "mov %%r15, guest_regs\n\t"
+ "vmsave %[vmcb_gpa]\n\t"
+ : : [vmcb] "r" (vmcb), [vmcb_gpa] "a" (vmcb_gpa)
+ : "r15", "memory");
+}
+
+bool nested_svm_supported(void)
+{
+ struct kvm_cpuid_entry2 *entry =
+ kvm_get_supported_cpuid_entry(0x80000001);
+
+ return entry->ecx & CPUID_SVM;
+}
+
+void nested_svm_check_supported(void)
+{
+ if (!nested_svm_supported()) {
+ print_skip("nested SVM not enabled");
+ exit(KSFT_SKIP);
+ }
+}
diff --git a/tools/testing/selftests/kvm/lib/x86_64/ucall.c b/tools/testing/selftests/kvm/lib/x86_64/ucall.c
new file mode 100644
index 000000000..a3489973e
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/x86_64/ucall.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ucall support. A ucall is a "hypercall to userspace".
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ */
+#include "kvm_util.h"
+
+#define UCALL_PIO_PORT ((uint16_t)0x1000)
+
+void ucall_init(struct kvm_vm *vm, void *arg)
+{
+}
+
+void ucall_uninit(struct kvm_vm *vm)
+{
+}
+
+void ucall(uint64_t cmd, int nargs, ...)
+{
+ struct ucall uc = {
+ .cmd = cmd,
+ };
+ va_list va;
+ int i;
+
+ nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS;
+
+ va_start(va, nargs);
+ for (i = 0; i < nargs; ++i)
+ uc.args[i] = va_arg(va, uint64_t);
+ va_end(va);
+
+ asm volatile("in %[port], %%al"
+ : : [port] "d" (UCALL_PIO_PORT), "D" (&uc) : "rax", "memory");
+}
+
+uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc)
+{
+ struct kvm_run *run = vcpu_state(vm, vcpu_id);
+ struct ucall ucall = {};
+
+ if (uc)
+ memset(uc, 0, sizeof(*uc));
+
+ if (run->exit_reason == KVM_EXIT_IO && run->io.port == UCALL_PIO_PORT) {
+ struct kvm_regs regs;
+
+ vcpu_regs_get(vm, vcpu_id, &regs);
+ memcpy(&ucall, addr_gva2hva(vm, (vm_vaddr_t)regs.rdi),
+ sizeof(ucall));
+
+ vcpu_run_complete_io(vm, vcpu_id);
+ if (uc)
+ memcpy(uc, &ucall, sizeof(ucall));
+ }
+
+ return ucall.cmd;
+}
diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c
new file mode 100644
index 000000000..2448b30e8
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c
@@ -0,0 +1,553 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tools/testing/selftests/kvm/lib/x86_64/vmx.c
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "../kvm_util_internal.h"
+#include "processor.h"
+#include "vmx.h"
+
+#define PAGE_SHIFT_4K 12
+
+#define KVM_EPT_PAGE_TABLE_MIN_PADDR 0x1c0000
+
+bool enable_evmcs;
+
+struct hv_enlightened_vmcs *current_evmcs;
+struct hv_vp_assist_page *current_vp_assist;
+
+struct eptPageTableEntry {
+ uint64_t readable:1;
+ uint64_t writable:1;
+ uint64_t executable:1;
+ uint64_t memory_type:3;
+ uint64_t ignore_pat:1;
+ uint64_t page_size:1;
+ uint64_t accessed:1;
+ uint64_t dirty:1;
+ uint64_t ignored_11_10:2;
+ uint64_t address:40;
+ uint64_t ignored_62_52:11;
+ uint64_t suppress_ve:1;
+};
+
+struct eptPageTablePointer {
+ uint64_t memory_type:3;
+ uint64_t page_walk_length:3;
+ uint64_t ad_enabled:1;
+ uint64_t reserved_11_07:5;
+ uint64_t address:40;
+ uint64_t reserved_63_52:12;
+};
+int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id)
+{
+ uint16_t evmcs_ver;
+
+ struct kvm_enable_cap enable_evmcs_cap = {
+ .cap = KVM_CAP_HYPERV_ENLIGHTENED_VMCS,
+ .args[0] = (unsigned long)&evmcs_ver
+ };
+
+ vcpu_ioctl(vm, vcpu_id, KVM_ENABLE_CAP, &enable_evmcs_cap);
+
+ /* KVM should return supported EVMCS version range */
+ TEST_ASSERT(((evmcs_ver >> 8) >= (evmcs_ver & 0xff)) &&
+ (evmcs_ver & 0xff) > 0,
+ "Incorrect EVMCS version range: %x:%x\n",
+ evmcs_ver & 0xff, evmcs_ver >> 8);
+
+ return evmcs_ver;
+}
+
+/* Allocate memory regions for nested VMX tests.
+ *
+ * Input Args:
+ * vm - The VM to allocate guest-virtual addresses in.
+ *
+ * Output Args:
+ * p_vmx_gva - The guest virtual address for the struct vmx_pages.
+ *
+ * Return:
+ * Pointer to structure with the addresses of the VMX areas.
+ */
+struct vmx_pages *
+vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva)
+{
+ vm_vaddr_t vmx_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+ struct vmx_pages *vmx = addr_gva2hva(vm, vmx_gva);
+
+ /* Setup of a region of guest memory for the vmxon region. */
+ vmx->vmxon = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+ vmx->vmxon_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmxon);
+ vmx->vmxon_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmxon);
+
+ /* Setup of a region of guest memory for a vmcs. */
+ vmx->vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+ vmx->vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmcs);
+ vmx->vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmcs);
+
+ /* Setup of a region of guest memory for the MSR bitmap. */
+ vmx->msr = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+ vmx->msr_hva = addr_gva2hva(vm, (uintptr_t)vmx->msr);
+ vmx->msr_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->msr);
+ memset(vmx->msr_hva, 0, getpagesize());
+
+ /* Setup of a region of guest memory for the shadow VMCS. */
+ vmx->shadow_vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+ vmx->shadow_vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->shadow_vmcs);
+ vmx->shadow_vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->shadow_vmcs);
+
+ /* Setup of a region of guest memory for the VMREAD and VMWRITE bitmaps. */
+ vmx->vmread = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+ vmx->vmread_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmread);
+ vmx->vmread_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmread);
+ memset(vmx->vmread_hva, 0, getpagesize());
+
+ vmx->vmwrite = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+ vmx->vmwrite_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmwrite);
+ vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite);
+ memset(vmx->vmwrite_hva, 0, getpagesize());
+
+ /* Setup of a region of guest memory for the VP Assist page. */
+ vmx->vp_assist = (void *)vm_vaddr_alloc(vm, getpagesize(),
+ 0x10000, 0, 0);
+ vmx->vp_assist_hva = addr_gva2hva(vm, (uintptr_t)vmx->vp_assist);
+ vmx->vp_assist_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vp_assist);
+
+ /* Setup of a region of guest memory for the enlightened VMCS. */
+ vmx->enlightened_vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(),
+ 0x10000, 0, 0);
+ vmx->enlightened_vmcs_hva =
+ addr_gva2hva(vm, (uintptr_t)vmx->enlightened_vmcs);
+ vmx->enlightened_vmcs_gpa =
+ addr_gva2gpa(vm, (uintptr_t)vmx->enlightened_vmcs);
+
+ *p_vmx_gva = vmx_gva;
+ return vmx;
+}
+
+bool prepare_for_vmx_operation(struct vmx_pages *vmx)
+{
+ uint64_t feature_control;
+ uint64_t required;
+ unsigned long cr0;
+ unsigned long cr4;
+
+ /*
+ * Ensure bits in CR0 and CR4 are valid in VMX operation:
+ * - Bit X is 1 in _FIXED0: bit X is fixed to 1 in CRx.
+ * - Bit X is 0 in _FIXED1: bit X is fixed to 0 in CRx.
+ */
+ __asm__ __volatile__("mov %%cr0, %0" : "=r"(cr0) : : "memory");
+ cr0 &= rdmsr(MSR_IA32_VMX_CR0_FIXED1);
+ cr0 |= rdmsr(MSR_IA32_VMX_CR0_FIXED0);
+ __asm__ __volatile__("mov %0, %%cr0" : : "r"(cr0) : "memory");
+
+ __asm__ __volatile__("mov %%cr4, %0" : "=r"(cr4) : : "memory");
+ cr4 &= rdmsr(MSR_IA32_VMX_CR4_FIXED1);
+ cr4 |= rdmsr(MSR_IA32_VMX_CR4_FIXED0);
+ /* Enable VMX operation */
+ cr4 |= X86_CR4_VMXE;
+ __asm__ __volatile__("mov %0, %%cr4" : : "r"(cr4) : "memory");
+
+ /*
+ * Configure IA32_FEATURE_CONTROL MSR to allow VMXON:
+ * Bit 0: Lock bit. If clear, VMXON causes a #GP.
+ * Bit 2: Enables VMXON outside of SMX operation. If clear, VMXON
+ * outside of SMX causes a #GP.
+ */
+ required = FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
+ required |= FEAT_CTL_LOCKED;
+ feature_control = rdmsr(MSR_IA32_FEAT_CTL);
+ if ((feature_control & required) != required)
+ wrmsr(MSR_IA32_FEAT_CTL, feature_control | required);
+
+ /* Enter VMX root operation. */
+ *(uint32_t *)(vmx->vmxon) = vmcs_revision();
+ if (vmxon(vmx->vmxon_gpa))
+ return false;
+
+ return true;
+}
+
+bool load_vmcs(struct vmx_pages *vmx)
+{
+ if (!enable_evmcs) {
+ /* Load a VMCS. */
+ *(uint32_t *)(vmx->vmcs) = vmcs_revision();
+ if (vmclear(vmx->vmcs_gpa))
+ return false;
+
+ if (vmptrld(vmx->vmcs_gpa))
+ return false;
+
+ /* Setup shadow VMCS, do not load it yet. */
+ *(uint32_t *)(vmx->shadow_vmcs) =
+ vmcs_revision() | 0x80000000ul;
+ if (vmclear(vmx->shadow_vmcs_gpa))
+ return false;
+ } else {
+ if (evmcs_vmptrld(vmx->enlightened_vmcs_gpa,
+ vmx->enlightened_vmcs))
+ return false;
+ current_evmcs->revision_id = EVMCS_VERSION;
+ }
+
+ return true;
+}
+
+/*
+ * Initialize the control fields to the most basic settings possible.
+ */
+static inline void init_vmcs_control_fields(struct vmx_pages *vmx)
+{
+ uint32_t sec_exec_ctl = 0;
+
+ vmwrite(VIRTUAL_PROCESSOR_ID, 0);
+ vmwrite(POSTED_INTR_NV, 0);
+
+ vmwrite(PIN_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PINBASED_CTLS));
+
+ if (vmx->eptp_gpa) {
+ uint64_t ept_paddr;
+ struct eptPageTablePointer eptp = {
+ .memory_type = VMX_BASIC_MEM_TYPE_WB,
+ .page_walk_length = 3, /* + 1 */
+ .ad_enabled = !!(rdmsr(MSR_IA32_VMX_EPT_VPID_CAP) & VMX_EPT_VPID_CAP_AD_BITS),
+ .address = vmx->eptp_gpa >> PAGE_SHIFT_4K,
+ };
+
+ memcpy(&ept_paddr, &eptp, sizeof(ept_paddr));
+ vmwrite(EPT_POINTER, ept_paddr);
+ sec_exec_ctl |= SECONDARY_EXEC_ENABLE_EPT;
+ }
+
+ if (!vmwrite(SECONDARY_VM_EXEC_CONTROL, sec_exec_ctl))
+ vmwrite(CPU_BASED_VM_EXEC_CONTROL,
+ rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS) | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
+ else {
+ vmwrite(CPU_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS));
+ GUEST_ASSERT(!sec_exec_ctl);
+ }
+
+ vmwrite(EXCEPTION_BITMAP, 0);
+ vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0);
+ vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, -1); /* Never match */
+ vmwrite(CR3_TARGET_COUNT, 0);
+ vmwrite(VM_EXIT_CONTROLS, rdmsr(MSR_IA32_VMX_EXIT_CTLS) |
+ VM_EXIT_HOST_ADDR_SPACE_SIZE); /* 64-bit host */
+ vmwrite(VM_EXIT_MSR_STORE_COUNT, 0);
+ vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
+ vmwrite(VM_ENTRY_CONTROLS, rdmsr(MSR_IA32_VMX_ENTRY_CTLS) |
+ VM_ENTRY_IA32E_MODE); /* 64-bit guest */
+ vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0);
+ vmwrite(VM_ENTRY_INTR_INFO_FIELD, 0);
+ vmwrite(TPR_THRESHOLD, 0);
+
+ vmwrite(CR0_GUEST_HOST_MASK, 0);
+ vmwrite(CR4_GUEST_HOST_MASK, 0);
+ vmwrite(CR0_READ_SHADOW, get_cr0());
+ vmwrite(CR4_READ_SHADOW, get_cr4());
+
+ vmwrite(MSR_BITMAP, vmx->msr_gpa);
+ vmwrite(VMREAD_BITMAP, vmx->vmread_gpa);
+ vmwrite(VMWRITE_BITMAP, vmx->vmwrite_gpa);
+}
+
+/*
+ * Initialize the host state fields based on the current host state, with
+ * the exception of HOST_RSP and HOST_RIP, which should be set by vmlaunch
+ * or vmresume.
+ */
+static inline void init_vmcs_host_state(void)
+{
+ uint32_t exit_controls = vmreadz(VM_EXIT_CONTROLS);
+
+ vmwrite(HOST_ES_SELECTOR, get_es());
+ vmwrite(HOST_CS_SELECTOR, get_cs());
+ vmwrite(HOST_SS_SELECTOR, get_ss());
+ vmwrite(HOST_DS_SELECTOR, get_ds());
+ vmwrite(HOST_FS_SELECTOR, get_fs());
+ vmwrite(HOST_GS_SELECTOR, get_gs());
+ vmwrite(HOST_TR_SELECTOR, get_tr());
+
+ if (exit_controls & VM_EXIT_LOAD_IA32_PAT)
+ vmwrite(HOST_IA32_PAT, rdmsr(MSR_IA32_CR_PAT));
+ if (exit_controls & VM_EXIT_LOAD_IA32_EFER)
+ vmwrite(HOST_IA32_EFER, rdmsr(MSR_EFER));
+ if (exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+ vmwrite(HOST_IA32_PERF_GLOBAL_CTRL,
+ rdmsr(MSR_CORE_PERF_GLOBAL_CTRL));
+
+ vmwrite(HOST_IA32_SYSENTER_CS, rdmsr(MSR_IA32_SYSENTER_CS));
+
+ vmwrite(HOST_CR0, get_cr0());
+ vmwrite(HOST_CR3, get_cr3());
+ vmwrite(HOST_CR4, get_cr4());
+ vmwrite(HOST_FS_BASE, rdmsr(MSR_FS_BASE));
+ vmwrite(HOST_GS_BASE, rdmsr(MSR_GS_BASE));
+ vmwrite(HOST_TR_BASE,
+ get_desc64_base((struct desc64 *)(get_gdt().address + get_tr())));
+ vmwrite(HOST_GDTR_BASE, get_gdt().address);
+ vmwrite(HOST_IDTR_BASE, get_idt().address);
+ vmwrite(HOST_IA32_SYSENTER_ESP, rdmsr(MSR_IA32_SYSENTER_ESP));
+ vmwrite(HOST_IA32_SYSENTER_EIP, rdmsr(MSR_IA32_SYSENTER_EIP));
+}
+
+/*
+ * Initialize the guest state fields essentially as a clone of
+ * the host state fields. Some host state fields have fixed
+ * values, and we set the corresponding guest state fields accordingly.
+ */
+static inline void init_vmcs_guest_state(void *rip, void *rsp)
+{
+ vmwrite(GUEST_ES_SELECTOR, vmreadz(HOST_ES_SELECTOR));
+ vmwrite(GUEST_CS_SELECTOR, vmreadz(HOST_CS_SELECTOR));
+ vmwrite(GUEST_SS_SELECTOR, vmreadz(HOST_SS_SELECTOR));
+ vmwrite(GUEST_DS_SELECTOR, vmreadz(HOST_DS_SELECTOR));
+ vmwrite(GUEST_FS_SELECTOR, vmreadz(HOST_FS_SELECTOR));
+ vmwrite(GUEST_GS_SELECTOR, vmreadz(HOST_GS_SELECTOR));
+ vmwrite(GUEST_LDTR_SELECTOR, 0);
+ vmwrite(GUEST_TR_SELECTOR, vmreadz(HOST_TR_SELECTOR));
+ vmwrite(GUEST_INTR_STATUS, 0);
+ vmwrite(GUEST_PML_INDEX, 0);
+
+ vmwrite(VMCS_LINK_POINTER, -1ll);
+ vmwrite(GUEST_IA32_DEBUGCTL, 0);
+ vmwrite(GUEST_IA32_PAT, vmreadz(HOST_IA32_PAT));
+ vmwrite(GUEST_IA32_EFER, vmreadz(HOST_IA32_EFER));
+ vmwrite(GUEST_IA32_PERF_GLOBAL_CTRL,
+ vmreadz(HOST_IA32_PERF_GLOBAL_CTRL));
+
+ vmwrite(GUEST_ES_LIMIT, -1);
+ vmwrite(GUEST_CS_LIMIT, -1);
+ vmwrite(GUEST_SS_LIMIT, -1);
+ vmwrite(GUEST_DS_LIMIT, -1);
+ vmwrite(GUEST_FS_LIMIT, -1);
+ vmwrite(GUEST_GS_LIMIT, -1);
+ vmwrite(GUEST_LDTR_LIMIT, -1);
+ vmwrite(GUEST_TR_LIMIT, 0x67);
+ vmwrite(GUEST_GDTR_LIMIT, 0xffff);
+ vmwrite(GUEST_IDTR_LIMIT, 0xffff);
+ vmwrite(GUEST_ES_AR_BYTES,
+ vmreadz(GUEST_ES_SELECTOR) == 0 ? 0x10000 : 0xc093);
+ vmwrite(GUEST_CS_AR_BYTES, 0xa09b);
+ vmwrite(GUEST_SS_AR_BYTES, 0xc093);
+ vmwrite(GUEST_DS_AR_BYTES,
+ vmreadz(GUEST_DS_SELECTOR) == 0 ? 0x10000 : 0xc093);
+ vmwrite(GUEST_FS_AR_BYTES,
+ vmreadz(GUEST_FS_SELECTOR) == 0 ? 0x10000 : 0xc093);
+ vmwrite(GUEST_GS_AR_BYTES,
+ vmreadz(GUEST_GS_SELECTOR) == 0 ? 0x10000 : 0xc093);
+ vmwrite(GUEST_LDTR_AR_BYTES, 0x10000);
+ vmwrite(GUEST_TR_AR_BYTES, 0x8b);
+ vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
+ vmwrite(GUEST_ACTIVITY_STATE, 0);
+ vmwrite(GUEST_SYSENTER_CS, vmreadz(HOST_IA32_SYSENTER_CS));
+ vmwrite(VMX_PREEMPTION_TIMER_VALUE, 0);
+
+ vmwrite(GUEST_CR0, vmreadz(HOST_CR0));
+ vmwrite(GUEST_CR3, vmreadz(HOST_CR3));
+ vmwrite(GUEST_CR4, vmreadz(HOST_CR4));
+ vmwrite(GUEST_ES_BASE, 0);
+ vmwrite(GUEST_CS_BASE, 0);
+ vmwrite(GUEST_SS_BASE, 0);
+ vmwrite(GUEST_DS_BASE, 0);
+ vmwrite(GUEST_FS_BASE, vmreadz(HOST_FS_BASE));
+ vmwrite(GUEST_GS_BASE, vmreadz(HOST_GS_BASE));
+ vmwrite(GUEST_LDTR_BASE, 0);
+ vmwrite(GUEST_TR_BASE, vmreadz(HOST_TR_BASE));
+ vmwrite(GUEST_GDTR_BASE, vmreadz(HOST_GDTR_BASE));
+ vmwrite(GUEST_IDTR_BASE, vmreadz(HOST_IDTR_BASE));
+ vmwrite(GUEST_DR7, 0x400);
+ vmwrite(GUEST_RSP, (uint64_t)rsp);
+ vmwrite(GUEST_RIP, (uint64_t)rip);
+ vmwrite(GUEST_RFLAGS, 2);
+ vmwrite(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+ vmwrite(GUEST_SYSENTER_ESP, vmreadz(HOST_IA32_SYSENTER_ESP));
+ vmwrite(GUEST_SYSENTER_EIP, vmreadz(HOST_IA32_SYSENTER_EIP));
+}
+
+void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp)
+{
+ init_vmcs_control_fields(vmx);
+ init_vmcs_host_state();
+ init_vmcs_guest_state(guest_rip, guest_rsp);
+}
+
+bool nested_vmx_supported(void)
+{
+ struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1);
+
+ return entry->ecx & CPUID_VMX;
+}
+
+void nested_vmx_check_supported(void)
+{
+ if (!nested_vmx_supported()) {
+ print_skip("nested VMX not enabled");
+ exit(KSFT_SKIP);
+ }
+}
+
+void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint64_t nested_paddr, uint64_t paddr, uint32_t eptp_memslot)
+{
+ uint16_t index[4];
+ struct eptPageTableEntry *pml4e;
+
+ TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
+ "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
+
+ TEST_ASSERT((nested_paddr % vm->page_size) == 0,
+ "Nested physical address not on page boundary,\n"
+ " nested_paddr: 0x%lx vm->page_size: 0x%x",
+ nested_paddr, vm->page_size);
+ TEST_ASSERT((nested_paddr >> vm->page_shift) <= vm->max_gfn,
+ "Nested physical address beyond maximum supported,\n"
+ " nested_paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
+ nested_paddr, vm->max_gfn, vm->page_size);
+ TEST_ASSERT((paddr % vm->page_size) == 0,
+ "Physical address not on page boundary,\n"
+ " paddr: 0x%lx vm->page_size: 0x%x",
+ paddr, vm->page_size);
+ TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn,
+ "Physical address beyond beyond maximum supported,\n"
+ " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
+ paddr, vm->max_gfn, vm->page_size);
+
+ index[0] = (nested_paddr >> 12) & 0x1ffu;
+ index[1] = (nested_paddr >> 21) & 0x1ffu;
+ index[2] = (nested_paddr >> 30) & 0x1ffu;
+ index[3] = (nested_paddr >> 39) & 0x1ffu;
+
+ /* Allocate page directory pointer table if not present. */
+ pml4e = vmx->eptp_hva;
+ if (!pml4e[index[3]].readable) {
+ pml4e[index[3]].address = vm_phy_page_alloc(vm,
+ KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot)
+ >> vm->page_shift;
+ pml4e[index[3]].writable = true;
+ pml4e[index[3]].readable = true;
+ pml4e[index[3]].executable = true;
+ }
+
+ /* Allocate page directory table if not present. */
+ struct eptPageTableEntry *pdpe;
+ pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size);
+ if (!pdpe[index[2]].readable) {
+ pdpe[index[2]].address = vm_phy_page_alloc(vm,
+ KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot)
+ >> vm->page_shift;
+ pdpe[index[2]].writable = true;
+ pdpe[index[2]].readable = true;
+ pdpe[index[2]].executable = true;
+ }
+
+ /* Allocate page table if not present. */
+ struct eptPageTableEntry *pde;
+ pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size);
+ if (!pde[index[1]].readable) {
+ pde[index[1]].address = vm_phy_page_alloc(vm,
+ KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot)
+ >> vm->page_shift;
+ pde[index[1]].writable = true;
+ pde[index[1]].readable = true;
+ pde[index[1]].executable = true;
+ }
+
+ /* Fill in page table entry. */
+ struct eptPageTableEntry *pte;
+ pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size);
+ pte[index[0]].address = paddr >> vm->page_shift;
+ pte[index[0]].writable = true;
+ pte[index[0]].readable = true;
+ pte[index[0]].executable = true;
+
+ /*
+ * For now mark these as accessed and dirty because the only
+ * testcase we have needs that. Can be reconsidered later.
+ */
+ pte[index[0]].accessed = true;
+ pte[index[0]].dirty = true;
+}
+
+/*
+ * Map a range of EPT guest physical addresses to the VM's physical address
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * nested_paddr - Nested guest physical address to map
+ * paddr - VM Physical Address
+ * size - The size of the range to map
+ * eptp_memslot - Memory region slot for new virtual translation tables
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Within the VM given by vm, creates a nested guest translation for the
+ * page range starting at nested_paddr to the page range starting at paddr.
+ */
+void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint64_t nested_paddr, uint64_t paddr, uint64_t size,
+ uint32_t eptp_memslot)
+{
+ size_t page_size = vm->page_size;
+ size_t npages = size / page_size;
+
+ TEST_ASSERT(nested_paddr + size > nested_paddr, "Vaddr overflow");
+ TEST_ASSERT(paddr + size > paddr, "Paddr overflow");
+
+ while (npages--) {
+ nested_pg_map(vmx, vm, nested_paddr, paddr, eptp_memslot);
+ nested_paddr += page_size;
+ paddr += page_size;
+ }
+}
+
+/* Prepare an identity extended page table that maps all the
+ * physical pages in the VM.
+ */
+void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint32_t memslot, uint32_t eptp_memslot)
+{
+ sparsebit_idx_t i, last;
+ struct userspace_mem_region *region =
+ memslot2region(vm, memslot);
+
+ i = (region->region.guest_phys_addr >> vm->page_shift) - 1;
+ last = i + (region->region.memory_size >> vm->page_shift);
+ for (;;) {
+ i = sparsebit_next_clear(region->unused_phy_pages, i);
+ if (i > last)
+ break;
+
+ nested_map(vmx, vm,
+ (uint64_t)i << vm->page_shift,
+ (uint64_t)i << vm->page_shift,
+ 1 << vm->page_shift,
+ eptp_memslot);
+ }
+}
+
+void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint32_t eptp_memslot)
+{
+ vmx->eptp = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+ vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp);
+ vmx->eptp_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->eptp);
+}
+
+void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint32_t eptp_memslot)
+{
+ vmx->apic_access = (void *)vm_vaddr_alloc(vm, getpagesize(),
+ 0x10000, 0, 0);
+ vmx->apic_access_hva = addr_gva2hva(vm, (uintptr_t)vmx->apic_access);
+ vmx->apic_access_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->apic_access);
+}
diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c
new file mode 100644
index 000000000..9f49ead38
--- /dev/null
+++ b/tools/testing/selftests/kvm/s390x/memop.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Test for s390x KVM_S390_MEM_OP
+ *
+ * Copyright (C) 2019, Red Hat, Inc.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+
+#define VCPU_ID 1
+
+static uint8_t mem1[65536];
+static uint8_t mem2[65536];
+
+static void guest_code(void)
+{
+ int i;
+
+ for (;;) {
+ for (i = 0; i < sizeof(mem2); i++)
+ mem2[i] = mem1[i];
+ GUEST_SYNC(0);
+ }
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vm *vm;
+ struct kvm_run *run;
+ struct kvm_s390_mem_op ksmo;
+ int rv, i, maxsize;
+
+ setbuf(stdout, NULL); /* Tell stdout not to buffer its content */
+
+ maxsize = kvm_check_cap(KVM_CAP_S390_MEM_OP);
+ if (!maxsize) {
+ print_skip("CAP_S390_MEM_OP not supported");
+ exit(KSFT_SKIP);
+ }
+ if (maxsize > sizeof(mem1))
+ maxsize = sizeof(mem1);
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+ run = vcpu_state(vm, VCPU_ID);
+
+ for (i = 0; i < sizeof(mem1); i++)
+ mem1[i] = i * i + i;
+
+ /* Set the first array */
+ ksmo.gaddr = addr_gva2gpa(vm, (uintptr_t)mem1);
+ ksmo.flags = 0;
+ ksmo.size = maxsize;
+ ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE;
+ ksmo.buf = (uintptr_t)mem1;
+ ksmo.ar = 0;
+ vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo);
+
+ /* Let the guest code copy the first array to the second */
+ vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC,
+ "Unexpected exit reason: %u (%s)\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ memset(mem2, 0xaa, sizeof(mem2));
+
+ /* Get the second array */
+ ksmo.gaddr = (uintptr_t)mem2;
+ ksmo.flags = 0;
+ ksmo.size = maxsize;
+ ksmo.op = KVM_S390_MEMOP_LOGICAL_READ;
+ ksmo.buf = (uintptr_t)mem2;
+ ksmo.ar = 0;
+ vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo);
+
+ TEST_ASSERT(!memcmp(mem1, mem2, maxsize),
+ "Memory contents do not match!");
+
+ /* Check error conditions - first bad size: */
+ ksmo.gaddr = (uintptr_t)mem1;
+ ksmo.flags = 0;
+ ksmo.size = -1;
+ ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE;
+ ksmo.buf = (uintptr_t)mem1;
+ ksmo.ar = 0;
+ rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo);
+ TEST_ASSERT(rv == -1 && errno == E2BIG, "ioctl allows insane sizes");
+
+ /* Zero size: */
+ ksmo.gaddr = (uintptr_t)mem1;
+ ksmo.flags = 0;
+ ksmo.size = 0;
+ ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE;
+ ksmo.buf = (uintptr_t)mem1;
+ ksmo.ar = 0;
+ rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo);
+ TEST_ASSERT(rv == -1 && (errno == EINVAL || errno == ENOMEM),
+ "ioctl allows 0 as size");
+
+ /* Bad flags: */
+ ksmo.gaddr = (uintptr_t)mem1;
+ ksmo.flags = -1;
+ ksmo.size = maxsize;
+ ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE;
+ ksmo.buf = (uintptr_t)mem1;
+ ksmo.ar = 0;
+ rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo);
+ TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows all flags");
+
+ /* Bad operation: */
+ ksmo.gaddr = (uintptr_t)mem1;
+ ksmo.flags = 0;
+ ksmo.size = maxsize;
+ ksmo.op = -1;
+ ksmo.buf = (uintptr_t)mem1;
+ ksmo.ar = 0;
+ rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo);
+ TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows bad operations");
+
+ /* Bad guest address: */
+ ksmo.gaddr = ~0xfffUL;
+ ksmo.flags = KVM_S390_MEMOP_F_CHECK_ONLY;
+ ksmo.size = maxsize;
+ ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE;
+ ksmo.buf = (uintptr_t)mem1;
+ ksmo.ar = 0;
+ rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo);
+ TEST_ASSERT(rv > 0, "ioctl does not report bad guest memory access");
+
+ /* Bad host address: */
+ ksmo.gaddr = (uintptr_t)mem1;
+ ksmo.flags = 0;
+ ksmo.size = maxsize;
+ ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE;
+ ksmo.buf = 0;
+ ksmo.ar = 0;
+ rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo);
+ TEST_ASSERT(rv == -1 && errno == EFAULT,
+ "ioctl does not report bad host memory address");
+
+ /* Bad access register: */
+ run->psw_mask &= ~(3UL << (63 - 17));
+ run->psw_mask |= 1UL << (63 - 17); /* Enable AR mode */
+ vcpu_run(vm, VCPU_ID); /* To sync new state to SIE block */
+ ksmo.gaddr = (uintptr_t)mem1;
+ ksmo.flags = 0;
+ ksmo.size = maxsize;
+ ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE;
+ ksmo.buf = (uintptr_t)mem1;
+ ksmo.ar = 17;
+ rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo);
+ TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows ARs > 15");
+ run->psw_mask &= ~(3UL << (63 - 17)); /* Disable AR mode */
+ vcpu_run(vm, VCPU_ID); /* Run to sync new state */
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/s390x/resets.c b/tools/testing/selftests/kvm/s390x/resets.c
new file mode 100644
index 000000000..b143db6d8
--- /dev/null
+++ b/tools/testing/selftests/kvm/s390x/resets.c
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Test for s390x CPU resets
+ *
+ * Copyright (C) 2020, IBM
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+
+#define VCPU_ID 3
+#define LOCAL_IRQS 32
+
+struct kvm_s390_irq buf[VCPU_ID + LOCAL_IRQS];
+
+struct kvm_vm *vm;
+struct kvm_run *run;
+struct kvm_sync_regs *sync_regs;
+static uint8_t regs_null[512];
+
+static void guest_code_initial(void)
+{
+ /* set several CRs to "safe" value */
+ unsigned long cr2_59 = 0x10; /* enable guarded storage */
+ unsigned long cr8_63 = 0x1; /* monitor mask = 1 */
+ unsigned long cr10 = 1; /* PER START */
+ unsigned long cr11 = -1; /* PER END */
+
+
+ /* Dirty registers */
+ asm volatile (
+ " lghi 2,0x11\n" /* Round toward 0 */
+ " sfpc 2\n" /* set fpc to !=0 */
+ " lctlg 2,2,%0\n"
+ " lctlg 8,8,%1\n"
+ " lctlg 10,10,%2\n"
+ " lctlg 11,11,%3\n"
+ /* now clobber some general purpose regs */
+ " llihh 0,0xffff\n"
+ " llihl 1,0x5555\n"
+ " llilh 2,0xaaaa\n"
+ " llill 3,0x0000\n"
+ /* now clobber a floating point reg */
+ " lghi 4,0x1\n"
+ " cdgbr 0,4\n"
+ /* now clobber an access reg */
+ " sar 9,4\n"
+ /* We embed diag 501 here to control register content */
+ " diag 0,0,0x501\n"
+ :
+ : "m" (cr2_59), "m" (cr8_63), "m" (cr10), "m" (cr11)
+ /* no clobber list as this should not return */
+ );
+}
+
+static void test_one_reg(uint64_t id, uint64_t value)
+{
+ struct kvm_one_reg reg;
+ uint64_t eval_reg;
+
+ reg.addr = (uintptr_t)&eval_reg;
+ reg.id = id;
+ vcpu_get_reg(vm, VCPU_ID, &reg);
+ TEST_ASSERT(eval_reg == value, "value == 0x%lx", value);
+}
+
+static void assert_noirq(void)
+{
+ struct kvm_s390_irq_state irq_state;
+ int irqs;
+
+ irq_state.len = sizeof(buf);
+ irq_state.buf = (unsigned long)buf;
+ irqs = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_GET_IRQ_STATE, &irq_state);
+ /*
+ * irqs contains the number of retrieved interrupts. Any interrupt
+ * (notably, the emergency call interrupt we have injected) should
+ * be cleared by the resets, so this should be 0.
+ */
+ TEST_ASSERT(irqs >= 0, "Could not fetch IRQs: errno %d\n", errno);
+ TEST_ASSERT(!irqs, "IRQ pending");
+}
+
+static void assert_clear(void)
+{
+ struct kvm_sregs sregs;
+ struct kvm_regs regs;
+ struct kvm_fpu fpu;
+
+ vcpu_regs_get(vm, VCPU_ID, &regs);
+ TEST_ASSERT(!memcmp(&regs.gprs, regs_null, sizeof(regs.gprs)), "grs == 0");
+
+ vcpu_sregs_get(vm, VCPU_ID, &sregs);
+ TEST_ASSERT(!memcmp(&sregs.acrs, regs_null, sizeof(sregs.acrs)), "acrs == 0");
+
+ vcpu_fpu_get(vm, VCPU_ID, &fpu);
+ TEST_ASSERT(!memcmp(&fpu.fprs, regs_null, sizeof(fpu.fprs)), "fprs == 0");
+
+ /* sync regs */
+ TEST_ASSERT(!memcmp(sync_regs->gprs, regs_null, sizeof(sync_regs->gprs)),
+ "gprs0-15 == 0 (sync_regs)");
+
+ TEST_ASSERT(!memcmp(sync_regs->acrs, regs_null, sizeof(sync_regs->acrs)),
+ "acrs0-15 == 0 (sync_regs)");
+
+ TEST_ASSERT(!memcmp(sync_regs->vrs, regs_null, sizeof(sync_regs->vrs)),
+ "vrs0-15 == 0 (sync_regs)");
+}
+
+static void assert_initial_noclear(void)
+{
+ TEST_ASSERT(sync_regs->gprs[0] == 0xffff000000000000UL,
+ "gpr0 == 0xffff000000000000 (sync_regs)");
+ TEST_ASSERT(sync_regs->gprs[1] == 0x0000555500000000UL,
+ "gpr1 == 0x0000555500000000 (sync_regs)");
+ TEST_ASSERT(sync_regs->gprs[2] == 0x00000000aaaa0000UL,
+ "gpr2 == 0x00000000aaaa0000 (sync_regs)");
+ TEST_ASSERT(sync_regs->gprs[3] == 0x0000000000000000UL,
+ "gpr3 == 0x0000000000000000 (sync_regs)");
+ TEST_ASSERT(sync_regs->fprs[0] == 0x3ff0000000000000UL,
+ "fpr0 == 0f1 (sync_regs)");
+ TEST_ASSERT(sync_regs->acrs[9] == 1, "ar9 == 1 (sync_regs)");
+}
+
+static void assert_initial(void)
+{
+ struct kvm_sregs sregs;
+ struct kvm_fpu fpu;
+
+ /* KVM_GET_SREGS */
+ vcpu_sregs_get(vm, VCPU_ID, &sregs);
+ TEST_ASSERT(sregs.crs[0] == 0xE0UL, "cr0 == 0xE0 (KVM_GET_SREGS)");
+ TEST_ASSERT(sregs.crs[14] == 0xC2000000UL,
+ "cr14 == 0xC2000000 (KVM_GET_SREGS)");
+ TEST_ASSERT(!memcmp(&sregs.crs[1], regs_null, sizeof(sregs.crs[1]) * 12),
+ "cr1-13 == 0 (KVM_GET_SREGS)");
+ TEST_ASSERT(sregs.crs[15] == 0, "cr15 == 0 (KVM_GET_SREGS)");
+
+ /* sync regs */
+ TEST_ASSERT(sync_regs->crs[0] == 0xE0UL, "cr0 == 0xE0 (sync_regs)");
+ TEST_ASSERT(sync_regs->crs[14] == 0xC2000000UL,
+ "cr14 == 0xC2000000 (sync_regs)");
+ TEST_ASSERT(!memcmp(&sync_regs->crs[1], regs_null, 8 * 12),
+ "cr1-13 == 0 (sync_regs)");
+ TEST_ASSERT(sync_regs->crs[15] == 0, "cr15 == 0 (sync_regs)");
+ TEST_ASSERT(sync_regs->fpc == 0, "fpc == 0 (sync_regs)");
+ TEST_ASSERT(sync_regs->todpr == 0, "todpr == 0 (sync_regs)");
+ TEST_ASSERT(sync_regs->cputm == 0, "cputm == 0 (sync_regs)");
+ TEST_ASSERT(sync_regs->ckc == 0, "ckc == 0 (sync_regs)");
+ TEST_ASSERT(sync_regs->pp == 0, "pp == 0 (sync_regs)");
+ TEST_ASSERT(sync_regs->gbea == 1, "gbea == 1 (sync_regs)");
+
+ /* kvm_run */
+ TEST_ASSERT(run->psw_addr == 0, "psw_addr == 0 (kvm_run)");
+ TEST_ASSERT(run->psw_mask == 0, "psw_mask == 0 (kvm_run)");
+
+ vcpu_fpu_get(vm, VCPU_ID, &fpu);
+ TEST_ASSERT(!fpu.fpc, "fpc == 0");
+
+ test_one_reg(KVM_REG_S390_GBEA, 1);
+ test_one_reg(KVM_REG_S390_PP, 0);
+ test_one_reg(KVM_REG_S390_TODPR, 0);
+ test_one_reg(KVM_REG_S390_CPU_TIMER, 0);
+ test_one_reg(KVM_REG_S390_CLOCK_COMP, 0);
+}
+
+static void assert_normal_noclear(void)
+{
+	TEST_ASSERT(sync_regs->crs[2] == 0x10, "cr2 == 0x10 (sync_regs)");
+	TEST_ASSERT(sync_regs->crs[8] == 1, "cr8 == 1 (sync_regs)");
+ TEST_ASSERT(sync_regs->crs[10] == 1, "cr10 == 1 (sync_regs)");
+ TEST_ASSERT(sync_regs->crs[11] == -1, "cr11 == -1 (sync_regs)");
+}
+
+static void assert_normal(void)
+{
+ test_one_reg(KVM_REG_S390_PFTOKEN, KVM_S390_PFAULT_TOKEN_INVALID);
+ TEST_ASSERT(sync_regs->pft == KVM_S390_PFAULT_TOKEN_INVALID,
+ "pft == 0xff..... (sync_regs)");
+ assert_noirq();
+}
+
+static void inject_irq(int cpu_id)
+{
+ struct kvm_s390_irq_state irq_state;
+ struct kvm_s390_irq *irq = &buf[0];
+ int irqs;
+
+ /* Inject IRQ */
+ irq_state.len = sizeof(struct kvm_s390_irq);
+ irq_state.buf = (unsigned long)buf;
+ irq->type = KVM_S390_INT_EMERGENCY;
+ irq->u.emerg.code = cpu_id;
+ irqs = _vcpu_ioctl(vm, cpu_id, KVM_S390_SET_IRQ_STATE, &irq_state);
+ TEST_ASSERT(irqs >= 0, "Error injecting EMERGENCY IRQ errno %d\n", errno);
+}
+
+static void test_normal(void)
+{
+ pr_info("Testing normal reset\n");
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, guest_code_initial);
+ run = vcpu_state(vm, VCPU_ID);
+ sync_regs = &run->s.regs;
+
+ vcpu_run(vm, VCPU_ID);
+
+ inject_irq(VCPU_ID);
+
+ vcpu_ioctl(vm, VCPU_ID, KVM_S390_NORMAL_RESET, 0);
+
+	/* must be cleared */
+	assert_normal();
+	/* must not be cleared */
+ assert_normal_noclear();
+ assert_initial_noclear();
+
+ kvm_vm_free(vm);
+}
+
+static void test_initial(void)
+{
+ pr_info("Testing initial reset\n");
+ vm = vm_create_default(VCPU_ID, 0, guest_code_initial);
+ run = vcpu_state(vm, VCPU_ID);
+ sync_regs = &run->s.regs;
+
+ vcpu_run(vm, VCPU_ID);
+
+ inject_irq(VCPU_ID);
+
+ vcpu_ioctl(vm, VCPU_ID, KVM_S390_INITIAL_RESET, 0);
+
+	/* must be cleared */
+	assert_normal();
+	assert_initial();
+	/* must not be cleared */
+ assert_initial_noclear();
+
+ kvm_vm_free(vm);
+}
+
+static void test_clear(void)
+{
+ pr_info("Testing clear reset\n");
+ vm = vm_create_default(VCPU_ID, 0, guest_code_initial);
+ run = vcpu_state(vm, VCPU_ID);
+ sync_regs = &run->s.regs;
+
+ vcpu_run(vm, VCPU_ID);
+
+ inject_irq(VCPU_ID);
+
+ vcpu_ioctl(vm, VCPU_ID, KVM_S390_CLEAR_RESET, 0);
+
+	/* must be cleared */
+ assert_normal();
+ assert_initial();
+ assert_clear();
+
+ kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+ setbuf(stdout, NULL); /* Tell stdout not to buffer its content */
+
+ test_initial();
+ if (kvm_check_cap(KVM_CAP_S390_VCPU_RESETS)) {
+ test_normal();
+ test_clear();
+ }
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/s390x/sync_regs_test.c b/tools/testing/selftests/kvm/s390x/sync_regs_test.c
new file mode 100644
index 000000000..5731ccf34
--- /dev/null
+++ b/tools/testing/selftests/kvm/s390x/sync_regs_test.c
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Test for s390x KVM_CAP_SYNC_REGS
+ *
+ * Based on the same test for x86:
+ * Copyright (C) 2018, Google LLC.
+ *
+ * Adaptations for s390x:
+ * Copyright (C) 2019, Red Hat, Inc.
+ *
+ * Test expected behavior of the KVM_CAP_SYNC_REGS functionality.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+
+#define VCPU_ID 5
+
+static void guest_code(void)
+{
+ /*
+	 * We embed diag 501 here instead of doing a ucall so that the
+	 * compiler cannot have clobbered r11 by the time the hypercall runs.
+ */
+ asm volatile (
+ "0: diag 0,0,0x501\n"
+ " ahi 11,1\n"
+ " j 0b\n"
+ );
+}
+
+#define REG_COMPARE(reg) \
+ TEST_ASSERT(left->reg == right->reg, \
+ "Register " #reg \
+ " values did not match: 0x%llx, 0x%llx\n", \
+ left->reg, right->reg)
+
+#define REG_COMPARE32(reg) \
+ TEST_ASSERT(left->reg == right->reg, \
+ "Register " #reg \
+ " values did not match: 0x%x, 0x%x\n", \
+ left->reg, right->reg)
+
+
+static void compare_regs(struct kvm_regs *left, struct kvm_sync_regs *right)
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+ REG_COMPARE(gprs[i]);
+}
+
+static void compare_sregs(struct kvm_sregs *left, struct kvm_sync_regs *right)
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+ REG_COMPARE32(acrs[i]);
+
+ for (i = 0; i < 16; i++)
+ REG_COMPARE(crs[i]);
+}
+
+#undef REG_COMPARE
+
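+/*
+ * Register sets mirrored through kvm_run's sync_regs, plus a bit that (at
+ * least on s390x) no KVM_SYNC_* flag uses, to exercise the error path.
+ */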
+#define TEST_SYNC_FIELDS (KVM_SYNC_GPRS|KVM_SYNC_ACRS|KVM_SYNC_CRS)
+#define INVALID_SYNC_FIELD 0x80000000
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vm *vm;
+ struct kvm_run *run;
+ struct kvm_regs regs;
+ struct kvm_sregs sregs;
+ int rv, cap;
+
+ /* Tell stdout not to buffer its content */
+ setbuf(stdout, NULL);
+
+ cap = kvm_check_cap(KVM_CAP_SYNC_REGS);
+ if (!cap) {
+ print_skip("CAP_SYNC_REGS not supported");
+ exit(KSFT_SKIP);
+ }
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+
+ run = vcpu_state(vm, VCPU_ID);
+
+ /* Request reading invalid register set from VCPU. */
+ run->kvm_valid_regs = INVALID_SYNC_FIELD;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv < 0 && errno == EINVAL,
+ "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n",
+ rv);
+ vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0;
+
+ run->kvm_valid_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv < 0 && errno == EINVAL,
+ "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n",
+ rv);
+ vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0;
+
+ /* Request setting invalid register set into VCPU. */
+ run->kvm_dirty_regs = INVALID_SYNC_FIELD;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv < 0 && errno == EINVAL,
+ "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n",
+ rv);
+ vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0;
+
+ run->kvm_dirty_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv < 0 && errno == EINVAL,
+ "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n",
+ rv);
+ vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0;
+
+ /* Request and verify all valid register sets. */
+ run->kvm_valid_regs = TEST_SYNC_FIELDS;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC,
+ "Unexpected exit reason: %u (%s)\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+ TEST_ASSERT(run->s390_sieic.icptcode == 4 &&
+ (run->s390_sieic.ipa >> 8) == 0x83 &&
+ (run->s390_sieic.ipb >> 16) == 0x501,
+ "Unexpected interception code: ic=%u, ipa=0x%x, ipb=0x%x\n",
+ run->s390_sieic.icptcode, run->s390_sieic.ipa,
+ run->s390_sieic.ipb);
+
+ vcpu_regs_get(vm, VCPU_ID, &regs);
+ compare_regs(&regs, &run->s.regs);
+
+ vcpu_sregs_get(vm, VCPU_ID, &sregs);
+ compare_sregs(&sregs, &run->s.regs);
+
+ /* Set and verify various register values */
+ run->s.regs.gprs[11] = 0xBAD1DEA;
+ run->s.regs.acrs[0] = 1 << 11;
+
+ run->kvm_valid_regs = TEST_SYNC_FIELDS;
+ run->kvm_dirty_regs = KVM_SYNC_GPRS | KVM_SYNC_ACRS;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC,
+ "Unexpected exit reason: %u (%s)\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+ TEST_ASSERT(run->s.regs.gprs[11] == 0xBAD1DEA + 1,
+ "r11 sync regs value incorrect 0x%llx.",
+ run->s.regs.gprs[11]);
+ TEST_ASSERT(run->s.regs.acrs[0] == 1 << 11,
+ "acr0 sync regs value incorrect 0x%x.",
+ run->s.regs.acrs[0]);
+
+ vcpu_regs_get(vm, VCPU_ID, &regs);
+ compare_regs(&regs, &run->s.regs);
+
+ vcpu_sregs_get(vm, VCPU_ID, &sregs);
+ compare_sregs(&sregs, &run->s.regs);
+
+ /* Clear kvm_dirty_regs bits, verify new s.regs values are
+ * overwritten with existing guest values.
+ */
+ run->kvm_valid_regs = TEST_SYNC_FIELDS;
+ run->kvm_dirty_regs = 0;
+ run->s.regs.gprs[11] = 0xDEADBEEF;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC,
+ "Unexpected exit reason: %u (%s)\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+ TEST_ASSERT(run->s.regs.gprs[11] != 0xDEADBEEF,
+ "r11 sync regs value incorrect 0x%llx.",
+ run->s.regs.gprs[11]);
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c
new file mode 100644
index 000000000..6f441dd9f
--- /dev/null
+++ b/tools/testing/selftests/kvm/set_memory_region_test.c
@@ -0,0 +1,417 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <pthread.h>
+#include <sched.h>
+#include <semaphore.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <linux/compiler.h>
+
+#include <test_util.h>
+#include <kvm_util.h>
+#include <processor.h>
+
+#define VCPU_ID 0
+
+/*
+ * s390x needs at least 1MB alignment, and the x86_64 MOVE/DELETE tests need a
+ * 2MB sized and aligned region so that the initial region corresponds to
+ * exactly one large page.
+ */
+#define MEM_REGION_SIZE 0x200000
+
+#ifdef __x86_64__
+/*
+ * Somewhat arbitrary location and slot, intended to not overlap anything.
+ */
+#define MEM_REGION_GPA 0xc0000000
+#define MEM_REGION_SLOT 10
+
+static const uint64_t MMIO_VAL = 0xbeefull;
+
+extern const uint64_t final_rip_start;
+extern const uint64_t final_rip_end;
+
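+/*
+ * Posted by the vCPU worker on each guest UCALL_SYNC so that the main
+ * thread can serialize memslot updates against guest progress.
+ */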
+static sem_t vcpu_ready;
+
+static inline uint64_t guest_spin_on_val(uint64_t spin_val)
+{
+ uint64_t val;
+
+ do {
+ val = READ_ONCE(*((uint64_t *)MEM_REGION_GPA));
+ } while (val == spin_val);
+
+ GUEST_SYNC(0);
+ return val;
+}
+
+static void *vcpu_worker(void *data)
+{
+ struct kvm_vm *vm = data;
+ struct kvm_run *run;
+ struct ucall uc;
+ uint64_t cmd;
+
+ /*
+ * Loop until the guest is done. Re-enter the guest on all MMIO exits,
+ * which will occur if the guest attempts to access a memslot after it
+	 * has been deleted or while it is being moved.
+ */
+ run = vcpu_state(vm, VCPU_ID);
+
+ while (1) {
+ vcpu_run(vm, VCPU_ID);
+
+ if (run->exit_reason == KVM_EXIT_IO) {
+ cmd = get_ucall(vm, VCPU_ID, &uc);
+ if (cmd != UCALL_SYNC)
+ break;
+
+ sem_post(&vcpu_ready);
+ continue;
+ }
+
+ if (run->exit_reason != KVM_EXIT_MMIO)
+ break;
+
+ TEST_ASSERT(!run->mmio.is_write, "Unexpected exit mmio write");
+ TEST_ASSERT(run->mmio.len == 8,
+ "Unexpected exit mmio size = %u", run->mmio.len);
+
+ TEST_ASSERT(run->mmio.phys_addr == MEM_REGION_GPA,
+ "Unexpected exit mmio address = 0x%llx",
+ run->mmio.phys_addr);
+ memcpy(run->mmio.data, &MMIO_VAL, 8);
+ }
+
+ if (run->exit_reason == KVM_EXIT_IO && cmd == UCALL_ABORT)
+ TEST_FAIL("%s at %s:%ld, val = %lu", (const char *)uc.args[0],
+ __FILE__, uc.args[1], uc.args[2]);
+
+ return NULL;
+}
+
+static void wait_for_vcpu(void)
+{
+ struct timespec ts;
+
+ TEST_ASSERT(!clock_gettime(CLOCK_REALTIME, &ts),
+ "clock_gettime() failed: %d\n", errno);
+
+ ts.tv_sec += 2;
+ TEST_ASSERT(!sem_timedwait(&vcpu_ready, &ts),
+ "sem_timedwait() failed: %d\n", errno);
+
+ /* Wait for the vCPU thread to reenter the guest. */
+ usleep(100000);
+}
+
+static struct kvm_vm *spawn_vm(pthread_t *vcpu_thread, void *guest_code)
+{
+ struct kvm_vm *vm;
+ uint64_t *hva;
+ uint64_t gpa;
+
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_THP,
+ MEM_REGION_GPA, MEM_REGION_SLOT,
+ MEM_REGION_SIZE / getpagesize(), 0);
+
+ /*
+ * Allocate and map two pages so that the GPA accessed by guest_code()
+ * stays valid across the memslot move.
+ */
+ gpa = vm_phy_pages_alloc(vm, 2, MEM_REGION_GPA, MEM_REGION_SLOT);
+ TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc\n");
+
+ virt_map(vm, MEM_REGION_GPA, MEM_REGION_GPA, 2, 0);
+
+ /* Ditto for the host mapping so that both pages can be zeroed. */
+ hva = addr_gpa2hva(vm, MEM_REGION_GPA);
+ memset(hva, 0, 2 * 4096);
+
+ pthread_create(vcpu_thread, NULL, vcpu_worker, vm);
+
+ /* Ensure the guest thread is spun up. */
+ wait_for_vcpu();
+
+ return vm;
+}
+
+
+static void guest_code_move_memory_region(void)
+{
+ uint64_t val;
+
+ GUEST_SYNC(0);
+
+ /*
+ * Spin until the memory region starts getting moved to a
+ * misaligned address.
+ * Every region move may or may not trigger MMIO, as the
+ * window where the memslot is invalid is usually quite small.
+ */
+ val = guest_spin_on_val(0);
+ GUEST_ASSERT_1(val == 1 || val == MMIO_VAL, val);
+
+ /* Spin until the misaligning memory region move completes. */
+ val = guest_spin_on_val(MMIO_VAL);
+ GUEST_ASSERT_1(val == 1 || val == 0, val);
+
+ /* Spin until the memory region starts to get re-aligned. */
+ val = guest_spin_on_val(0);
+ GUEST_ASSERT_1(val == 1 || val == MMIO_VAL, val);
+
+ /* Spin until the re-aligning memory region move completes. */
+ val = guest_spin_on_val(MMIO_VAL);
+ GUEST_ASSERT_1(val == 1, val);
+
+ GUEST_DONE();
+}
+
+static void test_move_memory_region(void)
+{
+ pthread_t vcpu_thread;
+ struct kvm_vm *vm;
+ uint64_t *hva;
+
+ vm = spawn_vm(&vcpu_thread, guest_code_move_memory_region);
+
+ hva = addr_gpa2hva(vm, MEM_REGION_GPA);
+
+ /*
+ * Shift the region's base GPA. The guest should not see "2" as the
+ * hva->gpa translation is misaligned, i.e. the guest is accessing a
+ * different host pfn.
+ */
+ vm_mem_region_move(vm, MEM_REGION_SLOT, MEM_REGION_GPA - 4096);
+ WRITE_ONCE(*hva, 2);
+
+ /*
+ * The guest _might_ see an invalid memslot and trigger MMIO, but it's
+ * a tiny window. Spin and defer the sync until the memslot is
+ * restored and guest behavior is once again deterministic.
+ */
+ usleep(100000);
+
+ /*
+ * Note, value in memory needs to be changed *before* restoring the
+ * memslot, else the guest could race the update and see "2".
+ */
+ WRITE_ONCE(*hva, 1);
+
+ /* Restore the original base, the guest should see "1". */
+ vm_mem_region_move(vm, MEM_REGION_SLOT, MEM_REGION_GPA);
+ wait_for_vcpu();
+	/* Deferred sync from when the memslot was misaligned (above). */
+ wait_for_vcpu();
+
+ pthread_join(vcpu_thread, NULL);
+
+ kvm_vm_free(vm);
+}
+
+static void guest_code_delete_memory_region(void)
+{
+ uint64_t val;
+
+ GUEST_SYNC(0);
+
+ /* Spin until the memory region is deleted. */
+ val = guest_spin_on_val(0);
+ GUEST_ASSERT_1(val == MMIO_VAL, val);
+
+ /* Spin until the memory region is recreated. */
+ val = guest_spin_on_val(MMIO_VAL);
+ GUEST_ASSERT_1(val == 0, val);
+
+ /* Spin until the memory region is deleted. */
+ val = guest_spin_on_val(0);
+ GUEST_ASSERT_1(val == MMIO_VAL, val);
+
+ asm("1:\n\t"
+ ".pushsection .rodata\n\t"
+ ".global final_rip_start\n\t"
+ "final_rip_start: .quad 1b\n\t"
+ ".popsection");
+
+ /* Spin indefinitely (until the code memslot is deleted). */
+ guest_spin_on_val(MMIO_VAL);
+
+ asm("1:\n\t"
+ ".pushsection .rodata\n\t"
+ ".global final_rip_end\n\t"
+ "final_rip_end: .quad 1b\n\t"
+ ".popsection");
+
+ GUEST_ASSERT_1(0, 0);
+}
+
+static void test_delete_memory_region(void)
+{
+ pthread_t vcpu_thread;
+ struct kvm_regs regs;
+ struct kvm_run *run;
+ struct kvm_vm *vm;
+
+ vm = spawn_vm(&vcpu_thread, guest_code_delete_memory_region);
+
+ /* Delete the memory region, the guest should not die. */
+ vm_mem_region_delete(vm, MEM_REGION_SLOT);
+ wait_for_vcpu();
+
+ /* Recreate the memory region. The guest should see "0". */
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_THP,
+ MEM_REGION_GPA, MEM_REGION_SLOT,
+ MEM_REGION_SIZE / getpagesize(), 0);
+ wait_for_vcpu();
+
+ /* Delete the region again so that there's only one memslot left. */
+ vm_mem_region_delete(vm, MEM_REGION_SLOT);
+ wait_for_vcpu();
+
+ /*
+ * Delete the primary memslot. This should cause an emulation error or
+ * shutdown due to the page tables getting nuked.
+ */
+ vm_mem_region_delete(vm, 0);
+
+ pthread_join(vcpu_thread, NULL);
+
+ run = vcpu_state(vm, VCPU_ID);
+
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN ||
+ run->exit_reason == KVM_EXIT_INTERNAL_ERROR,
+ "Unexpected exit reason = %d", run->exit_reason);
+
+ vcpu_regs_get(vm, VCPU_ID, &regs);
+
+ /*
+ * On AMD, after KVM_EXIT_SHUTDOWN the VMCB has been reinitialized already,
+ * so the instruction pointer would point to the reset vector.
+ */
+ if (run->exit_reason == KVM_EXIT_INTERNAL_ERROR)
+ TEST_ASSERT(regs.rip >= final_rip_start &&
+ regs.rip < final_rip_end,
+ "Bad rip, expected 0x%lx - 0x%lx, got 0x%llx\n",
+ final_rip_start, final_rip_end, regs.rip);
+
+ kvm_vm_free(vm);
+}
+
+static void test_zero_memory_regions(void)
+{
+ struct kvm_run *run;
+ struct kvm_vm *vm;
+
+ pr_info("Testing KVM_RUN with zero added memory regions\n");
+
+ vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR);
+ vm_vcpu_add(vm, VCPU_ID);
+
+ TEST_ASSERT(!ioctl(vm_get_fd(vm), KVM_SET_NR_MMU_PAGES, 64),
+ "KVM_SET_NR_MMU_PAGES failed, errno = %d\n", errno);
+ vcpu_run(vm, VCPU_ID);
+
+ run = vcpu_state(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR,
+ "Unexpected exit_reason = %u\n", run->exit_reason);
+
+ kvm_vm_free(vm);
+}
+#endif /* __x86_64__ */
+
+/*
+ * Test that memory slots can be added up to KVM_CAP_NR_MEMSLOTS, and that
+ * any attempt to add further slots fails.
+ */
+static void test_add_max_memory_regions(void)
+{
+ int ret;
+ struct kvm_vm *vm;
+ uint32_t max_mem_slots;
+ uint32_t slot;
+ uint64_t guest_addr = 0x0;
+ uint64_t mem_reg_npages;
+ void *mem;
+
+ max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS);
+ TEST_ASSERT(max_mem_slots > 0,
+ "KVM_CAP_NR_MEMSLOTS should be greater than 0");
+ pr_info("Allowed number of memory slots: %i\n", max_mem_slots);
+
+ vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR);
+
+ mem_reg_npages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, MEM_REGION_SIZE);
+
+	/* Check that memory slots can be added up to the maximum allowed */
+ pr_info("Adding slots 0..%i, each memory region with %dK size\n",
+ (max_mem_slots - 1), MEM_REGION_SIZE >> 10);
+ for (slot = 0; slot < max_mem_slots; slot++) {
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ guest_addr, slot, mem_reg_npages,
+ 0);
+ guest_addr += MEM_REGION_SIZE;
+ }
+
+	/* Check that no memory slot can be added beyond the limit */
+ mem = mmap(NULL, MEM_REGION_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ TEST_ASSERT(mem != MAP_FAILED, "Failed to mmap() host");
+
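+	/*
+	 * Compound literal fields: slot, flags, guest_phys_addr,
+	 * memory_size, userspace_addr.
+	 */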
+ ret = ioctl(vm_get_fd(vm), KVM_SET_USER_MEMORY_REGION,
+ &(struct kvm_userspace_memory_region) {slot, 0, guest_addr,
+ MEM_REGION_SIZE, (uint64_t) mem});
+ TEST_ASSERT(ret == -1 && errno == EINVAL,
+ "Adding one more memory slot should fail with EINVAL");
+
+ munmap(mem, MEM_REGION_SIZE);
+ kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+#ifdef __x86_64__
+ int i, loops;
+#endif
+
+ /* Tell stdout not to buffer its content */
+ setbuf(stdout, NULL);
+
+#ifdef __x86_64__
+ /*
+ * FIXME: the zero-memslot test fails on aarch64 and s390x because
+ * KVM_RUN fails with ENOEXEC or EFAULT.
+ */
+ test_zero_memory_regions();
+#endif
+
+ test_add_max_memory_regions();
+
+#ifdef __x86_64__
+ if (argc > 1)
+ loops = atoi(argv[1]);
+ else
+ loops = 10;
+
+ pr_info("Testing MOVE of in-use region, %d loops\n", loops);
+ for (i = 0; i < loops; i++)
+ test_move_memory_region();
+
+ pr_info("Testing DELETE of in-use region, %d loops\n", loops);
+ for (i = 0; i < loops; i++)
+ test_delete_memory_region();
+#endif
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c
new file mode 100644
index 000000000..7daedee3e
--- /dev/null
+++ b/tools/testing/selftests/kvm/steal_time.c
@@ -0,0 +1,352 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * steal/stolen time test
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <time.h>
+#include <sched.h>
+#include <pthread.h>
+#include <linux/kernel.h>
+#include <sys/syscall.h>
+#include <asm/kvm.h>
+#include <asm/kvm_para.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define NR_VCPUS 4
+#define ST_GPA_BASE (1 << 30)
+#define MIN_RUN_DELAY_NS 200000UL
+
+static void *st_gva[NR_VCPUS];
+static uint64_t guest_stolen_time[NR_VCPUS];
+
+#if defined(__x86_64__)
+
+/* steal_time must have 64-byte alignment */
+#define STEAL_TIME_SIZE ((sizeof(struct kvm_steal_time) + 63) & ~63)
+
+static void check_status(struct kvm_steal_time *st)
+{
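+	/*
+	 * The version counter is odd only while the host is updating the
+	 * steal-time record.
+	 */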
+ GUEST_ASSERT(!(READ_ONCE(st->version) & 1));
+ GUEST_ASSERT(READ_ONCE(st->flags) == 0);
+ GUEST_ASSERT(READ_ONCE(st->preempted) == 0);
+}
+
+static void guest_code(int cpu)
+{
+ struct kvm_steal_time *st = st_gva[cpu];
+ uint32_t version;
+
+ GUEST_ASSERT(rdmsr(MSR_KVM_STEAL_TIME) == ((uint64_t)st_gva[cpu] | KVM_MSR_ENABLED));
+
+ memset(st, 0, sizeof(*st));
+ GUEST_SYNC(0);
+
+ check_status(st);
+ WRITE_ONCE(guest_stolen_time[cpu], st->steal);
+ version = READ_ONCE(st->version);
+ check_status(st);
+ GUEST_SYNC(1);
+
+ check_status(st);
+ GUEST_ASSERT(version < READ_ONCE(st->version));
+ WRITE_ONCE(guest_stolen_time[cpu], st->steal);
+ check_status(st);
+ GUEST_DONE();
+}
+
+static void steal_time_init(struct kvm_vm *vm)
+{
+ int i;
+
+ if (!(kvm_get_supported_cpuid_entry(KVM_CPUID_FEATURES)->eax &
+ KVM_FEATURE_STEAL_TIME)) {
+ print_skip("steal-time not supported");
+ exit(KSFT_SKIP);
+ }
+
+ for (i = 0; i < NR_VCPUS; ++i) {
+ int ret;
+
+ vcpu_set_cpuid(vm, i, kvm_get_supported_cpuid());
+
+ /* ST_GPA_BASE is identity mapped */
+ st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE);
+ sync_global_to_guest(vm, st_gva[i]);
+
+ ret = _vcpu_set_msr(vm, i, MSR_KVM_STEAL_TIME, (ulong)st_gva[i] | KVM_STEAL_RESERVED_MASK);
+ TEST_ASSERT(ret == 0, "Bad GPA didn't fail");
+
+ vcpu_set_msr(vm, i, MSR_KVM_STEAL_TIME, (ulong)st_gva[i] | KVM_MSR_ENABLED);
+ }
+}
+
+static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ struct kvm_steal_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpuid]);
+ int i;
+
+ pr_info("VCPU%d:\n", vcpuid);
+ pr_info(" steal: %lld\n", st->steal);
+ pr_info(" version: %d\n", st->version);
+ pr_info(" flags: %d\n", st->flags);
+ pr_info(" preempted: %d\n", st->preempted);
+ pr_info(" u8_pad: ");
+ for (i = 0; i < 3; ++i)
+ pr_info("%d", st->u8_pad[i]);
+ pr_info("\n pad: ");
+ for (i = 0; i < 11; ++i)
+ pr_info("%d", st->pad[i]);
+ pr_info("\n");
+}
+
+#elif defined(__aarch64__)
+
+/* PV_TIME_ST must have 64-byte alignment */
+#define STEAL_TIME_SIZE ((sizeof(struct st_time) + 63) & ~63)
+
+#define SMCCC_ARCH_FEATURES 0x80000001
+#define PV_TIME_FEATURES 0xc5000020
+#define PV_TIME_ST 0xc5000021
+
+struct st_time {
+ uint32_t rev;
+ uint32_t attr;
+ uint64_t st_time;
+};
+
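+/*
+ * Issue a single SMCCC call via HVC: function id in w0, one argument in x1,
+ * result returned in x0.
+ */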
+static int64_t smccc(uint32_t func, uint64_t arg)
+{
+ unsigned long ret;
+
+ asm volatile(
+ "mov w0, %w1\n"
+ "mov x1, %2\n"
+ "hvc #0\n"
+ "mov %0, x0\n"
+ : "=r" (ret) : "r" (func), "r" (arg) :
+ "x0", "x1", "x2", "x3");
+
+ return ret;
+}
+
+static void check_status(struct st_time *st)
+{
+ GUEST_ASSERT(READ_ONCE(st->rev) == 0);
+ GUEST_ASSERT(READ_ONCE(st->attr) == 0);
+}
+
+static void guest_code(int cpu)
+{
+ struct st_time *st;
+ int64_t status;
+
+ status = smccc(SMCCC_ARCH_FEATURES, PV_TIME_FEATURES);
+ GUEST_ASSERT(status == 0);
+ status = smccc(PV_TIME_FEATURES, PV_TIME_FEATURES);
+ GUEST_ASSERT(status == 0);
+ status = smccc(PV_TIME_FEATURES, PV_TIME_ST);
+ GUEST_ASSERT(status == 0);
+
+ status = smccc(PV_TIME_ST, 0);
+ GUEST_ASSERT(status != -1);
+ GUEST_ASSERT(status == (ulong)st_gva[cpu]);
+
+ st = (struct st_time *)status;
+ GUEST_SYNC(0);
+
+ check_status(st);
+ WRITE_ONCE(guest_stolen_time[cpu], st->st_time);
+ GUEST_SYNC(1);
+
+ check_status(st);
+ WRITE_ONCE(guest_stolen_time[cpu], st->st_time);
+ GUEST_DONE();
+}
+
+static void steal_time_init(struct kvm_vm *vm)
+{
+ struct kvm_device_attr dev = {
+ .group = KVM_ARM_VCPU_PVTIME_CTRL,
+ .attr = KVM_ARM_VCPU_PVTIME_IPA,
+ };
+ int i, ret;
+
+ ret = _vcpu_ioctl(vm, 0, KVM_HAS_DEVICE_ATTR, &dev);
+ if (ret != 0 && errno == ENXIO) {
+ print_skip("steal-time not supported");
+ exit(KSFT_SKIP);
+ }
+
+ for (i = 0; i < NR_VCPUS; ++i) {
+ uint64_t st_ipa;
+
+ vcpu_ioctl(vm, i, KVM_HAS_DEVICE_ATTR, &dev);
+
+ dev.addr = (uint64_t)&st_ipa;
+
+ /* ST_GPA_BASE is identity mapped */
+ st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE);
+ sync_global_to_guest(vm, st_gva[i]);
+
+ st_ipa = (ulong)st_gva[i] | 1;
+ ret = _vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev);
+ TEST_ASSERT(ret == -1 && errno == EINVAL, "Bad IPA didn't report EINVAL");
+
+ st_ipa = (ulong)st_gva[i];
+ vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev);
+
+ ret = _vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev);
+ TEST_ASSERT(ret == -1 && errno == EEXIST, "Set IPA twice without EEXIST");
+
+ }
+}
+
+static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ struct st_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpuid]);
+
+ pr_info("VCPU%d:\n", vcpuid);
+ pr_info(" rev: %d\n", st->rev);
+ pr_info(" attr: %d\n", st->attr);
+ pr_info(" st_time: %ld\n", st->st_time);
+}
+
+#endif
+
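+/*
+ * Return the calling task's accumulated run delay (time spent waiting on a
+ * runqueue), in nanoseconds, as reported by /proc/<tid>/schedstat.
+ */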
+static long get_run_delay(void)
+{
+ char path[64];
+ long val[2];
+ FILE *fp;
+
+ sprintf(path, "/proc/%ld/schedstat", syscall(SYS_gettid));
+ fp = fopen(path, "r");
+ fscanf(fp, "%ld %ld ", &val[0], &val[1]);
+ fclose(fp);
+
+ return val[1];
+}
+
+static void *do_steal_time(void *arg)
+{
+ struct timespec ts, stop;
+
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ stop = timespec_add_ns(ts, MIN_RUN_DELAY_NS);
+
+ while (1) {
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ if (timespec_to_ns(timespec_sub(ts, stop)) >= 0)
+ break;
+ }
+
+ return NULL;
+}
+
+static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ struct ucall uc;
+
+ vcpu_args_set(vm, vcpuid, 1, vcpuid);
+
+ vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL);
+
+ switch (get_ucall(vm, vcpuid, &uc)) {
+ case UCALL_SYNC:
+ case UCALL_DONE:
+ break;
+ case UCALL_ABORT:
+ TEST_ASSERT(false, "%s at %s:%ld", (const char *)uc.args[0],
+ __FILE__, uc.args[1]);
+ default:
+ TEST_ASSERT(false, "Unexpected exit: %s",
+ exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason));
+ }
+}
+
+int main(int ac, char **av)
+{
+ struct kvm_vm *vm;
+ pthread_attr_t attr;
+ pthread_t thread;
+ cpu_set_t cpuset;
+ unsigned int gpages;
+ long stolen_time;
+ long run_delay;
+ bool verbose;
+ int i;
+
+ verbose = ac > 1 && (!strncmp(av[1], "-v", 3) || !strncmp(av[1], "--verbose", 10));
+
+ /* Set CPU affinity so we can force preemption of the VCPU */
+ CPU_ZERO(&cpuset);
+ CPU_SET(0, &cpuset);
+ pthread_attr_init(&attr);
+ pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset);
+ pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
+
+ /* Create a one VCPU guest and an identity mapped memslot for the steal time structure */
+ vm = vm_create_default(0, 0, guest_code);
+ gpages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, STEAL_TIME_SIZE * NR_VCPUS);
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, gpages, 0);
+ virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, gpages, 0);
+ ucall_init(vm, NULL);
+
+ /* Add the rest of the VCPUs */
+ for (i = 1; i < NR_VCPUS; ++i)
+ vm_vcpu_add_default(vm, i, guest_code);
+
+ steal_time_init(vm);
+
+ /* Run test on each VCPU */
+ for (i = 0; i < NR_VCPUS; ++i) {
+ /* First VCPU run initializes steal-time */
+ run_vcpu(vm, i);
+
+ /* Second VCPU run, expect guest stolen time to be <= run_delay */
+ run_vcpu(vm, i);
+ sync_global_from_guest(vm, guest_stolen_time[i]);
+ stolen_time = guest_stolen_time[i];
+ run_delay = get_run_delay();
+ TEST_ASSERT(stolen_time <= run_delay,
+ "Expected stolen time <= %ld, got %ld",
+ run_delay, stolen_time);
+
+ /* Steal time from the VCPU. The steal time thread has the same CPU affinity as the VCPUs. */
+ run_delay = get_run_delay();
+ pthread_create(&thread, &attr, do_steal_time, NULL);
+ do
+ pthread_yield();
+ while (get_run_delay() - run_delay < MIN_RUN_DELAY_NS);
+ pthread_join(thread, NULL);
+ run_delay = get_run_delay() - run_delay;
+ TEST_ASSERT(run_delay >= MIN_RUN_DELAY_NS,
+ "Expected run_delay >= %ld, got %ld",
+ MIN_RUN_DELAY_NS, run_delay);
+
+ /* Run VCPU again to confirm stolen time is consistent with run_delay */
+ run_vcpu(vm, i);
+ sync_global_from_guest(vm, guest_stolen_time[i]);
+ stolen_time = guest_stolen_time[i] - stolen_time;
+ TEST_ASSERT(stolen_time >= run_delay,
+ "Expected stolen time >= %ld, got %ld",
+ run_delay, stolen_time);
+
+ if (verbose) {
+ pr_info("VCPU%d: total-stolen-time=%ld test-stolen-time=%ld", i,
+ guest_stolen_time[i], stolen_time);
+ if (stolen_time == run_delay)
+ pr_info(" (BONUS: guest test-stolen-time even exactly matches test-run_delay)");
+ pr_info("\n");
+ steal_time_dump(vm, i);
+ }
+ }
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c
new file mode 100644
index 000000000..140e91901
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * CR4 and CPUID sync test
+ *
+ * Copyright 2018, Red Hat, Inc. and/or its affiliates.
+ *
+ * Author:
+ * Wei Huang <wei@redhat.com>
+ */
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "processor.h"
+
+#define X86_FEATURE_XSAVE (1<<26)
+#define X86_FEATURE_OSXSAVE (1<<27)
+#define VCPU_ID 1
+
+static inline bool cr4_cpuid_is_sync(void)
+{
+ int func, subfunc;
+ uint32_t eax, ebx, ecx, edx;
+ uint64_t cr4;
+
+ func = 0x1;
+ subfunc = 0x0;
+ __asm__ __volatile__("cpuid"
+ : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
+ : "a"(func), "c"(subfunc));
+
+ cr4 = get_cr4();
+
+ return (!!(ecx & X86_FEATURE_OSXSAVE)) == (!!(cr4 & X86_CR4_OSXSAVE));
+}
+
+static void guest_code(void)
+{
+ uint64_t cr4;
+
+ /* turn on CR4.OSXSAVE */
+ cr4 = get_cr4();
+ cr4 |= X86_CR4_OSXSAVE;
+ set_cr4(cr4);
+
+ /* verify CR4.OSXSAVE == CPUID.OSXSAVE */
+ GUEST_ASSERT(cr4_cpuid_is_sync());
+
+ /* notify hypervisor to change CR4 */
+ GUEST_SYNC(0);
+
+ /* check again */
+ GUEST_ASSERT(cr4_cpuid_is_sync());
+
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_run *run;
+ struct kvm_vm *vm;
+ struct kvm_sregs sregs;
+ struct kvm_cpuid_entry2 *entry;
+ struct ucall uc;
+ int rc;
+
+ entry = kvm_get_supported_cpuid_entry(1);
+ if (!(entry->ecx & X86_FEATURE_XSAVE)) {
+ print_skip("XSAVE feature not supported");
+ return 0;
+ }
+
+ /* Tell stdout not to buffer its content */
+ setbuf(stdout, NULL);
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ run = vcpu_state(vm, VCPU_ID);
+
+ while (1) {
+ rc = _vcpu_run(vm, VCPU_ID);
+
+ TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Unexpected exit reason: %u (%s),\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_SYNC:
+ /* emulate hypervisor clearing CR4.OSXSAVE */
+ vcpu_sregs_get(vm, VCPU_ID, &sregs);
+ sregs.cr4 &= ~X86_CR4_OSXSAVE;
+ vcpu_sregs_set(vm, VCPU_ID, &sregs);
+ break;
+ case UCALL_ABORT:
+ TEST_FAIL("Guest CR4 bit (OSXSAVE) unsynchronized with CPUID bit.");
+ break;
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+ }
+
+done:
+	kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/debug_regs.c b/tools/testing/selftests/kvm/x86_64/debug_regs.c
new file mode 100644
index 000000000..2fc6b3af8
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/debug_regs.c
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM guest debug register tests
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+#include <stdio.h>
+#include <string.h>
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 0
+
+#define DR6_BD (1 << 13)
+#define DR7_GD (1 << 13)
+
+/* For testing data access debug BP */
+uint32_t guest_value;
+
+extern unsigned char sw_bp, hw_bp, write_data, ss_start, bd_start;
+
+static void guest_code(void)
+{
+ /*
+ * Software BP tests.
+ *
+	 * NOTE: sw_bp needs to be before the cmd here, because int3 is an
+ * exception rather than a normal trap for KVM_SET_GUEST_DEBUG (we
+ * capture it using the vcpu exception bitmap).
+ */
+ asm volatile("sw_bp: int3");
+
+ /* Hardware instruction BP test */
+ asm volatile("hw_bp: nop");
+
+ /* Hardware data BP test */
+ asm volatile("mov $1234,%%rax;\n\t"
+ "mov %%rax,%0;\n\t write_data:"
+ : "=m" (guest_value) : : "rax");
+
+ /* Single step test, covers 2 basic instructions and 2 emulated */
+ asm volatile("ss_start: "
+ "xor %%eax,%%eax\n\t"
+ "cpuid\n\t"
+ "movl $0x1a0,%%ecx\n\t"
+ "rdmsr\n\t"
+ : : : "eax", "ebx", "ecx", "edx");
+
+ /* DR6.BD test */
+ asm volatile("bd_start: mov %%dr0, %%rax" : : : "rax");
+ GUEST_DONE();
+}
+
+#define CLEAR_DEBUG() memset(&debug, 0, sizeof(debug))
+#define APPLY_DEBUG() vcpu_set_guest_debug(vm, VCPU_ID, &debug)
+#define CAST_TO_RIP(v) ((unsigned long long)&(v))
+#define SET_RIP(v) do { \
+ vcpu_regs_get(vm, VCPU_ID, &regs); \
+ regs.rip = (v); \
+ vcpu_regs_set(vm, VCPU_ID, &regs); \
+ } while (0)
+#define MOVE_RIP(v) SET_RIP(regs.rip + (v));
+
+int main(void)
+{
+ struct kvm_guest_debug debug;
+ unsigned long long target_dr6, target_rip;
+ struct kvm_regs regs;
+ struct kvm_run *run;
+ struct kvm_vm *vm;
+ struct ucall uc;
+ uint64_t cmd;
+ int i;
+ /* Instruction lengths starting at ss_start */
+ int ss_size[4] = {
+ 2, /* xor */
+ 2, /* cpuid */
+ 5, /* mov */
+ 2, /* rdmsr */
+ };
+
+ if (!kvm_check_cap(KVM_CAP_SET_GUEST_DEBUG)) {
+ print_skip("KVM_CAP_SET_GUEST_DEBUG not supported");
+ return 0;
+ }
+
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ run = vcpu_state(vm, VCPU_ID);
+
+ /* Test software BPs - int3 */
+ CLEAR_DEBUG();
+ debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
+ APPLY_DEBUG();
+ vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG &&
+ run->debug.arch.exception == BP_VECTOR &&
+ run->debug.arch.pc == CAST_TO_RIP(sw_bp),
+ "INT3: exit %d exception %d rip 0x%llx (should be 0x%llx)",
+ run->exit_reason, run->debug.arch.exception,
+ run->debug.arch.pc, CAST_TO_RIP(sw_bp));
+ MOVE_RIP(1);
+
+ /* Test instruction HW BP over DR[0-3] */
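+	/*
+	 * DR7: bit 10 always reads as 1; the odd bits 1/3/5/7 are the global
+	 * enable bits for DR0-DR3.
+	 */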
+ for (i = 0; i < 4; i++) {
+ CLEAR_DEBUG();
+ debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
+ debug.arch.debugreg[i] = CAST_TO_RIP(hw_bp);
+ debug.arch.debugreg[7] = 0x400 | (1UL << (2*i+1));
+ APPLY_DEBUG();
+ vcpu_run(vm, VCPU_ID);
+ target_dr6 = 0xffff0ff0 | (1UL << i);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG &&
+ run->debug.arch.exception == DB_VECTOR &&
+ run->debug.arch.pc == CAST_TO_RIP(hw_bp) &&
+ run->debug.arch.dr6 == target_dr6,
+ "INS_HW_BP (DR%d): exit %d exception %d rip 0x%llx "
+ "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)",
+ i, run->exit_reason, run->debug.arch.exception,
+ run->debug.arch.pc, CAST_TO_RIP(hw_bp),
+ run->debug.arch.dr6, target_dr6);
+ }
+ /* Skip "nop" */
+ MOVE_RIP(1);
+
+ /* Test data access HW BP over DR[0-3] */
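+	/*
+	 * Besides the global enable, program DRi's R/W field to 01b (break on
+	 * data writes) and its LEN field to 11b (4-byte wide).
+	 */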
+ for (i = 0; i < 4; i++) {
+ CLEAR_DEBUG();
+ debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
+ debug.arch.debugreg[i] = CAST_TO_RIP(guest_value);
+ debug.arch.debugreg[7] = 0x00000400 | (1UL << (2*i+1)) |
+ (0x000d0000UL << (4*i));
+ APPLY_DEBUG();
+ vcpu_run(vm, VCPU_ID);
+ target_dr6 = 0xffff0ff0 | (1UL << i);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG &&
+ run->debug.arch.exception == DB_VECTOR &&
+ run->debug.arch.pc == CAST_TO_RIP(write_data) &&
+ run->debug.arch.dr6 == target_dr6,
+ "DATA_HW_BP (DR%d): exit %d exception %d rip 0x%llx "
+ "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)",
+ i, run->exit_reason, run->debug.arch.exception,
+ run->debug.arch.pc, CAST_TO_RIP(write_data),
+ run->debug.arch.dr6, target_dr6);
+		/* Roll back the 7-byte "mov" */
+ MOVE_RIP(-7);
+ }
+	/* Skip the 7-byte "mov" */
+ MOVE_RIP(7);
+
+ /* Test single step */
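+	/*
+	 * Each step should report DR6.BS (bit 14); the remaining bits set in
+	 * 0xffff4ff0 are DR6 bits that always read as 1.
+	 */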
+ target_rip = CAST_TO_RIP(ss_start);
+ target_dr6 = 0xffff4ff0ULL;
+ vcpu_regs_get(vm, VCPU_ID, &regs);
+ for (i = 0; i < (sizeof(ss_size) / sizeof(ss_size[0])); i++) {
+ target_rip += ss_size[i];
+ CLEAR_DEBUG();
+ debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
+ debug.arch.debugreg[7] = 0x00000400;
+ APPLY_DEBUG();
+ vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG &&
+ run->debug.arch.exception == DB_VECTOR &&
+ run->debug.arch.pc == target_rip &&
+ run->debug.arch.dr6 == target_dr6,
+ "SINGLE_STEP[%d]: exit %d exception %d rip 0x%llx "
+ "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)",
+ i, run->exit_reason, run->debug.arch.exception,
+ run->debug.arch.pc, target_rip, run->debug.arch.dr6,
+ target_dr6);
+ }
+
+ /* Finally test global disable */
+ CLEAR_DEBUG();
+ debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
+ debug.arch.debugreg[7] = 0x400 | DR7_GD;
+ APPLY_DEBUG();
+ vcpu_run(vm, VCPU_ID);
+ target_dr6 = 0xffff0ff0 | DR6_BD;
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG &&
+ run->debug.arch.exception == DB_VECTOR &&
+ run->debug.arch.pc == CAST_TO_RIP(bd_start) &&
+ run->debug.arch.dr6 == target_dr6,
+ "DR7.GD: exit %d exception %d rip 0x%llx "
+ "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)",
+ run->exit_reason, run->debug.arch.exception,
+		    run->debug.arch.pc, CAST_TO_RIP(bd_start), run->debug.arch.dr6,
+ target_dr6);
+
+ /* Disable all debug controls, run to the end */
+ CLEAR_DEBUG();
+ APPLY_DEBUG();
+
+ vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "KVM_EXIT_IO");
+ cmd = get_ucall(vm, VCPU_ID, &uc);
+ TEST_ASSERT(cmd == UCALL_DONE, "UCALL_DONE");
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c
new file mode 100644
index 000000000..757928199
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018, Red Hat, Inc.
+ *
+ * Tests for Enlightened VMCS, including nested guest state.
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+
+#include "vmx.h"
+
+#define VCPU_ID 5
+
+void l2_guest_code(void)
+{
+ GUEST_SYNC(7);
+
+ GUEST_SYNC(8);
+
+ /* Done, exit to L1 and never come back. */
+ vmcall();
+}
+
+void l1_guest_code(struct vmx_pages *vmx_pages)
+{
+#define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+
+ enable_vp_assist(vmx_pages->vp_assist_gpa, vmx_pages->vp_assist);
+
+ GUEST_ASSERT(vmx_pages->vmcs_gpa);
+ GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+ GUEST_SYNC(3);
+ GUEST_ASSERT(load_vmcs(vmx_pages));
+ GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
+
+ GUEST_SYNC(4);
+ GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
+
+ prepare_vmcs(vmx_pages, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ GUEST_SYNC(5);
+ GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
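+	/* VMLAUNCH must fail while the eVMCS carries a bogus revision id. */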
+ current_evmcs->revision_id = -1u;
+ GUEST_ASSERT(vmlaunch());
+ current_evmcs->revision_id = EVMCS_VERSION;
+ GUEST_SYNC(6);
+
+ GUEST_ASSERT(!vmlaunch());
+ GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
+ GUEST_SYNC(9);
+ GUEST_ASSERT(!vmresume());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+ GUEST_SYNC(10);
+}
+
+void guest_code(struct vmx_pages *vmx_pages)
+{
+ GUEST_SYNC(1);
+ GUEST_SYNC(2);
+
+ if (vmx_pages)
+ l1_guest_code(vmx_pages);
+
+ GUEST_DONE();
+
+ /* Try enlightened vmptrld with an incorrect GPA */
+ evmcs_vmptrld(0xdeadbeef, vmx_pages->enlightened_vmcs);
+ GUEST_ASSERT(vmlaunch());
+}
+
+int main(int argc, char *argv[])
+{
+ vm_vaddr_t vmx_pages_gva = 0;
+
+ struct kvm_regs regs1, regs2;
+ struct kvm_vm *vm;
+ struct kvm_run *run;
+ struct kvm_x86_state *state;
+ struct ucall uc;
+ int stage;
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+ if (!nested_vmx_supported() ||
+ !kvm_check_cap(KVM_CAP_NESTED_STATE) ||
+ !kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) {
+ print_skip("Enlightened VMCS is unsupported");
+ exit(KSFT_SKIP);
+ }
+
+ vcpu_enable_evmcs(vm, VCPU_ID);
+
+ run = vcpu_state(vm, VCPU_ID);
+
+ vcpu_regs_get(vm, VCPU_ID, &regs1);
+
+ vcpu_alloc_vmx(vm, &vmx_pages_gva);
+ vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
+
+ for (stage = 1;; stage++) {
+ _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Stage %d: unexpected exit reason: %u (%s),\n",
+ stage, run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_ABORT:
+ TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
+ __FILE__, uc.args[1]);
+ /* NOT REACHED */
+ case UCALL_SYNC:
+ break;
+ case UCALL_DONE:
+ goto part1_done;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+
+ /* UCALL_SYNC is handled here. */
+ TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+ uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx",
+ stage, (ulong)uc.args[1]);
+
+ state = vcpu_save_state(vm, VCPU_ID);
+ memset(&regs1, 0, sizeof(regs1));
+ vcpu_regs_get(vm, VCPU_ID, &regs1);
+
+ kvm_vm_release(vm);
+
+ /* Restore state in a new VM. */
+ kvm_vm_restart(vm, O_RDWR);
+ vm_vcpu_add(vm, VCPU_ID);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ vcpu_enable_evmcs(vm, VCPU_ID);
+ vcpu_load_state(vm, VCPU_ID, state);
+ run = vcpu_state(vm, VCPU_ID);
+ free(state);
+
+ memset(&regs2, 0, sizeof(regs2));
+ vcpu_regs_get(vm, VCPU_ID, &regs2);
+ TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
+ "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
+ (ulong) regs2.rdi, (ulong) regs2.rsi);
+ }
+
+part1_done:
+ _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN,
+ "Unexpected successful VMEnter with invalid eVMCS pointer!");
+
+ kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
new file mode 100644
index 000000000..745b708c2
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test for x86 KVM_CAP_HYPERV_CPUID
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#define VCPU_ID 0
+
+static void guest_code(void)
+{
+}
+
+static bool smt_possible(void)
+{
+ char buf[16];
+ FILE *f;
+ bool res = true;
+
+ f = fopen("/sys/devices/system/cpu/smt/control", "r");
+ if (f) {
+ if (fread(buf, sizeof(*buf), sizeof(buf), f) > 0) {
+ if (!strncmp(buf, "forceoff", 8) ||
+ !strncmp(buf, "notsupported", 12))
+ res = false;
+ }
+ fclose(f);
+ }
+
+ return res;
+}
+
+static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries,
+ bool evmcs_enabled)
+{
+ int i;
+ int nent = 9;
+ u32 test_val;
+
+ if (evmcs_enabled)
+ nent += 1; /* 0x4000000A */
+
+ TEST_ASSERT(hv_cpuid_entries->nent == nent,
+ "KVM_GET_SUPPORTED_HV_CPUID should return %d entries"
+ " with evmcs=%d (returned %d)",
+ nent, evmcs_enabled, hv_cpuid_entries->nent);
+
+ for (i = 0; i < hv_cpuid_entries->nent; i++) {
+ struct kvm_cpuid_entry2 *entry = &hv_cpuid_entries->entries[i];
+
+ TEST_ASSERT((entry->function >= 0x40000000) &&
+ (entry->function <= 0x40000082),
+			    "function %x is out of supported range",
+ entry->function);
+
+ TEST_ASSERT(evmcs_enabled || (entry->function != 0x4000000A),
+ "0x4000000A leaf should not be reported");
+
+ TEST_ASSERT(entry->index == 0,
+ ".index field should be zero");
+
+ TEST_ASSERT(entry->flags == 0,
+ ".flags field should be zero");
+
+ TEST_ASSERT(!entry->padding[0] && !entry->padding[1] &&
+ !entry->padding[2], "padding should be zero");
+
+ switch (entry->function) {
+ case 0x40000000:
+ test_val = 0x40000082;
+
+ TEST_ASSERT(entry->eax == test_val,
+ "Wrong max leaf report in 0x40000000.EAX: %x"
+ " (evmcs=%d)",
+ entry->eax, evmcs_enabled
+ );
+ break;
+ case 0x40000004:
+ test_val = entry->eax & (1UL << 18);
+
+ TEST_ASSERT(!!test_val == !smt_possible(),
+ "NoNonArchitecturalCoreSharing bit"
+ " doesn't reflect SMT setting");
+ break;
+ }
+
+ /*
+ * If needed for debug:
+ * fprintf(stdout,
+ * "CPUID%lx EAX=0x%lx EBX=0x%lx ECX=0x%lx EDX=0x%lx\n",
+ * entry->function, entry->eax, entry->ebx, entry->ecx,
+ * entry->edx);
+ */
+ }
+
+}
+
+void test_hv_cpuid_e2big(struct kvm_vm *vm)
+{
+ static struct kvm_cpuid2 cpuid = {.nent = 0};
+ int ret;
+
+ ret = _vcpu_ioctl(vm, VCPU_ID, KVM_GET_SUPPORTED_HV_CPUID, &cpuid);
+
+ TEST_ASSERT(ret == -1 && errno == E2BIG,
+ "KVM_GET_SUPPORTED_HV_CPUID didn't fail with -E2BIG when"
+ " it should have: %d %d", ret, errno);
+}
+
+
+struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(struct kvm_vm *vm)
+{
+ int nent = 20; /* should be enough */
+ static struct kvm_cpuid2 *cpuid;
+
+ cpuid = malloc(sizeof(*cpuid) + nent * sizeof(struct kvm_cpuid_entry2));
+
+ if (!cpuid) {
+ perror("malloc");
+ abort();
+ }
+
+ cpuid->nent = nent;
+
+ vcpu_ioctl(vm, VCPU_ID, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
+
+ return cpuid;
+}
+
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vm *vm;
+ int rv, stage;
+ struct kvm_cpuid2 *hv_cpuid_entries;
+ bool evmcs_enabled;
+
+ /* Tell stdout not to buffer its content */
+ setbuf(stdout, NULL);
+
+ rv = kvm_check_cap(KVM_CAP_HYPERV_CPUID);
+ if (!rv) {
+ print_skip("KVM_CAP_HYPERV_CPUID not supported");
+ exit(KSFT_SKIP);
+ }
+
+ for (stage = 0; stage < 3; stage++) {
+ evmcs_enabled = false;
+
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+ switch (stage) {
+ case 0:
+ test_hv_cpuid_e2big(vm);
+ continue;
+ case 1:
+ break;
+ case 2:
+ if (!nested_vmx_supported() ||
+ !kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) {
+ print_skip("Enlightened VMCS is unsupported");
+ continue;
+ }
+ vcpu_enable_evmcs(vm, VCPU_ID);
+ evmcs_enabled = true;
+ break;
+ }
+
+ hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm);
+ test_hv_cpuid(hv_cpuid_entries, evmcs_enabled);
+ free(hv_cpuid_entries);
+ kvm_vm_free(vm);
+ }
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c
new file mode 100644
index 000000000..b10a27485
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020, Google LLC.
+ *
+ * Tests for KVM paravirtual feature disablement
+ */
+#include <asm/kvm_para.h>
+#include <linux/kvm_para.h>
+#include <stdint.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+extern unsigned char rdmsr_start;
+extern unsigned char rdmsr_end;
+
+static u64 do_rdmsr(u32 idx)
+{
+ u32 lo, hi;
+
+ asm volatile("rdmsr_start: rdmsr;"
+ "rdmsr_end:"
+		     : "=a"(lo), "=d"(hi)
+ : "c"(idx));
+
+ return (((u64) hi) << 32) | lo;
+}
+
+extern unsigned char wrmsr_start;
+extern unsigned char wrmsr_end;
+
+static void do_wrmsr(u32 idx, u64 val)
+{
+ u32 lo, hi;
+
+ lo = val;
+ hi = val >> 32;
+
+ asm volatile("wrmsr_start: wrmsr;"
+ "wrmsr_end:"
+ : : "a"(lo), "c"(idx), "d"(hi));
+}
+
+static int nr_gp;
+
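+/*
+ * #GP handler: verify the fault came from the test's rdmsr/wrmsr, count it,
+ * and advance RIP past the faulting instruction.
+ */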
+static void guest_gp_handler(struct ex_regs *regs)
+{
+ unsigned char *rip = (unsigned char *)regs->rip;
+ bool r, w;
+
+ r = rip == &rdmsr_start;
+ w = rip == &wrmsr_start;
+ GUEST_ASSERT(r || w);
+
+ nr_gp++;
+
+ if (r)
+ regs->rip = (uint64_t)&rdmsr_end;
+ else
+ regs->rip = (uint64_t)&wrmsr_end;
+}
+
+struct msr_data {
+ uint32_t idx;
+ const char *name;
+};
+
+#define TEST_MSR(msr) { .idx = msr, .name = #msr }
+#define UCALL_PR_MSR 0xdeadbeef
+#define PR_MSR(msr) ucall(UCALL_PR_MSR, 1, msr)
+
+/*
+ * KVM paravirtual msrs to test. Expect a #GP if any of these msrs are read or
+ * written, as the KVM_CPUID_FEATURES leaf is cleared.
+ */
+static struct msr_data msrs_to_test[] = {
+ TEST_MSR(MSR_KVM_SYSTEM_TIME),
+ TEST_MSR(MSR_KVM_SYSTEM_TIME_NEW),
+ TEST_MSR(MSR_KVM_WALL_CLOCK),
+ TEST_MSR(MSR_KVM_WALL_CLOCK_NEW),
+ TEST_MSR(MSR_KVM_ASYNC_PF_EN),
+ TEST_MSR(MSR_KVM_STEAL_TIME),
+ TEST_MSR(MSR_KVM_PV_EOI_EN),
+ TEST_MSR(MSR_KVM_POLL_CONTROL),
+ TEST_MSR(MSR_KVM_ASYNC_PF_INT),
+ TEST_MSR(MSR_KVM_ASYNC_PF_ACK),
+};
+
+static void test_msr(struct msr_data *msr)
+{
+ PR_MSR(msr);
+ do_rdmsr(msr->idx);
+ GUEST_ASSERT(READ_ONCE(nr_gp) == 1);
+
+ nr_gp = 0;
+ do_wrmsr(msr->idx, 0);
+ GUEST_ASSERT(READ_ONCE(nr_gp) == 1);
+ nr_gp = 0;
+}
+
+struct hcall_data {
+ uint64_t nr;
+ const char *name;
+};
+
+#define TEST_HCALL(hc) { .nr = hc, .name = #hc }
+#define UCALL_PR_HCALL 0xdeadc0de
+#define PR_HCALL(hc) ucall(UCALL_PR_HCALL, 1, hc)
+
+/*
+ * KVM hypercalls to test. Expect -KVM_ENOSYS when called, as the corresponding
+ * features have been cleared in KVM_CPUID_FEATURES.
+ */
+static struct hcall_data hcalls_to_test[] = {
+ TEST_HCALL(KVM_HC_KICK_CPU),
+ TEST_HCALL(KVM_HC_SEND_IPI),
+ TEST_HCALL(KVM_HC_SCHED_YIELD),
+};
+
+static void test_hcall(struct hcall_data *hc)
+{
+ uint64_t r;
+
+ PR_HCALL(hc);
+ r = kvm_hypercall(hc->nr, 0, 0, 0, 0);
+ GUEST_ASSERT(r == -KVM_ENOSYS);
+}
+
+static void guest_main(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(msrs_to_test); i++) {
+ test_msr(&msrs_to_test[i]);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(hcalls_to_test); i++) {
+ test_hcall(&hcalls_to_test[i]);
+ }
+
+ GUEST_DONE();
+}
+
+static void clear_kvm_cpuid_features(struct kvm_cpuid2 *cpuid)
+{
+ struct kvm_cpuid_entry2 ent = {0};
+
+ ent.function = KVM_CPUID_FEATURES;
+ TEST_ASSERT(set_cpuid(cpuid, &ent),
+ "failed to clear KVM_CPUID_FEATURES leaf");
+}
+
+static void pr_msr(struct ucall *uc)
+{
+ struct msr_data *msr = (struct msr_data *)uc->args[0];
+
+ pr_info("testing msr: %s (%#x)\n", msr->name, msr->idx);
+}
+
+static void pr_hcall(struct ucall *uc)
+{
+ struct hcall_data *hc = (struct hcall_data *)uc->args[0];
+
+ pr_info("testing hcall: %s (%lu)\n", hc->name, hc->nr);
+}
+
+static void handle_abort(struct ucall *uc)
+{
+ TEST_FAIL("%s at %s:%ld", (const char *)uc->args[0],
+ __FILE__, uc->args[1]);
+}
+
+#define VCPU_ID 0
+
+static void enter_guest(struct kvm_vm *vm)
+{
+ struct kvm_run *run;
+ struct ucall uc;
+ int r;
+
+ run = vcpu_state(vm, VCPU_ID);
+
+ while (true) {
+ r = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(!r, "vcpu_run failed: %d\n", r);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "unexpected exit reason: %u (%s)",
+ run->exit_reason, exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_PR_MSR:
+ pr_msr(&uc);
+ break;
+ case UCALL_PR_HCALL:
+ pr_hcall(&uc);
+ break;
+ case UCALL_ABORT:
+ handle_abort(&uc);
+ return;
+ case UCALL_DONE:
+ return;
+ }
+ }
+}
+
+int main(void)
+{
+ struct kvm_enable_cap cap = {0};
+ struct kvm_cpuid2 *best;
+ struct kvm_vm *vm;
+
+ if (!kvm_check_cap(KVM_CAP_ENFORCE_PV_FEATURE_CPUID)) {
+ pr_info("will skip kvm paravirt restriction tests.\n");
+ return 0;
+ }
+
+ vm = vm_create_default(VCPU_ID, 0, guest_main);
+
+ cap.cap = KVM_CAP_ENFORCE_PV_FEATURE_CPUID;
+ cap.args[0] = 1;
+ vcpu_enable_cap(vm, VCPU_ID, &cap);
+
+ best = kvm_get_supported_cpuid();
+ clear_kvm_cpuid_features(best);
+ vcpu_set_cpuid(vm, VCPU_ID, best);
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vm, VCPU_ID);
+ vm_handle_exception(vm, GP_VECTOR, guest_gp_handler);
+
+ enter_guest(vm);
+ kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c
new file mode 100644
index 000000000..9f55ccd16
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c
@@ -0,0 +1,127 @@
+/*
+ * mmio_warning_test
+ *
+ * Copyright (C) 2019, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * Test that we don't get a kernel warning when we call KVM_RUN after a
+ * triple fault occurs. To get the triple fault to occur we call KVM_RUN
+ * on a VCPU that hasn't been properly setup.
+ *
+ */
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <kvm_util.h>
+#include <linux/kvm.h>
+#include <processor.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <test_util.h>
+#include <unistd.h>
+
+#define NTHREAD 4
+#define NPROCESS 5
+
+struct thread_context {
+ int kvmcpu;
+ struct kvm_run *run;
+};
+
+void *thr(void *arg)
+{
+ struct thread_context *tc = (struct thread_context *)arg;
+ int res;
+ int kvmcpu = tc->kvmcpu;
+ struct kvm_run *run = tc->run;
+
+ res = ioctl(kvmcpu, KVM_RUN, 0);
+ pr_info("ret1=%d exit_reason=%d suberror=%d\n",
+ res, run->exit_reason, run->internal.suberror);
+
+ return 0;
+}
+
+void test(void)
+{
+ int i, kvm, kvmvm, kvmcpu;
+ pthread_t th[NTHREAD];
+ struct kvm_run *run;
+ struct thread_context tc;
+
+ kvm = open("/dev/kvm", O_RDWR);
+ TEST_ASSERT(kvm != -1, "failed to open /dev/kvm");
+ kvmvm = ioctl(kvm, KVM_CREATE_VM, 0);
+ TEST_ASSERT(kvmvm != -1, "KVM_CREATE_VM failed");
+ kvmcpu = ioctl(kvmvm, KVM_CREATE_VCPU, 0);
+ TEST_ASSERT(kvmcpu != -1, "KVM_CREATE_VCPU failed");
+ run = (struct kvm_run *)mmap(0, 4096, PROT_READ|PROT_WRITE, MAP_SHARED,
+ kvmcpu, 0);
+ tc.kvmcpu = kvmcpu;
+ tc.run = run;
+ srand(getpid());
+ for (i = 0; i < NTHREAD; i++) {
+ pthread_create(&th[i], NULL, thr, (void *)(uintptr_t)&tc);
+ usleep(rand() % 10000);
+ }
+ for (i = 0; i < NTHREAD; i++)
+ pthread_join(th[i], NULL);
+}
+
+int get_warnings_count(void)
+{
+ int warnings;
+ FILE *f;
+
+ f = popen("dmesg | grep \"WARNING:\" | wc -l", "r");
+ if (fscanf(f, "%d", &warnings) < 1)
+ warnings = 0;
+ pclose(f);
+
+ return warnings;
+}
+
+int main(void)
+{
+ int warnings_before, warnings_after;
+
+ if (!is_intel_cpu()) {
+ print_skip("Must be run on an Intel CPU");
+ exit(KSFT_SKIP);
+ }
+
+ if (vm_is_unrestricted_guest(NULL)) {
+ print_skip("Unrestricted guest must be disabled");
+ exit(KSFT_SKIP);
+ }
+
+ warnings_before = get_warnings_count();
+
+ for (int i = 0; i < NPROCESS; ++i) {
+ int status;
+ int pid = fork();
+
+ if (pid < 0)
+ exit(1);
+ if (pid == 0) {
+ test();
+ exit(0);
+ }
+ while (waitpid(pid, &status, __WALL) != pid)
+ ;
+ }
+
+ warnings_after = get_warnings_count();
+ TEST_ASSERT(warnings_before == warnings_after,
+ "Warnings found in kernel. Run 'dmesg' to inspect them.");
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c
new file mode 100644
index 000000000..1e89688cb
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/platform_info_test.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test for x86 KVM_CAP_MSR_PLATFORM_INFO
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * Verifies expected behavior of controlling guest access to
+ * MSR_PLATFORM_INFO.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 0
+#define MSR_PLATFORM_INFO_MAX_TURBO_RATIO 0xff00
+
+static void guest_code(void)
+{
+ uint64_t msr_platform_info;
+
+ for (;;) {
+ msr_platform_info = rdmsr(MSR_PLATFORM_INFO);
+ GUEST_SYNC(msr_platform_info);
+ asm volatile ("inc %r11");
+ }
+}
+
+static void set_msr_platform_info_enabled(struct kvm_vm *vm, bool enable)
+{
+ struct kvm_enable_cap cap = {};
+
+ cap.cap = KVM_CAP_MSR_PLATFORM_INFO;
+ cap.flags = 0;
+ cap.args[0] = (int)enable;
+ vm_enable_cap(vm, &cap);
+}
+
+static void test_msr_platform_info_enabled(struct kvm_vm *vm)
+{
+ struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+ struct ucall uc;
+
+ set_msr_platform_info_enabled(vm, true);
+ vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Exit_reason other than KVM_EXIT_IO: %u (%s),\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+ get_ucall(vm, VCPU_ID, &uc);
+ TEST_ASSERT(uc.cmd == UCALL_SYNC,
+ "Received ucall other than UCALL_SYNC: %lu\n", uc.cmd);
+ TEST_ASSERT((uc.args[1] & MSR_PLATFORM_INFO_MAX_TURBO_RATIO) ==
+ MSR_PLATFORM_INFO_MAX_TURBO_RATIO,
+ "Expected MSR_PLATFORM_INFO to have max turbo ratio mask: %i.",
+ MSR_PLATFORM_INFO_MAX_TURBO_RATIO);
+}
+
+static void test_msr_platform_info_disabled(struct kvm_vm *vm)
+{
+ struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+
+ set_msr_platform_info_enabled(vm, false);
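+ /*
+ * With the capability disabled, the guest's RDMSR of MSR_PLATFORM_INFO
+ * is expected to fault; since the guest installs no exception handler,
+ * this surfaces to the host as KVM_EXIT_SHUTDOWN.
+ */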
+ vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN,
+ "Exit_reason other than KVM_EXIT_SHUTDOWN: %u (%s)\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vm *vm;
+ int rv;
+ uint64_t msr_platform_info;
+
+ /* Tell stdout not to buffer its content */
+ setbuf(stdout, NULL);
+
+ rv = kvm_check_cap(KVM_CAP_MSR_PLATFORM_INFO);
+ if (!rv) {
+ print_skip("KVM_CAP_MSR_PLATFORM_INFO not supported");
+ exit(KSFT_SKIP);
+ }
+
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+
+ msr_platform_info = vcpu_get_msr(vm, VCPU_ID, MSR_PLATFORM_INFO);
+ vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO,
+ msr_platform_info | MSR_PLATFORM_INFO_MAX_TURBO_RATIO);
+ test_msr_platform_info_enabled(vm);
+ test_msr_platform_info_disabled(vm);
+ vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO, msr_platform_info);
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c
new file mode 100644
index 000000000..9f7656184
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM_SET_SREGS tests
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * This is a regression test for the bug fixed by the following commit:
+ * d3802286fa0f ("kvm: x86: Disallow illegal IA32_APIC_BASE MSR values")
+ *
+ * That bug allowed a user-mode program that called the KVM_SET_SREGS
+ * ioctl to put a VCPU's local APIC into an invalid state.
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 5
+
+int main(int argc, char *argv[])
+{
+ struct kvm_sregs sregs;
+ struct kvm_vm *vm;
+ int rc;
+
+ /* Tell stdout not to buffer its content */
+ setbuf(stdout, NULL);
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, NULL);
+
+ vcpu_sregs_get(vm, VCPU_ID, &sregs);
+ sregs.apic_base = 1 << 10;
+ rc = _vcpu_sregs_set(vm, VCPU_ID, &sregs);
+ TEST_ASSERT(rc, "Set IA32_APIC_BASE to %llx (invalid)",
+ sregs.apic_base);
+ sregs.apic_base = 1 << 11;
+ rc = _vcpu_sregs_set(vm, VCPU_ID, &sregs);
+ TEST_ASSERT(!rc, "Couldn't set IA32_APIC_BASE to %llx (valid)",
+ sregs.apic_base);
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c
new file mode 100644
index 000000000..ae39a2206
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/smm_test.c
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018, Red Hat, Inc.
+ *
+ * Tests for SMM.
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+
+#include "vmx.h"
+#include "svm_util.h"
+
+#define VCPU_ID 1
+
+#define PAGE_SIZE 4096
+
+#define SMRAM_SIZE 65536
+#define SMRAM_MEMSLOT ((1 << 16) | 1)
+#define SMRAM_PAGES (SMRAM_SIZE / PAGE_SIZE)
+#define SMRAM_GPA 0x1000000
+#define SMRAM_STAGE 0xfe
+
+#define STR(x) #x
+#define XSTR(s) STR(s)
+
+#define SYNC_PORT 0xe
+#define DONE 0xff
+
+/*
+ * This is compiled as normal 64-bit code; the SMI handler, however, is
+ * executed in real-address mode. To keep things simple we limit ourselves
+ * to a mode-independent subset of asm here.
+ * The SMI handler always reports back the fixed stage SMRAM_STAGE.
+ */
+uint8_t smi_handler[] = {
+ 0xb0, SMRAM_STAGE, /* mov $SMRAM_STAGE, %al */
+ 0xe4, SYNC_PORT, /* in $SYNC_PORT, %al */
+ 0x0f, 0xaa, /* rsm */
+};
+
+static inline void sync_with_host(uint64_t phase)
+{
+ asm volatile("in $" XSTR(SYNC_PORT)", %%al \n"
+ : "+a" (phase));
+}
+
+void self_smi(void)
+{
+ wrmsr(APIC_BASE_MSR + (APIC_ICR >> 4),
+ APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_SMI);
+}
+
+void guest_code(void *arg)
+{
+ uint64_t apicbase = rdmsr(MSR_IA32_APICBASE);
+
+ sync_with_host(1);
+
+ wrmsr(MSR_IA32_APICBASE, apicbase | X2APIC_ENABLE);
+
+ sync_with_host(2);
+
+ self_smi();
+
+ sync_with_host(4);
+
+ if (arg) {
+ if (cpu_has_svm())
+ generic_svm_setup(arg, NULL, NULL);
+ else
+ GUEST_ASSERT(prepare_for_vmx_operation(arg));
+
+ sync_with_host(5);
+
+ self_smi();
+
+ sync_with_host(7);
+ }
+
+ sync_with_host(DONE);
+}
+
+int main(int argc, char *argv[])
+{
+ vm_vaddr_t nested_gva = 0;
+
+ struct kvm_regs regs;
+ struct kvm_vm *vm;
+ struct kvm_run *run;
+ struct kvm_x86_state *state;
+ int stage, stage_reported;
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+ run = vcpu_state(vm, VCPU_ID);
+
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, SMRAM_GPA,
+ SMRAM_MEMSLOT, SMRAM_PAGES, 0);
+ TEST_ASSERT(vm_phy_pages_alloc(vm, SMRAM_PAGES, SMRAM_GPA, SMRAM_MEMSLOT)
+ == SMRAM_GPA, "could not allocate guest physical addresses?");
+
+ memset(addr_gpa2hva(vm, SMRAM_GPA), 0x0, SMRAM_SIZE);
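+ /* The SMI handler entry point is SMBASE + 0x8000; copy the handler to that offset within SMRAM. */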
+ memcpy(addr_gpa2hva(vm, SMRAM_GPA) + 0x8000, smi_handler,
+ sizeof(smi_handler));
+
+ vcpu_set_msr(vm, VCPU_ID, MSR_IA32_SMBASE, SMRAM_GPA);
+
+ if (kvm_check_cap(KVM_CAP_NESTED_STATE)) {
+ if (nested_svm_supported())
+ vcpu_alloc_svm(vm, &nested_gva);
+ else if (nested_vmx_supported())
+ vcpu_alloc_vmx(vm, &nested_gva);
+ }
+
+ if (!nested_gva)
+ pr_info("will skip SMM test with VMX enabled\n");
+
+ vcpu_args_set(vm, VCPU_ID, 1, nested_gva);
+
+ for (stage = 1;; stage++) {
+ _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Stage %d: unexpected exit reason: %u (%s),\n",
+ stage, run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ memset(&regs, 0, sizeof(regs));
+ vcpu_regs_get(vm, VCPU_ID, &regs);
+
+ stage_reported = regs.rax & 0xff;
+
+ if (stage_reported == DONE)
+ goto done;
+
+ TEST_ASSERT(stage_reported == stage ||
+ stage_reported == SMRAM_STAGE,
+ "Unexpected stage: #%x, got %x",
+ stage, stage_reported);
+
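+ /*
+ * Save the vCPU state, recreate the VM and vCPU, and restore the
+ * state, emulating a migration at every stage (including from
+ * within SMM).
+ */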
+ state = vcpu_save_state(vm, VCPU_ID);
+ kvm_vm_release(vm);
+ kvm_vm_restart(vm, O_RDWR);
+ vm_vcpu_add(vm, VCPU_ID);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ vcpu_load_state(vm, VCPU_ID, state);
+ run = vcpu_state(vm, VCPU_ID);
+ free(state);
+ }
+
+done:
+ kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c
new file mode 100644
index 000000000..f6c8b9042
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/state_test.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM_GET/SET_* tests
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ *
+ * Tests for vCPU state save/restore, including nested guest state.
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+#include "svm_util.h"
+
+#define VCPU_ID 5
+#define L2_GUEST_STACK_SIZE 256
+
+void svm_l2_guest_code(void)
+{
+ GUEST_SYNC(4);
+ /* Exit to L1 */
+ vmcall();
+ GUEST_SYNC(6);
+ /* Done, exit to L1 and never come back. */
+ vmcall();
+}
+
+static void svm_l1_guest_code(struct svm_test_data *svm)
+{
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+ struct vmcb *vmcb = svm->vmcb;
+
+ GUEST_ASSERT(svm->vmcb_gpa);
+ /* Prepare for L2 execution. */
+ generic_svm_setup(svm, svm_l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ GUEST_SYNC(3);
+ run_guest(vmcb, svm->vmcb_gpa);
+ GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL);
+ GUEST_SYNC(5);
+ vmcb->save.rip += 3;
+ run_guest(vmcb, svm->vmcb_gpa);
+ GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL);
+ GUEST_SYNC(7);
+}
+
+void vmx_l2_guest_code(void)
+{
+ GUEST_SYNC(6);
+
+ /* Exit to L1 */
+ vmcall();
+
+ /* L1 has now set up a shadow VMCS for us. */
+ GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee);
+ GUEST_SYNC(10);
+ GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee);
+ GUEST_ASSERT(!vmwrite(GUEST_RIP, 0xc0fffee));
+ GUEST_SYNC(11);
+ GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0fffee);
+ GUEST_ASSERT(!vmwrite(GUEST_RIP, 0xc0ffffee));
+ GUEST_SYNC(12);
+
+ /* Done, exit to L1 and never come back. */
+ vmcall();
+}
+
+static void vmx_l1_guest_code(struct vmx_pages *vmx_pages)
+{
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+
+ GUEST_ASSERT(vmx_pages->vmcs_gpa);
+ GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+ GUEST_SYNC(3);
+ GUEST_ASSERT(load_vmcs(vmx_pages));
+ GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
+
+ GUEST_SYNC(4);
+ GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
+
+ prepare_vmcs(vmx_pages, vmx_l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ GUEST_SYNC(5);
+ GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
+ GUEST_ASSERT(!vmlaunch());
+ GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+ /* Check that the launched state is preserved. */
+ GUEST_ASSERT(vmlaunch());
+
+ GUEST_ASSERT(!vmresume());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+ GUEST_SYNC(7);
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+ GUEST_ASSERT(!vmresume());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+ vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + 3);
+
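+ /*
+ * Switch to shadow-VMCS handling: while the shadow VMCS is loaded,
+ * VMREAD/VMWRITE operate on it, and VMLAUNCH/VMRESUME on it are
+ * expected to fail.
+ */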
+ vmwrite(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
+ vmwrite(VMCS_LINK_POINTER, vmx_pages->shadow_vmcs_gpa);
+
+ GUEST_ASSERT(!vmptrld(vmx_pages->shadow_vmcs_gpa));
+ GUEST_ASSERT(vmlaunch());
+ GUEST_SYNC(8);
+ GUEST_ASSERT(vmlaunch());
+ GUEST_ASSERT(vmresume());
+
+ vmwrite(GUEST_RIP, 0xc0ffee);
+ GUEST_SYNC(9);
+ GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee);
+
+ GUEST_ASSERT(!vmptrld(vmx_pages->vmcs_gpa));
+ GUEST_ASSERT(!vmresume());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+ GUEST_ASSERT(!vmptrld(vmx_pages->shadow_vmcs_gpa));
+ GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffffee);
+ GUEST_ASSERT(vmlaunch());
+ GUEST_ASSERT(vmresume());
+ GUEST_SYNC(13);
+ GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffffee);
+ GUEST_ASSERT(vmlaunch());
+ GUEST_ASSERT(vmresume());
+}
+
+static void __attribute__((__flatten__)) guest_code(void *arg)
+{
+ GUEST_SYNC(1);
+ GUEST_SYNC(2);
+
+ if (arg) {
+ if (cpu_has_svm())
+ svm_l1_guest_code(arg);
+ else
+ vmx_l1_guest_code(arg);
+ }
+
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ vm_vaddr_t nested_gva = 0;
+
+ struct kvm_regs regs1, regs2;
+ struct kvm_vm *vm;
+ struct kvm_run *run;
+ struct kvm_x86_state *state;
+ struct ucall uc;
+ int stage;
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ run = vcpu_state(vm, VCPU_ID);
+
+ vcpu_regs_get(vm, VCPU_ID, &regs1);
+
+ if (kvm_check_cap(KVM_CAP_NESTED_STATE)) {
+ if (nested_svm_supported())
+ vcpu_alloc_svm(vm, &nested_gva);
+ else if (nested_vmx_supported())
+ vcpu_alloc_vmx(vm, &nested_gva);
+ }
+
+ if (!nested_gva)
+ pr_info("will skip nested state checks\n");
+
+ vcpu_args_set(vm, VCPU_ID, 1, nested_gva);
+
+ for (stage = 1;; stage++) {
+ _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Stage %d: unexpected exit reason: %u (%s),\n",
+ stage, run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_ABORT:
+ TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
+ __FILE__, uc.args[1]);
+ /* NOT REACHED */
+ case UCALL_SYNC:
+ break;
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+
+ /* UCALL_SYNC is handled here. */
+ TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+ uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx",
+ stage, (ulong)uc.args[1]);
+
+ state = vcpu_save_state(vm, VCPU_ID);
+ memset(&regs1, 0, sizeof(regs1));
+ vcpu_regs_get(vm, VCPU_ID, &regs1);
+
+ kvm_vm_release(vm);
+
+ /* Restore state in a new VM. */
+ kvm_vm_restart(vm, O_RDWR);
+ vm_vcpu_add(vm, VCPU_ID);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ vcpu_load_state(vm, VCPU_ID, state);
+ run = vcpu_state(vm, VCPU_ID);
+ free(state);
+
+ memset(&regs2, 0, sizeof(regs2));
+ vcpu_regs_get(vm, VCPU_ID, &regs2);
+ TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
+ "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
+ (ulong) regs2.rdi, (ulong) regs2.rsi);
+ }
+
+done:
+ kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c
new file mode 100644
index 000000000..0e1adb4e3
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * svm_vmcall_test
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ *
+ * Nested SVM testing: VMCALL
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "svm_util.h"
+
+#define VCPU_ID 5
+
+static struct kvm_vm *vm;
+
+static void l2_guest_code(struct svm_test_data *svm)
+{
+ __asm__ __volatile__("vmcall");
+}
+
+static void l1_guest_code(struct svm_test_data *svm)
+{
+ #define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+ struct vmcb *vmcb = svm->vmcb;
+
+ /* Prepare for L2 execution. */
+ generic_svm_setup(svm, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ run_guest(vmcb, svm->vmcb_gpa);
+
+ GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL);
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ vm_vaddr_t svm_gva;
+
+ nested_svm_check_supported();
+
+ vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+ vcpu_alloc_svm(vm, &svm_gva);
+ vcpu_args_set(vm, VCPU_ID, 1, svm_gva);
+
+ for (;;) {
+ volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+ struct ucall uc;
+
+ vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_ABORT:
+ TEST_FAIL("%s", (const char *)uc.args[0]);
+ /* NOT REACHED */
+ case UCALL_SYNC:
+ break;
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd);
+ }
+ }
+done:
+ kvm_vm_free(vm);
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c
new file mode 100644
index 000000000..d672f0a47
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c
@@ -0,0 +1,243 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Test for x86 KVM_CAP_SYNC_REGS
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * Verifies expected behavior of x86 KVM_CAP_SYNC_REGS functionality,
+ * including requesting an invalid register set and updates to/from
+ * values in kvm_run.s.regs as kvm_valid_regs and kvm_dirty_regs are toggled.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 5
+
+#define UCALL_PIO_PORT ((uint16_t)0x1000)
+
+/*
+ * The ucall is embedded here to protect against the compiler reshuffling
+ * registers before calling a function. In this test we only need a
+ * KVM_EXIT_IO vmexit while preserving RBX; no additional information is needed.
+ */
+void guest_code(void)
+{
+ asm volatile("1: in %[port], %%al\n"
+ "add $0x1, %%rbx\n"
+ "jmp 1b"
+ : : [port] "d" (UCALL_PIO_PORT) : "rax", "rbx");
+}
+
+static void compare_regs(struct kvm_regs *left, struct kvm_regs *right)
+{
+#define REG_COMPARE(reg) \
+ TEST_ASSERT(left->reg == right->reg, \
+ "Register " #reg \
+ " values did not match: 0x%llx, 0x%llx\n", \
+ left->reg, right->reg)
+ REG_COMPARE(rax);
+ REG_COMPARE(rbx);
+ REG_COMPARE(rcx);
+ REG_COMPARE(rdx);
+ REG_COMPARE(rsi);
+ REG_COMPARE(rdi);
+ REG_COMPARE(rsp);
+ REG_COMPARE(rbp);
+ REG_COMPARE(r8);
+ REG_COMPARE(r9);
+ REG_COMPARE(r10);
+ REG_COMPARE(r11);
+ REG_COMPARE(r12);
+ REG_COMPARE(r13);
+ REG_COMPARE(r14);
+ REG_COMPARE(r15);
+ REG_COMPARE(rip);
+ REG_COMPARE(rflags);
+#undef REG_COMPARE
+}
+
+static void compare_sregs(struct kvm_sregs *left, struct kvm_sregs *right)
+{
+}
+
+static void compare_vcpu_events(struct kvm_vcpu_events *left,
+ struct kvm_vcpu_events *right)
+{
+}
+
+#define TEST_SYNC_FIELDS (KVM_SYNC_X86_REGS|KVM_SYNC_X86_SREGS|KVM_SYNC_X86_EVENTS)
+#define INVALID_SYNC_FIELD 0x80000000
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vm *vm;
+ struct kvm_run *run;
+ struct kvm_regs regs;
+ struct kvm_sregs sregs;
+ struct kvm_vcpu_events events;
+ int rv, cap;
+
+ /* Tell stdout not to buffer its content */
+ setbuf(stdout, NULL);
+
+ cap = kvm_check_cap(KVM_CAP_SYNC_REGS);
+ if ((cap & TEST_SYNC_FIELDS) != TEST_SYNC_FIELDS) {
+ print_skip("KVM_CAP_SYNC_REGS not supported");
+ exit(KSFT_SKIP);
+ }
+ if ((cap & INVALID_SYNC_FIELD) != 0) {
+ print_skip("The \"invalid\" field is not invalid");
+ exit(KSFT_SKIP);
+ }
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+
+ run = vcpu_state(vm, VCPU_ID);
+
+ /* Request reading invalid register set from VCPU. */
+ run->kvm_valid_regs = INVALID_SYNC_FIELD;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv < 0 && errno == EINVAL,
+ "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n",
+ rv);
+ vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0;
+
+ run->kvm_valid_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv < 0 && errno == EINVAL,
+ "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n",
+ rv);
+ vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0;
+
+ /* Request setting invalid register set into VCPU. */
+ run->kvm_dirty_regs = INVALID_SYNC_FIELD;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv < 0 && errno == EINVAL,
+ "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n",
+ rv);
+ vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0;
+
+ run->kvm_dirty_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv < 0 && errno == EINVAL,
+ "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n",
+ rv);
+ vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0;
+
+ /* Request and verify all valid register sets. */
+ /* TODO: BUILD TIME CHECK: TEST_ASSERT(KVM_SYNC_X86_NUM_FIELDS != 3); */
+ run->kvm_valid_regs = TEST_SYNC_FIELDS;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Unexpected exit reason: %u (%s),\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ vcpu_regs_get(vm, VCPU_ID, &regs);
+ compare_regs(&regs, &run->s.regs.regs);
+
+ vcpu_sregs_get(vm, VCPU_ID, &sregs);
+ compare_sregs(&sregs, &run->s.regs.sregs);
+
+ vcpu_events_get(vm, VCPU_ID, &events);
+ compare_vcpu_events(&events, &run->s.regs.events);
+
+ /* Set and verify various register values. */
+ run->s.regs.regs.rbx = 0xBAD1DEA;
+ run->s.regs.sregs.apic_base = 1 << 11;
+ /* TODO run->s.regs.events.XYZ = ABC; */
+
+ run->kvm_valid_regs = TEST_SYNC_FIELDS;
+ run->kvm_dirty_regs = KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Unexpected exit reason: %u (%s),\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+ TEST_ASSERT(run->s.regs.regs.rbx == 0xBAD1DEA + 1,
+ "rbx sync regs value incorrect 0x%llx.",
+ run->s.regs.regs.rbx);
+ TEST_ASSERT(run->s.regs.sregs.apic_base == 1 << 11,
+ "apic_base sync regs value incorrect 0x%llx.",
+ run->s.regs.sregs.apic_base);
+
+ vcpu_regs_get(vm, VCPU_ID, &regs);
+ compare_regs(&regs, &run->s.regs.regs);
+
+ vcpu_sregs_get(vm, VCPU_ID, &sregs);
+ compare_sregs(&sregs, &run->s.regs.sregs);
+
+ vcpu_events_get(vm, VCPU_ID, &events);
+ compare_vcpu_events(&events, &run->s.regs.events);
+
+ /* Clear kvm_dirty_regs bits, verify new s.regs values are
+ * overwritten with existing guest values.
+ */
+ run->kvm_valid_regs = TEST_SYNC_FIELDS;
+ run->kvm_dirty_regs = 0;
+ run->s.regs.regs.rbx = 0xDEADBEEF;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Unexpected exit reason: %u (%s),\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+ TEST_ASSERT(run->s.regs.regs.rbx != 0xDEADBEEF,
+ "rbx sync regs value incorrect 0x%llx.",
+ run->s.regs.regs.rbx);
+
+ /* Clear kvm_valid_regs bits and kvm_dirty_bits.
+ * Verify s.regs values are not overwritten with existing guest values
+ * and that guest values are not overwritten with kvm_sync_regs values.
+ */
+ run->kvm_valid_regs = 0;
+ run->kvm_dirty_regs = 0;
+ run->s.regs.regs.rbx = 0xAAAA;
+ regs.rbx = 0xBAC0;
+ vcpu_regs_set(vm, VCPU_ID, &regs);
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Unexpected exit reason: %u (%s),\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+ TEST_ASSERT(run->s.regs.regs.rbx == 0xAAAA,
+ "rbx sync regs value incorrect 0x%llx.",
+ run->s.regs.regs.rbx);
+ vcpu_regs_get(vm, VCPU_ID, &regs);
+ TEST_ASSERT(regs.rbx == 0xBAC0 + 1,
+ "rbx guest value incorrect 0x%llx.",
+ regs.rbx);
+
+ /* Clear kvm_valid_regs bits. Verify s.regs values are not overwritten
+ * with existing guest values but that guest values are overwritten
+ * with kvm_sync_regs values.
+ */
+ run->kvm_valid_regs = 0;
+ run->kvm_dirty_regs = TEST_SYNC_FIELDS;
+ run->s.regs.regs.rbx = 0xBBBB;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Unexpected exit reason: %u (%s),\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+ TEST_ASSERT(run->s.regs.regs.rbx == 0xBBBB,
+ "rbx sync regs value incorrect 0x%llx.",
+ run->s.regs.regs.rbx);
+ vcpu_regs_get(vm, VCPU_ID, &regs);
+ TEST_ASSERT(regs.rbx == 0xBBBB + 1,
+ "rbx guest value incorrect 0x%llx.",
+ regs.rbx);
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c
new file mode 100644
index 000000000..f8e761149
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Tests for MSR_IA32_TSC and MSR_IA32_TSC_ADJUST.
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+#include <stdio.h>
+#include <string.h>
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 0
+
+#define UNITY (1ull << 30)
+#define HOST_ADJUST (UNITY * 64)
+#define GUEST_STEP (UNITY * 4)
+#define ROUND(x) ((x + UNITY / 2) & -UNITY)
+#define rounded_rdmsr(x) ROUND(rdmsr(x))
+#define rounded_host_rdmsr(x) ROUND(vcpu_get_msr(vm, 0, x))
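+/* ROUND() snaps a value to the nearest multiple of UNITY so small TSC drift between reads does not cause spurious mismatches. */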
+
+#define GUEST_ASSERT_EQ(a, b) do { \
+ __typeof(a) _a = (a); \
+ __typeof(b) _b = (b); \
+ if (_a != _b) \
+ ucall(UCALL_ABORT, 4, \
+ "Failed guest assert: " \
+ #a " == " #b, __LINE__, _a, _b); \
+ } while(0)
+
+static void guest_code(void)
+{
+ u64 val = 0;
+
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Guest: writes to MSR_IA32_TSC affect both MSRs. */
+ val = 1ull * GUEST_STEP;
+ wrmsr(MSR_IA32_TSC, val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Guest: writes to MSR_IA32_TSC_ADJUST affect both MSRs. */
+ GUEST_SYNC(2);
+ val = 2ull * GUEST_STEP;
+ wrmsr(MSR_IA32_TSC_ADJUST, val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Host: setting the TSC offset. */
+ GUEST_SYNC(3);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Guest: writes to MSR_IA32_TSC_ADJUST do not destroy the
+ * host-side offset and affect both MSRs.
+ */
+ GUEST_SYNC(4);
+ val = 3ull * GUEST_STEP;
+ wrmsr(MSR_IA32_TSC_ADJUST, val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Guest: writes to MSR_IA32_TSC affect both MSRs, so the host-side
+ * offset is now visible in MSR_IA32_TSC_ADJUST.
+ */
+ GUEST_SYNC(5);
+ val = 4ull * GUEST_STEP;
+ wrmsr(MSR_IA32_TSC, val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val - HOST_ADJUST);
+
+ GUEST_DONE();
+}
+
+static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage)
+{
+ struct ucall uc;
+
+ vcpu_args_set(vm, vcpuid, 1, vcpuid);
+
+ vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL);
+
+ switch (get_ucall(vm, vcpuid, &uc)) {
+ case UCALL_SYNC:
+ TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+ uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx",
+ stage + 1, (ulong)uc.args[1]);
+ return;
+ case UCALL_DONE:
+ return;
+ case UCALL_ABORT:
+ TEST_ASSERT(false, "%s at %s:%ld\n" \
+ "\tvalues: %#lx, %#lx", (const char *)uc.args[0],
+ __FILE__, uc.args[1], uc.args[2], uc.args[3]);
+ default:
+ TEST_ASSERT(false, "Unexpected exit: %s",
+ exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason));
+ }
+}
+
+int main(void)
+{
+ struct kvm_vm *vm;
+ uint64_t val;
+
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+ val = 0;
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Guest: writes to MSR_IA32_TSC affect both MSRs. */
+ run_vcpu(vm, VCPU_ID, 1);
+ val = 1ull * GUEST_STEP;
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Guest: writes to MSR_IA32_TSC_ADJUST affect both MSRs. */
+ run_vcpu(vm, VCPU_ID, 2);
+ val = 2ull * GUEST_STEP;
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Host: writes to MSR_IA32_TSC set the host-side offset
+ * and therefore do not change MSR_IA32_TSC_ADJUST.
+ */
+ vcpu_set_msr(vm, 0, MSR_IA32_TSC, HOST_ADJUST + val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+ run_vcpu(vm, VCPU_ID, 3);
+
+ /* Host: writes to MSR_IA32_TSC_ADJUST do not modify the TSC. */
+ vcpu_set_msr(vm, 0, MSR_IA32_TSC_ADJUST, UNITY * 123456);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ ASSERT_EQ(vcpu_get_msr(vm, 0, MSR_IA32_TSC_ADJUST), UNITY * 123456);
+
+ /* Restore previous value. */
+ vcpu_set_msr(vm, 0, MSR_IA32_TSC_ADJUST, val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Guest: writes to MSR_IA32_TSC_ADJUST do not destroy the
+ * host-side offset and affect both MSRs.
+ */
+ run_vcpu(vm, VCPU_ID, 4);
+ val = 3ull * GUEST_STEP;
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Guest: writes to MSR_IA32_TSC affect both MSRs, so the host-side
+ * offset is now visible in MSR_IA32_TSC_ADJUST.
+ */
+ run_vcpu(vm, VCPU_ID, 5);
+ val = 4ull * GUEST_STEP;
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val - HOST_ADJUST);
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/user_msr_test.c b/tools/testing/selftests/kvm/x86_64/user_msr_test.c
new file mode 100644
index 000000000..cbe1b0889
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/user_msr_test.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tests for KVM_CAP_X86_USER_SPACE_MSR and KVM_X86_SET_MSR_FILTER
+ *
+ * Copyright (C) 2020, Amazon Inc.
+ *
+ * This is a functional test to verify that we can deflect MSR events
+ * into user space.
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 5
+
+static u32 msr_reads, msr_writes;
+
+static u8 bitmap_00000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_00000000_write[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_40000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_c0000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_c0000000_read[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_deadbeef[1] = { 0x1 };
+
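+/*
+ * The bitmaps default to all ones (every MSR in a range allowed); deny_msr()
+ * clears one MSR's bit so accesses to it are deflected to user space.
+ */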
+static void deny_msr(uint8_t *bitmap, u32 msr)
+{
+ u32 idx = msr & (KVM_MSR_FILTER_MAX_BITMAP_SIZE - 1);
+
+ bitmap[idx / 8] &= ~(1 << (idx % 8));
+}
+
+static void prepare_bitmaps(void)
+{
+ memset(bitmap_00000000, 0xff, sizeof(bitmap_00000000));
+ memset(bitmap_00000000_write, 0xff, sizeof(bitmap_00000000_write));
+ memset(bitmap_40000000, 0xff, sizeof(bitmap_40000000));
+ memset(bitmap_c0000000, 0xff, sizeof(bitmap_c0000000));
+ memset(bitmap_c0000000_read, 0xff, sizeof(bitmap_c0000000_read));
+
+ deny_msr(bitmap_00000000_write, MSR_IA32_POWER_CTL);
+ deny_msr(bitmap_c0000000_read, MSR_SYSCALL_MASK);
+ deny_msr(bitmap_c0000000_read, MSR_GS_BASE);
+}
+
+struct kvm_msr_filter filter = {
+ .flags = KVM_MSR_FILTER_DEFAULT_DENY,
+ .ranges = {
+ {
+ .flags = KVM_MSR_FILTER_READ,
+ .base = 0x00000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_00000000,
+ }, {
+ .flags = KVM_MSR_FILTER_WRITE,
+ .base = 0x00000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_00000000_write,
+ }, {
+ .flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE,
+ .base = 0x40000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_40000000,
+ }, {
+ .flags = KVM_MSR_FILTER_READ,
+ .base = 0xc0000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_c0000000_read,
+ }, {
+ .flags = KVM_MSR_FILTER_WRITE,
+ .base = 0xc0000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_c0000000,
+ }, {
+ .flags = KVM_MSR_FILTER_WRITE | KVM_MSR_FILTER_READ,
+ .base = 0xdeadbeef,
+ .nmsrs = 1,
+ .bitmap = bitmap_deadbeef,
+ },
+ },
+};
+
+struct kvm_msr_filter no_filter = {
+ .flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
+};
+
+static void guest_msr_calls(bool trapped)
+{
+ /* This goes into the in-kernel emulation */
+ wrmsr(MSR_SYSCALL_MASK, 0);
+
+ if (trapped) {
+ /* This goes into user space emulation */
+ GUEST_ASSERT(rdmsr(MSR_SYSCALL_MASK) == MSR_SYSCALL_MASK);
+ GUEST_ASSERT(rdmsr(MSR_GS_BASE) == MSR_GS_BASE);
+ } else {
+ GUEST_ASSERT(rdmsr(MSR_SYSCALL_MASK) != MSR_SYSCALL_MASK);
+ GUEST_ASSERT(rdmsr(MSR_GS_BASE) != MSR_GS_BASE);
+ }
+
+ /* If trapped == true, this goes into user space emulation */
+ wrmsr(MSR_IA32_POWER_CTL, 0x1234);
+
+ /* This goes into the in-kernel emulation */
+ rdmsr(MSR_IA32_POWER_CTL);
+
+ /* Invalid MSR, should always be handled by user space exit */
+ GUEST_ASSERT(rdmsr(0xdeadbeef) == 0xdeadbeef);
+ wrmsr(0xdeadbeef, 0x1234);
+}
+
+static void guest_code(void)
+{
+ guest_msr_calls(true);
+
+ /*
+ * Disable MSR filtering so that the kernel
+ * handles everything in the next round.
+ */
+ GUEST_SYNC(0);
+
+ guest_msr_calls(false);
+
+ GUEST_DONE();
+}
+
+static int handle_ucall(struct kvm_vm *vm)
+{
+ struct ucall uc;
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_ABORT:
+ TEST_FAIL("Guest assertion not met");
+ break;
+ case UCALL_SYNC:
+ vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &no_filter);
+ break;
+ case UCALL_DONE:
+ return 1;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+
+ return 0;
+}
+
+static void handle_rdmsr(struct kvm_run *run)
+{
+ run->msr.data = run->msr.index;
+ msr_reads++;
+
+ if (run->msr.index == MSR_SYSCALL_MASK ||
+ run->msr.index == MSR_GS_BASE) {
+ TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER,
+ "MSR read trap w/o access fault");
+ }
+
+ if (run->msr.index == 0xdeadbeef) {
+ TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_UNKNOWN,
+ "MSR deadbeef read trap w/o inval fault");
+ }
+}
+
+static void handle_wrmsr(struct kvm_run *run)
+{
+ /* ignore */
+ msr_writes++;
+
+ if (run->msr.index == MSR_IA32_POWER_CTL) {
+ TEST_ASSERT(run->msr.data == 0x1234,
+ "MSR data for MSR_IA32_POWER_CTL incorrect");
+ TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER,
+ "MSR_IA32_POWER_CTL trap w/o access fault");
+ }
+
+ if (run->msr.index == 0xdeadbeef) {
+ TEST_ASSERT(run->msr.data == 0x1234,
+ "MSR data for deadbeef incorrect");
+ TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_UNKNOWN,
+ "deadbeef trap w/o inval fault");
+ }
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_enable_cap cap = {
+ .cap = KVM_CAP_X86_USER_SPACE_MSR,
+ .args[0] = KVM_MSR_EXIT_REASON_INVAL |
+ KVM_MSR_EXIT_REASON_UNKNOWN |
+ KVM_MSR_EXIT_REASON_FILTER,
+ };
+ struct kvm_vm *vm;
+ struct kvm_run *run;
+ int rc;
+
+ /* Tell stdout not to buffer its content */
+ setbuf(stdout, NULL);
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ run = vcpu_state(vm, VCPU_ID);
+
+ rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR);
+ TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available");
+ vm_enable_cap(vm, &cap);
+
+ rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER);
+ TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available");
+
+ prepare_bitmaps();
+ vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter);
+
+ while (1) {
+ rc = _vcpu_run(vm, VCPU_ID);
+
+ TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc);
+
+ switch (run->exit_reason) {
+ case KVM_EXIT_X86_RDMSR:
+ handle_rdmsr(run);
+ break;
+ case KVM_EXIT_X86_WRMSR:
+ handle_wrmsr(run);
+ break;
+ case KVM_EXIT_IO:
+ if (handle_ucall(vm))
+ goto done;
+ break;
+ }
+
+ }
+
+done:
+ TEST_ASSERT(msr_reads == 4, "Handled 4 rdmsr in user space");
+ TEST_ASSERT(msr_writes == 3, "Handled 3 wrmsr in user space");
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c
new file mode 100644
index 000000000..1f65342d6
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vmx_apic_access_test
+ *
+ * Copyright (C) 2020, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * The first subtest simply checks to see that an L2 guest can be
+ * launched with a valid APIC-access address that is backed by a
+ * page of L1 physical memory.
+ *
+ * The second subtest sets the APIC-access address to a (valid) L1
+ * physical address that is not backed by memory. KVM can't handle
+ * this situation, so resuming L2 should result in a KVM exit for
+ * internal error (emulation). This is not an architectural
+ * requirement. It is just a shortcoming of KVM. The internal error
+ * is unfortunate, but it's better than what used to happen!
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "kselftest.h"
+
+#define VCPU_ID 0
+
+/* The virtual machine object. */
+static struct kvm_vm *vm;
+
+static void l2_guest_code(void)
+{
+ /* Exit to L1 */
+ __asm__ __volatile__("vmcall");
+}
+
+static void l1_guest_code(struct vmx_pages *vmx_pages, unsigned long high_gpa)
+{
+#define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+ uint32_t control;
+
+ GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+ GUEST_ASSERT(load_vmcs(vmx_pages));
+
+ /* Prepare the VMCS for L2 execution. */
+ prepare_vmcs(vmx_pages, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ control = vmreadz(CPU_BASED_VM_EXEC_CONTROL);
+ control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+ vmwrite(CPU_BASED_VM_EXEC_CONTROL, control);
+ control = vmreadz(SECONDARY_VM_EXEC_CONTROL);
+ control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ vmwrite(SECONDARY_VM_EXEC_CONTROL, control);
+ vmwrite(APIC_ACCESS_ADDR, vmx_pages->apic_access_gpa);
+
+ /* Try to launch L2 with the memory-backed APIC-access address. */
+ GUEST_SYNC(vmreadz(APIC_ACCESS_ADDR));
+ GUEST_ASSERT(!vmlaunch());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+ vmwrite(APIC_ACCESS_ADDR, high_gpa);
+
+ /* Try to resume L2 with the unbacked APIC-access address. */
+ GUEST_SYNC(vmreadz(APIC_ACCESS_ADDR));
+ GUEST_ASSERT(!vmresume());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ unsigned long apic_access_addr = ~0ul;
+ unsigned int paddr_width;
+ unsigned int vaddr_width;
+ vm_vaddr_t vmx_pages_gva;
+ unsigned long high_gpa;
+ struct vmx_pages *vmx;
+ bool done = false;
+
+ nested_vmx_check_supported();
+
+ vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+ kvm_get_cpu_address_width(&paddr_width, &vaddr_width);
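+ /*
+ * Candidate unbacked address: the highest page of guest physical
+ * address space. The check below ensures it lies above the VM's
+ * default memory.
+ */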
+ high_gpa = (1ul << paddr_width) - getpagesize();
+ if ((unsigned long)DEFAULT_GUEST_PHY_PAGES * getpagesize() > high_gpa) {
+ print_skip("No unbacked physical page available");
+ exit(KSFT_SKIP);
+ }
+
+ vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva);
+ prepare_virtualize_apic_accesses(vmx, vm, 0);
+ vcpu_args_set(vm, VCPU_ID, 2, vmx_pages_gva, high_gpa);
+
+ while (!done) {
+ volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+ struct ucall uc;
+
+ vcpu_run(vm, VCPU_ID);
+ if (apic_access_addr == high_gpa) {
+ TEST_ASSERT(run->exit_reason ==
+ KVM_EXIT_INTERNAL_ERROR,
+ "Got exit reason other than KVM_EXIT_INTERNAL_ERROR: %u (%s)\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+ TEST_ASSERT(run->internal.suberror ==
+ KVM_INTERNAL_ERROR_EMULATION,
+ "Got internal suberror other than KVM_INTERNAL_ERROR_EMULATION: %u\n",
+ run->internal.suberror);
+ break;
+ }
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_ABORT:
+ TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
+ __FILE__, uc.args[1]);
+ /* NOT REACHED */
+ case UCALL_SYNC:
+ apic_access_addr = uc.args[1];
+ break;
+ case UCALL_DONE:
+ done = true;
+ break;
+ default:
+ TEST_ASSERT(false, "Unknown ucall %lu", uc.cmd);
+ }
+ }
+ kvm_vm_free(vm);
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c
new file mode 100644
index 000000000..fe40ade06
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vmx_close_while_nested
+ *
+ * Copyright (C) 2019, Red Hat, Inc.
+ *
+ * Verify that nothing bad happens if a KVM user exits with open
+ * file descriptors while executing a nested guest.
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "kselftest.h"
+
+#define VCPU_ID 5
+
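+/*
+ * An I/O access to PORT_L0_EXIT from L2 reaches the host; when the host sees
+ * this port it breaks out of the run loop and returns without tearing the VM
+ * down, exercising process exit with a nested guest still active.
+ */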
+enum {
+ PORT_L0_EXIT = 0x2000,
+};
+
+/* The virtual machine object. */
+static struct kvm_vm *vm;
+
+static void l2_guest_code(void)
+{
+ /* Exit to L0 */
+ asm volatile("inb %%dx, %%al"
+ : : [port] "d" (PORT_L0_EXIT) : "rax");
+}
+
+static void l1_guest_code(struct vmx_pages *vmx_pages)
+{
+#define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+
+ GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+ GUEST_ASSERT(load_vmcs(vmx_pages));
+
+ /* Prepare the VMCS for L2 execution. */
+ prepare_vmcs(vmx_pages, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ GUEST_ASSERT(!vmlaunch());
+ GUEST_ASSERT(0);
+}
+
+int main(int argc, char *argv[])
+{
+ vm_vaddr_t vmx_pages_gva;
+
+ nested_vmx_check_supported();
+
+ vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+ /* Allocate VMX pages and shared descriptors (vmx_pages). */
+ vcpu_alloc_vmx(vm, &vmx_pages_gva);
+ vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
+
+ for (;;) {
+ volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+ struct ucall uc;
+
+ vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ if (run->io.port == PORT_L0_EXIT)
+ break;
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_ABORT:
+ TEST_FAIL("%s", (const char *)uc.args[0]);
+ /* NOT REACHED */
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+ }
+}
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c
new file mode 100644
index 000000000..e894a638a
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM dirty page logging test
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#define VCPU_ID 1
+
+/* The memory slot index to track dirty pages */
+#define TEST_MEM_SLOT_INDEX 1
+#define TEST_MEM_PAGES 3
+
+/* L1 guest test virtual memory offset */
+#define GUEST_TEST_MEM 0xc0000000
+
+/* L2 guest test virtual memory offset */
+#define NESTED_TEST_MEM1 0xc0001000
+#define NESTED_TEST_MEM2 0xc0002000
+
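+/*
+ * Each GUEST_SYNC(arg) below tells the host whether the first page of the
+ * test memslot is expected to have been dirtied since the previous sync.
+ */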
+static void l2_guest_code(void)
+{
+ *(volatile uint64_t *)NESTED_TEST_MEM1;
+ *(volatile uint64_t *)NESTED_TEST_MEM1 = 1;
+ GUEST_SYNC(true);
+ GUEST_SYNC(false);
+
+ *(volatile uint64_t *)NESTED_TEST_MEM2 = 1;
+ GUEST_SYNC(true);
+ *(volatile uint64_t *)NESTED_TEST_MEM2 = 1;
+ GUEST_SYNC(true);
+ GUEST_SYNC(false);
+
+ /* Exit to L1 and never come back. */
+ vmcall();
+}
+
+void l1_guest_code(struct vmx_pages *vmx)
+{
+#define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+
+ GUEST_ASSERT(vmx->vmcs_gpa);
+ GUEST_ASSERT(prepare_for_vmx_operation(vmx));
+ GUEST_ASSERT(load_vmcs(vmx));
+
+ prepare_vmcs(vmx, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ GUEST_SYNC(false);
+ GUEST_ASSERT(!vmlaunch());
+ GUEST_SYNC(false);
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ vm_vaddr_t vmx_pages_gva = 0;
+ struct vmx_pages *vmx;
+ unsigned long *bmap;
+ uint64_t *host_test_mem;
+
+ struct kvm_vm *vm;
+ struct kvm_run *run;
+ struct ucall uc;
+ bool done = false;
+
+ nested_vmx_check_supported();
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, l1_guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva);
+ vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
+ run = vcpu_state(vm, VCPU_ID);
+
+ /* Add an extra memory slot for testing dirty logging */
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ GUEST_TEST_MEM,
+ TEST_MEM_SLOT_INDEX,
+ TEST_MEM_PAGES,
+ KVM_MEM_LOG_DIRTY_PAGES);
+
+ /*
+ * Add an identity map for GVA range [0xc0000000, 0xc0003000). This
+ * affects both L1 and L2. However...
+ */
+ virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES, 0);
+
+ /*
+ * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to
+ * 0xc0000000.
+ *
+ * Note that prepare_eptp should be called only after L1's GPA map is done,
+ * meaning after the last call to virt_map.
+ */
+ prepare_eptp(vmx, vm, 0);
+ nested_map_memslot(vmx, vm, 0, 0);
+ nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096, 0);
+ nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096, 0);
+
+ bmap = bitmap_alloc(TEST_MEM_PAGES);
+ host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM);
+
+ while (!done) {
+ memset(host_test_mem, 0xaa, TEST_MEM_PAGES * 4096);
+ _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Unexpected exit reason: %u (%s),\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_ABORT:
+ TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
+ __FILE__, uc.args[1]);
+ /* NOT REACHED */
+ case UCALL_SYNC:
+ /*
+ * The nested guest wrote at offset 0x1000 in the memslot, but the
+ * dirty bitmap must be filled in according to L1 GPA, not L2.
+ */
+ kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap);
+ if (uc.args[1]) {
+ TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean\n");
+ TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest\n");
+ } else {
+ TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty\n");
+ TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest\n");
+ }
+
+ TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty\n");
+ TEST_ASSERT(host_test_mem[4096 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest\n");
+ TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty\n");
+ TEST_ASSERT(host_test_mem[8192 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest\n");
+ break;
+ case UCALL_DONE:
+ done = true;
+ break;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+ }
+}
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
new file mode 100644
index 000000000..a7737af12
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
@@ -0,0 +1,259 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VMX-preemption timer test
+ *
+ * Copyright (C) 2020, Google, LLC.
+ *
+ * Test to ensure that VM-Enter after migration doesn't
+ * incorrectly restart the timer with the full timer
+ * value instead of the partially decayed timer value.
+ *
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#define VCPU_ID 5
+#define PREEMPTION_TIMER_VALUE 100000000ull
+#define PREEMPTION_TIMER_VALUE_THRESHOLD1 80000000ull
+
+u32 vmx_pt_rate;
+bool l2_save_restore_done;
+static u64 l2_vmx_pt_start;
+volatile u64 l2_vmx_pt_finish;
+
+union vmx_basic basic;
+union vmx_ctrl_msr ctrl_pin_rev;
+union vmx_ctrl_msr ctrl_exit_rev;
+
+void l2_guest_code(void)
+{
+ u64 vmx_pt_delta;
+
+ vmcall();
+ l2_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate;
+
+ /*
+ * Wait until the 1st threshold has passed
+ */
+ do {
+ l2_vmx_pt_finish = rdtsc();
+ vmx_pt_delta = (l2_vmx_pt_finish - l2_vmx_pt_start) >>
+ vmx_pt_rate;
+ } while (vmx_pt_delta < PREEMPTION_TIMER_VALUE_THRESHOLD1);
+
+ /*
+ * Force L2 through Save and Restore cycle
+ */
+ GUEST_SYNC(1);
+
+ l2_save_restore_done = 1;
+
+ /*
+ * Now wait for the preemption timer to fire and
+ * exit to L1
+ */
+ while ((l2_vmx_pt_finish = rdtsc()))
+ ;
+}
+
+void l1_guest_code(struct vmx_pages *vmx_pages)
+{
+#define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+ u64 l1_vmx_pt_start;
+ u64 l1_vmx_pt_finish;
+ u64 l1_tsc_deadline, l2_tsc_deadline;
+
+ GUEST_ASSERT(vmx_pages->vmcs_gpa);
+ GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+ GUEST_ASSERT(load_vmcs(vmx_pages));
+ GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
+
+ prepare_vmcs(vmx_pages, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ /*
+ * Check for Preemption timer support
+ */
+ basic.val = rdmsr(MSR_IA32_VMX_BASIC);
+ ctrl_pin_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_PINBASED_CTLS
+ : MSR_IA32_VMX_PINBASED_CTLS);
+ ctrl_exit_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_EXIT_CTLS
+ : MSR_IA32_VMX_EXIT_CTLS);
+
+ if (!(ctrl_pin_rev.clr & PIN_BASED_VMX_PREEMPTION_TIMER) ||
+ !(ctrl_exit_rev.clr & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
+ return;
+
+ GUEST_ASSERT(!vmlaunch());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+ vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + vmreadz(VM_EXIT_INSTRUCTION_LEN));
+
+ /*
+ * Turn on PIN control and resume the guest
+ */
+ GUEST_ASSERT(!vmwrite(PIN_BASED_VM_EXEC_CONTROL,
+ vmreadz(PIN_BASED_VM_EXEC_CONTROL) |
+ PIN_BASED_VMX_PREEMPTION_TIMER));
+
+ GUEST_ASSERT(!vmwrite(VMX_PREEMPTION_TIMER_VALUE,
+ PREEMPTION_TIMER_VALUE));
+
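+ /*
+ * Bits 4:0 of IA32_VMX_MISC give the preemption timer rate: the timer
+ * counts down by one every 2^rate TSC cycles.
+ */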
+ vmx_pt_rate = rdmsr(MSR_IA32_VMX_MISC) & 0x1F;
+
+ l2_save_restore_done = 0;
+
+ l1_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate;
+
+ GUEST_ASSERT(!vmresume());
+
+ l1_vmx_pt_finish = rdtsc();
+
+ /*
+ * Ensure exit from L2 happens after L2 goes through
+ * save and restore
+ */
+ GUEST_ASSERT(l2_save_restore_done);
+
+ /*
+ * Ensure the exit from L2 is due to preemption timer expiry
+ */
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_PREEMPTION_TIMER);
+
+ l1_tsc_deadline = l1_vmx_pt_start +
+ (PREEMPTION_TIMER_VALUE << vmx_pt_rate);
+
+ l2_tsc_deadline = l2_vmx_pt_start +
+ (PREEMPTION_TIMER_VALUE << vmx_pt_rate);
+
+ /*
+ * Sync with the host and pass the L1/L2 preemption-timer expiry times and
+ * TSC deadlines so that the host can verify they are as expected.
+ */
+ GUEST_SYNC_ARGS(2, l1_vmx_pt_finish, l1_tsc_deadline,
+ l2_vmx_pt_finish, l2_tsc_deadline);
+}
+
+void guest_code(struct vmx_pages *vmx_pages)
+{
+ if (vmx_pages)
+ l1_guest_code(vmx_pages);
+
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ vm_vaddr_t vmx_pages_gva = 0;
+
+ struct kvm_regs regs1, regs2;
+ struct kvm_vm *vm;
+ struct kvm_run *run;
+ struct kvm_x86_state *state;
+ struct ucall uc;
+ int stage;
+
+ /*
+ * AMD currently does not implement any VMX features, so for now we
+ * just early out.
+ */
+ nested_vmx_check_supported();
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ run = vcpu_state(vm, VCPU_ID);
+
+ vcpu_regs_get(vm, VCPU_ID, &regs1);
+
+ if (kvm_check_cap(KVM_CAP_NESTED_STATE)) {
+ vcpu_alloc_vmx(vm, &vmx_pages_gva);
+ vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
+ } else {
+ pr_info("will skip vmx preemption timer checks\n");
+ goto done;
+ }
+
+ for (stage = 1;; stage++) {
+ _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Stage %d: unexpected exit reason: %u (%s),\n",
+ stage, run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_ABORT:
+ TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
+ __FILE__, uc.args[1]);
+ /* NOT REACHED */
+ case UCALL_SYNC:
+ break;
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+
+ /* UCALL_SYNC is handled here. */
+ TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+ uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx",
+ stage, (ulong)uc.args[1]);
+ /*
+ * If this is stage 2, verify that the VMX preemption timer expiry
+ * is as expected:
+ * from L1's perspective, verify the preemption timer hasn't
+ * expired too early;
+ * from L2's perspective, verify the preemption timer hasn't
+ * expired too late.
+ */
+ if (stage == 2) {
+
+ pr_info("Stage %d: L1 PT expiry TSC (%lu) , L1 TSC deadline (%lu)\n",
+ stage, uc.args[2], uc.args[3]);
+
+ pr_info("Stage %d: L2 PT expiry TSC (%lu) , L2 TSC deadline (%lu)\n",
+ stage, uc.args[4], uc.args[5]);
+
+ TEST_ASSERT(uc.args[2] >= uc.args[3],
+ "Stage %d: L1 PT expiry TSC (%lu) < L1 TSC deadline (%lu)",
+ stage, uc.args[2], uc.args[3]);
+
+ TEST_ASSERT(uc.args[4] < uc.args[5],
+ "Stage %d: L2 PT expiry TSC (%lu) > L2 TSC deadline (%lu)",
+ stage, uc.args[4], uc.args[5]);
+ }
+
+ state = vcpu_save_state(vm, VCPU_ID);
+ memset(&regs1, 0, sizeof(regs1));
+ vcpu_regs_get(vm, VCPU_ID, &regs1);
+
+ kvm_vm_release(vm);
+
+ /* Restore state in a new VM. */
+ kvm_vm_restart(vm, O_RDWR);
+ vm_vcpu_add(vm, VCPU_ID);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ vcpu_load_state(vm, VCPU_ID, state);
+ run = vcpu_state(vm, VCPU_ID);
+ free(state);
+
+ memset(&regs2, 0, sizeof(regs2));
+ vcpu_regs_get(vm, VCPU_ID, &regs2);
+ TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
+ "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
+ (ulong) regs2.rdi, (ulong) regs2.rsi);
+ }
+
+done:
+ kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c
new file mode 100644
index 000000000..d59f3eb67
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c
@@ -0,0 +1,298 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vmx_set_nested_state_test
+ *
+ * Copyright (C) 2019, Google LLC.
+ *
+ * This test verifies the integrity of calling the ioctl KVM_SET_NESTED_STATE.
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#include <errno.h>
+#include <linux/kvm.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+
+/*
+ * Mirror of VMCS12_REVISION in arch/x86/kvm/vmx/vmcs12.h. If that value
+ * changes, this should be updated.
+ */
+#define VMCS12_REVISION 0x11e57ed0
+#define VCPU_ID 5
+
+bool have_evmcs;
+
+void test_nested_state(struct kvm_vm *vm, struct kvm_nested_state *state)
+{
+ vcpu_nested_state_set(vm, VCPU_ID, state, false);
+}
+
+void test_nested_state_expect_errno(struct kvm_vm *vm,
+ struct kvm_nested_state *state,
+ int expected_errno)
+{
+ int rv;
+
+ rv = vcpu_nested_state_set(vm, VCPU_ID, state, true);
+ TEST_ASSERT(rv == -1 && errno == expected_errno,
+ "Expected %s (%d) from vcpu_nested_state_set but got rv: %i errno: %s (%d)",
+ strerror(expected_errno), expected_errno, rv, strerror(errno),
+ errno);
+}
+
+void test_nested_state_expect_einval(struct kvm_vm *vm,
+ struct kvm_nested_state *state)
+{
+ test_nested_state_expect_errno(vm, state, EINVAL);
+}
+
+void test_nested_state_expect_efault(struct kvm_vm *vm,
+ struct kvm_nested_state *state)
+{
+ test_nested_state_expect_errno(vm, state, EFAULT);
+}
+
+void set_revision_id_for_vmcs12(struct kvm_nested_state *state,
+ u32 vmcs12_revision)
+{
+ /* Set revision_id in vmcs12 to vmcs12_revision. */
+ memcpy(&state->data, &vmcs12_revision, sizeof(u32));
+}
+
+void set_default_state(struct kvm_nested_state *state)
+{
+ memset(state, 0, sizeof(*state));
+ state->flags = KVM_STATE_NESTED_RUN_PENDING |
+ KVM_STATE_NESTED_GUEST_MODE;
+ state->format = 0;
+ state->size = sizeof(*state);
+}
+
+void set_default_vmx_state(struct kvm_nested_state *state, int size)
+{
+ memset(state, 0, size);
+ if (have_evmcs)
+ state->flags = KVM_STATE_NESTED_EVMCS;
+ state->format = 0;
+ state->size = size;
+ state->hdr.vmx.vmxon_pa = 0x1000;
+ state->hdr.vmx.vmcs12_pa = 0x2000;
+ state->hdr.vmx.smm.flags = 0;
+ set_revision_id_for_vmcs12(state, VMCS12_REVISION);
+}
+
+void test_vmx_nested_state(struct kvm_vm *vm)
+{
+ /* Add a page for VMCS12. */
+ const int state_sz = sizeof(struct kvm_nested_state) + getpagesize();
+ struct kvm_nested_state *state =
+ (struct kvm_nested_state *)malloc(state_sz);
+
+ /* The format must be set to 0. 0 for VMX, 1 for SVM. */
+ set_default_vmx_state(state, state_sz);
+ state->format = 1;
+ test_nested_state_expect_einval(vm, state);
+
+ /*
+ * We cannot virtualize anything if the guest does not have VMX
+ * enabled.
+ */
+ set_default_vmx_state(state, state_sz);
+ test_nested_state_expect_einval(vm, state);
+
+ /*
+ * We cannot virtualize anything if the guest does not have VMX
+ * enabled. We expect KVM_SET_NESTED_STATE to return 0 if vmxon_pa
+ * is set to -1ull, but the flags must be zero.
+ */
+ set_default_vmx_state(state, state_sz);
+ state->hdr.vmx.vmxon_pa = -1ull;
+ test_nested_state_expect_einval(vm, state);
+
+ state->hdr.vmx.vmcs12_pa = -1ull;
+ state->flags = KVM_STATE_NESTED_EVMCS;
+ test_nested_state_expect_einval(vm, state);
+
+ state->flags = 0;
+ test_nested_state(vm, state);
+
+ /* Enable VMX in the guest CPUID. */
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+ /*
+ * Setting vmxon_pa == -1ull and vmcs_pa == -1ull exits early without
+	 * setting the nested state, but flags other than eVMCS must be clear.
+ * The eVMCS flag can be set if the enlightened VMCS capability has
+ * been enabled.
+ */
+ set_default_vmx_state(state, state_sz);
+ state->hdr.vmx.vmxon_pa = -1ull;
+ state->hdr.vmx.vmcs12_pa = -1ull;
+ test_nested_state_expect_einval(vm, state);
+
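+	/*
+	 * Keep only the eVMCS flag; KVM accepts it only after the
+	 * enlightened VMCS capability has been enabled below.
+	 */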
+ state->flags &= KVM_STATE_NESTED_EVMCS;
+ if (have_evmcs) {
+ test_nested_state_expect_einval(vm, state);
+ vcpu_enable_evmcs(vm, VCPU_ID);
+ }
+ test_nested_state(vm, state);
+
+ /* It is invalid to have vmxon_pa == -1ull and SMM flags non-zero. */
+ state->hdr.vmx.smm.flags = 1;
+ test_nested_state_expect_einval(vm, state);
+
+ /* Invalid flags are rejected. */
+ set_default_vmx_state(state, state_sz);
+ state->hdr.vmx.flags = ~0;
+ test_nested_state_expect_einval(vm, state);
+
+ /* It is invalid to have vmxon_pa == -1ull and vmcs_pa != -1ull. */
+ set_default_vmx_state(state, state_sz);
+ state->hdr.vmx.vmxon_pa = -1ull;
+ state->flags = 0;
+ test_nested_state_expect_einval(vm, state);
+
+ /* It is invalid to have vmxon_pa set to a non-page aligned address. */
+ set_default_vmx_state(state, state_sz);
+ state->hdr.vmx.vmxon_pa = 1;
+ test_nested_state_expect_einval(vm, state);
+
+ /*
+ * It is invalid to have KVM_STATE_NESTED_SMM_GUEST_MODE and
+ * KVM_STATE_NESTED_GUEST_MODE set together.
+ */
+ set_default_vmx_state(state, state_sz);
+ state->flags = KVM_STATE_NESTED_GUEST_MODE |
+ KVM_STATE_NESTED_RUN_PENDING;
+ state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE;
+ test_nested_state_expect_einval(vm, state);
+
+ /*
+ * It is invalid to have any of the SMM flags set besides:
+ * KVM_STATE_NESTED_SMM_GUEST_MODE
+ * KVM_STATE_NESTED_SMM_VMXON
+ */
+ set_default_vmx_state(state, state_sz);
+ state->hdr.vmx.smm.flags = ~(KVM_STATE_NESTED_SMM_GUEST_MODE |
+ KVM_STATE_NESTED_SMM_VMXON);
+ test_nested_state_expect_einval(vm, state);
+
+ /* Outside SMM, SMM flags must be zero. */
+ set_default_vmx_state(state, state_sz);
+ state->flags = 0;
+ state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE;
+ test_nested_state_expect_einval(vm, state);
+
+	/*
+	 * The size must be large enough to fit kvm_nested_state and vmcs12
+	 * if the VMCS12 physical address is set.
+	 */
+ set_default_vmx_state(state, state_sz);
+ state->size = sizeof(*state);
+ state->flags = 0;
+ test_nested_state_expect_einval(vm, state);
+
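+	/* With no vmcs12 (vmcs12_pa == -1), a header-sized buffer suffices. */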
+ set_default_vmx_state(state, state_sz);
+ state->size = sizeof(*state);
+ state->flags = 0;
+ state->hdr.vmx.vmcs12_pa = -1;
+ test_nested_state(vm, state);
+
+	/*
+	 * KVM_SET_NESTED_STATE succeeds with invalid VMCS contents
+	 * as long as L2 is not running.
+	 */
+ set_default_vmx_state(state, state_sz);
+ state->flags = 0;
+ test_nested_state(vm, state);
+
+ /* Invalid flags are rejected, even if no VMCS loaded. */
+ set_default_vmx_state(state, state_sz);
+ state->size = sizeof(*state);
+ state->flags = 0;
+ state->hdr.vmx.vmcs12_pa = -1;
+ state->hdr.vmx.flags = ~0;
+ test_nested_state_expect_einval(vm, state);
+
+ /* vmxon_pa cannot be the same address as vmcs_pa. */
+ set_default_vmx_state(state, state_sz);
+ state->hdr.vmx.vmxon_pa = 0;
+ state->hdr.vmx.vmcs12_pa = 0;
+ test_nested_state_expect_einval(vm, state);
+
+	/*
+	 * Test that, if we leave nesting, the state returned by
+	 * KVM_GET_NESTED_STATE reflects that.
+	 */
+ set_default_vmx_state(state, state_sz);
+ state->hdr.vmx.vmxon_pa = -1ull;
+ state->hdr.vmx.vmcs12_pa = -1ull;
+ state->flags = 0;
+ test_nested_state(vm, state);
+ vcpu_nested_state_get(vm, VCPU_ID, state);
+ TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz,
+ "Size must be between %ld and %d. The size returned was %d.",
+ sizeof(*state), state_sz, state->size);
+ TEST_ASSERT(state->hdr.vmx.vmxon_pa == -1ull, "vmxon_pa must be -1ull.");
+ TEST_ASSERT(state->hdr.vmx.vmcs12_pa == -1ull, "vmcs_pa must be -1ull.");
+
+ free(state);
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vm *vm;
+ struct kvm_nested_state state;
+
+ have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS);
+
+ if (!kvm_check_cap(KVM_CAP_NESTED_STATE)) {
+ print_skip("KVM_CAP_NESTED_STATE not available");
+ exit(KSFT_SKIP);
+ }
+
+	/*
+	 * AMD currently does not implement set_nested_state, so for now we
+	 * just exit early.
+	 */
+ nested_vmx_check_supported();
+
+ vm = vm_create_default(VCPU_ID, 0, 0);
+
+	/* Passing a NULL kvm_nested_state causes an EFAULT. */
+ test_nested_state_expect_efault(vm, NULL);
+
+ /* 'size' cannot be smaller than sizeof(kvm_nested_state). */
+ set_default_state(&state);
+ state.size = 0;
+ test_nested_state_expect_einval(vm, &state);
+
+ /*
+ * Setting the flags 0xf fails the flags check. The only flags that
+ * can be used are:
+ * KVM_STATE_NESTED_GUEST_MODE
+ * KVM_STATE_NESTED_RUN_PENDING
+ * KVM_STATE_NESTED_EVMCS
+ */
+ set_default_state(&state);
+ state.flags = 0xf;
+ test_nested_state_expect_einval(vm, &state);
+
+ /*
+ * If KVM_STATE_NESTED_RUN_PENDING is set then
+ * KVM_STATE_NESTED_GUEST_MODE has to be set as well.
+ */
+ set_default_state(&state);
+ state.flags = KVM_STATE_NESTED_RUN_PENDING;
+ test_nested_state_expect_einval(vm, &state);
+
+ test_vmx_nested_state(vm);
+
+ kvm_vm_free(vm);
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c
new file mode 100644
index 000000000..fbe8417cb
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vmx_tsc_adjust_test
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * IA32_TSC_ADJUST test
+ *
+ * According to the SDM, "if an execution of WRMSR to the
+ * IA32_TIME_STAMP_COUNTER MSR adds (or subtracts) value X from the TSC,
+ * the logical processor also adds (or subtracts) value X from the
+ * IA32_TSC_ADJUST MSR."
+ *
+ * Note that when L1 doesn't intercept writes to IA32_TSC, a
+ * WRMSR(IA32_TSC) from L2 sets L1's TSC value, not L2's perceived TSC
+ * value.
+ *
+ * This test verifies that this unusual case is handled correctly.
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "kselftest.h"
+
+#ifndef MSR_IA32_TSC_ADJUST
+#define MSR_IA32_TSC_ADJUST 0x3b
+#endif
+
+#define PAGE_SIZE 4096
+#define VCPU_ID 5
+
+#define TSC_ADJUST_VALUE (1ll << 32)
+#define TSC_OFFSET_VALUE -(1ll << 48)
+
+enum {
+ PORT_ABORT = 0x1000,
+ PORT_REPORT,
+ PORT_DONE,
+};
+
+enum {
+ VMXON_PAGE = 0,
+ VMCS_PAGE,
+ MSR_BITMAP_PAGE,
+
+ NUM_VMX_PAGES,
+};
+
+struct kvm_single_msr {
+ struct kvm_msrs header;
+ struct kvm_msr_entry entry;
+} __attribute__((packed));
+
+/* The virtual machine object. */
+static struct kvm_vm *vm;
+
+static void check_ia32_tsc_adjust(int64_t max)
+{
+ int64_t adjust;
+
+ adjust = rdmsr(MSR_IA32_TSC_ADJUST);
+ GUEST_SYNC(adjust);
+ GUEST_ASSERT(adjust <= max);
+}
+
+static void l2_guest_code(void)
+{
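+	/*
+	 * CPU_BASED_USE_TSC_OFFSETTING is enabled, so RDTSC in L2 returns
+	 * L1's TSC plus TSC_OFFSET_VALUE; subtract the offset to recover
+	 * L1's view of the TSC.
+	 */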
+ uint64_t l1_tsc = rdtsc() - TSC_OFFSET_VALUE;
+
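+	/*
+	 * L1 does not intercept IA32_TSC writes, so this WRMSR updates L1's
+	 * TSC and should pull IA32_TSC_ADJUST down by roughly another
+	 * TSC_ADJUST_VALUE, to about -2 * TSC_ADJUST_VALUE in total.
+	 */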
+ wrmsr(MSR_IA32_TSC, l1_tsc - TSC_ADJUST_VALUE);
+ check_ia32_tsc_adjust(-2 * TSC_ADJUST_VALUE);
+
+ /* Exit to L1 */
+ __asm__ __volatile__("vmcall");
+}
+
+static void l1_guest_code(struct vmx_pages *vmx_pages)
+{
+#define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+ uint32_t control;
+ uintptr_t save_cr3;
+
+ GUEST_ASSERT(rdtsc() < TSC_ADJUST_VALUE);
+ wrmsr(MSR_IA32_TSC, rdtsc() - TSC_ADJUST_VALUE);
+ check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE);
+
+ GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+ GUEST_ASSERT(load_vmcs(vmx_pages));
+
+ /* Prepare the VMCS for L2 execution. */
+ prepare_vmcs(vmx_pages, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ control = vmreadz(CPU_BASED_VM_EXEC_CONTROL);
+ control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING;
+ vmwrite(CPU_BASED_VM_EXEC_CONTROL, control);
+ vmwrite(TSC_OFFSET, TSC_OFFSET_VALUE);
+
+ /* Jump into L2. First, test failure to load guest CR3. */
+ save_cr3 = vmreadz(GUEST_CR3);
+ vmwrite(GUEST_CR3, -1ull);
+ GUEST_ASSERT(!vmlaunch());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) ==
+ (EXIT_REASON_FAILED_VMENTRY | EXIT_REASON_INVALID_STATE));
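+	/* The failed VM-entry should not have disturbed IA32_TSC_ADJUST. */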
+ check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE);
+ vmwrite(GUEST_CR3, save_cr3);
+
+ GUEST_ASSERT(!vmlaunch());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+ check_ia32_tsc_adjust(-2 * TSC_ADJUST_VALUE);
+
+ GUEST_DONE();
+}
+
+static void report(int64_t val)
+{
+ pr_info("IA32_TSC_ADJUST is %ld (%lld * TSC_ADJUST_VALUE + %lld).\n",
+ val, val / TSC_ADJUST_VALUE, val % TSC_ADJUST_VALUE);
+}
+
+int main(int argc, char *argv[])
+{
+ vm_vaddr_t vmx_pages_gva;
+
+ nested_vmx_check_supported();
+
+ vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+ /* Allocate VMX pages and shared descriptors (vmx_pages). */
+ vcpu_alloc_vmx(vm, &vmx_pages_gva);
+ vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
+
+ for (;;) {
+ volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+ struct ucall uc;
+
+ vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_ABORT:
+ TEST_FAIL("%s", (const char *)uc.args[0]);
+ /* NOT REACHED */
+ case UCALL_SYNC:
+ report(uc.args[1]);
+ break;
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+ }
+
+done:
+	kvm_vm_free(vm);
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c
new file mode 100644
index 000000000..352937674
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019, Google LLC.
+ *
+ * Tests for the IA32_XSS MSR.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "vmx.h"
+
+#define VCPU_ID 1
+#define MSR_BITS 64
+
+#define X86_FEATURE_XSAVES (1<<3)
+
+bool is_supported_msr(u32 msr_index)
+{
+ struct kvm_msr_list *list;
+ bool found = false;
+ int i;
+
+ list = kvm_get_msr_index_list();
+ for (i = 0; i < list->nmsrs; ++i) {
+ if (list->indices[i] == msr_index) {
+ found = true;
+ break;
+ }
+ }
+
+ free(list);
+ return found;
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_cpuid_entry2 *entry;
+ bool xss_supported = false;
+ struct kvm_vm *vm;
+ uint64_t xss_val;
+ int i, r;
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, 0);
+
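+	/* CPUID.(EAX=0DH,ECX=1):EAX[3] enumerates XSAVES and IA32_XSS. */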
+ if (kvm_get_cpuid_max_basic() >= 0xd) {
+ entry = kvm_get_supported_cpuid_index(0xd, 1);
+ xss_supported = entry && !!(entry->eax & X86_FEATURE_XSAVES);
+ }
+ if (!xss_supported) {
+ print_skip("IA32_XSS is not supported by the vCPU");
+ exit(KSFT_SKIP);
+ }
+
+ xss_val = vcpu_get_msr(vm, VCPU_ID, MSR_IA32_XSS);
+ TEST_ASSERT(xss_val == 0,
+ "MSR_IA32_XSS should be initialized to zero\n");
+
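+	/* Writing back the current (zero) value should always succeed. */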
+ vcpu_set_msr(vm, VCPU_ID, MSR_IA32_XSS, xss_val);
+	/*
+	 * At present, KVM only supports a guest IA32_XSS value of 0.  Verify
+	 * that trying to set the guest IA32_XSS to an unsupported value fails.
+	 * Also, if in the future a non-zero value is accepted, check that
+	 * IA32_XSS is reported in KVM_GET_MSR_INDEX_LIST.
+	 */
+ for (i = 0; i < MSR_BITS; ++i) {
+ r = _vcpu_set_msr(vm, VCPU_ID, MSR_IA32_XSS, 1ull << i);
+ TEST_ASSERT(r == 0 || is_supported_msr(MSR_IA32_XSS),
+ "IA32_XSS was able to be set, but was not found in KVM_GET_MSR_INDEX_LIST.\n");
+ }
+
+ kvm_vm_free(vm);
+}