From 76cb841cb886eef6b3bee341a2266c76578724ad Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Mon, 6 May 2024 03:02:30 +0200 Subject: Adding upstream version 4.19.249. Signed-off-by: Daniel Baumann --- tools/testing/selftests/kvm/lib/assert.c | 92 + tools/testing/selftests/kvm/lib/elf.c | 197 ++ tools/testing/selftests/kvm/lib/io.c | 158 ++ tools/testing/selftests/kvm/lib/kvm_util.c | 1680 ++++++++++++++++ .../testing/selftests/kvm/lib/kvm_util_internal.h | 72 + tools/testing/selftests/kvm/lib/sparsebit.c | 2087 ++++++++++++++++++++ tools/testing/selftests/kvm/lib/vmx.c | 283 +++ tools/testing/selftests/kvm/lib/x86.c | 893 +++++++++ 8 files changed, 5462 insertions(+) create mode 100644 tools/testing/selftests/kvm/lib/assert.c create mode 100644 tools/testing/selftests/kvm/lib/elf.c create mode 100644 tools/testing/selftests/kvm/lib/io.c create mode 100644 tools/testing/selftests/kvm/lib/kvm_util.c create mode 100644 tools/testing/selftests/kvm/lib/kvm_util_internal.h create mode 100644 tools/testing/selftests/kvm/lib/sparsebit.c create mode 100644 tools/testing/selftests/kvm/lib/vmx.c create mode 100644 tools/testing/selftests/kvm/lib/x86.c (limited to 'tools/testing/selftests/kvm/lib') diff --git a/tools/testing/selftests/kvm/lib/assert.c b/tools/testing/selftests/kvm/lib/assert.c new file mode 100644 index 000000000..d30667706 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/assert.c @@ -0,0 +1,92 @@ +/* + * tools/testing/selftests/kvm/lib/assert.c + * + * Copyright (C) 2018, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ + +#define _GNU_SOURCE /* for getline(3) and strchrnul(3)*/ + +#include "test_util.h" + +#include +#include + +#include "../../kselftest.h" + +/* Dumps the current stack trace to stderr. */ +static void __attribute__((noinline)) test_dump_stack(void); +static void test_dump_stack(void) +{ + /* + * Build and run this command: + * + * addr2line -s -e /proc/$PPID/exe -fpai {backtrace addresses} | \ + * grep -v test_dump_stack | cat -n 1>&2 + * + * Note that the spacing is different and there's no newline. + */ + size_t i; + size_t n = 20; + void *stack[n]; + const char *addr2line = "addr2line -s -e /proc/$PPID/exe -fpai"; + const char *pipeline = "|cat -n 1>&2"; + char cmd[strlen(addr2line) + strlen(pipeline) + + /* N bytes per addr * 2 digits per byte + 1 space per addr: */ + n * (((sizeof(void *)) * 2) + 1) + + /* Null terminator: */ + 1]; + char *c; + + n = backtrace(stack, n); + c = &cmd[0]; + c += sprintf(c, "%s", addr2line); + /* + * Skip the first 3 frames: backtrace, test_dump_stack, and + * test_assert. We hope that backtrace isn't inlined and the other two + * we've declared noinline. + */ + for (i = 2; i < n; i++) + c += sprintf(c, " %lx", ((unsigned long) stack[i]) - 1); + c += sprintf(c, "%s", pipeline); +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-result" + system(cmd); +#pragma GCC diagnostic pop +} + +static pid_t _gettid(void) +{ + return syscall(SYS_gettid); +} + +void __attribute__((noinline)) +test_assert(bool exp, const char *exp_str, + const char *file, unsigned int line, const char *fmt, ...) +{ + va_list ap; + + if (!(exp)) { + va_start(ap, fmt); + + fprintf(stderr, "==== Test Assertion Failure ====\n" + " %s:%u: %s\n" + " pid=%d tid=%d - %s\n", + file, line, exp_str, getpid(), _gettid(), + strerror(errno)); + test_dump_stack(); + if (fmt) { + fputs(" ", stderr); + vfprintf(stderr, fmt, ap); + fputs("\n", stderr); + } + va_end(ap); + + if (errno == EACCES) + ksft_exit_skip("Access denied - Exiting.\n"); + exit(254); + } + + return; +} diff --git a/tools/testing/selftests/kvm/lib/elf.c b/tools/testing/selftests/kvm/lib/elf.c new file mode 100644 index 000000000..5eb857584 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/elf.c @@ -0,0 +1,197 @@ +/* + * tools/testing/selftests/kvm/lib/elf.c + * + * Copyright (C) 2018, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ + +#include "test_util.h" + +#include +#include + +#include "kvm_util.h" +#include "kvm_util_internal.h" + +static void elfhdr_get(const char *filename, Elf64_Ehdr *hdrp) +{ + off_t offset_rv; + + /* Open the ELF file. */ + int fd; + fd = open(filename, O_RDONLY); + TEST_ASSERT(fd >= 0, "Failed to open ELF file,\n" + " filename: %s\n" + " rv: %i errno: %i", filename, fd, errno); + + /* Read in and validate ELF Identification Record. + * The ELF Identification record is the first 16 (EI_NIDENT) bytes + * of the ELF header, which is at the beginning of the ELF file. + * For now it is only safe to read the first EI_NIDENT bytes. Once + * read and validated, the value of e_ehsize can be used to determine + * the real size of the ELF header. + */ + unsigned char ident[EI_NIDENT]; + test_read(fd, ident, sizeof(ident)); + TEST_ASSERT((ident[EI_MAG0] == ELFMAG0) && (ident[EI_MAG1] == ELFMAG1) + && (ident[EI_MAG2] == ELFMAG2) && (ident[EI_MAG3] == ELFMAG3), + "ELF MAGIC Mismatch,\n" + " filename: %s\n" + " ident[EI_MAG0 - EI_MAG3]: %02x %02x %02x %02x\n" + " Expected: %02x %02x %02x %02x", + filename, + ident[EI_MAG0], ident[EI_MAG1], ident[EI_MAG2], ident[EI_MAG3], + ELFMAG0, ELFMAG1, ELFMAG2, ELFMAG3); + TEST_ASSERT(ident[EI_CLASS] == ELFCLASS64, + "Current implementation only able to handle ELFCLASS64,\n" + " filename: %s\n" + " ident[EI_CLASS]: %02x\n" + " expected: %02x", + filename, + ident[EI_CLASS], ELFCLASS64); + TEST_ASSERT(((BYTE_ORDER == LITTLE_ENDIAN) + && (ident[EI_DATA] == ELFDATA2LSB)) + || ((BYTE_ORDER == BIG_ENDIAN) + && (ident[EI_DATA] == ELFDATA2MSB)), "Current " + "implementation only able to handle\n" + "cases where the host and ELF file endianness\n" + "is the same:\n" + " host BYTE_ORDER: %u\n" + " host LITTLE_ENDIAN: %u\n" + " host BIG_ENDIAN: %u\n" + " ident[EI_DATA]: %u\n" + " ELFDATA2LSB: %u\n" + " ELFDATA2MSB: %u", + BYTE_ORDER, LITTLE_ENDIAN, BIG_ENDIAN, + ident[EI_DATA], ELFDATA2LSB, ELFDATA2MSB); + TEST_ASSERT(ident[EI_VERSION] == EV_CURRENT, + "Current implementation only able to handle current " + "ELF version,\n" + " filename: %s\n" + " ident[EI_VERSION]: %02x\n" + " expected: %02x", + filename, ident[EI_VERSION], EV_CURRENT); + + /* Read in the ELF header. + * With the ELF Identification portion of the ELF header + * validated, especially that the value at EI_VERSION is + * as expected, it is now safe to read the entire ELF header. + */ + offset_rv = lseek(fd, 0, SEEK_SET); + TEST_ASSERT(offset_rv == 0, "Seek to ELF header failed,\n" + " rv: %zi expected: %i", offset_rv, 0); + test_read(fd, hdrp, sizeof(*hdrp)); + TEST_ASSERT(hdrp->e_phentsize == sizeof(Elf64_Phdr), + "Unexpected physical header size,\n" + " hdrp->e_phentsize: %x\n" + " expected: %zx", + hdrp->e_phentsize, sizeof(Elf64_Phdr)); + TEST_ASSERT(hdrp->e_shentsize == sizeof(Elf64_Shdr), + "Unexpected section header size,\n" + " hdrp->e_shentsize: %x\n" + " expected: %zx", + hdrp->e_shentsize, sizeof(Elf64_Shdr)); +} + +/* VM ELF Load + * + * Input Args: + * filename - Path to ELF file + * + * Output Args: None + * + * Input/Output Args: + * vm - Pointer to opaque type that describes the VM. + * + * Return: None, TEST_ASSERT failures for all error conditions + * + * Loads the program image of the ELF file specified by filename, + * into the virtual address space of the VM pointed to by vm. On entry + * the VM needs to not be using any of the virtual address space used + * by the image and it needs to have sufficient available physical pages, to + * back the virtual pages used to load the image. + */ +void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename, + uint32_t data_memslot, uint32_t pgd_memslot) +{ + off_t offset, offset_rv; + Elf64_Ehdr hdr; + + /* Open the ELF file. */ + int fd; + fd = open(filename, O_RDONLY); + TEST_ASSERT(fd >= 0, "Failed to open ELF file,\n" + " filename: %s\n" + " rv: %i errno: %i", filename, fd, errno); + + /* Read in the ELF header. */ + elfhdr_get(filename, &hdr); + + /* For each program header. + * The following ELF header members specify the location + * and size of the program headers: + * + * e_phoff - File offset to start of program headers + * e_phentsize - Size of each program header + * e_phnum - Number of program header entries + */ + for (unsigned int n1 = 0; n1 < hdr.e_phnum; n1++) { + /* Seek to the beginning of the program header. */ + offset = hdr.e_phoff + (n1 * hdr.e_phentsize); + offset_rv = lseek(fd, offset, SEEK_SET); + TEST_ASSERT(offset_rv == offset, + "Failed to seek to begining of program header %u,\n" + " filename: %s\n" + " rv: %jd errno: %i", + n1, filename, (intmax_t) offset_rv, errno); + + /* Read in the program header. */ + Elf64_Phdr phdr; + test_read(fd, &phdr, sizeof(phdr)); + + /* Skip if this header doesn't describe a loadable segment. */ + if (phdr.p_type != PT_LOAD) + continue; + + /* Allocate memory for this segment within the VM. */ + TEST_ASSERT(phdr.p_memsz > 0, "Unexpected loadable segment " + "memsize of 0,\n" + " phdr index: %u p_memsz: 0x%" PRIx64, + n1, (uint64_t) phdr.p_memsz); + vm_vaddr_t seg_vstart = phdr.p_vaddr; + seg_vstart &= ~(vm_vaddr_t)(vm->page_size - 1); + vm_vaddr_t seg_vend = phdr.p_vaddr + phdr.p_memsz - 1; + seg_vend |= vm->page_size - 1; + size_t seg_size = seg_vend - seg_vstart + 1; + + vm_vaddr_t vaddr = vm_vaddr_alloc(vm, seg_size, seg_vstart, + data_memslot, pgd_memslot); + TEST_ASSERT(vaddr == seg_vstart, "Unable to allocate " + "virtual memory for segment at requested min addr,\n" + " segment idx: %u\n" + " seg_vstart: 0x%lx\n" + " vaddr: 0x%lx", + n1, seg_vstart, vaddr); + memset(addr_gva2hva(vm, vaddr), 0, seg_size); + /* TODO(lhuemill): Set permissions of each memory segment + * based on the least-significant 3 bits of phdr.p_flags. + */ + + /* Load portion of initial state that is contained within + * the ELF file. + */ + if (phdr.p_filesz) { + offset_rv = lseek(fd, phdr.p_offset, SEEK_SET); + TEST_ASSERT(offset_rv == phdr.p_offset, + "Seek to program segment offset failed,\n" + " program header idx: %u errno: %i\n" + " offset_rv: 0x%jx\n" + " expected: 0x%jx\n", + n1, errno, (intmax_t) offset_rv, + (intmax_t) phdr.p_offset); + test_read(fd, addr_gva2hva(vm, phdr.p_vaddr), + phdr.p_filesz); + } + } +} diff --git a/tools/testing/selftests/kvm/lib/io.c b/tools/testing/selftests/kvm/lib/io.c new file mode 100644 index 000000000..cff869ffe --- /dev/null +++ b/tools/testing/selftests/kvm/lib/io.c @@ -0,0 +1,158 @@ +/* + * tools/testing/selftests/kvm/lib/io.c + * + * Copyright (C) 2018, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ + +#include "test_util.h" + +/* Test Write + * + * A wrapper for write(2), that automatically handles the following + * special conditions: + * + * + Interrupted system call (EINTR) + * + Write of less than requested amount + * + Non-block return (EAGAIN) + * + * For each of the above, an additional write is performed to automatically + * continue writing the requested data. + * There are also many cases where write(2) can return an unexpected + * error (e.g. EIO). Such errors cause a TEST_ASSERT failure. + * + * Note, for function signature compatibility with write(2), this function + * returns the number of bytes written, but that value will always be equal + * to the number of requested bytes. All other conditions in this and + * future enhancements to this function either automatically issue another + * write(2) or cause a TEST_ASSERT failure. + * + * Args: + * fd - Opened file descriptor to file to be written. + * count - Number of bytes to write. + * + * Output: + * buf - Starting address of data to be written. + * + * Return: + * On success, number of bytes written. + * On failure, a TEST_ASSERT failure is caused. + */ +ssize_t test_write(int fd, const void *buf, size_t count) +{ + ssize_t rc; + ssize_t num_written = 0; + size_t num_left = count; + const char *ptr = buf; + + /* Note: Count of zero is allowed (see "RETURN VALUE" portion of + * write(2) manpage for details. + */ + TEST_ASSERT(count >= 0, "Unexpected count, count: %li", count); + + do { + rc = write(fd, ptr, num_left); + + switch (rc) { + case -1: + TEST_ASSERT(errno == EAGAIN || errno == EINTR, + "Unexpected write failure,\n" + " rc: %zi errno: %i", rc, errno); + continue; + + case 0: + TEST_ASSERT(false, "Unexpected EOF,\n" + " rc: %zi num_written: %zi num_left: %zu", + rc, num_written, num_left); + break; + + default: + TEST_ASSERT(rc >= 0, "Unexpected ret from write,\n" + " rc: %zi errno: %i", rc, errno); + num_written += rc; + num_left -= rc; + ptr += rc; + break; + } + } while (num_written < count); + + return num_written; +} + +/* Test Read + * + * A wrapper for read(2), that automatically handles the following + * special conditions: + * + * + Interrupted system call (EINTR) + * + Read of less than requested amount + * + Non-block return (EAGAIN) + * + * For each of the above, an additional read is performed to automatically + * continue reading the requested data. + * There are also many cases where read(2) can return an unexpected + * error (e.g. EIO). Such errors cause a TEST_ASSERT failure. Note, + * it is expected that the file opened by fd at the current file position + * contains at least the number of requested bytes to be read. A TEST_ASSERT + * failure is produced if an End-Of-File condition occurs, before all the + * data is read. It is the callers responsibility to assure that sufficient + * data exists. + * + * Note, for function signature compatibility with read(2), this function + * returns the number of bytes read, but that value will always be equal + * to the number of requested bytes. All other conditions in this and + * future enhancements to this function either automatically issue another + * read(2) or cause a TEST_ASSERT failure. + * + * Args: + * fd - Opened file descriptor to file to be read. + * count - Number of bytes to read. + * + * Output: + * buf - Starting address of where to write the bytes read. + * + * Return: + * On success, number of bytes read. + * On failure, a TEST_ASSERT failure is caused. + */ +ssize_t test_read(int fd, void *buf, size_t count) +{ + ssize_t rc; + ssize_t num_read = 0; + size_t num_left = count; + char *ptr = buf; + + /* Note: Count of zero is allowed (see "If count is zero" portion of + * read(2) manpage for details. + */ + TEST_ASSERT(count >= 0, "Unexpected count, count: %li", count); + + do { + rc = read(fd, ptr, num_left); + + switch (rc) { + case -1: + TEST_ASSERT(errno == EAGAIN || errno == EINTR, + "Unexpected read failure,\n" + " rc: %zi errno: %i", rc, errno); + break; + + case 0: + TEST_ASSERT(false, "Unexpected EOF,\n" + " rc: %zi num_read: %zi num_left: %zu", + rc, num_read, num_left); + break; + + default: + TEST_ASSERT(rc > 0, "Unexpected ret from read,\n" + " rc: %zi errno: %i", rc, errno); + num_read += rc; + num_left -= rc; + ptr += rc; + break; + } + } while (num_read < count); + + return num_read; +} diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c new file mode 100644 index 000000000..b138fd5e4 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -0,0 +1,1680 @@ +/* + * tools/testing/selftests/kvm/lib/kvm_util.c + * + * Copyright (C) 2018, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "kvm_util_internal.h" + +#include +#include +#include +#include +#include + +#define KVM_DEV_PATH "/dev/kvm" + +#define KVM_UTIL_PGS_PER_HUGEPG 512 +#define KVM_UTIL_MIN_PADDR 0x2000 + +/* Aligns x up to the next multiple of size. Size must be a power of 2. */ +static void *align(void *x, size_t size) +{ + size_t mask = size - 1; + TEST_ASSERT(size != 0 && !(size & (size - 1)), + "size not a power of 2: %lu", size); + return (void *) (((size_t) x + mask) & ~mask); +} + +/* Capability + * + * Input Args: + * cap - Capability + * + * Output Args: None + * + * Return: + * On success, the Value corresponding to the capability (KVM_CAP_*) + * specified by the value of cap. On failure a TEST_ASSERT failure + * is produced. + * + * Looks up and returns the value corresponding to the capability + * (KVM_CAP_*) given by cap. + */ +int kvm_check_cap(long cap) +{ + int ret; + int kvm_fd; + + kvm_fd = open(KVM_DEV_PATH, O_RDONLY); + if (kvm_fd < 0) + exit(KSFT_SKIP); + + ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap); + TEST_ASSERT(ret >= 0, "KVM_CHECK_EXTENSION IOCTL failed,\n" + " rc: %i errno: %i", ret, errno); + + close(kvm_fd); + + return ret; +} + +/* VM Enable Capability + * + * Input Args: + * vm - Virtual Machine + * cap - Capability + * + * Output Args: None + * + * Return: On success, 0. On failure a TEST_ASSERT failure is produced. + * + * Enables a capability (KVM_CAP_*) on the VM. + */ +int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap) +{ + int ret; + + ret = ioctl(vm->fd, KVM_ENABLE_CAP, cap); + TEST_ASSERT(ret == 0, "KVM_ENABLE_CAP IOCTL failed,\n" + " rc: %i errno: %i", ret, errno); + + return ret; +} + +static void vm_open(struct kvm_vm *vm, int perm) +{ + vm->kvm_fd = open(KVM_DEV_PATH, perm); + if (vm->kvm_fd < 0) + exit(KSFT_SKIP); + + /* Create VM. */ + vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, NULL); + TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, " + "rc: %i errno: %i", vm->fd, errno); +} + +/* VM Create + * + * Input Args: + * mode - VM Mode (e.g. VM_MODE_FLAT48PG) + * phy_pages - Physical memory pages + * perm - permission + * + * Output Args: None + * + * Return: + * Pointer to opaque structure that describes the created VM. + * + * Creates a VM with the mode specified by mode (e.g. VM_MODE_FLAT48PG). + * When phy_pages is non-zero, a memory region of phy_pages physical pages + * is created and mapped starting at guest physical address 0. The file + * descriptor to control the created VM is created with the permissions + * given by perm (e.g. O_RDWR). + */ +struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) +{ + struct kvm_vm *vm; + int kvm_fd; + + /* Allocate memory. */ + vm = calloc(1, sizeof(*vm)); + TEST_ASSERT(vm != NULL, "Insufficent Memory"); + + vm->mode = mode; + vm_open(vm, perm); + + /* Setup mode specific traits. */ + switch (vm->mode) { + case VM_MODE_FLAT48PG: + vm->page_size = 0x1000; + vm->page_shift = 12; + + /* Limit to 48-bit canonical virtual addresses. */ + vm->vpages_valid = sparsebit_alloc(); + sparsebit_set_num(vm->vpages_valid, + 0, (1ULL << (48 - 1)) >> vm->page_shift); + sparsebit_set_num(vm->vpages_valid, + (~((1ULL << (48 - 1)) - 1)) >> vm->page_shift, + (1ULL << (48 - 1)) >> vm->page_shift); + + /* Limit physical addresses to 52-bits. */ + vm->max_gfn = ((1ULL << 52) >> vm->page_shift) - 1; + break; + + default: + TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", mode); + } + + /* Allocate and setup memory for guest. */ + vm->vpages_mapped = sparsebit_alloc(); + if (phy_pages != 0) + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + 0, 0, phy_pages, 0); + + return vm; +} + +/* VM Restart + * + * Input Args: + * vm - VM that has been released before + * perm - permission + * + * Output Args: None + * + * Reopens the file descriptors associated to the VM and reinstates the + * global state, such as the irqchip and the memory regions that are mapped + * into the guest. + */ +void kvm_vm_restart(struct kvm_vm *vmp, int perm) +{ + struct userspace_mem_region *region; + + vm_open(vmp, perm); + if (vmp->has_irqchip) + vm_create_irqchip(vmp); + + for (region = vmp->userspace_mem_region_head; region; + region = region->next) { + int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); + TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" + " rc: %i errno: %i\n" + " slot: %u flags: 0x%x\n" + " guest_phys_addr: 0x%lx size: 0x%lx", + ret, errno, region->region.slot, region->region.flags, + region->region.guest_phys_addr, + region->region.memory_size); + } +} + +void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log) +{ + struct kvm_dirty_log args = { .dirty_bitmap = log, .slot = slot }; + int ret; + + ret = ioctl(vm->fd, KVM_GET_DIRTY_LOG, &args); + TEST_ASSERT(ret == 0, "%s: KVM_GET_DIRTY_LOG failed: %s", + strerror(-ret)); +} + +/* Userspace Memory Region Find + * + * Input Args: + * vm - Virtual Machine + * start - Starting VM physical address + * end - Ending VM physical address, inclusive. + * + * Output Args: None + * + * Return: + * Pointer to overlapping region, NULL if no such region. + * + * Searches for a region with any physical memory that overlaps with + * any portion of the guest physical addresses from start to end + * inclusive. If multiple overlapping regions exist, a pointer to any + * of the regions is returned. Null is returned only when no overlapping + * region exists. + */ +static struct userspace_mem_region *userspace_mem_region_find( + struct kvm_vm *vm, uint64_t start, uint64_t end) +{ + struct userspace_mem_region *region; + + for (region = vm->userspace_mem_region_head; region; + region = region->next) { + uint64_t existing_start = region->region.guest_phys_addr; + uint64_t existing_end = region->region.guest_phys_addr + + region->region.memory_size - 1; + if (start <= existing_end && end >= existing_start) + return region; + } + + return NULL; +} + +/* KVM Userspace Memory Region Find + * + * Input Args: + * vm - Virtual Machine + * start - Starting VM physical address + * end - Ending VM physical address, inclusive. + * + * Output Args: None + * + * Return: + * Pointer to overlapping region, NULL if no such region. + * + * Public interface to userspace_mem_region_find. Allows tests to look up + * the memslot datastructure for a given range of guest physical memory. + */ +struct kvm_userspace_memory_region * +kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, + uint64_t end) +{ + struct userspace_mem_region *region; + + region = userspace_mem_region_find(vm, start, end); + if (!region) + return NULL; + + return ®ion->region; +} + +/* VCPU Find + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * + * Output Args: None + * + * Return: + * Pointer to VCPU structure + * + * Locates a vcpu structure that describes the VCPU specified by vcpuid and + * returns a pointer to it. Returns NULL if the VM doesn't contain a VCPU + * for the specified vcpuid. + */ +struct vcpu *vcpu_find(struct kvm_vm *vm, + uint32_t vcpuid) +{ + struct vcpu *vcpup; + + for (vcpup = vm->vcpu_head; vcpup; vcpup = vcpup->next) { + if (vcpup->id == vcpuid) + return vcpup; + } + + return NULL; +} + +/* VM VCPU Remove + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * + * Output Args: None + * + * Return: None, TEST_ASSERT failures for all error conditions + * + * Within the VM specified by vm, removes the VCPU given by vcpuid. + */ +static void vm_vcpu_rm(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int ret; + + ret = munmap(vcpu->state, sizeof(*vcpu->state)); + TEST_ASSERT(ret == 0, "munmap of VCPU fd failed, rc: %i " + "errno: %i", ret, errno); + close(vcpu->fd); + TEST_ASSERT(ret == 0, "Close of VCPU fd failed, rc: %i " + "errno: %i", ret, errno); + + if (vcpu->next) + vcpu->next->prev = vcpu->prev; + if (vcpu->prev) + vcpu->prev->next = vcpu->next; + else + vm->vcpu_head = vcpu->next; + free(vcpu); +} + +void kvm_vm_release(struct kvm_vm *vmp) +{ + int ret; + + /* Free VCPUs. */ + while (vmp->vcpu_head) + vm_vcpu_rm(vmp, vmp->vcpu_head->id); + + /* Close file descriptor for the VM. */ + ret = close(vmp->fd); + TEST_ASSERT(ret == 0, "Close of vm fd failed,\n" + " vmp->fd: %i rc: %i errno: %i", vmp->fd, ret, errno); + + close(vmp->kvm_fd); + TEST_ASSERT(ret == 0, "Close of /dev/kvm fd failed,\n" + " vmp->kvm_fd: %i rc: %i errno: %i", vmp->kvm_fd, ret, errno); +} + +/* Destroys and frees the VM pointed to by vmp. + */ +void kvm_vm_free(struct kvm_vm *vmp) +{ + int ret; + + if (vmp == NULL) + return; + + /* Free userspace_mem_regions. */ + while (vmp->userspace_mem_region_head) { + struct userspace_mem_region *region + = vmp->userspace_mem_region_head; + + region->region.memory_size = 0; + ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, + ®ion->region); + TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed, " + "rc: %i errno: %i", ret, errno); + + vmp->userspace_mem_region_head = region->next; + sparsebit_free(®ion->unused_phy_pages); + ret = munmap(region->mmap_start, region->mmap_size); + TEST_ASSERT(ret == 0, "munmap failed, rc: %i errno: %i", + ret, errno); + + free(region); + } + + /* Free sparsebit arrays. */ + sparsebit_free(&vmp->vpages_valid); + sparsebit_free(&vmp->vpages_mapped); + + kvm_vm_release(vmp); + + /* Free the structure describing the VM. */ + free(vmp); +} + +/* Memory Compare, host virtual to guest virtual + * + * Input Args: + * hva - Starting host virtual address + * vm - Virtual Machine + * gva - Starting guest virtual address + * len - number of bytes to compare + * + * Output Args: None + * + * Input/Output Args: None + * + * Return: + * Returns 0 if the bytes starting at hva for a length of len + * are equal the guest virtual bytes starting at gva. Returns + * a value < 0, if bytes at hva are less than those at gva. + * Otherwise a value > 0 is returned. + * + * Compares the bytes starting at the host virtual address hva, for + * a length of len, to the guest bytes starting at the guest virtual + * address given by gva. + */ +int kvm_memcmp_hva_gva(void *hva, + struct kvm_vm *vm, vm_vaddr_t gva, size_t len) +{ + size_t amt; + + /* Compare a batch of bytes until either a match is found + * or all the bytes have been compared. + */ + for (uintptr_t offset = 0; offset < len; offset += amt) { + uintptr_t ptr1 = (uintptr_t)hva + offset; + + /* Determine host address for guest virtual address + * at offset. + */ + uintptr_t ptr2 = (uintptr_t)addr_gva2hva(vm, gva + offset); + + /* Determine amount to compare on this pass. + * Don't allow the comparsion to cross a page boundary. + */ + amt = len - offset; + if ((ptr1 >> vm->page_shift) != ((ptr1 + amt) >> vm->page_shift)) + amt = vm->page_size - (ptr1 % vm->page_size); + if ((ptr2 >> vm->page_shift) != ((ptr2 + amt) >> vm->page_shift)) + amt = vm->page_size - (ptr2 % vm->page_size); + + assert((ptr1 >> vm->page_shift) == ((ptr1 + amt - 1) >> vm->page_shift)); + assert((ptr2 >> vm->page_shift) == ((ptr2 + amt - 1) >> vm->page_shift)); + + /* Perform the comparison. If there is a difference + * return that result to the caller, otherwise need + * to continue on looking for a mismatch. + */ + int ret = memcmp((void *)ptr1, (void *)ptr2, amt); + if (ret != 0) + return ret; + } + + /* No mismatch found. Let the caller know the two memory + * areas are equal. + */ + return 0; +} + +/* Allocate an instance of struct kvm_cpuid2 + * + * Input Args: None + * + * Output Args: None + * + * Return: A pointer to the allocated struct. The caller is responsible + * for freeing this struct. + * + * Since kvm_cpuid2 uses a 0-length array to allow a the size of the + * array to be decided at allocation time, allocation is slightly + * complicated. This function uses a reasonable default length for + * the array and performs the appropriate allocation. + */ +static struct kvm_cpuid2 *allocate_kvm_cpuid2(void) +{ + struct kvm_cpuid2 *cpuid; + int nent = 100; + size_t size; + + size = sizeof(*cpuid); + size += nent * sizeof(struct kvm_cpuid_entry2); + cpuid = malloc(size); + if (!cpuid) { + perror("malloc"); + abort(); + } + + cpuid->nent = nent; + + return cpuid; +} + +/* KVM Supported CPUID Get + * + * Input Args: None + * + * Output Args: + * + * Return: The supported KVM CPUID + * + * Get the guest CPUID supported by KVM. + */ +struct kvm_cpuid2 *kvm_get_supported_cpuid(void) +{ + static struct kvm_cpuid2 *cpuid; + int ret; + int kvm_fd; + + if (cpuid) + return cpuid; + + cpuid = allocate_kvm_cpuid2(); + kvm_fd = open(KVM_DEV_PATH, O_RDONLY); + if (kvm_fd < 0) + exit(KSFT_SKIP); + + ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid); + TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_CPUID failed %d %d\n", + ret, errno); + + close(kvm_fd); + return cpuid; +} + +/* Locate a cpuid entry. + * + * Input Args: + * cpuid: The cpuid. + * function: The function of the cpuid entry to find. + * + * Output Args: None + * + * Return: A pointer to the cpuid entry. Never returns NULL. + */ +struct kvm_cpuid_entry2 * +kvm_get_supported_cpuid_index(uint32_t function, uint32_t index) +{ + struct kvm_cpuid2 *cpuid; + struct kvm_cpuid_entry2 *entry = NULL; + int i; + + cpuid = kvm_get_supported_cpuid(); + for (i = 0; i < cpuid->nent; i++) { + if (cpuid->entries[i].function == function && + cpuid->entries[i].index == index) { + entry = &cpuid->entries[i]; + break; + } + } + + TEST_ASSERT(entry, "Guest CPUID entry not found: (EAX=%x, ECX=%x).", + function, index); + return entry; +} + +/* VM Userspace Memory Region Add + * + * Input Args: + * vm - Virtual Machine + * backing_src - Storage source for this region. + * NULL to use anonymous memory. + * guest_paddr - Starting guest physical address + * slot - KVM region slot + * npages - Number of physical pages + * flags - KVM memory region flags (e.g. KVM_MEM_LOG_DIRTY_PAGES) + * + * Output Args: None + * + * Return: None + * + * Allocates a memory area of the number of pages specified by npages + * and maps it to the VM specified by vm, at a starting physical address + * given by guest_paddr. The region is created with a KVM region slot + * given by slot, which must be unique and < KVM_MEM_SLOTS_NUM. The + * region is created with the flags given by flags. + */ +void vm_userspace_mem_region_add(struct kvm_vm *vm, + enum vm_mem_backing_src_type src_type, + uint64_t guest_paddr, uint32_t slot, uint64_t npages, + uint32_t flags) +{ + int ret; + unsigned long pmem_size = 0; + struct userspace_mem_region *region; + size_t huge_page_size = KVM_UTIL_PGS_PER_HUGEPG * vm->page_size; + + TEST_ASSERT((guest_paddr % vm->page_size) == 0, "Guest physical " + "address not on a page boundary.\n" + " guest_paddr: 0x%lx vm->page_size: 0x%x", + guest_paddr, vm->page_size); + TEST_ASSERT((((guest_paddr >> vm->page_shift) + npages) - 1) + <= vm->max_gfn, "Physical range beyond maximum " + "supported physical address,\n" + " guest_paddr: 0x%lx npages: 0x%lx\n" + " vm->max_gfn: 0x%lx vm->page_size: 0x%x", + guest_paddr, npages, vm->max_gfn, vm->page_size); + + /* Confirm a mem region with an overlapping address doesn't + * already exist. + */ + region = (struct userspace_mem_region *) userspace_mem_region_find( + vm, guest_paddr, (guest_paddr + npages * vm->page_size) - 1); + if (region != NULL) + TEST_ASSERT(false, "overlapping userspace_mem_region already " + "exists\n" + " requested guest_paddr: 0x%lx npages: 0x%lx " + "page_size: 0x%x\n" + " existing guest_paddr: 0x%lx size: 0x%lx", + guest_paddr, npages, vm->page_size, + (uint64_t) region->region.guest_phys_addr, + (uint64_t) region->region.memory_size); + + /* Confirm no region with the requested slot already exists. */ + for (region = vm->userspace_mem_region_head; region; + region = region->next) { + if (region->region.slot == slot) + break; + } + if (region != NULL) + TEST_ASSERT(false, "A mem region with the requested slot " + "already exists.\n" + " requested slot: %u paddr: 0x%lx npages: 0x%lx\n" + " existing slot: %u paddr: 0x%lx size: 0x%lx", + slot, guest_paddr, npages, + region->region.slot, + (uint64_t) region->region.guest_phys_addr, + (uint64_t) region->region.memory_size); + + /* Allocate and initialize new mem region structure. */ + region = calloc(1, sizeof(*region)); + TEST_ASSERT(region != NULL, "Insufficient Memory"); + region->mmap_size = npages * vm->page_size; + + /* Enough memory to align up to a huge page. */ + if (src_type == VM_MEM_SRC_ANONYMOUS_THP) + region->mmap_size += huge_page_size; + region->mmap_start = mmap(NULL, region->mmap_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS + | (src_type == VM_MEM_SRC_ANONYMOUS_HUGETLB ? MAP_HUGETLB : 0), + -1, 0); + TEST_ASSERT(region->mmap_start != MAP_FAILED, + "test_malloc failed, mmap_start: %p errno: %i", + region->mmap_start, errno); + + /* Align THP allocation up to start of a huge page. */ + region->host_mem = align(region->mmap_start, + src_type == VM_MEM_SRC_ANONYMOUS_THP ? huge_page_size : 1); + + /* As needed perform madvise */ + if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == VM_MEM_SRC_ANONYMOUS_THP) { + ret = madvise(region->host_mem, npages * vm->page_size, + src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE); + TEST_ASSERT(ret == 0, "madvise failed,\n" + " addr: %p\n" + " length: 0x%lx\n" + " src_type: %x", + region->host_mem, npages * vm->page_size, src_type); + } + + region->unused_phy_pages = sparsebit_alloc(); + sparsebit_set_num(region->unused_phy_pages, + guest_paddr >> vm->page_shift, npages); + region->region.slot = slot; + region->region.flags = flags; + region->region.guest_phys_addr = guest_paddr; + region->region.memory_size = npages * vm->page_size; + region->region.userspace_addr = (uintptr_t) region->host_mem; + ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); + TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" + " rc: %i errno: %i\n" + " slot: %u flags: 0x%x\n" + " guest_phys_addr: 0x%lx size: 0x%lx", + ret, errno, slot, flags, + guest_paddr, (uint64_t) region->region.memory_size); + + /* Add to linked-list of memory regions. */ + if (vm->userspace_mem_region_head) + vm->userspace_mem_region_head->prev = region; + region->next = vm->userspace_mem_region_head; + vm->userspace_mem_region_head = region; +} + +/* Memslot to region + * + * Input Args: + * vm - Virtual Machine + * memslot - KVM memory slot ID + * + * Output Args: None + * + * Return: + * Pointer to memory region structure that describe memory region + * using kvm memory slot ID given by memslot. TEST_ASSERT failure + * on error (e.g. currently no memory region using memslot as a KVM + * memory slot ID). + */ +static struct userspace_mem_region *memslot2region(struct kvm_vm *vm, + uint32_t memslot) +{ + struct userspace_mem_region *region; + + for (region = vm->userspace_mem_region_head; region; + region = region->next) { + if (region->region.slot == memslot) + break; + } + if (region == NULL) { + fprintf(stderr, "No mem region with the requested slot found,\n" + " requested slot: %u\n", memslot); + fputs("---- vm dump ----\n", stderr); + vm_dump(stderr, vm, 2); + TEST_ASSERT(false, "Mem region not found"); + } + + return region; +} + +/* VM Memory Region Flags Set + * + * Input Args: + * vm - Virtual Machine + * flags - Starting guest physical address + * + * Output Args: None + * + * Return: None + * + * Sets the flags of the memory region specified by the value of slot, + * to the values given by flags. + */ +void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags) +{ + int ret; + struct userspace_mem_region *region; + + /* Locate memory region. */ + region = memslot2region(vm, slot); + + region->region.flags = flags; + + ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); + + TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" + " rc: %i errno: %i slot: %u flags: 0x%x", + ret, errno, slot, flags); +} + +/* VCPU mmap Size + * + * Input Args: None + * + * Output Args: None + * + * Return: + * Size of VCPU state + * + * Returns the size of the structure pointed to by the return value + * of vcpu_state(). + */ +static int vcpu_mmap_sz(void) +{ + int dev_fd, ret; + + dev_fd = open(KVM_DEV_PATH, O_RDONLY); + if (dev_fd < 0) + exit(KSFT_SKIP); + + ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL); + TEST_ASSERT(ret >= sizeof(struct kvm_run), + "%s KVM_GET_VCPU_MMAP_SIZE ioctl failed, rc: %i errno: %i", + __func__, ret, errno); + + close(dev_fd); + + return ret; +} + +/* VM VCPU Add + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * + * Output Args: None + * + * Return: None + * + * Creates and adds to the VM specified by vm and virtual CPU with + * the ID given by vcpuid. + */ +void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot, int gdt_memslot) +{ + struct vcpu *vcpu; + + /* Confirm a vcpu with the specified id doesn't already exist. */ + vcpu = vcpu_find(vm, vcpuid); + if (vcpu != NULL) + TEST_ASSERT(false, "vcpu with the specified id " + "already exists,\n" + " requested vcpuid: %u\n" + " existing vcpuid: %u state: %p", + vcpuid, vcpu->id, vcpu->state); + + /* Allocate and initialize new vcpu structure. */ + vcpu = calloc(1, sizeof(*vcpu)); + TEST_ASSERT(vcpu != NULL, "Insufficient Memory"); + vcpu->id = vcpuid; + vcpu->fd = ioctl(vm->fd, KVM_CREATE_VCPU, vcpuid); + TEST_ASSERT(vcpu->fd >= 0, "KVM_CREATE_VCPU failed, rc: %i errno: %i", + vcpu->fd, errno); + + TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->state), "vcpu mmap size " + "smaller than expected, vcpu_mmap_sz: %i expected_min: %zi", + vcpu_mmap_sz(), sizeof(*vcpu->state)); + vcpu->state = (struct kvm_run *) mmap(NULL, sizeof(*vcpu->state), + PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, 0); + TEST_ASSERT(vcpu->state != MAP_FAILED, "mmap vcpu_state failed, " + "vcpu id: %u errno: %i", vcpuid, errno); + + /* Add to linked-list of VCPUs. */ + if (vm->vcpu_head) + vm->vcpu_head->prev = vcpu; + vcpu->next = vm->vcpu_head; + vm->vcpu_head = vcpu; + + vcpu_setup(vm, vcpuid, pgd_memslot, gdt_memslot); +} + +/* VM Virtual Address Unused Gap + * + * Input Args: + * vm - Virtual Machine + * sz - Size (bytes) + * vaddr_min - Minimum Virtual Address + * + * Output Args: None + * + * Return: + * Lowest virtual address at or below vaddr_min, with at least + * sz unused bytes. TEST_ASSERT failure if no area of at least + * size sz is available. + * + * Within the VM specified by vm, locates the lowest starting virtual + * address >= vaddr_min, that has at least sz unallocated bytes. A + * TEST_ASSERT failure occurs for invalid input or no area of at least + * sz unallocated bytes >= vaddr_min is available. + */ +static vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, + vm_vaddr_t vaddr_min) +{ + uint64_t pages = (sz + vm->page_size - 1) >> vm->page_shift; + + /* Determine lowest permitted virtual page index. */ + uint64_t pgidx_start = (vaddr_min + vm->page_size - 1) >> vm->page_shift; + if ((pgidx_start * vm->page_size) < vaddr_min) + goto no_va_found; + + /* Loop over section with enough valid virtual page indexes. */ + if (!sparsebit_is_set_num(vm->vpages_valid, + pgidx_start, pages)) + pgidx_start = sparsebit_next_set_num(vm->vpages_valid, + pgidx_start, pages); + do { + /* + * Are there enough unused virtual pages available at + * the currently proposed starting virtual page index. + * If not, adjust proposed starting index to next + * possible. + */ + if (sparsebit_is_clear_num(vm->vpages_mapped, + pgidx_start, pages)) + goto va_found; + pgidx_start = sparsebit_next_clear_num(vm->vpages_mapped, + pgidx_start, pages); + if (pgidx_start == 0) + goto no_va_found; + + /* + * If needed, adjust proposed starting virtual address, + * to next range of valid virtual addresses. + */ + if (!sparsebit_is_set_num(vm->vpages_valid, + pgidx_start, pages)) { + pgidx_start = sparsebit_next_set_num( + vm->vpages_valid, pgidx_start, pages); + if (pgidx_start == 0) + goto no_va_found; + } + } while (pgidx_start != 0); + +no_va_found: + TEST_ASSERT(false, "No vaddr of specified pages available, " + "pages: 0x%lx", pages); + + /* NOT REACHED */ + return -1; + +va_found: + TEST_ASSERT(sparsebit_is_set_num(vm->vpages_valid, + pgidx_start, pages), + "Unexpected, invalid virtual page index range,\n" + " pgidx_start: 0x%lx\n" + " pages: 0x%lx", + pgidx_start, pages); + TEST_ASSERT(sparsebit_is_clear_num(vm->vpages_mapped, + pgidx_start, pages), + "Unexpected, pages already mapped,\n" + " pgidx_start: 0x%lx\n" + " pages: 0x%lx", + pgidx_start, pages); + + return pgidx_start * vm->page_size; +} + +/* VM Virtual Address Allocate + * + * Input Args: + * vm - Virtual Machine + * sz - Size in bytes + * vaddr_min - Minimum starting virtual address + * data_memslot - Memory region slot for data pages + * pgd_memslot - Memory region slot for new virtual translation tables + * + * Output Args: None + * + * Return: + * Starting guest virtual address + * + * Allocates at least sz bytes within the virtual address space of the vm + * given by vm. The allocated bytes are mapped to a virtual address >= + * the address given by vaddr_min. Note that each allocation uses a + * a unique set of pages, with the minimum real allocation being at least + * a page. + */ +vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, + uint32_t data_memslot, uint32_t pgd_memslot) +{ + uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0); + + virt_pgd_alloc(vm, pgd_memslot); + + /* Find an unused range of virtual page addresses of at least + * pages in length. + */ + vm_vaddr_t vaddr_start = vm_vaddr_unused_gap(vm, sz, vaddr_min); + + /* Map the virtual pages. */ + for (vm_vaddr_t vaddr = vaddr_start; pages > 0; + pages--, vaddr += vm->page_size) { + vm_paddr_t paddr; + + paddr = vm_phy_page_alloc(vm, KVM_UTIL_MIN_PADDR, data_memslot); + + virt_pg_map(vm, vaddr, paddr, pgd_memslot); + + sparsebit_set(vm->vpages_mapped, + vaddr >> vm->page_shift); + } + + return vaddr_start; +} + +/* + * Map a range of VM virtual address to the VM's physical address + * + * Input Args: + * vm - Virtual Machine + * vaddr - Virtuall address to map + * paddr - VM Physical Address + * size - The size of the range to map + * pgd_memslot - Memory region slot for new virtual translation tables + * + * Output Args: None + * + * Return: None + * + * Within the VM given by vm, creates a virtual translation for the + * page range starting at vaddr to the page range starting at paddr. + */ +void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + size_t size, uint32_t pgd_memslot) +{ + size_t page_size = vm->page_size; + size_t npages = size / page_size; + + TEST_ASSERT(vaddr + size > vaddr, "Vaddr overflow"); + TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); + + while (npages--) { + virt_pg_map(vm, vaddr, paddr, pgd_memslot); + vaddr += page_size; + paddr += page_size; + } +} + +/* Address VM Physical to Host Virtual + * + * Input Args: + * vm - Virtual Machine + * gpa - VM physical address + * + * Output Args: None + * + * Return: + * Equivalent host virtual address + * + * Locates the memory region containing the VM physical address given + * by gpa, within the VM given by vm. When found, the host virtual + * address providing the memory to the vm physical address is returned. + * A TEST_ASSERT failure occurs if no region containing gpa exists. + */ +void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) +{ + struct userspace_mem_region *region; + for (region = vm->userspace_mem_region_head; region; + region = region->next) { + if ((gpa >= region->region.guest_phys_addr) + && (gpa <= (region->region.guest_phys_addr + + region->region.memory_size - 1))) + return (void *) ((uintptr_t) region->host_mem + + (gpa - region->region.guest_phys_addr)); + } + + TEST_ASSERT(false, "No vm physical memory at 0x%lx", gpa); + return NULL; +} + +/* Address Host Virtual to VM Physical + * + * Input Args: + * vm - Virtual Machine + * hva - Host virtual address + * + * Output Args: None + * + * Return: + * Equivalent VM physical address + * + * Locates the memory region containing the host virtual address given + * by hva, within the VM given by vm. When found, the equivalent + * VM physical address is returned. A TEST_ASSERT failure occurs if no + * region containing hva exists. + */ +vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva) +{ + struct userspace_mem_region *region; + for (region = vm->userspace_mem_region_head; region; + region = region->next) { + if ((hva >= region->host_mem) + && (hva <= (region->host_mem + + region->region.memory_size - 1))) + return (vm_paddr_t) ((uintptr_t) + region->region.guest_phys_addr + + (hva - (uintptr_t) region->host_mem)); + } + + TEST_ASSERT(false, "No mapping to a guest physical address, " + "hva: %p", hva); + return -1; +} + +/* VM Create IRQ Chip + * + * Input Args: + * vm - Virtual Machine + * + * Output Args: None + * + * Return: None + * + * Creates an interrupt controller chip for the VM specified by vm. + */ +void vm_create_irqchip(struct kvm_vm *vm) +{ + int ret; + + ret = ioctl(vm->fd, KVM_CREATE_IRQCHIP, 0); + TEST_ASSERT(ret == 0, "KVM_CREATE_IRQCHIP IOCTL failed, " + "rc: %i errno: %i", ret, errno); + + vm->has_irqchip = true; +} + +/* VM VCPU State + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * + * Output Args: None + * + * Return: + * Pointer to structure that describes the state of the VCPU. + * + * Locates and returns a pointer to a structure that describes the + * state of the VCPU with the given vcpuid. + */ +struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + return vcpu->state; +} + +/* VM VCPU Run + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * + * Output Args: None + * + * Return: None + * + * Switch to executing the code for the VCPU given by vcpuid, within the VM + * given by vm. + */ +void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) +{ + int ret = _vcpu_run(vm, vcpuid); + TEST_ASSERT(ret == 0, "KVM_RUN IOCTL failed, " + "rc: %i errno: %i", ret, errno); +} + +int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int rc; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + do { + rc = ioctl(vcpu->fd, KVM_RUN, NULL); + } while (rc == -1 && errno == EINTR); + return rc; +} + +/* VM VCPU Set MP State + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * mp_state - mp_state to be set + * + * Output Args: None + * + * Return: None + * + * Sets the MP state of the VCPU given by vcpuid, to the state given + * by mp_state. + */ +void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_mp_state *mp_state) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int ret; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + ret = ioctl(vcpu->fd, KVM_SET_MP_STATE, mp_state); + TEST_ASSERT(ret == 0, "KVM_SET_MP_STATE IOCTL failed, " + "rc: %i errno: %i", ret, errno); +} + +/* VM VCPU Regs Get + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * + * Output Args: + * regs - current state of VCPU regs + * + * Return: None + * + * Obtains the current register state for the VCPU specified by vcpuid + * and stores it at the location given by regs. + */ +void vcpu_regs_get(struct kvm_vm *vm, + uint32_t vcpuid, struct kvm_regs *regs) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int ret; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + /* Get the regs. */ + ret = ioctl(vcpu->fd, KVM_GET_REGS, regs); + TEST_ASSERT(ret == 0, "KVM_GET_REGS failed, rc: %i errno: %i", + ret, errno); +} + +/* VM VCPU Regs Set + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * regs - Values to set VCPU regs to + * + * Output Args: None + * + * Return: None + * + * Sets the regs of the VCPU specified by vcpuid to the values + * given by regs. + */ +void vcpu_regs_set(struct kvm_vm *vm, + uint32_t vcpuid, struct kvm_regs *regs) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int ret; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + /* Set the regs. */ + ret = ioctl(vcpu->fd, KVM_SET_REGS, regs); + TEST_ASSERT(ret == 0, "KVM_SET_REGS failed, rc: %i errno: %i", + ret, errno); +} + +void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_vcpu_events *events) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int ret; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + /* Get the regs. */ + ret = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, events); + TEST_ASSERT(ret == 0, "KVM_GET_VCPU_EVENTS, failed, rc: %i errno: %i", + ret, errno); +} + +void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_vcpu_events *events) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int ret; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + /* Set the regs. */ + ret = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, events); + TEST_ASSERT(ret == 0, "KVM_SET_VCPU_EVENTS, failed, rc: %i errno: %i", + ret, errno); +} + +/* VCPU Get MSR + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * msr_index - Index of MSR + * + * Output Args: None + * + * Return: On success, value of the MSR. On failure a TEST_ASSERT is produced. + * + * Get value of MSR for VCPU. + */ +uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + struct { + struct kvm_msrs header; + struct kvm_msr_entry entry; + } buffer = {}; + int r; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + buffer.header.nmsrs = 1; + buffer.entry.index = msr_index; + r = ioctl(vcpu->fd, KVM_GET_MSRS, &buffer.header); + TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n" + " rc: %i errno: %i", r, errno); + + return buffer.entry.data; +} + +/* VCPU Set MSR + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * msr_index - Index of MSR + * msr_value - New value of MSR + * + * Output Args: None + * + * Return: On success, nothing. On failure a TEST_ASSERT is produced. + * + * Set value of MSR for VCPU. + */ +void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, + uint64_t msr_value) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + struct { + struct kvm_msrs header; + struct kvm_msr_entry entry; + } buffer = {}; + int r; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + memset(&buffer, 0, sizeof(buffer)); + buffer.header.nmsrs = 1; + buffer.entry.index = msr_index; + buffer.entry.data = msr_value; + r = ioctl(vcpu->fd, KVM_SET_MSRS, &buffer.header); + TEST_ASSERT(r == 1, "KVM_SET_MSRS IOCTL failed,\n" + " rc: %i errno: %i", r, errno); +} + +/* VM VCPU Args Set + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * num - number of arguments + * ... - arguments, each of type uint64_t + * + * Output Args: None + * + * Return: None + * + * Sets the first num function input arguments to the values + * given as variable args. Each of the variable args is expected to + * be of type uint64_t. + */ +void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) +{ + va_list ap; + struct kvm_regs regs; + + TEST_ASSERT(num >= 1 && num <= 6, "Unsupported number of args,\n" + " num: %u\n", + num); + + va_start(ap, num); + vcpu_regs_get(vm, vcpuid, ®s); + + if (num >= 1) + regs.rdi = va_arg(ap, uint64_t); + + if (num >= 2) + regs.rsi = va_arg(ap, uint64_t); + + if (num >= 3) + regs.rdx = va_arg(ap, uint64_t); + + if (num >= 4) + regs.rcx = va_arg(ap, uint64_t); + + if (num >= 5) + regs.r8 = va_arg(ap, uint64_t); + + if (num >= 6) + regs.r9 = va_arg(ap, uint64_t); + + vcpu_regs_set(vm, vcpuid, ®s); + va_end(ap); +} + +/* VM VCPU System Regs Get + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * + * Output Args: + * sregs - current state of VCPU system regs + * + * Return: None + * + * Obtains the current system register state for the VCPU specified by + * vcpuid and stores it at the location given by sregs. + */ +void vcpu_sregs_get(struct kvm_vm *vm, + uint32_t vcpuid, struct kvm_sregs *sregs) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int ret; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + /* Get the regs. */ + /* Get the regs. */ + ret = ioctl(vcpu->fd, KVM_GET_SREGS, sregs); + TEST_ASSERT(ret == 0, "KVM_GET_SREGS failed, rc: %i errno: %i", + ret, errno); +} + +/* VM VCPU System Regs Set + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * sregs - Values to set VCPU system regs to + * + * Output Args: None + * + * Return: None + * + * Sets the system regs of the VCPU specified by vcpuid to the values + * given by sregs. + */ +void vcpu_sregs_set(struct kvm_vm *vm, + uint32_t vcpuid, struct kvm_sregs *sregs) +{ + int ret = _vcpu_sregs_set(vm, vcpuid, sregs); + TEST_ASSERT(ret == 0, "KVM_RUN IOCTL failed, " + "rc: %i errno: %i", ret, errno); +} + +int _vcpu_sregs_set(struct kvm_vm *vm, + uint32_t vcpuid, struct kvm_sregs *sregs) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int ret; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + /* Get the regs. */ + return ioctl(vcpu->fd, KVM_SET_SREGS, sregs); +} + +/* VCPU Ioctl + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * cmd - Ioctl number + * arg - Argument to pass to the ioctl + * + * Return: None + * + * Issues an arbitrary ioctl on a VCPU fd. + */ +void vcpu_ioctl(struct kvm_vm *vm, + uint32_t vcpuid, unsigned long cmd, void *arg) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int ret; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + ret = ioctl(vcpu->fd, cmd, arg); + TEST_ASSERT(ret == 0, "vcpu ioctl %lu failed, rc: %i errno: %i (%s)", + cmd, ret, errno, strerror(errno)); +} + +/* VM Ioctl + * + * Input Args: + * vm - Virtual Machine + * cmd - Ioctl number + * arg - Argument to pass to the ioctl + * + * Return: None + * + * Issues an arbitrary ioctl on a VM fd. + */ +void vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) +{ + int ret; + + ret = ioctl(vm->fd, cmd, arg); + TEST_ASSERT(ret == 0, "vm ioctl %lu failed, rc: %i errno: %i (%s)", + cmd, ret, errno, strerror(errno)); +} + +/* VM Dump + * + * Input Args: + * vm - Virtual Machine + * indent - Left margin indent amount + * + * Output Args: + * stream - Output FILE stream + * + * Return: None + * + * Dumps the current state of the VM given by vm, to the FILE stream + * given by stream. + */ +void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +{ + struct userspace_mem_region *region; + struct vcpu *vcpu; + + fprintf(stream, "%*smode: 0x%x\n", indent, "", vm->mode); + fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd); + fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size); + fprintf(stream, "%*sMem Regions:\n", indent, ""); + for (region = vm->userspace_mem_region_head; region; + region = region->next) { + fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx " + "host_virt: %p\n", indent + 2, "", + (uint64_t) region->region.guest_phys_addr, + (uint64_t) region->region.memory_size, + region->host_mem); + fprintf(stream, "%*sunused_phy_pages: ", indent + 2, ""); + sparsebit_dump(stream, region->unused_phy_pages, 0); + } + fprintf(stream, "%*sMapped Virtual Pages:\n", indent, ""); + sparsebit_dump(stream, vm->vpages_mapped, indent + 2); + fprintf(stream, "%*spgd_created: %u\n", indent, "", + vm->pgd_created); + if (vm->pgd_created) { + fprintf(stream, "%*sVirtual Translation Tables:\n", + indent + 2, ""); + virt_dump(stream, vm, indent + 4); + } + fprintf(stream, "%*sVCPUs:\n", indent, ""); + for (vcpu = vm->vcpu_head; vcpu; vcpu = vcpu->next) + vcpu_dump(stream, vm, vcpu->id, indent + 2); +} + +/* VM VCPU Dump + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * indent - Left margin indent amount + * + * Output Args: + * stream - Output FILE stream + * + * Return: None + * + * Dumps the current state of the VCPU specified by vcpuid, within the VM + * given by vm, to the FILE stream given by stream. + */ +void vcpu_dump(FILE *stream, struct kvm_vm *vm, + uint32_t vcpuid, uint8_t indent) +{ + struct kvm_regs regs; + struct kvm_sregs sregs; + + fprintf(stream, "%*scpuid: %u\n", indent, "", vcpuid); + + fprintf(stream, "%*sregs:\n", indent + 2, ""); + vcpu_regs_get(vm, vcpuid, ®s); + regs_dump(stream, ®s, indent + 4); + + fprintf(stream, "%*ssregs:\n", indent + 2, ""); + vcpu_sregs_get(vm, vcpuid, &sregs); + sregs_dump(stream, &sregs, indent + 4); +} + +/* Known KVM exit reasons */ +static struct exit_reason { + unsigned int reason; + const char *name; +} exit_reasons_known[] = { + {KVM_EXIT_UNKNOWN, "UNKNOWN"}, + {KVM_EXIT_EXCEPTION, "EXCEPTION"}, + {KVM_EXIT_IO, "IO"}, + {KVM_EXIT_HYPERCALL, "HYPERCALL"}, + {KVM_EXIT_DEBUG, "DEBUG"}, + {KVM_EXIT_HLT, "HLT"}, + {KVM_EXIT_MMIO, "MMIO"}, + {KVM_EXIT_IRQ_WINDOW_OPEN, "IRQ_WINDOW_OPEN"}, + {KVM_EXIT_SHUTDOWN, "SHUTDOWN"}, + {KVM_EXIT_FAIL_ENTRY, "FAIL_ENTRY"}, + {KVM_EXIT_INTR, "INTR"}, + {KVM_EXIT_SET_TPR, "SET_TPR"}, + {KVM_EXIT_TPR_ACCESS, "TPR_ACCESS"}, + {KVM_EXIT_S390_SIEIC, "S390_SIEIC"}, + {KVM_EXIT_S390_RESET, "S390_RESET"}, + {KVM_EXIT_DCR, "DCR"}, + {KVM_EXIT_NMI, "NMI"}, + {KVM_EXIT_INTERNAL_ERROR, "INTERNAL_ERROR"}, + {KVM_EXIT_OSI, "OSI"}, + {KVM_EXIT_PAPR_HCALL, "PAPR_HCALL"}, +#ifdef KVM_EXIT_MEMORY_NOT_PRESENT + {KVM_EXIT_MEMORY_NOT_PRESENT, "MEMORY_NOT_PRESENT"}, +#endif +}; + +/* Exit Reason String + * + * Input Args: + * exit_reason - Exit reason + * + * Output Args: None + * + * Return: + * Constant string pointer describing the exit reason. + * + * Locates and returns a constant string that describes the KVM exit + * reason given by exit_reason. If no such string is found, a constant + * string of "Unknown" is returned. + */ +const char *exit_reason_str(unsigned int exit_reason) +{ + unsigned int n1; + + for (n1 = 0; n1 < ARRAY_SIZE(exit_reasons_known); n1++) { + if (exit_reason == exit_reasons_known[n1].reason) + return exit_reasons_known[n1].name; + } + + return "Unknown"; +} + +/* Physical Page Allocate + * + * Input Args: + * vm - Virtual Machine + * paddr_min - Physical address minimum + * memslot - Memory region to allocate page from + * + * Output Args: None + * + * Return: + * Starting physical address + * + * Within the VM specified by vm, locates an available physical page + * at or above paddr_min. If found, the page is marked as in use + * and its address is returned. A TEST_ASSERT failure occurs if no + * page is available at or above paddr_min. + */ +vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, + vm_paddr_t paddr_min, uint32_t memslot) +{ + struct userspace_mem_region *region; + sparsebit_idx_t pg; + + TEST_ASSERT((paddr_min % vm->page_size) == 0, "Min physical address " + "not divisible by page size.\n" + " paddr_min: 0x%lx page_size: 0x%x", + paddr_min, vm->page_size); + + /* Locate memory region. */ + region = memslot2region(vm, memslot); + + /* Locate next available physical page at or above paddr_min. */ + pg = paddr_min >> vm->page_shift; + + if (!sparsebit_is_set(region->unused_phy_pages, pg)) { + pg = sparsebit_next_set(region->unused_phy_pages, pg); + if (pg == 0) { + fprintf(stderr, "No guest physical page available, " + "paddr_min: 0x%lx page_size: 0x%x memslot: %u", + paddr_min, vm->page_size, memslot); + fputs("---- vm dump ----\n", stderr); + vm_dump(stderr, vm, 2); + abort(); + } + } + + /* Specify page as in use and return its address. */ + sparsebit_clear(region->unused_phy_pages, pg); + + return pg * vm->page_size; +} + +/* Address Guest Virtual to Host Virtual + * + * Input Args: + * vm - Virtual Machine + * gva - VM virtual address + * + * Output Args: None + * + * Return: + * Equivalent host virtual address + */ +void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva) +{ + return addr_gpa2hva(vm, addr_gva2gpa(vm, gva)); +} + +void guest_args_read(struct kvm_vm *vm, uint32_t vcpu_id, + struct guest_args *args) +{ + struct kvm_run *run = vcpu_state(vm, vcpu_id); + struct kvm_regs regs; + + memset(®s, 0, sizeof(regs)); + vcpu_regs_get(vm, vcpu_id, ®s); + + args->port = run->io.port; + args->arg0 = regs.rdi; + args->arg1 = regs.rsi; +} diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h new file mode 100644 index 000000000..542ed606b --- /dev/null +++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h @@ -0,0 +1,72 @@ +/* + * tools/testing/selftests/kvm/lib/kvm_util.c + * + * Copyright (C) 2018, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ + +#ifndef KVM_UTIL_INTERNAL_H +#define KVM_UTIL_INTERNAL_H 1 + +#include "sparsebit.h" + +#ifndef BITS_PER_BYTE +#define BITS_PER_BYTE 8 +#endif + +#ifndef BITS_PER_LONG +#define BITS_PER_LONG (BITS_PER_BYTE * sizeof(long)) +#endif + +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) + +/* Concrete definition of struct kvm_vm. */ +struct userspace_mem_region { + struct userspace_mem_region *next, *prev; + struct kvm_userspace_memory_region region; + struct sparsebit *unused_phy_pages; + int fd; + off_t offset; + void *host_mem; + void *mmap_start; + size_t mmap_size; +}; + +struct vcpu { + struct vcpu *next, *prev; + uint32_t id; + int fd; + struct kvm_run *state; +}; + +struct kvm_vm { + int mode; + int kvm_fd; + int fd; + unsigned int page_size; + unsigned int page_shift; + uint64_t max_gfn; + struct vcpu *vcpu_head; + struct userspace_mem_region *userspace_mem_region_head; + struct sparsebit *vpages_valid; + struct sparsebit *vpages_mapped; + + bool has_irqchip; + bool pgd_created; + vm_paddr_t pgd; + vm_vaddr_t gdt; + vm_vaddr_t tss; +}; + +struct vcpu *vcpu_find(struct kvm_vm *vm, + uint32_t vcpuid); +void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot); +void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); +void regs_dump(FILE *stream, struct kvm_regs *regs, + uint8_t indent); +void sregs_dump(FILE *stream, struct kvm_sregs *sregs, + uint8_t indent); + +#endif diff --git a/tools/testing/selftests/kvm/lib/sparsebit.c b/tools/testing/selftests/kvm/lib/sparsebit.c new file mode 100644 index 000000000..b132bc95d --- /dev/null +++ b/tools/testing/selftests/kvm/lib/sparsebit.c @@ -0,0 +1,2087 @@ +/* + * Sparse bit array + * + * Copyright (C) 2018, Google LLC. + * Copyright (C) 2018, Red Hat, Inc. (code style cleanup and fuzzing driver) + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * This library provides functions to support a memory efficient bit array, + * with an index size of 2^64. A sparsebit array is allocated through + * the use sparsebit_alloc() and free'd via sparsebit_free(), + * such as in the following: + * + * struct sparsebit *s; + * s = sparsebit_alloc(); + * sparsebit_free(&s); + * + * The struct sparsebit type resolves down to a struct sparsebit. + * Note that, sparsebit_free() takes a pointer to the sparsebit + * structure. This is so that sparsebit_free() is able to poison + * the pointer (e.g. set it to NULL) to the struct sparsebit before + * returning to the caller. + * + * Between the return of sparsebit_alloc() and the call of + * sparsebit_free(), there are multiple query and modifying operations + * that can be performed on the allocated sparsebit array. All of + * these operations take as a parameter the value returned from + * sparsebit_alloc() and most also take a bit index. Frequently + * used routines include: + * + * ---- Query Operations + * sparsebit_is_set(s, idx) + * sparsebit_is_clear(s, idx) + * sparsebit_any_set(s) + * sparsebit_first_set(s) + * sparsebit_next_set(s, prev_idx) + * + * ---- Modifying Operations + * sparsebit_set(s, idx) + * sparsebit_clear(s, idx) + * sparsebit_set_num(s, idx, num); + * sparsebit_clear_num(s, idx, num); + * + * A common operation, is to itterate over all the bits set in a test + * sparsebit array. This can be done via code with the following structure: + * + * sparsebit_idx_t idx; + * if (sparsebit_any_set(s)) { + * idx = sparsebit_first_set(s); + * do { + * ... + * idx = sparsebit_next_set(s, idx); + * } while (idx != 0); + * } + * + * The index of the first bit set needs to be obtained via + * sparsebit_first_set(), because sparsebit_next_set(), needs + * the index of the previously set. The sparsebit_idx_t type is + * unsigned, so there is no previous index before 0 that is available. + * Also, the call to sparsebit_first_set() is not made unless there + * is at least 1 bit in the array set. This is because sparsebit_first_set() + * aborts if sparsebit_first_set() is called with no bits set. + * It is the callers responsibility to assure that the + * sparsebit array has at least a single bit set before calling + * sparsebit_first_set(). + * + * ==== Implementation Overview ==== + * For the most part the internal implementation of sparsebit is + * opaque to the caller. One important implementation detail that the + * caller may need to be aware of is the spatial complexity of the + * implementation. This implementation of a sparsebit array is not + * only sparse, in that it uses memory proportional to the number of bits + * set. It is also efficient in memory usage when most of the bits are + * set. + * + * At a high-level the state of the bit settings are maintained through + * the use of a binary-search tree, where each node contains at least + * the following members: + * + * typedef uint64_t sparsebit_idx_t; + * typedef uint64_t sparsebit_num_t; + * + * sparsebit_idx_t idx; + * uint32_t mask; + * sparsebit_num_t num_after; + * + * The idx member contains the bit index of the first bit described by this + * node, while the mask member stores the setting of the first 32-bits. + * The setting of the bit at idx + n, where 0 <= n < 32, is located in the + * mask member at 1 << n. + * + * Nodes are sorted by idx and the bits described by two nodes will never + * overlap. The idx member is always aligned to the mask size, i.e. a + * multiple of 32. + * + * Beyond a typical implementation, the nodes in this implementation also + * contains a member named num_after. The num_after member holds the + * number of bits immediately after the mask bits that are contiguously set. + * The use of the num_after member allows this implementation to efficiently + * represent cases where most bits are set. For example, the case of all + * but the last two bits set, is represented by the following two nodes: + * + * node 0 - idx: 0x0 mask: 0xffffffff num_after: 0xffffffffffffffc0 + * node 1 - idx: 0xffffffffffffffe0 mask: 0x3fffffff num_after: 0 + * + * ==== Invariants ==== + * This implementation usses the following invariants: + * + * + Node are only used to represent bits that are set. + * Nodes with a mask of 0 and num_after of 0 are not allowed. + * + * + Sum of bits set in all the nodes is equal to the value of + * the struct sparsebit_pvt num_set member. + * + * + The setting of at least one bit is always described in a nodes + * mask (mask >= 1). + * + * + A node with all mask bits set only occurs when the last bit + * described by the previous node is not equal to this nodes + * starting index - 1. All such occurences of this condition are + * avoided by moving the setting of the nodes mask bits into + * the previous nodes num_after setting. + * + * + Node starting index is evenly divisible by the number of bits + * within a nodes mask member. + * + * + Nodes never represent a range of bits that wrap around the + * highest supported index. + * + * (idx + MASK_BITS + num_after - 1) <= ((sparsebit_idx_t) 0) - 1) + * + * As a consequence of the above, the num_after member of a node + * will always be <=: + * + * maximum_index - nodes_starting_index - number_of_mask_bits + * + * + Nodes within the binary search tree are sorted based on each + * nodes starting index. + * + * + The range of bits described by any two nodes do not overlap. The + * range of bits described by a single node is: + * + * start: node->idx + * end (inclusive): node->idx + MASK_BITS + node->num_after - 1; + * + * Note, at times these invariants are temporarily violated for a + * specific portion of the code. For example, when setting a mask + * bit, there is a small delay between when the mask bit is set and the + * value in the struct sparsebit_pvt num_set member is updated. Other + * temporary violations occur when node_split() is called with a specified + * index and assures that a node where its mask represents the bit + * at the specified index exists. At times to do this node_split() + * must split an existing node into two nodes or create a node that + * has no bits set. Such temporary violations must be corrected before + * returning to the caller. These corrections are typically performed + * by the local function node_reduce(). + */ + +#include "test_util.h" +#include "sparsebit.h" +#include +#include + +#define DUMP_LINE_MAX 100 /* Does not include indent amount */ + +typedef uint32_t mask_t; +#define MASK_BITS (sizeof(mask_t) * CHAR_BIT) + +struct node { + struct node *parent; + struct node *left; + struct node *right; + sparsebit_idx_t idx; /* index of least-significant bit in mask */ + sparsebit_num_t num_after; /* num contiguously set after mask */ + mask_t mask; +}; + +struct sparsebit { + /* + * Points to root node of the binary search + * tree. Equal to NULL when no bits are set in + * the entire sparsebit array. + */ + struct node *root; + + /* + * A redundant count of the total number of bits set. Used for + * diagnostic purposes and to change the time complexity of + * sparsebit_num_set() from O(n) to O(1). + * Note: Due to overflow, a value of 0 means none or all set. + */ + sparsebit_num_t num_set; +}; + +/* Returns the number of set bits described by the settings + * of the node pointed to by nodep. + */ +static sparsebit_num_t node_num_set(struct node *nodep) +{ + return nodep->num_after + __builtin_popcount(nodep->mask); +} + +/* Returns a pointer to the node that describes the + * lowest bit index. + */ +static struct node *node_first(struct sparsebit *s) +{ + struct node *nodep; + + for (nodep = s->root; nodep && nodep->left; nodep = nodep->left) + ; + + return nodep; +} + +/* Returns a pointer to the node that describes the + * lowest bit index > the index of the node pointed to by np. + * Returns NULL if no node with a higher index exists. + */ +static struct node *node_next(struct sparsebit *s, struct node *np) +{ + struct node *nodep = np; + + /* + * If current node has a right child, next node is the left-most + * of the right child. + */ + if (nodep->right) { + for (nodep = nodep->right; nodep->left; nodep = nodep->left) + ; + return nodep; + } + + /* + * No right child. Go up until node is left child of a parent. + * That parent is then the next node. + */ + while (nodep->parent && nodep == nodep->parent->right) + nodep = nodep->parent; + + return nodep->parent; +} + +/* Searches for and returns a pointer to the node that describes the + * highest index < the index of the node pointed to by np. + * Returns NULL if no node with a lower index exists. + */ +static struct node *node_prev(struct sparsebit *s, struct node *np) +{ + struct node *nodep = np; + + /* + * If current node has a left child, next node is the right-most + * of the left child. + */ + if (nodep->left) { + for (nodep = nodep->left; nodep->right; nodep = nodep->right) + ; + return (struct node *) nodep; + } + + /* + * No left child. Go up until node is right child of a parent. + * That parent is then the next node. + */ + while (nodep->parent && nodep == nodep->parent->left) + nodep = nodep->parent; + + return (struct node *) nodep->parent; +} + + +/* Allocates space to hold a copy of the node sub-tree pointed to by + * subtree and duplicates the bit settings to the newly allocated nodes. + * Returns the newly allocated copy of subtree. + */ +static struct node *node_copy_subtree(struct node *subtree) +{ + struct node *root; + + /* Duplicate the node at the root of the subtree */ + root = calloc(1, sizeof(*root)); + if (!root) { + perror("calloc"); + abort(); + } + + root->idx = subtree->idx; + root->mask = subtree->mask; + root->num_after = subtree->num_after; + + /* As needed, recursively duplicate the left and right subtrees */ + if (subtree->left) { + root->left = node_copy_subtree(subtree->left); + root->left->parent = root; + } + + if (subtree->right) { + root->right = node_copy_subtree(subtree->right); + root->right->parent = root; + } + + return root; +} + +/* Searches for and returns a pointer to the node that describes the setting + * of the bit given by idx. A node describes the setting of a bit if its + * index is within the bits described by the mask bits or the number of + * contiguous bits set after the mask. Returns NULL if there is no such node. + */ +static struct node *node_find(struct sparsebit *s, sparsebit_idx_t idx) +{ + struct node *nodep; + + /* Find the node that describes the setting of the bit at idx */ + for (nodep = s->root; nodep; + nodep = nodep->idx > idx ? nodep->left : nodep->right) { + if (idx >= nodep->idx && + idx <= nodep->idx + MASK_BITS + nodep->num_after - 1) + break; + } + + return nodep; +} + +/* Entry Requirements: + * + A node that describes the setting of idx is not already present. + * + * Adds a new node to describe the setting of the bit at the index given + * by idx. Returns a pointer to the newly added node. + * + * TODO(lhuemill): Degenerate cases causes the tree to get unbalanced. + */ +static struct node *node_add(struct sparsebit *s, sparsebit_idx_t idx) +{ + struct node *nodep, *parentp, *prev; + + /* Allocate and initialize the new node. */ + nodep = calloc(1, sizeof(*nodep)); + if (!nodep) { + perror("calloc"); + abort(); + } + + nodep->idx = idx & -MASK_BITS; + + /* If no nodes, set it up as the root node. */ + if (!s->root) { + s->root = nodep; + return nodep; + } + + /* + * Find the parent where the new node should be attached + * and add the node there. + */ + parentp = s->root; + while (true) { + if (idx < parentp->idx) { + if (!parentp->left) { + parentp->left = nodep; + nodep->parent = parentp; + break; + } + parentp = parentp->left; + } else { + assert(idx > parentp->idx + MASK_BITS + parentp->num_after - 1); + if (!parentp->right) { + parentp->right = nodep; + nodep->parent = parentp; + break; + } + parentp = parentp->right; + } + } + + /* + * Does num_after bits of previous node overlap with the mask + * of the new node? If so set the bits in the new nodes mask + * and reduce the previous nodes num_after. + */ + prev = node_prev(s, nodep); + while (prev && prev->idx + MASK_BITS + prev->num_after - 1 >= nodep->idx) { + unsigned int n1 = (prev->idx + MASK_BITS + prev->num_after - 1) + - nodep->idx; + assert(prev->num_after > 0); + assert(n1 < MASK_BITS); + assert(!(nodep->mask & (1 << n1))); + nodep->mask |= (1 << n1); + prev->num_after--; + } + + return nodep; +} + +/* Returns whether all the bits in the sparsebit array are set. */ +bool sparsebit_all_set(struct sparsebit *s) +{ + /* + * If any nodes there must be at least one bit set. Only case + * where a bit is set and total num set is 0, is when all bits + * are set. + */ + return s->root && s->num_set == 0; +} + +/* Clears all bits described by the node pointed to by nodep, then + * removes the node. + */ +static void node_rm(struct sparsebit *s, struct node *nodep) +{ + struct node *tmp; + sparsebit_num_t num_set; + + num_set = node_num_set(nodep); + assert(s->num_set >= num_set || sparsebit_all_set(s)); + s->num_set -= node_num_set(nodep); + + /* Have both left and right child */ + if (nodep->left && nodep->right) { + /* + * Move left children to the leftmost leaf node + * of the right child. + */ + for (tmp = nodep->right; tmp->left; tmp = tmp->left) + ; + tmp->left = nodep->left; + nodep->left = NULL; + tmp->left->parent = tmp; + } + + /* Left only child */ + if (nodep->left) { + if (!nodep->parent) { + s->root = nodep->left; + nodep->left->parent = NULL; + } else { + nodep->left->parent = nodep->parent; + if (nodep == nodep->parent->left) + nodep->parent->left = nodep->left; + else { + assert(nodep == nodep->parent->right); + nodep->parent->right = nodep->left; + } + } + + nodep->parent = nodep->left = nodep->right = NULL; + free(nodep); + + return; + } + + + /* Right only child */ + if (nodep->right) { + if (!nodep->parent) { + s->root = nodep->right; + nodep->right->parent = NULL; + } else { + nodep->right->parent = nodep->parent; + if (nodep == nodep->parent->left) + nodep->parent->left = nodep->right; + else { + assert(nodep == nodep->parent->right); + nodep->parent->right = nodep->right; + } + } + + nodep->parent = nodep->left = nodep->right = NULL; + free(nodep); + + return; + } + + /* Leaf Node */ + if (!nodep->parent) { + s->root = NULL; + } else { + if (nodep->parent->left == nodep) + nodep->parent->left = NULL; + else { + assert(nodep == nodep->parent->right); + nodep->parent->right = NULL; + } + } + + nodep->parent = nodep->left = nodep->right = NULL; + free(nodep); + + return; +} + +/* Splits the node containing the bit at idx so that there is a node + * that starts at the specified index. If no such node exists, a new + * node at the specified index is created. Returns the new node. + * + * idx must start of a mask boundary. + */ +static struct node *node_split(struct sparsebit *s, sparsebit_idx_t idx) +{ + struct node *nodep1, *nodep2; + sparsebit_idx_t offset; + sparsebit_num_t orig_num_after; + + assert(!(idx % MASK_BITS)); + + /* + * Is there a node that describes the setting of idx? + * If not, add it. + */ + nodep1 = node_find(s, idx); + if (!nodep1) + return node_add(s, idx); + + /* + * All done if the starting index of the node is where the + * split should occur. + */ + if (nodep1->idx == idx) + return nodep1; + + /* + * Split point not at start of mask, so it must be part of + * bits described by num_after. + */ + + /* + * Calculate offset within num_after for where the split is + * to occur. + */ + offset = idx - (nodep1->idx + MASK_BITS); + orig_num_after = nodep1->num_after; + + /* + * Add a new node to describe the bits starting at + * the split point. + */ + nodep1->num_after = offset; + nodep2 = node_add(s, idx); + + /* Move bits after the split point into the new node */ + nodep2->num_after = orig_num_after - offset; + if (nodep2->num_after >= MASK_BITS) { + nodep2->mask = ~(mask_t) 0; + nodep2->num_after -= MASK_BITS; + } else { + nodep2->mask = (1 << nodep2->num_after) - 1; + nodep2->num_after = 0; + } + + return nodep2; +} + +/* Iteratively reduces the node pointed to by nodep and its adjacent + * nodes into a more compact form. For example, a node with a mask with + * all bits set adjacent to a previous node, will get combined into a + * single node with an increased num_after setting. + * + * After each reduction, a further check is made to see if additional + * reductions are possible with the new previous and next nodes. Note, + * a search for a reduction is only done across the nodes nearest nodep + * and those that became part of a reduction. Reductions beyond nodep + * and the adjacent nodes that are reduced are not discovered. It is the + * responsibility of the caller to pass a nodep that is within one node + * of each possible reduction. + * + * This function does not fix the temporary violation of all invariants. + * For example it does not fix the case where the bit settings described + * by two or more nodes overlap. Such a violation introduces the potential + * complication of a bit setting for a specific index having different settings + * in different nodes. This would then introduce the further complication + * of which node has the correct setting of the bit and thus such conditions + * are not allowed. + * + * This function is designed to fix invariant violations that are introduced + * by node_split() and by changes to the nodes mask or num_after members. + * For example, when setting a bit within a nodes mask, the function that + * sets the bit doesn't have to worry about whether the setting of that + * bit caused the mask to have leading only or trailing only bits set. + * Instead, the function can call node_reduce(), with nodep equal to the + * node address that it set a mask bit in, and node_reduce() will notice + * the cases of leading or trailing only bits and that there is an + * adjacent node that the bit settings could be merged into. + * + * This implementation specifically detects and corrects violation of the + * following invariants: + * + * + Node are only used to represent bits that are set. + * Nodes with a mask of 0 and num_after of 0 are not allowed. + * + * + The setting of at least one bit is always described in a nodes + * mask (mask >= 1). + * + * + A node with all mask bits set only occurs when the last bit + * described by the previous node is not equal to this nodes + * starting index - 1. All such occurences of this condition are + * avoided by moving the setting of the nodes mask bits into + * the previous nodes num_after setting. + */ +static void node_reduce(struct sparsebit *s, struct node *nodep) +{ + bool reduction_performed; + + do { + reduction_performed = false; + struct node *prev, *next, *tmp; + + /* 1) Potential reductions within the current node. */ + + /* Nodes with all bits cleared may be removed. */ + if (nodep->mask == 0 && nodep->num_after == 0) { + /* + * About to remove the node pointed to by + * nodep, which normally would cause a problem + * for the next pass through the reduction loop, + * because the node at the starting point no longer + * exists. This potential problem is handled + * by first remembering the location of the next + * or previous nodes. Doesn't matter which, because + * once the node at nodep is removed, there will be + * no other nodes between prev and next. + * + * Note, the checks performed on nodep against both + * both prev and next both check for an adjacent + * node that can be reduced into a single node. As + * such, after removing the node at nodep, doesn't + * matter whether the nodep for the next pass + * through the loop is equal to the previous pass + * prev or next node. Either way, on the next pass + * the one not selected will become either the + * prev or next node. + */ + tmp = node_next(s, nodep); + if (!tmp) + tmp = node_prev(s, nodep); + + node_rm(s, nodep); + nodep = NULL; + + nodep = tmp; + reduction_performed = true; + continue; + } + + /* + * When the mask is 0, can reduce the amount of num_after + * bits by moving the initial num_after bits into the mask. + */ + if (nodep->mask == 0) { + assert(nodep->num_after != 0); + assert(nodep->idx + MASK_BITS > nodep->idx); + + nodep->idx += MASK_BITS; + + if (nodep->num_after >= MASK_BITS) { + nodep->mask = ~0; + nodep->num_after -= MASK_BITS; + } else { + nodep->mask = (1u << nodep->num_after) - 1; + nodep->num_after = 0; + } + + reduction_performed = true; + continue; + } + + /* + * 2) Potential reductions between the current and + * previous nodes. + */ + prev = node_prev(s, nodep); + if (prev) { + sparsebit_idx_t prev_highest_bit; + + /* Nodes with no bits set can be removed. */ + if (prev->mask == 0 && prev->num_after == 0) { + node_rm(s, prev); + + reduction_performed = true; + continue; + } + + /* + * All mask bits set and previous node has + * adjacent index. + */ + if (nodep->mask + 1 == 0 && + prev->idx + MASK_BITS == nodep->idx) { + prev->num_after += MASK_BITS + nodep->num_after; + nodep->mask = 0; + nodep->num_after = 0; + + reduction_performed = true; + continue; + } + + /* + * Is node adjacent to previous node and the node + * contains a single contiguous range of bits + * starting from the beginning of the mask? + */ + prev_highest_bit = prev->idx + MASK_BITS - 1 + prev->num_after; + if (prev_highest_bit + 1 == nodep->idx && + (nodep->mask | (nodep->mask >> 1)) == nodep->mask) { + /* + * How many contiguous bits are there? + * Is equal to the total number of set + * bits, due to an earlier check that + * there is a single contiguous range of + * set bits. + */ + unsigned int num_contiguous + = __builtin_popcount(nodep->mask); + assert((num_contiguous > 0) && + ((1ULL << num_contiguous) - 1) == nodep->mask); + + prev->num_after += num_contiguous; + nodep->mask = 0; + + /* + * For predictable performance, handle special + * case where all mask bits are set and there + * is a non-zero num_after setting. This code + * is functionally correct without the following + * conditionalized statements, but without them + * the value of num_after is only reduced by + * the number of mask bits per pass. There are + * cases where num_after can be close to 2^64. + * Without this code it could take nearly + * (2^64) / 32 passes to perform the full + * reduction. + */ + if (num_contiguous == MASK_BITS) { + prev->num_after += nodep->num_after; + nodep->num_after = 0; + } + + reduction_performed = true; + continue; + } + } + + /* + * 3) Potential reductions between the current and + * next nodes. + */ + next = node_next(s, nodep); + if (next) { + /* Nodes with no bits set can be removed. */ + if (next->mask == 0 && next->num_after == 0) { + node_rm(s, next); + reduction_performed = true; + continue; + } + + /* + * Is next node index adjacent to current node + * and has a mask with all bits set? + */ + if (next->idx == nodep->idx + MASK_BITS + nodep->num_after && + next->mask == ~(mask_t) 0) { + nodep->num_after += MASK_BITS; + next->mask = 0; + nodep->num_after += next->num_after; + next->num_after = 0; + + node_rm(s, next); + next = NULL; + + reduction_performed = true; + continue; + } + } + } while (nodep && reduction_performed); +} + +/* Returns whether the bit at the index given by idx, within the + * sparsebit array is set or not. + */ +bool sparsebit_is_set(struct sparsebit *s, sparsebit_idx_t idx) +{ + struct node *nodep; + + /* Find the node that describes the setting of the bit at idx */ + for (nodep = s->root; nodep; + nodep = nodep->idx > idx ? nodep->left : nodep->right) + if (idx >= nodep->idx && + idx <= nodep->idx + MASK_BITS + nodep->num_after - 1) + goto have_node; + + return false; + +have_node: + /* Bit is set if it is any of the bits described by num_after */ + if (nodep->num_after && idx >= nodep->idx + MASK_BITS) + return true; + + /* Is the corresponding mask bit set */ + assert(idx >= nodep->idx && idx - nodep->idx < MASK_BITS); + return !!(nodep->mask & (1 << (idx - nodep->idx))); +} + +/* Within the sparsebit array pointed to by s, sets the bit + * at the index given by idx. + */ +static void bit_set(struct sparsebit *s, sparsebit_idx_t idx) +{ + struct node *nodep; + + /* Skip bits that are already set */ + if (sparsebit_is_set(s, idx)) + return; + + /* + * Get a node where the bit at idx is described by the mask. + * The node_split will also create a node, if there isn't + * already a node that describes the setting of bit. + */ + nodep = node_split(s, idx & -MASK_BITS); + + /* Set the bit within the nodes mask */ + assert(idx >= nodep->idx && idx <= nodep->idx + MASK_BITS - 1); + assert(!(nodep->mask & (1 << (idx - nodep->idx)))); + nodep->mask |= 1 << (idx - nodep->idx); + s->num_set++; + + node_reduce(s, nodep); +} + +/* Within the sparsebit array pointed to by s, clears the bit + * at the index given by idx. + */ +static void bit_clear(struct sparsebit *s, sparsebit_idx_t idx) +{ + struct node *nodep; + + /* Skip bits that are already cleared */ + if (!sparsebit_is_set(s, idx)) + return; + + /* Is there a node that describes the setting of this bit? */ + nodep = node_find(s, idx); + if (!nodep) + return; + + /* + * If a num_after bit, split the node, so that the bit is + * part of a node mask. + */ + if (idx >= nodep->idx + MASK_BITS) + nodep = node_split(s, idx & -MASK_BITS); + + /* + * After node_split above, bit at idx should be within the mask. + * Clear that bit. + */ + assert(idx >= nodep->idx && idx <= nodep->idx + MASK_BITS - 1); + assert(nodep->mask & (1 << (idx - nodep->idx))); + nodep->mask &= ~(1 << (idx - nodep->idx)); + assert(s->num_set > 0 || sparsebit_all_set(s)); + s->num_set--; + + node_reduce(s, nodep); +} + +/* Recursively dumps to the FILE stream given by stream the contents + * of the sub-tree of nodes pointed to by nodep. Each line of output + * is prefixed by the number of spaces given by indent. On each + * recursion, the indent amount is increased by 2. This causes nodes + * at each level deeper into the binary search tree to be displayed + * with a greater indent. + */ +static void dump_nodes(FILE *stream, struct node *nodep, + unsigned int indent) +{ + char *node_type; + + /* Dump contents of node */ + if (!nodep->parent) + node_type = "root"; + else if (nodep == nodep->parent->left) + node_type = "left"; + else { + assert(nodep == nodep->parent->right); + node_type = "right"; + } + fprintf(stream, "%*s---- %s nodep: %p\n", indent, "", node_type, nodep); + fprintf(stream, "%*s parent: %p left: %p right: %p\n", indent, "", + nodep->parent, nodep->left, nodep->right); + fprintf(stream, "%*s idx: 0x%lx mask: 0x%x num_after: 0x%lx\n", + indent, "", nodep->idx, nodep->mask, nodep->num_after); + + /* If present, dump contents of left child nodes */ + if (nodep->left) + dump_nodes(stream, nodep->left, indent + 2); + + /* If present, dump contents of right child nodes */ + if (nodep->right) + dump_nodes(stream, nodep->right, indent + 2); +} + +static inline sparsebit_idx_t node_first_set(struct node *nodep, int start) +{ + mask_t leading = (mask_t)1 << start; + int n1 = __builtin_ctz(nodep->mask & -leading); + + return nodep->idx + n1; +} + +static inline sparsebit_idx_t node_first_clear(struct node *nodep, int start) +{ + mask_t leading = (mask_t)1 << start; + int n1 = __builtin_ctz(~nodep->mask & -leading); + + return nodep->idx + n1; +} + +/* Dumps to the FILE stream specified by stream, the implementation dependent + * internal state of s. Each line of output is prefixed with the number + * of spaces given by indent. The output is completely implementation + * dependent and subject to change. Output from this function should only + * be used for diagnostic purposes. For example, this function can be + * used by test cases after they detect an unexpected condition, as a means + * to capture diagnostic information. + */ +static void sparsebit_dump_internal(FILE *stream, struct sparsebit *s, + unsigned int indent) +{ + /* Dump the contents of s */ + fprintf(stream, "%*sroot: %p\n", indent, "", s->root); + fprintf(stream, "%*snum_set: 0x%lx\n", indent, "", s->num_set); + + if (s->root) + dump_nodes(stream, s->root, indent); +} + +/* Allocates and returns a new sparsebit array. The initial state + * of the newly allocated sparsebit array has all bits cleared. + */ +struct sparsebit *sparsebit_alloc(void) +{ + struct sparsebit *s; + + /* Allocate top level structure. */ + s = calloc(1, sizeof(*s)); + if (!s) { + perror("calloc"); + abort(); + } + + return s; +} + +/* Frees the implementation dependent data for the sparsebit array + * pointed to by s and poisons the pointer to that data. + */ +void sparsebit_free(struct sparsebit **sbitp) +{ + struct sparsebit *s = *sbitp; + + if (!s) + return; + + sparsebit_clear_all(s); + free(s); + *sbitp = NULL; +} + +/* Makes a copy of the sparsebit array given by s, to the sparsebit + * array given by d. Note, d must have already been allocated via + * sparsebit_alloc(). It can though already have bits set, which + * if different from src will be cleared. + */ +void sparsebit_copy(struct sparsebit *d, struct sparsebit *s) +{ + /* First clear any bits already set in the destination */ + sparsebit_clear_all(d); + + if (s->root) { + d->root = node_copy_subtree(s->root); + d->num_set = s->num_set; + } +} + +/* Returns whether num consecutive bits starting at idx are all set. */ +bool sparsebit_is_set_num(struct sparsebit *s, + sparsebit_idx_t idx, sparsebit_num_t num) +{ + sparsebit_idx_t next_cleared; + + assert(num > 0); + assert(idx + num - 1 >= idx); + + /* With num > 0, the first bit must be set. */ + if (!sparsebit_is_set(s, idx)) + return false; + + /* Find the next cleared bit */ + next_cleared = sparsebit_next_clear(s, idx); + + /* + * If no cleared bits beyond idx, then there are at least num + * set bits. idx + num doesn't wrap. Otherwise check if + * there are enough set bits between idx and the next cleared bit. + */ + return next_cleared == 0 || next_cleared - idx >= num; +} + +/* Returns whether the bit at the index given by idx. */ +bool sparsebit_is_clear(struct sparsebit *s, + sparsebit_idx_t idx) +{ + return !sparsebit_is_set(s, idx); +} + +/* Returns whether num consecutive bits starting at idx are all cleared. */ +bool sparsebit_is_clear_num(struct sparsebit *s, + sparsebit_idx_t idx, sparsebit_num_t num) +{ + sparsebit_idx_t next_set; + + assert(num > 0); + assert(idx + num - 1 >= idx); + + /* With num > 0, the first bit must be cleared. */ + if (!sparsebit_is_clear(s, idx)) + return false; + + /* Find the next set bit */ + next_set = sparsebit_next_set(s, idx); + + /* + * If no set bits beyond idx, then there are at least num + * cleared bits. idx + num doesn't wrap. Otherwise check if + * there are enough cleared bits between idx and the next set bit. + */ + return next_set == 0 || next_set - idx >= num; +} + +/* Returns the total number of bits set. Note: 0 is also returned for + * the case of all bits set. This is because with all bits set, there + * is 1 additional bit set beyond what can be represented in the return + * value. Use sparsebit_any_set(), instead of sparsebit_num_set() > 0, + * to determine if the sparsebit array has any bits set. + */ +sparsebit_num_t sparsebit_num_set(struct sparsebit *s) +{ + return s->num_set; +} + +/* Returns whether any bit is set in the sparsebit array. */ +bool sparsebit_any_set(struct sparsebit *s) +{ + /* + * Nodes only describe set bits. If any nodes then there + * is at least 1 bit set. + */ + if (!s->root) + return false; + + /* + * Every node should have a non-zero mask. For now will + * just assure that the root node has a non-zero mask, + * which is a quick check that at least 1 bit is set. + */ + assert(s->root->mask != 0); + assert(s->num_set > 0 || + (s->root->num_after == ((sparsebit_num_t) 0) - MASK_BITS && + s->root->mask == ~(mask_t) 0)); + + return true; +} + +/* Returns whether all the bits in the sparsebit array are cleared. */ +bool sparsebit_all_clear(struct sparsebit *s) +{ + return !sparsebit_any_set(s); +} + +/* Returns whether all the bits in the sparsebit array are set. */ +bool sparsebit_any_clear(struct sparsebit *s) +{ + return !sparsebit_all_set(s); +} + +/* Returns the index of the first set bit. Abort if no bits are set. + */ +sparsebit_idx_t sparsebit_first_set(struct sparsebit *s) +{ + struct node *nodep; + + /* Validate at least 1 bit is set */ + assert(sparsebit_any_set(s)); + + nodep = node_first(s); + return node_first_set(nodep, 0); +} + +/* Returns the index of the first cleared bit. Abort if + * no bits are cleared. + */ +sparsebit_idx_t sparsebit_first_clear(struct sparsebit *s) +{ + struct node *nodep1, *nodep2; + + /* Validate at least 1 bit is cleared. */ + assert(sparsebit_any_clear(s)); + + /* If no nodes or first node index > 0 then lowest cleared is 0 */ + nodep1 = node_first(s); + if (!nodep1 || nodep1->idx > 0) + return 0; + + /* Does the mask in the first node contain any cleared bits. */ + if (nodep1->mask != ~(mask_t) 0) + return node_first_clear(nodep1, 0); + + /* + * All mask bits set in first node. If there isn't a second node + * then the first cleared bit is the first bit after the bits + * described by the first node. + */ + nodep2 = node_next(s, nodep1); + if (!nodep2) { + /* + * No second node. First cleared bit is first bit beyond + * bits described by first node. + */ + assert(nodep1->mask == ~(mask_t) 0); + assert(nodep1->idx + MASK_BITS + nodep1->num_after != (sparsebit_idx_t) 0); + return nodep1->idx + MASK_BITS + nodep1->num_after; + } + + /* + * There is a second node. + * If it is not adjacent to the first node, then there is a gap + * of cleared bits between the nodes, and the first cleared bit + * is the first bit within the gap. + */ + if (nodep1->idx + MASK_BITS + nodep1->num_after != nodep2->idx) + return nodep1->idx + MASK_BITS + nodep1->num_after; + + /* + * Second node is adjacent to the first node. + * Because it is adjacent, its mask should be non-zero. If all + * its mask bits are set, then with it being adjacent, it should + * have had the mask bits moved into the num_after setting of the + * previous node. + */ + return node_first_clear(nodep2, 0); +} + +/* Returns index of next bit set within s after the index given by prev. + * Returns 0 if there are no bits after prev that are set. + */ +sparsebit_idx_t sparsebit_next_set(struct sparsebit *s, + sparsebit_idx_t prev) +{ + sparsebit_idx_t lowest_possible = prev + 1; + sparsebit_idx_t start; + struct node *nodep; + + /* A bit after the highest index can't be set. */ + if (lowest_possible == 0) + return 0; + + /* + * Find the leftmost 'candidate' overlapping or to the right + * of lowest_possible. + */ + struct node *candidate = NULL; + + /* True iff lowest_possible is within candidate */ + bool contains = false; + + /* + * Find node that describes setting of bit at lowest_possible. + * If such a node doesn't exist, find the node with the lowest + * starting index that is > lowest_possible. + */ + for (nodep = s->root; nodep;) { + if ((nodep->idx + MASK_BITS + nodep->num_after - 1) + >= lowest_possible) { + candidate = nodep; + if (candidate->idx <= lowest_possible) { + contains = true; + break; + } + nodep = nodep->left; + } else { + nodep = nodep->right; + } + } + if (!candidate) + return 0; + + assert(candidate->mask != 0); + + /* Does the candidate node describe the setting of lowest_possible? */ + if (!contains) { + /* + * Candidate doesn't describe setting of bit at lowest_possible. + * Candidate points to the first node with a starting index + * > lowest_possible. + */ + assert(candidate->idx > lowest_possible); + + return node_first_set(candidate, 0); + } + + /* + * Candidate describes setting of bit at lowest_possible. + * Note: although the node describes the setting of the bit + * at lowest_possible, its possible that its setting and the + * setting of all latter bits described by this node are 0. + * For now, just handle the cases where this node describes + * a bit at or after an index of lowest_possible that is set. + */ + start = lowest_possible - candidate->idx; + + if (start < MASK_BITS && candidate->mask >= (1 << start)) + return node_first_set(candidate, start); + + if (candidate->num_after) { + sparsebit_idx_t first_num_after_idx = candidate->idx + MASK_BITS; + + return lowest_possible < first_num_after_idx + ? first_num_after_idx : lowest_possible; + } + + /* + * Although candidate node describes setting of bit at + * the index of lowest_possible, all bits at that index and + * latter that are described by candidate are cleared. With + * this, the next bit is the first bit in the next node, if + * such a node exists. If a next node doesn't exist, then + * there is no next set bit. + */ + candidate = node_next(s, candidate); + if (!candidate) + return 0; + + return node_first_set(candidate, 0); +} + +/* Returns index of next bit cleared within s after the index given by prev. + * Returns 0 if there are no bits after prev that are cleared. + */ +sparsebit_idx_t sparsebit_next_clear(struct sparsebit *s, + sparsebit_idx_t prev) +{ + sparsebit_idx_t lowest_possible = prev + 1; + sparsebit_idx_t idx; + struct node *nodep1, *nodep2; + + /* A bit after the highest index can't be set. */ + if (lowest_possible == 0) + return 0; + + /* + * Does a node describing the setting of lowest_possible exist? + * If not, the bit at lowest_possible is cleared. + */ + nodep1 = node_find(s, lowest_possible); + if (!nodep1) + return lowest_possible; + + /* Does a mask bit in node 1 describe the next cleared bit. */ + for (idx = lowest_possible - nodep1->idx; idx < MASK_BITS; idx++) + if (!(nodep1->mask & (1 << idx))) + return nodep1->idx + idx; + + /* + * Next cleared bit is not described by node 1. If there + * isn't a next node, then next cleared bit is described + * by bit after the bits described by the first node. + */ + nodep2 = node_next(s, nodep1); + if (!nodep2) + return nodep1->idx + MASK_BITS + nodep1->num_after; + + /* + * There is a second node. + * If it is not adjacent to the first node, then there is a gap + * of cleared bits between the nodes, and the next cleared bit + * is the first bit within the gap. + */ + if (nodep1->idx + MASK_BITS + nodep1->num_after != nodep2->idx) + return nodep1->idx + MASK_BITS + nodep1->num_after; + + /* + * Second node is adjacent to the first node. + * Because it is adjacent, its mask should be non-zero. If all + * its mask bits are set, then with it being adjacent, it should + * have had the mask bits moved into the num_after setting of the + * previous node. + */ + return node_first_clear(nodep2, 0); +} + +/* Starting with the index 1 greater than the index given by start, finds + * and returns the index of the first sequence of num consecutively set + * bits. Returns a value of 0 of no such sequence exists. + */ +sparsebit_idx_t sparsebit_next_set_num(struct sparsebit *s, + sparsebit_idx_t start, sparsebit_num_t num) +{ + sparsebit_idx_t idx; + + assert(num >= 1); + + for (idx = sparsebit_next_set(s, start); + idx != 0 && idx + num - 1 >= idx; + idx = sparsebit_next_set(s, idx)) { + assert(sparsebit_is_set(s, idx)); + + /* + * Does the sequence of bits starting at idx consist of + * num set bits? + */ + if (sparsebit_is_set_num(s, idx, num)) + return idx; + + /* + * Sequence of set bits at idx isn't large enough. + * Skip this entire sequence of set bits. + */ + idx = sparsebit_next_clear(s, idx); + if (idx == 0) + return 0; + } + + return 0; +} + +/* Starting with the index 1 greater than the index given by start, finds + * and returns the index of the first sequence of num consecutively cleared + * bits. Returns a value of 0 of no such sequence exists. + */ +sparsebit_idx_t sparsebit_next_clear_num(struct sparsebit *s, + sparsebit_idx_t start, sparsebit_num_t num) +{ + sparsebit_idx_t idx; + + assert(num >= 1); + + for (idx = sparsebit_next_clear(s, start); + idx != 0 && idx + num - 1 >= idx; + idx = sparsebit_next_clear(s, idx)) { + assert(sparsebit_is_clear(s, idx)); + + /* + * Does the sequence of bits starting at idx consist of + * num cleared bits? + */ + if (sparsebit_is_clear_num(s, idx, num)) + return idx; + + /* + * Sequence of cleared bits at idx isn't large enough. + * Skip this entire sequence of cleared bits. + */ + idx = sparsebit_next_set(s, idx); + if (idx == 0) + return 0; + } + + return 0; +} + +/* Sets the bits * in the inclusive range idx through idx + num - 1. */ +void sparsebit_set_num(struct sparsebit *s, + sparsebit_idx_t start, sparsebit_num_t num) +{ + struct node *nodep, *next; + unsigned int n1; + sparsebit_idx_t idx; + sparsebit_num_t n; + sparsebit_idx_t middle_start, middle_end; + + assert(num > 0); + assert(start + num - 1 >= start); + + /* + * Leading - bits before first mask boundary. + * + * TODO(lhuemill): With some effort it may be possible to + * replace the following loop with a sequential sequence + * of statements. High level sequence would be: + * + * 1. Use node_split() to force node that describes setting + * of idx to be within the mask portion of a node. + * 2. Form mask of bits to be set. + * 3. Determine number of mask bits already set in the node + * and store in a local variable named num_already_set. + * 4. Set the appropriate mask bits within the node. + * 5. Increment struct sparsebit_pvt num_set member + * by the number of bits that were actually set. + * Exclude from the counts bits that were already set. + * 6. Before returning to the caller, use node_reduce() to + * handle the multiple corner cases that this method + * introduces. + */ + for (idx = start, n = num; n > 0 && idx % MASK_BITS != 0; idx++, n--) + bit_set(s, idx); + + /* Middle - bits spanning one or more entire mask */ + middle_start = idx; + middle_end = middle_start + (n & -MASK_BITS) - 1; + if (n >= MASK_BITS) { + nodep = node_split(s, middle_start); + + /* + * As needed, split just after end of middle bits. + * No split needed if end of middle bits is at highest + * supported bit index. + */ + if (middle_end + 1 > middle_end) + (void) node_split(s, middle_end + 1); + + /* Delete nodes that only describe bits within the middle. */ + for (next = node_next(s, nodep); + next && (next->idx < middle_end); + next = node_next(s, nodep)) { + assert(next->idx + MASK_BITS + next->num_after - 1 <= middle_end); + node_rm(s, next); + next = NULL; + } + + /* As needed set each of the mask bits */ + for (n1 = 0; n1 < MASK_BITS; n1++) { + if (!(nodep->mask & (1 << n1))) { + nodep->mask |= 1 << n1; + s->num_set++; + } + } + + s->num_set -= nodep->num_after; + nodep->num_after = middle_end - middle_start + 1 - MASK_BITS; + s->num_set += nodep->num_after; + + node_reduce(s, nodep); + } + idx = middle_end + 1; + n -= middle_end - middle_start + 1; + + /* Trailing - bits at and beyond last mask boundary */ + assert(n < MASK_BITS); + for (; n > 0; idx++, n--) + bit_set(s, idx); +} + +/* Clears the bits * in the inclusive range idx through idx + num - 1. */ +void sparsebit_clear_num(struct sparsebit *s, + sparsebit_idx_t start, sparsebit_num_t num) +{ + struct node *nodep, *next; + unsigned int n1; + sparsebit_idx_t idx; + sparsebit_num_t n; + sparsebit_idx_t middle_start, middle_end; + + assert(num > 0); + assert(start + num - 1 >= start); + + /* Leading - bits before first mask boundary */ + for (idx = start, n = num; n > 0 && idx % MASK_BITS != 0; idx++, n--) + bit_clear(s, idx); + + /* Middle - bits spanning one or more entire mask */ + middle_start = idx; + middle_end = middle_start + (n & -MASK_BITS) - 1; + if (n >= MASK_BITS) { + nodep = node_split(s, middle_start); + + /* + * As needed, split just after end of middle bits. + * No split needed if end of middle bits is at highest + * supported bit index. + */ + if (middle_end + 1 > middle_end) + (void) node_split(s, middle_end + 1); + + /* Delete nodes that only describe bits within the middle. */ + for (next = node_next(s, nodep); + next && (next->idx < middle_end); + next = node_next(s, nodep)) { + assert(next->idx + MASK_BITS + next->num_after - 1 <= middle_end); + node_rm(s, next); + next = NULL; + } + + /* As needed clear each of the mask bits */ + for (n1 = 0; n1 < MASK_BITS; n1++) { + if (nodep->mask & (1 << n1)) { + nodep->mask &= ~(1 << n1); + s->num_set--; + } + } + + /* Clear any bits described by num_after */ + s->num_set -= nodep->num_after; + nodep->num_after = 0; + + /* + * Delete the node that describes the beginning of + * the middle bits and perform any allowed reductions + * with the nodes prev or next of nodep. + */ + node_reduce(s, nodep); + nodep = NULL; + } + idx = middle_end + 1; + n -= middle_end - middle_start + 1; + + /* Trailing - bits at and beyond last mask boundary */ + assert(n < MASK_BITS); + for (; n > 0; idx++, n--) + bit_clear(s, idx); +} + +/* Sets the bit at the index given by idx. */ +void sparsebit_set(struct sparsebit *s, sparsebit_idx_t idx) +{ + sparsebit_set_num(s, idx, 1); +} + +/* Clears the bit at the index given by idx. */ +void sparsebit_clear(struct sparsebit *s, sparsebit_idx_t idx) +{ + sparsebit_clear_num(s, idx, 1); +} + +/* Sets the bits in the entire addressable range of the sparsebit array. */ +void sparsebit_set_all(struct sparsebit *s) +{ + sparsebit_set(s, 0); + sparsebit_set_num(s, 1, ~(sparsebit_idx_t) 0); + assert(sparsebit_all_set(s)); +} + +/* Clears the bits in the entire addressable range of the sparsebit array. */ +void sparsebit_clear_all(struct sparsebit *s) +{ + sparsebit_clear(s, 0); + sparsebit_clear_num(s, 1, ~(sparsebit_idx_t) 0); + assert(!sparsebit_any_set(s)); +} + +static size_t display_range(FILE *stream, sparsebit_idx_t low, + sparsebit_idx_t high, bool prepend_comma_space) +{ + char *fmt_str; + size_t sz; + + /* Determine the printf format string */ + if (low == high) + fmt_str = prepend_comma_space ? ", 0x%lx" : "0x%lx"; + else + fmt_str = prepend_comma_space ? ", 0x%lx:0x%lx" : "0x%lx:0x%lx"; + + /* + * When stream is NULL, just determine the size of what would + * have been printed, else print the range. + */ + if (!stream) + sz = snprintf(NULL, 0, fmt_str, low, high); + else + sz = fprintf(stream, fmt_str, low, high); + + return sz; +} + + +/* Dumps to the FILE stream given by stream, the bit settings + * of s. Each line of output is prefixed with the number of + * spaces given by indent. The length of each line is implementation + * dependent and does not depend on the indent amount. The following + * is an example output of a sparsebit array that has bits: + * + * 0x5, 0x8, 0xa:0xe, 0x12 + * + * This corresponds to a sparsebit whose bits 5, 8, 10, 11, 12, 13, 14, 18 + * are set. Note that a ':', instead of a '-' is used to specify a range of + * contiguous bits. This is done because '-' is used to specify command-line + * options, and sometimes ranges are specified as command-line arguments. + */ +void sparsebit_dump(FILE *stream, struct sparsebit *s, + unsigned int indent) +{ + size_t current_line_len = 0; + size_t sz; + struct node *nodep; + + if (!sparsebit_any_set(s)) + return; + + /* Display initial indent */ + fprintf(stream, "%*s", indent, ""); + + /* For each node */ + for (nodep = node_first(s); nodep; nodep = node_next(s, nodep)) { + unsigned int n1; + sparsebit_idx_t low, high; + + /* For each group of bits in the mask */ + for (n1 = 0; n1 < MASK_BITS; n1++) { + if (nodep->mask & (1 << n1)) { + low = high = nodep->idx + n1; + + for (; n1 < MASK_BITS; n1++) { + if (nodep->mask & (1 << n1)) + high = nodep->idx + n1; + else + break; + } + + if ((n1 == MASK_BITS) && nodep->num_after) + high += nodep->num_after; + + /* + * How much room will it take to display + * this range. + */ + sz = display_range(NULL, low, high, + current_line_len != 0); + + /* + * If there is not enough room, display + * a newline plus the indent of the next + * line. + */ + if (current_line_len + sz > DUMP_LINE_MAX) { + fputs("\n", stream); + fprintf(stream, "%*s", indent, ""); + current_line_len = 0; + } + + /* Display the range */ + sz = display_range(stream, low, high, + current_line_len != 0); + current_line_len += sz; + } + } + + /* + * If num_after and most significant-bit of mask is not + * set, then still need to display a range for the bits + * described by num_after. + */ + if (!(nodep->mask & (1 << (MASK_BITS - 1))) && nodep->num_after) { + low = nodep->idx + MASK_BITS; + high = nodep->idx + MASK_BITS + nodep->num_after - 1; + + /* + * How much room will it take to display + * this range. + */ + sz = display_range(NULL, low, high, + current_line_len != 0); + + /* + * If there is not enough room, display + * a newline plus the indent of the next + * line. + */ + if (current_line_len + sz > DUMP_LINE_MAX) { + fputs("\n", stream); + fprintf(stream, "%*s", indent, ""); + current_line_len = 0; + } + + /* Display the range */ + sz = display_range(stream, low, high, + current_line_len != 0); + current_line_len += sz; + } + } + fputs("\n", stream); +} + +/* Validates the internal state of the sparsebit array given by + * s. On error, diagnostic information is printed to stderr and + * abort is called. + */ +void sparsebit_validate_internal(struct sparsebit *s) +{ + bool error_detected = false; + struct node *nodep, *prev = NULL; + sparsebit_num_t total_bits_set = 0; + unsigned int n1; + + /* For each node */ + for (nodep = node_first(s); nodep; + prev = nodep, nodep = node_next(s, nodep)) { + + /* + * Increase total bits set by the number of bits set + * in this node. + */ + for (n1 = 0; n1 < MASK_BITS; n1++) + if (nodep->mask & (1 << n1)) + total_bits_set++; + + total_bits_set += nodep->num_after; + + /* + * Arbitrary choice as to whether a mask of 0 is allowed + * or not. For diagnostic purposes it is beneficial to + * have only one valid means to represent a set of bits. + * To support this an arbitrary choice has been made + * to not allow a mask of zero. + */ + if (nodep->mask == 0) { + fprintf(stderr, "Node mask of zero, " + "nodep: %p nodep->mask: 0x%x", + nodep, nodep->mask); + error_detected = true; + break; + } + + /* + * Validate num_after is not greater than the max index + * - the number of mask bits. The num_after member + * uses 0-based indexing and thus has no value that + * represents all bits set. This limitation is handled + * by requiring a non-zero mask. With a non-zero mask, + * MASK_BITS worth of bits are described by the mask, + * which makes the largest needed num_after equal to: + * + * (~(sparsebit_num_t) 0) - MASK_BITS + 1 + */ + if (nodep->num_after + > (~(sparsebit_num_t) 0) - MASK_BITS + 1) { + fprintf(stderr, "num_after too large, " + "nodep: %p nodep->num_after: 0x%lx", + nodep, nodep->num_after); + error_detected = true; + break; + } + + /* Validate node index is divisible by the mask size */ + if (nodep->idx % MASK_BITS) { + fprintf(stderr, "Node index not divisible by " + "mask size,\n" + " nodep: %p nodep->idx: 0x%lx " + "MASK_BITS: %lu\n", + nodep, nodep->idx, MASK_BITS); + error_detected = true; + break; + } + + /* + * Validate bits described by node don't wrap beyond the + * highest supported index. + */ + if ((nodep->idx + MASK_BITS + nodep->num_after - 1) < nodep->idx) { + fprintf(stderr, "Bits described by node wrap " + "beyond highest supported index,\n" + " nodep: %p nodep->idx: 0x%lx\n" + " MASK_BITS: %lu nodep->num_after: 0x%lx", + nodep, nodep->idx, MASK_BITS, nodep->num_after); + error_detected = true; + break; + } + + /* Check parent pointers. */ + if (nodep->left) { + if (nodep->left->parent != nodep) { + fprintf(stderr, "Left child parent pointer " + "doesn't point to this node,\n" + " nodep: %p nodep->left: %p " + "nodep->left->parent: %p", + nodep, nodep->left, + nodep->left->parent); + error_detected = true; + break; + } + } + + if (nodep->right) { + if (nodep->right->parent != nodep) { + fprintf(stderr, "Right child parent pointer " + "doesn't point to this node,\n" + " nodep: %p nodep->right: %p " + "nodep->right->parent: %p", + nodep, nodep->right, + nodep->right->parent); + error_detected = true; + break; + } + } + + if (!nodep->parent) { + if (s->root != nodep) { + fprintf(stderr, "Unexpected root node, " + "s->root: %p nodep: %p", + s->root, nodep); + error_detected = true; + break; + } + } + + if (prev) { + /* + * Is index of previous node before index of + * current node? + */ + if (prev->idx >= nodep->idx) { + fprintf(stderr, "Previous node index " + ">= current node index,\n" + " prev: %p prev->idx: 0x%lx\n" + " nodep: %p nodep->idx: 0x%lx", + prev, prev->idx, nodep, nodep->idx); + error_detected = true; + break; + } + + /* + * Nodes occur in asscending order, based on each + * nodes starting index. + */ + if ((prev->idx + MASK_BITS + prev->num_after - 1) + >= nodep->idx) { + fprintf(stderr, "Previous node bit range " + "overlap with current node bit range,\n" + " prev: %p prev->idx: 0x%lx " + "prev->num_after: 0x%lx\n" + " nodep: %p nodep->idx: 0x%lx " + "nodep->num_after: 0x%lx\n" + " MASK_BITS: %lu", + prev, prev->idx, prev->num_after, + nodep, nodep->idx, nodep->num_after, + MASK_BITS); + error_detected = true; + break; + } + + /* + * When the node has all mask bits set, it shouldn't + * be adjacent to the last bit described by the + * previous node. + */ + if (nodep->mask == ~(mask_t) 0 && + prev->idx + MASK_BITS + prev->num_after == nodep->idx) { + fprintf(stderr, "Current node has mask with " + "all bits set and is adjacent to the " + "previous node,\n" + " prev: %p prev->idx: 0x%lx " + "prev->num_after: 0x%lx\n" + " nodep: %p nodep->idx: 0x%lx " + "nodep->num_after: 0x%lx\n" + " MASK_BITS: %lu", + prev, prev->idx, prev->num_after, + nodep, nodep->idx, nodep->num_after, + MASK_BITS); + + error_detected = true; + break; + } + } + } + + if (!error_detected) { + /* + * Is sum of bits set in each node equal to the count + * of total bits set. + */ + if (s->num_set != total_bits_set) { + fprintf(stderr, "Number of bits set missmatch,\n" + " s->num_set: 0x%lx total_bits_set: 0x%lx", + s->num_set, total_bits_set); + + error_detected = true; + } + } + + if (error_detected) { + fputs(" dump_internal:\n", stderr); + sparsebit_dump_internal(stderr, s, 4); + abort(); + } +} + + +#ifdef FUZZ +/* A simple but effective fuzzing driver. Look for bugs with the help + * of some invariants and of a trivial representation of sparsebit. + * Just use 512 bytes of /dev/zero and /dev/urandom as inputs, and let + * afl-fuzz do the magic. :) + */ + +#include +#include + +struct range { + sparsebit_idx_t first, last; + bool set; +}; + +struct sparsebit *s; +struct range ranges[1000]; +int num_ranges; + +static bool get_value(sparsebit_idx_t idx) +{ + int i; + + for (i = num_ranges; --i >= 0; ) + if (ranges[i].first <= idx && idx <= ranges[i].last) + return ranges[i].set; + + return false; +} + +static void operate(int code, sparsebit_idx_t first, sparsebit_idx_t last) +{ + sparsebit_num_t num; + sparsebit_idx_t next; + + if (first < last) { + num = last - first + 1; + } else { + num = first - last + 1; + first = last; + last = first + num - 1; + } + + switch (code) { + case 0: + sparsebit_set(s, first); + assert(sparsebit_is_set(s, first)); + assert(!sparsebit_is_clear(s, first)); + assert(sparsebit_any_set(s)); + assert(!sparsebit_all_clear(s)); + if (get_value(first)) + return; + if (num_ranges == 1000) + exit(0); + ranges[num_ranges++] = (struct range) + { .first = first, .last = first, .set = true }; + break; + case 1: + sparsebit_clear(s, first); + assert(!sparsebit_is_set(s, first)); + assert(sparsebit_is_clear(s, first)); + assert(sparsebit_any_clear(s)); + assert(!sparsebit_all_set(s)); + if (!get_value(first)) + return; + if (num_ranges == 1000) + exit(0); + ranges[num_ranges++] = (struct range) + { .first = first, .last = first, .set = false }; + break; + case 2: + assert(sparsebit_is_set(s, first) == get_value(first)); + assert(sparsebit_is_clear(s, first) == !get_value(first)); + break; + case 3: + if (sparsebit_any_set(s)) + assert(get_value(sparsebit_first_set(s))); + if (sparsebit_any_clear(s)) + assert(!get_value(sparsebit_first_clear(s))); + sparsebit_set_all(s); + assert(!sparsebit_any_clear(s)); + assert(sparsebit_all_set(s)); + num_ranges = 0; + ranges[num_ranges++] = (struct range) + { .first = 0, .last = ~(sparsebit_idx_t)0, .set = true }; + break; + case 4: + if (sparsebit_any_set(s)) + assert(get_value(sparsebit_first_set(s))); + if (sparsebit_any_clear(s)) + assert(!get_value(sparsebit_first_clear(s))); + sparsebit_clear_all(s); + assert(!sparsebit_any_set(s)); + assert(sparsebit_all_clear(s)); + num_ranges = 0; + break; + case 5: + next = sparsebit_next_set(s, first); + assert(next == 0 || next > first); + assert(next == 0 || get_value(next)); + break; + case 6: + next = sparsebit_next_clear(s, first); + assert(next == 0 || next > first); + assert(next == 0 || !get_value(next)); + break; + case 7: + next = sparsebit_next_clear(s, first); + if (sparsebit_is_set_num(s, first, num)) { + assert(next == 0 || next > last); + if (first) + next = sparsebit_next_set(s, first - 1); + else if (sparsebit_any_set(s)) + next = sparsebit_first_set(s); + else + return; + assert(next == first); + } else { + assert(sparsebit_is_clear(s, first) || next <= last); + } + break; + case 8: + next = sparsebit_next_set(s, first); + if (sparsebit_is_clear_num(s, first, num)) { + assert(next == 0 || next > last); + if (first) + next = sparsebit_next_clear(s, first - 1); + else if (sparsebit_any_clear(s)) + next = sparsebit_first_clear(s); + else + return; + assert(next == first); + } else { + assert(sparsebit_is_set(s, first) || next <= last); + } + break; + case 9: + sparsebit_set_num(s, first, num); + assert(sparsebit_is_set_num(s, first, num)); + assert(!sparsebit_is_clear_num(s, first, num)); + assert(sparsebit_any_set(s)); + assert(!sparsebit_all_clear(s)); + if (num_ranges == 1000) + exit(0); + ranges[num_ranges++] = (struct range) + { .first = first, .last = last, .set = true }; + break; + case 10: + sparsebit_clear_num(s, first, num); + assert(!sparsebit_is_set_num(s, first, num)); + assert(sparsebit_is_clear_num(s, first, num)); + assert(sparsebit_any_clear(s)); + assert(!sparsebit_all_set(s)); + if (num_ranges == 1000) + exit(0); + ranges[num_ranges++] = (struct range) + { .first = first, .last = last, .set = false }; + break; + case 11: + sparsebit_validate_internal(s); + break; + default: + break; + } +} + +unsigned char get8(void) +{ + int ch; + + ch = getchar(); + if (ch == EOF) + exit(0); + return ch; +} + +uint64_t get64(void) +{ + uint64_t x; + + x = get8(); + x = (x << 8) | get8(); + x = (x << 8) | get8(); + x = (x << 8) | get8(); + x = (x << 8) | get8(); + x = (x << 8) | get8(); + x = (x << 8) | get8(); + return (x << 8) | get8(); +} + +int main(void) +{ + s = sparsebit_alloc(); + for (;;) { + uint8_t op = get8() & 0xf; + uint64_t first = get64(); + uint64_t last = get64(); + + operate(op, first, last); + } +} +#endif diff --git a/tools/testing/selftests/kvm/lib/vmx.c b/tools/testing/selftests/kvm/lib/vmx.c new file mode 100644 index 000000000..b987c3c97 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/vmx.c @@ -0,0 +1,283 @@ +/* + * tools/testing/selftests/kvm/lib/x86.c + * + * Copyright (C) 2018, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ + +#define _GNU_SOURCE /* for program_invocation_name */ + +#include "test_util.h" +#include "kvm_util.h" +#include "x86.h" +#include "vmx.h" + +/* Allocate memory regions for nested VMX tests. + * + * Input Args: + * vm - The VM to allocate guest-virtual addresses in. + * + * Output Args: + * p_vmx_gva - The guest virtual address for the struct vmx_pages. + * + * Return: + * Pointer to structure with the addresses of the VMX areas. + */ +struct vmx_pages * +vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva) +{ + vm_vaddr_t vmx_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + struct vmx_pages *vmx = addr_gva2hva(vm, vmx_gva); + + /* Setup of a region of guest memory for the vmxon region. */ + vmx->vmxon = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmxon_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmxon); + vmx->vmxon_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmxon); + + /* Setup of a region of guest memory for a vmcs. */ + vmx->vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmcs); + vmx->vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmcs); + + /* Setup of a region of guest memory for the MSR bitmap. */ + vmx->msr = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->msr_hva = addr_gva2hva(vm, (uintptr_t)vmx->msr); + vmx->msr_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->msr); + memset(vmx->msr_hva, 0, getpagesize()); + + /* Setup of a region of guest memory for the shadow VMCS. */ + vmx->shadow_vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->shadow_vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->shadow_vmcs); + vmx->shadow_vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->shadow_vmcs); + + /* Setup of a region of guest memory for the VMREAD and VMWRITE bitmaps. */ + vmx->vmread = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmread_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmread); + vmx->vmread_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmread); + memset(vmx->vmread_hva, 0, getpagesize()); + + vmx->vmwrite = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmwrite_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmwrite); + vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite); + memset(vmx->vmwrite_hva, 0, getpagesize()); + + *p_vmx_gva = vmx_gva; + return vmx; +} + +bool prepare_for_vmx_operation(struct vmx_pages *vmx) +{ + uint64_t feature_control; + uint64_t required; + unsigned long cr0; + unsigned long cr4; + + /* + * Ensure bits in CR0 and CR4 are valid in VMX operation: + * - Bit X is 1 in _FIXED0: bit X is fixed to 1 in CRx. + * - Bit X is 0 in _FIXED1: bit X is fixed to 0 in CRx. + */ + __asm__ __volatile__("mov %%cr0, %0" : "=r"(cr0) : : "memory"); + cr0 &= rdmsr(MSR_IA32_VMX_CR0_FIXED1); + cr0 |= rdmsr(MSR_IA32_VMX_CR0_FIXED0); + __asm__ __volatile__("mov %0, %%cr0" : : "r"(cr0) : "memory"); + + __asm__ __volatile__("mov %%cr4, %0" : "=r"(cr4) : : "memory"); + cr4 &= rdmsr(MSR_IA32_VMX_CR4_FIXED1); + cr4 |= rdmsr(MSR_IA32_VMX_CR4_FIXED0); + /* Enable VMX operation */ + cr4 |= X86_CR4_VMXE; + __asm__ __volatile__("mov %0, %%cr4" : : "r"(cr4) : "memory"); + + /* + * Configure IA32_FEATURE_CONTROL MSR to allow VMXON: + * Bit 0: Lock bit. If clear, VMXON causes a #GP. + * Bit 2: Enables VMXON outside of SMX operation. If clear, VMXON + * outside of SMX causes a #GP. + */ + required = FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + required |= FEATURE_CONTROL_LOCKED; + feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); + if ((feature_control & required) != required) + wrmsr(MSR_IA32_FEATURE_CONTROL, feature_control | required); + + /* Enter VMX root operation. */ + *(uint32_t *)(vmx->vmxon) = vmcs_revision(); + if (vmxon(vmx->vmxon_gpa)) + return false; + + /* Load a VMCS. */ + *(uint32_t *)(vmx->vmcs) = vmcs_revision(); + if (vmclear(vmx->vmcs_gpa)) + return false; + + if (vmptrld(vmx->vmcs_gpa)) + return false; + + /* Setup shadow VMCS, do not load it yet. */ + *(uint32_t *)(vmx->shadow_vmcs) = vmcs_revision() | 0x80000000ul; + if (vmclear(vmx->shadow_vmcs_gpa)) + return false; + + return true; +} + +/* + * Initialize the control fields to the most basic settings possible. + */ +static inline void init_vmcs_control_fields(struct vmx_pages *vmx) +{ + vmwrite(VIRTUAL_PROCESSOR_ID, 0); + vmwrite(POSTED_INTR_NV, 0); + + vmwrite(PIN_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PINBASED_CTLS)); + if (!vmwrite(SECONDARY_VM_EXEC_CONTROL, 0)) + vmwrite(CPU_BASED_VM_EXEC_CONTROL, + rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS) | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); + else + vmwrite(CPU_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS)); + vmwrite(EXCEPTION_BITMAP, 0); + vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0); + vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, -1); /* Never match */ + vmwrite(CR3_TARGET_COUNT, 0); + vmwrite(VM_EXIT_CONTROLS, rdmsr(MSR_IA32_VMX_EXIT_CTLS) | + VM_EXIT_HOST_ADDR_SPACE_SIZE); /* 64-bit host */ + vmwrite(VM_EXIT_MSR_STORE_COUNT, 0); + vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0); + vmwrite(VM_ENTRY_CONTROLS, rdmsr(MSR_IA32_VMX_ENTRY_CTLS) | + VM_ENTRY_IA32E_MODE); /* 64-bit guest */ + vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0); + vmwrite(VM_ENTRY_INTR_INFO_FIELD, 0); + vmwrite(TPR_THRESHOLD, 0); + + vmwrite(CR0_GUEST_HOST_MASK, 0); + vmwrite(CR4_GUEST_HOST_MASK, 0); + vmwrite(CR0_READ_SHADOW, get_cr0()); + vmwrite(CR4_READ_SHADOW, get_cr4()); + + vmwrite(MSR_BITMAP, vmx->msr_gpa); + vmwrite(VMREAD_BITMAP, vmx->vmread_gpa); + vmwrite(VMWRITE_BITMAP, vmx->vmwrite_gpa); +} + +/* + * Initialize the host state fields based on the current host state, with + * the exception of HOST_RSP and HOST_RIP, which should be set by vmlaunch + * or vmresume. + */ +static inline void init_vmcs_host_state(void) +{ + uint32_t exit_controls = vmreadz(VM_EXIT_CONTROLS); + + vmwrite(HOST_ES_SELECTOR, get_es()); + vmwrite(HOST_CS_SELECTOR, get_cs()); + vmwrite(HOST_SS_SELECTOR, get_ss()); + vmwrite(HOST_DS_SELECTOR, get_ds()); + vmwrite(HOST_FS_SELECTOR, get_fs()); + vmwrite(HOST_GS_SELECTOR, get_gs()); + vmwrite(HOST_TR_SELECTOR, get_tr()); + + if (exit_controls & VM_EXIT_LOAD_IA32_PAT) + vmwrite(HOST_IA32_PAT, rdmsr(MSR_IA32_CR_PAT)); + if (exit_controls & VM_EXIT_LOAD_IA32_EFER) + vmwrite(HOST_IA32_EFER, rdmsr(MSR_EFER)); + if (exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) + vmwrite(HOST_IA32_PERF_GLOBAL_CTRL, + rdmsr(MSR_CORE_PERF_GLOBAL_CTRL)); + + vmwrite(HOST_IA32_SYSENTER_CS, rdmsr(MSR_IA32_SYSENTER_CS)); + + vmwrite(HOST_CR0, get_cr0()); + vmwrite(HOST_CR3, get_cr3()); + vmwrite(HOST_CR4, get_cr4()); + vmwrite(HOST_FS_BASE, rdmsr(MSR_FS_BASE)); + vmwrite(HOST_GS_BASE, rdmsr(MSR_GS_BASE)); + vmwrite(HOST_TR_BASE, + get_desc64_base((struct desc64 *)(get_gdt_base() + get_tr()))); + vmwrite(HOST_GDTR_BASE, get_gdt_base()); + vmwrite(HOST_IDTR_BASE, get_idt_base()); + vmwrite(HOST_IA32_SYSENTER_ESP, rdmsr(MSR_IA32_SYSENTER_ESP)); + vmwrite(HOST_IA32_SYSENTER_EIP, rdmsr(MSR_IA32_SYSENTER_EIP)); +} + +/* + * Initialize the guest state fields essentially as a clone of + * the host state fields. Some host state fields have fixed + * values, and we set the corresponding guest state fields accordingly. + */ +static inline void init_vmcs_guest_state(void *rip, void *rsp) +{ + vmwrite(GUEST_ES_SELECTOR, vmreadz(HOST_ES_SELECTOR)); + vmwrite(GUEST_CS_SELECTOR, vmreadz(HOST_CS_SELECTOR)); + vmwrite(GUEST_SS_SELECTOR, vmreadz(HOST_SS_SELECTOR)); + vmwrite(GUEST_DS_SELECTOR, vmreadz(HOST_DS_SELECTOR)); + vmwrite(GUEST_FS_SELECTOR, vmreadz(HOST_FS_SELECTOR)); + vmwrite(GUEST_GS_SELECTOR, vmreadz(HOST_GS_SELECTOR)); + vmwrite(GUEST_LDTR_SELECTOR, 0); + vmwrite(GUEST_TR_SELECTOR, vmreadz(HOST_TR_SELECTOR)); + vmwrite(GUEST_INTR_STATUS, 0); + vmwrite(GUEST_PML_INDEX, 0); + + vmwrite(VMCS_LINK_POINTER, -1ll); + vmwrite(GUEST_IA32_DEBUGCTL, 0); + vmwrite(GUEST_IA32_PAT, vmreadz(HOST_IA32_PAT)); + vmwrite(GUEST_IA32_EFER, vmreadz(HOST_IA32_EFER)); + vmwrite(GUEST_IA32_PERF_GLOBAL_CTRL, + vmreadz(HOST_IA32_PERF_GLOBAL_CTRL)); + + vmwrite(GUEST_ES_LIMIT, -1); + vmwrite(GUEST_CS_LIMIT, -1); + vmwrite(GUEST_SS_LIMIT, -1); + vmwrite(GUEST_DS_LIMIT, -1); + vmwrite(GUEST_FS_LIMIT, -1); + vmwrite(GUEST_GS_LIMIT, -1); + vmwrite(GUEST_LDTR_LIMIT, -1); + vmwrite(GUEST_TR_LIMIT, 0x67); + vmwrite(GUEST_GDTR_LIMIT, 0xffff); + vmwrite(GUEST_IDTR_LIMIT, 0xffff); + vmwrite(GUEST_ES_AR_BYTES, + vmreadz(GUEST_ES_SELECTOR) == 0 ? 0x10000 : 0xc093); + vmwrite(GUEST_CS_AR_BYTES, 0xa09b); + vmwrite(GUEST_SS_AR_BYTES, 0xc093); + vmwrite(GUEST_DS_AR_BYTES, + vmreadz(GUEST_DS_SELECTOR) == 0 ? 0x10000 : 0xc093); + vmwrite(GUEST_FS_AR_BYTES, + vmreadz(GUEST_FS_SELECTOR) == 0 ? 0x10000 : 0xc093); + vmwrite(GUEST_GS_AR_BYTES, + vmreadz(GUEST_GS_SELECTOR) == 0 ? 0x10000 : 0xc093); + vmwrite(GUEST_LDTR_AR_BYTES, 0x10000); + vmwrite(GUEST_TR_AR_BYTES, 0x8b); + vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0); + vmwrite(GUEST_ACTIVITY_STATE, 0); + vmwrite(GUEST_SYSENTER_CS, vmreadz(HOST_IA32_SYSENTER_CS)); + vmwrite(VMX_PREEMPTION_TIMER_VALUE, 0); + + vmwrite(GUEST_CR0, vmreadz(HOST_CR0)); + vmwrite(GUEST_CR3, vmreadz(HOST_CR3)); + vmwrite(GUEST_CR4, vmreadz(HOST_CR4)); + vmwrite(GUEST_ES_BASE, 0); + vmwrite(GUEST_CS_BASE, 0); + vmwrite(GUEST_SS_BASE, 0); + vmwrite(GUEST_DS_BASE, 0); + vmwrite(GUEST_FS_BASE, vmreadz(HOST_FS_BASE)); + vmwrite(GUEST_GS_BASE, vmreadz(HOST_GS_BASE)); + vmwrite(GUEST_LDTR_BASE, 0); + vmwrite(GUEST_TR_BASE, vmreadz(HOST_TR_BASE)); + vmwrite(GUEST_GDTR_BASE, vmreadz(HOST_GDTR_BASE)); + vmwrite(GUEST_IDTR_BASE, vmreadz(HOST_IDTR_BASE)); + vmwrite(GUEST_DR7, 0x400); + vmwrite(GUEST_RSP, (uint64_t)rsp); + vmwrite(GUEST_RIP, (uint64_t)rip); + vmwrite(GUEST_RFLAGS, 2); + vmwrite(GUEST_PENDING_DBG_EXCEPTIONS, 0); + vmwrite(GUEST_SYSENTER_ESP, vmreadz(HOST_IA32_SYSENTER_ESP)); + vmwrite(GUEST_SYSENTER_EIP, vmreadz(HOST_IA32_SYSENTER_EIP)); +} + +void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp) +{ + init_vmcs_control_fields(vmx); + init_vmcs_host_state(); + init_vmcs_guest_state(guest_rip, guest_rsp); +} diff --git a/tools/testing/selftests/kvm/lib/x86.c b/tools/testing/selftests/kvm/lib/x86.c new file mode 100644 index 000000000..800fe3606 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86.c @@ -0,0 +1,893 @@ +/* + * tools/testing/selftests/kvm/lib/x86.c + * + * Copyright (C) 2018, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ + +#define _GNU_SOURCE /* for program_invocation_name */ + +#include "test_util.h" +#include "kvm_util.h" +#include "kvm_util_internal.h" +#include "x86.h" + +/* Minimum physical address used for virtual translation tables. */ +#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 + +/* Virtual translation table structure declarations */ +struct pageMapL4Entry { + uint64_t present:1; + uint64_t writable:1; + uint64_t user:1; + uint64_t write_through:1; + uint64_t cache_disable:1; + uint64_t accessed:1; + uint64_t ignored_06:1; + uint64_t page_size:1; + uint64_t ignored_11_08:4; + uint64_t address:40; + uint64_t ignored_62_52:11; + uint64_t execute_disable:1; +}; + +struct pageDirectoryPointerEntry { + uint64_t present:1; + uint64_t writable:1; + uint64_t user:1; + uint64_t write_through:1; + uint64_t cache_disable:1; + uint64_t accessed:1; + uint64_t ignored_06:1; + uint64_t page_size:1; + uint64_t ignored_11_08:4; + uint64_t address:40; + uint64_t ignored_62_52:11; + uint64_t execute_disable:1; +}; + +struct pageDirectoryEntry { + uint64_t present:1; + uint64_t writable:1; + uint64_t user:1; + uint64_t write_through:1; + uint64_t cache_disable:1; + uint64_t accessed:1; + uint64_t ignored_06:1; + uint64_t page_size:1; + uint64_t ignored_11_08:4; + uint64_t address:40; + uint64_t ignored_62_52:11; + uint64_t execute_disable:1; +}; + +struct pageTableEntry { + uint64_t present:1; + uint64_t writable:1; + uint64_t user:1; + uint64_t write_through:1; + uint64_t cache_disable:1; + uint64_t accessed:1; + uint64_t dirty:1; + uint64_t reserved_07:1; + uint64_t global:1; + uint64_t ignored_11_09:3; + uint64_t address:40; + uint64_t ignored_62_52:11; + uint64_t execute_disable:1; +}; + +/* Register Dump + * + * Input Args: + * indent - Left margin indent amount + * regs - register + * + * Output Args: + * stream - Output FILE stream + * + * Return: None + * + * Dumps the state of the registers given by regs, to the FILE stream + * given by steam. + */ +void regs_dump(FILE *stream, struct kvm_regs *regs, + uint8_t indent) +{ + fprintf(stream, "%*srax: 0x%.16llx rbx: 0x%.16llx " + "rcx: 0x%.16llx rdx: 0x%.16llx\n", + indent, "", + regs->rax, regs->rbx, regs->rcx, regs->rdx); + fprintf(stream, "%*srsi: 0x%.16llx rdi: 0x%.16llx " + "rsp: 0x%.16llx rbp: 0x%.16llx\n", + indent, "", + regs->rsi, regs->rdi, regs->rsp, regs->rbp); + fprintf(stream, "%*sr8: 0x%.16llx r9: 0x%.16llx " + "r10: 0x%.16llx r11: 0x%.16llx\n", + indent, "", + regs->r8, regs->r9, regs->r10, regs->r11); + fprintf(stream, "%*sr12: 0x%.16llx r13: 0x%.16llx " + "r14: 0x%.16llx r15: 0x%.16llx\n", + indent, "", + regs->r12, regs->r13, regs->r14, regs->r15); + fprintf(stream, "%*srip: 0x%.16llx rfl: 0x%.16llx\n", + indent, "", + regs->rip, regs->rflags); +} + +/* Segment Dump + * + * Input Args: + * indent - Left margin indent amount + * segment - KVM segment + * + * Output Args: + * stream - Output FILE stream + * + * Return: None + * + * Dumps the state of the KVM segment given by segment, to the FILE stream + * given by steam. + */ +static void segment_dump(FILE *stream, struct kvm_segment *segment, + uint8_t indent) +{ + fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.8x " + "selector: 0x%.4x type: 0x%.2x\n", + indent, "", segment->base, segment->limit, + segment->selector, segment->type); + fprintf(stream, "%*spresent: 0x%.2x dpl: 0x%.2x " + "db: 0x%.2x s: 0x%.2x l: 0x%.2x\n", + indent, "", segment->present, segment->dpl, + segment->db, segment->s, segment->l); + fprintf(stream, "%*sg: 0x%.2x avl: 0x%.2x " + "unusable: 0x%.2x padding: 0x%.2x\n", + indent, "", segment->g, segment->avl, + segment->unusable, segment->padding); +} + +/* dtable Dump + * + * Input Args: + * indent - Left margin indent amount + * dtable - KVM dtable + * + * Output Args: + * stream - Output FILE stream + * + * Return: None + * + * Dumps the state of the KVM dtable given by dtable, to the FILE stream + * given by steam. + */ +static void dtable_dump(FILE *stream, struct kvm_dtable *dtable, + uint8_t indent) +{ + fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.4x " + "padding: 0x%.4x 0x%.4x 0x%.4x\n", + indent, "", dtable->base, dtable->limit, + dtable->padding[0], dtable->padding[1], dtable->padding[2]); +} + +/* System Register Dump + * + * Input Args: + * indent - Left margin indent amount + * sregs - System registers + * + * Output Args: + * stream - Output FILE stream + * + * Return: None + * + * Dumps the state of the system registers given by sregs, to the FILE stream + * given by steam. + */ +void sregs_dump(FILE *stream, struct kvm_sregs *sregs, + uint8_t indent) +{ + unsigned int i; + + fprintf(stream, "%*scs:\n", indent, ""); + segment_dump(stream, &sregs->cs, indent + 2); + fprintf(stream, "%*sds:\n", indent, ""); + segment_dump(stream, &sregs->ds, indent + 2); + fprintf(stream, "%*ses:\n", indent, ""); + segment_dump(stream, &sregs->es, indent + 2); + fprintf(stream, "%*sfs:\n", indent, ""); + segment_dump(stream, &sregs->fs, indent + 2); + fprintf(stream, "%*sgs:\n", indent, ""); + segment_dump(stream, &sregs->gs, indent + 2); + fprintf(stream, "%*sss:\n", indent, ""); + segment_dump(stream, &sregs->ss, indent + 2); + fprintf(stream, "%*str:\n", indent, ""); + segment_dump(stream, &sregs->tr, indent + 2); + fprintf(stream, "%*sldt:\n", indent, ""); + segment_dump(stream, &sregs->ldt, indent + 2); + + fprintf(stream, "%*sgdt:\n", indent, ""); + dtable_dump(stream, &sregs->gdt, indent + 2); + fprintf(stream, "%*sidt:\n", indent, ""); + dtable_dump(stream, &sregs->idt, indent + 2); + + fprintf(stream, "%*scr0: 0x%.16llx cr2: 0x%.16llx " + "cr3: 0x%.16llx cr4: 0x%.16llx\n", + indent, "", + sregs->cr0, sregs->cr2, sregs->cr3, sregs->cr4); + fprintf(stream, "%*scr8: 0x%.16llx efer: 0x%.16llx " + "apic_base: 0x%.16llx\n", + indent, "", + sregs->cr8, sregs->efer, sregs->apic_base); + + fprintf(stream, "%*sinterrupt_bitmap:\n", indent, ""); + for (i = 0; i < (KVM_NR_INTERRUPTS + 63) / 64; i++) { + fprintf(stream, "%*s%.16llx\n", indent + 2, "", + sregs->interrupt_bitmap[i]); + } +} + +void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot) +{ + int rc; + + TEST_ASSERT(vm->mode == VM_MODE_FLAT48PG, "Attempt to use " + "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + + /* If needed, create page map l4 table. */ + if (!vm->pgd_created) { + vm_paddr_t paddr = vm_phy_page_alloc(vm, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); + vm->pgd = paddr; + vm->pgd_created = true; + } +} + +/* VM Virtual Page Map + * + * Input Args: + * vm - Virtual Machine + * vaddr - VM Virtual Address + * paddr - VM Physical Address + * pgd_memslot - Memory region slot for new virtual translation tables + * + * Output Args: None + * + * Return: None + * + * Within the VM given by vm, creates a virtual translation for the page + * starting at vaddr to the page starting at paddr. + */ +void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + uint32_t pgd_memslot) +{ + uint16_t index[4]; + struct pageMapL4Entry *pml4e; + + TEST_ASSERT(vm->mode == VM_MODE_FLAT48PG, "Attempt to use " + "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + + TEST_ASSERT((vaddr % vm->page_size) == 0, + "Virtual address not on page boundary,\n" + " vaddr: 0x%lx vm->page_size: 0x%x", + vaddr, vm->page_size); + TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, + (vaddr >> vm->page_shift)), + "Invalid virtual address, vaddr: 0x%lx", + vaddr); + TEST_ASSERT((paddr % vm->page_size) == 0, + "Physical address not on page boundary,\n" + " paddr: 0x%lx vm->page_size: 0x%x", + paddr, vm->page_size); + TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, + "Physical address beyond beyond maximum supported,\n" + " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", + paddr, vm->max_gfn, vm->page_size); + + index[0] = (vaddr >> 12) & 0x1ffu; + index[1] = (vaddr >> 21) & 0x1ffu; + index[2] = (vaddr >> 30) & 0x1ffu; + index[3] = (vaddr >> 39) & 0x1ffu; + + /* Allocate page directory pointer table if not present. */ + pml4e = addr_gpa2hva(vm, vm->pgd); + if (!pml4e[index[3]].present) { + pml4e[index[3]].address = vm_phy_page_alloc(vm, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot) + >> vm->page_shift; + pml4e[index[3]].writable = true; + pml4e[index[3]].present = true; + } + + /* Allocate page directory table if not present. */ + struct pageDirectoryPointerEntry *pdpe; + pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size); + if (!pdpe[index[2]].present) { + pdpe[index[2]].address = vm_phy_page_alloc(vm, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot) + >> vm->page_shift; + pdpe[index[2]].writable = true; + pdpe[index[2]].present = true; + } + + /* Allocate page table if not present. */ + struct pageDirectoryEntry *pde; + pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size); + if (!pde[index[1]].present) { + pde[index[1]].address = vm_phy_page_alloc(vm, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot) + >> vm->page_shift; + pde[index[1]].writable = true; + pde[index[1]].present = true; + } + + /* Fill in page table entry. */ + struct pageTableEntry *pte; + pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size); + pte[index[0]].address = paddr >> vm->page_shift; + pte[index[0]].writable = true; + pte[index[0]].present = 1; +} + +/* Virtual Translation Tables Dump + * + * Input Args: + * vm - Virtual Machine + * indent - Left margin indent amount + * + * Output Args: + * stream - Output FILE stream + * + * Return: None + * + * Dumps to the FILE stream given by stream, the contents of all the + * virtual translation tables for the VM given by vm. + */ +void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +{ + struct pageMapL4Entry *pml4e, *pml4e_start; + struct pageDirectoryPointerEntry *pdpe, *pdpe_start; + struct pageDirectoryEntry *pde, *pde_start; + struct pageTableEntry *pte, *pte_start; + + if (!vm->pgd_created) + return; + + fprintf(stream, "%*s " + " no\n", indent, ""); + fprintf(stream, "%*s index hvaddr gpaddr " + "addr w exec dirty\n", + indent, ""); + pml4e_start = (struct pageMapL4Entry *) addr_gpa2hva(vm, + vm->pgd); + for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) { + pml4e = &pml4e_start[n1]; + if (!pml4e->present) + continue; + fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10lx %u " + " %u\n", + indent, "", + pml4e - pml4e_start, pml4e, + addr_hva2gpa(vm, pml4e), (uint64_t) pml4e->address, + pml4e->writable, pml4e->execute_disable); + + pdpe_start = addr_gpa2hva(vm, pml4e->address + * vm->page_size); + for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) { + pdpe = &pdpe_start[n2]; + if (!pdpe->present) + continue; + fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10lx " + "%u %u\n", + indent, "", + pdpe - pdpe_start, pdpe, + addr_hva2gpa(vm, pdpe), + (uint64_t) pdpe->address, pdpe->writable, + pdpe->execute_disable); + + pde_start = addr_gpa2hva(vm, + pdpe->address * vm->page_size); + for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) { + pde = &pde_start[n3]; + if (!pde->present) + continue; + fprintf(stream, "%*spde 0x%-3zx %p " + "0x%-12lx 0x%-10lx %u %u\n", + indent, "", pde - pde_start, pde, + addr_hva2gpa(vm, pde), + (uint64_t) pde->address, pde->writable, + pde->execute_disable); + + pte_start = addr_gpa2hva(vm, + pde->address * vm->page_size); + for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) { + pte = &pte_start[n4]; + if (!pte->present) + continue; + fprintf(stream, "%*spte 0x%-3zx %p " + "0x%-12lx 0x%-10lx %u %u " + " %u 0x%-10lx\n", + indent, "", + pte - pte_start, pte, + addr_hva2gpa(vm, pte), + (uint64_t) pte->address, + pte->writable, + pte->execute_disable, + pte->dirty, + ((uint64_t) n1 << 27) + | ((uint64_t) n2 << 18) + | ((uint64_t) n3 << 9) + | ((uint64_t) n4)); + } + } + } + } +} + +/* Set Unusable Segment + * + * Input Args: None + * + * Output Args: + * segp - Pointer to segment register + * + * Return: None + * + * Sets the segment register pointed to by segp to an unusable state. + */ +static void kvm_seg_set_unusable(struct kvm_segment *segp) +{ + memset(segp, 0, sizeof(*segp)); + segp->unusable = true; +} + +static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp) +{ + void *gdt = addr_gva2hva(vm, vm->gdt); + struct desc64 *desc = gdt + (segp->selector >> 3) * 8; + + desc->limit0 = segp->limit & 0xFFFF; + desc->base0 = segp->base & 0xFFFF; + desc->base1 = segp->base >> 16; + desc->type = segp->type; + desc->s = segp->s; + desc->dpl = segp->dpl; + desc->p = segp->present; + desc->limit1 = segp->limit >> 16; + desc->avl = segp->avl; + desc->l = segp->l; + desc->db = segp->db; + desc->g = segp->g; + desc->base2 = segp->base >> 24; + if (!segp->s) + desc->base3 = segp->base >> 32; +} + + +/* Set Long Mode Flat Kernel Code Segment + * + * Input Args: + * vm - VM whose GDT is being filled, or NULL to only write segp + * selector - selector value + * + * Output Args: + * segp - Pointer to KVM segment + * + * Return: None + * + * Sets up the KVM segment pointed to by segp, to be a code segment + * with the selector value given by selector. + */ +static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector, + struct kvm_segment *segp) +{ + memset(segp, 0, sizeof(*segp)); + segp->selector = selector; + segp->limit = 0xFFFFFFFFu; + segp->s = 0x1; /* kTypeCodeData */ + segp->type = 0x08 | 0x01 | 0x02; /* kFlagCode | kFlagCodeAccessed + * | kFlagCodeReadable + */ + segp->g = true; + segp->l = true; + segp->present = 1; + if (vm) + kvm_seg_fill_gdt_64bit(vm, segp); +} + +/* Set Long Mode Flat Kernel Data Segment + * + * Input Args: + * vm - VM whose GDT is being filled, or NULL to only write segp + * selector - selector value + * + * Output Args: + * segp - Pointer to KVM segment + * + * Return: None + * + * Sets up the KVM segment pointed to by segp, to be a data segment + * with the selector value given by selector. + */ +static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector, + struct kvm_segment *segp) +{ + memset(segp, 0, sizeof(*segp)); + segp->selector = selector; + segp->limit = 0xFFFFFFFFu; + segp->s = 0x1; /* kTypeCodeData */ + segp->type = 0x00 | 0x01 | 0x02; /* kFlagData | kFlagDataAccessed + * | kFlagDataWritable + */ + segp->g = true; + segp->present = true; + if (vm) + kvm_seg_fill_gdt_64bit(vm, segp); +} + +/* Address Guest Virtual to Guest Physical + * + * Input Args: + * vm - Virtual Machine + * gpa - VM virtual address + * + * Output Args: None + * + * Return: + * Equivalent VM physical address + * + * Translates the VM virtual address given by gva to a VM physical + * address and then locates the memory region containing the VM + * physical address, within the VM given by vm. When found, the host + * virtual address providing the memory to the vm physical address is returned. + * A TEST_ASSERT failure occurs if no region containing translated + * VM virtual address exists. + */ +vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +{ + uint16_t index[4]; + struct pageMapL4Entry *pml4e; + struct pageDirectoryPointerEntry *pdpe; + struct pageDirectoryEntry *pde; + struct pageTableEntry *pte; + void *hva; + + TEST_ASSERT(vm->mode == VM_MODE_FLAT48PG, "Attempt to use " + "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + + index[0] = (gva >> 12) & 0x1ffu; + index[1] = (gva >> 21) & 0x1ffu; + index[2] = (gva >> 30) & 0x1ffu; + index[3] = (gva >> 39) & 0x1ffu; + + if (!vm->pgd_created) + goto unmapped_gva; + pml4e = addr_gpa2hva(vm, vm->pgd); + if (!pml4e[index[3]].present) + goto unmapped_gva; + + pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size); + if (!pdpe[index[2]].present) + goto unmapped_gva; + + pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size); + if (!pde[index[1]].present) + goto unmapped_gva; + + pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size); + if (!pte[index[0]].present) + goto unmapped_gva; + + return (pte[index[0]].address * vm->page_size) + (gva & 0xfffu); + +unmapped_gva: + TEST_ASSERT(false, "No mapping for vm virtual address, " + "gva: 0x%lx", gva); +} + +static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt, int gdt_memslot, + int pgd_memslot) +{ + if (!vm->gdt) + vm->gdt = vm_vaddr_alloc(vm, getpagesize(), + KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot); + + dt->base = vm->gdt; + dt->limit = getpagesize(); +} + +static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, + int selector, int gdt_memslot, + int pgd_memslot) +{ + if (!vm->tss) + vm->tss = vm_vaddr_alloc(vm, getpagesize(), + KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot); + + memset(segp, 0, sizeof(*segp)); + segp->base = vm->tss; + segp->limit = 0x67; + segp->selector = selector; + segp->type = 0xb; + segp->present = 1; + kvm_seg_fill_gdt_64bit(vm, segp); +} + +void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot) +{ + struct kvm_sregs sregs; + + /* Set mode specific system register values. */ + vcpu_sregs_get(vm, vcpuid, &sregs); + + sregs.idt.limit = 0; + + kvm_setup_gdt(vm, &sregs.gdt, gdt_memslot, pgd_memslot); + + switch (vm->mode) { + case VM_MODE_FLAT48PG: + sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG; + sregs.cr4 |= X86_CR4_PAE; + sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); + + kvm_seg_set_unusable(&sregs.ldt); + kvm_seg_set_kernel_code_64bit(vm, 0x8, &sregs.cs); + kvm_seg_set_kernel_data_64bit(vm, 0x10, &sregs.ds); + kvm_seg_set_kernel_data_64bit(vm, 0x10, &sregs.es); + kvm_setup_tss_64bit(vm, &sregs.tr, 0x18, gdt_memslot, pgd_memslot); + break; + + default: + TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", vm->mode); + } + + sregs.cr3 = vm->pgd; + vcpu_sregs_set(vm, vcpuid, &sregs); +} +/* Adds a vCPU with reasonable defaults (i.e., a stack) + * + * Input Args: + * vcpuid - The id of the VCPU to add to the VM. + * guest_code - The vCPU's entry point + */ +void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) +{ + struct kvm_mp_state mp_state; + struct kvm_regs regs; + vm_vaddr_t stack_vaddr; + stack_vaddr = vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(), + DEFAULT_GUEST_STACK_VADDR_MIN, 0, 0); + + /* Create VCPU */ + vm_vcpu_add(vm, vcpuid, 0, 0); + + /* Setup guest general purpose registers */ + vcpu_regs_get(vm, vcpuid, ®s); + regs.rflags = regs.rflags | 0x2; + regs.rsp = stack_vaddr + (DEFAULT_STACK_PGS * getpagesize()); + regs.rip = (unsigned long) guest_code; + vcpu_regs_set(vm, vcpuid, ®s); + + /* Setup the MP state */ + mp_state.mp_state = 0; + vcpu_set_mp_state(vm, vcpuid, &mp_state); +} + +/* VM VCPU CPUID Set + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU id + * cpuid - The CPUID values to set. + * + * Output Args: None + * + * Return: void + * + * Set the VCPU's CPUID. + */ +void vcpu_set_cpuid(struct kvm_vm *vm, + uint32_t vcpuid, struct kvm_cpuid2 *cpuid) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int rc; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + rc = ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid); + TEST_ASSERT(rc == 0, "KVM_SET_CPUID2 failed, rc: %i errno: %i", + rc, errno); + +} +/* Create a VM with reasonable defaults + * + * Input Args: + * vcpuid - The id of the single VCPU to add to the VM. + * extra_mem_pages - The size of extra memories to add (this will + * decide how much extra space we will need to + * setup the page tables using mem slot 0) + * guest_code - The vCPU's entry point + * + * Output Args: None + * + * Return: + * Pointer to opaque structure that describes the created VM. + */ +struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, + void *guest_code) +{ + struct kvm_vm *vm; + /* + * For x86 the maximum page table size for a memory region + * will be when only 4K pages are used. In that case the + * total extra size for page tables (for extra N pages) will + * be: N/512+N/512^2+N/512^3+... which is definitely smaller + * than N/512*2. + */ + uint64_t extra_pg_pages = extra_mem_pages / 512 * 2; + + /* Create VM */ + vm = vm_create(VM_MODE_FLAT48PG, + DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, + O_RDWR); + + /* Setup guest code */ + kvm_vm_elf_load(vm, program_invocation_name, 0, 0); + + /* Setup IRQ Chip */ + vm_create_irqchip(vm); + + /* Add the first vCPU. */ + vm_vcpu_add_default(vm, vcpuid, guest_code); + + return vm; +} + +struct kvm_x86_state { + struct kvm_vcpu_events events; + struct kvm_mp_state mp_state; + struct kvm_regs regs; + struct kvm_xsave xsave; + struct kvm_xcrs xcrs; + struct kvm_sregs sregs; + struct kvm_debugregs debugregs; + union { + struct kvm_nested_state nested; + char nested_[16384]; + }; + struct kvm_msrs msrs; +}; + +static int kvm_get_num_msrs(struct kvm_vm *vm) +{ + struct kvm_msr_list nmsrs; + int r; + + nmsrs.nmsrs = 0; + r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs); + TEST_ASSERT(r == -1 && errno == E2BIG, "Unexpected result from KVM_GET_MSR_INDEX_LIST probe, r: %i", + r); + + return nmsrs.nmsrs; +} + +struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + struct kvm_msr_list *list; + struct kvm_x86_state *state; + int nmsrs, r, i; + static int nested_size = -1; + + if (nested_size == -1) { + nested_size = kvm_check_cap(KVM_CAP_NESTED_STATE); + TEST_ASSERT(nested_size <= sizeof(state->nested_), + "Nested state size too big, %i > %zi", + nested_size, sizeof(state->nested_)); + } + + nmsrs = kvm_get_num_msrs(vm); + list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); + list->nmsrs = nmsrs; + r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i", + r); + + state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0])); + r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_GET_MP_STATE, &state->mp_state); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_GET_REGS, &state->regs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_GET_XSAVE, &state->xsave); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i", + r); + + if (kvm_check_cap(KVM_CAP_XCRS)) { + r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XCRS, r: %i", + r); + } + + r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i", + r); + + if (nested_size) { + state->nested.size = sizeof(state->nested_); + r = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, &state->nested); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_NESTED_STATE, r: %i", + r); + TEST_ASSERT(state->nested.size <= nested_size, + "Nested state size too big, %i (KVM_CHECK_CAP gave %i)", + state->nested.size, nested_size); + } else + state->nested.size = 0; + + state->msrs.nmsrs = nmsrs; + for (i = 0; i < nmsrs; i++) + state->msrs.entries[i].index = list->indices[i]; + r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs); + TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed at %x)", + r, r == nmsrs ? -1 : list->indices[r]); + + r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i", + r); + + free(list); + return state; +} + +void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int r; + + if (state->nested.size) { + r = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, &state->nested); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_NESTED_STATE, r: %i", + r); + } + + r = ioctl(vcpu->fd, KVM_SET_XSAVE, &state->xsave); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i", + r); + + if (kvm_check_cap(KVM_CAP_XCRS)) { + r = ioctl(vcpu->fd, KVM_SET_XCRS, &state->xcrs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XCRS, r: %i", + r); + } + + r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs); + TEST_ASSERT(r == state->msrs.nmsrs, "Unexpected result from KVM_SET_MSRS, r: %i (failed at %x)", + r, r == state->msrs.nmsrs ? -1 : state->msrs.entries[r].index); + + r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_SET_MP_STATE, &state->mp_state); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_SET_DEBUGREGS, &state->debugregs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i", + r); + + r = ioctl(vcpu->fd, KVM_SET_REGS, &state->regs); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i", + r); +} -- cgit v1.2.3