summaryrefslogtreecommitdiffstats
path: root/tools/testing/selftests/x86
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--tools/testing/selftests/x86/.gitignore15
-rw-r--r--tools/testing/selftests/x86/Makefile105
-rwxr-xr-xtools/testing/selftests/x86/check_cc.sh16
-rw-r--r--tools/testing/selftests/x86/check_initial_reg_state.c109
-rw-r--r--tools/testing/selftests/x86/entry_from_vm86.c349
-rw-r--r--tools/testing/selftests/x86/fsgsbase.c427
-rw-r--r--tools/testing/selftests/x86/ioperm.c171
-rw-r--r--tools/testing/selftests/x86/iopl.c136
-rw-r--r--tools/testing/selftests/x86/ldt_gdt.c927
-rw-r--r--tools/testing/selftests/x86/mov_ss_trap.c285
-rw-r--r--tools/testing/selftests/x86/mpx-debug.h15
-rw-r--r--tools/testing/selftests/x86/mpx-dig.c499
-rw-r--r--tools/testing/selftests/x86/mpx-hw.h124
-rw-r--r--tools/testing/selftests/x86/mpx-mini-test.c1616
-rw-r--r--tools/testing/selftests/x86/mpx-mm.h10
-rw-r--r--tools/testing/selftests/x86/pkey-helpers.h219
-rw-r--r--tools/testing/selftests/x86/protection_keys.c1508
-rw-r--r--tools/testing/selftests/x86/ptrace_syscall.c430
-rw-r--r--tools/testing/selftests/x86/raw_syscall_helper_32.S47
-rw-r--r--tools/testing/selftests/x86/sigreturn.c871
-rw-r--r--tools/testing/selftests/x86/single_step_syscall.c187
-rw-r--r--tools/testing/selftests/x86/syscall_arg_fault.c130
-rw-r--r--tools/testing/selftests/x86/syscall_nt.c96
-rw-r--r--tools/testing/selftests/x86/sysret_rip.c195
-rw-r--r--tools/testing/selftests/x86/sysret_ss_attrs.c112
-rw-r--r--tools/testing/selftests/x86/test_FCMOV.c94
-rw-r--r--tools/testing/selftests/x86/test_FCOMI.c332
-rw-r--r--tools/testing/selftests/x86/test_FISTTP.c138
-rw-r--r--tools/testing/selftests/x86/test_mremap_vdso.c115
-rw-r--r--tools/testing/selftests/x86/test_syscall_vdso.c408
-rw-r--r--tools/testing/selftests/x86/test_vdso.c337
-rw-r--r--tools/testing/selftests/x86/test_vsyscall.c506
-rw-r--r--tools/testing/selftests/x86/thunks.S67
-rw-r--r--tools/testing/selftests/x86/thunks_32.S55
-rw-r--r--tools/testing/selftests/x86/trivial_32bit_program.c18
-rw-r--r--tools/testing/selftests/x86/trivial_64bit_program.c18
-rw-r--r--tools/testing/selftests/x86/trivial_program.c10
-rw-r--r--tools/testing/selftests/x86/unwind_vdso.c211
-rw-r--r--tools/testing/selftests/x86/vdso_restorer.c88
39 files changed, 10996 insertions, 0 deletions
diff --git a/tools/testing/selftests/x86/.gitignore b/tools/testing/selftests/x86/.gitignore
new file mode 100644
index 000000000..7757f73ff
--- /dev/null
+++ b/tools/testing/selftests/x86/.gitignore
@@ -0,0 +1,15 @@
+*_32
+*_64
+single_step_syscall
+sysret_ss_attrs
+syscall_nt
+ptrace_syscall
+test_mremap_vdso
+check_initial_reg_state
+sigreturn
+ldt_gdt
+iopl
+mpx-mini-test
+ioperm
+protection_keys
+test_vdso
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
new file mode 100644
index 000000000..186520198
--- /dev/null
+++ b/tools/testing/selftests/x86/Makefile
@@ -0,0 +1,105 @@
+# SPDX-License-Identifier: GPL-2.0
+all:
+
+include ../lib.mk
+
+.PHONY: all all_32 all_64 warn_32bit_failure clean
+
+UNAME_M := $(shell uname -m)
+CAN_BUILD_I386 := $(shell ./check_cc.sh $(CC) trivial_32bit_program.c -m32)
+CAN_BUILD_X86_64 := $(shell ./check_cc.sh $(CC) trivial_64bit_program.c)
+CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) trivial_program.c -no-pie)
+
+TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \
+ check_initial_reg_state sigreturn iopl mpx-mini-test ioperm \
+ protection_keys test_vdso test_vsyscall mov_ss_trap
+TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
+ test_FCMOV test_FCOMI test_FISTTP \
+ vdso_restorer
+TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip
+# Some selftests require 32bit support enabled also on 64bit systems
+TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall
+
+TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) $(TARGETS_C_32BIT_NEEDED)
+TARGETS_C_64BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_64BIT_ONLY)
+ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),11)
+TARGETS_C_64BIT_ALL += $(TARGETS_C_32BIT_NEEDED)
+endif
+
+BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
+BINARIES_64 := $(TARGETS_C_64BIT_ALL:%=%_64)
+
+BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32))
+BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64))
+
+CFLAGS := -O2 -g -std=gnu99 -pthread -Wall
+
+# call32_from_64 in thunks.S uses absolute addresses.
+ifeq ($(CAN_BUILD_WITH_NOPIE),1)
+CFLAGS += -no-pie
+endif
+
+define gen-target-rule-32
+$(1) $(1)_32: $(OUTPUT)/$(1)_32
+.PHONY: $(1) $(1)_32
+endef
+
+define gen-target-rule-64
+$(1) $(1)_64: $(OUTPUT)/$(1)_64
+.PHONY: $(1) $(1)_64
+endef
+
+ifeq ($(CAN_BUILD_I386),1)
+all: all_32
+TEST_PROGS += $(BINARIES_32)
+EXTRA_CFLAGS += -DCAN_BUILD_32
+$(foreach t,$(TARGETS_C_32BIT_ALL),$(eval $(call gen-target-rule-32,$(t))))
+endif
+
+ifeq ($(CAN_BUILD_X86_64),1)
+all: all_64
+TEST_PROGS += $(BINARIES_64)
+EXTRA_CFLAGS += -DCAN_BUILD_64
+$(foreach t,$(TARGETS_C_64BIT_ALL),$(eval $(call gen-target-rule-64,$(t))))
+endif
+
+all_32: $(BINARIES_32)
+
+all_64: $(BINARIES_64)
+
+EXTRA_CLEAN := $(BINARIES_32) $(BINARIES_64)
+
+$(BINARIES_32): $(OUTPUT)/%_32: %.c
+ $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl -lm
+
+$(BINARIES_64): $(OUTPUT)/%_64: %.c
+ $(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
+
+# x86_64 users should be encouraged to install 32-bit libraries
+ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01)
+all: warn_32bit_failure
+
+warn_32bit_failure:
+ @echo "Warning: you seem to have a broken 32-bit build" 2>&1; \
+ echo "environment. This will reduce test coverage of 64-bit" 2>&1; \
+ echo "kernels. If you are using a Debian-like distribution," 2>&1; \
+ echo "try:"; 2>&1; \
+ echo ""; \
+ echo " apt-get install gcc-multilib libc6-i386 libc6-dev-i386"; \
+ echo ""; \
+ echo "If you are using a Fedora-like distribution, try:"; \
+ echo ""; \
+ echo " yum install glibc-devel.*i686"; \
+ exit 0;
+endif
+
+# Some tests have additional dependencies.
+$(OUTPUT)/sysret_ss_attrs_64: thunks.S
+$(OUTPUT)/ptrace_syscall_32: raw_syscall_helper_32.S
+$(OUTPUT)/test_syscall_vdso_32: thunks_32.S
+
+# check_initial_reg_state is special: it needs a custom entry, and it
+# needs to be static so that its interpreter doesn't destroy its initial
+# state.
+$(OUTPUT)/check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start -static
+$(OUTPUT)/check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static
diff --git a/tools/testing/selftests/x86/check_cc.sh b/tools/testing/selftests/x86/check_cc.sh
new file mode 100755
index 000000000..356689c56
--- /dev/null
+++ b/tools/testing/selftests/x86/check_cc.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+# check_cc.sh - Helper to test userspace compilation support
+# Copyright (c) 2015 Andrew Lutomirski
+# GPL v2
+
+CC="$1"
+TESTPROG="$2"
+shift 2
+
+if [ -n "$CC" ] && $CC -o /dev/null "$TESTPROG" -O0 "$@" 2>/dev/null; then
+ echo 1
+else
+ echo 0
+fi
+
+exit 0
diff --git a/tools/testing/selftests/x86/check_initial_reg_state.c b/tools/testing/selftests/x86/check_initial_reg_state.c
new file mode 100644
index 000000000..6aaed9b85
--- /dev/null
+++ b/tools/testing/selftests/x86/check_initial_reg_state.c
@@ -0,0 +1,109 @@
+/*
+ * check_initial_reg_state.c - check that execve sets the correct state
+ * Copyright (c) 2014-2016 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+
+unsigned long ax, bx, cx, dx, si, di, bp, sp, flags;
+unsigned long r8, r9, r10, r11, r12, r13, r14, r15;
+
+asm (
+ ".pushsection .text\n\t"
+ ".type real_start, @function\n\t"
+ ".global real_start\n\t"
+ "real_start:\n\t"
+#ifdef __x86_64__
+ "mov %rax, ax\n\t"
+ "mov %rbx, bx\n\t"
+ "mov %rcx, cx\n\t"
+ "mov %rdx, dx\n\t"
+ "mov %rsi, si\n\t"
+ "mov %rdi, di\n\t"
+ "mov %rbp, bp\n\t"
+ "mov %rsp, sp\n\t"
+ "mov %r8, r8\n\t"
+ "mov %r9, r9\n\t"
+ "mov %r10, r10\n\t"
+ "mov %r11, r11\n\t"
+ "mov %r12, r12\n\t"
+ "mov %r13, r13\n\t"
+ "mov %r14, r14\n\t"
+ "mov %r15, r15\n\t"
+ "pushfq\n\t"
+ "popq flags\n\t"
+#else
+ "mov %eax, ax\n\t"
+ "mov %ebx, bx\n\t"
+ "mov %ecx, cx\n\t"
+ "mov %edx, dx\n\t"
+ "mov %esi, si\n\t"
+ "mov %edi, di\n\t"
+ "mov %ebp, bp\n\t"
+ "mov %esp, sp\n\t"
+ "pushfl\n\t"
+ "popl flags\n\t"
+#endif
+ "jmp _start\n\t"
+ ".size real_start, . - real_start\n\t"
+ ".popsection");
+
+int main()
+{
+ int nerrs = 0;
+
+ if (sp == 0) {
+ printf("[FAIL]\tTest was built incorrectly\n");
+ return 1;
+ }
+
+ if (ax || bx || cx || dx || si || di || bp
+#ifdef __x86_64__
+ || r8 || r9 || r10 || r11 || r12 || r13 || r14 || r15
+#endif
+ ) {
+ printf("[FAIL]\tAll GPRs except SP should be 0\n");
+#define SHOW(x) printf("\t" #x " = 0x%lx\n", x);
+ SHOW(ax);
+ SHOW(bx);
+ SHOW(cx);
+ SHOW(dx);
+ SHOW(si);
+ SHOW(di);
+ SHOW(bp);
+ SHOW(sp);
+#ifdef __x86_64__
+ SHOW(r8);
+ SHOW(r9);
+ SHOW(r10);
+ SHOW(r11);
+ SHOW(r12);
+ SHOW(r13);
+ SHOW(r14);
+ SHOW(r15);
+#endif
+ nerrs++;
+ } else {
+ printf("[OK]\tAll GPRs except SP are 0\n");
+ }
+
+ if (flags != 0x202) {
+ printf("[FAIL]\tFLAGS is 0x%lx, but it should be 0x202\n", flags);
+ nerrs++;
+ } else {
+ printf("[OK]\tFLAGS is 0x202\n");
+ }
+
+ return nerrs ? 1 : 0;
+}
diff --git a/tools/testing/selftests/x86/entry_from_vm86.c b/tools/testing/selftests/x86/entry_from_vm86.c
new file mode 100644
index 000000000..ade443a88
--- /dev/null
+++ b/tools/testing/selftests/x86/entry_from_vm86.c
@@ -0,0 +1,349 @@
+/*
+ * entry_from_vm86.c - tests kernel entries from vm86 mode
+ * Copyright (c) 2014-2015 Andrew Lutomirski
+ *
+ * This exercises a few paths that need to special-case vm86 mode.
+ *
+ * GPL v2.
+ */
+
+#define _GNU_SOURCE
+
+#include <assert.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <sys/signal.h>
+#include <sys/ucontext.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+#include <err.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <errno.h>
+#include <sys/vm86.h>
+
+static unsigned long load_addr = 0x10000;
+static int nerrs = 0;
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+ int flags)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = handler;
+ sa.sa_flags = SA_SIGINFO | flags;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+}
+
+static void clearhandler(int sig)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_handler = SIG_DFL;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+}
+
+static sig_atomic_t got_signal;
+
+static void sighandler(int sig, siginfo_t *info, void *ctx_void)
+{
+ ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+ if (ctx->uc_mcontext.gregs[REG_EFL] & X86_EFLAGS_VM ||
+ (ctx->uc_mcontext.gregs[REG_CS] & 3) != 3) {
+ printf("[FAIL]\tSignal frame should not reflect vm86 mode\n");
+ nerrs++;
+ }
+
+ const char *signame;
+ if (sig == SIGSEGV)
+ signame = "SIGSEGV";
+ else if (sig == SIGILL)
+ signame = "SIGILL";
+ else
+ signame = "unexpected signal";
+
+ printf("[INFO]\t%s: FLAGS = 0x%lx, CS = 0x%hx\n", signame,
+ (unsigned long)ctx->uc_mcontext.gregs[REG_EFL],
+ (unsigned short)ctx->uc_mcontext.gregs[REG_CS]);
+
+ got_signal = 1;
+}
+
+asm (
+ ".pushsection .rodata\n\t"
+ ".type vmcode_bound, @object\n\t"
+ "vmcode:\n\t"
+ "vmcode_bound:\n\t"
+ ".code16\n\t"
+ "bound %ax, (2048)\n\t"
+ "int3\n\t"
+ "vmcode_sysenter:\n\t"
+ "sysenter\n\t"
+ "vmcode_syscall:\n\t"
+ "syscall\n\t"
+ "vmcode_sti:\n\t"
+ "sti\n\t"
+ "vmcode_int3:\n\t"
+ "int3\n\t"
+ "vmcode_int80:\n\t"
+ "int $0x80\n\t"
+ "vmcode_popf_hlt:\n\t"
+ "push %ax\n\t"
+ "popf\n\t"
+ "hlt\n\t"
+ "vmcode_umip:\n\t"
+ /* addressing via displacements */
+ "smsw (2052)\n\t"
+ "sidt (2054)\n\t"
+ "sgdt (2060)\n\t"
+ /* addressing via registers */
+ "mov $2066, %bx\n\t"
+ "smsw (%bx)\n\t"
+ "mov $2068, %bx\n\t"
+ "sidt (%bx)\n\t"
+ "mov $2074, %bx\n\t"
+ "sgdt (%bx)\n\t"
+ /* register operands, only for smsw */
+ "smsw %ax\n\t"
+ "mov %ax, (2080)\n\t"
+ "int3\n\t"
+ "vmcode_umip_str:\n\t"
+ "str %eax\n\t"
+ "vmcode_umip_sldt:\n\t"
+ "sldt %eax\n\t"
+ "int3\n\t"
+ ".size vmcode, . - vmcode\n\t"
+ "end_vmcode:\n\t"
+ ".code32\n\t"
+ ".popsection"
+ );
+
+extern unsigned char vmcode[], end_vmcode[];
+extern unsigned char vmcode_bound[], vmcode_sysenter[], vmcode_syscall[],
+ vmcode_sti[], vmcode_int3[], vmcode_int80[], vmcode_popf_hlt[],
+ vmcode_umip[], vmcode_umip_str[], vmcode_umip_sldt[];
+
+/* Returns false if the test was skipped. */
+static bool do_test(struct vm86plus_struct *v86, unsigned long eip,
+ unsigned int rettype, unsigned int retarg,
+ const char *text)
+{
+ long ret;
+
+ printf("[RUN]\t%s from vm86 mode\n", text);
+ v86->regs.eip = eip;
+ ret = vm86(VM86_ENTER, v86);
+
+ if (ret == -1 && (errno == ENOSYS || errno == EPERM)) {
+ printf("[SKIP]\tvm86 %s\n",
+ errno == ENOSYS ? "not supported" : "not allowed");
+ return false;
+ }
+
+ if (VM86_TYPE(ret) == VM86_INTx) {
+ char trapname[32];
+ int trapno = VM86_ARG(ret);
+ if (trapno == 13)
+ strcpy(trapname, "GP");
+ else if (trapno == 5)
+ strcpy(trapname, "BR");
+ else if (trapno == 14)
+ strcpy(trapname, "PF");
+ else
+ sprintf(trapname, "%d", trapno);
+
+ printf("[INFO]\tExited vm86 mode due to #%s\n", trapname);
+ } else if (VM86_TYPE(ret) == VM86_UNKNOWN) {
+ printf("[INFO]\tExited vm86 mode due to unhandled GP fault\n");
+ } else if (VM86_TYPE(ret) == VM86_TRAP) {
+ printf("[INFO]\tExited vm86 mode due to a trap (arg=%ld)\n",
+ VM86_ARG(ret));
+ } else if (VM86_TYPE(ret) == VM86_SIGNAL) {
+ printf("[INFO]\tExited vm86 mode due to a signal\n");
+ } else if (VM86_TYPE(ret) == VM86_STI) {
+ printf("[INFO]\tExited vm86 mode due to STI\n");
+ } else {
+ printf("[INFO]\tExited vm86 mode due to type %ld, arg %ld\n",
+ VM86_TYPE(ret), VM86_ARG(ret));
+ }
+
+ if (rettype == -1 ||
+ (VM86_TYPE(ret) == rettype && VM86_ARG(ret) == retarg)) {
+ printf("[OK]\tReturned correctly\n");
+ } else {
+ printf("[FAIL]\tIncorrect return reason (started at eip = 0x%lx, ended at eip = 0x%lx)\n", eip, v86->regs.eip);
+ nerrs++;
+ }
+
+ return true;
+}
+
+void do_umip_tests(struct vm86plus_struct *vm86, unsigned char *test_mem)
+{
+ struct table_desc {
+ unsigned short limit;
+ unsigned long base;
+ } __attribute__((packed));
+
+ /* Initialize variables with arbitrary values */
+ struct table_desc gdt1 = { .base = 0x3c3c3c3c, .limit = 0x9999 };
+ struct table_desc gdt2 = { .base = 0x1a1a1a1a, .limit = 0xaeae };
+ struct table_desc idt1 = { .base = 0x7b7b7b7b, .limit = 0xf1f1 };
+ struct table_desc idt2 = { .base = 0x89898989, .limit = 0x1313 };
+ unsigned short msw1 = 0x1414, msw2 = 0x2525, msw3 = 3737;
+
+ /* UMIP -- exit with INT3 unless kernel emulation did not trap #GP */
+ do_test(vm86, vmcode_umip - vmcode, VM86_TRAP, 3, "UMIP tests");
+
+ /* Results from displacement-only addressing */
+ msw1 = *(unsigned short *)(test_mem + 2052);
+ memcpy(&idt1, test_mem + 2054, sizeof(idt1));
+ memcpy(&gdt1, test_mem + 2060, sizeof(gdt1));
+
+ /* Results from register-indirect addressing */
+ msw2 = *(unsigned short *)(test_mem + 2066);
+ memcpy(&idt2, test_mem + 2068, sizeof(idt2));
+ memcpy(&gdt2, test_mem + 2074, sizeof(gdt2));
+
+ /* Results when using register operands */
+ msw3 = *(unsigned short *)(test_mem + 2080);
+
+ printf("[INFO]\tResult from SMSW:[0x%04x]\n", msw1);
+ printf("[INFO]\tResult from SIDT: limit[0x%04x]base[0x%08lx]\n",
+ idt1.limit, idt1.base);
+ printf("[INFO]\tResult from SGDT: limit[0x%04x]base[0x%08lx]\n",
+ gdt1.limit, gdt1.base);
+
+ if (msw1 != msw2 || msw1 != msw3)
+ printf("[FAIL]\tAll the results of SMSW should be the same.\n");
+ else
+ printf("[PASS]\tAll the results from SMSW are identical.\n");
+
+ if (memcmp(&gdt1, &gdt2, sizeof(gdt1)))
+ printf("[FAIL]\tAll the results of SGDT should be the same.\n");
+ else
+ printf("[PASS]\tAll the results from SGDT are identical.\n");
+
+ if (memcmp(&idt1, &idt2, sizeof(idt1)))
+ printf("[FAIL]\tAll the results of SIDT should be the same.\n");
+ else
+ printf("[PASS]\tAll the results from SIDT are identical.\n");
+
+ sethandler(SIGILL, sighandler, 0);
+ do_test(vm86, vmcode_umip_str - vmcode, VM86_SIGNAL, 0,
+ "STR instruction");
+ clearhandler(SIGILL);
+
+ sethandler(SIGILL, sighandler, 0);
+ do_test(vm86, vmcode_umip_sldt - vmcode, VM86_SIGNAL, 0,
+ "SLDT instruction");
+ clearhandler(SIGILL);
+}
+
+int main(void)
+{
+ struct vm86plus_struct v86;
+ unsigned char *addr = mmap((void *)load_addr, 4096,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1,0);
+ if (addr != (unsigned char *)load_addr)
+ err(1, "mmap");
+
+ memcpy(addr, vmcode, end_vmcode - vmcode);
+ addr[2048] = 2;
+ addr[2050] = 3;
+
+ memset(&v86, 0, sizeof(v86));
+
+ v86.regs.cs = load_addr / 16;
+ v86.regs.ss = load_addr / 16;
+ v86.regs.ds = load_addr / 16;
+ v86.regs.es = load_addr / 16;
+
+ /* Use the end of the page as our stack. */
+ v86.regs.esp = 4096;
+
+ assert((v86.regs.cs & 3) == 0); /* Looks like RPL = 0 */
+
+ /* #BR -- should deliver SIG??? */
+ do_test(&v86, vmcode_bound - vmcode, VM86_INTx, 5, "#BR");
+
+ /*
+ * SYSENTER -- should cause #GP or #UD depending on CPU.
+ * Expected return type -1 means that we shouldn't validate
+ * the vm86 return value. This will avoid problems on non-SEP
+ * CPUs.
+ */
+ sethandler(SIGILL, sighandler, 0);
+ do_test(&v86, vmcode_sysenter - vmcode, -1, 0, "SYSENTER");
+ clearhandler(SIGILL);
+
+ /*
+ * SYSCALL would be a disaster in VM86 mode. Fortunately,
+ * there is no kernel that both enables SYSCALL and sets
+ * EFER.SCE, so it's #UD on all systems. But vm86 is
+ * buggy (or has a "feature"), so the SIGILL will actually
+ * be delivered.
+ */
+ sethandler(SIGILL, sighandler, 0);
+ do_test(&v86, vmcode_syscall - vmcode, VM86_SIGNAL, 0, "SYSCALL");
+ clearhandler(SIGILL);
+
+ /* STI with VIP set */
+ v86.regs.eflags |= X86_EFLAGS_VIP;
+ v86.regs.eflags &= ~X86_EFLAGS_IF;
+ do_test(&v86, vmcode_sti - vmcode, VM86_STI, 0, "STI with VIP set");
+
+ /* POPF with VIP set but IF clear: should not trap */
+ v86.regs.eflags = X86_EFLAGS_VIP;
+ v86.regs.eax = 0;
+ do_test(&v86, vmcode_popf_hlt - vmcode, VM86_UNKNOWN, 0, "POPF with VIP set and IF clear");
+
+ /* POPF with VIP set and IF set: should trap */
+ v86.regs.eflags = X86_EFLAGS_VIP;
+ v86.regs.eax = X86_EFLAGS_IF;
+ do_test(&v86, vmcode_popf_hlt - vmcode, VM86_STI, 0, "POPF with VIP and IF set");
+
+ /* POPF with VIP clear and IF set: should not trap */
+ v86.regs.eflags = 0;
+ v86.regs.eax = X86_EFLAGS_IF;
+ do_test(&v86, vmcode_popf_hlt - vmcode, VM86_UNKNOWN, 0, "POPF with VIP clear and IF set");
+
+ v86.regs.eflags = 0;
+
+ /* INT3 -- should cause #BP */
+ do_test(&v86, vmcode_int3 - vmcode, VM86_TRAP, 3, "INT3");
+
+ /* INT80 -- should exit with "INTx 0x80" */
+ v86.regs.eax = (unsigned int)-1;
+ do_test(&v86, vmcode_int80 - vmcode, VM86_INTx, 0x80, "int80");
+
+ /* UMIP -- should exit with INTx 0x80 unless UMIP was not disabled */
+ do_umip_tests(&v86, addr);
+
+ /* Execute a null pointer */
+ v86.regs.cs = 0;
+ v86.regs.ss = 0;
+ sethandler(SIGSEGV, sighandler, 0);
+ got_signal = 0;
+ if (do_test(&v86, 0, VM86_SIGNAL, 0, "Execute null pointer") &&
+ !got_signal) {
+ printf("[FAIL]\tDid not receive SIGSEGV\n");
+ nerrs++;
+ }
+ clearhandler(SIGSEGV);
+
+ /* Make sure nothing explodes if we fork. */
+ if (fork() == 0)
+ return 0;
+
+ return (nerrs == 0 ? 0 : 1);
+}
diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c
new file mode 100644
index 000000000..f249e042b
--- /dev/null
+++ b/tools/testing/selftests/x86/fsgsbase.c
@@ -0,0 +1,427 @@
+/*
+ * fsgsbase.c, an fsgsbase test
+ * Copyright (c) 2014-2016 Andy Lutomirski
+ * GPL v2
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <err.h>
+#include <sys/user.h>
+#include <asm/prctl.h>
+#include <sys/prctl.h>
+#include <signal.h>
+#include <limits.h>
+#include <sys/ucontext.h>
+#include <sched.h>
+#include <linux/futex.h>
+#include <pthread.h>
+#include <asm/ldt.h>
+#include <sys/mman.h>
+
+#ifndef __x86_64__
+# error This test is 64-bit only
+#endif
+
+static volatile sig_atomic_t want_segv;
+static volatile unsigned long segv_addr;
+
+static int nerrs;
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+ int flags)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = handler;
+ sa.sa_flags = SA_SIGINFO | flags;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+}
+
+static void clearhandler(int sig)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_handler = SIG_DFL;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+}
+
+static void sigsegv(int sig, siginfo_t *si, void *ctx_void)
+{
+ ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+ if (!want_segv) {
+ clearhandler(SIGSEGV);
+ return; /* Crash cleanly. */
+ }
+
+ want_segv = false;
+ segv_addr = (unsigned long)si->si_addr;
+
+ ctx->uc_mcontext.gregs[REG_RIP] += 4; /* Skip the faulting mov */
+
+}
+
+enum which_base { FS, GS };
+
+static unsigned long read_base(enum which_base which)
+{
+ unsigned long offset;
+ /*
+ * Unless we have FSGSBASE, there's no direct way to do this from
+ * user mode. We can get at it indirectly using signals, though.
+ */
+
+ want_segv = true;
+
+ offset = 0;
+ if (which == FS) {
+ /* Use a constant-length instruction here. */
+ asm volatile ("mov %%fs:(%%rcx), %%rax" : : "c" (offset) : "rax");
+ } else {
+ asm volatile ("mov %%gs:(%%rcx), %%rax" : : "c" (offset) : "rax");
+ }
+ if (!want_segv)
+ return segv_addr + offset;
+
+ /*
+ * If that didn't segfault, try the other end of the address space.
+ * Unless we get really unlucky and run into the vsyscall page, this
+ * is guaranteed to segfault.
+ */
+
+ offset = (ULONG_MAX >> 1) + 1;
+ if (which == FS) {
+ asm volatile ("mov %%fs:(%%rcx), %%rax"
+ : : "c" (offset) : "rax");
+ } else {
+ asm volatile ("mov %%gs:(%%rcx), %%rax"
+ : : "c" (offset) : "rax");
+ }
+ if (!want_segv)
+ return segv_addr + offset;
+
+ abort();
+}
+
+static void check_gs_value(unsigned long value)
+{
+ unsigned long base;
+ unsigned short sel;
+
+ printf("[RUN]\tARCH_SET_GS to 0x%lx\n", value);
+ if (syscall(SYS_arch_prctl, ARCH_SET_GS, value) != 0)
+ err(1, "ARCH_SET_GS");
+
+ asm volatile ("mov %%gs, %0" : "=rm" (sel));
+ base = read_base(GS);
+ if (base == value) {
+ printf("[OK]\tGSBASE was set as expected (selector 0x%hx)\n",
+ sel);
+ } else {
+ nerrs++;
+ printf("[FAIL]\tGSBASE was not as expected: got 0x%lx (selector 0x%hx)\n",
+ base, sel);
+ }
+
+ if (syscall(SYS_arch_prctl, ARCH_GET_GS, &base) != 0)
+ err(1, "ARCH_GET_GS");
+ if (base == value) {
+ printf("[OK]\tARCH_GET_GS worked as expected (selector 0x%hx)\n",
+ sel);
+ } else {
+ nerrs++;
+ printf("[FAIL]\tARCH_GET_GS was not as expected: got 0x%lx (selector 0x%hx)\n",
+ base, sel);
+ }
+}
+
+static void mov_0_gs(unsigned long initial_base, bool schedule)
+{
+ unsigned long base, arch_base;
+
+ printf("[RUN]\tARCH_SET_GS to 0x%lx then mov 0 to %%gs%s\n", initial_base, schedule ? " and schedule " : "");
+ if (syscall(SYS_arch_prctl, ARCH_SET_GS, initial_base) != 0)
+ err(1, "ARCH_SET_GS");
+
+ if (schedule)
+ usleep(10);
+
+ asm volatile ("mov %0, %%gs" : : "rm" (0));
+ base = read_base(GS);
+ if (syscall(SYS_arch_prctl, ARCH_GET_GS, &arch_base) != 0)
+ err(1, "ARCH_GET_GS");
+ if (base == arch_base) {
+ printf("[OK]\tGSBASE is 0x%lx\n", base);
+ } else {
+ nerrs++;
+ printf("[FAIL]\tGSBASE changed to 0x%lx but kernel reports 0x%lx\n", base, arch_base);
+ }
+}
+
+static volatile unsigned long remote_base;
+static volatile bool remote_hard_zero;
+static volatile unsigned int ftx;
+
+/*
+ * ARCH_SET_FS/GS(0) may or may not program a selector of zero. HARD_ZERO
+ * means to force the selector to zero to improve test coverage.
+ */
+#define HARD_ZERO 0xa1fa5f343cb85fa4
+
+static void do_remote_base()
+{
+ unsigned long to_set = remote_base;
+ bool hard_zero = false;
+ if (to_set == HARD_ZERO) {
+ to_set = 0;
+ hard_zero = true;
+ }
+
+ if (syscall(SYS_arch_prctl, ARCH_SET_GS, to_set) != 0)
+ err(1, "ARCH_SET_GS");
+
+ if (hard_zero)
+ asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0));
+
+ unsigned short sel;
+ asm volatile ("mov %%gs, %0" : "=rm" (sel));
+ printf("\tother thread: ARCH_SET_GS(0x%lx)%s -- sel is 0x%hx\n",
+ to_set, hard_zero ? " and clear gs" : "", sel);
+}
+
+void do_unexpected_base(void)
+{
+ /*
+ * The goal here is to try to arrange for GS == 0, GSBASE !=
+ * 0, and for the the kernel the think that GSBASE == 0.
+ *
+ * To make the test as reliable as possible, this uses
+ * explicit descriptorss. (This is not the only way. This
+ * could use ARCH_SET_GS with a low, nonzero base, but the
+ * relevant side effect of ARCH_SET_GS could change.)
+ */
+
+ /* Step 1: tell the kernel that we have GSBASE == 0. */
+ if (syscall(SYS_arch_prctl, ARCH_SET_GS, 0) != 0)
+ err(1, "ARCH_SET_GS");
+
+ /* Step 2: change GSBASE without telling the kernel. */
+ struct user_desc desc = {
+ .entry_number = 0,
+ .base_addr = 0xBAADF00D,
+ .limit = 0xfffff,
+ .seg_32bit = 1,
+ .contents = 0, /* Data, grow-up */
+ .read_exec_only = 0,
+ .limit_in_pages = 1,
+ .seg_not_present = 0,
+ .useable = 0
+ };
+ if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) == 0) {
+ printf("\tother thread: using LDT slot 0\n");
+ asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0x7));
+ } else {
+ /* No modify_ldt for us (configured out, perhaps) */
+
+ struct user_desc *low_desc = mmap(
+ NULL, sizeof(desc),
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);
+ memcpy(low_desc, &desc, sizeof(desc));
+
+ low_desc->entry_number = -1;
+
+ /* 32-bit set_thread_area */
+ long ret;
+ asm volatile ("int $0x80"
+ : "=a" (ret) : "a" (243), "b" (low_desc)
+ : "r8", "r9", "r10", "r11");
+ memcpy(&desc, low_desc, sizeof(desc));
+ munmap(low_desc, sizeof(desc));
+
+ if (ret != 0) {
+ printf("[NOTE]\tcould not create a segment -- test won't do anything\n");
+ return;
+ }
+ printf("\tother thread: using GDT slot %d\n", desc.entry_number);
+ asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)((desc.entry_number << 3) | 0x3)));
+ }
+
+ /*
+ * Step 3: set the selector back to zero. On AMD chips, this will
+ * preserve GSBASE.
+ */
+
+ asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0));
+}
+
+static void *threadproc(void *ctx)
+{
+ while (1) {
+ while (ftx == 0)
+ syscall(SYS_futex, &ftx, FUTEX_WAIT, 0, NULL, NULL, 0);
+ if (ftx == 3)
+ return NULL;
+
+ if (ftx == 1)
+ do_remote_base();
+ else if (ftx == 2)
+ do_unexpected_base();
+ else
+ errx(1, "helper thread got bad command");
+
+ ftx = 0;
+ syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+ }
+}
+
+static void set_gs_and_switch_to(unsigned long local,
+ unsigned short force_sel,
+ unsigned long remote)
+{
+ unsigned long base;
+ unsigned short sel_pre_sched, sel_post_sched;
+
+ bool hard_zero = false;
+ if (local == HARD_ZERO) {
+ hard_zero = true;
+ local = 0;
+ }
+
+ printf("[RUN]\tARCH_SET_GS(0x%lx)%s, then schedule to 0x%lx\n",
+ local, hard_zero ? " and clear gs" : "", remote);
+ if (force_sel)
+ printf("\tBefore schedule, set selector to 0x%hx\n", force_sel);
+ if (syscall(SYS_arch_prctl, ARCH_SET_GS, local) != 0)
+ err(1, "ARCH_SET_GS");
+ if (hard_zero)
+ asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0));
+
+ if (read_base(GS) != local) {
+ nerrs++;
+ printf("[FAIL]\tGSBASE wasn't set as expected\n");
+ }
+
+ if (force_sel) {
+ asm volatile ("mov %0, %%gs" : : "rm" (force_sel));
+ sel_pre_sched = force_sel;
+ local = read_base(GS);
+
+ /*
+ * Signal delivery seems to mess up weird selectors. Put it
+ * back.
+ */
+ asm volatile ("mov %0, %%gs" : : "rm" (force_sel));
+ } else {
+ asm volatile ("mov %%gs, %0" : "=rm" (sel_pre_sched));
+ }
+
+ remote_base = remote;
+ ftx = 1;
+ syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+ while (ftx != 0)
+ syscall(SYS_futex, &ftx, FUTEX_WAIT, 1, NULL, NULL, 0);
+
+ asm volatile ("mov %%gs, %0" : "=rm" (sel_post_sched));
+ base = read_base(GS);
+ if (base == local && sel_pre_sched == sel_post_sched) {
+ printf("[OK]\tGS/BASE remained 0x%hx/0x%lx\n",
+ sel_pre_sched, local);
+ } else {
+ nerrs++;
+ printf("[FAIL]\tGS/BASE changed from 0x%hx/0x%lx to 0x%hx/0x%lx\n",
+ sel_pre_sched, local, sel_post_sched, base);
+ }
+}
+
+static void test_unexpected_base(void)
+{
+ unsigned long base;
+
+ printf("[RUN]\tARCH_SET_GS(0), clear gs, then manipulate GSBASE in a different thread\n");
+ if (syscall(SYS_arch_prctl, ARCH_SET_GS, 0) != 0)
+ err(1, "ARCH_SET_GS");
+ asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0));
+
+ ftx = 2;
+ syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+ while (ftx != 0)
+ syscall(SYS_futex, &ftx, FUTEX_WAIT, 1, NULL, NULL, 0);
+
+ base = read_base(GS);
+ if (base == 0) {
+ printf("[OK]\tGSBASE remained 0\n");
+ } else {
+ nerrs++;
+ printf("[FAIL]\tGSBASE changed to 0x%lx\n", base);
+ }
+}
+
+int main()
+{
+ pthread_t thread;
+
+ sethandler(SIGSEGV, sigsegv, 0);
+
+ check_gs_value(0);
+ check_gs_value(1);
+ check_gs_value(0x200000000);
+ check_gs_value(0);
+ check_gs_value(0x200000000);
+ check_gs_value(1);
+
+ for (int sched = 0; sched < 2; sched++) {
+ mov_0_gs(0, !!sched);
+ mov_0_gs(1, !!sched);
+ mov_0_gs(0x200000000, !!sched);
+ }
+
+ /* Set up for multithreading. */
+
+ cpu_set_t cpuset;
+ CPU_ZERO(&cpuset);
+ CPU_SET(0, &cpuset);
+ if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
+ err(1, "sched_setaffinity to CPU 0"); /* should never fail */
+
+ if (pthread_create(&thread, 0, threadproc, 0) != 0)
+ err(1, "pthread_create");
+
+ static unsigned long bases_with_hard_zero[] = {
+ 0, HARD_ZERO, 1, 0x200000000,
+ };
+
+ for (int local = 0; local < 4; local++) {
+ for (int remote = 0; remote < 4; remote++) {
+ for (unsigned short s = 0; s < 5; s++) {
+ unsigned short sel = s;
+ if (s == 4)
+ asm ("mov %%ss, %0" : "=rm" (sel));
+ set_gs_and_switch_to(
+ bases_with_hard_zero[local],
+ sel,
+ bases_with_hard_zero[remote]);
+ }
+ }
+ }
+
+ test_unexpected_base();
+
+ ftx = 3; /* Kill the thread. */
+ syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+
+ if (pthread_join(thread, NULL) != 0)
+ err(1, "pthread_join");
+
+ return nerrs == 0 ? 0 : 1;
+}
diff --git a/tools/testing/selftests/x86/ioperm.c b/tools/testing/selftests/x86/ioperm.c
new file mode 100644
index 000000000..01de41c1b
--- /dev/null
+++ b/tools/testing/selftests/x86/ioperm.c
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ioperm.c - Test case for ioperm(2)
+ * Copyright (c) 2015 Andrew Lutomirski
+ */
+
+#define _GNU_SOURCE
+#include <err.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <signal.h>
+#include <setjmp.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdbool.h>
+#include <sched.h>
+#include <sys/io.h>
+
+static int nerrs = 0;
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+ int flags)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = handler;
+ sa.sa_flags = SA_SIGINFO | flags;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+
+}
+
+static void clearhandler(int sig)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_handler = SIG_DFL;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+}
+
+static jmp_buf jmpbuf;
+
+static void sigsegv(int sig, siginfo_t *si, void *ctx_void)
+{
+ siglongjmp(jmpbuf, 1);
+}
+
+static bool try_outb(unsigned short port)
+{
+ sethandler(SIGSEGV, sigsegv, SA_RESETHAND);
+ if (sigsetjmp(jmpbuf, 1) != 0) {
+ return false;
+ } else {
+ asm volatile ("outb %%al, %w[port]"
+ : : [port] "Nd" (port), "a" (0));
+ return true;
+ }
+ clearhandler(SIGSEGV);
+}
+
+static void expect_ok(unsigned short port)
+{
+ if (!try_outb(port)) {
+ printf("[FAIL]\toutb to 0x%02hx failed\n", port);
+ exit(1);
+ }
+
+ printf("[OK]\toutb to 0x%02hx worked\n", port);
+}
+
+static void expect_gp(unsigned short port)
+{
+ if (try_outb(port)) {
+ printf("[FAIL]\toutb to 0x%02hx worked\n", port);
+ exit(1);
+ }
+
+ printf("[OK]\toutb to 0x%02hx failed\n", port);
+}
+
+int main(void)
+{
+ cpu_set_t cpuset;
+ CPU_ZERO(&cpuset);
+ CPU_SET(0, &cpuset);
+ if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
+ err(1, "sched_setaffinity to CPU 0");
+
+ expect_gp(0x80);
+ expect_gp(0xed);
+
+ /*
+ * Probe for ioperm support. Note that clearing ioperm bits
+ * works even as nonroot.
+ */
+ printf("[RUN]\tenable 0x80\n");
+ if (ioperm(0x80, 1, 1) != 0) {
+ printf("[OK]\tioperm(0x80, 1, 1) failed (%d) -- try running as root\n",
+ errno);
+ return 0;
+ }
+ expect_ok(0x80);
+ expect_gp(0xed);
+
+ printf("[RUN]\tdisable 0x80\n");
+ if (ioperm(0x80, 1, 0) != 0) {
+ printf("[FAIL]\tioperm(0x80, 1, 0) failed (%d)", errno);
+ return 1;
+ }
+ expect_gp(0x80);
+ expect_gp(0xed);
+
+ /* Make sure that fork() preserves ioperm. */
+ if (ioperm(0x80, 1, 1) != 0) {
+ printf("[FAIL]\tioperm(0x80, 1, 0) failed (%d)", errno);
+ return 1;
+ }
+
+ pid_t child = fork();
+ if (child == -1)
+ err(1, "fork");
+
+ if (child == 0) {
+ printf("[RUN]\tchild: check that we inherited permissions\n");
+ expect_ok(0x80);
+ expect_gp(0xed);
+ return 0;
+ } else {
+ int status;
+ if (waitpid(child, &status, 0) != child ||
+ !WIFEXITED(status)) {
+ printf("[FAIL]\tChild died\n");
+ nerrs++;
+ } else if (WEXITSTATUS(status) != 0) {
+ printf("[FAIL]\tChild failed\n");
+ nerrs++;
+ } else {
+ printf("[OK]\tChild succeeded\n");
+ }
+ }
+
+ /* Test the capability checks. */
+
+ printf("\tDrop privileges\n");
+ if (setresuid(1, 1, 1) != 0) {
+ printf("[WARN]\tDropping privileges failed\n");
+ return 0;
+ }
+
+ printf("[RUN]\tdisable 0x80\n");
+ if (ioperm(0x80, 1, 0) != 0) {
+ printf("[FAIL]\tioperm(0x80, 1, 0) failed (%d)", errno);
+ return 1;
+ }
+ printf("[OK]\tit worked\n");
+
+ printf("[RUN]\tenable 0x80 again\n");
+ if (ioperm(0x80, 1, 1) == 0) {
+ printf("[FAIL]\tit succeeded but should have failed.\n");
+ return 1;
+ }
+ printf("[OK]\tit failed\n");
+ return 0;
+}
diff --git a/tools/testing/selftests/x86/iopl.c b/tools/testing/selftests/x86/iopl.c
new file mode 100644
index 000000000..6aa27f346
--- /dev/null
+++ b/tools/testing/selftests/x86/iopl.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * iopl.c - Test case for a Linux on Xen 64-bit bug
+ * Copyright (c) 2015 Andrew Lutomirski
+ */
+
+#define _GNU_SOURCE
+#include <err.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <signal.h>
+#include <setjmp.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdbool.h>
+#include <sched.h>
+#include <sys/io.h>
+
+static int nerrs = 0;
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+ int flags)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = handler;
+ sa.sa_flags = SA_SIGINFO | flags;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+
+}
+
+static jmp_buf jmpbuf;
+
+static void sigsegv(int sig, siginfo_t *si, void *ctx_void)
+{
+ siglongjmp(jmpbuf, 1);
+}
+
+int main(void)
+{
+ cpu_set_t cpuset;
+ CPU_ZERO(&cpuset);
+ CPU_SET(0, &cpuset);
+ if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
+ err(1, "sched_setaffinity to CPU 0");
+
+ /* Probe for iopl support. Note that iopl(0) works even as nonroot. */
+ if (iopl(3) != 0) {
+ printf("[OK]\tiopl(3) failed (%d) -- try running as root\n",
+ errno);
+ return 0;
+ }
+
+ /* Restore our original state prior to starting the test. */
+ if (iopl(0) != 0)
+ err(1, "iopl(0)");
+
+ pid_t child = fork();
+ if (child == -1)
+ err(1, "fork");
+
+ if (child == 0) {
+ printf("\tchild: set IOPL to 3\n");
+ if (iopl(3) != 0)
+ err(1, "iopl");
+
+ printf("[RUN]\tchild: write to 0x80\n");
+ asm volatile ("outb %%al, $0x80" : : "a" (0));
+
+ return 0;
+ } else {
+ int status;
+ if (waitpid(child, &status, 0) != child ||
+ !WIFEXITED(status)) {
+ printf("[FAIL]\tChild died\n");
+ nerrs++;
+ } else if (WEXITSTATUS(status) != 0) {
+ printf("[FAIL]\tChild failed\n");
+ nerrs++;
+ } else {
+ printf("[OK]\tChild succeeded\n");
+ }
+ }
+
+ printf("[RUN]\tparent: write to 0x80 (should fail)\n");
+
+ sethandler(SIGSEGV, sigsegv, 0);
+ if (sigsetjmp(jmpbuf, 1) != 0) {
+ printf("[OK]\twrite was denied\n");
+ } else {
+ asm volatile ("outb %%al, $0x80" : : "a" (0));
+ printf("[FAIL]\twrite was allowed\n");
+ nerrs++;
+ }
+
+ /* Test the capability checks. */
+ printf("\tiopl(3)\n");
+ if (iopl(3) != 0)
+ err(1, "iopl(3)");
+
+ printf("\tDrop privileges\n");
+ if (setresuid(1, 1, 1) != 0) {
+ printf("[WARN]\tDropping privileges failed\n");
+ goto done;
+ }
+
+ printf("[RUN]\tiopl(3) unprivileged but with IOPL==3\n");
+ if (iopl(3) != 0) {
+ printf("[FAIL]\tiopl(3) should work if iopl is already 3 even if unprivileged\n");
+ nerrs++;
+ }
+
+ printf("[RUN]\tiopl(0) unprivileged\n");
+ if (iopl(0) != 0) {
+ printf("[FAIL]\tiopl(0) should work if iopl is already 3 even if unprivileged\n");
+ nerrs++;
+ }
+
+ printf("[RUN]\tiopl(3) unprivileged\n");
+ if (iopl(3) == 0) {
+ printf("[FAIL]\tiopl(3) should fail if when unprivileged if iopl==0\n");
+ nerrs++;
+ } else {
+ printf("[OK]\tFailed as expected\n");
+ }
+
+done:
+ return nerrs ? 1 : 0;
+}
+
diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c
new file mode 100644
index 000000000..1aef72df2
--- /dev/null
+++ b/tools/testing/selftests/x86/ldt_gdt.c
@@ -0,0 +1,927 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ldt_gdt.c - Test cases for LDT and GDT access
+ * Copyright (c) 2015 Andrew Lutomirski
+ */
+
+#define _GNU_SOURCE
+#include <err.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <signal.h>
+#include <setjmp.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <asm/ldt.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdbool.h>
+#include <pthread.h>
+#include <sched.h>
+#include <linux/futex.h>
+#include <sys/mman.h>
+#include <asm/prctl.h>
+#include <sys/prctl.h>
+
+#define AR_ACCESSED (1<<8)
+
+#define AR_TYPE_RODATA (0 * (1<<9))
+#define AR_TYPE_RWDATA (1 * (1<<9))
+#define AR_TYPE_RODATA_EXPDOWN (2 * (1<<9))
+#define AR_TYPE_RWDATA_EXPDOWN (3 * (1<<9))
+#define AR_TYPE_XOCODE (4 * (1<<9))
+#define AR_TYPE_XRCODE (5 * (1<<9))
+#define AR_TYPE_XOCODE_CONF (6 * (1<<9))
+#define AR_TYPE_XRCODE_CONF (7 * (1<<9))
+
+#define AR_DPL3 (3 * (1<<13))
+
+#define AR_S (1 << 12)
+#define AR_P (1 << 15)
+#define AR_AVL (1 << 20)
+#define AR_L (1 << 21)
+#define AR_DB (1 << 22)
+#define AR_G (1 << 23)
+
+#ifdef __x86_64__
+# define INT80_CLOBBERS "r8", "r9", "r10", "r11"
+#else
+# define INT80_CLOBBERS
+#endif
+
+static int nerrs;
+
+/* Points to an array of 1024 ints, each holding its own index. */
+static const unsigned int *counter_page;
+static struct user_desc *low_user_desc;
+static struct user_desc *low_user_desc_clear; /* Use to delete GDT entry */
+static int gdt_entry_num;
+
+static void check_invalid_segment(uint16_t index, int ldt)
+{
+ uint32_t has_limit = 0, has_ar = 0, limit, ar;
+ uint32_t selector = (index << 3) | (ldt << 2) | 3;
+
+ asm ("lsl %[selector], %[limit]\n\t"
+ "jnz 1f\n\t"
+ "movl $1, %[has_limit]\n\t"
+ "1:"
+ : [limit] "=r" (limit), [has_limit] "+rm" (has_limit)
+ : [selector] "r" (selector));
+ asm ("larl %[selector], %[ar]\n\t"
+ "jnz 1f\n\t"
+ "movl $1, %[has_ar]\n\t"
+ "1:"
+ : [ar] "=r" (ar), [has_ar] "+rm" (has_ar)
+ : [selector] "r" (selector));
+
+ if (has_limit || has_ar) {
+ printf("[FAIL]\t%s entry %hu is valid but should be invalid\n",
+ (ldt ? "LDT" : "GDT"), index);
+ nerrs++;
+ } else {
+ printf("[OK]\t%s entry %hu is invalid\n",
+ (ldt ? "LDT" : "GDT"), index);
+ }
+}
+
+static void check_valid_segment(uint16_t index, int ldt,
+ uint32_t expected_ar, uint32_t expected_limit,
+ bool verbose)
+{
+ uint32_t has_limit = 0, has_ar = 0, limit, ar;
+ uint32_t selector = (index << 3) | (ldt << 2) | 3;
+
+ asm ("lsl %[selector], %[limit]\n\t"
+ "jnz 1f\n\t"
+ "movl $1, %[has_limit]\n\t"
+ "1:"
+ : [limit] "=r" (limit), [has_limit] "+rm" (has_limit)
+ : [selector] "r" (selector));
+ asm ("larl %[selector], %[ar]\n\t"
+ "jnz 1f\n\t"
+ "movl $1, %[has_ar]\n\t"
+ "1:"
+ : [ar] "=r" (ar), [has_ar] "+rm" (has_ar)
+ : [selector] "r" (selector));
+
+ if (!has_limit || !has_ar) {
+ printf("[FAIL]\t%s entry %hu is invalid but should be valid\n",
+ (ldt ? "LDT" : "GDT"), index);
+ nerrs++;
+ return;
+ }
+
+ /* The SDM says "bits 19:16 are undefined". Thanks. */
+ ar &= ~0xF0000;
+
+ /*
+ * NB: Different Linux versions do different things with the
+ * accessed bit in set_thread_area().
+ */
+ if (ar != expected_ar && ar != (expected_ar | AR_ACCESSED)) {
+ printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n",
+ (ldt ? "LDT" : "GDT"), index, ar, expected_ar);
+ nerrs++;
+ } else if (limit != expected_limit) {
+ printf("[FAIL]\t%s entry %hu has limit 0x%08X but expected 0x%08X\n",
+ (ldt ? "LDT" : "GDT"), index, limit, expected_limit);
+ nerrs++;
+ } else if (verbose) {
+ printf("[OK]\t%s entry %hu has AR 0x%08X and limit 0x%08X\n",
+ (ldt ? "LDT" : "GDT"), index, ar, limit);
+ }
+}
+
+static bool install_valid_mode(const struct user_desc *d, uint32_t ar,
+ bool oldmode, bool ldt)
+{
+ struct user_desc desc = *d;
+ int ret;
+
+ if (!ldt) {
+#ifndef __i386__
+ /* No point testing set_thread_area in a 64-bit build */
+ return false;
+#endif
+ if (!gdt_entry_num)
+ return false;
+ desc.entry_number = gdt_entry_num;
+
+ ret = syscall(SYS_set_thread_area, &desc);
+ } else {
+ ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11,
+ &desc, sizeof(desc));
+
+ if (ret < -1)
+ errno = -ret;
+
+ if (ret != 0 && errno == ENOSYS) {
+ printf("[OK]\tmodify_ldt returned -ENOSYS\n");
+ return false;
+ }
+ }
+
+ if (ret == 0) {
+ uint32_t limit = desc.limit;
+ if (desc.limit_in_pages)
+ limit = (limit << 12) + 4095;
+ check_valid_segment(desc.entry_number, ldt, ar, limit, true);
+ return true;
+ } else {
+ if (desc.seg_32bit) {
+ printf("[FAIL]\tUnexpected %s failure %d\n",
+ ldt ? "modify_ldt" : "set_thread_area",
+ errno);
+ nerrs++;
+ return false;
+ } else {
+ printf("[OK]\t%s rejected 16 bit segment\n",
+ ldt ? "modify_ldt" : "set_thread_area");
+ return false;
+ }
+ }
+}
+
+static bool install_valid(const struct user_desc *desc, uint32_t ar)
+{
+ bool ret = install_valid_mode(desc, ar, false, true);
+
+ if (desc->contents <= 1 && desc->seg_32bit &&
+ !desc->seg_not_present) {
+ /* Should work in the GDT, too. */
+ install_valid_mode(desc, ar, false, false);
+ }
+
+ return ret;
+}
+
+static void install_invalid(const struct user_desc *desc, bool oldmode)
+{
+ int ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11,
+ desc, sizeof(*desc));
+ if (ret < -1)
+ errno = -ret;
+ if (ret == 0) {
+ check_invalid_segment(desc->entry_number, 1);
+ } else if (errno == ENOSYS) {
+ printf("[OK]\tmodify_ldt returned -ENOSYS\n");
+ } else {
+ if (desc->seg_32bit) {
+ printf("[FAIL]\tUnexpected modify_ldt failure %d\n",
+ errno);
+ nerrs++;
+ } else {
+ printf("[OK]\tmodify_ldt rejected 16 bit segment\n");
+ }
+ }
+}
+
+static int safe_modify_ldt(int func, struct user_desc *ptr,
+ unsigned long bytecount)
+{
+ int ret = syscall(SYS_modify_ldt, 0x11, ptr, bytecount);
+ if (ret < -1)
+ errno = -ret;
+ return ret;
+}
+
+static void fail_install(struct user_desc *desc)
+{
+ if (safe_modify_ldt(0x11, desc, sizeof(*desc)) == 0) {
+ printf("[FAIL]\tmodify_ldt accepted a bad descriptor\n");
+ nerrs++;
+ } else if (errno == ENOSYS) {
+ printf("[OK]\tmodify_ldt returned -ENOSYS\n");
+ } else {
+ printf("[OK]\tmodify_ldt failure %d\n", errno);
+ }
+}
+
+static void do_simple_tests(void)
+{
+ struct user_desc desc = {
+ .entry_number = 0,
+ .base_addr = 0,
+ .limit = 10,
+ .seg_32bit = 1,
+ .contents = 2, /* Code, not conforming */
+ .read_exec_only = 0,
+ .limit_in_pages = 0,
+ .seg_not_present = 0,
+ .useable = 0
+ };
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB);
+
+ desc.limit_in_pages = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+ AR_S | AR_P | AR_DB | AR_G);
+
+ check_invalid_segment(1, 1);
+
+ desc.entry_number = 2;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+ AR_S | AR_P | AR_DB | AR_G);
+
+ check_invalid_segment(1, 1);
+
+ desc.base_addr = 0xf0000000;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+ AR_S | AR_P | AR_DB | AR_G);
+
+ desc.useable = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+ AR_S | AR_P | AR_DB | AR_G | AR_AVL);
+
+ desc.seg_not_present = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+ AR_S | AR_DB | AR_G | AR_AVL);
+
+ desc.seg_32bit = 0;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+ AR_S | AR_G | AR_AVL);
+
+ desc.seg_32bit = 1;
+ desc.contents = 0;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA |
+ AR_S | AR_DB | AR_G | AR_AVL);
+
+ desc.read_exec_only = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA |
+ AR_S | AR_DB | AR_G | AR_AVL);
+
+ desc.contents = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA_EXPDOWN |
+ AR_S | AR_DB | AR_G | AR_AVL);
+
+ desc.read_exec_only = 0;
+ desc.limit_in_pages = 0;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA_EXPDOWN |
+ AR_S | AR_DB | AR_AVL);
+
+ desc.contents = 3;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE_CONF |
+ AR_S | AR_DB | AR_AVL);
+
+ desc.read_exec_only = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE_CONF |
+ AR_S | AR_DB | AR_AVL);
+
+ desc.read_exec_only = 0;
+ desc.contents = 2;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+ AR_S | AR_DB | AR_AVL);
+
+ desc.read_exec_only = 1;
+
+#ifdef __x86_64__
+ desc.lm = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE |
+ AR_S | AR_DB | AR_AVL);
+ desc.lm = 0;
+#endif
+
+ bool entry1_okay = install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE |
+ AR_S | AR_DB | AR_AVL);
+
+ if (entry1_okay) {
+ printf("[RUN]\tTest fork\n");
+ pid_t child = fork();
+ if (child == 0) {
+ nerrs = 0;
+ check_valid_segment(desc.entry_number, 1,
+ AR_DPL3 | AR_TYPE_XOCODE |
+ AR_S | AR_DB | AR_AVL, desc.limit,
+ true);
+ check_invalid_segment(1, 1);
+ exit(nerrs ? 1 : 0);
+ } else {
+ int status;
+ if (waitpid(child, &status, 0) != child ||
+ !WIFEXITED(status)) {
+ printf("[FAIL]\tChild died\n");
+ nerrs++;
+ } else if (WEXITSTATUS(status) != 0) {
+ printf("[FAIL]\tChild failed\n");
+ nerrs++;
+ } else {
+ printf("[OK]\tChild succeeded\n");
+ }
+ }
+
+ printf("[RUN]\tTest size\n");
+ int i;
+ for (i = 0; i < 8192; i++) {
+ desc.entry_number = i;
+ desc.limit = i;
+ if (safe_modify_ldt(0x11, &desc, sizeof(desc)) != 0) {
+ printf("[FAIL]\tFailed to install entry %d\n", i);
+ nerrs++;
+ break;
+ }
+ }
+ for (int j = 0; j < i; j++) {
+ check_valid_segment(j, 1, AR_DPL3 | AR_TYPE_XOCODE |
+ AR_S | AR_DB | AR_AVL, j, false);
+ }
+ printf("[DONE]\tSize test\n");
+ } else {
+ printf("[SKIP]\tSkipping fork and size tests because we have no LDT\n");
+ }
+
+ /* Test entry_number too high. */
+ desc.entry_number = 8192;
+ fail_install(&desc);
+
+ /* Test deletion and actions mistakeable for deletion. */
+ memset(&desc, 0, sizeof(desc));
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P);
+
+ desc.seg_not_present = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S);
+
+ desc.seg_not_present = 0;
+ desc.read_exec_only = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S | AR_P);
+
+ desc.read_exec_only = 0;
+ desc.seg_not_present = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S);
+
+ desc.read_exec_only = 1;
+ desc.limit = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S);
+
+ desc.limit = 0;
+ desc.base_addr = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S);
+
+ desc.base_addr = 0;
+ install_invalid(&desc, false);
+
+ desc.seg_not_present = 0;
+ desc.seg_32bit = 1;
+ desc.read_exec_only = 0;
+ desc.limit = 0xfffff;
+
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P | AR_DB);
+
+ desc.limit_in_pages = 1;
+
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P | AR_DB | AR_G);
+ desc.read_exec_only = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S | AR_P | AR_DB | AR_G);
+ desc.contents = 1;
+ desc.read_exec_only = 0;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA_EXPDOWN | AR_S | AR_P | AR_DB | AR_G);
+ desc.read_exec_only = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA_EXPDOWN | AR_S | AR_P | AR_DB | AR_G);
+
+ desc.limit = 0;
+ install_invalid(&desc, true);
+}
+
+/*
+ * 0: thread is idle
+ * 1: thread armed
+ * 2: thread should clear LDT entry 0
+ * 3: thread should exit
+ */
+static volatile unsigned int ftx;
+
+static void *threadproc(void *ctx)
+{
+ cpu_set_t cpuset;
+ CPU_ZERO(&cpuset);
+ CPU_SET(1, &cpuset);
+ if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
+ err(1, "sched_setaffinity to CPU 1"); /* should never fail */
+
+ while (1) {
+ syscall(SYS_futex, &ftx, FUTEX_WAIT, 0, NULL, NULL, 0);
+ while (ftx != 2) {
+ if (ftx >= 3)
+ return NULL;
+ }
+
+ /* clear LDT entry 0 */
+ const struct user_desc desc = {};
+ if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) != 0)
+ err(1, "modify_ldt");
+
+ /* If ftx == 2, set it to zero. If ftx == 100, quit. */
+ unsigned int x = -2;
+ asm volatile ("lock xaddl %[x], %[ftx]" :
+ [x] "+r" (x), [ftx] "+m" (ftx));
+ if (x != 2)
+ return NULL;
+ }
+}
+
+#ifdef __i386__
+
+#ifndef SA_RESTORE
+#define SA_RESTORER 0x04000000
+#endif
+
+/*
+ * The UAPI header calls this 'struct sigaction', which conflicts with
+ * glibc. Sigh.
+ */
+struct fake_ksigaction {
+ void *handler; /* the real type is nasty */
+ unsigned long sa_flags;
+ void (*sa_restorer)(void);
+ unsigned char sigset[8];
+};
+
+static void fix_sa_restorer(int sig)
+{
+ struct fake_ksigaction ksa;
+
+ if (syscall(SYS_rt_sigaction, sig, NULL, &ksa, 8) == 0) {
+ /*
+ * glibc has a nasty bug: it sometimes writes garbage to
+ * sa_restorer. This interacts quite badly with anything
+ * that fiddles with SS because it can trigger legacy
+ * stack switching. Patch it up. See:
+ *
+ * https://sourceware.org/bugzilla/show_bug.cgi?id=21269
+ */
+ if (!(ksa.sa_flags & SA_RESTORER) && ksa.sa_restorer) {
+ ksa.sa_restorer = NULL;
+ if (syscall(SYS_rt_sigaction, sig, &ksa, NULL,
+ sizeof(ksa.sigset)) != 0)
+ err(1, "rt_sigaction");
+ }
+ }
+}
+#else
+static void fix_sa_restorer(int sig)
+{
+ /* 64-bit glibc works fine. */
+}
+#endif
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+ int flags)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = handler;
+ sa.sa_flags = SA_SIGINFO | flags;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+
+ fix_sa_restorer(sig);
+}
+
+static jmp_buf jmpbuf;
+
+static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
+{
+ siglongjmp(jmpbuf, 1);
+}
+
+static void do_multicpu_tests(void)
+{
+ cpu_set_t cpuset;
+ pthread_t thread;
+ int failures = 0, iters = 5, i;
+ unsigned short orig_ss;
+
+ CPU_ZERO(&cpuset);
+ CPU_SET(1, &cpuset);
+ if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) {
+ printf("[SKIP]\tCannot set affinity to CPU 1\n");
+ return;
+ }
+
+ CPU_ZERO(&cpuset);
+ CPU_SET(0, &cpuset);
+ if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) {
+ printf("[SKIP]\tCannot set affinity to CPU 0\n");
+ return;
+ }
+
+ sethandler(SIGSEGV, sigsegv, 0);
+#ifdef __i386__
+ /* True 32-bit kernels send SIGILL instead of SIGSEGV on IRET faults. */
+ sethandler(SIGILL, sigsegv, 0);
+#endif
+
+ printf("[RUN]\tCross-CPU LDT invalidation\n");
+
+ if (pthread_create(&thread, 0, threadproc, 0) != 0)
+ err(1, "pthread_create");
+
+ asm volatile ("mov %%ss, %0" : "=rm" (orig_ss));
+
+ for (i = 0; i < 5; i++) {
+ if (sigsetjmp(jmpbuf, 1) != 0)
+ continue;
+
+ /* Make sure the thread is ready after the last test. */
+ while (ftx != 0)
+ ;
+
+ struct user_desc desc = {
+ .entry_number = 0,
+ .base_addr = 0,
+ .limit = 0xfffff,
+ .seg_32bit = 1,
+ .contents = 0, /* Data */
+ .read_exec_only = 0,
+ .limit_in_pages = 1,
+ .seg_not_present = 0,
+ .useable = 0
+ };
+
+ if (safe_modify_ldt(0x11, &desc, sizeof(desc)) != 0) {
+ if (errno != ENOSYS)
+ err(1, "modify_ldt");
+ printf("[SKIP]\tmodify_ldt unavailable\n");
+ break;
+ }
+
+ /* Arm the thread. */
+ ftx = 1;
+ syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+
+ asm volatile ("mov %0, %%ss" : : "r" (0x7));
+
+ /* Go! */
+ ftx = 2;
+
+ while (ftx != 0)
+ ;
+
+ /*
+ * On success, modify_ldt will segfault us synchronously,
+ * and we'll escape via siglongjmp.
+ */
+
+ failures++;
+ asm volatile ("mov %0, %%ss" : : "rm" (orig_ss));
+ };
+
+ ftx = 100; /* Kill the thread. */
+ syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+
+ if (pthread_join(thread, NULL) != 0)
+ err(1, "pthread_join");
+
+ if (failures) {
+ printf("[FAIL]\t%d of %d iterations failed\n", failures, iters);
+ nerrs++;
+ } else {
+ printf("[OK]\tAll %d iterations succeeded\n", iters);
+ }
+}
+
+static int finish_exec_test(void)
+{
+ /*
+ * Older kernel versions did inherit the LDT on exec() which is
+ * wrong because exec() starts from a clean state.
+ */
+ check_invalid_segment(0, 1);
+
+ return nerrs ? 1 : 0;
+}
+
+static void do_exec_test(void)
+{
+ printf("[RUN]\tTest exec\n");
+
+ struct user_desc desc = {
+ .entry_number = 0,
+ .base_addr = 0,
+ .limit = 42,
+ .seg_32bit = 1,
+ .contents = 2, /* Code, not conforming */
+ .read_exec_only = 0,
+ .limit_in_pages = 0,
+ .seg_not_present = 0,
+ .useable = 0
+ };
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB);
+
+ pid_t child = fork();
+ if (child == 0) {
+ execl("/proc/self/exe", "ldt_gdt_test_exec", NULL);
+ printf("[FAIL]\tCould not exec self\n");
+ exit(1); /* exec failed */
+ } else {
+ int status;
+ if (waitpid(child, &status, 0) != child ||
+ !WIFEXITED(status)) {
+ printf("[FAIL]\tChild died\n");
+ nerrs++;
+ } else if (WEXITSTATUS(status) != 0) {
+ printf("[FAIL]\tChild failed\n");
+ nerrs++;
+ } else {
+ printf("[OK]\tChild succeeded\n");
+ }
+ }
+}
+
+static void setup_counter_page(void)
+{
+ unsigned int *page = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE | MAP_32BIT, -1, 0);
+ if (page == MAP_FAILED)
+ err(1, "mmap");
+
+ for (int i = 0; i < 1024; i++)
+ page[i] = i;
+ counter_page = page;
+}
+
+static int invoke_set_thread_area(void)
+{
+ int ret;
+ asm volatile ("int $0x80"
+ : "=a" (ret), "+m" (low_user_desc) :
+ "a" (243), "b" (low_user_desc)
+ : INT80_CLOBBERS);
+ return ret;
+}
+
+static void setup_low_user_desc(void)
+{
+ low_user_desc = mmap(NULL, 2 * sizeof(struct user_desc),
+ PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE | MAP_32BIT, -1, 0);
+ if (low_user_desc == MAP_FAILED)
+ err(1, "mmap");
+
+ low_user_desc->entry_number = -1;
+ low_user_desc->base_addr = (unsigned long)&counter_page[1];
+ low_user_desc->limit = 0xfffff;
+ low_user_desc->seg_32bit = 1;
+ low_user_desc->contents = 0; /* Data, grow-up*/
+ low_user_desc->read_exec_only = 0;
+ low_user_desc->limit_in_pages = 1;
+ low_user_desc->seg_not_present = 0;
+ low_user_desc->useable = 0;
+
+ if (invoke_set_thread_area() == 0) {
+ gdt_entry_num = low_user_desc->entry_number;
+ printf("[NOTE]\tset_thread_area is available; will use GDT index %d\n", gdt_entry_num);
+ } else {
+ printf("[NOTE]\tset_thread_area is unavailable\n");
+ }
+
+ low_user_desc_clear = low_user_desc + 1;
+ low_user_desc_clear->entry_number = gdt_entry_num;
+ low_user_desc_clear->read_exec_only = 1;
+ low_user_desc_clear->seg_not_present = 1;
+}
+
+static void test_gdt_invalidation(void)
+{
+ if (!gdt_entry_num)
+ return; /* 64-bit only system -- we can't use set_thread_area */
+
+ unsigned short prev_sel;
+ unsigned short sel;
+ unsigned int eax;
+ const char *result;
+#ifdef __x86_64__
+ unsigned long saved_base;
+ unsigned long new_base;
+#endif
+
+ /* Test DS */
+ invoke_set_thread_area();
+ eax = 243;
+ sel = (gdt_entry_num << 3) | 3;
+ asm volatile ("movw %%ds, %[prev_sel]\n\t"
+ "movw %[sel], %%ds\n\t"
+#ifdef __i386__
+ "pushl %%ebx\n\t"
+#endif
+ "movl %[arg1], %%ebx\n\t"
+ "int $0x80\n\t" /* Should invalidate ds */
+#ifdef __i386__
+ "popl %%ebx\n\t"
+#endif
+ "movw %%ds, %[sel]\n\t"
+ "movw %[prev_sel], %%ds"
+ : [prev_sel] "=&r" (prev_sel), [sel] "+r" (sel),
+ "+a" (eax)
+ : "m" (low_user_desc_clear),
+ [arg1] "r" ((unsigned int)(unsigned long)low_user_desc_clear)
+ : INT80_CLOBBERS);
+
+ if (sel != 0) {
+ result = "FAIL";
+ nerrs++;
+ } else {
+ result = "OK";
+ }
+ printf("[%s]\tInvalidate DS with set_thread_area: new DS = 0x%hx\n",
+ result, sel);
+
+ /* Test ES */
+ invoke_set_thread_area();
+ eax = 243;
+ sel = (gdt_entry_num << 3) | 3;
+ asm volatile ("movw %%es, %[prev_sel]\n\t"
+ "movw %[sel], %%es\n\t"
+#ifdef __i386__
+ "pushl %%ebx\n\t"
+#endif
+ "movl %[arg1], %%ebx\n\t"
+ "int $0x80\n\t" /* Should invalidate es */
+#ifdef __i386__
+ "popl %%ebx\n\t"
+#endif
+ "movw %%es, %[sel]\n\t"
+ "movw %[prev_sel], %%es"
+ : [prev_sel] "=&r" (prev_sel), [sel] "+r" (sel),
+ "+a" (eax)
+ : "m" (low_user_desc_clear),
+ [arg1] "r" ((unsigned int)(unsigned long)low_user_desc_clear)
+ : INT80_CLOBBERS);
+
+ if (sel != 0) {
+ result = "FAIL";
+ nerrs++;
+ } else {
+ result = "OK";
+ }
+ printf("[%s]\tInvalidate ES with set_thread_area: new ES = 0x%hx\n",
+ result, sel);
+
+ /* Test FS */
+ invoke_set_thread_area();
+ eax = 243;
+ sel = (gdt_entry_num << 3) | 3;
+#ifdef __x86_64__
+ syscall(SYS_arch_prctl, ARCH_GET_FS, &saved_base);
+#endif
+ asm volatile ("movw %%fs, %[prev_sel]\n\t"
+ "movw %[sel], %%fs\n\t"
+#ifdef __i386__
+ "pushl %%ebx\n\t"
+#endif
+ "movl %[arg1], %%ebx\n\t"
+ "int $0x80\n\t" /* Should invalidate fs */
+#ifdef __i386__
+ "popl %%ebx\n\t"
+#endif
+ "movw %%fs, %[sel]\n\t"
+ : [prev_sel] "=&r" (prev_sel), [sel] "+r" (sel),
+ "+a" (eax)
+ : "m" (low_user_desc_clear),
+ [arg1] "r" ((unsigned int)(unsigned long)low_user_desc_clear)
+ : INT80_CLOBBERS);
+
+#ifdef __x86_64__
+ syscall(SYS_arch_prctl, ARCH_GET_FS, &new_base);
+#endif
+
+ /* Restore FS/BASE for glibc */
+ asm volatile ("movw %[prev_sel], %%fs" : : [prev_sel] "rm" (prev_sel));
+#ifdef __x86_64__
+ if (saved_base)
+ syscall(SYS_arch_prctl, ARCH_SET_FS, saved_base);
+#endif
+
+ if (sel != 0) {
+ result = "FAIL";
+ nerrs++;
+ } else {
+ result = "OK";
+ }
+ printf("[%s]\tInvalidate FS with set_thread_area: new FS = 0x%hx\n",
+ result, sel);
+
+#ifdef __x86_64__
+ if (sel == 0 && new_base != 0) {
+ nerrs++;
+ printf("[FAIL]\tNew FSBASE was 0x%lx\n", new_base);
+ } else {
+ printf("[OK]\tNew FSBASE was zero\n");
+ }
+#endif
+
+ /* Test GS */
+ invoke_set_thread_area();
+ eax = 243;
+ sel = (gdt_entry_num << 3) | 3;
+#ifdef __x86_64__
+ syscall(SYS_arch_prctl, ARCH_GET_GS, &saved_base);
+#endif
+ asm volatile ("movw %%gs, %[prev_sel]\n\t"
+ "movw %[sel], %%gs\n\t"
+#ifdef __i386__
+ "pushl %%ebx\n\t"
+#endif
+ "movl %[arg1], %%ebx\n\t"
+ "int $0x80\n\t" /* Should invalidate gs */
+#ifdef __i386__
+ "popl %%ebx\n\t"
+#endif
+ "movw %%gs, %[sel]\n\t"
+ : [prev_sel] "=&r" (prev_sel), [sel] "+r" (sel),
+ "+a" (eax)
+ : "m" (low_user_desc_clear),
+ [arg1] "r" ((unsigned int)(unsigned long)low_user_desc_clear)
+ : INT80_CLOBBERS);
+
+#ifdef __x86_64__
+ syscall(SYS_arch_prctl, ARCH_GET_GS, &new_base);
+#endif
+
+ /* Restore GS/BASE for glibc */
+ asm volatile ("movw %[prev_sel], %%gs" : : [prev_sel] "rm" (prev_sel));
+#ifdef __x86_64__
+ if (saved_base)
+ syscall(SYS_arch_prctl, ARCH_SET_GS, saved_base);
+#endif
+
+ if (sel != 0) {
+ result = "FAIL";
+ nerrs++;
+ } else {
+ result = "OK";
+ }
+ printf("[%s]\tInvalidate GS with set_thread_area: new GS = 0x%hx\n",
+ result, sel);
+
+#ifdef __x86_64__
+ if (sel == 0 && new_base != 0) {
+ nerrs++;
+ printf("[FAIL]\tNew GSBASE was 0x%lx\n", new_base);
+ } else {
+ printf("[OK]\tNew GSBASE was zero\n");
+ }
+#endif
+}
+
+int main(int argc, char **argv)
+{
+ if (argc == 1 && !strcmp(argv[0], "ldt_gdt_test_exec"))
+ return finish_exec_test();
+
+ setup_counter_page();
+ setup_low_user_desc();
+
+ do_simple_tests();
+
+ do_multicpu_tests();
+
+ do_exec_test();
+
+ test_gdt_invalidation();
+
+ return nerrs ? 1 : 0;
+}
diff --git a/tools/testing/selftests/x86/mov_ss_trap.c b/tools/testing/selftests/x86/mov_ss_trap.c
new file mode 100644
index 000000000..3c3a02265
--- /dev/null
+++ b/tools/testing/selftests/x86/mov_ss_trap.c
@@ -0,0 +1,285 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * mov_ss_trap.c: Exercise the bizarre side effects of a watchpoint on MOV SS
+ *
+ * This does MOV SS from a watchpointed address followed by various
+ * types of kernel entries. A MOV SS that hits a watchpoint will queue
+ * up a #DB trap but will not actually deliver that trap. The trap
+ * will be delivered after the next instruction instead. The CPU's logic
+ * seems to be:
+ *
+ * - Any fault: drop the pending #DB trap.
+ * - INT $N, INT3, INTO, SYSCALL, SYSENTER: enter the kernel and then
+ * deliver #DB.
+ * - ICEBP: enter the kernel but do not deliver the watchpoint trap
+ * - breakpoint: only one #DB is delivered (phew!)
+ *
+ * There are plenty of ways for a kernel to handle this incorrectly. This
+ * test tries to exercise all the cases.
+ *
+ * This should mostly cover CVE-2018-1087 and CVE-2018-8897.
+ */
+#define _GNU_SOURCE
+
+#include <stdlib.h>
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/user.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <err.h>
+#include <string.h>
+#include <setjmp.h>
+#include <sys/prctl.h>
+
+#define X86_EFLAGS_RF (1UL << 16)
+
+#if __x86_64__
+# define REG_IP REG_RIP
+#else
+# define REG_IP REG_EIP
+#endif
+
+unsigned short ss;
+extern unsigned char breakpoint_insn[];
+sigjmp_buf jmpbuf;
+static unsigned char altstack_data[SIGSTKSZ];
+
+static void enable_watchpoint(void)
+{
+ pid_t parent = getpid();
+ int status;
+
+ pid_t child = fork();
+ if (child < 0)
+ err(1, "fork");
+
+ if (child) {
+ if (waitpid(child, &status, 0) != child)
+ err(1, "waitpid for child");
+ } else {
+ unsigned long dr0, dr1, dr7;
+
+ dr0 = (unsigned long)&ss;
+ dr1 = (unsigned long)breakpoint_insn;
+ dr7 = ((1UL << 1) | /* G0 */
+ (3UL << 16) | /* RW0 = read or write */
+ (1UL << 18) | /* LEN0 = 2 bytes */
+ (1UL << 3)); /* G1, RW1 = insn */
+
+ if (ptrace(PTRACE_ATTACH, parent, NULL, NULL) != 0)
+ err(1, "PTRACE_ATTACH");
+
+ if (waitpid(parent, &status, 0) != parent)
+ err(1, "waitpid for child");
+
+ if (ptrace(PTRACE_POKEUSER, parent, (void *)offsetof(struct user, u_debugreg[0]), dr0) != 0)
+ err(1, "PTRACE_POKEUSER DR0");
+
+ if (ptrace(PTRACE_POKEUSER, parent, (void *)offsetof(struct user, u_debugreg[1]), dr1) != 0)
+ err(1, "PTRACE_POKEUSER DR1");
+
+ if (ptrace(PTRACE_POKEUSER, parent, (void *)offsetof(struct user, u_debugreg[7]), dr7) != 0)
+ err(1, "PTRACE_POKEUSER DR7");
+
+ printf("\tDR0 = %lx, DR1 = %lx, DR7 = %lx\n", dr0, dr1, dr7);
+
+ if (ptrace(PTRACE_DETACH, parent, NULL, NULL) != 0)
+ err(1, "PTRACE_DETACH");
+
+ exit(0);
+ }
+}
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+ int flags)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = handler;
+ sa.sa_flags = SA_SIGINFO | flags;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+}
+
+static char const * const signames[] = {
+ [SIGSEGV] = "SIGSEGV",
+ [SIGBUS] = "SIBGUS",
+ [SIGTRAP] = "SIGTRAP",
+ [SIGILL] = "SIGILL",
+};
+
+static void sigtrap(int sig, siginfo_t *si, void *ctx_void)
+{
+ ucontext_t *ctx = ctx_void;
+
+ printf("\tGot SIGTRAP with RIP=%lx, EFLAGS.RF=%d\n",
+ (unsigned long)ctx->uc_mcontext.gregs[REG_IP],
+ !!(ctx->uc_mcontext.gregs[REG_EFL] & X86_EFLAGS_RF));
+}
+
+static void handle_and_return(int sig, siginfo_t *si, void *ctx_void)
+{
+ ucontext_t *ctx = ctx_void;
+
+ printf("\tGot %s with RIP=%lx\n", signames[sig],
+ (unsigned long)ctx->uc_mcontext.gregs[REG_IP]);
+}
+
+static void handle_and_longjmp(int sig, siginfo_t *si, void *ctx_void)
+{
+ ucontext_t *ctx = ctx_void;
+
+ printf("\tGot %s with RIP=%lx\n", signames[sig],
+ (unsigned long)ctx->uc_mcontext.gregs[REG_IP]);
+
+ siglongjmp(jmpbuf, 1);
+}
+
+int main()
+{
+ unsigned long nr;
+
+ asm volatile ("mov %%ss, %[ss]" : [ss] "=m" (ss));
+ printf("\tSS = 0x%hx, &SS = 0x%p\n", ss, &ss);
+
+ if (prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0) == 0)
+ printf("\tPR_SET_PTRACER_ANY succeeded\n");
+
+ printf("\tSet up a watchpoint\n");
+ sethandler(SIGTRAP, sigtrap, 0);
+ enable_watchpoint();
+
+ printf("[RUN]\tRead from watched memory (should get SIGTRAP)\n");
+ asm volatile ("mov %[ss], %[tmp]" : [tmp] "=r" (nr) : [ss] "m" (ss));
+
+ printf("[RUN]\tMOV SS; INT3\n");
+ asm volatile ("mov %[ss], %%ss; int3" :: [ss] "m" (ss));
+
+ printf("[RUN]\tMOV SS; INT 3\n");
+ asm volatile ("mov %[ss], %%ss; .byte 0xcd, 0x3" :: [ss] "m" (ss));
+
+ printf("[RUN]\tMOV SS; CS CS INT3\n");
+ asm volatile ("mov %[ss], %%ss; .byte 0x2e, 0x2e; int3" :: [ss] "m" (ss));
+
+ printf("[RUN]\tMOV SS; CSx14 INT3\n");
+ asm volatile ("mov %[ss], %%ss; .fill 14,1,0x2e; int3" :: [ss] "m" (ss));
+
+ printf("[RUN]\tMOV SS; INT 4\n");
+ sethandler(SIGSEGV, handle_and_return, SA_RESETHAND);
+ asm volatile ("mov %[ss], %%ss; int $4" :: [ss] "m" (ss));
+
+#ifdef __i386__
+ printf("[RUN]\tMOV SS; INTO\n");
+ sethandler(SIGSEGV, handle_and_return, SA_RESETHAND);
+ nr = -1;
+ asm volatile ("add $1, %[tmp]; mov %[ss], %%ss; into"
+ : [tmp] "+r" (nr) : [ss] "m" (ss));
+#endif
+
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ printf("[RUN]\tMOV SS; ICEBP\n");
+
+ /* Some emulators (e.g. QEMU TCG) don't emulate ICEBP. */
+ sethandler(SIGILL, handle_and_longjmp, SA_RESETHAND);
+
+ asm volatile ("mov %[ss], %%ss; .byte 0xf1" :: [ss] "m" (ss));
+ }
+
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ printf("[RUN]\tMOV SS; CLI\n");
+ sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND);
+ asm volatile ("mov %[ss], %%ss; cli" :: [ss] "m" (ss));
+ }
+
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ printf("[RUN]\tMOV SS; #PF\n");
+ sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND);
+ asm volatile ("mov %[ss], %%ss; mov (-1), %[tmp]"
+ : [tmp] "=r" (nr) : [ss] "m" (ss));
+ }
+
+ /*
+ * INT $1: if #DB has DPL=3 and there isn't special handling,
+ * then the kernel will die.
+ */
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ printf("[RUN]\tMOV SS; INT 1\n");
+ sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND);
+ asm volatile ("mov %[ss], %%ss; int $1" :: [ss] "m" (ss));
+ }
+
+#ifdef __x86_64__
+ /*
+ * In principle, we should test 32-bit SYSCALL as well, but
+ * the calling convention is so unpredictable that it's
+ * not obviously worth the effort.
+ */
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ printf("[RUN]\tMOV SS; SYSCALL\n");
+ sethandler(SIGILL, handle_and_longjmp, SA_RESETHAND);
+ nr = SYS_getpid;
+ /*
+ * Toggle the high bit of RSP to make it noncanonical to
+ * strengthen this test on non-SMAP systems.
+ */
+ asm volatile ("btc $63, %%rsp\n\t"
+ "mov %[ss], %%ss; syscall\n\t"
+ "btc $63, %%rsp"
+ : "+a" (nr) : [ss] "m" (ss)
+ : "rcx"
+#ifdef __x86_64__
+ , "r11"
+#endif
+ );
+ }
+#endif
+
+ printf("[RUN]\tMOV SS; breakpointed NOP\n");
+ asm volatile ("mov %[ss], %%ss; breakpoint_insn: nop" :: [ss] "m" (ss));
+
+ /*
+ * Invoking SYSENTER directly breaks all the rules. Just handle
+ * the SIGSEGV.
+ */
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ printf("[RUN]\tMOV SS; SYSENTER\n");
+ stack_t stack = {
+ .ss_sp = altstack_data,
+ .ss_size = SIGSTKSZ,
+ };
+ if (sigaltstack(&stack, NULL) != 0)
+ err(1, "sigaltstack");
+ sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND | SA_ONSTACK);
+ nr = SYS_getpid;
+ asm volatile ("mov %[ss], %%ss; SYSENTER" : "+a" (nr)
+ : [ss] "m" (ss) : "flags", "rcx"
+#ifdef __x86_64__
+ , "r11"
+#endif
+ );
+
+ /* We're unreachable here. SYSENTER forgets RIP. */
+ }
+
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ printf("[RUN]\tMOV SS; INT $0x80\n");
+ sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND);
+ nr = 20; /* compat getpid */
+ asm volatile ("mov %[ss], %%ss; int $0x80"
+ : "+a" (nr) : [ss] "m" (ss)
+ : "flags"
+#ifdef __x86_64__
+ , "r8", "r9", "r10", "r11"
+#endif
+ );
+ }
+
+ printf("[OK]\tI aten't dead\n");
+ return 0;
+}
diff --git a/tools/testing/selftests/x86/mpx-debug.h b/tools/testing/selftests/x86/mpx-debug.h
new file mode 100644
index 000000000..7546eba7f
--- /dev/null
+++ b/tools/testing/selftests/x86/mpx-debug.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MPX_DEBUG_H
+#define _MPX_DEBUG_H
+
+#ifndef DEBUG_LEVEL
+#define DEBUG_LEVEL 0
+#endif
+#define dprintf_level(level, args...) do { if(level <= DEBUG_LEVEL) printf(args); } while(0)
+#define dprintf1(args...) dprintf_level(1, args)
+#define dprintf2(args...) dprintf_level(2, args)
+#define dprintf3(args...) dprintf_level(3, args)
+#define dprintf4(args...) dprintf_level(4, args)
+#define dprintf5(args...) dprintf_level(5, args)
+
+#endif /* _MPX_DEBUG_H */
diff --git a/tools/testing/selftests/x86/mpx-dig.c b/tools/testing/selftests/x86/mpx-dig.c
new file mode 100644
index 000000000..c13607ef5
--- /dev/null
+++ b/tools/testing/selftests/x86/mpx-dig.c
@@ -0,0 +1,499 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Written by Dave Hansen <dave.hansen@intel.com>
+ */
+
+#include <stdlib.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <string.h>
+#include <fcntl.h>
+#include "mpx-debug.h"
+#include "mpx-mm.h"
+#include "mpx-hw.h"
+
+unsigned long bounds_dir_global;
+
+#define mpx_dig_abort() __mpx_dig_abort(__FILE__, __func__, __LINE__)
+static void inline __mpx_dig_abort(const char *file, const char *func, int line)
+{
+ fprintf(stderr, "MPX dig abort @ %s::%d in %s()\n", file, line, func);
+ printf("MPX dig abort @ %s::%d in %s()\n", file, line, func);
+ abort();
+}
+
+/*
+ * run like this (BDIR finds the probably bounds directory):
+ *
+ * BDIR="$(cat /proc/$pid/smaps | grep -B1 2097152 \
+ * | head -1 | awk -F- '{print $1}')";
+ * ./mpx-dig $pid 0x$BDIR
+ *
+ * NOTE:
+ * assumes that the only 2097152-kb VMA is the bounds dir
+ */
+
+long nr_incore(void *ptr, unsigned long size_bytes)
+{
+ int i;
+ long ret = 0;
+ long vec_len = size_bytes / PAGE_SIZE;
+ unsigned char *vec = malloc(vec_len);
+ int incore_ret;
+
+ if (!vec)
+ mpx_dig_abort();
+
+ incore_ret = mincore(ptr, size_bytes, vec);
+ if (incore_ret) {
+ printf("mincore ret: %d\n", incore_ret);
+ perror("mincore");
+ mpx_dig_abort();
+ }
+ for (i = 0; i < vec_len; i++)
+ ret += vec[i];
+ free(vec);
+ return ret;
+}
+
+int open_proc(int pid, char *file)
+{
+ static char buf[100];
+ int fd;
+
+ snprintf(&buf[0], sizeof(buf), "/proc/%d/%s", pid, file);
+ fd = open(&buf[0], O_RDONLY);
+ if (fd < 0)
+ perror(buf);
+
+ return fd;
+}
+
+struct vaddr_range {
+ unsigned long start;
+ unsigned long end;
+};
+struct vaddr_range *ranges;
+int nr_ranges_allocated;
+int nr_ranges_populated;
+int last_range = -1;
+
+int __pid_load_vaddrs(int pid)
+{
+ int ret = 0;
+ int proc_maps_fd = open_proc(pid, "maps");
+ char linebuf[10000];
+ unsigned long start;
+ unsigned long end;
+ char rest[1000];
+ FILE *f = fdopen(proc_maps_fd, "r");
+
+ if (!f)
+ mpx_dig_abort();
+ nr_ranges_populated = 0;
+ while (!feof(f)) {
+ char *readret = fgets(linebuf, sizeof(linebuf), f);
+ int parsed;
+
+ if (readret == NULL) {
+ if (feof(f))
+ break;
+ mpx_dig_abort();
+ }
+
+ parsed = sscanf(linebuf, "%lx-%lx%s", &start, &end, rest);
+ if (parsed != 3)
+ mpx_dig_abort();
+
+ dprintf4("result[%d]: %lx-%lx<->%s\n", parsed, start, end, rest);
+ if (nr_ranges_populated >= nr_ranges_allocated) {
+ ret = -E2BIG;
+ break;
+ }
+ ranges[nr_ranges_populated].start = start;
+ ranges[nr_ranges_populated].end = end;
+ nr_ranges_populated++;
+ }
+ last_range = -1;
+ fclose(f);
+ close(proc_maps_fd);
+ return ret;
+}
+
+int pid_load_vaddrs(int pid)
+{
+ int ret;
+
+ dprintf2("%s(%d)\n", __func__, pid);
+ if (!ranges) {
+ nr_ranges_allocated = 4;
+ ranges = malloc(nr_ranges_allocated * sizeof(ranges[0]));
+ dprintf2("%s(%d) allocated %d ranges @ %p\n", __func__, pid,
+ nr_ranges_allocated, ranges);
+ assert(ranges != NULL);
+ }
+ do {
+ ret = __pid_load_vaddrs(pid);
+ if (!ret)
+ break;
+ if (ret == -E2BIG) {
+ dprintf2("%s(%d) need to realloc\n", __func__, pid);
+ nr_ranges_allocated *= 2;
+ ranges = realloc(ranges,
+ nr_ranges_allocated * sizeof(ranges[0]));
+ dprintf2("%s(%d) allocated %d ranges @ %p\n", __func__,
+ pid, nr_ranges_allocated, ranges);
+ assert(ranges != NULL);
+ dprintf1("reallocating to hold %d ranges\n", nr_ranges_allocated);
+ }
+ } while (1);
+
+ dprintf2("%s(%d) done\n", __func__, pid);
+
+ return ret;
+}
+
+static inline int vaddr_in_range(unsigned long vaddr, struct vaddr_range *r)
+{
+ if (vaddr < r->start)
+ return 0;
+ if (vaddr >= r->end)
+ return 0;
+ return 1;
+}
+
+static inline int vaddr_mapped_by_range(unsigned long vaddr)
+{
+ int i;
+
+ if (last_range > 0 && vaddr_in_range(vaddr, &ranges[last_range]))
+ return 1;
+
+ for (i = 0; i < nr_ranges_populated; i++) {
+ struct vaddr_range *r = &ranges[i];
+
+ if (vaddr_in_range(vaddr, r))
+ continue;
+ last_range = i;
+ return 1;
+ }
+ return 0;
+}
+
+const int bt_entry_size_bytes = sizeof(unsigned long) * 4;
+
+void *read_bounds_table_into_buf(unsigned long table_vaddr)
+{
+#ifdef MPX_DIG_STANDALONE
+ static char bt_buf[MPX_BOUNDS_TABLE_SIZE_BYTES];
+ off_t seek_ret = lseek(fd, table_vaddr, SEEK_SET);
+ if (seek_ret != table_vaddr)
+ mpx_dig_abort();
+
+ int read_ret = read(fd, &bt_buf, sizeof(bt_buf));
+ if (read_ret != sizeof(bt_buf))
+ mpx_dig_abort();
+ return &bt_buf;
+#else
+ return (void *)table_vaddr;
+#endif
+}
+
+int dump_table(unsigned long table_vaddr, unsigned long base_controlled_vaddr,
+ unsigned long bde_vaddr)
+{
+ unsigned long offset_inside_bt;
+ int nr_entries = 0;
+ int do_abort = 0;
+ char *bt_buf;
+
+ dprintf3("%s() base_controlled_vaddr: 0x%012lx bde_vaddr: 0x%012lx\n",
+ __func__, base_controlled_vaddr, bde_vaddr);
+
+ bt_buf = read_bounds_table_into_buf(table_vaddr);
+
+ dprintf4("%s() read done\n", __func__);
+
+ for (offset_inside_bt = 0;
+ offset_inside_bt < MPX_BOUNDS_TABLE_SIZE_BYTES;
+ offset_inside_bt += bt_entry_size_bytes) {
+ unsigned long bt_entry_index;
+ unsigned long bt_entry_controls;
+ unsigned long this_bt_entry_for_vaddr;
+ unsigned long *bt_entry_buf;
+ int i;
+
+ dprintf4("%s() offset_inside_bt: 0x%lx of 0x%llx\n", __func__,
+ offset_inside_bt, MPX_BOUNDS_TABLE_SIZE_BYTES);
+ bt_entry_buf = (void *)&bt_buf[offset_inside_bt];
+ if (!bt_buf) {
+ printf("null bt_buf\n");
+ mpx_dig_abort();
+ }
+ if (!bt_entry_buf) {
+ printf("null bt_entry_buf\n");
+ mpx_dig_abort();
+ }
+ dprintf4("%s() reading *bt_entry_buf @ %p\n", __func__,
+ bt_entry_buf);
+ if (!bt_entry_buf[0] &&
+ !bt_entry_buf[1] &&
+ !bt_entry_buf[2] &&
+ !bt_entry_buf[3])
+ continue;
+
+ nr_entries++;
+
+ bt_entry_index = offset_inside_bt/bt_entry_size_bytes;
+ bt_entry_controls = sizeof(void *);
+ this_bt_entry_for_vaddr =
+ base_controlled_vaddr + bt_entry_index*bt_entry_controls;
+ /*
+ * We sign extend vaddr bits 48->63 which effectively
+ * creates a hole in the virtual address space.
+ * This calculation corrects for the hole.
+ */
+ if (this_bt_entry_for_vaddr > 0x00007fffffffffffUL)
+ this_bt_entry_for_vaddr |= 0xffff800000000000;
+
+ if (!vaddr_mapped_by_range(this_bt_entry_for_vaddr)) {
+ printf("bt_entry_buf: %p\n", bt_entry_buf);
+ printf("there is a bte for %lx but no mapping\n",
+ this_bt_entry_for_vaddr);
+ printf(" bde vaddr: %016lx\n", bde_vaddr);
+ printf("base_controlled_vaddr: %016lx\n", base_controlled_vaddr);
+ printf(" table_vaddr: %016lx\n", table_vaddr);
+ printf(" entry vaddr: %016lx @ offset %lx\n",
+ table_vaddr + offset_inside_bt, offset_inside_bt);
+ do_abort = 1;
+ mpx_dig_abort();
+ }
+ if (DEBUG_LEVEL < 4)
+ continue;
+
+ printf("table entry[%lx]: ", offset_inside_bt);
+ for (i = 0; i < bt_entry_size_bytes; i += sizeof(unsigned long))
+ printf("0x%016lx ", bt_entry_buf[i]);
+ printf("\n");
+ }
+ if (do_abort)
+ mpx_dig_abort();
+ dprintf4("%s() done\n", __func__);
+ return nr_entries;
+}
+
+int search_bd_buf(char *buf, int len_bytes, unsigned long bd_offset_bytes,
+ int *nr_populated_bdes)
+{
+ unsigned long i;
+ int total_entries = 0;
+
+ dprintf3("%s(%p, %x, %lx, ...) buf end: %p\n", __func__, buf,
+ len_bytes, bd_offset_bytes, buf + len_bytes);
+
+ for (i = 0; i < len_bytes; i += sizeof(unsigned long)) {
+ unsigned long bd_index = (bd_offset_bytes + i) / sizeof(unsigned long);
+ unsigned long *bounds_dir_entry_ptr = (unsigned long *)&buf[i];
+ unsigned long bounds_dir_entry;
+ unsigned long bd_for_vaddr;
+ unsigned long bt_start;
+ unsigned long bt_tail;
+ int nr_entries;
+
+ dprintf4("%s() loop i: %ld bounds_dir_entry_ptr: %p\n", __func__, i,
+ bounds_dir_entry_ptr);
+
+ bounds_dir_entry = *bounds_dir_entry_ptr;
+ if (!bounds_dir_entry) {
+ dprintf4("no bounds dir at index 0x%lx / 0x%lx "
+ "start at offset:%lx %lx\n", bd_index, bd_index,
+ bd_offset_bytes, i);
+ continue;
+ }
+ dprintf3("found bounds_dir_entry: 0x%lx @ "
+ "index 0x%lx buf ptr: %p\n", bounds_dir_entry, i,
+ &buf[i]);
+ /* mask off the enable bit: */
+ bounds_dir_entry &= ~0x1;
+ (*nr_populated_bdes)++;
+ dprintf4("nr_populated_bdes: %p\n", nr_populated_bdes);
+ dprintf4("*nr_populated_bdes: %d\n", *nr_populated_bdes);
+
+ bt_start = bounds_dir_entry;
+ bt_tail = bounds_dir_entry + MPX_BOUNDS_TABLE_SIZE_BYTES - 1;
+ if (!vaddr_mapped_by_range(bt_start)) {
+ printf("bounds directory 0x%lx points to nowhere\n",
+ bounds_dir_entry);
+ mpx_dig_abort();
+ }
+ if (!vaddr_mapped_by_range(bt_tail)) {
+ printf("bounds directory end 0x%lx points to nowhere\n",
+ bt_tail);
+ mpx_dig_abort();
+ }
+ /*
+ * Each bounds directory entry controls 1MB of virtual address
+ * space. This variable is the virtual address in the process
+ * of the beginning of the area controlled by this bounds_dir.
+ */
+ bd_for_vaddr = bd_index * (1UL<<20);
+
+ nr_entries = dump_table(bounds_dir_entry, bd_for_vaddr,
+ bounds_dir_global+bd_offset_bytes+i);
+ total_entries += nr_entries;
+ dprintf5("dir entry[%4ld @ %p]: 0x%lx %6d entries "
+ "total this buf: %7d bd_for_vaddrs: 0x%lx -> 0x%lx\n",
+ bd_index, buf+i,
+ bounds_dir_entry, nr_entries, total_entries,
+ bd_for_vaddr, bd_for_vaddr + (1UL<<20));
+ }
+ dprintf3("%s(%p, %x, %lx, ...) done\n", __func__, buf, len_bytes,
+ bd_offset_bytes);
+ return total_entries;
+}
+
+int proc_pid_mem_fd = -1;
+
+void *fill_bounds_dir_buf_other(long byte_offset_inside_bounds_dir,
+ long buffer_size_bytes, void *buffer)
+{
+ unsigned long seekto = bounds_dir_global + byte_offset_inside_bounds_dir;
+ int read_ret;
+ off_t seek_ret = lseek(proc_pid_mem_fd, seekto, SEEK_SET);
+
+ if (seek_ret != seekto)
+ mpx_dig_abort();
+
+ read_ret = read(proc_pid_mem_fd, buffer, buffer_size_bytes);
+ /* there shouldn't practically be short reads of /proc/$pid/mem */
+ if (read_ret != buffer_size_bytes)
+ mpx_dig_abort();
+
+ return buffer;
+}
+void *fill_bounds_dir_buf_self(long byte_offset_inside_bounds_dir,
+ long buffer_size_bytes, void *buffer)
+
+{
+ unsigned char vec[buffer_size_bytes / PAGE_SIZE];
+ char *dig_bounds_dir_ptr =
+ (void *)(bounds_dir_global + byte_offset_inside_bounds_dir);
+ /*
+ * use mincore() to quickly find the areas of the bounds directory
+ * that have memory and thus will be worth scanning.
+ */
+ int incore_ret;
+
+ int incore = 0;
+ int i;
+
+ dprintf4("%s() dig_bounds_dir_ptr: %p\n", __func__, dig_bounds_dir_ptr);
+
+ incore_ret = mincore(dig_bounds_dir_ptr, buffer_size_bytes, &vec[0]);
+ if (incore_ret) {
+ printf("mincore ret: %d\n", incore_ret);
+ perror("mincore");
+ mpx_dig_abort();
+ }
+ for (i = 0; i < sizeof(vec); i++)
+ incore += vec[i];
+ dprintf4("%s() total incore: %d\n", __func__, incore);
+ if (!incore)
+ return NULL;
+ dprintf3("%s() total incore: %d\n", __func__, incore);
+ return dig_bounds_dir_ptr;
+}
+
+int inspect_pid(int pid)
+{
+ static int dig_nr;
+ long offset_inside_bounds_dir;
+ char bounds_dir_buf[sizeof(unsigned long) * (1UL << 15)];
+ char *dig_bounds_dir_ptr;
+ int total_entries = 0;
+ int nr_populated_bdes = 0;
+ int inspect_self;
+
+ if (getpid() == pid) {
+ dprintf4("inspecting self\n");
+ inspect_self = 1;
+ } else {
+ dprintf4("inspecting pid %d\n", pid);
+ mpx_dig_abort();
+ }
+
+ for (offset_inside_bounds_dir = 0;
+ offset_inside_bounds_dir < MPX_BOUNDS_TABLE_SIZE_BYTES;
+ offset_inside_bounds_dir += sizeof(bounds_dir_buf)) {
+ static int bufs_skipped;
+ int this_entries;
+
+ if (inspect_self) {
+ dig_bounds_dir_ptr =
+ fill_bounds_dir_buf_self(offset_inside_bounds_dir,
+ sizeof(bounds_dir_buf),
+ &bounds_dir_buf[0]);
+ } else {
+ dig_bounds_dir_ptr =
+ fill_bounds_dir_buf_other(offset_inside_bounds_dir,
+ sizeof(bounds_dir_buf),
+ &bounds_dir_buf[0]);
+ }
+ if (!dig_bounds_dir_ptr) {
+ bufs_skipped++;
+ continue;
+ }
+ this_entries = search_bd_buf(dig_bounds_dir_ptr,
+ sizeof(bounds_dir_buf),
+ offset_inside_bounds_dir,
+ &nr_populated_bdes);
+ total_entries += this_entries;
+ }
+ printf("mpx dig (%3d) complete, SUCCESS (%8d / %4d)\n", ++dig_nr,
+ total_entries, nr_populated_bdes);
+ return total_entries + nr_populated_bdes;
+}
+
+#ifdef MPX_DIG_REMOTE
+int main(int argc, char **argv)
+{
+ int err;
+ char *c;
+ unsigned long bounds_dir_entry;
+ int pid;
+
+ printf("mpx-dig starting...\n");
+ err = sscanf(argv[1], "%d", &pid);
+ printf("parsing: '%s', err: %d\n", argv[1], err);
+ if (err != 1)
+ mpx_dig_abort();
+
+ err = sscanf(argv[2], "%lx", &bounds_dir_global);
+ printf("parsing: '%s': %d\n", argv[2], err);
+ if (err != 1)
+ mpx_dig_abort();
+
+ proc_pid_mem_fd = open_proc(pid, "mem");
+ if (proc_pid_mem_fd < 0)
+ mpx_dig_abort();
+
+ inspect_pid(pid);
+ return 0;
+}
+#endif
+
+long inspect_me(struct mpx_bounds_dir *bounds_dir)
+{
+ int pid = getpid();
+
+ pid_load_vaddrs(pid);
+ bounds_dir_global = (unsigned long)bounds_dir;
+ dprintf4("enter %s() bounds dir: %p\n", __func__, bounds_dir);
+ return inspect_pid(pid);
+}
diff --git a/tools/testing/selftests/x86/mpx-hw.h b/tools/testing/selftests/x86/mpx-hw.h
new file mode 100644
index 000000000..d1b61ab87
--- /dev/null
+++ b/tools/testing/selftests/x86/mpx-hw.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MPX_HW_H
+#define _MPX_HW_H
+
+#include <assert.h>
+
+/* Describe the MPX Hardware Layout in here */
+
+#define NR_MPX_BOUNDS_REGISTERS 4
+
+#ifdef __i386__
+
+#define MPX_BOUNDS_TABLE_ENTRY_SIZE_BYTES 16 /* 4 * 32-bits */
+#define MPX_BOUNDS_TABLE_SIZE_BYTES (1ULL << 14) /* 16k */
+#define MPX_BOUNDS_DIR_ENTRY_SIZE_BYTES 4
+#define MPX_BOUNDS_DIR_SIZE_BYTES (1ULL << 22) /* 4MB */
+
+#define MPX_BOUNDS_TABLE_BOTTOM_BIT 2
+#define MPX_BOUNDS_TABLE_TOP_BIT 11
+#define MPX_BOUNDS_DIR_BOTTOM_BIT 12
+#define MPX_BOUNDS_DIR_TOP_BIT 31
+
+#else
+
+/*
+ * Linear Address of "pointer" (LAp)
+ * 0 -> 2: ignored
+ * 3 -> 19: index in to bounds table
+ * 20 -> 47: index in to bounds directory
+ * 48 -> 63: ignored
+ */
+
+#define MPX_BOUNDS_TABLE_ENTRY_SIZE_BYTES 32
+#define MPX_BOUNDS_TABLE_SIZE_BYTES (1ULL << 22) /* 4MB */
+#define MPX_BOUNDS_DIR_ENTRY_SIZE_BYTES 8
+#define MPX_BOUNDS_DIR_SIZE_BYTES (1ULL << 31) /* 2GB */
+
+#define MPX_BOUNDS_TABLE_BOTTOM_BIT 3
+#define MPX_BOUNDS_TABLE_TOP_BIT 19
+#define MPX_BOUNDS_DIR_BOTTOM_BIT 20
+#define MPX_BOUNDS_DIR_TOP_BIT 47
+
+#endif
+
+#define MPX_BOUNDS_DIR_NR_ENTRIES \
+ (MPX_BOUNDS_DIR_SIZE_BYTES/MPX_BOUNDS_DIR_ENTRY_SIZE_BYTES)
+#define MPX_BOUNDS_TABLE_NR_ENTRIES \
+ (MPX_BOUNDS_TABLE_SIZE_BYTES/MPX_BOUNDS_TABLE_ENTRY_SIZE_BYTES)
+
+#define MPX_BOUNDS_TABLE_ENTRY_VALID_BIT 0x1
+
+struct mpx_bd_entry {
+ union {
+ char x[MPX_BOUNDS_DIR_ENTRY_SIZE_BYTES];
+ void *contents[0];
+ };
+} __attribute__((packed));
+
+struct mpx_bt_entry {
+ union {
+ char x[MPX_BOUNDS_TABLE_ENTRY_SIZE_BYTES];
+ unsigned long contents[0];
+ };
+} __attribute__((packed));
+
+struct mpx_bounds_dir {
+ struct mpx_bd_entry entries[MPX_BOUNDS_DIR_NR_ENTRIES];
+} __attribute__((packed));
+
+struct mpx_bounds_table {
+ struct mpx_bt_entry entries[MPX_BOUNDS_TABLE_NR_ENTRIES];
+} __attribute__((packed));
+
+static inline unsigned long GET_BITS(unsigned long val, int bottombit, int topbit)
+{
+ int total_nr_bits = topbit - bottombit;
+ unsigned long mask = (1UL << total_nr_bits)-1;
+ return (val >> bottombit) & mask;
+}
+
+static inline unsigned long __vaddr_bounds_table_index(void *vaddr)
+{
+ return GET_BITS((unsigned long)vaddr, MPX_BOUNDS_TABLE_BOTTOM_BIT,
+ MPX_BOUNDS_TABLE_TOP_BIT);
+}
+
+static inline unsigned long __vaddr_bounds_directory_index(void *vaddr)
+{
+ return GET_BITS((unsigned long)vaddr, MPX_BOUNDS_DIR_BOTTOM_BIT,
+ MPX_BOUNDS_DIR_TOP_BIT);
+}
+
+static inline struct mpx_bd_entry *mpx_vaddr_to_bd_entry(void *vaddr,
+ struct mpx_bounds_dir *bounds_dir)
+{
+ unsigned long index = __vaddr_bounds_directory_index(vaddr);
+ return &bounds_dir->entries[index];
+}
+
+static inline int bd_entry_valid(struct mpx_bd_entry *bounds_dir_entry)
+{
+ unsigned long __bd_entry = (unsigned long)bounds_dir_entry->contents;
+ return (__bd_entry & MPX_BOUNDS_TABLE_ENTRY_VALID_BIT);
+}
+
+static inline struct mpx_bounds_table *
+__bd_entry_to_bounds_table(struct mpx_bd_entry *bounds_dir_entry)
+{
+ unsigned long __bd_entry = (unsigned long)bounds_dir_entry->contents;
+ assert(__bd_entry & MPX_BOUNDS_TABLE_ENTRY_VALID_BIT);
+ __bd_entry &= ~MPX_BOUNDS_TABLE_ENTRY_VALID_BIT;
+ return (struct mpx_bounds_table *)__bd_entry;
+}
+
+static inline struct mpx_bt_entry *
+mpx_vaddr_to_bt_entry(void *vaddr, struct mpx_bounds_dir *bounds_dir)
+{
+ struct mpx_bd_entry *bde = mpx_vaddr_to_bd_entry(vaddr, bounds_dir);
+ struct mpx_bounds_table *bt = __bd_entry_to_bounds_table(bde);
+ unsigned long index = __vaddr_bounds_table_index(vaddr);
+ return &bt->entries[index];
+}
+
+#endif /* _MPX_HW_H */
diff --git a/tools/testing/selftests/x86/mpx-mini-test.c b/tools/testing/selftests/x86/mpx-mini-test.c
new file mode 100644
index 000000000..50f7e9272
--- /dev/null
+++ b/tools/testing/selftests/x86/mpx-mini-test.c
@@ -0,0 +1,1616 @@
+/*
+ * mpx-mini-test.c: routines to test Intel MPX (Memory Protection eXtentions)
+ *
+ * Written by:
+ * "Ren, Qiaowei" <qiaowei.ren@intel.com>
+ * "Wei, Gang" <gang.wei@intel.com>
+ * "Hansen, Dave" <dave.hansen@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2.
+ */
+
+/*
+ * 2014-12-05: Dave Hansen: fixed all of the compiler warnings, and made sure
+ * it works on 32-bit.
+ */
+
+int inspect_every_this_many_mallocs = 100;
+int zap_all_every_this_many_mallocs = 1000;
+
+#define _GNU_SOURCE
+#define _LARGEFILE64_SOURCE
+
+#include <string.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <ucontext.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "mpx-hw.h"
+#include "mpx-debug.h"
+#include "mpx-mm.h"
+
+#ifndef __always_inline
+#define __always_inline inline __attribute__((always_inline)
+#endif
+
+#ifndef TEST_DURATION_SECS
+#define TEST_DURATION_SECS 3
+#endif
+
+void write_int_to(char *prefix, char *file, int int_to_write)
+{
+ char buf[100];
+ int fd = open(file, O_RDWR);
+ int len;
+ int ret;
+
+ assert(fd >= 0);
+ len = snprintf(buf, sizeof(buf), "%s%d", prefix, int_to_write);
+ assert(len >= 0);
+ assert(len < sizeof(buf));
+ ret = write(fd, buf, len);
+ assert(ret == len);
+ ret = close(fd);
+ assert(!ret);
+}
+
+void write_pid_to(char *prefix, char *file)
+{
+ write_int_to(prefix, file, getpid());
+}
+
+void trace_me(void)
+{
+/* tracing events dir */
+#define TED "/sys/kernel/debug/tracing/events/"
+/*
+ write_pid_to("common_pid=", TED "signal/filter");
+ write_pid_to("common_pid=", TED "exceptions/filter");
+ write_int_to("", TED "signal/enable", 1);
+ write_int_to("", TED "exceptions/enable", 1);
+*/
+ write_pid_to("", "/sys/kernel/debug/tracing/set_ftrace_pid");
+ write_int_to("", "/sys/kernel/debug/tracing/trace", 0);
+}
+
+#define test_failed() __test_failed(__FILE__, __LINE__)
+static void __test_failed(char *f, int l)
+{
+ fprintf(stderr, "abort @ %s::%d\n", f, l);
+ abort();
+}
+
+/* Error Printf */
+#define eprintf(args...) fprintf(stderr, args)
+
+#ifdef __i386__
+
+/* i386 directory size is 4MB */
+#define REG_IP_IDX REG_EIP
+#define REX_PREFIX
+
+#define XSAVE_OFFSET_IN_FPMEM sizeof(struct _libc_fpstate)
+
+/*
+ * __cpuid() is from the Linux Kernel:
+ */
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ /* ecx is often an input as well as an output. */
+ asm volatile(
+ "push %%ebx;"
+ "cpuid;"
+ "mov %%ebx, %1;"
+ "pop %%ebx"
+ : "=a" (*eax),
+ "=g" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx));
+}
+
+#else /* __i386__ */
+
+#define REG_IP_IDX REG_RIP
+#define REX_PREFIX "0x48, "
+
+#define XSAVE_OFFSET_IN_FPMEM 0
+
+/*
+ * __cpuid() is from the Linux Kernel:
+ */
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ /* ecx is often an input as well as an output. */
+ asm volatile(
+ "cpuid;"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx));
+}
+
+#endif /* !__i386__ */
+
+struct xsave_hdr_struct {
+ uint64_t xstate_bv;
+ uint64_t reserved1[2];
+ uint64_t reserved2[5];
+} __attribute__((packed));
+
+struct bndregs_struct {
+ uint64_t bndregs[8];
+} __attribute__((packed));
+
+struct bndcsr_struct {
+ uint64_t cfg_reg_u;
+ uint64_t status_reg;
+} __attribute__((packed));
+
+struct xsave_struct {
+ uint8_t fpu_sse[512];
+ struct xsave_hdr_struct xsave_hdr;
+ uint8_t ymm[256];
+ uint8_t lwp[128];
+ struct bndregs_struct bndregs;
+ struct bndcsr_struct bndcsr;
+} __attribute__((packed));
+
+uint8_t __attribute__((__aligned__(64))) buffer[4096];
+struct xsave_struct *xsave_buf = (struct xsave_struct *)buffer;
+
+uint8_t __attribute__((__aligned__(64))) test_buffer[4096];
+struct xsave_struct *xsave_test_buf = (struct xsave_struct *)test_buffer;
+
+uint64_t num_bnd_chk;
+
+static __always_inline void xrstor_state(struct xsave_struct *fx, uint64_t mask)
+{
+ uint32_t lmask = mask;
+ uint32_t hmask = mask >> 32;
+
+ asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
+ : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+}
+
+static __always_inline void xsave_state_1(void *_fx, uint64_t mask)
+{
+ uint32_t lmask = mask;
+ uint32_t hmask = mask >> 32;
+ unsigned char *fx = _fx;
+
+ asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x27\n\t"
+ : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+}
+
+static inline uint64_t xgetbv(uint32_t index)
+{
+ uint32_t eax, edx;
+
+ asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */
+ : "=a" (eax), "=d" (edx)
+ : "c" (index));
+ return eax + ((uint64_t)edx << 32);
+}
+
+static uint64_t read_mpx_status_sig(ucontext_t *uctxt)
+{
+ memset(buffer, 0, sizeof(buffer));
+ memcpy(buffer,
+ (uint8_t *)uctxt->uc_mcontext.fpregs + XSAVE_OFFSET_IN_FPMEM,
+ sizeof(struct xsave_struct));
+
+ return xsave_buf->bndcsr.status_reg;
+}
+
+#include <pthread.h>
+
+static uint8_t *get_next_inst_ip(uint8_t *addr)
+{
+ uint8_t *ip = addr;
+ uint8_t sib;
+ uint8_t rm;
+ uint8_t mod;
+ uint8_t base;
+ uint8_t modrm;
+
+ /* determine the prefix. */
+ switch(*ip) {
+ case 0xf2:
+ case 0xf3:
+ case 0x66:
+ ip++;
+ break;
+ }
+
+ /* look for rex prefix */
+ if ((*ip & 0x40) == 0x40)
+ ip++;
+
+ /* Make sure we have a MPX instruction. */
+ if (*ip++ != 0x0f)
+ return addr;
+
+ /* Skip the op code byte. */
+ ip++;
+
+ /* Get the modrm byte. */
+ modrm = *ip++;
+
+ /* Break it down into parts. */
+ rm = modrm & 7;
+ mod = (modrm >> 6);
+
+ /* Init the parts of the address mode. */
+ base = 8;
+
+ /* Is it a mem mode? */
+ if (mod != 3) {
+ /* look for scaled indexed addressing */
+ if (rm == 4) {
+ /* SIB addressing */
+ sib = *ip++;
+ base = sib & 7;
+ switch (mod) {
+ case 0:
+ if (base == 5)
+ ip += 4;
+ break;
+
+ case 1:
+ ip++;
+ break;
+
+ case 2:
+ ip += 4;
+ break;
+ }
+
+ } else {
+ /* MODRM addressing */
+ switch (mod) {
+ case 0:
+ /* DISP32 addressing, no base */
+ if (rm == 5)
+ ip += 4;
+ break;
+
+ case 1:
+ ip++;
+ break;
+
+ case 2:
+ ip += 4;
+ break;
+ }
+ }
+ }
+ return ip;
+}
+
+#ifdef si_lower
+static inline void *__si_bounds_lower(siginfo_t *si)
+{
+ return si->si_lower;
+}
+
+static inline void *__si_bounds_upper(siginfo_t *si)
+{
+ return si->si_upper;
+}
+#else
+
+/*
+ * This deals with old version of _sigfault in some distros:
+ *
+
+old _sigfault:
+ struct {
+ void *si_addr;
+ } _sigfault;
+
+new _sigfault:
+ struct {
+ void __user *_addr;
+ int _trapno;
+ short _addr_lsb;
+ union {
+ struct {
+ void __user *_lower;
+ void __user *_upper;
+ } _addr_bnd;
+ __u32 _pkey;
+ };
+ } _sigfault;
+ *
+ */
+
+static inline void **__si_bounds_hack(siginfo_t *si)
+{
+ void *sigfault = &si->_sifields._sigfault;
+ void *end_sigfault = sigfault + sizeof(si->_sifields._sigfault);
+ int *trapno = (int*)end_sigfault;
+ /* skip _trapno and _addr_lsb */
+ void **__si_lower = (void**)(trapno + 2);
+
+ return __si_lower;
+}
+
+static inline void *__si_bounds_lower(siginfo_t *si)
+{
+ return *__si_bounds_hack(si);
+}
+
+static inline void *__si_bounds_upper(siginfo_t *si)
+{
+ return *(__si_bounds_hack(si) + 1);
+}
+#endif
+
+static int br_count;
+static int expected_bnd_index = -1;
+uint64_t shadow_plb[NR_MPX_BOUNDS_REGISTERS][2]; /* shadow MPX bound registers */
+unsigned long shadow_map[NR_MPX_BOUNDS_REGISTERS];
+
+/* Failed address bound checks: */
+#ifndef SEGV_BNDERR
+# define SEGV_BNDERR 3
+#endif
+
+/*
+ * The kernel is supposed to provide some information about the bounds
+ * exception in the siginfo. It should match what we have in the bounds
+ * registers that we are checking against. Just check against the shadow copy
+ * since it is easily available, and we also check that *it* matches the real
+ * registers.
+ */
+void check_siginfo_vs_shadow(siginfo_t* si)
+{
+ int siginfo_ok = 1;
+ void *shadow_lower = (void *)(unsigned long)shadow_plb[expected_bnd_index][0];
+ void *shadow_upper = (void *)(unsigned long)shadow_plb[expected_bnd_index][1];
+
+ if ((expected_bnd_index < 0) ||
+ (expected_bnd_index >= NR_MPX_BOUNDS_REGISTERS)) {
+ fprintf(stderr, "ERROR: invalid expected_bnd_index: %d\n",
+ expected_bnd_index);
+ exit(6);
+ }
+ if (__si_bounds_lower(si) != shadow_lower)
+ siginfo_ok = 0;
+ if (__si_bounds_upper(si) != shadow_upper)
+ siginfo_ok = 0;
+
+ if (!siginfo_ok) {
+ fprintf(stderr, "ERROR: siginfo bounds do not match "
+ "shadow bounds for register %d\n", expected_bnd_index);
+ exit(7);
+ }
+}
+
+void handler(int signum, siginfo_t *si, void *vucontext)
+{
+ int i;
+ ucontext_t *uctxt = vucontext;
+ int trapno;
+ unsigned long ip;
+
+ dprintf1("entered signal handler\n");
+
+ trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO];
+ ip = uctxt->uc_mcontext.gregs[REG_IP_IDX];
+
+ if (trapno == 5) {
+ typeof(si->si_addr) *si_addr_ptr = &si->si_addr;
+ uint64_t status = read_mpx_status_sig(uctxt);
+ uint64_t br_reason = status & 0x3;
+
+ br_count++;
+ dprintf1("#BR 0x%jx (total seen: %d)\n", status, br_count);
+
+ dprintf2("Saw a #BR! status 0x%jx at %016lx br_reason: %jx\n",
+ status, ip, br_reason);
+ dprintf2("si_signo: %d\n", si->si_signo);
+ dprintf2(" signum: %d\n", signum);
+ dprintf2("info->si_code == SEGV_BNDERR: %d\n",
+ (si->si_code == SEGV_BNDERR));
+ dprintf2("info->si_code: %d\n", si->si_code);
+ dprintf2("info->si_lower: %p\n", __si_bounds_lower(si));
+ dprintf2("info->si_upper: %p\n", __si_bounds_upper(si));
+
+ for (i = 0; i < 8; i++)
+ dprintf3("[%d]: %p\n", i, si_addr_ptr[i]);
+ switch (br_reason) {
+ case 0: /* traditional BR */
+ fprintf(stderr,
+ "Undefined status with bound exception:%jx\n",
+ status);
+ exit(5);
+ case 1: /* #BR MPX bounds exception */
+ /* these are normal and we expect to see them */
+
+ check_siginfo_vs_shadow(si);
+
+ dprintf1("bounds exception (normal): status 0x%jx at %p si_addr: %p\n",
+ status, (void *)ip, si->si_addr);
+ num_bnd_chk++;
+ uctxt->uc_mcontext.gregs[REG_IP_IDX] =
+ (greg_t)get_next_inst_ip((uint8_t *)ip);
+ break;
+ case 2:
+ fprintf(stderr, "#BR status == 2, missing bounds table,"
+ "kernel should have handled!!\n");
+ exit(4);
+ break;
+ default:
+ fprintf(stderr, "bound check error: status 0x%jx at %p\n",
+ status, (void *)ip);
+ num_bnd_chk++;
+ uctxt->uc_mcontext.gregs[REG_IP_IDX] =
+ (greg_t)get_next_inst_ip((uint8_t *)ip);
+ fprintf(stderr, "bound check error: si_addr %p\n", si->si_addr);
+ exit(3);
+ }
+ } else if (trapno == 14) {
+ eprintf("ERROR: In signal handler, page fault, trapno = %d, ip = %016lx\n",
+ trapno, ip);
+ eprintf("si_addr %p\n", si->si_addr);
+ eprintf("REG_ERR: %lx\n", (unsigned long)uctxt->uc_mcontext.gregs[REG_ERR]);
+ test_failed();
+ } else {
+ eprintf("unexpected trap %d! at 0x%lx\n", trapno, ip);
+ eprintf("si_addr %p\n", si->si_addr);
+ eprintf("REG_ERR: %lx\n", (unsigned long)uctxt->uc_mcontext.gregs[REG_ERR]);
+ test_failed();
+ }
+}
+
+static inline void cpuid_count(unsigned int op, int count,
+ unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ *eax = op;
+ *ecx = count;
+ __cpuid(eax, ebx, ecx, edx);
+}
+
+#define XSTATE_CPUID 0x0000000d
+
+/*
+ * List of XSAVE features Linux knows about:
+ */
+enum xfeature_bit {
+ XSTATE_BIT_FP,
+ XSTATE_BIT_SSE,
+ XSTATE_BIT_YMM,
+ XSTATE_BIT_BNDREGS,
+ XSTATE_BIT_BNDCSR,
+ XSTATE_BIT_OPMASK,
+ XSTATE_BIT_ZMM_Hi256,
+ XSTATE_BIT_Hi16_ZMM,
+
+ XFEATURES_NR_MAX,
+};
+
+#define XSTATE_FP (1 << XSTATE_BIT_FP)
+#define XSTATE_SSE (1 << XSTATE_BIT_SSE)
+#define XSTATE_YMM (1 << XSTATE_BIT_YMM)
+#define XSTATE_BNDREGS (1 << XSTATE_BIT_BNDREGS)
+#define XSTATE_BNDCSR (1 << XSTATE_BIT_BNDCSR)
+#define XSTATE_OPMASK (1 << XSTATE_BIT_OPMASK)
+#define XSTATE_ZMM_Hi256 (1 << XSTATE_BIT_ZMM_Hi256)
+#define XSTATE_Hi16_ZMM (1 << XSTATE_BIT_Hi16_ZMM)
+
+#define MPX_XSTATES (XSTATE_BNDREGS | XSTATE_BNDCSR) /* 0x18 */
+
+bool one_bit(unsigned int x, int bit)
+{
+ return !!(x & (1<<bit));
+}
+
+void print_state_component(int state_bit_nr, char *name)
+{
+ unsigned int eax, ebx, ecx, edx;
+ unsigned int state_component_size;
+ unsigned int state_component_supervisor;
+ unsigned int state_component_user;
+ unsigned int state_component_aligned;
+
+ /* See SDM Section 13.2 */
+ cpuid_count(XSTATE_CPUID, state_bit_nr, &eax, &ebx, &ecx, &edx);
+ assert(eax || ebx || ecx);
+ state_component_size = eax;
+ state_component_supervisor = ((!ebx) && one_bit(ecx, 0));
+ state_component_user = !one_bit(ecx, 0);
+ state_component_aligned = one_bit(ecx, 1);
+ printf("%8s: size: %d user: %d supervisor: %d aligned: %d\n",
+ name,
+ state_component_size, state_component_user,
+ state_component_supervisor, state_component_aligned);
+
+}
+
+/* Intel-defined CPU features, CPUID level 0x00000001 (ecx) */
+#define XSAVE_FEATURE_BIT (26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
+#define OSXSAVE_FEATURE_BIT (27) /* XSAVE enabled in the OS */
+
+bool check_mpx_support(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid_count(1, 0, &eax, &ebx, &ecx, &edx);
+
+ /* We can't do much without XSAVE, so just make these assert()'s */
+ if (!one_bit(ecx, XSAVE_FEATURE_BIT)) {
+ fprintf(stderr, "processor lacks XSAVE, can not run MPX tests\n");
+ exit(0);
+ }
+
+ if (!one_bit(ecx, OSXSAVE_FEATURE_BIT)) {
+ fprintf(stderr, "processor lacks OSXSAVE, can not run MPX tests\n");
+ exit(0);
+ }
+
+ /* CPUs not supporting the XSTATE CPUID leaf do not support MPX */
+ /* Is this redundant with the feature bit checks? */
+ cpuid_count(0, 0, &eax, &ebx, &ecx, &edx);
+ if (eax < XSTATE_CPUID) {
+ fprintf(stderr, "processor lacks XSTATE CPUID leaf,"
+ " can not run MPX tests\n");
+ exit(0);
+ }
+
+ printf("XSAVE is supported by HW & OS\n");
+
+ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
+
+ printf("XSAVE processor supported state mask: 0x%x\n", eax);
+ printf("XSAVE OS supported state mask: 0x%jx\n", xgetbv(0));
+
+ /* Make sure that the MPX states are enabled in in XCR0 */
+ if ((eax & MPX_XSTATES) != MPX_XSTATES) {
+ fprintf(stderr, "processor lacks MPX XSTATE(s), can not run MPX tests\n");
+ exit(0);
+ }
+
+ /* Make sure the MPX states are supported by XSAVE* */
+ if ((xgetbv(0) & MPX_XSTATES) != MPX_XSTATES) {
+ fprintf(stderr, "MPX XSTATE(s) no enabled in XCR0, "
+ "can not run MPX tests\n");
+ exit(0);
+ }
+
+ print_state_component(XSTATE_BIT_BNDREGS, "BNDREGS");
+ print_state_component(XSTATE_BIT_BNDCSR, "BNDCSR");
+
+ return true;
+}
+
+void enable_mpx(void *l1base)
+{
+ /* enable point lookup */
+ memset(buffer, 0, sizeof(buffer));
+ xrstor_state(xsave_buf, 0x18);
+
+ xsave_buf->xsave_hdr.xstate_bv = 0x10;
+ xsave_buf->bndcsr.cfg_reg_u = (unsigned long)l1base | 1;
+ xsave_buf->bndcsr.status_reg = 0;
+
+ dprintf2("bf xrstor\n");
+ dprintf2("xsave cndcsr: status %jx, configu %jx\n",
+ xsave_buf->bndcsr.status_reg, xsave_buf->bndcsr.cfg_reg_u);
+ xrstor_state(xsave_buf, 0x18);
+ dprintf2("after xrstor\n");
+
+ xsave_state_1(xsave_buf, 0x18);
+
+ dprintf1("xsave bndcsr: status %jx, configu %jx\n",
+ xsave_buf->bndcsr.status_reg, xsave_buf->bndcsr.cfg_reg_u);
+}
+
+#include <sys/prctl.h>
+
+struct mpx_bounds_dir *bounds_dir_ptr;
+
+unsigned long __bd_incore(const char *func, int line)
+{
+ unsigned long ret = nr_incore(bounds_dir_ptr, MPX_BOUNDS_DIR_SIZE_BYTES);
+ return ret;
+}
+#define bd_incore() __bd_incore(__func__, __LINE__)
+
+void check_clear(void *ptr, unsigned long sz)
+{
+ unsigned long *i;
+
+ for (i = ptr; (void *)i < ptr + sz; i++) {
+ if (*i) {
+ dprintf1("%p is NOT clear at %p\n", ptr, i);
+ assert(0);
+ }
+ }
+ dprintf1("%p is clear for %lx\n", ptr, sz);
+}
+
+void check_clear_bd(void)
+{
+ check_clear(bounds_dir_ptr, 2UL << 30);
+}
+
+#define USE_MALLOC_FOR_BOUNDS_DIR 1
+bool process_specific_init(void)
+{
+ unsigned long size;
+ unsigned long *dir;
+ /* Guarantee we have the space to align it, add padding: */
+ unsigned long pad = getpagesize();
+
+ size = 2UL << 30; /* 2GB */
+ if (sizeof(unsigned long) == 4)
+ size = 4UL << 20; /* 4MB */
+ dprintf1("trying to allocate %ld MB bounds directory\n", (size >> 20));
+
+ if (USE_MALLOC_FOR_BOUNDS_DIR) {
+ unsigned long _dir;
+
+ dir = malloc(size + pad);
+ assert(dir);
+ _dir = (unsigned long)dir;
+ _dir += 0xfffUL;
+ _dir &= ~0xfffUL;
+ dir = (void *)_dir;
+ } else {
+ /*
+ * This makes debugging easier because the address
+ * calculations are simpler:
+ */
+ dir = mmap((void *)0x200000000000, size + pad,
+ PROT_READ|PROT_WRITE,
+ MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+ if (dir == (void *)-1) {
+ perror("unable to allocate bounds directory");
+ abort();
+ }
+ check_clear(dir, size);
+ }
+ bounds_dir_ptr = (void *)dir;
+ madvise(bounds_dir_ptr, size, MADV_NOHUGEPAGE);
+ bd_incore();
+ dprintf1("bounds directory: 0x%p -> 0x%p\n", bounds_dir_ptr,
+ (char *)bounds_dir_ptr + size);
+ check_clear(dir, size);
+ enable_mpx(dir);
+ check_clear(dir, size);
+ if (prctl(43, 0, 0, 0, 0)) {
+ printf("no MPX support\n");
+ abort();
+ return false;
+ }
+ return true;
+}
+
+bool process_specific_finish(void)
+{
+ if (prctl(44)) {
+ printf("no MPX support\n");
+ return false;
+ }
+ return true;
+}
+
+void setup_handler()
+{
+ int r, rs;
+ struct sigaction newact;
+ struct sigaction oldact;
+
+ /* #BR is mapped to sigsegv */
+ int signum = SIGSEGV;
+
+ newact.sa_handler = 0; /* void(*)(int)*/
+ newact.sa_sigaction = handler; /* void (*)(int, siginfo_t*, void *) */
+
+ /*sigset_t - signals to block while in the handler */
+ /* get the old signal mask. */
+ rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask);
+ assert(rs == 0);
+
+ /* call sa_sigaction, not sa_handler*/
+ newact.sa_flags = SA_SIGINFO;
+
+ newact.sa_restorer = 0; /* void(*)(), obsolete */
+ r = sigaction(signum, &newact, &oldact);
+ assert(r == 0);
+}
+
+void mpx_prepare(void)
+{
+ dprintf2("%s()\n", __func__);
+ setup_handler();
+ process_specific_init();
+}
+
+void mpx_cleanup(void)
+{
+ printf("%s(): %jd BRs. bye...\n", __func__, num_bnd_chk);
+ process_specific_finish();
+}
+
+/*-------------- the following is test case ---------------*/
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+
+uint64_t num_lower_brs;
+uint64_t num_upper_brs;
+
+#define MPX_CONFIG_OFFSET 1024
+#define MPX_BOUNDS_OFFSET 960
+#define MPX_HEADER_OFFSET 512
+#define MAX_ADDR_TESTED (1<<28)
+#define TEST_ROUNDS 100
+
+/*
+ 0F 1A /r BNDLDX-Load
+ 0F 1B /r BNDSTX-Store Extended Bounds Using Address Translation
+ 66 0F 1A /r BNDMOV bnd1, bnd2/m128
+ 66 0F 1B /r BNDMOV bnd1/m128, bnd2
+ F2 0F 1A /r BNDCU bnd, r/m64
+ F2 0F 1B /r BNDCN bnd, r/m64
+ F3 0F 1A /r BNDCL bnd, r/m64
+ F3 0F 1B /r BNDMK bnd, m64
+*/
+
+static __always_inline void xsave_state(void *_fx, uint64_t mask)
+{
+ uint32_t lmask = mask;
+ uint32_t hmask = mask >> 32;
+ unsigned char *fx = _fx;
+
+ asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x27\n\t"
+ : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+}
+
+static __always_inline void mpx_clear_bnd0(void)
+{
+ long size = 0;
+ void *ptr = NULL;
+ /* F3 0F 1B /r BNDMK bnd, m64 */
+ /* f3 0f 1b 04 11 bndmk (%rcx,%rdx,1),%bnd0 */
+ asm volatile(".byte 0xf3,0x0f,0x1b,0x04,0x11\n\t"
+ : : "c" (ptr), "d" (size-1)
+ : "memory");
+}
+
+static __always_inline void mpx_make_bound_helper(unsigned long ptr,
+ unsigned long size)
+{
+ /* F3 0F 1B /r BNDMK bnd, m64 */
+ /* f3 0f 1b 04 11 bndmk (%rcx,%rdx,1),%bnd0 */
+ asm volatile(".byte 0xf3,0x0f,0x1b,0x04,0x11\n\t"
+ : : "c" (ptr), "d" (size-1)
+ : "memory");
+}
+
+static __always_inline void mpx_check_lowerbound_helper(unsigned long ptr)
+{
+ /* F3 0F 1A /r NDCL bnd, r/m64 */
+ /* f3 0f 1a 01 bndcl (%rcx),%bnd0 */
+ asm volatile(".byte 0xf3,0x0f,0x1a,0x01\n\t"
+ : : "c" (ptr)
+ : "memory");
+}
+
+static __always_inline void mpx_check_upperbound_helper(unsigned long ptr)
+{
+ /* F2 0F 1A /r BNDCU bnd, r/m64 */
+ /* f2 0f 1a 01 bndcu (%rcx),%bnd0 */
+ asm volatile(".byte 0xf2,0x0f,0x1a,0x01\n\t"
+ : : "c" (ptr)
+ : "memory");
+}
+
+static __always_inline void mpx_movbndreg_helper()
+{
+ /* 66 0F 1B /r BNDMOV bnd1/m128, bnd2 */
+ /* 66 0f 1b c2 bndmov %bnd0,%bnd2 */
+
+ asm volatile(".byte 0x66,0x0f,0x1b,0xc2\n\t");
+}
+
+static __always_inline void mpx_movbnd2mem_helper(uint8_t *mem)
+{
+ /* 66 0F 1B /r BNDMOV bnd1/m128, bnd2 */
+ /* 66 0f 1b 01 bndmov %bnd0,(%rcx) */
+ asm volatile(".byte 0x66,0x0f,0x1b,0x01\n\t"
+ : : "c" (mem)
+ : "memory");
+}
+
+static __always_inline void mpx_movbnd_from_mem_helper(uint8_t *mem)
+{
+ /* 66 0F 1A /r BNDMOV bnd1, bnd2/m128 */
+ /* 66 0f 1a 01 bndmov (%rcx),%bnd0 */
+ asm volatile(".byte 0x66,0x0f,0x1a,0x01\n\t"
+ : : "c" (mem)
+ : "memory");
+}
+
+static __always_inline void mpx_store_dsc_helper(unsigned long ptr_addr,
+ unsigned long ptr_val)
+{
+ /* 0F 1B /r BNDSTX-Store Extended Bounds Using Address Translation */
+ /* 0f 1b 04 11 bndstx %bnd0,(%rcx,%rdx,1) */
+ asm volatile(".byte 0x0f,0x1b,0x04,0x11\n\t"
+ : : "c" (ptr_addr), "d" (ptr_val)
+ : "memory");
+}
+
+static __always_inline void mpx_load_dsc_helper(unsigned long ptr_addr,
+ unsigned long ptr_val)
+{
+ /* 0F 1A /r BNDLDX-Load */
+ /*/ 0f 1a 04 11 bndldx (%rcx,%rdx,1),%bnd0 */
+ asm volatile(".byte 0x0f,0x1a,0x04,0x11\n\t"
+ : : "c" (ptr_addr), "d" (ptr_val)
+ : "memory");
+}
+
+void __print_context(void *__print_xsave_buffer, int line)
+{
+ uint64_t *bounds = (uint64_t *)(__print_xsave_buffer + MPX_BOUNDS_OFFSET);
+ uint64_t *cfg = (uint64_t *)(__print_xsave_buffer + MPX_CONFIG_OFFSET);
+
+ int i;
+ eprintf("%s()::%d\n", "print_context", line);
+ for (i = 0; i < 4; i++) {
+ eprintf("bound[%d]: 0x%016lx 0x%016lx(0x%016lx)\n", i,
+ (unsigned long)bounds[i*2],
+ ~(unsigned long)bounds[i*2+1],
+ (unsigned long)bounds[i*2+1]);
+ }
+
+ eprintf("cpcfg: %jx cpstatus: %jx\n", cfg[0], cfg[1]);
+}
+#define print_context(x) __print_context(x, __LINE__)
+#ifdef DEBUG
+#define dprint_context(x) print_context(x)
+#else
+#define dprint_context(x) do{}while(0)
+#endif
+
+void init()
+{
+ int i;
+
+ srand((unsigned int)time(NULL));
+
+ for (i = 0; i < 4; i++) {
+ shadow_plb[i][0] = 0;
+ shadow_plb[i][1] = ~(unsigned long)0;
+ }
+}
+
+long int __mpx_random(int line)
+{
+#ifdef NOT_SO_RANDOM
+ static long fake = 722122311;
+ fake += 563792075;
+ return fakse;
+#else
+ return random();
+#endif
+}
+#define mpx_random() __mpx_random(__LINE__)
+
+uint8_t *get_random_addr()
+{
+ uint8_t*addr = (uint8_t *)(unsigned long)(rand() % MAX_ADDR_TESTED);
+ return (addr - (unsigned long)addr % sizeof(uint8_t *));
+}
+
+static inline bool compare_context(void *__xsave_buffer)
+{
+ uint64_t *bounds = (uint64_t *)(__xsave_buffer + MPX_BOUNDS_OFFSET);
+
+ int i;
+ for (i = 0; i < 4; i++) {
+ dprintf3("shadow[%d]{%016lx/%016lx}\nbounds[%d]{%016lx/%016lx}\n",
+ i, (unsigned long)shadow_plb[i][0], (unsigned long)shadow_plb[i][1],
+ i, (unsigned long)bounds[i*2], ~(unsigned long)bounds[i*2+1]);
+ if ((shadow_plb[i][0] != bounds[i*2]) ||
+ (shadow_plb[i][1] != ~(unsigned long)bounds[i*2+1])) {
+ eprintf("ERROR comparing shadow to real bound register %d\n", i);
+ eprintf("shadow{0x%016lx/0x%016lx}\nbounds{0x%016lx/0x%016lx}\n",
+ (unsigned long)shadow_plb[i][0], (unsigned long)shadow_plb[i][1],
+ (unsigned long)bounds[i*2], (unsigned long)bounds[i*2+1]);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void mkbnd_shadow(uint8_t *ptr, int index, long offset)
+{
+ uint64_t *lower = (uint64_t *)&(shadow_plb[index][0]);
+ uint64_t *upper = (uint64_t *)&(shadow_plb[index][1]);
+ *lower = (unsigned long)ptr;
+ *upper = (unsigned long)ptr + offset - 1;
+}
+
+void check_lowerbound_shadow(uint8_t *ptr, int index)
+{
+ uint64_t *lower = (uint64_t *)&(shadow_plb[index][0]);
+ if (*lower > (uint64_t)(unsigned long)ptr)
+ num_lower_brs++;
+ else
+ dprintf1("LowerBoundChk passed:%p\n", ptr);
+}
+
+void check_upperbound_shadow(uint8_t *ptr, int index)
+{
+ uint64_t upper = *(uint64_t *)&(shadow_plb[index][1]);
+ if (upper < (uint64_t)(unsigned long)ptr)
+ num_upper_brs++;
+ else
+ dprintf1("UpperBoundChk passed:%p\n", ptr);
+}
+
+__always_inline void movbndreg_shadow(int src, int dest)
+{
+ shadow_plb[dest][0] = shadow_plb[src][0];
+ shadow_plb[dest][1] = shadow_plb[src][1];
+}
+
+__always_inline void movbnd2mem_shadow(int src, unsigned long *dest)
+{
+ unsigned long *lower = (unsigned long *)&(shadow_plb[src][0]);
+ unsigned long *upper = (unsigned long *)&(shadow_plb[src][1]);
+ *dest = *lower;
+ *(dest+1) = *upper;
+}
+
+__always_inline void movbnd_from_mem_shadow(unsigned long *src, int dest)
+{
+ unsigned long *lower = (unsigned long *)&(shadow_plb[dest][0]);
+ unsigned long *upper = (unsigned long *)&(shadow_plb[dest][1]);
+ *lower = *src;
+ *upper = *(src+1);
+}
+
+__always_inline void stdsc_shadow(int index, uint8_t *ptr, uint8_t *ptr_val)
+{
+ shadow_map[0] = (unsigned long)shadow_plb[index][0];
+ shadow_map[1] = (unsigned long)shadow_plb[index][1];
+ shadow_map[2] = (unsigned long)ptr_val;
+ dprintf3("%s(%d, %p, %p) set shadow map[2]: %p\n", __func__,
+ index, ptr, ptr_val, ptr_val);
+ /*ptr ignored */
+}
+
+void lddsc_shadow(int index, uint8_t *ptr, uint8_t *ptr_val)
+{
+ uint64_t lower = shadow_map[0];
+ uint64_t upper = shadow_map[1];
+ uint8_t *value = (uint8_t *)shadow_map[2];
+
+ if (value != ptr_val) {
+ dprintf2("%s(%d, %p, %p) init shadow bounds[%d] "
+ "because %p != %p\n", __func__, index, ptr,
+ ptr_val, index, value, ptr_val);
+ shadow_plb[index][0] = 0;
+ shadow_plb[index][1] = ~(unsigned long)0;
+ } else {
+ shadow_plb[index][0] = lower;
+ shadow_plb[index][1] = upper;
+ }
+ /* ptr ignored */
+}
+
+static __always_inline void mpx_test_helper0(uint8_t *buf, uint8_t *ptr)
+{
+ mpx_make_bound_helper((unsigned long)ptr, 0x1800);
+}
+
+static __always_inline void mpx_test_helper0_shadow(uint8_t *buf, uint8_t *ptr)
+{
+ mkbnd_shadow(ptr, 0, 0x1800);
+}
+
+static __always_inline void mpx_test_helper1(uint8_t *buf, uint8_t *ptr)
+{
+ /* these are hard-coded to check bnd0 */
+ expected_bnd_index = 0;
+ mpx_check_lowerbound_helper((unsigned long)(ptr-1));
+ mpx_check_upperbound_helper((unsigned long)(ptr+0x1800));
+ /* reset this since we do not expect any more bounds exceptions */
+ expected_bnd_index = -1;
+}
+
+static __always_inline void mpx_test_helper1_shadow(uint8_t *buf, uint8_t *ptr)
+{
+ check_lowerbound_shadow(ptr-1, 0);
+ check_upperbound_shadow(ptr+0x1800, 0);
+}
+
+static __always_inline void mpx_test_helper2(uint8_t *buf, uint8_t *ptr)
+{
+ mpx_make_bound_helper((unsigned long)ptr, 0x1800);
+ mpx_movbndreg_helper();
+ mpx_movbnd2mem_helper(buf);
+ mpx_make_bound_helper((unsigned long)(ptr+0x12), 0x1800);
+}
+
+static __always_inline void mpx_test_helper2_shadow(uint8_t *buf, uint8_t *ptr)
+{
+ mkbnd_shadow(ptr, 0, 0x1800);
+ movbndreg_shadow(0, 2);
+ movbnd2mem_shadow(0, (unsigned long *)buf);
+ mkbnd_shadow(ptr+0x12, 0, 0x1800);
+}
+
+static __always_inline void mpx_test_helper3(uint8_t *buf, uint8_t *ptr)
+{
+ mpx_movbnd_from_mem_helper(buf);
+}
+
+static __always_inline void mpx_test_helper3_shadow(uint8_t *buf, uint8_t *ptr)
+{
+ movbnd_from_mem_shadow((unsigned long *)buf, 0);
+}
+
+static __always_inline void mpx_test_helper4(uint8_t *buf, uint8_t *ptr)
+{
+ mpx_store_dsc_helper((unsigned long)buf, (unsigned long)ptr);
+ mpx_make_bound_helper((unsigned long)(ptr+0x12), 0x1800);
+}
+
+static __always_inline void mpx_test_helper4_shadow(uint8_t *buf, uint8_t *ptr)
+{
+ stdsc_shadow(0, buf, ptr);
+ mkbnd_shadow(ptr+0x12, 0, 0x1800);
+}
+
+static __always_inline void mpx_test_helper5(uint8_t *buf, uint8_t *ptr)
+{
+ mpx_load_dsc_helper((unsigned long)buf, (unsigned long)ptr);
+}
+
+static __always_inline void mpx_test_helper5_shadow(uint8_t *buf, uint8_t *ptr)
+{
+ lddsc_shadow(0, buf, ptr);
+}
+
+#define NR_MPX_TEST_FUNCTIONS 6
+
+/*
+ * For compatibility reasons, MPX will clear the bounds registers
+ * when you make function calls (among other things). We have to
+ * preserve the registers in between calls to the "helpers" since
+ * they build on each other.
+ *
+ * Be very careful not to make any function calls inside the
+ * helpers, or anywhere else beween the xrstor and xsave.
+ */
+#define run_helper(helper_nr, buf, buf_shadow, ptr) do { \
+ xrstor_state(xsave_test_buf, flags); \
+ mpx_test_helper##helper_nr(buf, ptr); \
+ xsave_state(xsave_test_buf, flags); \
+ mpx_test_helper##helper_nr##_shadow(buf_shadow, ptr); \
+} while (0)
+
+static void run_helpers(int nr, uint8_t *buf, uint8_t *buf_shadow, uint8_t *ptr)
+{
+ uint64_t flags = 0x18;
+
+ dprint_context(xsave_test_buf);
+ switch (nr) {
+ case 0:
+ run_helper(0, buf, buf_shadow, ptr);
+ break;
+ case 1:
+ run_helper(1, buf, buf_shadow, ptr);
+ break;
+ case 2:
+ run_helper(2, buf, buf_shadow, ptr);
+ break;
+ case 3:
+ run_helper(3, buf, buf_shadow, ptr);
+ break;
+ case 4:
+ run_helper(4, buf, buf_shadow, ptr);
+ break;
+ case 5:
+ run_helper(5, buf, buf_shadow, ptr);
+ break;
+ default:
+ test_failed();
+ break;
+ }
+ dprint_context(xsave_test_buf);
+}
+
+unsigned long buf_shadow[1024]; /* used to check load / store descriptors */
+extern long inspect_me(struct mpx_bounds_dir *bounds_dir);
+
+long cover_buf_with_bt_entries(void *buf, long buf_len)
+{
+ int i;
+ long nr_to_fill;
+ int ratio = 1000;
+ unsigned long buf_len_in_ptrs;
+
+ /* Fill about 1/100 of the space with bt entries */
+ nr_to_fill = buf_len / (sizeof(unsigned long) * ratio);
+
+ if (!nr_to_fill)
+ dprintf3("%s() nr_to_fill: %ld\n", __func__, nr_to_fill);
+
+ /* Align the buffer to pointer size */
+ while (((unsigned long)buf) % sizeof(void *)) {
+ buf++;
+ buf_len--;
+ }
+ /* We are storing pointers, so make */
+ buf_len_in_ptrs = buf_len / sizeof(void *);
+
+ for (i = 0; i < nr_to_fill; i++) {
+ long index = (mpx_random() % buf_len_in_ptrs);
+ void *ptr = buf + index * sizeof(unsigned long);
+ unsigned long ptr_addr = (unsigned long)ptr;
+
+ /* ptr and size can be anything */
+ mpx_make_bound_helper((unsigned long)ptr, 8);
+
+ /*
+ * take bnd0 and put it in to bounds tables "buf + index" is an
+ * address inside the buffer where we are pretending that we
+ * are going to put a pointer We do not, though because we will
+ * never load entries from the table, so it doesn't matter.
+ */
+ mpx_store_dsc_helper(ptr_addr, (unsigned long)ptr);
+ dprintf4("storing bound table entry for %lx (buf start @ %p)\n",
+ ptr_addr, buf);
+ }
+ return nr_to_fill;
+}
+
+unsigned long align_down(unsigned long alignme, unsigned long align_to)
+{
+ return alignme & ~(align_to-1);
+}
+
+unsigned long align_up(unsigned long alignme, unsigned long align_to)
+{
+ return (alignme + align_to - 1) & ~(align_to-1);
+}
+
+/*
+ * Using 1MB alignment guarantees that each no allocation
+ * will overlap with another's bounds tables.
+ *
+ * We have to cook our own allocator here. malloc() can
+ * mix other allocation with ours which means that even
+ * if we free all of our allocations, there might still
+ * be bounds tables for the *areas* since there is other
+ * valid memory there.
+ *
+ * We also can't use malloc() because a free() of an area
+ * might not free it back to the kernel. We want it
+ * completely unmapped an malloc() does not guarantee
+ * that.
+ */
+#ifdef __i386__
+long alignment = 4096;
+long sz_alignment = 4096;
+#else
+long alignment = 1 * MB;
+long sz_alignment = 1 * MB;
+#endif
+void *mpx_mini_alloc(unsigned long sz)
+{
+ unsigned long long tries = 0;
+ static void *last;
+ void *ptr;
+ void *try_at;
+
+ sz = align_up(sz, sz_alignment);
+
+ try_at = last + alignment;
+ while (1) {
+ ptr = mmap(try_at, sz, PROT_READ|PROT_WRITE,
+ MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+ if (ptr == (void *)-1)
+ return NULL;
+ if (ptr == try_at)
+ break;
+
+ munmap(ptr, sz);
+ try_at += alignment;
+#ifdef __i386__
+ /*
+ * This isn't quite correct for 32-bit binaries
+ * on 64-bit kernels since they can use the
+ * entire 32-bit address space, but it's close
+ * enough.
+ */
+ if (try_at > (void *)0xC0000000)
+#else
+ if (try_at > (void *)0x0000800000000000)
+#endif
+ try_at = (void *)0x0;
+ if (!(++tries % 10000))
+ dprintf1("stuck in %s(), tries: %lld\n", __func__, tries);
+ continue;
+ }
+ last = ptr;
+ dprintf3("mpx_mini_alloc(0x%lx) returning: %p\n", sz, ptr);
+ return ptr;
+}
+void mpx_mini_free(void *ptr, long sz)
+{
+ dprintf2("%s() ptr: %p\n", __func__, ptr);
+ if ((unsigned long)ptr > 0x100000000000) {
+ dprintf1("uh oh !!!!!!!!!!!!!!! pointer too high: %p\n", ptr);
+ test_failed();
+ }
+ sz = align_up(sz, sz_alignment);
+ dprintf3("%s() ptr: %p before munmap\n", __func__, ptr);
+ munmap(ptr, sz);
+ dprintf3("%s() ptr: %p DONE\n", __func__, ptr);
+}
+
+#define NR_MALLOCS 100
+struct one_malloc {
+ char *ptr;
+ int nr_filled_btes;
+ unsigned long size;
+};
+struct one_malloc mallocs[NR_MALLOCS];
+
+void free_one_malloc(int index)
+{
+ unsigned long free_ptr;
+ unsigned long mask;
+
+ if (!mallocs[index].ptr)
+ return;
+
+ mpx_mini_free(mallocs[index].ptr, mallocs[index].size);
+ dprintf4("freed[%d]: %p\n", index, mallocs[index].ptr);
+
+ free_ptr = (unsigned long)mallocs[index].ptr;
+ mask = alignment-1;
+ dprintf4("lowerbits: %lx / %lx mask: %lx\n", free_ptr,
+ (free_ptr & mask), mask);
+ assert((free_ptr & mask) == 0);
+
+ mallocs[index].ptr = NULL;
+}
+
+#ifdef __i386__
+#define MPX_BOUNDS_TABLE_COVERS 4096
+#else
+#define MPX_BOUNDS_TABLE_COVERS (1 * MB)
+#endif
+void zap_everything(void)
+{
+ long after_zap;
+ long before_zap;
+ int i;
+
+ before_zap = inspect_me(bounds_dir_ptr);
+ dprintf1("zapping everything start: %ld\n", before_zap);
+ for (i = 0; i < NR_MALLOCS; i++)
+ free_one_malloc(i);
+
+ after_zap = inspect_me(bounds_dir_ptr);
+ dprintf1("zapping everything done: %ld\n", after_zap);
+ /*
+ * We only guarantee to empty the thing out if our allocations are
+ * exactly aligned on the boundaries of a boudns table.
+ */
+ if ((alignment >= MPX_BOUNDS_TABLE_COVERS) &&
+ (sz_alignment >= MPX_BOUNDS_TABLE_COVERS)) {
+ if (after_zap != 0)
+ test_failed();
+
+ assert(after_zap == 0);
+ }
+}
+
+void do_one_malloc(void)
+{
+ static int malloc_counter;
+ long sz;
+ int rand_index = (mpx_random() % NR_MALLOCS);
+ void *ptr = mallocs[rand_index].ptr;
+
+ dprintf3("%s() enter\n", __func__);
+
+ if (ptr) {
+ dprintf3("freeing one malloc at index: %d\n", rand_index);
+ free_one_malloc(rand_index);
+ if (mpx_random() % (NR_MALLOCS*3) == 3) {
+ int i;
+ dprintf3("zapping some more\n");
+ for (i = rand_index; i < NR_MALLOCS; i++)
+ free_one_malloc(i);
+ }
+ if ((mpx_random() % zap_all_every_this_many_mallocs) == 4)
+ zap_everything();
+ }
+
+ /* 1->~1M */
+ sz = (1 + mpx_random() % 1000) * 1000;
+ ptr = mpx_mini_alloc(sz);
+ if (!ptr) {
+ /*
+ * If we are failing allocations, just assume we
+ * are out of memory and zap everything.
+ */
+ dprintf3("zapping everything because out of memory\n");
+ zap_everything();
+ goto out;
+ }
+
+ dprintf3("malloc: %p size: 0x%lx\n", ptr, sz);
+ mallocs[rand_index].nr_filled_btes = cover_buf_with_bt_entries(ptr, sz);
+ mallocs[rand_index].ptr = ptr;
+ mallocs[rand_index].size = sz;
+out:
+ if ((++malloc_counter) % inspect_every_this_many_mallocs == 0)
+ inspect_me(bounds_dir_ptr);
+}
+
+void run_timed_test(void (*test_func)(void))
+{
+ int done = 0;
+ long iteration = 0;
+ static time_t last_print;
+ time_t now;
+ time_t start;
+
+ time(&start);
+ while (!done) {
+ time(&now);
+ if ((now - start) > TEST_DURATION_SECS)
+ done = 1;
+
+ test_func();
+ iteration++;
+
+ if ((now - last_print > 1) || done) {
+ printf("iteration %ld complete, OK so far\n", iteration);
+ last_print = now;
+ }
+ }
+}
+
+void check_bounds_table_frees(void)
+{
+ printf("executing unmaptest\n");
+ inspect_me(bounds_dir_ptr);
+ run_timed_test(&do_one_malloc);
+ printf("done with malloc() fun\n");
+}
+
+void insn_test_failed(int test_nr, int test_round, void *buf,
+ void *buf_shadow, void *ptr)
+{
+ print_context(xsave_test_buf);
+ eprintf("ERROR: test %d round %d failed\n", test_nr, test_round);
+ while (test_nr == 5) {
+ struct mpx_bt_entry *bte;
+ struct mpx_bounds_dir *bd = (void *)bounds_dir_ptr;
+ struct mpx_bd_entry *bde = mpx_vaddr_to_bd_entry(buf, bd);
+
+ printf(" bd: %p\n", bd);
+ printf("&bde: %p\n", bde);
+ printf("*bde: %lx\n", *(unsigned long *)bde);
+ if (!bd_entry_valid(bde))
+ break;
+
+ bte = mpx_vaddr_to_bt_entry(buf, bd);
+ printf(" te: %p\n", bte);
+ printf("bte[0]: %lx\n", bte->contents[0]);
+ printf("bte[1]: %lx\n", bte->contents[1]);
+ printf("bte[2]: %lx\n", bte->contents[2]);
+ printf("bte[3]: %lx\n", bte->contents[3]);
+ break;
+ }
+ test_failed();
+}
+
+void check_mpx_insns_and_tables(void)
+{
+ int successes = 0;
+ int failures = 0;
+ int buf_size = (1024*1024);
+ unsigned long *buf = malloc(buf_size);
+ const int total_nr_tests = NR_MPX_TEST_FUNCTIONS * TEST_ROUNDS;
+ int i, j;
+
+ memset(buf, 0, buf_size);
+ memset(buf_shadow, 0, sizeof(buf_shadow));
+
+ for (i = 0; i < TEST_ROUNDS; i++) {
+ uint8_t *ptr = get_random_addr() + 8;
+
+ for (j = 0; j < NR_MPX_TEST_FUNCTIONS; j++) {
+ if (0 && j != 5) {
+ successes++;
+ continue;
+ }
+ dprintf2("starting test %d round %d\n", j, i);
+ dprint_context(xsave_test_buf);
+ /*
+ * test5 loads an address from the bounds tables.
+ * The load will only complete if 'ptr' matches
+ * the load and the store, so with random addrs,
+ * the odds of this are very small. Make it
+ * higher by only moving 'ptr' 1/10 times.
+ */
+ if (random() % 10 <= 0)
+ ptr = get_random_addr() + 8;
+ dprintf3("random ptr{%p}\n", ptr);
+ dprint_context(xsave_test_buf);
+ run_helpers(j, (void *)buf, (void *)buf_shadow, ptr);
+ dprint_context(xsave_test_buf);
+ if (!compare_context(xsave_test_buf)) {
+ insn_test_failed(j, i, buf, buf_shadow, ptr);
+ failures++;
+ goto exit;
+ }
+ successes++;
+ dprint_context(xsave_test_buf);
+ dprintf2("finished test %d round %d\n", j, i);
+ dprintf3("\n");
+ dprint_context(xsave_test_buf);
+ }
+ }
+
+exit:
+ dprintf2("\nabout to free:\n");
+ free(buf);
+ dprintf1("successes: %d\n", successes);
+ dprintf1(" failures: %d\n", failures);
+ dprintf1(" tests: %d\n", total_nr_tests);
+ dprintf1(" expected: %jd #BRs\n", num_upper_brs + num_lower_brs);
+ dprintf1(" saw: %d #BRs\n", br_count);
+ if (failures) {
+ eprintf("ERROR: non-zero number of failures\n");
+ exit(20);
+ }
+ if (successes != total_nr_tests) {
+ eprintf("ERROR: succeded fewer than number of tries (%d != %d)\n",
+ successes, total_nr_tests);
+ exit(21);
+ }
+ if (num_upper_brs + num_lower_brs != br_count) {
+ eprintf("ERROR: unexpected number of #BRs: %jd %jd %d\n",
+ num_upper_brs, num_lower_brs, br_count);
+ eprintf("successes: %d\n", successes);
+ eprintf(" failures: %d\n", failures);
+ eprintf(" tests: %d\n", total_nr_tests);
+ eprintf(" expected: %jd #BRs\n", num_upper_brs + num_lower_brs);
+ eprintf(" saw: %d #BRs\n", br_count);
+ exit(22);
+ }
+}
+
+/*
+ * This is supposed to SIGSEGV nicely once the kernel
+ * can no longer allocate vaddr space.
+ */
+void exhaust_vaddr_space(void)
+{
+ unsigned long ptr;
+ /* Try to make sure there is no room for a bounds table anywhere */
+ unsigned long skip = MPX_BOUNDS_TABLE_SIZE_BYTES - PAGE_SIZE;
+#ifdef __i386__
+ unsigned long max_vaddr = 0xf7788000UL;
+#else
+ unsigned long max_vaddr = 0x800000000000UL;
+#endif
+
+ dprintf1("%s() start\n", __func__);
+ /* do not start at 0, we aren't allowed to map there */
+ for (ptr = PAGE_SIZE; ptr < max_vaddr; ptr += skip) {
+ void *ptr_ret;
+ int ret = madvise((void *)ptr, PAGE_SIZE, MADV_NORMAL);
+
+ if (!ret) {
+ dprintf1("madvise() %lx ret: %d\n", ptr, ret);
+ continue;
+ }
+ ptr_ret = mmap((void *)ptr, PAGE_SIZE, PROT_READ|PROT_WRITE,
+ MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+ if (ptr_ret != (void *)ptr) {
+ perror("mmap");
+ dprintf1("mmap(%lx) ret: %p\n", ptr, ptr_ret);
+ break;
+ }
+ if (!(ptr & 0xffffff))
+ dprintf1("mmap(%lx) ret: %p\n", ptr, ptr_ret);
+ }
+ for (ptr = PAGE_SIZE; ptr < max_vaddr; ptr += skip) {
+ dprintf2("covering 0x%lx with bounds table entries\n", ptr);
+ cover_buf_with_bt_entries((void *)ptr, PAGE_SIZE);
+ }
+ dprintf1("%s() end\n", __func__);
+ printf("done with vaddr space fun\n");
+}
+
+void mpx_table_test(void)
+{
+ printf("starting mpx bounds table test\n");
+ run_timed_test(check_mpx_insns_and_tables);
+ printf("done with mpx bounds table test\n");
+}
+
+int main(int argc, char **argv)
+{
+ int unmaptest = 0;
+ int vaddrexhaust = 0;
+ int tabletest = 0;
+ int i;
+
+ check_mpx_support();
+ mpx_prepare();
+ srandom(11179);
+
+ bd_incore();
+ init();
+ bd_incore();
+
+ trace_me();
+
+ xsave_state((void *)xsave_test_buf, 0x1f);
+ if (!compare_context(xsave_test_buf))
+ printf("Init failed\n");
+
+ for (i = 1; i < argc; i++) {
+ if (!strcmp(argv[i], "unmaptest"))
+ unmaptest = 1;
+ if (!strcmp(argv[i], "vaddrexhaust"))
+ vaddrexhaust = 1;
+ if (!strcmp(argv[i], "tabletest"))
+ tabletest = 1;
+ }
+ if (!(unmaptest || vaddrexhaust || tabletest)) {
+ unmaptest = 1;
+ /* vaddrexhaust = 1; */
+ tabletest = 1;
+ }
+ if (unmaptest)
+ check_bounds_table_frees();
+ if (tabletest)
+ mpx_table_test();
+ if (vaddrexhaust)
+ exhaust_vaddr_space();
+ printf("%s completed successfully\n", argv[0]);
+ exit(0);
+}
+
+#include "mpx-dig.c"
diff --git a/tools/testing/selftests/x86/mpx-mm.h b/tools/testing/selftests/x86/mpx-mm.h
new file mode 100644
index 000000000..6dbdd66b8
--- /dev/null
+++ b/tools/testing/selftests/x86/mpx-mm.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MPX_MM_H
+#define _MPX_MM_H
+
+#define PAGE_SIZE 4096
+#define MB (1UL<<20)
+
+extern long nr_incore(void *ptr, unsigned long size_bytes);
+
+#endif /* _MPX_MM_H */
diff --git a/tools/testing/selftests/x86/pkey-helpers.h b/tools/testing/selftests/x86/pkey-helpers.h
new file mode 100644
index 000000000..254e5436b
--- /dev/null
+++ b/tools/testing/selftests/x86/pkey-helpers.h
@@ -0,0 +1,219 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _PKEYS_HELPER_H
+#define _PKEYS_HELPER_H
+#define _GNU_SOURCE
+#include <string.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <ucontext.h>
+#include <sys/mman.h>
+
+#define NR_PKEYS 16
+#define PKRU_BITS_PER_PKEY 2
+
+#ifndef DEBUG_LEVEL
+#define DEBUG_LEVEL 0
+#endif
+#define DPRINT_IN_SIGNAL_BUF_SIZE 4096
+extern int dprint_in_signal;
+extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
+static inline void sigsafe_printf(const char *format, ...)
+{
+ va_list ap;
+
+ if (!dprint_in_signal) {
+ va_start(ap, format);
+ vprintf(format, ap);
+ va_end(ap);
+ } else {
+ int ret;
+ /*
+ * No printf() functions are signal-safe.
+ * They deadlock easily. Write the format
+ * string to get some output, even if
+ * incomplete.
+ */
+ ret = write(1, format, strlen(format));
+ if (ret < 0)
+ exit(1);
+ }
+}
+#define dprintf_level(level, args...) do { \
+ if (level <= DEBUG_LEVEL) \
+ sigsafe_printf(args); \
+} while (0)
+#define dprintf0(args...) dprintf_level(0, args)
+#define dprintf1(args...) dprintf_level(1, args)
+#define dprintf2(args...) dprintf_level(2, args)
+#define dprintf3(args...) dprintf_level(3, args)
+#define dprintf4(args...) dprintf_level(4, args)
+
+extern unsigned int shadow_pkru;
+static inline unsigned int __rdpkru(void)
+{
+ unsigned int eax, edx;
+ unsigned int ecx = 0;
+ unsigned int pkru;
+
+ asm volatile(".byte 0x0f,0x01,0xee\n\t"
+ : "=a" (eax), "=d" (edx)
+ : "c" (ecx));
+ pkru = eax;
+ return pkru;
+}
+
+static inline unsigned int _rdpkru(int line)
+{
+ unsigned int pkru = __rdpkru();
+
+ dprintf4("rdpkru(line=%d) pkru: %x shadow: %x\n",
+ line, pkru, shadow_pkru);
+ assert(pkru == shadow_pkru);
+
+ return pkru;
+}
+
+#define rdpkru() _rdpkru(__LINE__)
+
+static inline void __wrpkru(unsigned int pkru)
+{
+ unsigned int eax = pkru;
+ unsigned int ecx = 0;
+ unsigned int edx = 0;
+
+ dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
+ asm volatile(".byte 0x0f,0x01,0xef\n\t"
+ : : "a" (eax), "c" (ecx), "d" (edx));
+ assert(pkru == __rdpkru());
+}
+
+static inline void wrpkru(unsigned int pkru)
+{
+ dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
+ /* will do the shadow check for us: */
+ rdpkru();
+ __wrpkru(pkru);
+ shadow_pkru = pkru;
+ dprintf4("%s(%08x) pkru: %08x\n", __func__, pkru, __rdpkru());
+}
+
+/*
+ * These are technically racy. since something could
+ * change PKRU between the read and the write.
+ */
+static inline void __pkey_access_allow(int pkey, int do_allow)
+{
+ unsigned int pkru = rdpkru();
+ int bit = pkey * 2;
+
+ if (do_allow)
+ pkru &= (1<<bit);
+ else
+ pkru |= (1<<bit);
+
+ dprintf4("pkru now: %08x\n", rdpkru());
+ wrpkru(pkru);
+}
+
+static inline void __pkey_write_allow(int pkey, int do_allow_write)
+{
+ long pkru = rdpkru();
+ int bit = pkey * 2 + 1;
+
+ if (do_allow_write)
+ pkru &= (1<<bit);
+ else
+ pkru |= (1<<bit);
+
+ wrpkru(pkru);
+ dprintf4("pkru now: %08x\n", rdpkru());
+}
+
+#define PROT_PKEY0 0x10 /* protection key value (bit 0) */
+#define PROT_PKEY1 0x20 /* protection key value (bit 1) */
+#define PROT_PKEY2 0x40 /* protection key value (bit 2) */
+#define PROT_PKEY3 0x80 /* protection key value (bit 3) */
+
+#define PAGE_SIZE 4096
+#define MB (1<<20)
+
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ /* ecx is often an input as well as an output. */
+ asm volatile(
+ "cpuid;"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx));
+}
+
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */
+#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */
+#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */
+
+static inline int cpu_has_pku(void)
+{
+ unsigned int eax;
+ unsigned int ebx;
+ unsigned int ecx;
+ unsigned int edx;
+
+ eax = 0x7;
+ ecx = 0x0;
+ __cpuid(&eax, &ebx, &ecx, &edx);
+
+ if (!(ecx & X86_FEATURE_PKU)) {
+ dprintf2("cpu does not have PKU\n");
+ return 0;
+ }
+ if (!(ecx & X86_FEATURE_OSPKE)) {
+ dprintf2("cpu does not have OSPKE\n");
+ return 0;
+ }
+ return 1;
+}
+
+#define XSTATE_PKRU_BIT (9)
+#define XSTATE_PKRU 0x200
+
+int pkru_xstate_offset(void)
+{
+ unsigned int eax;
+ unsigned int ebx;
+ unsigned int ecx;
+ unsigned int edx;
+ int xstate_offset;
+ int xstate_size;
+ unsigned long XSTATE_CPUID = 0xd;
+ int leaf;
+
+ /* assume that XSTATE_PKRU is set in XCR0 */
+ leaf = XSTATE_PKRU_BIT;
+ {
+ eax = XSTATE_CPUID;
+ ecx = leaf;
+ __cpuid(&eax, &ebx, &ecx, &edx);
+
+ if (leaf == XSTATE_PKRU_BIT) {
+ xstate_offset = ebx;
+ xstate_size = eax;
+ }
+ }
+
+ if (xstate_size == 0) {
+ printf("could not find size/offset of PKRU in xsave state\n");
+ return 0;
+ }
+
+ return xstate_offset;
+}
+
+#endif /* _PKEYS_HELPER_H */
diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c
new file mode 100644
index 000000000..27661302a
--- /dev/null
+++ b/tools/testing/selftests/x86/protection_keys.c
@@ -0,0 +1,1508 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Tests x86 Memory Protection Keys (see Documentation/x86/protection-keys.txt)
+ *
+ * There are examples in here of:
+ * * how to set protection keys on memory
+ * * how to set/clear bits in PKRU (the rights register)
+ * * how to handle SEGV_PKRU signals and extract pkey-relevant
+ * information from the siginfo
+ *
+ * Things to add:
+ * make sure KSM and KSM COW breaking works
+ * prefault pages in at malloc, or not
+ * protect MPX bounds tables with protection keys?
+ * make sure VMA splitting/merging is working correctly
+ * OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys
+ * look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel
+ * do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks
+ *
+ * Compile like this:
+ * gcc -o protection_keys -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
+ * gcc -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
+ */
+#define _GNU_SOURCE
+#include <errno.h>
+#include <linux/futex.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/syscall.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <ucontext.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/ptrace.h>
+#include <setjmp.h>
+
+#include "pkey-helpers.h"
+
+int iteration_nr = 1;
+int test_nr;
+
+unsigned int shadow_pkru;
+
+#define HPAGE_SIZE (1UL<<21)
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1))
+#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
+#define ALIGN_PTR_UP(p, ptr_align_to) ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to))
+#define ALIGN_PTR_DOWN(p, ptr_align_to) ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to))
+#define __stringify_1(x...) #x
+#define __stringify(x...) __stringify_1(x)
+
+#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)
+
+int dprint_in_signal;
+char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
+
+extern void abort_hooks(void);
+#define pkey_assert(condition) do { \
+ if (!(condition)) { \
+ dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \
+ __FILE__, __LINE__, \
+ test_nr, iteration_nr); \
+ dprintf0("errno at assert: %d", errno); \
+ abort_hooks(); \
+ exit(__LINE__); \
+ } \
+} while (0)
+
+void cat_into_file(char *str, char *file)
+{
+ int fd = open(file, O_RDWR);
+ int ret;
+
+ dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file);
+ /*
+ * these need to be raw because they are called under
+ * pkey_assert()
+ */
+ if (fd < 0) {
+ fprintf(stderr, "error opening '%s'\n", str);
+ perror("error: ");
+ exit(__LINE__);
+ }
+
+ ret = write(fd, str, strlen(str));
+ if (ret != strlen(str)) {
+ perror("write to file failed");
+ fprintf(stderr, "filename: '%s' str: '%s'\n", file, str);
+ exit(__LINE__);
+ }
+ close(fd);
+}
+
+#if CONTROL_TRACING > 0
+static int warned_tracing;
+int tracing_root_ok(void)
+{
+ if (geteuid() != 0) {
+ if (!warned_tracing)
+ fprintf(stderr, "WARNING: not run as root, "
+ "can not do tracing control\n");
+ warned_tracing = 1;
+ return 0;
+ }
+ return 1;
+}
+#endif
+
+void tracing_on(void)
+{
+#if CONTROL_TRACING > 0
+#define TRACEDIR "/sys/kernel/debug/tracing"
+ char pidstr[32];
+
+ if (!tracing_root_ok())
+ return;
+
+ sprintf(pidstr, "%d", getpid());
+ cat_into_file("0", TRACEDIR "/tracing_on");
+ cat_into_file("\n", TRACEDIR "/trace");
+ if (1) {
+ cat_into_file("function_graph", TRACEDIR "/current_tracer");
+ cat_into_file("1", TRACEDIR "/options/funcgraph-proc");
+ } else {
+ cat_into_file("nop", TRACEDIR "/current_tracer");
+ }
+ cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid");
+ cat_into_file("1", TRACEDIR "/tracing_on");
+ dprintf1("enabled tracing\n");
+#endif
+}
+
+void tracing_off(void)
+{
+#if CONTROL_TRACING > 0
+ if (!tracing_root_ok())
+ return;
+ cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on");
+#endif
+}
+
+void abort_hooks(void)
+{
+ fprintf(stderr, "running %s()...\n", __func__);
+ tracing_off();
+#ifdef SLEEP_ON_ABORT
+ sleep(SLEEP_ON_ABORT);
+#endif
+}
+
+static inline void __page_o_noops(void)
+{
+ /* 8-bytes of instruction * 512 bytes = 1 page */
+ asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr");
+}
+
+/*
+ * This attempts to have roughly a page of instructions followed by a few
+ * instructions that do a write, and another page of instructions. That
+ * way, we are pretty sure that the write is in the second page of
+ * instructions and has at least a page of padding behind it.
+ *
+ * *That* lets us be sure to madvise() away the write instruction, which
+ * will then fault, which makes sure that the fault code handles
+ * execute-only memory properly.
+ */
+__attribute__((__aligned__(PAGE_SIZE)))
+void lots_o_noops_around_write(int *write_to_me)
+{
+ dprintf3("running %s()\n", __func__);
+ __page_o_noops();
+ /* Assume this happens in the second page of instructions: */
+ *write_to_me = __LINE__;
+ /* pad out by another page: */
+ __page_o_noops();
+ dprintf3("%s() done\n", __func__);
+}
+
+/* Define some kernel-like types */
+#define u8 uint8_t
+#define u16 uint16_t
+#define u32 uint32_t
+#define u64 uint64_t
+
+#ifdef __i386__
+
+#ifndef SYS_mprotect_key
+# define SYS_mprotect_key 380
+#endif
+
+#ifndef SYS_pkey_alloc
+# define SYS_pkey_alloc 381
+# define SYS_pkey_free 382
+#endif
+
+#define REG_IP_IDX REG_EIP
+#define si_pkey_offset 0x14
+
+#else
+
+#ifndef SYS_mprotect_key
+# define SYS_mprotect_key 329
+#endif
+
+#ifndef SYS_pkey_alloc
+# define SYS_pkey_alloc 330
+# define SYS_pkey_free 331
+#endif
+
+#define REG_IP_IDX REG_RIP
+#define si_pkey_offset 0x20
+
+#endif
+
+void dump_mem(void *dumpme, int len_bytes)
+{
+ char *c = (void *)dumpme;
+ int i;
+
+ for (i = 0; i < len_bytes; i += sizeof(u64)) {
+ u64 *ptr = (u64 *)(c + i);
+ dprintf1("dump[%03d][@%p]: %016jx\n", i, ptr, *ptr);
+ }
+}
+
+/* Failed address bound checks: */
+#ifndef SEGV_BNDERR
+# define SEGV_BNDERR 3
+#endif
+
+#ifndef SEGV_PKUERR
+# define SEGV_PKUERR 4
+#endif
+
+static char *si_code_str(int si_code)
+{
+ if (si_code == SEGV_MAPERR)
+ return "SEGV_MAPERR";
+ if (si_code == SEGV_ACCERR)
+ return "SEGV_ACCERR";
+ if (si_code == SEGV_BNDERR)
+ return "SEGV_BNDERR";
+ if (si_code == SEGV_PKUERR)
+ return "SEGV_PKUERR";
+ return "UNKNOWN";
+}
+
+int pkru_faults;
+int last_si_pkey = -1;
+void signal_handler(int signum, siginfo_t *si, void *vucontext)
+{
+ ucontext_t *uctxt = vucontext;
+ int trapno;
+ unsigned long ip;
+ char *fpregs;
+ u32 *pkru_ptr;
+ u64 siginfo_pkey;
+ u32 *si_pkey_ptr;
+ int pkru_offset;
+ fpregset_t fpregset;
+
+ dprint_in_signal = 1;
+ dprintf1(">>>>===============SIGSEGV============================\n");
+ dprintf1("%s()::%d, pkru: 0x%x shadow: %x\n", __func__, __LINE__,
+ __rdpkru(), shadow_pkru);
+
+ trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO];
+ ip = uctxt->uc_mcontext.gregs[REG_IP_IDX];
+ fpregset = uctxt->uc_mcontext.fpregs;
+ fpregs = (void *)fpregset;
+
+ dprintf2("%s() trapno: %d ip: 0x%lx info->si_code: %s/%d\n", __func__,
+ trapno, ip, si_code_str(si->si_code), si->si_code);
+#ifdef __i386__
+ /*
+ * 32-bit has some extra padding so that userspace can tell whether
+ * the XSTATE header is present in addition to the "legacy" FPU
+ * state. We just assume that it is here.
+ */
+ fpregs += 0x70;
+#endif
+ pkru_offset = pkru_xstate_offset();
+ pkru_ptr = (void *)(&fpregs[pkru_offset]);
+
+ dprintf1("siginfo: %p\n", si);
+ dprintf1(" fpregs: %p\n", fpregs);
+ /*
+ * If we got a PKRU fault, we *HAVE* to have at least one bit set in
+ * here.
+ */
+ dprintf1("pkru_xstate_offset: %d\n", pkru_xstate_offset());
+ if (DEBUG_LEVEL > 4)
+ dump_mem(pkru_ptr - 128, 256);
+ pkey_assert(*pkru_ptr);
+
+ if ((si->si_code == SEGV_MAPERR) ||
+ (si->si_code == SEGV_ACCERR) ||
+ (si->si_code == SEGV_BNDERR)) {
+ printf("non-PK si_code, exiting...\n");
+ exit(4);
+ }
+
+ si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset);
+ dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr);
+ dump_mem((u8 *)si_pkey_ptr - 8, 24);
+ siginfo_pkey = *si_pkey_ptr;
+ pkey_assert(siginfo_pkey < NR_PKEYS);
+ last_si_pkey = siginfo_pkey;
+
+ dprintf1("signal pkru from xsave: %08x\n", *pkru_ptr);
+ /* need __rdpkru() version so we do not do shadow_pkru checking */
+ dprintf1("signal pkru from pkru: %08x\n", __rdpkru());
+ dprintf1("pkey from siginfo: %jx\n", siginfo_pkey);
+ *(u64 *)pkru_ptr = 0x00000000;
+ dprintf1("WARNING: set PRKU=0 to allow faulting instruction to continue\n");
+ pkru_faults++;
+ dprintf1("<<<<==================================================\n");
+ dprint_in_signal = 0;
+}
+
+int wait_all_children(void)
+{
+ int status;
+ return waitpid(-1, &status, 0);
+}
+
+void sig_chld(int x)
+{
+ dprint_in_signal = 1;
+ dprintf2("[%d] SIGCHLD: %d\n", getpid(), x);
+ dprint_in_signal = 0;
+}
+
+void setup_sigsegv_handler(void)
+{
+ int r, rs;
+ struct sigaction newact;
+ struct sigaction oldact;
+
+ /* #PF is mapped to sigsegv */
+ int signum = SIGSEGV;
+
+ newact.sa_handler = 0;
+ newact.sa_sigaction = signal_handler;
+
+ /*sigset_t - signals to block while in the handler */
+ /* get the old signal mask. */
+ rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask);
+ pkey_assert(rs == 0);
+
+ /* call sa_sigaction, not sa_handler*/
+ newact.sa_flags = SA_SIGINFO;
+
+ newact.sa_restorer = 0; /* void(*)(), obsolete */
+ r = sigaction(signum, &newact, &oldact);
+ r = sigaction(SIGALRM, &newact, &oldact);
+ pkey_assert(r == 0);
+}
+
+void setup_handlers(void)
+{
+ signal(SIGCHLD, &sig_chld);
+ setup_sigsegv_handler();
+}
+
+pid_t fork_lazy_child(void)
+{
+ pid_t forkret;
+
+ forkret = fork();
+ pkey_assert(forkret >= 0);
+ dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
+
+ if (!forkret) {
+ /* in the child */
+ while (1) {
+ dprintf1("child sleeping...\n");
+ sleep(30);
+ }
+ }
+ return forkret;
+}
+
+#ifndef PKEY_DISABLE_ACCESS
+# define PKEY_DISABLE_ACCESS 0x1
+#endif
+
+#ifndef PKEY_DISABLE_WRITE
+# define PKEY_DISABLE_WRITE 0x2
+#endif
+
+static u32 hw_pkey_get(int pkey, unsigned long flags)
+{
+ u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
+ u32 pkru = __rdpkru();
+ u32 shifted_pkru;
+ u32 masked_pkru;
+
+ dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n",
+ __func__, pkey, flags, 0, 0);
+ dprintf2("%s() raw pkru: %x\n", __func__, pkru);
+
+ shifted_pkru = (pkru >> (pkey * PKRU_BITS_PER_PKEY));
+ dprintf2("%s() shifted_pkru: %x\n", __func__, shifted_pkru);
+ masked_pkru = shifted_pkru & mask;
+ dprintf2("%s() masked pkru: %x\n", __func__, masked_pkru);
+ /*
+ * shift down the relevant bits to the lowest two, then
+ * mask off all the other high bits.
+ */
+ return masked_pkru;
+}
+
+static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags)
+{
+ u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
+ u32 old_pkru = __rdpkru();
+ u32 new_pkru;
+
+ /* make sure that 'rights' only contains the bits we expect: */
+ assert(!(rights & ~mask));
+
+ /* copy old pkru */
+ new_pkru = old_pkru;
+ /* mask out bits from pkey in old value: */
+ new_pkru &= ~(mask << (pkey * PKRU_BITS_PER_PKEY));
+ /* OR in new bits for pkey: */
+ new_pkru |= (rights << (pkey * PKRU_BITS_PER_PKEY));
+
+ __wrpkru(new_pkru);
+
+ dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x pkru now: %x old_pkru: %x\n",
+ __func__, pkey, rights, flags, 0, __rdpkru(), old_pkru);
+ return 0;
+}
+
+void pkey_disable_set(int pkey, int flags)
+{
+ unsigned long syscall_flags = 0;
+ int ret;
+ int pkey_rights;
+ u32 orig_pkru = rdpkru();
+
+ dprintf1("START->%s(%d, 0x%x)\n", __func__,
+ pkey, flags);
+ pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
+
+ pkey_rights = hw_pkey_get(pkey, syscall_flags);
+
+ dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+ pkey, pkey, pkey_rights);
+ pkey_assert(pkey_rights >= 0);
+
+ pkey_rights |= flags;
+
+ ret = hw_pkey_set(pkey, pkey_rights, syscall_flags);
+ assert(!ret);
+ /*pkru and flags have the same format */
+ shadow_pkru |= flags << (pkey * 2);
+ dprintf1("%s(%d) shadow: 0x%x\n", __func__, pkey, shadow_pkru);
+
+ pkey_assert(ret >= 0);
+
+ pkey_rights = hw_pkey_get(pkey, syscall_flags);
+ dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+ pkey, pkey, pkey_rights);
+
+ dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru());
+ if (flags)
+ pkey_assert(rdpkru() > orig_pkru);
+ dprintf1("END<---%s(%d, 0x%x)\n", __func__,
+ pkey, flags);
+}
+
+void pkey_disable_clear(int pkey, int flags)
+{
+ unsigned long syscall_flags = 0;
+ int ret;
+ int pkey_rights = hw_pkey_get(pkey, syscall_flags);
+ u32 orig_pkru = rdpkru();
+
+ pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
+
+ dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+ pkey, pkey, pkey_rights);
+ pkey_assert(pkey_rights >= 0);
+
+ pkey_rights |= flags;
+
+ ret = hw_pkey_set(pkey, pkey_rights, 0);
+ /* pkru and flags have the same format */
+ shadow_pkru &= ~(flags << (pkey * 2));
+ pkey_assert(ret >= 0);
+
+ pkey_rights = hw_pkey_get(pkey, syscall_flags);
+ dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+ pkey, pkey, pkey_rights);
+
+ dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru());
+ if (flags)
+ assert(rdpkru() > orig_pkru);
+}
+
+void pkey_write_allow(int pkey)
+{
+ pkey_disable_clear(pkey, PKEY_DISABLE_WRITE);
+}
+void pkey_write_deny(int pkey)
+{
+ pkey_disable_set(pkey, PKEY_DISABLE_WRITE);
+}
+void pkey_access_allow(int pkey)
+{
+ pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS);
+}
+void pkey_access_deny(int pkey)
+{
+ pkey_disable_set(pkey, PKEY_DISABLE_ACCESS);
+}
+
+int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
+ unsigned long pkey)
+{
+ int sret;
+
+ dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__,
+ ptr, size, orig_prot, pkey);
+
+ errno = 0;
+ sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey);
+ if (errno) {
+ dprintf2("SYS_mprotect_key sret: %d\n", sret);
+ dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot);
+ dprintf2("SYS_mprotect_key failed, errno: %d\n", errno);
+ if (DEBUG_LEVEL >= 2)
+ perror("SYS_mprotect_pkey");
+ }
+ return sret;
+}
+
+int sys_pkey_alloc(unsigned long flags, unsigned long init_val)
+{
+ int ret = syscall(SYS_pkey_alloc, flags, init_val);
+ dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n",
+ __func__, flags, init_val, ret, errno);
+ return ret;
+}
+
+int alloc_pkey(void)
+{
+ int ret;
+ unsigned long init_val = 0x0;
+
+ dprintf1("alloc_pkey()::%d, pkru: 0x%x shadow: %x\n",
+ __LINE__, __rdpkru(), shadow_pkru);
+ ret = sys_pkey_alloc(0, init_val);
+ /*
+ * pkey_alloc() sets PKRU, so we need to reflect it in
+ * shadow_pkru:
+ */
+ dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
+ __LINE__, ret, __rdpkru(), shadow_pkru);
+ if (ret) {
+ /* clear both the bits: */
+ shadow_pkru &= ~(0x3 << (ret * 2));
+ dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
+ __LINE__, ret, __rdpkru(), shadow_pkru);
+ /*
+ * move the new state in from init_val
+ * (remember, we cheated and init_val == pkru format)
+ */
+ shadow_pkru |= (init_val << (ret * 2));
+ }
+ dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
+ __LINE__, ret, __rdpkru(), shadow_pkru);
+ dprintf1("alloc_pkey()::%d errno: %d\n", __LINE__, errno);
+ /* for shadow checking: */
+ rdpkru();
+ dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
+ __LINE__, ret, __rdpkru(), shadow_pkru);
+ return ret;
+}
+
+int sys_pkey_free(unsigned long pkey)
+{
+ int ret = syscall(SYS_pkey_free, pkey);
+ dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret);
+ return ret;
+}
+
+/*
+ * I had a bug where pkey bits could be set by mprotect() but
+ * not cleared. This ensures we get lots of random bit sets
+ * and clears on the vma and pte pkey bits.
+ */
+int alloc_random_pkey(void)
+{
+ int max_nr_pkey_allocs;
+ int ret;
+ int i;
+ int alloced_pkeys[NR_PKEYS];
+ int nr_alloced = 0;
+ int random_index;
+ memset(alloced_pkeys, 0, sizeof(alloced_pkeys));
+
+ /* allocate every possible key and make a note of which ones we got */
+ max_nr_pkey_allocs = NR_PKEYS;
+ for (i = 0; i < max_nr_pkey_allocs; i++) {
+ int new_pkey = alloc_pkey();
+ if (new_pkey < 0)
+ break;
+ alloced_pkeys[nr_alloced++] = new_pkey;
+ }
+
+ pkey_assert(nr_alloced > 0);
+ /* select a random one out of the allocated ones */
+ random_index = rand() % nr_alloced;
+ ret = alloced_pkeys[random_index];
+ /* now zero it out so we don't free it next */
+ alloced_pkeys[random_index] = 0;
+
+ /* go through the allocated ones that we did not want and free them */
+ for (i = 0; i < nr_alloced; i++) {
+ int free_ret;
+ if (!alloced_pkeys[i])
+ continue;
+ free_ret = sys_pkey_free(alloced_pkeys[i]);
+ pkey_assert(!free_ret);
+ }
+ dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
+ __LINE__, ret, __rdpkru(), shadow_pkru);
+ return ret;
+}
+
+int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
+ unsigned long pkey)
+{
+ int nr_iterations = random() % 100;
+ int ret;
+
+ while (0) {
+ int rpkey = alloc_random_pkey();
+ ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
+ dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
+ ptr, size, orig_prot, pkey, ret);
+ if (nr_iterations-- < 0)
+ break;
+
+ dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
+ __LINE__, ret, __rdpkru(), shadow_pkru);
+ sys_pkey_free(rpkey);
+ dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
+ __LINE__, ret, __rdpkru(), shadow_pkru);
+ }
+ pkey_assert(pkey < NR_PKEYS);
+
+ ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
+ dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
+ ptr, size, orig_prot, pkey, ret);
+ pkey_assert(!ret);
+ dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
+ __LINE__, ret, __rdpkru(), shadow_pkru);
+ return ret;
+}
+
+struct pkey_malloc_record {
+ void *ptr;
+ long size;
+ int prot;
+};
+struct pkey_malloc_record *pkey_malloc_records;
+struct pkey_malloc_record *pkey_last_malloc_record;
+long nr_pkey_malloc_records;
+void record_pkey_malloc(void *ptr, long size, int prot)
+{
+ long i;
+ struct pkey_malloc_record *rec = NULL;
+
+ for (i = 0; i < nr_pkey_malloc_records; i++) {
+ rec = &pkey_malloc_records[i];
+ /* find a free record */
+ if (rec)
+ break;
+ }
+ if (!rec) {
+ /* every record is full */
+ size_t old_nr_records = nr_pkey_malloc_records;
+ size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1);
+ size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record);
+ dprintf2("new_nr_records: %zd\n", new_nr_records);
+ dprintf2("new_size: %zd\n", new_size);
+ pkey_malloc_records = realloc(pkey_malloc_records, new_size);
+ pkey_assert(pkey_malloc_records != NULL);
+ rec = &pkey_malloc_records[nr_pkey_malloc_records];
+ /*
+ * realloc() does not initialize memory, so zero it from
+ * the first new record all the way to the end.
+ */
+ for (i = 0; i < new_nr_records - old_nr_records; i++)
+ memset(rec + i, 0, sizeof(*rec));
+ }
+ dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n",
+ (int)(rec - pkey_malloc_records), rec, ptr, size);
+ rec->ptr = ptr;
+ rec->size = size;
+ rec->prot = prot;
+ pkey_last_malloc_record = rec;
+ nr_pkey_malloc_records++;
+}
+
+void free_pkey_malloc(void *ptr)
+{
+ long i;
+ int ret;
+ dprintf3("%s(%p)\n", __func__, ptr);
+ for (i = 0; i < nr_pkey_malloc_records; i++) {
+ struct pkey_malloc_record *rec = &pkey_malloc_records[i];
+ dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n",
+ ptr, i, rec, rec->ptr, rec->size);
+ if ((ptr < rec->ptr) ||
+ (ptr >= rec->ptr + rec->size))
+ continue;
+
+ dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n",
+ ptr, i, rec, rec->ptr, rec->size);
+ nr_pkey_malloc_records--;
+ ret = munmap(rec->ptr, rec->size);
+ dprintf3("munmap ret: %d\n", ret);
+ pkey_assert(!ret);
+ dprintf3("clearing rec->ptr, rec: %p\n", rec);
+ rec->ptr = NULL;
+ dprintf3("done clearing rec->ptr, rec: %p\n", rec);
+ return;
+ }
+ pkey_assert(false);
+}
+
+
+void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey)
+{
+ void *ptr;
+ int ret;
+
+ rdpkru();
+ dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+ size, prot, pkey);
+ pkey_assert(pkey < NR_PKEYS);
+ ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+ pkey_assert(ptr != (void *)-1);
+ ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
+ pkey_assert(!ret);
+ record_pkey_malloc(ptr, size, prot);
+ rdpkru();
+
+ dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
+ return ptr;
+}
+
+void *malloc_pkey_anon_huge(long size, int prot, u16 pkey)
+{
+ int ret;
+ void *ptr;
+
+ dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+ size, prot, pkey);
+ /*
+ * Guarantee we can fit at least one huge page in the resulting
+ * allocation by allocating space for 2:
+ */
+ size = ALIGN_UP(size, HPAGE_SIZE * 2);
+ ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+ pkey_assert(ptr != (void *)-1);
+ record_pkey_malloc(ptr, size, prot);
+ mprotect_pkey(ptr, size, prot, pkey);
+
+ dprintf1("unaligned ptr: %p\n", ptr);
+ ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE);
+ dprintf1(" aligned ptr: %p\n", ptr);
+ ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE);
+ dprintf1("MADV_HUGEPAGE ret: %d\n", ret);
+ ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED);
+ dprintf1("MADV_WILLNEED ret: %d\n", ret);
+ memset(ptr, 0, HPAGE_SIZE);
+
+ dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr);
+ return ptr;
+}
+
+int hugetlb_setup_ok;
+#define GET_NR_HUGE_PAGES 10
+void setup_hugetlbfs(void)
+{
+ int err;
+ int fd;
+ char buf[] = "123";
+
+ if (geteuid() != 0) {
+ fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n");
+ return;
+ }
+
+ cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages");
+
+ /*
+ * Now go make sure that we got the pages and that they
+ * are 2M pages. Someone might have made 1G the default.
+ */
+ fd = open("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages", O_RDONLY);
+ if (fd < 0) {
+ perror("opening sysfs 2M hugetlb config");
+ return;
+ }
+
+ /* -1 to guarantee leaving the trailing \0 */
+ err = read(fd, buf, sizeof(buf)-1);
+ close(fd);
+ if (err <= 0) {
+ perror("reading sysfs 2M hugetlb config");
+ return;
+ }
+
+ if (atoi(buf) != GET_NR_HUGE_PAGES) {
+ fprintf(stderr, "could not confirm 2M pages, got: '%s' expected %d\n",
+ buf, GET_NR_HUGE_PAGES);
+ return;
+ }
+
+ hugetlb_setup_ok = 1;
+}
+
+void *malloc_pkey_hugetlb(long size, int prot, u16 pkey)
+{
+ void *ptr;
+ int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB;
+
+ if (!hugetlb_setup_ok)
+ return PTR_ERR_ENOTSUP;
+
+ dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey);
+ size = ALIGN_UP(size, HPAGE_SIZE * 2);
+ pkey_assert(pkey < NR_PKEYS);
+ ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
+ pkey_assert(ptr != (void *)-1);
+ mprotect_pkey(ptr, size, prot, pkey);
+
+ record_pkey_malloc(ptr, size, prot);
+
+ dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr);
+ return ptr;
+}
+
+void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey)
+{
+ void *ptr;
+ int fd;
+
+ dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+ size, prot, pkey);
+ pkey_assert(pkey < NR_PKEYS);
+ fd = open("/dax/foo", O_RDWR);
+ pkey_assert(fd >= 0);
+
+ ptr = mmap(0, size, prot, MAP_SHARED, fd, 0);
+ pkey_assert(ptr != (void *)-1);
+
+ mprotect_pkey(ptr, size, prot, pkey);
+
+ record_pkey_malloc(ptr, size, prot);
+
+ dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr);
+ close(fd);
+ return ptr;
+}
+
+void *(*pkey_malloc[])(long size, int prot, u16 pkey) = {
+
+ malloc_pkey_with_mprotect,
+ malloc_pkey_anon_huge,
+ malloc_pkey_hugetlb
+/* can not do direct with the pkey_mprotect() API:
+ malloc_pkey_mmap_direct,
+ malloc_pkey_mmap_dax,
+*/
+};
+
+void *malloc_pkey(long size, int prot, u16 pkey)
+{
+ void *ret;
+ static int malloc_type;
+ int nr_malloc_types = ARRAY_SIZE(pkey_malloc);
+
+ pkey_assert(pkey < NR_PKEYS);
+
+ while (1) {
+ pkey_assert(malloc_type < nr_malloc_types);
+
+ ret = pkey_malloc[malloc_type](size, prot, pkey);
+ pkey_assert(ret != (void *)-1);
+
+ malloc_type++;
+ if (malloc_type >= nr_malloc_types)
+ malloc_type = (random()%nr_malloc_types);
+
+ /* try again if the malloc_type we tried is unsupported */
+ if (ret == PTR_ERR_ENOTSUP)
+ continue;
+
+ break;
+ }
+
+ dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__,
+ size, prot, pkey, ret);
+ return ret;
+}
+
+int last_pkru_faults;
+#define UNKNOWN_PKEY -2
+void expected_pk_fault(int pkey)
+{
+ dprintf2("%s(): last_pkru_faults: %d pkru_faults: %d\n",
+ __func__, last_pkru_faults, pkru_faults);
+ dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey);
+ pkey_assert(last_pkru_faults + 1 == pkru_faults);
+
+ /*
+ * For exec-only memory, we do not know the pkey in
+ * advance, so skip this check.
+ */
+ if (pkey != UNKNOWN_PKEY)
+ pkey_assert(last_si_pkey == pkey);
+
+ /*
+ * The signal handler shold have cleared out PKRU to let the
+ * test program continue. We now have to restore it.
+ */
+ if (__rdpkru() != 0)
+ pkey_assert(0);
+
+ __wrpkru(shadow_pkru);
+ dprintf1("%s() set PKRU=%x to restore state after signal nuked it\n",
+ __func__, shadow_pkru);
+ last_pkru_faults = pkru_faults;
+ last_si_pkey = -1;
+}
+
+#define do_not_expect_pk_fault(msg) do { \
+ if (last_pkru_faults != pkru_faults) \
+ dprintf0("unexpected PK fault: %s\n", msg); \
+ pkey_assert(last_pkru_faults == pkru_faults); \
+} while (0)
+
+int test_fds[10] = { -1 };
+int nr_test_fds;
+void __save_test_fd(int fd)
+{
+ pkey_assert(fd >= 0);
+ pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds));
+ test_fds[nr_test_fds] = fd;
+ nr_test_fds++;
+}
+
+int get_test_read_fd(void)
+{
+ int test_fd = open("/etc/passwd", O_RDONLY);
+ __save_test_fd(test_fd);
+ return test_fd;
+}
+
+void close_test_fds(void)
+{
+ int i;
+
+ for (i = 0; i < nr_test_fds; i++) {
+ if (test_fds[i] < 0)
+ continue;
+ close(test_fds[i]);
+ test_fds[i] = -1;
+ }
+ nr_test_fds = 0;
+}
+
+#define barrier() __asm__ __volatile__("": : :"memory")
+__attribute__((noinline)) int read_ptr(int *ptr)
+{
+ /*
+ * Keep GCC from optimizing this away somehow
+ */
+ barrier();
+ return *ptr;
+}
+
+void test_read_of_write_disabled_region(int *ptr, u16 pkey)
+{
+ int ptr_contents;
+
+ dprintf1("disabling write access to PKEY[1], doing read\n");
+ pkey_write_deny(pkey);
+ ptr_contents = read_ptr(ptr);
+ dprintf1("*ptr: %d\n", ptr_contents);
+ dprintf1("\n");
+}
+void test_read_of_access_disabled_region(int *ptr, u16 pkey)
+{
+ int ptr_contents;
+
+ dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr);
+ rdpkru();
+ pkey_access_deny(pkey);
+ ptr_contents = read_ptr(ptr);
+ dprintf1("*ptr: %d\n", ptr_contents);
+ expected_pk_fault(pkey);
+}
+void test_write_of_write_disabled_region(int *ptr, u16 pkey)
+{
+ dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey);
+ pkey_write_deny(pkey);
+ *ptr = __LINE__;
+ expected_pk_fault(pkey);
+}
+void test_write_of_access_disabled_region(int *ptr, u16 pkey)
+{
+ dprintf1("disabling access to PKEY[%02d], doing write\n", pkey);
+ pkey_access_deny(pkey);
+ *ptr = __LINE__;
+ expected_pk_fault(pkey);
+}
+void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey)
+{
+ int ret;
+ int test_fd = get_test_read_fd();
+
+ dprintf1("disabling access to PKEY[%02d], "
+ "having kernel read() to buffer\n", pkey);
+ pkey_access_deny(pkey);
+ ret = read(test_fd, ptr, 1);
+ dprintf1("read ret: %d\n", ret);
+ pkey_assert(ret);
+}
+void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey)
+{
+ int ret;
+ int test_fd = get_test_read_fd();
+
+ pkey_write_deny(pkey);
+ ret = read(test_fd, ptr, 100);
+ dprintf1("read ret: %d\n", ret);
+ if (ret < 0 && (DEBUG_LEVEL > 0))
+ perror("verbose read result (OK for this to be bad)");
+ pkey_assert(ret);
+}
+
+void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey)
+{
+ int pipe_ret, vmsplice_ret;
+ struct iovec iov;
+ int pipe_fds[2];
+
+ pipe_ret = pipe(pipe_fds);
+
+ pkey_assert(pipe_ret == 0);
+ dprintf1("disabling access to PKEY[%02d], "
+ "having kernel vmsplice from buffer\n", pkey);
+ pkey_access_deny(pkey);
+ iov.iov_base = ptr;
+ iov.iov_len = PAGE_SIZE;
+ vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT);
+ dprintf1("vmsplice() ret: %d\n", vmsplice_ret);
+ pkey_assert(vmsplice_ret == -1);
+
+ close(pipe_fds[0]);
+ close(pipe_fds[1]);
+}
+
+void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey)
+{
+ int ignored = 0xdada;
+ int futex_ret;
+ int some_int = __LINE__;
+
+ dprintf1("disabling write to PKEY[%02d], "
+ "doing futex gunk in buffer\n", pkey);
+ *ptr = some_int;
+ pkey_write_deny(pkey);
+ futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL,
+ &ignored, ignored);
+ if (DEBUG_LEVEL > 0)
+ perror("futex");
+ dprintf1("futex() ret: %d\n", futex_ret);
+}
+
+/* Assumes that all pkeys other than 'pkey' are unallocated */
+void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey)
+{
+ int err;
+ int i;
+
+ /* Note: 0 is the default pkey, so don't mess with it */
+ for (i = 1; i < NR_PKEYS; i++) {
+ if (pkey == i)
+ continue;
+
+ dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i);
+ err = sys_pkey_free(i);
+ pkey_assert(err);
+
+ err = sys_pkey_free(i);
+ pkey_assert(err);
+
+ err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i);
+ pkey_assert(err);
+ }
+}
+
+/* Assumes that all pkeys other than 'pkey' are unallocated */
+void test_pkey_syscalls_bad_args(int *ptr, u16 pkey)
+{
+ int err;
+ int bad_pkey = NR_PKEYS+99;
+
+ /* pass a known-invalid pkey in: */
+ err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey);
+ pkey_assert(err);
+}
+
+void become_child(void)
+{
+ pid_t forkret;
+
+ forkret = fork();
+ pkey_assert(forkret >= 0);
+ dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
+
+ if (!forkret) {
+ /* in the child */
+ return;
+ }
+ exit(0);
+}
+
+/* Assumes that all pkeys other than 'pkey' are unallocated */
+void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
+{
+ int err;
+ int allocated_pkeys[NR_PKEYS] = {0};
+ int nr_allocated_pkeys = 0;
+ int i;
+
+ for (i = 0; i < NR_PKEYS*3; i++) {
+ int new_pkey;
+ dprintf1("%s() alloc loop: %d\n", __func__, i);
+ new_pkey = alloc_pkey();
+ dprintf4("%s()::%d, err: %d pkru: 0x%x shadow: 0x%x\n", __func__,
+ __LINE__, err, __rdpkru(), shadow_pkru);
+ rdpkru(); /* for shadow checking */
+ dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC);
+ if ((new_pkey == -1) && (errno == ENOSPC)) {
+ dprintf2("%s() failed to allocate pkey after %d tries\n",
+ __func__, nr_allocated_pkeys);
+ } else {
+ /*
+ * Ensure the number of successes never
+ * exceeds the number of keys supported
+ * in the hardware.
+ */
+ pkey_assert(nr_allocated_pkeys < NR_PKEYS);
+ allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
+ }
+
+ /*
+ * Make sure that allocation state is properly
+ * preserved across fork().
+ */
+ if (i == NR_PKEYS*2)
+ become_child();
+ }
+
+ dprintf3("%s()::%d\n", __func__, __LINE__);
+
+ /*
+ * There are 16 pkeys supported in hardware. Three are
+ * allocated by the time we get here:
+ * 1. The default key (0)
+ * 2. One possibly consumed by an execute-only mapping.
+ * 3. One allocated by the test code and passed in via
+ * 'pkey' to this function.
+ * Ensure that we can allocate at least another 13 (16-3).
+ */
+ pkey_assert(i >= NR_PKEYS-3);
+
+ for (i = 0; i < nr_allocated_pkeys; i++) {
+ err = sys_pkey_free(allocated_pkeys[i]);
+ pkey_assert(!err);
+ rdpkru(); /* for shadow checking */
+ }
+}
+
+/*
+ * pkey 0 is special. It is allocated by default, so you do not
+ * have to call pkey_alloc() to use it first. Make sure that it
+ * is usable.
+ */
+void test_mprotect_with_pkey_0(int *ptr, u16 pkey)
+{
+ long size;
+ int prot;
+
+ assert(pkey_last_malloc_record);
+ size = pkey_last_malloc_record->size;
+ /*
+ * This is a bit of a hack. But mprotect() requires
+ * huge-page-aligned sizes when operating on hugetlbfs.
+ * So, make sure that we use something that's a multiple
+ * of a huge page when we can.
+ */
+ if (size >= HPAGE_SIZE)
+ size = HPAGE_SIZE;
+ prot = pkey_last_malloc_record->prot;
+
+ /* Use pkey 0 */
+ mprotect_pkey(ptr, size, prot, 0);
+
+ /* Make sure that we can set it back to the original pkey. */
+ mprotect_pkey(ptr, size, prot, pkey);
+}
+
+void test_ptrace_of_child(int *ptr, u16 pkey)
+{
+ __attribute__((__unused__)) int peek_result;
+ pid_t child_pid;
+ void *ignored = 0;
+ long ret;
+ int status;
+ /*
+ * This is the "control" for our little expermient. Make sure
+ * we can always access it when ptracing.
+ */
+ int *plain_ptr_unaligned = malloc(HPAGE_SIZE);
+ int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE);
+
+ /*
+ * Fork a child which is an exact copy of this process, of course.
+ * That means we can do all of our tests via ptrace() and then plain
+ * memory access and ensure they work differently.
+ */
+ child_pid = fork_lazy_child();
+ dprintf1("[%d] child pid: %d\n", getpid(), child_pid);
+
+ ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored);
+ if (ret)
+ perror("attach");
+ dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__);
+ pkey_assert(ret != -1);
+ ret = waitpid(child_pid, &status, WUNTRACED);
+ if ((ret != child_pid) || !(WIFSTOPPED(status))) {
+ fprintf(stderr, "weird waitpid result %ld stat %x\n",
+ ret, status);
+ pkey_assert(0);
+ }
+ dprintf2("waitpid ret: %ld\n", ret);
+ dprintf2("waitpid status: %d\n", status);
+
+ pkey_access_deny(pkey);
+ pkey_write_deny(pkey);
+
+ /* Write access, untested for now:
+ ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data);
+ pkey_assert(ret != -1);
+ dprintf1("poke at %p: %ld\n", peek_at, ret);
+ */
+
+ /*
+ * Try to access the pkey-protected "ptr" via ptrace:
+ */
+ ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored);
+ /* expect it to work, without an error: */
+ pkey_assert(ret != -1);
+ /* Now access from the current task, and expect an exception: */
+ peek_result = read_ptr(ptr);
+ expected_pk_fault(pkey);
+
+ /*
+ * Try to access the NON-pkey-protected "plain_ptr" via ptrace:
+ */
+ ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored);
+ /* expect it to work, without an error: */
+ pkey_assert(ret != -1);
+ /* Now access from the current task, and expect NO exception: */
+ peek_result = read_ptr(plain_ptr);
+ do_not_expect_pk_fault("read plain pointer after ptrace");
+
+ ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0);
+ pkey_assert(ret != -1);
+
+ ret = kill(child_pid, SIGKILL);
+ pkey_assert(ret != -1);
+
+ wait(&status);
+
+ free(plain_ptr_unaligned);
+}
+
+void *get_pointer_to_instructions(void)
+{
+ void *p1;
+
+ p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE);
+ dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write);
+ /* lots_o_noops_around_write should be page-aligned already */
+ assert(p1 == &lots_o_noops_around_write);
+
+ /* Point 'p1' at the *second* page of the function: */
+ p1 += PAGE_SIZE;
+
+ /*
+ * Try to ensure we fault this in on next touch to ensure
+ * we get an instruction fault as opposed to a data one
+ */
+ madvise(p1, PAGE_SIZE, MADV_DONTNEED);
+
+ return p1;
+}
+
+void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
+{
+ void *p1;
+ int scratch;
+ int ptr_contents;
+ int ret;
+
+ p1 = get_pointer_to_instructions();
+ lots_o_noops_around_write(&scratch);
+ ptr_contents = read_ptr(p1);
+ dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
+
+ ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey);
+ pkey_assert(!ret);
+ pkey_access_deny(pkey);
+
+ dprintf2("pkru: %x\n", rdpkru());
+
+ /*
+ * Make sure this is an *instruction* fault
+ */
+ madvise(p1, PAGE_SIZE, MADV_DONTNEED);
+ lots_o_noops_around_write(&scratch);
+ do_not_expect_pk_fault("executing on PROT_EXEC memory");
+ ptr_contents = read_ptr(p1);
+ dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
+ expected_pk_fault(pkey);
+}
+
+void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey)
+{
+ void *p1;
+ int scratch;
+ int ptr_contents;
+ int ret;
+
+ dprintf1("%s() start\n", __func__);
+
+ p1 = get_pointer_to_instructions();
+ lots_o_noops_around_write(&scratch);
+ ptr_contents = read_ptr(p1);
+ dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
+
+ /* Use a *normal* mprotect(), not mprotect_pkey(): */
+ ret = mprotect(p1, PAGE_SIZE, PROT_EXEC);
+ pkey_assert(!ret);
+
+ dprintf2("pkru: %x\n", rdpkru());
+
+ /* Make sure this is an *instruction* fault */
+ madvise(p1, PAGE_SIZE, MADV_DONTNEED);
+ lots_o_noops_around_write(&scratch);
+ do_not_expect_pk_fault("executing on PROT_EXEC memory");
+ ptr_contents = read_ptr(p1);
+ dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
+ expected_pk_fault(UNKNOWN_PKEY);
+
+ /*
+ * Put the memory back to non-PROT_EXEC. Should clear the
+ * exec-only pkey off the VMA and allow it to be readable
+ * again. Go to PROT_NONE first to check for a kernel bug
+ * that did not clear the pkey when doing PROT_NONE.
+ */
+ ret = mprotect(p1, PAGE_SIZE, PROT_NONE);
+ pkey_assert(!ret);
+
+ ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC);
+ pkey_assert(!ret);
+ ptr_contents = read_ptr(p1);
+ do_not_expect_pk_fault("plain read on recently PROT_EXEC area");
+}
+
+void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey)
+{
+ int size = PAGE_SIZE;
+ int sret;
+
+ if (cpu_has_pku()) {
+ dprintf1("SKIP: %s: no CPU support\n", __func__);
+ return;
+ }
+
+ sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey);
+ pkey_assert(sret < 0);
+}
+
+void (*pkey_tests[])(int *ptr, u16 pkey) = {
+ test_read_of_write_disabled_region,
+ test_read_of_access_disabled_region,
+ test_write_of_write_disabled_region,
+ test_write_of_access_disabled_region,
+ test_kernel_write_of_access_disabled_region,
+ test_kernel_write_of_write_disabled_region,
+ test_kernel_gup_of_access_disabled_region,
+ test_kernel_gup_write_to_write_disabled_region,
+ test_executing_on_unreadable_memory,
+ test_implicit_mprotect_exec_only_memory,
+ test_mprotect_with_pkey_0,
+ test_ptrace_of_child,
+ test_pkey_syscalls_on_non_allocated_pkey,
+ test_pkey_syscalls_bad_args,
+ test_pkey_alloc_exhaust,
+};
+
+void run_tests_once(void)
+{
+ int *ptr;
+ int prot = PROT_READ|PROT_WRITE;
+
+ for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) {
+ int pkey;
+ int orig_pkru_faults = pkru_faults;
+
+ dprintf1("======================\n");
+ dprintf1("test %d preparing...\n", test_nr);
+
+ tracing_on();
+ pkey = alloc_random_pkey();
+ dprintf1("test %d starting with pkey: %d\n", test_nr, pkey);
+ ptr = malloc_pkey(PAGE_SIZE, prot, pkey);
+ dprintf1("test %d starting...\n", test_nr);
+ pkey_tests[test_nr](ptr, pkey);
+ dprintf1("freeing test memory: %p\n", ptr);
+ free_pkey_malloc(ptr);
+ sys_pkey_free(pkey);
+
+ dprintf1("pkru_faults: %d\n", pkru_faults);
+ dprintf1("orig_pkru_faults: %d\n", orig_pkru_faults);
+
+ tracing_off();
+ close_test_fds();
+
+ printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr);
+ dprintf1("======================\n\n");
+ }
+ iteration_nr++;
+}
+
+void pkey_setup_shadow(void)
+{
+ shadow_pkru = __rdpkru();
+}
+
+int main(void)
+{
+ int nr_iterations = 22;
+
+ srand((unsigned int)time(NULL));
+
+ setup_handlers();
+
+ printf("has pku: %d\n", cpu_has_pku());
+
+ if (!cpu_has_pku()) {
+ int size = PAGE_SIZE;
+ int *ptr;
+
+ printf("running PKEY tests for unsupported CPU/OS\n");
+
+ ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+ assert(ptr != (void *)-1);
+ test_mprotect_pkey_on_unsupported_cpu(ptr, 1);
+ exit(0);
+ }
+
+ pkey_setup_shadow();
+ printf("startup pkru: %x\n", rdpkru());
+ setup_hugetlbfs();
+
+ while (nr_iterations-- > 0)
+ run_tests_once();
+
+ printf("done (all tests OK)\n");
+ return 0;
+}
diff --git a/tools/testing/selftests/x86/ptrace_syscall.c b/tools/testing/selftests/x86/ptrace_syscall.c
new file mode 100644
index 000000000..12aaa0631
--- /dev/null
+++ b/tools/testing/selftests/x86/ptrace_syscall.c
@@ -0,0 +1,430 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/syscall.h>
+#include <sys/user.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <err.h>
+#include <string.h>
+#include <asm/ptrace-abi.h>
+#include <sys/auxv.h>
+
+/* Bitness-agnostic defines for user_regs_struct fields. */
+#ifdef __x86_64__
+# define user_syscall_nr orig_rax
+# define user_arg0 rdi
+# define user_arg1 rsi
+# define user_arg2 rdx
+# define user_arg3 r10
+# define user_arg4 r8
+# define user_arg5 r9
+# define user_ip rip
+# define user_ax rax
+#else
+# define user_syscall_nr orig_eax
+# define user_arg0 ebx
+# define user_arg1 ecx
+# define user_arg2 edx
+# define user_arg3 esi
+# define user_arg4 edi
+# define user_arg5 ebp
+# define user_ip eip
+# define user_ax eax
+#endif
+
+static int nerrs = 0;
+
+struct syscall_args32 {
+ uint32_t nr, arg0, arg1, arg2, arg3, arg4, arg5;
+};
+
+#ifdef __i386__
+extern void sys32_helper(struct syscall_args32 *, void *);
+extern void int80_and_ret(void);
+#endif
+
+/*
+ * Helper to invoke int80 with controlled regs and capture the final regs.
+ */
+static void do_full_int80(struct syscall_args32 *args)
+{
+#ifdef __x86_64__
+ register unsigned long bp asm("bp") = args->arg5;
+ asm volatile ("int $0x80"
+ : "+a" (args->nr),
+ "+b" (args->arg0), "+c" (args->arg1), "+d" (args->arg2),
+ "+S" (args->arg3), "+D" (args->arg4), "+r" (bp)
+ : : "r8", "r9", "r10", "r11");
+ args->arg5 = bp;
+#else
+ sys32_helper(args, int80_and_ret);
+#endif
+}
+
+#ifdef __i386__
+static void (*vsyscall32)(void);
+
+/*
+ * Nasty helper to invoke AT_SYSINFO (i.e. __kernel_vsyscall) with
+ * controlled regs and capture the final regs. This is so nasty that it
+ * crashes my copy of gdb :)
+ */
+static void do_full_vsyscall32(struct syscall_args32 *args)
+{
+ sys32_helper(args, vsyscall32);
+}
+#endif
+
+static siginfo_t wait_trap(pid_t chld)
+{
+ siginfo_t si;
+ if (waitid(P_PID, chld, &si, WEXITED|WSTOPPED) != 0)
+ err(1, "waitid");
+ if (si.si_pid != chld)
+ errx(1, "got unexpected pid in event\n");
+ if (si.si_code != CLD_TRAPPED)
+ errx(1, "got unexpected event type %d\n", si.si_code);
+ return si;
+}
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+ int flags)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = handler;
+ sa.sa_flags = SA_SIGINFO | flags;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+}
+
+static void setsigign(int sig, int flags)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = (void *)SIG_IGN;
+ sa.sa_flags = flags;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+}
+
+static void clearhandler(int sig)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_handler = SIG_DFL;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+}
+
+#ifdef __x86_64__
+# define REG_BP REG_RBP
+#else
+# define REG_BP REG_EBP
+#endif
+
+static void empty_handler(int sig, siginfo_t *si, void *ctx_void)
+{
+}
+
+static void test_sys32_regs(void (*do_syscall)(struct syscall_args32 *))
+{
+ struct syscall_args32 args = {
+ .nr = 224, /* gettid */
+ .arg0 = 10, .arg1 = 11, .arg2 = 12,
+ .arg3 = 13, .arg4 = 14, .arg5 = 15,
+ };
+
+ do_syscall(&args);
+
+ if (args.nr != getpid() ||
+ args.arg0 != 10 || args.arg1 != 11 || args.arg2 != 12 ||
+ args.arg3 != 13 || args.arg4 != 14 || args.arg5 != 15) {
+ printf("[FAIL]\tgetpid() failed to preserve regs\n");
+ nerrs++;
+ } else {
+ printf("[OK]\tgetpid() preserves regs\n");
+ }
+
+ sethandler(SIGUSR1, empty_handler, 0);
+
+ args.nr = 37; /* kill */
+ args.arg0 = getpid();
+ args.arg1 = SIGUSR1;
+ do_syscall(&args);
+ if (args.nr != 0 ||
+ args.arg0 != getpid() || args.arg1 != SIGUSR1 || args.arg2 != 12 ||
+ args.arg3 != 13 || args.arg4 != 14 || args.arg5 != 15) {
+ printf("[FAIL]\tkill(getpid(), SIGUSR1) failed to preserve regs\n");
+ nerrs++;
+ } else {
+ printf("[OK]\tkill(getpid(), SIGUSR1) preserves regs\n");
+ }
+ clearhandler(SIGUSR1);
+}
+
+static void test_ptrace_syscall_restart(void)
+{
+ printf("[RUN]\tptrace-induced syscall restart\n");
+ pid_t chld = fork();
+ if (chld < 0)
+ err(1, "fork");
+
+ if (chld == 0) {
+ if (ptrace(PTRACE_TRACEME, 0, 0, 0) != 0)
+ err(1, "PTRACE_TRACEME");
+
+ pid_t pid = getpid(), tid = syscall(SYS_gettid);
+
+ printf("\tChild will make one syscall\n");
+ syscall(SYS_tgkill, pid, tid, SIGSTOP);
+
+ syscall(SYS_gettid, 10, 11, 12, 13, 14, 15);
+ _exit(0);
+ }
+
+ int status;
+
+ /* Wait for SIGSTOP. */
+ if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status))
+ err(1, "waitpid");
+
+ struct user_regs_struct regs;
+
+ printf("[RUN]\tSYSEMU\n");
+ if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0)
+ err(1, "PTRACE_SYSEMU");
+ wait_trap(chld);
+
+ if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
+ err(1, "PTRACE_GETREGS");
+
+ if (regs.user_syscall_nr != SYS_gettid ||
+ regs.user_arg0 != 10 || regs.user_arg1 != 11 ||
+ regs.user_arg2 != 12 || regs.user_arg3 != 13 ||
+ regs.user_arg4 != 14 || regs.user_arg5 != 15) {
+ printf("[FAIL]\tInitial args are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5);
+ nerrs++;
+ } else {
+ printf("[OK]\tInitial nr and args are correct\n");
+ }
+
+ printf("[RUN]\tRestart the syscall (ip = 0x%lx)\n",
+ (unsigned long)regs.user_ip);
+
+ /*
+ * This does exactly what it appears to do if syscall is int80 or
+ * SYSCALL64. For SYSCALL32 or SYSENTER, though, this is highly
+ * magical. It needs to work so that ptrace and syscall restart
+ * work as expected.
+ */
+ regs.user_ax = regs.user_syscall_nr;
+ regs.user_ip -= 2;
+ if (ptrace(PTRACE_SETREGS, chld, 0, &regs) != 0)
+ err(1, "PTRACE_SETREGS");
+
+ if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0)
+ err(1, "PTRACE_SYSEMU");
+ wait_trap(chld);
+
+ if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
+ err(1, "PTRACE_GETREGS");
+
+ if (regs.user_syscall_nr != SYS_gettid ||
+ regs.user_arg0 != 10 || regs.user_arg1 != 11 ||
+ regs.user_arg2 != 12 || regs.user_arg3 != 13 ||
+ regs.user_arg4 != 14 || regs.user_arg5 != 15) {
+ printf("[FAIL]\tRestart nr or args are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5);
+ nerrs++;
+ } else {
+ printf("[OK]\tRestarted nr and args are correct\n");
+ }
+
+ printf("[RUN]\tChange nr and args and restart the syscall (ip = 0x%lx)\n",
+ (unsigned long)regs.user_ip);
+
+ regs.user_ax = SYS_getpid;
+ regs.user_arg0 = 20;
+ regs.user_arg1 = 21;
+ regs.user_arg2 = 22;
+ regs.user_arg3 = 23;
+ regs.user_arg4 = 24;
+ regs.user_arg5 = 25;
+ regs.user_ip -= 2;
+
+ if (ptrace(PTRACE_SETREGS, chld, 0, &regs) != 0)
+ err(1, "PTRACE_SETREGS");
+
+ if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0)
+ err(1, "PTRACE_SYSEMU");
+ wait_trap(chld);
+
+ if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
+ err(1, "PTRACE_GETREGS");
+
+ if (regs.user_syscall_nr != SYS_getpid ||
+ regs.user_arg0 != 20 || regs.user_arg1 != 21 || regs.user_arg2 != 22 ||
+ regs.user_arg3 != 23 || regs.user_arg4 != 24 || regs.user_arg5 != 25) {
+ printf("[FAIL]\tRestart nr or args are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5);
+ nerrs++;
+ } else {
+ printf("[OK]\tReplacement nr and args are correct\n");
+ }
+
+ if (ptrace(PTRACE_CONT, chld, 0, 0) != 0)
+ err(1, "PTRACE_CONT");
+ if (waitpid(chld, &status, 0) != chld)
+ err(1, "waitpid");
+ if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
+ printf("[FAIL]\tChild failed\n");
+ nerrs++;
+ } else {
+ printf("[OK]\tChild exited cleanly\n");
+ }
+}
+
+static void test_restart_under_ptrace(void)
+{
+ printf("[RUN]\tkernel syscall restart under ptrace\n");
+ pid_t chld = fork();
+ if (chld < 0)
+ err(1, "fork");
+
+ if (chld == 0) {
+ if (ptrace(PTRACE_TRACEME, 0, 0, 0) != 0)
+ err(1, "PTRACE_TRACEME");
+
+ pid_t pid = getpid(), tid = syscall(SYS_gettid);
+
+ printf("\tChild will take a nap until signaled\n");
+ setsigign(SIGUSR1, SA_RESTART);
+ syscall(SYS_tgkill, pid, tid, SIGSTOP);
+
+ syscall(SYS_pause, 0, 0, 0, 0, 0, 0);
+ _exit(0);
+ }
+
+ int status;
+
+ /* Wait for SIGSTOP. */
+ if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status))
+ err(1, "waitpid");
+
+ struct user_regs_struct regs;
+
+ printf("[RUN]\tSYSCALL\n");
+ if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0)
+ err(1, "PTRACE_SYSCALL");
+ wait_trap(chld);
+
+ /* We should be stopped at pause(2) entry. */
+
+ if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
+ err(1, "PTRACE_GETREGS");
+
+ if (regs.user_syscall_nr != SYS_pause ||
+ regs.user_arg0 != 0 || regs.user_arg1 != 0 ||
+ regs.user_arg2 != 0 || regs.user_arg3 != 0 ||
+ regs.user_arg4 != 0 || regs.user_arg5 != 0) {
+ printf("[FAIL]\tInitial args are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5);
+ nerrs++;
+ } else {
+ printf("[OK]\tInitial nr and args are correct\n");
+ }
+
+ /* Interrupt it. */
+ kill(chld, SIGUSR1);
+
+ /* Advance. We should be stopped at exit. */
+ printf("[RUN]\tSYSCALL\n");
+ if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0)
+ err(1, "PTRACE_SYSCALL");
+ wait_trap(chld);
+
+ if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
+ err(1, "PTRACE_GETREGS");
+
+ if (regs.user_syscall_nr != SYS_pause ||
+ regs.user_arg0 != 0 || regs.user_arg1 != 0 ||
+ regs.user_arg2 != 0 || regs.user_arg3 != 0 ||
+ regs.user_arg4 != 0 || regs.user_arg5 != 0) {
+ printf("[FAIL]\tArgs after SIGUSR1 are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5);
+ nerrs++;
+ } else {
+ printf("[OK]\tArgs after SIGUSR1 are correct (ax = %ld)\n",
+ (long)regs.user_ax);
+ }
+
+ /* Poke the regs back in. This must not break anything. */
+ if (ptrace(PTRACE_SETREGS, chld, 0, &regs) != 0)
+ err(1, "PTRACE_SETREGS");
+
+ /* Catch the (ignored) SIGUSR1. */
+ if (ptrace(PTRACE_CONT, chld, 0, 0) != 0)
+ err(1, "PTRACE_CONT");
+ if (waitpid(chld, &status, 0) != chld)
+ err(1, "waitpid");
+ if (!WIFSTOPPED(status)) {
+ printf("[FAIL]\tChild was stopped for SIGUSR1 (status = 0x%x)\n", status);
+ nerrs++;
+ } else {
+ printf("[OK]\tChild got SIGUSR1\n");
+ }
+
+ /* The next event should be pause(2) again. */
+ printf("[RUN]\tStep again\n");
+ if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0)
+ err(1, "PTRACE_SYSCALL");
+ wait_trap(chld);
+
+ /* We should be stopped at pause(2) entry. */
+
+ if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
+ err(1, "PTRACE_GETREGS");
+
+ if (regs.user_syscall_nr != SYS_pause ||
+ regs.user_arg0 != 0 || regs.user_arg1 != 0 ||
+ regs.user_arg2 != 0 || regs.user_arg3 != 0 ||
+ regs.user_arg4 != 0 || regs.user_arg5 != 0) {
+ printf("[FAIL]\tpause did not restart (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5);
+ nerrs++;
+ } else {
+ printf("[OK]\tpause(2) restarted correctly\n");
+ }
+
+ /* Kill it. */
+ kill(chld, SIGKILL);
+ if (waitpid(chld, &status, 0) != chld)
+ err(1, "waitpid");
+}
+
+int main()
+{
+ printf("[RUN]\tCheck int80 return regs\n");
+ test_sys32_regs(do_full_int80);
+
+#if defined(__i386__) && (!defined(__GLIBC__) || __GLIBC__ > 2 || __GLIBC_MINOR__ >= 16)
+ vsyscall32 = (void *)getauxval(AT_SYSINFO);
+ if (vsyscall32) {
+ printf("[RUN]\tCheck AT_SYSINFO return regs\n");
+ test_sys32_regs(do_full_vsyscall32);
+ } else {
+ printf("[SKIP]\tAT_SYSINFO is not available\n");
+ }
+#endif
+
+ test_ptrace_syscall_restart();
+
+ test_restart_under_ptrace();
+
+ return 0;
+}
diff --git a/tools/testing/selftests/x86/raw_syscall_helper_32.S b/tools/testing/selftests/x86/raw_syscall_helper_32.S
new file mode 100644
index 000000000..94410fa2b
--- /dev/null
+++ b/tools/testing/selftests/x86/raw_syscall_helper_32.S
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+.global sys32_helper
+sys32_helper:
+ /* Args: syscall_args_32*, function pointer */
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 5*4(%esp), %eax /* pointer to args struct */
+
+ movl 1*4(%eax), %ebx
+ movl 2*4(%eax), %ecx
+ movl 3*4(%eax), %edx
+ movl 4*4(%eax), %esi
+ movl 5*4(%eax), %edi
+ movl 6*4(%eax), %ebp
+ movl 0*4(%eax), %eax
+
+ call *(6*4)(%esp) /* Do the syscall */
+
+ /* Now we need to recover without losing any reg values */
+ pushl %eax
+ movl 6*4(%esp), %eax
+ popl 0*4(%eax)
+ movl %ebx, 1*4(%eax)
+ movl %ecx, 2*4(%eax)
+ movl %edx, 3*4(%eax)
+ movl %esi, 4*4(%eax)
+ movl %edi, 5*4(%eax)
+ movl %ebp, 6*4(%eax)
+
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+
+ .type sys32_helper, @function
+ .size sys32_helper, .-sys32_helper
+
+.global int80_and_ret
+int80_and_ret:
+ int $0x80
+ ret
+
+ .type int80_and_ret, @function
+ .size int80_and_ret, .-int80_and_ret
diff --git a/tools/testing/selftests/x86/sigreturn.c b/tools/testing/selftests/x86/sigreturn.c
new file mode 100644
index 000000000..4d9dc3f2f
--- /dev/null
+++ b/tools/testing/selftests/x86/sigreturn.c
@@ -0,0 +1,871 @@
+/*
+ * sigreturn.c - tests for x86 sigreturn(2) and exit-to-userspace
+ * Copyright (c) 2014-2015 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * This is a series of tests that exercises the sigreturn(2) syscall and
+ * the IRET / SYSRET paths in the kernel.
+ *
+ * For now, this focuses on the effects of unusual CS and SS values,
+ * and it has a bunch of tests to make sure that ESP/RSP is restored
+ * properly.
+ *
+ * The basic idea behind these tests is to raise(SIGUSR1) to create a
+ * sigcontext frame, plug in the values to be tested, and then return,
+ * which implicitly invokes sigreturn(2) and programs the user context
+ * as desired.
+ *
+ * For tests for which we expect sigreturn and the subsequent return to
+ * user mode to succeed, we return to a short trampoline that generates
+ * SIGTRAP so that the meat of the tests can be ordinary C code in a
+ * SIGTRAP handler.
+ *
+ * The inner workings of each test is documented below.
+ *
+ * Do not run on outdated, unpatched kernels at risk of nasty crashes.
+ */
+
+#define _GNU_SOURCE
+
+#include <sys/time.h>
+#include <time.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+#include <sys/signal.h>
+#include <sys/ucontext.h>
+#include <asm/ldt.h>
+#include <err.h>
+#include <setjmp.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <sys/ptrace.h>
+#include <sys/user.h>
+
+/* Pull in AR_xyz defines. */
+typedef unsigned int u32;
+typedef unsigned short u16;
+#include "../../../../arch/x86/include/asm/desc_defs.h"
+
+/*
+ * Copied from asm/ucontext.h, as asm/ucontext.h conflicts badly with the glibc
+ * headers.
+ */
+#ifdef __x86_64__
+/*
+ * UC_SIGCONTEXT_SS will be set when delivering 64-bit or x32 signals on
+ * kernels that save SS in the sigcontext. All kernels that set
+ * UC_SIGCONTEXT_SS will correctly restore at least the low 32 bits of esp
+ * regardless of SS (i.e. they implement espfix).
+ *
+ * Kernels that set UC_SIGCONTEXT_SS will also set UC_STRICT_RESTORE_SS
+ * when delivering a signal that came from 64-bit code.
+ *
+ * Sigreturn restores SS as follows:
+ *
+ * if (saved SS is valid || UC_STRICT_RESTORE_SS is set ||
+ * saved CS is not 64-bit)
+ * new SS = saved SS (will fail IRET and signal if invalid)
+ * else
+ * new SS = a flat 32-bit data segment
+ */
+#define UC_SIGCONTEXT_SS 0x2
+#define UC_STRICT_RESTORE_SS 0x4
+#endif
+
+/*
+ * In principle, this test can run on Linux emulation layers (e.g.
+ * Illumos "LX branded zones"). Solaris-based kernels reserve LDT
+ * entries 0-5 for their own internal purposes, so start our LDT
+ * allocations above that reservation. (The tests don't pass on LX
+ * branded zones, but at least this lets them run.)
+ */
+#define LDT_OFFSET 6
+
+/* An aligned stack accessible through some of our segments. */
+static unsigned char stack16[65536] __attribute__((aligned(4096)));
+
+/*
+ * An aligned int3 instruction used as a trampoline. Some of the tests
+ * want to fish out their ss values, so this trampoline copies ss to eax
+ * before the int3.
+ */
+asm (".pushsection .text\n\t"
+ ".type int3, @function\n\t"
+ ".align 4096\n\t"
+ "int3:\n\t"
+ "mov %ss,%ecx\n\t"
+ "int3\n\t"
+ ".size int3, . - int3\n\t"
+ ".align 4096, 0xcc\n\t"
+ ".popsection");
+extern char int3[4096];
+
+/*
+ * At startup, we prepapre:
+ *
+ * - ldt_nonexistent_sel: An LDT entry that doesn't exist (all-zero
+ * descriptor or out of bounds).
+ * - code16_sel: A 16-bit LDT code segment pointing to int3.
+ * - data16_sel: A 16-bit LDT data segment pointing to stack16.
+ * - npcode32_sel: A 32-bit not-present LDT code segment pointing to int3.
+ * - npdata32_sel: A 32-bit not-present LDT data segment pointing to stack16.
+ * - gdt_data16_idx: A 16-bit GDT data segment pointing to stack16.
+ * - gdt_npdata32_idx: A 32-bit not-present GDT data segment pointing to
+ * stack16.
+ *
+ * For no particularly good reason, xyz_sel is a selector value with the
+ * RPL and LDT bits filled in, whereas xyz_idx is just an index into the
+ * descriptor table. These variables will be zero if their respective
+ * segments could not be allocated.
+ */
+static unsigned short ldt_nonexistent_sel;
+static unsigned short code16_sel, data16_sel, npcode32_sel, npdata32_sel;
+
+static unsigned short gdt_data16_idx, gdt_npdata32_idx;
+
+static unsigned short GDT3(int idx)
+{
+ return (idx << 3) | 3;
+}
+
+static unsigned short LDT3(int idx)
+{
+ return (idx << 3) | 7;
+}
+
+/* Our sigaltstack scratch space. */
+static char altstack_data[SIGSTKSZ];
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+ int flags)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = handler;
+ sa.sa_flags = SA_SIGINFO | flags;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+}
+
+static void clearhandler(int sig)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_handler = SIG_DFL;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+}
+
+static void add_ldt(const struct user_desc *desc, unsigned short *var,
+ const char *name)
+{
+ if (syscall(SYS_modify_ldt, 1, desc, sizeof(*desc)) == 0) {
+ *var = LDT3(desc->entry_number);
+ } else {
+ printf("[NOTE]\tFailed to create %s segment\n", name);
+ *var = 0;
+ }
+}
+
+static void setup_ldt(void)
+{
+ if ((unsigned long)stack16 > (1ULL << 32) - sizeof(stack16))
+ errx(1, "stack16 is too high\n");
+ if ((unsigned long)int3 > (1ULL << 32) - sizeof(int3))
+ errx(1, "int3 is too high\n");
+
+ ldt_nonexistent_sel = LDT3(LDT_OFFSET + 2);
+
+ const struct user_desc code16_desc = {
+ .entry_number = LDT_OFFSET + 0,
+ .base_addr = (unsigned long)int3,
+ .limit = 4095,
+ .seg_32bit = 0,
+ .contents = 2, /* Code, not conforming */
+ .read_exec_only = 0,
+ .limit_in_pages = 0,
+ .seg_not_present = 0,
+ .useable = 0
+ };
+ add_ldt(&code16_desc, &code16_sel, "code16");
+
+ const struct user_desc data16_desc = {
+ .entry_number = LDT_OFFSET + 1,
+ .base_addr = (unsigned long)stack16,
+ .limit = 0xffff,
+ .seg_32bit = 0,
+ .contents = 0, /* Data, grow-up */
+ .read_exec_only = 0,
+ .limit_in_pages = 0,
+ .seg_not_present = 0,
+ .useable = 0
+ };
+ add_ldt(&data16_desc, &data16_sel, "data16");
+
+ const struct user_desc npcode32_desc = {
+ .entry_number = LDT_OFFSET + 3,
+ .base_addr = (unsigned long)int3,
+ .limit = 4095,
+ .seg_32bit = 1,
+ .contents = 2, /* Code, not conforming */
+ .read_exec_only = 0,
+ .limit_in_pages = 0,
+ .seg_not_present = 1,
+ .useable = 0
+ };
+ add_ldt(&npcode32_desc, &npcode32_sel, "npcode32");
+
+ const struct user_desc npdata32_desc = {
+ .entry_number = LDT_OFFSET + 4,
+ .base_addr = (unsigned long)stack16,
+ .limit = 0xffff,
+ .seg_32bit = 1,
+ .contents = 0, /* Data, grow-up */
+ .read_exec_only = 0,
+ .limit_in_pages = 0,
+ .seg_not_present = 1,
+ .useable = 0
+ };
+ add_ldt(&npdata32_desc, &npdata32_sel, "npdata32");
+
+ struct user_desc gdt_data16_desc = {
+ .entry_number = -1,
+ .base_addr = (unsigned long)stack16,
+ .limit = 0xffff,
+ .seg_32bit = 0,
+ .contents = 0, /* Data, grow-up */
+ .read_exec_only = 0,
+ .limit_in_pages = 0,
+ .seg_not_present = 0,
+ .useable = 0
+ };
+
+ if (syscall(SYS_set_thread_area, &gdt_data16_desc) == 0) {
+ /*
+ * This probably indicates vulnerability to CVE-2014-8133.
+ * Merely getting here isn't definitive, though, and we'll
+ * diagnose the problem for real later on.
+ */
+ printf("[WARN]\tset_thread_area allocated data16 at index %d\n",
+ gdt_data16_desc.entry_number);
+ gdt_data16_idx = gdt_data16_desc.entry_number;
+ } else {
+ printf("[OK]\tset_thread_area refused 16-bit data\n");
+ }
+
+ struct user_desc gdt_npdata32_desc = {
+ .entry_number = -1,
+ .base_addr = (unsigned long)stack16,
+ .limit = 0xffff,
+ .seg_32bit = 1,
+ .contents = 0, /* Data, grow-up */
+ .read_exec_only = 0,
+ .limit_in_pages = 0,
+ .seg_not_present = 1,
+ .useable = 0
+ };
+
+ if (syscall(SYS_set_thread_area, &gdt_npdata32_desc) == 0) {
+ /*
+ * As a hardening measure, newer kernels don't allow this.
+ */
+ printf("[WARN]\tset_thread_area allocated npdata32 at index %d\n",
+ gdt_npdata32_desc.entry_number);
+ gdt_npdata32_idx = gdt_npdata32_desc.entry_number;
+ } else {
+ printf("[OK]\tset_thread_area refused 16-bit data\n");
+ }
+}
+
+/* State used by our signal handlers. */
+static gregset_t initial_regs, requested_regs, resulting_regs;
+
+/* Instructions for the SIGUSR1 handler. */
+static volatile unsigned short sig_cs, sig_ss;
+static volatile sig_atomic_t sig_trapped, sig_err, sig_trapno;
+#ifdef __x86_64__
+static volatile sig_atomic_t sig_corrupt_final_ss;
+#endif
+
+/* Abstractions for some 32-bit vs 64-bit differences. */
+#ifdef __x86_64__
+# define REG_IP REG_RIP
+# define REG_SP REG_RSP
+# define REG_CX REG_RCX
+
+struct selectors {
+ unsigned short cs, gs, fs, ss;
+};
+
+static unsigned short *ssptr(ucontext_t *ctx)
+{
+ struct selectors *sels = (void *)&ctx->uc_mcontext.gregs[REG_CSGSFS];
+ return &sels->ss;
+}
+
+static unsigned short *csptr(ucontext_t *ctx)
+{
+ struct selectors *sels = (void *)&ctx->uc_mcontext.gregs[REG_CSGSFS];
+ return &sels->cs;
+}
+#else
+# define REG_IP REG_EIP
+# define REG_SP REG_ESP
+# define REG_CX REG_ECX
+
+static greg_t *ssptr(ucontext_t *ctx)
+{
+ return &ctx->uc_mcontext.gregs[REG_SS];
+}
+
+static greg_t *csptr(ucontext_t *ctx)
+{
+ return &ctx->uc_mcontext.gregs[REG_CS];
+}
+#endif
+
+/*
+ * Checks a given selector for its code bitness or returns -1 if it's not
+ * a usable code segment selector.
+ */
+int cs_bitness(unsigned short cs)
+{
+ uint32_t valid = 0, ar;
+ asm ("lar %[cs], %[ar]\n\t"
+ "jnz 1f\n\t"
+ "mov $1, %[valid]\n\t"
+ "1:"
+ : [ar] "=r" (ar), [valid] "+rm" (valid)
+ : [cs] "r" (cs));
+
+ if (!valid)
+ return -1;
+
+ bool db = (ar & (1 << 22));
+ bool l = (ar & (1 << 21));
+
+ if (!(ar & (1<<11)))
+ return -1; /* Not code. */
+
+ if (l && !db)
+ return 64;
+ else if (!l && db)
+ return 32;
+ else if (!l && !db)
+ return 16;
+ else
+ return -1; /* Unknown bitness. */
+}
+
+/*
+ * Checks a given selector for its code bitness or returns -1 if it's not
+ * a usable code segment selector.
+ */
+bool is_valid_ss(unsigned short cs)
+{
+ uint32_t valid = 0, ar;
+ asm ("lar %[cs], %[ar]\n\t"
+ "jnz 1f\n\t"
+ "mov $1, %[valid]\n\t"
+ "1:"
+ : [ar] "=r" (ar), [valid] "+rm" (valid)
+ : [cs] "r" (cs));
+
+ if (!valid)
+ return false;
+
+ if ((ar & AR_TYPE_MASK) != AR_TYPE_RWDATA &&
+ (ar & AR_TYPE_MASK) != AR_TYPE_RWDATA_EXPDOWN)
+ return false;
+
+ return (ar & AR_P);
+}
+
+/* Number of errors in the current test case. */
+static volatile sig_atomic_t nerrs;
+
+static void validate_signal_ss(int sig, ucontext_t *ctx)
+{
+#ifdef __x86_64__
+ bool was_64bit = (cs_bitness(*csptr(ctx)) == 64);
+
+ if (!(ctx->uc_flags & UC_SIGCONTEXT_SS)) {
+ printf("[FAIL]\tUC_SIGCONTEXT_SS was not set\n");
+ nerrs++;
+
+ /*
+ * This happens on Linux 4.1. The rest will fail, too, so
+ * return now to reduce the noise.
+ */
+ return;
+ }
+
+ /* UC_STRICT_RESTORE_SS is set iff we came from 64-bit mode. */
+ if (!!(ctx->uc_flags & UC_STRICT_RESTORE_SS) != was_64bit) {
+ printf("[FAIL]\tUC_STRICT_RESTORE_SS was wrong in signal %d\n",
+ sig);
+ nerrs++;
+ }
+
+ if (is_valid_ss(*ssptr(ctx))) {
+ /*
+ * DOSEMU was written before 64-bit sigcontext had SS, and
+ * it tries to figure out the signal source SS by looking at
+ * the physical register. Make sure that keeps working.
+ */
+ unsigned short hw_ss;
+ asm ("mov %%ss, %0" : "=rm" (hw_ss));
+ if (hw_ss != *ssptr(ctx)) {
+ printf("[FAIL]\tHW SS didn't match saved SS\n");
+ nerrs++;
+ }
+ }
+#endif
+}
+
+/*
+ * SIGUSR1 handler. Sets CS and SS as requested and points IP to the
+ * int3 trampoline. Sets SP to a large known value so that we can see
+ * whether the value round-trips back to user mode correctly.
+ */
+static void sigusr1(int sig, siginfo_t *info, void *ctx_void)
+{
+ ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+ validate_signal_ss(sig, ctx);
+
+ memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
+
+ *csptr(ctx) = sig_cs;
+ *ssptr(ctx) = sig_ss;
+
+ ctx->uc_mcontext.gregs[REG_IP] =
+ sig_cs == code16_sel ? 0 : (unsigned long)&int3;
+ ctx->uc_mcontext.gregs[REG_SP] = (unsigned long)0x8badf00d5aadc0deULL;
+ ctx->uc_mcontext.gregs[REG_CX] = 0;
+
+ memcpy(&requested_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
+ requested_regs[REG_CX] = *ssptr(ctx); /* The asm code does this. */
+
+ return;
+}
+
+/*
+ * Called after a successful sigreturn (via int3) or from a failed
+ * sigreturn (directly by kernel). Restores our state so that the
+ * original raise(SIGUSR1) returns.
+ */
+static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
+{
+ ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+ validate_signal_ss(sig, ctx);
+
+ sig_err = ctx->uc_mcontext.gregs[REG_ERR];
+ sig_trapno = ctx->uc_mcontext.gregs[REG_TRAPNO];
+
+ unsigned short ss;
+ asm ("mov %%ss,%0" : "=r" (ss));
+
+ greg_t asm_ss = ctx->uc_mcontext.gregs[REG_CX];
+ if (asm_ss != sig_ss && sig == SIGTRAP) {
+ /* Sanity check failure. */
+ printf("[FAIL]\tSIGTRAP: ss = %hx, frame ss = %hx, ax = %llx\n",
+ ss, *ssptr(ctx), (unsigned long long)asm_ss);
+ nerrs++;
+ }
+
+ memcpy(&resulting_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
+ memcpy(&ctx->uc_mcontext.gregs, &initial_regs, sizeof(gregset_t));
+
+#ifdef __x86_64__
+ if (sig_corrupt_final_ss) {
+ if (ctx->uc_flags & UC_STRICT_RESTORE_SS) {
+ printf("[FAIL]\tUC_STRICT_RESTORE_SS was set inappropriately\n");
+ nerrs++;
+ } else {
+ /*
+ * DOSEMU transitions from 32-bit to 64-bit mode by
+ * adjusting sigcontext, and it requires that this work
+ * even if the saved SS is bogus.
+ */
+ printf("\tCorrupting SS on return to 64-bit mode\n");
+ *ssptr(ctx) = 0;
+ }
+ }
+#endif
+
+ sig_trapped = sig;
+}
+
+#ifdef __x86_64__
+/* Tests recovery if !UC_STRICT_RESTORE_SS */
+static void sigusr2(int sig, siginfo_t *info, void *ctx_void)
+{
+ ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+ if (!(ctx->uc_flags & UC_STRICT_RESTORE_SS)) {
+ printf("[FAIL]\traise(2) didn't set UC_STRICT_RESTORE_SS\n");
+ nerrs++;
+ return; /* We can't do the rest. */
+ }
+
+ ctx->uc_flags &= ~UC_STRICT_RESTORE_SS;
+ *ssptr(ctx) = 0;
+
+ /* Return. The kernel should recover without sending another signal. */
+}
+
+static int test_nonstrict_ss(void)
+{
+ clearhandler(SIGUSR1);
+ clearhandler(SIGTRAP);
+ clearhandler(SIGSEGV);
+ clearhandler(SIGILL);
+ sethandler(SIGUSR2, sigusr2, 0);
+
+ nerrs = 0;
+
+ printf("[RUN]\tClear UC_STRICT_RESTORE_SS and corrupt SS\n");
+ raise(SIGUSR2);
+ if (!nerrs)
+ printf("[OK]\tIt worked\n");
+
+ return nerrs;
+}
+#endif
+
+/* Finds a usable code segment of the requested bitness. */
+int find_cs(int bitness)
+{
+ unsigned short my_cs;
+
+ asm ("mov %%cs,%0" : "=r" (my_cs));
+
+ if (cs_bitness(my_cs) == bitness)
+ return my_cs;
+ if (cs_bitness(my_cs + (2 << 3)) == bitness)
+ return my_cs + (2 << 3);
+ if (my_cs > (2<<3) && cs_bitness(my_cs - (2 << 3)) == bitness)
+ return my_cs - (2 << 3);
+ if (cs_bitness(code16_sel) == bitness)
+ return code16_sel;
+
+ printf("[WARN]\tCould not find %d-bit CS\n", bitness);
+ return -1;
+}
+
+static int test_valid_sigreturn(int cs_bits, bool use_16bit_ss, int force_ss)
+{
+ int cs = find_cs(cs_bits);
+ if (cs == -1) {
+ printf("[SKIP]\tCode segment unavailable for %d-bit CS, %d-bit SS\n",
+ cs_bits, use_16bit_ss ? 16 : 32);
+ return 0;
+ }
+
+ if (force_ss != -1) {
+ sig_ss = force_ss;
+ } else {
+ if (use_16bit_ss) {
+ if (!data16_sel) {
+ printf("[SKIP]\tData segment unavailable for %d-bit CS, 16-bit SS\n",
+ cs_bits);
+ return 0;
+ }
+ sig_ss = data16_sel;
+ } else {
+ asm volatile ("mov %%ss,%0" : "=r" (sig_ss));
+ }
+ }
+
+ sig_cs = cs;
+
+ printf("[RUN]\tValid sigreturn: %d-bit CS (%hx), %d-bit SS (%hx%s)\n",
+ cs_bits, sig_cs, use_16bit_ss ? 16 : 32, sig_ss,
+ (sig_ss & 4) ? "" : ", GDT");
+
+ raise(SIGUSR1);
+
+ nerrs = 0;
+
+ /*
+ * Check that each register had an acceptable value when the
+ * int3 trampoline was invoked.
+ */
+ for (int i = 0; i < NGREG; i++) {
+ greg_t req = requested_regs[i], res = resulting_regs[i];
+
+ if (i == REG_TRAPNO || i == REG_IP)
+ continue; /* don't care */
+
+ if (i == REG_SP) {
+ /*
+ * If we were using a 16-bit stack segment, then
+ * the kernel is a bit stuck: IRET only restores
+ * the low 16 bits of ESP/RSP if SS is 16-bit.
+ * The kernel uses a hack to restore bits 31:16,
+ * but that hack doesn't help with bits 63:32.
+ * On Intel CPUs, bits 63:32 end up zeroed, and, on
+ * AMD CPUs, they leak the high bits of the kernel
+ * espfix64 stack pointer. There's very little that
+ * the kernel can do about it.
+ *
+ * Similarly, if we are returning to a 32-bit context,
+ * the CPU will often lose the high 32 bits of RSP.
+ */
+
+ if (res == req)
+ continue;
+
+ if (cs_bits != 64 && ((res ^ req) & 0xFFFFFFFF) == 0) {
+ printf("[NOTE]\tSP: %llx -> %llx\n",
+ (unsigned long long)req,
+ (unsigned long long)res);
+ continue;
+ }
+
+ printf("[FAIL]\tSP mismatch: requested 0x%llx; got 0x%llx\n",
+ (unsigned long long)requested_regs[i],
+ (unsigned long long)resulting_regs[i]);
+ nerrs++;
+ continue;
+ }
+
+ bool ignore_reg = false;
+#if __i386__
+ if (i == REG_UESP)
+ ignore_reg = true;
+#else
+ if (i == REG_CSGSFS) {
+ struct selectors *req_sels =
+ (void *)&requested_regs[REG_CSGSFS];
+ struct selectors *res_sels =
+ (void *)&resulting_regs[REG_CSGSFS];
+ if (req_sels->cs != res_sels->cs) {
+ printf("[FAIL]\tCS mismatch: requested 0x%hx; got 0x%hx\n",
+ req_sels->cs, res_sels->cs);
+ nerrs++;
+ }
+
+ if (req_sels->ss != res_sels->ss) {
+ printf("[FAIL]\tSS mismatch: requested 0x%hx; got 0x%hx\n",
+ req_sels->ss, res_sels->ss);
+ nerrs++;
+ }
+
+ continue;
+ }
+#endif
+
+ /* Sanity check on the kernel */
+ if (i == REG_CX && req != res) {
+ printf("[FAIL]\tCX (saved SP) mismatch: requested 0x%llx; got 0x%llx\n",
+ (unsigned long long)req,
+ (unsigned long long)res);
+ nerrs++;
+ continue;
+ }
+
+ if (req != res && !ignore_reg) {
+ printf("[FAIL]\tReg %d mismatch: requested 0x%llx; got 0x%llx\n",
+ i, (unsigned long long)req,
+ (unsigned long long)res);
+ nerrs++;
+ }
+ }
+
+ if (nerrs == 0)
+ printf("[OK]\tall registers okay\n");
+
+ return nerrs;
+}
+
+static int test_bad_iret(int cs_bits, unsigned short ss, int force_cs)
+{
+ int cs = force_cs == -1 ? find_cs(cs_bits) : force_cs;
+ if (cs == -1)
+ return 0;
+
+ sig_cs = cs;
+ sig_ss = ss;
+
+ printf("[RUN]\t%d-bit CS (%hx), bogus SS (%hx)\n",
+ cs_bits, sig_cs, sig_ss);
+
+ sig_trapped = 0;
+ raise(SIGUSR1);
+ if (sig_trapped) {
+ char errdesc[32] = "";
+ if (sig_err) {
+ const char *src = (sig_err & 1) ? " EXT" : "";
+ const char *table;
+ if ((sig_err & 0x6) == 0x0)
+ table = "GDT";
+ else if ((sig_err & 0x6) == 0x4)
+ table = "LDT";
+ else if ((sig_err & 0x6) == 0x2)
+ table = "IDT";
+ else
+ table = "???";
+
+ sprintf(errdesc, "%s%s index %d, ",
+ table, src, sig_err >> 3);
+ }
+
+ char trapname[32];
+ if (sig_trapno == 13)
+ strcpy(trapname, "GP");
+ else if (sig_trapno == 11)
+ strcpy(trapname, "NP");
+ else if (sig_trapno == 12)
+ strcpy(trapname, "SS");
+ else if (sig_trapno == 32)
+ strcpy(trapname, "IRET"); /* X86_TRAP_IRET */
+ else
+ sprintf(trapname, "%d", sig_trapno);
+
+ printf("[OK]\tGot #%s(0x%lx) (i.e. %s%s)\n",
+ trapname, (unsigned long)sig_err,
+ errdesc, strsignal(sig_trapped));
+ return 0;
+ } else {
+ /*
+ * This also implicitly tests UC_STRICT_RESTORE_SS:
+ * We check that these signals set UC_STRICT_RESTORE_SS and,
+ * if UC_STRICT_RESTORE_SS doesn't cause strict behavior,
+ * then we won't get SIGSEGV.
+ */
+ printf("[FAIL]\tDid not get SIGSEGV\n");
+ return 1;
+ }
+}
+
+int main()
+{
+ int total_nerrs = 0;
+ unsigned short my_cs, my_ss;
+
+ asm volatile ("mov %%cs,%0" : "=r" (my_cs));
+ asm volatile ("mov %%ss,%0" : "=r" (my_ss));
+ setup_ldt();
+
+ stack_t stack = {
+ .ss_sp = altstack_data,
+ .ss_size = SIGSTKSZ,
+ };
+ if (sigaltstack(&stack, NULL) != 0)
+ err(1, "sigaltstack");
+
+ sethandler(SIGUSR1, sigusr1, 0);
+ sethandler(SIGTRAP, sigtrap, SA_ONSTACK);
+
+ /* Easy cases: return to a 32-bit SS in each possible CS bitness. */
+ total_nerrs += test_valid_sigreturn(64, false, -1);
+ total_nerrs += test_valid_sigreturn(32, false, -1);
+ total_nerrs += test_valid_sigreturn(16, false, -1);
+
+ /*
+ * Test easy espfix cases: return to a 16-bit LDT SS in each possible
+ * CS bitness. NB: with a long mode CS, the SS bitness is irrelevant.
+ *
+ * This catches the original missing-espfix-on-64-bit-kernels issue
+ * as well as CVE-2014-8134.
+ */
+ total_nerrs += test_valid_sigreturn(64, true, -1);
+ total_nerrs += test_valid_sigreturn(32, true, -1);
+ total_nerrs += test_valid_sigreturn(16, true, -1);
+
+ if (gdt_data16_idx) {
+ /*
+ * For performance reasons, Linux skips espfix if SS points
+ * to the GDT. If we were able to allocate a 16-bit SS in
+ * the GDT, see if it leaks parts of the kernel stack pointer.
+ *
+ * This tests for CVE-2014-8133.
+ */
+ total_nerrs += test_valid_sigreturn(64, true,
+ GDT3(gdt_data16_idx));
+ total_nerrs += test_valid_sigreturn(32, true,
+ GDT3(gdt_data16_idx));
+ total_nerrs += test_valid_sigreturn(16, true,
+ GDT3(gdt_data16_idx));
+ }
+
+#ifdef __x86_64__
+ /* Nasty ABI case: check SS corruption handling. */
+ sig_corrupt_final_ss = 1;
+ total_nerrs += test_valid_sigreturn(32, false, -1);
+ total_nerrs += test_valid_sigreturn(32, true, -1);
+ sig_corrupt_final_ss = 0;
+#endif
+
+ /*
+ * We're done testing valid sigreturn cases. Now we test states
+ * for which sigreturn itself will succeed but the subsequent
+ * entry to user mode will fail.
+ *
+ * Depending on the failure mode and the kernel bitness, these
+ * entry failures can generate SIGSEGV, SIGBUS, or SIGILL.
+ */
+ clearhandler(SIGTRAP);
+ sethandler(SIGSEGV, sigtrap, SA_ONSTACK);
+ sethandler(SIGBUS, sigtrap, SA_ONSTACK);
+ sethandler(SIGILL, sigtrap, SA_ONSTACK); /* 32-bit kernels do this */
+
+ /* Easy failures: invalid SS, resulting in #GP(0) */
+ test_bad_iret(64, ldt_nonexistent_sel, -1);
+ test_bad_iret(32, ldt_nonexistent_sel, -1);
+ test_bad_iret(16, ldt_nonexistent_sel, -1);
+
+ /* These fail because SS isn't a data segment, resulting in #GP(SS) */
+ test_bad_iret(64, my_cs, -1);
+ test_bad_iret(32, my_cs, -1);
+ test_bad_iret(16, my_cs, -1);
+
+ /* Try to return to a not-present code segment, triggering #NP(SS). */
+ test_bad_iret(32, my_ss, npcode32_sel);
+
+ /*
+ * Try to return to a not-present but otherwise valid data segment.
+ * This will cause IRET to fail with #SS on the espfix stack. This
+ * exercises CVE-2014-9322.
+ *
+ * Note that, if espfix is enabled, 64-bit Linux will lose track
+ * of the actual cause of failure and report #GP(0) instead.
+ * This would be very difficult for Linux to avoid, because
+ * espfix64 causes IRET failures to be promoted to #DF, so the
+ * original exception frame is never pushed onto the stack.
+ */
+ test_bad_iret(32, npdata32_sel, -1);
+
+ /*
+ * Try to return to a not-present but otherwise valid data
+ * segment without invoking espfix. Newer kernels don't allow
+ * this to happen in the first place. On older kernels, though,
+ * this can trigger CVE-2014-9322.
+ */
+ if (gdt_npdata32_idx)
+ test_bad_iret(32, GDT3(gdt_npdata32_idx), -1);
+
+#ifdef __x86_64__
+ total_nerrs += test_nonstrict_ss();
+#endif
+
+ return total_nerrs ? 1 : 0;
+}
diff --git a/tools/testing/selftests/x86/single_step_syscall.c b/tools/testing/selftests/x86/single_step_syscall.c
new file mode 100644
index 000000000..ddfdd635d
--- /dev/null
+++ b/tools/testing/selftests/x86/single_step_syscall.c
@@ -0,0 +1,187 @@
+/*
+ * single_step_syscall.c - single-steps various x86 syscalls
+ * Copyright (c) 2014-2015 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * This is a very simple series of tests that makes system calls with
+ * the TF flag set. This exercises some nasty kernel code in the
+ * SYSENTER case: SYSENTER does not clear TF, so SYSENTER with TF set
+ * immediately issues #DB from CPL 0. This requires special handling in
+ * the kernel.
+ */
+
+#define _GNU_SOURCE
+
+#include <sys/time.h>
+#include <time.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+#include <sys/signal.h>
+#include <sys/ucontext.h>
+#include <asm/ldt.h>
+#include <err.h>
+#include <setjmp.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <sys/ptrace.h>
+#include <sys/user.h>
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+ int flags)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = handler;
+ sa.sa_flags = SA_SIGINFO | flags;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+}
+
+static volatile sig_atomic_t sig_traps;
+
+#ifdef __x86_64__
+# define REG_IP REG_RIP
+# define WIDTH "q"
+# define INT80_CLOBBERS "r8", "r9", "r10", "r11"
+#else
+# define REG_IP REG_EIP
+# define WIDTH "l"
+# define INT80_CLOBBERS
+#endif
+
+static unsigned long get_eflags(void)
+{
+ unsigned long eflags;
+ asm volatile ("pushf" WIDTH "\n\tpop" WIDTH " %0" : "=rm" (eflags));
+ return eflags;
+}
+
+static void set_eflags(unsigned long eflags)
+{
+ asm volatile ("push" WIDTH " %0\n\tpopf" WIDTH
+ : : "rm" (eflags) : "flags");
+}
+
+#define X86_EFLAGS_TF (1UL << 8)
+
+static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
+{
+ ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+ if (get_eflags() & X86_EFLAGS_TF) {
+ set_eflags(get_eflags() & ~X86_EFLAGS_TF);
+ printf("[WARN]\tSIGTRAP handler had TF set\n");
+ _exit(1);
+ }
+
+ sig_traps++;
+
+ if (sig_traps == 10000 || sig_traps == 10001) {
+ printf("[WARN]\tHit %d SIGTRAPs with si_addr 0x%lx, ip 0x%lx\n",
+ (int)sig_traps,
+ (unsigned long)info->si_addr,
+ (unsigned long)ctx->uc_mcontext.gregs[REG_IP]);
+ }
+}
+
+static void check_result(void)
+{
+ unsigned long new_eflags = get_eflags();
+ set_eflags(new_eflags & ~X86_EFLAGS_TF);
+
+ if (!sig_traps) {
+ printf("[FAIL]\tNo SIGTRAP\n");
+ exit(1);
+ }
+
+ if (!(new_eflags & X86_EFLAGS_TF)) {
+ printf("[FAIL]\tTF was cleared\n");
+ exit(1);
+ }
+
+ printf("[OK]\tSurvived with TF set and %d traps\n", (int)sig_traps);
+ sig_traps = 0;
+}
+
+int main()
+{
+#ifdef CAN_BUILD_32
+ int tmp;
+#endif
+
+ sethandler(SIGTRAP, sigtrap, 0);
+
+ printf("[RUN]\tSet TF and check nop\n");
+ set_eflags(get_eflags() | X86_EFLAGS_TF);
+ asm volatile ("nop");
+ check_result();
+
+#ifdef __x86_64__
+ printf("[RUN]\tSet TF and check syscall-less opportunistic sysret\n");
+ set_eflags(get_eflags() | X86_EFLAGS_TF);
+ extern unsigned char post_nop[];
+ asm volatile ("pushf" WIDTH "\n\t"
+ "pop" WIDTH " %%r11\n\t"
+ "nop\n\t"
+ "post_nop:"
+ : : "c" (post_nop) : "r11");
+ check_result();
+#endif
+#ifdef CAN_BUILD_32
+ printf("[RUN]\tSet TF and check int80\n");
+ set_eflags(get_eflags() | X86_EFLAGS_TF);
+ asm volatile ("int $0x80" : "=a" (tmp) : "a" (SYS_getpid)
+ : INT80_CLOBBERS);
+ check_result();
+#endif
+
+ /*
+ * This test is particularly interesting if fast syscalls use
+ * SYSENTER: it triggers a nasty design flaw in SYSENTER.
+ * Specifically, SYSENTER does not clear TF, so either SYSENTER
+ * or the next instruction traps at CPL0. (Of course, Intel
+ * mostly forgot to document exactly what happens here.) So we
+ * get a CPL0 fault with usergs (on 64-bit kernels) and possibly
+ * no stack. The only sane way the kernel can possibly handle
+ * it is to clear TF on return from the #DB handler, but this
+ * happens way too early to set TF in the saved pt_regs, so the
+ * kernel has to do something clever to avoid losing track of
+ * the TF bit.
+ *
+ * Needless to say, we've had bugs in this area.
+ */
+ syscall(SYS_getpid); /* Force symbol binding without TF set. */
+ printf("[RUN]\tSet TF and check a fast syscall\n");
+ set_eflags(get_eflags() | X86_EFLAGS_TF);
+ syscall(SYS_getpid);
+ check_result();
+
+ /* Now make sure that another fast syscall doesn't set TF again. */
+ printf("[RUN]\tFast syscall with TF cleared\n");
+ fflush(stdout); /* Force a syscall */
+ if (get_eflags() & X86_EFLAGS_TF) {
+ printf("[FAIL]\tTF is now set\n");
+ exit(1);
+ }
+ if (sig_traps) {
+ printf("[FAIL]\tGot SIGTRAP\n");
+ exit(1);
+ }
+ printf("[OK]\tNothing unexpected happened\n");
+
+ return 0;
+}
diff --git a/tools/testing/selftests/x86/syscall_arg_fault.c b/tools/testing/selftests/x86/syscall_arg_fault.c
new file mode 100644
index 000000000..7db4fc9fa
--- /dev/null
+++ b/tools/testing/selftests/x86/syscall_arg_fault.c
@@ -0,0 +1,130 @@
+/*
+ * syscall_arg_fault.c - tests faults 32-bit fast syscall stack args
+ * Copyright (c) 2015 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/signal.h>
+#include <sys/ucontext.h>
+#include <err.h>
+#include <setjmp.h>
+#include <errno.h>
+
+/* Our sigaltstack scratch space. */
+static unsigned char altstack_data[SIGSTKSZ];
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+ int flags)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = handler;
+ sa.sa_flags = SA_SIGINFO | flags;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+}
+
+static volatile sig_atomic_t sig_traps;
+static sigjmp_buf jmpbuf;
+
+static volatile sig_atomic_t n_errs;
+
+static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
+{
+ ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+ if (ctx->uc_mcontext.gregs[REG_EAX] != -EFAULT) {
+ printf("[FAIL]\tAX had the wrong value: 0x%x\n",
+ ctx->uc_mcontext.gregs[REG_EAX]);
+ n_errs++;
+ } else {
+ printf("[OK]\tSeems okay\n");
+ }
+
+ siglongjmp(jmpbuf, 1);
+}
+
+static void sigill(int sig, siginfo_t *info, void *ctx_void)
+{
+ printf("[SKIP]\tIllegal instruction\n");
+ siglongjmp(jmpbuf, 1);
+}
+
+int main()
+{
+ stack_t stack = {
+ .ss_sp = altstack_data,
+ .ss_size = SIGSTKSZ,
+ };
+ if (sigaltstack(&stack, NULL) != 0)
+ err(1, "sigaltstack");
+
+ sethandler(SIGSEGV, sigsegv, SA_ONSTACK);
+ sethandler(SIGILL, sigill, SA_ONSTACK);
+
+ /*
+ * Exercise another nasty special case. The 32-bit SYSCALL
+ * and SYSENTER instructions (even in compat mode) each
+ * clobber one register. A Linux system call has a syscall
+ * number and six arguments, and the user stack pointer
+ * needs to live in some register on return. That means
+ * that we need eight registers, but SYSCALL and SYSENTER
+ * only preserve seven registers. As a result, one argument
+ * ends up on the stack. The stack is user memory, which
+ * means that the kernel can fail to read it.
+ *
+ * The 32-bit fast system calls don't have a defined ABI:
+ * we're supposed to invoke them through the vDSO. So we'll
+ * fudge it: we set all regs to invalid pointer values and
+ * invoke the entry instruction. The return will fail no
+ * matter what, and we completely lose our program state,
+ * but we can fix it up with a signal handler.
+ */
+
+ printf("[RUN]\tSYSENTER with invalid state\n");
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ asm volatile (
+ "movl $-1, %%eax\n\t"
+ "movl $-1, %%ebx\n\t"
+ "movl $-1, %%ecx\n\t"
+ "movl $-1, %%edx\n\t"
+ "movl $-1, %%esi\n\t"
+ "movl $-1, %%edi\n\t"
+ "movl $-1, %%ebp\n\t"
+ "movl $-1, %%esp\n\t"
+ "sysenter"
+ : : : "memory", "flags");
+ }
+
+ printf("[RUN]\tSYSCALL with invalid state\n");
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ asm volatile (
+ "movl $-1, %%eax\n\t"
+ "movl $-1, %%ebx\n\t"
+ "movl $-1, %%ecx\n\t"
+ "movl $-1, %%edx\n\t"
+ "movl $-1, %%esi\n\t"
+ "movl $-1, %%edi\n\t"
+ "movl $-1, %%ebp\n\t"
+ "movl $-1, %%esp\n\t"
+ "syscall\n\t"
+ "pushl $0" /* make sure we segfault cleanly */
+ : : : "memory", "flags");
+ }
+
+ return 0;
+}
diff --git a/tools/testing/selftests/x86/syscall_nt.c b/tools/testing/selftests/x86/syscall_nt.c
new file mode 100644
index 000000000..74e6b3fc2
--- /dev/null
+++ b/tools/testing/selftests/x86/syscall_nt.c
@@ -0,0 +1,96 @@
+/*
+ * syscall_nt.c - checks syscalls with NT set
+ * Copyright (c) 2014-2015 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Some obscure user-space code requires the ability to make system calls
+ * with FLAGS.NT set. Make sure it works.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <signal.h>
+#include <err.h>
+#include <sys/syscall.h>
+#include <asm/processor-flags.h>
+
+#ifdef __x86_64__
+# define WIDTH "q"
+#else
+# define WIDTH "l"
+#endif
+
+static unsigned int nerrs;
+
+static unsigned long get_eflags(void)
+{
+ unsigned long eflags;
+ asm volatile ("pushf" WIDTH "\n\tpop" WIDTH " %0" : "=rm" (eflags));
+ return eflags;
+}
+
+static void set_eflags(unsigned long eflags)
+{
+ asm volatile ("push" WIDTH " %0\n\tpopf" WIDTH
+ : : "rm" (eflags) : "flags");
+}
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+ int flags)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = handler;
+ sa.sa_flags = SA_SIGINFO | flags;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+}
+
+static void sigtrap(int sig, siginfo_t *si, void *ctx_void)
+{
+}
+
+static void do_it(unsigned long extraflags)
+{
+ unsigned long flags;
+
+ set_eflags(get_eflags() | extraflags);
+ syscall(SYS_getpid);
+ flags = get_eflags();
+ set_eflags(X86_EFLAGS_IF | X86_EFLAGS_FIXED);
+ if ((flags & extraflags) == extraflags) {
+ printf("[OK]\tThe syscall worked and flags are still set\n");
+ } else {
+ printf("[FAIL]\tThe syscall worked but flags were cleared (flags = 0x%lx but expected 0x%lx set)\n",
+ flags, extraflags);
+ nerrs++;
+ }
+}
+
+int main(void)
+{
+ printf("[RUN]\tSet NT and issue a syscall\n");
+ do_it(X86_EFLAGS_NT);
+
+ /*
+ * Now try it again with TF set -- TF forces returns via IRET in all
+ * cases except non-ptregs-using 64-bit full fast path syscalls.
+ */
+
+ sethandler(SIGTRAP, sigtrap, 0);
+
+ printf("[RUN]\tSet NT|TF and issue a syscall\n");
+ do_it(X86_EFLAGS_NT | X86_EFLAGS_TF);
+
+ return nerrs == 0 ? 0 : 1;
+}
diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c
new file mode 100644
index 000000000..d85ec5b36
--- /dev/null
+++ b/tools/testing/selftests/x86/sysret_rip.c
@@ -0,0 +1,195 @@
+/*
+ * sigreturn.c - tests that x86 avoids Intel SYSRET pitfalls
+ * Copyright (c) 2014-2016 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+#include <sys/signal.h>
+#include <sys/ucontext.h>
+#include <sys/syscall.h>
+#include <err.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <setjmp.h>
+#include <sys/user.h>
+#include <sys/mman.h>
+#include <assert.h>
+
+
+asm (
+ ".pushsection \".text\", \"ax\"\n\t"
+ ".balign 4096\n\t"
+ "test_page: .globl test_page\n\t"
+ ".fill 4094,1,0xcc\n\t"
+ "test_syscall_insn:\n\t"
+ "syscall\n\t"
+ ".ifne . - test_page - 4096\n\t"
+ ".error \"test page is not one page long\"\n\t"
+ ".endif\n\t"
+ ".popsection"
+ );
+
+extern const char test_page[];
+static void const *current_test_page_addr = test_page;
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+ int flags)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = handler;
+ sa.sa_flags = SA_SIGINFO | flags;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+}
+
+static void clearhandler(int sig)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_handler = SIG_DFL;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+}
+
+/* State used by our signal handlers. */
+static gregset_t initial_regs;
+
+static volatile unsigned long rip;
+
+static void sigsegv_for_sigreturn_test(int sig, siginfo_t *info, void *ctx_void)
+{
+ ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+ if (rip != ctx->uc_mcontext.gregs[REG_RIP]) {
+ printf("[FAIL]\tRequested RIP=0x%lx but got RIP=0x%lx\n",
+ rip, (unsigned long)ctx->uc_mcontext.gregs[REG_RIP]);
+ fflush(stdout);
+ _exit(1);
+ }
+
+ memcpy(&ctx->uc_mcontext.gregs, &initial_regs, sizeof(gregset_t));
+
+ printf("[OK]\tGot SIGSEGV at RIP=0x%lx\n", rip);
+}
+
+static void sigusr1(int sig, siginfo_t *info, void *ctx_void)
+{
+ ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+ memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
+
+ /* Set IP and CX to match so that SYSRET can happen. */
+ ctx->uc_mcontext.gregs[REG_RIP] = rip;
+ ctx->uc_mcontext.gregs[REG_RCX] = rip;
+
+ /* R11 and EFLAGS should already match. */
+ assert(ctx->uc_mcontext.gregs[REG_EFL] ==
+ ctx->uc_mcontext.gregs[REG_R11]);
+
+ sethandler(SIGSEGV, sigsegv_for_sigreturn_test, SA_RESETHAND);
+
+ return;
+}
+
+static void test_sigreturn_to(unsigned long ip)
+{
+ rip = ip;
+ printf("[RUN]\tsigreturn to 0x%lx\n", ip);
+ raise(SIGUSR1);
+}
+
+static jmp_buf jmpbuf;
+
+static void sigsegv_for_fallthrough(int sig, siginfo_t *info, void *ctx_void)
+{
+ ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+ if (rip != ctx->uc_mcontext.gregs[REG_RIP]) {
+ printf("[FAIL]\tExpected SIGSEGV at 0x%lx but got RIP=0x%lx\n",
+ rip, (unsigned long)ctx->uc_mcontext.gregs[REG_RIP]);
+ fflush(stdout);
+ _exit(1);
+ }
+
+ siglongjmp(jmpbuf, 1);
+}
+
+static void test_syscall_fallthrough_to(unsigned long ip)
+{
+ void *new_address = (void *)(ip - 4096);
+ void *ret;
+
+ printf("[RUN]\tTrying a SYSCALL that falls through to 0x%lx\n", ip);
+
+ ret = mremap((void *)current_test_page_addr, 4096, 4096,
+ MREMAP_MAYMOVE | MREMAP_FIXED, new_address);
+ if (ret == MAP_FAILED) {
+ if (ip <= (1UL << 47) - PAGE_SIZE) {
+ err(1, "mremap to %p", new_address);
+ } else {
+ printf("[OK]\tmremap to %p failed\n", new_address);
+ return;
+ }
+ }
+
+ if (ret != new_address)
+ errx(1, "mremap malfunctioned: asked for %p but got %p\n",
+ new_address, ret);
+
+ current_test_page_addr = new_address;
+ rip = ip;
+
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ asm volatile ("call *%[syscall_insn]" :: "a" (SYS_getpid),
+ [syscall_insn] "rm" (ip - 2));
+ errx(1, "[FAIL]\tSyscall trampoline returned");
+ }
+
+ printf("[OK]\tWe survived\n");
+}
+
+int main()
+{
+ /*
+ * When the kernel returns from a slow-path syscall, it will
+ * detect whether SYSRET is appropriate. If it incorrectly
+ * thinks that SYSRET is appropriate when RIP is noncanonical,
+ * it'll crash on Intel CPUs.
+ */
+ sethandler(SIGUSR1, sigusr1, 0);
+ for (int i = 47; i < 64; i++)
+ test_sigreturn_to(1UL<<i);
+
+ clearhandler(SIGUSR1);
+
+ sethandler(SIGSEGV, sigsegv_for_fallthrough, 0);
+
+ /* One extra test to check that we didn't screw up the mremap logic. */
+ test_syscall_fallthrough_to((1UL << 47) - 2*PAGE_SIZE);
+
+ /* These are the interesting cases. */
+ for (int i = 47; i < 64; i++) {
+ test_syscall_fallthrough_to((1UL<<i) - PAGE_SIZE);
+ test_syscall_fallthrough_to(1UL<<i);
+ }
+
+ return 0;
+}
diff --git a/tools/testing/selftests/x86/sysret_ss_attrs.c b/tools/testing/selftests/x86/sysret_ss_attrs.c
new file mode 100644
index 000000000..ce42d5a64
--- /dev/null
+++ b/tools/testing/selftests/x86/sysret_ss_attrs.c
@@ -0,0 +1,112 @@
+/*
+ * sysret_ss_attrs.c - test that syscalls return valid hidden SS attributes
+ * Copyright (c) 2015 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * On AMD CPUs, SYSRET can return with a valid SS descriptor with with
+ * the hidden attributes set to an unusable state. Make sure the kernel
+ * doesn't let this happen.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <err.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <pthread.h>
+
+static void *threadproc(void *ctx)
+{
+ /*
+ * Do our best to cause sleeps on this CPU to exit the kernel and
+ * re-enter with SS = 0.
+ */
+ while (true)
+ ;
+
+ return NULL;
+}
+
+#ifdef __x86_64__
+extern unsigned long call32_from_64(void *stack, void (*function)(void));
+
+asm (".pushsection .text\n\t"
+ ".code32\n\t"
+ "test_ss:\n\t"
+ "pushl $0\n\t"
+ "popl %eax\n\t"
+ "ret\n\t"
+ ".code64");
+extern void test_ss(void);
+#endif
+
+int main()
+{
+ /*
+ * Start a busy-looping thread on the same CPU we're on.
+ * For simplicity, just stick everything to CPU 0. This will
+ * fail in some containers, but that's probably okay.
+ */
+ cpu_set_t cpuset;
+ CPU_ZERO(&cpuset);
+ CPU_SET(0, &cpuset);
+ if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
+ printf("[WARN]\tsched_setaffinity failed\n");
+
+ pthread_t thread;
+ if (pthread_create(&thread, 0, threadproc, 0) != 0)
+ err(1, "pthread_create");
+
+#ifdef __x86_64__
+ unsigned char *stack32 = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
+ MAP_32BIT | MAP_ANONYMOUS | MAP_PRIVATE,
+ -1, 0);
+ if (stack32 == MAP_FAILED)
+ err(1, "mmap");
+#endif
+
+ printf("[RUN]\tSyscalls followed by SS validation\n");
+
+ for (int i = 0; i < 1000; i++) {
+ /*
+ * Go to sleep and return using sysret (if we're 64-bit
+ * or we're 32-bit on AMD on a 64-bit kernel). On AMD CPUs,
+ * SYSRET doesn't fix up the cached SS descriptor, so the
+ * kernel needs some kind of workaround to make sure that we
+ * end the system call with a valid stack segment. This
+ * can be a confusing failure because the SS *selector*
+ * is the same regardless.
+ */
+ usleep(2);
+
+#ifdef __x86_64__
+ /*
+ * On 32-bit, just doing a syscall through glibc is enough
+ * to cause a crash if our cached SS descriptor is invalid.
+ * On 64-bit, it's not, so try extra hard.
+ */
+ call32_from_64(stack32 + 4088, test_ss);
+#endif
+ }
+
+ printf("[OK]\tWe survived\n");
+
+#ifdef __x86_64__
+ munmap(stack32, 4096);
+#endif
+
+ return 0;
+}
diff --git a/tools/testing/selftests/x86/test_FCMOV.c b/tools/testing/selftests/x86/test_FCMOV.c
new file mode 100644
index 000000000..6b5036fbb
--- /dev/null
+++ b/tools/testing/selftests/x86/test_FCMOV.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0
+#undef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#undef __USE_GNU
+#define __USE_GNU 1
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+
+#define TEST(insn) \
+long double __attribute__((noinline)) insn(long flags) \
+{ \
+ long double out; \
+ asm ("\n" \
+ " push %1""\n" \
+ " popf""\n" \
+ " fldpi""\n" \
+ " fld1""\n" \
+ " " #insn " %%st(1), %%st" "\n" \
+ " ffree %%st(1)" "\n" \
+ : "=t" (out) \
+ : "r" (flags) \
+ ); \
+ return out; \
+}
+
+TEST(fcmovb)
+TEST(fcmove)
+TEST(fcmovbe)
+TEST(fcmovu)
+TEST(fcmovnb)
+TEST(fcmovne)
+TEST(fcmovnbe)
+TEST(fcmovnu)
+
+enum {
+ CF = 1 << 0,
+ PF = 1 << 2,
+ ZF = 1 << 6,
+};
+
+void sighandler(int sig)
+{
+ printf("[FAIL]\tGot signal %d, exiting\n", sig);
+ exit(1);
+}
+
+int main(int argc, char **argv, char **envp)
+{
+ int err = 0;
+
+ /* SIGILL triggers on 32-bit kernels w/o fcomi emulation
+ * when run with "no387 nofxsr". Other signals are caught
+ * just in case.
+ */
+ signal(SIGILL, sighandler);
+ signal(SIGFPE, sighandler);
+ signal(SIGSEGV, sighandler);
+
+ printf("[RUN]\tTesting fcmovCC instructions\n");
+ /* If fcmovCC() returns 1.0, the move wasn't done */
+ err |= !(fcmovb(0) == 1.0); err |= !(fcmovnb(0) != 1.0);
+ err |= !(fcmove(0) == 1.0); err |= !(fcmovne(0) != 1.0);
+ err |= !(fcmovbe(0) == 1.0); err |= !(fcmovnbe(0) != 1.0);
+ err |= !(fcmovu(0) == 1.0); err |= !(fcmovnu(0) != 1.0);
+
+ err |= !(fcmovb(CF) != 1.0); err |= !(fcmovnb(CF) == 1.0);
+ err |= !(fcmove(CF) == 1.0); err |= !(fcmovne(CF) != 1.0);
+ err |= !(fcmovbe(CF) != 1.0); err |= !(fcmovnbe(CF) == 1.0);
+ err |= !(fcmovu(CF) == 1.0); err |= !(fcmovnu(CF) != 1.0);
+
+ err |= !(fcmovb(ZF) == 1.0); err |= !(fcmovnb(ZF) != 1.0);
+ err |= !(fcmove(ZF) != 1.0); err |= !(fcmovne(ZF) == 1.0);
+ err |= !(fcmovbe(ZF) != 1.0); err |= !(fcmovnbe(ZF) == 1.0);
+ err |= !(fcmovu(ZF) == 1.0); err |= !(fcmovnu(ZF) != 1.0);
+
+ err |= !(fcmovb(PF) == 1.0); err |= !(fcmovnb(PF) != 1.0);
+ err |= !(fcmove(PF) == 1.0); err |= !(fcmovne(PF) != 1.0);
+ err |= !(fcmovbe(PF) == 1.0); err |= !(fcmovnbe(PF) != 1.0);
+ err |= !(fcmovu(PF) != 1.0); err |= !(fcmovnu(PF) == 1.0);
+
+ if (!err)
+ printf("[OK]\tfcmovCC\n");
+ else
+ printf("[FAIL]\tfcmovCC errors: %d\n", err);
+
+ return err;
+}
diff --git a/tools/testing/selftests/x86/test_FCOMI.c b/tools/testing/selftests/x86/test_FCOMI.c
new file mode 100644
index 000000000..aec6692c6
--- /dev/null
+++ b/tools/testing/selftests/x86/test_FCOMI.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0
+#undef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#undef __USE_GNU
+#define __USE_GNU 1
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <fenv.h>
+
+enum {
+ CF = 1 << 0,
+ PF = 1 << 2,
+ ZF = 1 << 6,
+ ARITH = CF | PF | ZF,
+};
+
+long res_fcomi_pi_1;
+long res_fcomi_1_pi;
+long res_fcomi_1_1;
+long res_fcomi_nan_1;
+/* sNaN is s|111 1111 1|1xx xxxx xxxx xxxx xxxx xxxx */
+/* qNaN is s|111 1111 1|0xx xxxx xxxx xxxx xxxx xxxx (some x must be nonzero) */
+int snan = 0x7fc11111;
+int qnan = 0x7f811111;
+unsigned short snan1[5];
+/* sNaN80 is s|111 1111 1111 1111 |10xx xx...xx (some x must be nonzero) */
+unsigned short snan80[5] = { 0x1111, 0x1111, 0x1111, 0x8111, 0x7fff };
+
+int test(long flags)
+{
+ feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+ asm ("\n"
+
+ " push %0""\n"
+ " popf""\n"
+ " fld1""\n"
+ " fldpi""\n"
+ " fcomi %%st(1), %%st" "\n"
+ " ffree %%st(0)" "\n"
+ " ffree %%st(1)" "\n"
+ " pushf""\n"
+ " pop res_fcomi_1_pi""\n"
+
+ " push %0""\n"
+ " popf""\n"
+ " fldpi""\n"
+ " fld1""\n"
+ " fcomi %%st(1), %%st" "\n"
+ " ffree %%st(0)" "\n"
+ " ffree %%st(1)" "\n"
+ " pushf""\n"
+ " pop res_fcomi_pi_1""\n"
+
+ " push %0""\n"
+ " popf""\n"
+ " fld1""\n"
+ " fld1""\n"
+ " fcomi %%st(1), %%st" "\n"
+ " ffree %%st(0)" "\n"
+ " ffree %%st(1)" "\n"
+ " pushf""\n"
+ " pop res_fcomi_1_1""\n"
+ :
+ : "r" (flags)
+ );
+ if ((res_fcomi_1_pi & ARITH) != (0)) {
+ printf("[BAD]\tfcomi_1_pi with flags:%lx\n", flags);
+ return 1;
+ }
+ if ((res_fcomi_pi_1 & ARITH) != (CF)) {
+ printf("[BAD]\tfcomi_pi_1 with flags:%lx->%lx\n", flags, res_fcomi_pi_1 & ARITH);
+ return 1;
+ }
+ if ((res_fcomi_1_1 & ARITH) != (ZF)) {
+ printf("[BAD]\tfcomi_1_1 with flags:%lx\n", flags);
+ return 1;
+ }
+ if (fetestexcept(FE_INVALID) != 0) {
+ printf("[BAD]\tFE_INVALID is set in %s\n", __func__);
+ return 1;
+ }
+ return 0;
+}
+
+int test_qnan(long flags)
+{
+ feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+ asm ("\n"
+ " push %0""\n"
+ " popf""\n"
+ " flds qnan""\n"
+ " fld1""\n"
+ " fnclex""\n" // fld of a qnan raised FE_INVALID, clear it
+ " fcomi %%st(1), %%st" "\n"
+ " ffree %%st(0)" "\n"
+ " ffree %%st(1)" "\n"
+ " pushf""\n"
+ " pop res_fcomi_nan_1""\n"
+ :
+ : "r" (flags)
+ );
+ if ((res_fcomi_nan_1 & ARITH) != (ZF|CF|PF)) {
+ printf("[BAD]\tfcomi_qnan_1 with flags:%lx\n", flags);
+ return 1;
+ }
+ if (fetestexcept(FE_INVALID) != FE_INVALID) {
+ printf("[BAD]\tFE_INVALID is not set in %s\n", __func__);
+ return 1;
+ }
+ return 0;
+}
+
+int testu_qnan(long flags)
+{
+ feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+ asm ("\n"
+ " push %0""\n"
+ " popf""\n"
+ " flds qnan""\n"
+ " fld1""\n"
+ " fnclex""\n" // fld of a qnan raised FE_INVALID, clear it
+ " fucomi %%st(1), %%st" "\n"
+ " ffree %%st(0)" "\n"
+ " ffree %%st(1)" "\n"
+ " pushf""\n"
+ " pop res_fcomi_nan_1""\n"
+ :
+ : "r" (flags)
+ );
+ if ((res_fcomi_nan_1 & ARITH) != (ZF|CF|PF)) {
+ printf("[BAD]\tfcomi_qnan_1 with flags:%lx\n", flags);
+ return 1;
+ }
+ if (fetestexcept(FE_INVALID) != 0) {
+ printf("[BAD]\tFE_INVALID is set in %s\n", __func__);
+ return 1;
+ }
+ return 0;
+}
+
+int testu_snan(long flags)
+{
+ feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+ asm ("\n"
+ " push %0""\n"
+ " popf""\n"
+// " flds snan""\n" // WRONG, this will convert 32-bit fp snan to a *qnan* in 80-bit fp register!
+// " fstpt snan1""\n" // if uncommented, it prints "snan1:7fff c111 1100 0000 0000" - c111, not 8111!
+// " fnclex""\n" // flds of a snan raised FE_INVALID, clear it
+ " fldt snan80""\n" // fldt never raise FE_INVALID
+ " fld1""\n"
+ " fucomi %%st(1), %%st" "\n"
+ " ffree %%st(0)" "\n"
+ " ffree %%st(1)" "\n"
+ " pushf""\n"
+ " pop res_fcomi_nan_1""\n"
+ :
+ : "r" (flags)
+ );
+ if ((res_fcomi_nan_1 & ARITH) != (ZF|CF|PF)) {
+ printf("[BAD]\tfcomi_qnan_1 with flags:%lx\n", flags);
+ return 1;
+ }
+// printf("snan:%x snan1:%04x %04x %04x %04x %04x\n", snan, snan1[4], snan1[3], snan1[2], snan1[1], snan1[0]);
+ if (fetestexcept(FE_INVALID) != FE_INVALID) {
+ printf("[BAD]\tFE_INVALID is not set in %s\n", __func__);
+ return 1;
+ }
+ return 0;
+}
+
+int testp(long flags)
+{
+ feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+ asm ("\n"
+
+ " push %0""\n"
+ " popf""\n"
+ " fld1""\n"
+ " fldpi""\n"
+ " fcomip %%st(1), %%st" "\n"
+ " ffree %%st(0)" "\n"
+ " pushf""\n"
+ " pop res_fcomi_1_pi""\n"
+
+ " push %0""\n"
+ " popf""\n"
+ " fldpi""\n"
+ " fld1""\n"
+ " fcomip %%st(1), %%st" "\n"
+ " ffree %%st(0)" "\n"
+ " pushf""\n"
+ " pop res_fcomi_pi_1""\n"
+
+ " push %0""\n"
+ " popf""\n"
+ " fld1""\n"
+ " fld1""\n"
+ " fcomip %%st(1), %%st" "\n"
+ " ffree %%st(0)" "\n"
+ " pushf""\n"
+ " pop res_fcomi_1_1""\n"
+ :
+ : "r" (flags)
+ );
+ if ((res_fcomi_1_pi & ARITH) != (0)) {
+ printf("[BAD]\tfcomi_1_pi with flags:%lx\n", flags);
+ return 1;
+ }
+ if ((res_fcomi_pi_1 & ARITH) != (CF)) {
+ printf("[BAD]\tfcomi_pi_1 with flags:%lx->%lx\n", flags, res_fcomi_pi_1 & ARITH);
+ return 1;
+ }
+ if ((res_fcomi_1_1 & ARITH) != (ZF)) {
+ printf("[BAD]\tfcomi_1_1 with flags:%lx\n", flags);
+ return 1;
+ }
+ if (fetestexcept(FE_INVALID) != 0) {
+ printf("[BAD]\tFE_INVALID is set in %s\n", __func__);
+ return 1;
+ }
+ return 0;
+}
+
+int testp_qnan(long flags)
+{
+ feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+ asm ("\n"
+ " push %0""\n"
+ " popf""\n"
+ " flds qnan""\n"
+ " fld1""\n"
+ " fnclex""\n" // fld of a qnan raised FE_INVALID, clear it
+ " fcomip %%st(1), %%st" "\n"
+ " ffree %%st(0)" "\n"
+ " pushf""\n"
+ " pop res_fcomi_nan_1""\n"
+ :
+ : "r" (flags)
+ );
+ if ((res_fcomi_nan_1 & ARITH) != (ZF|CF|PF)) {
+ printf("[BAD]\tfcomi_qnan_1 with flags:%lx\n", flags);
+ return 1;
+ }
+ if (fetestexcept(FE_INVALID) != FE_INVALID) {
+ printf("[BAD]\tFE_INVALID is not set in %s\n", __func__);
+ return 1;
+ }
+ return 0;
+}
+
+int testup_qnan(long flags)
+{
+ feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+ asm ("\n"
+ " push %0""\n"
+ " popf""\n"
+ " flds qnan""\n"
+ " fld1""\n"
+ " fnclex""\n" // fld of a qnan raised FE_INVALID, clear it
+ " fucomip %%st(1), %%st" "\n"
+ " ffree %%st(0)" "\n"
+ " pushf""\n"
+ " pop res_fcomi_nan_1""\n"
+ :
+ : "r" (flags)
+ );
+ if ((res_fcomi_nan_1 & ARITH) != (ZF|CF|PF)) {
+ printf("[BAD]\tfcomi_qnan_1 with flags:%lx\n", flags);
+ return 1;
+ }
+ if (fetestexcept(FE_INVALID) != 0) {
+ printf("[BAD]\tFE_INVALID is set in %s\n", __func__);
+ return 1;
+ }
+ return 0;
+}
+
+void sighandler(int sig)
+{
+ printf("[FAIL]\tGot signal %d, exiting\n", sig);
+ exit(1);
+}
+
+int main(int argc, char **argv, char **envp)
+{
+ int err = 0;
+
+ /* SIGILL triggers on 32-bit kernels w/o fcomi emulation
+ * when run with "no387 nofxsr". Other signals are caught
+ * just in case.
+ */
+ signal(SIGILL, sighandler);
+ signal(SIGFPE, sighandler);
+ signal(SIGSEGV, sighandler);
+
+ printf("[RUN]\tTesting f[u]comi[p] instructions\n");
+ err |= test(0);
+ err |= test_qnan(0);
+ err |= testu_qnan(0);
+ err |= testu_snan(0);
+ err |= test(CF|ZF|PF);
+ err |= test_qnan(CF|ZF|PF);
+ err |= testu_qnan(CF|ZF|PF);
+ err |= testu_snan(CF|ZF|PF);
+ err |= testp(0);
+ err |= testp_qnan(0);
+ err |= testup_qnan(0);
+ err |= testp(CF|ZF|PF);
+ err |= testp_qnan(CF|ZF|PF);
+ err |= testup_qnan(CF|ZF|PF);
+ if (!err)
+ printf("[OK]\tf[u]comi[p]\n");
+ else
+ printf("[FAIL]\tf[u]comi[p] errors: %d\n", err);
+
+ return err;
+}
diff --git a/tools/testing/selftests/x86/test_FISTTP.c b/tools/testing/selftests/x86/test_FISTTP.c
new file mode 100644
index 000000000..09789c0ce
--- /dev/null
+++ b/tools/testing/selftests/x86/test_FISTTP.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+#undef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#undef __USE_GNU
+#define __USE_GNU 1
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <fenv.h>
+
+unsigned long long res64 = -1;
+unsigned int res32 = -1;
+unsigned short res16 = -1;
+
+int test(void)
+{
+ int ex;
+
+ feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+ asm volatile ("\n"
+ " fld1""\n"
+ " fisttp res16""\n"
+ " fld1""\n"
+ " fisttpl res32""\n"
+ " fld1""\n"
+ " fisttpll res64""\n"
+ : : : "memory"
+ );
+ if (res16 != 1 || res32 != 1 || res64 != 1) {
+ printf("[BAD]\tfisttp 1\n");
+ return 1;
+ }
+ ex = fetestexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+ if (ex != 0) {
+ printf("[BAD]\tfisttp 1: wrong exception state\n");
+ return 1;
+ }
+
+ feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+ asm volatile ("\n"
+ " fldpi""\n"
+ " fisttp res16""\n"
+ " fldpi""\n"
+ " fisttpl res32""\n"
+ " fldpi""\n"
+ " fisttpll res64""\n"
+ : : : "memory"
+ );
+ if (res16 != 3 || res32 != 3 || res64 != 3) {
+ printf("[BAD]\tfisttp pi\n");
+ return 1;
+ }
+ ex = fetestexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+ if (ex != FE_INEXACT) {
+ printf("[BAD]\tfisttp pi: wrong exception state\n");
+ return 1;
+ }
+
+ feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+ asm volatile ("\n"
+ " fldpi""\n"
+ " fchs""\n"
+ " fisttp res16""\n"
+ " fldpi""\n"
+ " fchs""\n"
+ " fisttpl res32""\n"
+ " fldpi""\n"
+ " fchs""\n"
+ " fisttpll res64""\n"
+ : : : "memory"
+ );
+ if (res16 != 0xfffd || res32 != 0xfffffffd || res64 != 0xfffffffffffffffdULL) {
+ printf("[BAD]\tfisttp -pi\n");
+ return 1;
+ }
+ ex = fetestexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+ if (ex != FE_INEXACT) {
+ printf("[BAD]\tfisttp -pi: wrong exception state\n");
+ return 1;
+ }
+
+ feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+ asm volatile ("\n"
+ " fldln2""\n"
+ " fisttp res16""\n"
+ " fldln2""\n"
+ " fisttpl res32""\n"
+ " fldln2""\n"
+ " fisttpll res64""\n"
+ : : : "memory"
+ );
+ /* Test truncation to zero (round-to-nearest would give 1 here) */
+ if (res16 != 0 || res32 != 0 || res64 != 0) {
+ printf("[BAD]\tfisttp ln2\n");
+ return 1;
+ }
+ ex = fetestexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+ if (ex != FE_INEXACT) {
+ printf("[BAD]\tfisttp ln2: wrong exception state\n");
+ return 1;
+ }
+
+ return 0;
+}
+
+void sighandler(int sig)
+{
+ printf("[FAIL]\tGot signal %d, exiting\n", sig);
+ exit(1);
+}
+
+int main(int argc, char **argv, char **envp)
+{
+ int err = 0;
+
+ /* SIGILL triggers on 32-bit kernels w/o fisttp emulation
+ * when run with "no387 nofxsr". Other signals are caught
+ * just in case.
+ */
+ signal(SIGILL, sighandler);
+ signal(SIGFPE, sighandler);
+ signal(SIGSEGV, sighandler);
+
+ printf("[RUN]\tTesting fisttp instructions\n");
+ err |= test();
+ if (!err)
+ printf("[OK]\tfisttp\n");
+ else
+ printf("[FAIL]\tfisttp errors: %d\n", err);
+
+ return err;
+}
diff --git a/tools/testing/selftests/x86/test_mremap_vdso.c b/tools/testing/selftests/x86/test_mremap_vdso.c
new file mode 100644
index 000000000..64f11c8d9
--- /dev/null
+++ b/tools/testing/selftests/x86/test_mremap_vdso.c
@@ -0,0 +1,115 @@
+/*
+ * 32-bit test to check vDSO mremap.
+ *
+ * Copyright (c) 2016 Dmitry Safonov
+ * Suggested-by: Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+/*
+ * Can be built statically:
+ * gcc -Os -Wall -static -m32 test_mremap_vdso.c
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <string.h>
+
+#include <sys/mman.h>
+#include <sys/auxv.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+
+#define PAGE_SIZE 4096
+
+static int try_to_remap(void *vdso_addr, unsigned long size)
+{
+ void *dest_addr, *new_addr;
+
+ /* Searching for memory location where to remap */
+ dest_addr = mmap(0, size, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+ if (dest_addr == MAP_FAILED) {
+ printf("[WARN]\tmmap failed (%d): %m\n", errno);
+ return 0;
+ }
+
+ printf("[NOTE]\tMoving vDSO: [%p, %#lx] -> [%p, %#lx]\n",
+ vdso_addr, (unsigned long)vdso_addr + size,
+ dest_addr, (unsigned long)dest_addr + size);
+ fflush(stdout);
+
+ new_addr = mremap(vdso_addr, size, size,
+ MREMAP_FIXED|MREMAP_MAYMOVE, dest_addr);
+ if ((unsigned long)new_addr == (unsigned long)-1) {
+ munmap(dest_addr, size);
+ if (errno == EINVAL) {
+ printf("[NOTE]\tvDSO partial move failed, will try with bigger size\n");
+ return -1; /* Retry with larger */
+ }
+ printf("[FAIL]\tmremap failed (%d): %m\n", errno);
+ return 1;
+ }
+
+ return 0;
+
+}
+
+int main(int argc, char **argv, char **envp)
+{
+ pid_t child;
+
+ child = fork();
+ if (child == -1) {
+ printf("[WARN]\tfailed to fork (%d): %m\n", errno);
+ return 1;
+ }
+
+ if (child == 0) {
+ unsigned long vdso_size = PAGE_SIZE;
+ unsigned long auxval;
+ int ret = -1;
+
+ auxval = getauxval(AT_SYSINFO_EHDR);
+ printf("\tAT_SYSINFO_EHDR is %#lx\n", auxval);
+ if (!auxval || auxval == -ENOENT) {
+ printf("[WARN]\tgetauxval failed\n");
+ return 0;
+ }
+
+ /* Simpler than parsing ELF header */
+ while (ret < 0) {
+ ret = try_to_remap((void *)auxval, vdso_size);
+ vdso_size += PAGE_SIZE;
+ }
+
+#ifdef __i386__
+ /* Glibc is likely to explode now - exit with raw syscall */
+ asm volatile ("int $0x80" : : "a" (__NR_exit), "b" (!!ret));
+#else /* __x86_64__ */
+ syscall(SYS_exit, ret);
+#endif
+ } else {
+ int status;
+
+ if (waitpid(child, &status, 0) != child ||
+ !WIFEXITED(status)) {
+ printf("[FAIL]\tmremap() of the vDSO does not work on this kernel!\n");
+ return 1;
+ } else if (WEXITSTATUS(status) != 0) {
+ printf("[FAIL]\tChild failed with %d\n",
+ WEXITSTATUS(status));
+ return 1;
+ }
+ printf("[OK]\n");
+ }
+
+ return 0;
+}
diff --git a/tools/testing/selftests/x86/test_syscall_vdso.c b/tools/testing/selftests/x86/test_syscall_vdso.c
new file mode 100644
index 000000000..c9c328107
--- /dev/null
+++ b/tools/testing/selftests/x86/test_syscall_vdso.c
@@ -0,0 +1,408 @@
+/*
+ * 32-bit syscall ABI conformance test.
+ *
+ * Copyright (c) 2015 Denys Vlasenko
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+/*
+ * Can be built statically:
+ * gcc -Os -Wall -static -m32 test_syscall_vdso.c thunks_32.S
+ */
+#undef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#undef __USE_GNU
+#define __USE_GNU 1
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#include <elf.h>
+#include <sys/ptrace.h>
+#include <sys/wait.h>
+
+#if !defined(__i386__)
+int main(int argc, char **argv, char **envp)
+{
+ printf("[SKIP]\tNot a 32-bit x86 userspace\n");
+ return 0;
+}
+#else
+
+long syscall_addr;
+long get_syscall(char **envp)
+{
+ Elf32_auxv_t *auxv;
+ while (*envp++ != NULL)
+ continue;
+ for (auxv = (void *)envp; auxv->a_type != AT_NULL; auxv++)
+ if (auxv->a_type == AT_SYSINFO)
+ return auxv->a_un.a_val;
+ printf("[WARN]\tAT_SYSINFO not supplied\n");
+ return 0;
+}
+
+asm (
+ " .pushsection .text\n"
+ " .global int80\n"
+ "int80:\n"
+ " int $0x80\n"
+ " ret\n"
+ " .popsection\n"
+);
+extern char int80;
+
+struct regs64 {
+ uint64_t rax, rbx, rcx, rdx;
+ uint64_t rsi, rdi, rbp, rsp;
+ uint64_t r8, r9, r10, r11;
+ uint64_t r12, r13, r14, r15;
+};
+struct regs64 regs64;
+int kernel_is_64bit;
+
+asm (
+ " .pushsection .text\n"
+ " .code64\n"
+ "get_regs64:\n"
+ " push %rax\n"
+ " mov $regs64, %eax\n"
+ " pop 0*8(%rax)\n"
+ " movq %rbx, 1*8(%rax)\n"
+ " movq %rcx, 2*8(%rax)\n"
+ " movq %rdx, 3*8(%rax)\n"
+ " movq %rsi, 4*8(%rax)\n"
+ " movq %rdi, 5*8(%rax)\n"
+ " movq %rbp, 6*8(%rax)\n"
+ " movq %rsp, 7*8(%rax)\n"
+ " movq %r8, 8*8(%rax)\n"
+ " movq %r9, 9*8(%rax)\n"
+ " movq %r10, 10*8(%rax)\n"
+ " movq %r11, 11*8(%rax)\n"
+ " movq %r12, 12*8(%rax)\n"
+ " movq %r13, 13*8(%rax)\n"
+ " movq %r14, 14*8(%rax)\n"
+ " movq %r15, 15*8(%rax)\n"
+ " ret\n"
+ "poison_regs64:\n"
+ " movq $0x7f7f7f7f, %r8\n"
+ " shl $32, %r8\n"
+ " orq $0x7f7f7f7f, %r8\n"
+ " movq %r8, %r9\n"
+ " incq %r9\n"
+ " movq %r9, %r10\n"
+ " incq %r10\n"
+ " movq %r10, %r11\n"
+ " incq %r11\n"
+ " movq %r11, %r12\n"
+ " incq %r12\n"
+ " movq %r12, %r13\n"
+ " incq %r13\n"
+ " movq %r13, %r14\n"
+ " incq %r14\n"
+ " movq %r14, %r15\n"
+ " incq %r15\n"
+ " ret\n"
+ " .code32\n"
+ " .popsection\n"
+);
+extern void get_regs64(void);
+extern void poison_regs64(void);
+extern unsigned long call64_from_32(void (*function)(void));
+void print_regs64(void)
+{
+ if (!kernel_is_64bit)
+ return;
+ printf("ax:%016llx bx:%016llx cx:%016llx dx:%016llx\n", regs64.rax, regs64.rbx, regs64.rcx, regs64.rdx);
+ printf("si:%016llx di:%016llx bp:%016llx sp:%016llx\n", regs64.rsi, regs64.rdi, regs64.rbp, regs64.rsp);
+ printf(" 8:%016llx 9:%016llx 10:%016llx 11:%016llx\n", regs64.r8 , regs64.r9 , regs64.r10, regs64.r11);
+ printf("12:%016llx 13:%016llx 14:%016llx 15:%016llx\n", regs64.r12, regs64.r13, regs64.r14, regs64.r15);
+}
+
+int check_regs64(void)
+{
+ int err = 0;
+ int num = 8;
+ uint64_t *r64 = &regs64.r8;
+ uint64_t expected = 0x7f7f7f7f7f7f7f7fULL;
+
+ if (!kernel_is_64bit)
+ return 0;
+
+ do {
+ if (*r64 == expected++)
+ continue; /* register did not change */
+ if (syscall_addr != (long)&int80) {
+ /*
+ * Non-INT80 syscall entrypoints are allowed to clobber R8+ regs:
+ * either clear them to 0, or for R11, load EFLAGS.
+ */
+ if (*r64 == 0)
+ continue;
+ if (num == 11) {
+ printf("[NOTE]\tR11 has changed:%016llx - assuming clobbered by SYSRET insn\n", *r64);
+ continue;
+ }
+ } else {
+ /*
+ * INT80 syscall entrypoint can be used by
+ * 64-bit programs too, unlike SYSCALL/SYSENTER.
+ * Therefore it must preserve R12+
+ * (they are callee-saved registers in 64-bit C ABI).
+ *
+ * Starting in Linux 4.17 (and any kernel that
+ * backports the change), R8..11 are preserved.
+ * Historically (and probably unintentionally), they
+ * were clobbered or zeroed.
+ */
+ }
+ printf("[FAIL]\tR%d has changed:%016llx\n", num, *r64);
+ err++;
+ } while (r64++, ++num < 16);
+
+ if (!err)
+ printf("[OK]\tR8..R15 did not leak kernel data\n");
+ return err;
+}
+
+int nfds;
+fd_set rfds;
+fd_set wfds;
+fd_set efds;
+struct timespec timeout;
+sigset_t sigmask;
+struct {
+ sigset_t *sp;
+ int sz;
+} sigmask_desc;
+
+void prep_args()
+{
+ nfds = 42;
+ FD_ZERO(&rfds);
+ FD_ZERO(&wfds);
+ FD_ZERO(&efds);
+ FD_SET(0, &rfds);
+ FD_SET(1, &wfds);
+ FD_SET(2, &efds);
+ timeout.tv_sec = 0;
+ timeout.tv_nsec = 123;
+ sigemptyset(&sigmask);
+ sigaddset(&sigmask, SIGINT);
+ sigaddset(&sigmask, SIGUSR2);
+ sigaddset(&sigmask, SIGRTMAX);
+ sigmask_desc.sp = &sigmask;
+ sigmask_desc.sz = 8; /* bytes */
+}
+
+static void print_flags(const char *name, unsigned long r)
+{
+ static const char *bitarray[] = {
+ "\n" ,"c\n" ,/* Carry Flag */
+ "0 " ,"1 " ,/* Bit 1 - always on */
+ "" ,"p " ,/* Parity Flag */
+ "0 " ,"3? " ,
+ "" ,"a " ,/* Auxiliary carry Flag */
+ "0 " ,"5? " ,
+ "" ,"z " ,/* Zero Flag */
+ "" ,"s " ,/* Sign Flag */
+ "" ,"t " ,/* Trap Flag */
+ "" ,"i " ,/* Interrupt Flag */
+ "" ,"d " ,/* Direction Flag */
+ "" ,"o " ,/* Overflow Flag */
+ "0 " ,"1 " ,/* I/O Privilege Level (2 bits) */
+ "0" ,"1" ,/* I/O Privilege Level (2 bits) */
+ "" ,"n " ,/* Nested Task */
+ "0 " ,"15? ",
+ "" ,"r " ,/* Resume Flag */
+ "" ,"v " ,/* Virtual Mode */
+ "" ,"ac " ,/* Alignment Check/Access Control */
+ "" ,"vif ",/* Virtual Interrupt Flag */
+ "" ,"vip ",/* Virtual Interrupt Pending */
+ "" ,"id " ,/* CPUID detection */
+ NULL
+ };
+ const char **bitstr;
+ int bit;
+
+ printf("%s=%016lx ", name, r);
+ bitstr = bitarray + 42;
+ bit = 21;
+ if ((r >> 22) != 0)
+ printf("(extra bits are set) ");
+ do {
+ if (bitstr[(r >> bit) & 1][0])
+ fputs(bitstr[(r >> bit) & 1], stdout);
+ bitstr -= 2;
+ bit--;
+ } while (bit >= 0);
+}
+
+int run_syscall(void)
+{
+ long flags, bad_arg;
+
+ prep_args();
+
+ if (kernel_is_64bit)
+ call64_from_32(poison_regs64);
+ /*print_regs64();*/
+
+ asm("\n"
+ /* Try 6-arg syscall: pselect. It should return quickly */
+ " push %%ebp\n"
+ " mov $308, %%eax\n" /* PSELECT */
+ " mov nfds, %%ebx\n" /* ebx arg1 */
+ " mov $rfds, %%ecx\n" /* ecx arg2 */
+ " mov $wfds, %%edx\n" /* edx arg3 */
+ " mov $efds, %%esi\n" /* esi arg4 */
+ " mov $timeout, %%edi\n" /* edi arg5 */
+ " mov $sigmask_desc, %%ebp\n" /* %ebp arg6 */
+ " push $0x200ed7\n" /* set almost all flags */
+ " popf\n" /* except TF, IOPL, NT, RF, VM, AC, VIF, VIP */
+ " call *syscall_addr\n"
+ /* Check that registers are not clobbered */
+ " pushf\n"
+ " pop %%eax\n"
+ " cld\n"
+ " cmp nfds, %%ebx\n" /* ebx arg1 */
+ " mov $1, %%ebx\n"
+ " jne 1f\n"
+ " cmp $rfds, %%ecx\n" /* ecx arg2 */
+ " mov $2, %%ebx\n"
+ " jne 1f\n"
+ " cmp $wfds, %%edx\n" /* edx arg3 */
+ " mov $3, %%ebx\n"
+ " jne 1f\n"
+ " cmp $efds, %%esi\n" /* esi arg4 */
+ " mov $4, %%ebx\n"
+ " jne 1f\n"
+ " cmp $timeout, %%edi\n" /* edi arg5 */
+ " mov $5, %%ebx\n"
+ " jne 1f\n"
+ " cmpl $sigmask_desc, %%ebp\n" /* %ebp arg6 */
+ " mov $6, %%ebx\n"
+ " jne 1f\n"
+ " mov $0, %%ebx\n"
+ "1:\n"
+ " pop %%ebp\n"
+ : "=a" (flags), "=b" (bad_arg)
+ :
+ : "cx", "dx", "si", "di"
+ );
+
+ if (kernel_is_64bit) {
+ memset(&regs64, 0x77, sizeof(regs64));
+ call64_from_32(get_regs64);
+ /*print_regs64();*/
+ }
+
+ /*
+ * On paravirt kernels, flags are not preserved across syscalls.
+ * Thus, we do not consider it a bug if some are changed.
+ * We just show ones which do.
+ */
+ if ((0x200ed7 ^ flags) != 0) {
+ print_flags("[WARN]\tFlags before", 0x200ed7);
+ print_flags("[WARN]\tFlags after", flags);
+ print_flags("[WARN]\tFlags change", (0x200ed7 ^ flags));
+ }
+
+ if (bad_arg) {
+ printf("[FAIL]\targ#%ld clobbered\n", bad_arg);
+ return 1;
+ }
+ printf("[OK]\tArguments are preserved across syscall\n");
+
+ return check_regs64();
+}
+
+int run_syscall_twice()
+{
+ int exitcode = 0;
+ long sv;
+
+ if (syscall_addr) {
+ printf("[RUN]\tExecuting 6-argument 32-bit syscall via VDSO\n");
+ exitcode = run_syscall();
+ }
+ sv = syscall_addr;
+ syscall_addr = (long)&int80;
+ printf("[RUN]\tExecuting 6-argument 32-bit syscall via INT 80\n");
+ exitcode += run_syscall();
+ syscall_addr = sv;
+ return exitcode;
+}
+
+void ptrace_me()
+{
+ pid_t pid;
+
+ fflush(NULL);
+ pid = fork();
+ if (pid < 0)
+ exit(1);
+ if (pid == 0) {
+ /* child */
+ if (ptrace(PTRACE_TRACEME, 0L, 0L, 0L) != 0)
+ exit(0);
+ raise(SIGSTOP);
+ return;
+ }
+ /* parent */
+ printf("[RUN]\tRunning tests under ptrace\n");
+ while (1) {
+ int status;
+ pid = waitpid(-1, &status, __WALL);
+ if (WIFEXITED(status))
+ exit(WEXITSTATUS(status));
+ if (WIFSIGNALED(status))
+ exit(WTERMSIG(status));
+ if (pid <= 0 || !WIFSTOPPED(status)) /* paranoia */
+ exit(255);
+ /*
+ * Note: we do not inject sig = WSTOPSIG(status).
+ * We probably should, but careful: do not inject SIGTRAP
+ * generated by syscall entry/exit stops.
+ * That kills the child.
+ */
+ ptrace(PTRACE_SYSCALL, pid, 0L, 0L /*sig*/);
+ }
+}
+
+int main(int argc, char **argv, char **envp)
+{
+ int exitcode = 0;
+ int cs;
+
+ asm("\n"
+ " movl %%cs, %%eax\n"
+ : "=a" (cs)
+ );
+ kernel_is_64bit = (cs == 0x23);
+ if (!kernel_is_64bit)
+ printf("[NOTE]\tNot a 64-bit kernel, won't test R8..R15 leaks\n");
+
+ /* This only works for non-static builds:
+ * syscall_addr = dlsym(dlopen("linux-gate.so.1", RTLD_NOW), "__kernel_vsyscall");
+ */
+ syscall_addr = get_syscall(envp);
+
+ exitcode += run_syscall_twice();
+ ptrace_me();
+ exitcode += run_syscall_twice();
+
+ return exitcode;
+}
+#endif
diff --git a/tools/testing/selftests/x86/test_vdso.c b/tools/testing/selftests/x86/test_vdso.c
new file mode 100644
index 000000000..35edd61d1
--- /dev/null
+++ b/tools/testing/selftests/x86/test_vdso.c
@@ -0,0 +1,337 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ldt_gdt.c - Test cases for LDT and GDT access
+ * Copyright (c) 2011-2015 Andrew Lutomirski
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <sys/time.h>
+#include <time.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <dlfcn.h>
+#include <string.h>
+#include <errno.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <limits.h>
+
+#ifndef SYS_getcpu
+# ifdef __x86_64__
+# define SYS_getcpu 309
+# else
+# define SYS_getcpu 318
+# endif
+#endif
+
+/* max length of lines in /proc/self/maps - anything longer is skipped here */
+#define MAPS_LINE_LEN 128
+
+int nerrs = 0;
+
+typedef int (*vgettime_t)(clockid_t, struct timespec *);
+
+vgettime_t vdso_clock_gettime;
+
+typedef long (*vgtod_t)(struct timeval *tv, struct timezone *tz);
+
+vgtod_t vdso_gettimeofday;
+
+typedef long (*getcpu_t)(unsigned *, unsigned *, void *);
+
+getcpu_t vgetcpu;
+getcpu_t vdso_getcpu;
+
+static void *vsyscall_getcpu(void)
+{
+#ifdef __x86_64__
+ FILE *maps;
+ char line[MAPS_LINE_LEN];
+ bool found = false;
+
+ maps = fopen("/proc/self/maps", "r");
+ if (!maps) /* might still be present, but ignore it here, as we test vDSO not vsyscall */
+ return NULL;
+
+ while (fgets(line, MAPS_LINE_LEN, maps)) {
+ char r, x;
+ void *start, *end;
+ char name[MAPS_LINE_LEN];
+
+ /* sscanf() is safe here as strlen(name) >= strlen(line) */
+ if (sscanf(line, "%p-%p %c-%cp %*x %*x:%*x %*u %s",
+ &start, &end, &r, &x, name) != 5)
+ continue;
+
+ if (strcmp(name, "[vsyscall]"))
+ continue;
+
+ /* assume entries are OK, as we test vDSO here not vsyscall */
+ found = true;
+ break;
+ }
+
+ fclose(maps);
+
+ if (!found) {
+ printf("Warning: failed to find vsyscall getcpu\n");
+ return NULL;
+ }
+ return (void *) (0xffffffffff600800);
+#else
+ return NULL;
+#endif
+}
+
+
+static void fill_function_pointers()
+{
+ void *vdso = dlopen("linux-vdso.so.1",
+ RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
+ if (!vdso)
+ vdso = dlopen("linux-gate.so.1",
+ RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
+ if (!vdso) {
+ printf("[WARN]\tfailed to find vDSO\n");
+ return;
+ }
+
+ vdso_getcpu = (getcpu_t)dlsym(vdso, "__vdso_getcpu");
+ if (!vdso_getcpu)
+ printf("Warning: failed to find getcpu in vDSO\n");
+
+ vgetcpu = (getcpu_t) vsyscall_getcpu();
+
+ vdso_clock_gettime = (vgettime_t)dlsym(vdso, "__vdso_clock_gettime");
+ if (!vdso_clock_gettime)
+ printf("Warning: failed to find clock_gettime in vDSO\n");
+
+ vdso_gettimeofday = (vgtod_t)dlsym(vdso, "__vdso_gettimeofday");
+ if (!vdso_gettimeofday)
+ printf("Warning: failed to find gettimeofday in vDSO\n");
+
+}
+
+static long sys_getcpu(unsigned * cpu, unsigned * node,
+ void* cache)
+{
+ return syscall(__NR_getcpu, cpu, node, cache);
+}
+
+static inline int sys_clock_gettime(clockid_t id, struct timespec *ts)
+{
+ return syscall(__NR_clock_gettime, id, ts);
+}
+
+static inline int sys_gettimeofday(struct timeval *tv, struct timezone *tz)
+{
+ return syscall(__NR_gettimeofday, tv, tz);
+}
+
+static void test_getcpu(void)
+{
+ printf("[RUN]\tTesting getcpu...\n");
+
+ for (int cpu = 0; ; cpu++) {
+ cpu_set_t cpuset;
+ CPU_ZERO(&cpuset);
+ CPU_SET(cpu, &cpuset);
+ if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
+ return;
+
+ unsigned cpu_sys, cpu_vdso, cpu_vsys,
+ node_sys, node_vdso, node_vsys;
+ long ret_sys, ret_vdso = 1, ret_vsys = 1;
+ unsigned node;
+
+ ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0);
+ if (vdso_getcpu)
+ ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0);
+ if (vgetcpu)
+ ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0);
+
+ if (!ret_sys)
+ node = node_sys;
+ else if (!ret_vdso)
+ node = node_vdso;
+ else if (!ret_vsys)
+ node = node_vsys;
+
+ bool ok = true;
+ if (!ret_sys && (cpu_sys != cpu || node_sys != node))
+ ok = false;
+ if (!ret_vdso && (cpu_vdso != cpu || node_vdso != node))
+ ok = false;
+ if (!ret_vsys && (cpu_vsys != cpu || node_vsys != node))
+ ok = false;
+
+ printf("[%s]\tCPU %u:", ok ? "OK" : "FAIL", cpu);
+ if (!ret_sys)
+ printf(" syscall: cpu %u, node %u", cpu_sys, node_sys);
+ if (!ret_vdso)
+ printf(" vdso: cpu %u, node %u", cpu_vdso, node_vdso);
+ if (!ret_vsys)
+ printf(" vsyscall: cpu %u, node %u", cpu_vsys,
+ node_vsys);
+ printf("\n");
+
+ if (!ok)
+ nerrs++;
+ }
+}
+
+static bool ts_leq(const struct timespec *a, const struct timespec *b)
+{
+ if (a->tv_sec != b->tv_sec)
+ return a->tv_sec < b->tv_sec;
+ else
+ return a->tv_nsec <= b->tv_nsec;
+}
+
+static bool tv_leq(const struct timeval *a, const struct timeval *b)
+{
+ if (a->tv_sec != b->tv_sec)
+ return a->tv_sec < b->tv_sec;
+ else
+ return a->tv_usec <= b->tv_usec;
+}
+
+static char const * const clocknames[] = {
+ [0] = "CLOCK_REALTIME",
+ [1] = "CLOCK_MONOTONIC",
+ [2] = "CLOCK_PROCESS_CPUTIME_ID",
+ [3] = "CLOCK_THREAD_CPUTIME_ID",
+ [4] = "CLOCK_MONOTONIC_RAW",
+ [5] = "CLOCK_REALTIME_COARSE",
+ [6] = "CLOCK_MONOTONIC_COARSE",
+ [7] = "CLOCK_BOOTTIME",
+ [8] = "CLOCK_REALTIME_ALARM",
+ [9] = "CLOCK_BOOTTIME_ALARM",
+ [10] = "CLOCK_SGI_CYCLE",
+ [11] = "CLOCK_TAI",
+};
+
+static void test_one_clock_gettime(int clock, const char *name)
+{
+ struct timespec start, vdso, end;
+ int vdso_ret, end_ret;
+
+ printf("[RUN]\tTesting clock_gettime for clock %s (%d)...\n", name, clock);
+
+ if (sys_clock_gettime(clock, &start) < 0) {
+ if (errno == EINVAL) {
+ vdso_ret = vdso_clock_gettime(clock, &vdso);
+ if (vdso_ret == -EINVAL) {
+ printf("[OK]\tNo such clock.\n");
+ } else {
+ printf("[FAIL]\tNo such clock, but __vdso_clock_gettime returned %d\n", vdso_ret);
+ nerrs++;
+ }
+ } else {
+ printf("[WARN]\t clock_gettime(%d) syscall returned error %d\n", clock, errno);
+ }
+ return;
+ }
+
+ vdso_ret = vdso_clock_gettime(clock, &vdso);
+ end_ret = sys_clock_gettime(clock, &end);
+
+ if (vdso_ret != 0 || end_ret != 0) {
+ printf("[FAIL]\tvDSO returned %d, syscall errno=%d\n",
+ vdso_ret, errno);
+ nerrs++;
+ return;
+ }
+
+ printf("\t%llu.%09ld %llu.%09ld %llu.%09ld\n",
+ (unsigned long long)start.tv_sec, start.tv_nsec,
+ (unsigned long long)vdso.tv_sec, vdso.tv_nsec,
+ (unsigned long long)end.tv_sec, end.tv_nsec);
+
+ if (!ts_leq(&start, &vdso) || !ts_leq(&vdso, &end)) {
+ printf("[FAIL]\tTimes are out of sequence\n");
+ nerrs++;
+ }
+}
+
+static void test_clock_gettime(void)
+{
+ for (int clock = 0; clock < sizeof(clocknames) / sizeof(clocknames[0]);
+ clock++) {
+ test_one_clock_gettime(clock, clocknames[clock]);
+ }
+
+ /* Also test some invalid clock ids */
+ test_one_clock_gettime(-1, "invalid");
+ test_one_clock_gettime(INT_MIN, "invalid");
+ test_one_clock_gettime(INT_MAX, "invalid");
+}
+
+static void test_gettimeofday(void)
+{
+ struct timeval start, vdso, end;
+ struct timezone sys_tz, vdso_tz;
+ int vdso_ret, end_ret;
+
+ if (!vdso_gettimeofday)
+ return;
+
+ printf("[RUN]\tTesting gettimeofday...\n");
+
+ if (sys_gettimeofday(&start, &sys_tz) < 0) {
+ printf("[FAIL]\tsys_gettimeofday failed (%d)\n", errno);
+ nerrs++;
+ return;
+ }
+
+ vdso_ret = vdso_gettimeofday(&vdso, &vdso_tz);
+ end_ret = sys_gettimeofday(&end, NULL);
+
+ if (vdso_ret != 0 || end_ret != 0) {
+ printf("[FAIL]\tvDSO returned %d, syscall errno=%d\n",
+ vdso_ret, errno);
+ nerrs++;
+ return;
+ }
+
+ printf("\t%llu.%06ld %llu.%06ld %llu.%06ld\n",
+ (unsigned long long)start.tv_sec, start.tv_usec,
+ (unsigned long long)vdso.tv_sec, vdso.tv_usec,
+ (unsigned long long)end.tv_sec, end.tv_usec);
+
+ if (!tv_leq(&start, &vdso) || !tv_leq(&vdso, &end)) {
+ printf("[FAIL]\tTimes are out of sequence\n");
+ nerrs++;
+ }
+
+ if (sys_tz.tz_minuteswest == vdso_tz.tz_minuteswest &&
+ sys_tz.tz_dsttime == vdso_tz.tz_dsttime) {
+ printf("[OK]\ttimezones match: minuteswest=%d, dsttime=%d\n",
+ sys_tz.tz_minuteswest, sys_tz.tz_dsttime);
+ } else {
+ printf("[FAIL]\ttimezones do not match\n");
+ nerrs++;
+ }
+
+ /* And make sure that passing NULL for tz doesn't crash. */
+ vdso_gettimeofday(&vdso, NULL);
+}
+
+int main(int argc, char **argv)
+{
+ fill_function_pointers();
+
+ test_clock_gettime();
+ test_gettimeofday();
+
+ /*
+ * Test getcpu() last so that, if something goes wrong setting affinity,
+ * we still run the other tests.
+ */
+ test_getcpu();
+
+ return nerrs ? 1 : 0;
+}
diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c
new file mode 100644
index 000000000..0b4f1cc22
--- /dev/null
+++ b/tools/testing/selftests/x86/test_vsyscall.c
@@ -0,0 +1,506 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <sys/time.h>
+#include <time.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <dlfcn.h>
+#include <string.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <sys/ucontext.h>
+#include <errno.h>
+#include <err.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <setjmp.h>
+
+#ifdef __x86_64__
+# define VSYS(x) (x)
+#else
+# define VSYS(x) 0
+#endif
+
+#ifndef SYS_getcpu
+# ifdef __x86_64__
+# define SYS_getcpu 309
+# else
+# define SYS_getcpu 318
+# endif
+#endif
+
+/* max length of lines in /proc/self/maps - anything longer is skipped here */
+#define MAPS_LINE_LEN 128
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+ int flags)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = handler;
+ sa.sa_flags = SA_SIGINFO | flags;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+}
+
+/* vsyscalls and vDSO */
+bool should_read_vsyscall = false;
+
+typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz);
+gtod_t vgtod = (gtod_t)VSYS(0xffffffffff600000);
+gtod_t vdso_gtod;
+
+typedef int (*vgettime_t)(clockid_t, struct timespec *);
+vgettime_t vdso_gettime;
+
+typedef long (*time_func_t)(time_t *t);
+time_func_t vtime = (time_func_t)VSYS(0xffffffffff600400);
+time_func_t vdso_time;
+
+typedef long (*getcpu_t)(unsigned *, unsigned *, void *);
+getcpu_t vgetcpu = (getcpu_t)VSYS(0xffffffffff600800);
+getcpu_t vdso_getcpu;
+
+static void init_vdso(void)
+{
+ void *vdso = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
+ if (!vdso)
+ vdso = dlopen("linux-gate.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
+ if (!vdso) {
+ printf("[WARN]\tfailed to find vDSO\n");
+ return;
+ }
+
+ vdso_gtod = (gtod_t)dlsym(vdso, "__vdso_gettimeofday");
+ if (!vdso_gtod)
+ printf("[WARN]\tfailed to find gettimeofday in vDSO\n");
+
+ vdso_gettime = (vgettime_t)dlsym(vdso, "__vdso_clock_gettime");
+ if (!vdso_gettime)
+ printf("[WARN]\tfailed to find clock_gettime in vDSO\n");
+
+ vdso_time = (time_func_t)dlsym(vdso, "__vdso_time");
+ if (!vdso_time)
+ printf("[WARN]\tfailed to find time in vDSO\n");
+
+ vdso_getcpu = (getcpu_t)dlsym(vdso, "__vdso_getcpu");
+ if (!vdso_getcpu) {
+ /* getcpu() was never wired up in the 32-bit vDSO. */
+ printf("[%s]\tfailed to find getcpu in vDSO\n",
+ sizeof(long) == 8 ? "WARN" : "NOTE");
+ }
+}
+
+static int init_vsys(void)
+{
+#ifdef __x86_64__
+ int nerrs = 0;
+ FILE *maps;
+ char line[MAPS_LINE_LEN];
+ bool found = false;
+
+ maps = fopen("/proc/self/maps", "r");
+ if (!maps) {
+ printf("[WARN]\tCould not open /proc/self/maps -- assuming vsyscall is r-x\n");
+ should_read_vsyscall = true;
+ return 0;
+ }
+
+ while (fgets(line, MAPS_LINE_LEN, maps)) {
+ char r, x;
+ void *start, *end;
+ char name[MAPS_LINE_LEN];
+
+ /* sscanf() is safe here as strlen(name) >= strlen(line) */
+ if (sscanf(line, "%p-%p %c-%cp %*x %*x:%*x %*u %s",
+ &start, &end, &r, &x, name) != 5)
+ continue;
+
+ if (strcmp(name, "[vsyscall]"))
+ continue;
+
+ printf("\tvsyscall map: %s", line);
+
+ if (start != (void *)0xffffffffff600000 ||
+ end != (void *)0xffffffffff601000) {
+ printf("[FAIL]\taddress range is nonsense\n");
+ nerrs++;
+ }
+
+ printf("\tvsyscall permissions are %c-%c\n", r, x);
+ should_read_vsyscall = (r == 'r');
+ if (x != 'x') {
+ vgtod = NULL;
+ vtime = NULL;
+ vgetcpu = NULL;
+ }
+
+ found = true;
+ break;
+ }
+
+ fclose(maps);
+
+ if (!found) {
+ printf("\tno vsyscall map in /proc/self/maps\n");
+ should_read_vsyscall = false;
+ vgtod = NULL;
+ vtime = NULL;
+ vgetcpu = NULL;
+ }
+
+ return nerrs;
+#else
+ return 0;
+#endif
+}
+
+/* syscalls */
+static inline long sys_gtod(struct timeval *tv, struct timezone *tz)
+{
+ return syscall(SYS_gettimeofday, tv, tz);
+}
+
+static inline int sys_clock_gettime(clockid_t id, struct timespec *ts)
+{
+ return syscall(SYS_clock_gettime, id, ts);
+}
+
+static inline long sys_time(time_t *t)
+{
+ return syscall(SYS_time, t);
+}
+
+static inline long sys_getcpu(unsigned * cpu, unsigned * node,
+ void* cache)
+{
+ return syscall(SYS_getcpu, cpu, node, cache);
+}
+
+static jmp_buf jmpbuf;
+
+static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
+{
+ siglongjmp(jmpbuf, 1);
+}
+
+static double tv_diff(const struct timeval *a, const struct timeval *b)
+{
+ return (double)(a->tv_sec - b->tv_sec) +
+ (double)((int)a->tv_usec - (int)b->tv_usec) * 1e-6;
+}
+
+static int check_gtod(const struct timeval *tv_sys1,
+ const struct timeval *tv_sys2,
+ const struct timezone *tz_sys,
+ const char *which,
+ const struct timeval *tv_other,
+ const struct timezone *tz_other)
+{
+ int nerrs = 0;
+ double d1, d2;
+
+ if (tz_other && (tz_sys->tz_minuteswest != tz_other->tz_minuteswest || tz_sys->tz_dsttime != tz_other->tz_dsttime)) {
+ printf("[FAIL] %s tz mismatch\n", which);
+ nerrs++;
+ }
+
+ d1 = tv_diff(tv_other, tv_sys1);
+ d2 = tv_diff(tv_sys2, tv_other);
+ printf("\t%s time offsets: %lf %lf\n", which, d1, d2);
+
+ if (d1 < 0 || d2 < 0) {
+ printf("[FAIL]\t%s time was inconsistent with the syscall\n", which);
+ nerrs++;
+ } else {
+ printf("[OK]\t%s gettimeofday()'s timeval was okay\n", which);
+ }
+
+ return nerrs;
+}
+
+static int test_gtod(void)
+{
+ struct timeval tv_sys1, tv_sys2, tv_vdso, tv_vsys;
+ struct timezone tz_sys, tz_vdso, tz_vsys;
+ long ret_vdso = -1;
+ long ret_vsys = -1;
+ int nerrs = 0;
+
+ printf("[RUN]\ttest gettimeofday()\n");
+
+ if (sys_gtod(&tv_sys1, &tz_sys) != 0)
+ err(1, "syscall gettimeofday");
+ if (vdso_gtod)
+ ret_vdso = vdso_gtod(&tv_vdso, &tz_vdso);
+ if (vgtod)
+ ret_vsys = vgtod(&tv_vsys, &tz_vsys);
+ if (sys_gtod(&tv_sys2, &tz_sys) != 0)
+ err(1, "syscall gettimeofday");
+
+ if (vdso_gtod) {
+ if (ret_vdso == 0) {
+ nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vDSO", &tv_vdso, &tz_vdso);
+ } else {
+ printf("[FAIL]\tvDSO gettimeofday() failed: %ld\n", ret_vdso);
+ nerrs++;
+ }
+ }
+
+ if (vgtod) {
+ if (ret_vsys == 0) {
+ nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vsyscall", &tv_vsys, &tz_vsys);
+ } else {
+ printf("[FAIL]\tvsys gettimeofday() failed: %ld\n", ret_vsys);
+ nerrs++;
+ }
+ }
+
+ return nerrs;
+}
+
+static int test_time(void) {
+ int nerrs = 0;
+
+ printf("[RUN]\ttest time()\n");
+ long t_sys1, t_sys2, t_vdso = 0, t_vsys = 0;
+ long t2_sys1 = -1, t2_sys2 = -1, t2_vdso = -1, t2_vsys = -1;
+ t_sys1 = sys_time(&t2_sys1);
+ if (vdso_time)
+ t_vdso = vdso_time(&t2_vdso);
+ if (vtime)
+ t_vsys = vtime(&t2_vsys);
+ t_sys2 = sys_time(&t2_sys2);
+ if (t_sys1 < 0 || t_sys1 != t2_sys1 || t_sys2 < 0 || t_sys2 != t2_sys2) {
+ printf("[FAIL]\tsyscall failed (ret1:%ld output1:%ld ret2:%ld output2:%ld)\n", t_sys1, t2_sys1, t_sys2, t2_sys2);
+ nerrs++;
+ return nerrs;
+ }
+
+ if (vdso_time) {
+ if (t_vdso < 0 || t_vdso != t2_vdso) {
+ printf("[FAIL]\tvDSO failed (ret:%ld output:%ld)\n", t_vdso, t2_vdso);
+ nerrs++;
+ } else if (t_vdso < t_sys1 || t_vdso > t_sys2) {
+ printf("[FAIL]\tvDSO returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vdso, t_sys2);
+ nerrs++;
+ } else {
+ printf("[OK]\tvDSO time() is okay\n");
+ }
+ }
+
+ if (vtime) {
+ if (t_vsys < 0 || t_vsys != t2_vsys) {
+ printf("[FAIL]\tvsyscall failed (ret:%ld output:%ld)\n", t_vsys, t2_vsys);
+ nerrs++;
+ } else if (t_vsys < t_sys1 || t_vsys > t_sys2) {
+ printf("[FAIL]\tvsyscall returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vsys, t_sys2);
+ nerrs++;
+ } else {
+ printf("[OK]\tvsyscall time() is okay\n");
+ }
+ }
+
+ return nerrs;
+}
+
+static int test_getcpu(int cpu)
+{
+ int nerrs = 0;
+ long ret_sys, ret_vdso = -1, ret_vsys = -1;
+
+ printf("[RUN]\tgetcpu() on CPU %d\n", cpu);
+
+ cpu_set_t cpuset;
+ CPU_ZERO(&cpuset);
+ CPU_SET(cpu, &cpuset);
+ if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) {
+ printf("[SKIP]\tfailed to force CPU %d\n", cpu);
+ return nerrs;
+ }
+
+ unsigned cpu_sys, cpu_vdso, cpu_vsys, node_sys, node_vdso, node_vsys;
+ unsigned node = 0;
+ bool have_node = false;
+ ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0);
+ if (vdso_getcpu)
+ ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0);
+ if (vgetcpu)
+ ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0);
+
+ if (ret_sys == 0) {
+ if (cpu_sys != cpu) {
+ printf("[FAIL]\tsyscall reported CPU %hu but should be %d\n", cpu_sys, cpu);
+ nerrs++;
+ }
+
+ have_node = true;
+ node = node_sys;
+ }
+
+ if (vdso_getcpu) {
+ if (ret_vdso) {
+ printf("[FAIL]\tvDSO getcpu() failed\n");
+ nerrs++;
+ } else {
+ if (!have_node) {
+ have_node = true;
+ node = node_vdso;
+ }
+
+ if (cpu_vdso != cpu) {
+ printf("[FAIL]\tvDSO reported CPU %hu but should be %d\n", cpu_vdso, cpu);
+ nerrs++;
+ } else {
+ printf("[OK]\tvDSO reported correct CPU\n");
+ }
+
+ if (node_vdso != node) {
+ printf("[FAIL]\tvDSO reported node %hu but should be %hu\n", node_vdso, node);
+ nerrs++;
+ } else {
+ printf("[OK]\tvDSO reported correct node\n");
+ }
+ }
+ }
+
+ if (vgetcpu) {
+ if (ret_vsys) {
+ printf("[FAIL]\tvsyscall getcpu() failed\n");
+ nerrs++;
+ } else {
+ if (!have_node) {
+ have_node = true;
+ node = node_vsys;
+ }
+
+ if (cpu_vsys != cpu) {
+ printf("[FAIL]\tvsyscall reported CPU %hu but should be %d\n", cpu_vsys, cpu);
+ nerrs++;
+ } else {
+ printf("[OK]\tvsyscall reported correct CPU\n");
+ }
+
+ if (node_vsys != node) {
+ printf("[FAIL]\tvsyscall reported node %hu but should be %hu\n", node_vsys, node);
+ nerrs++;
+ } else {
+ printf("[OK]\tvsyscall reported correct node\n");
+ }
+ }
+ }
+
+ return nerrs;
+}
+
+static int test_vsys_r(void)
+{
+#ifdef __x86_64__
+ printf("[RUN]\tChecking read access to the vsyscall page\n");
+ bool can_read;
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ *(volatile int *)0xffffffffff600000;
+ can_read = true;
+ } else {
+ can_read = false;
+ }
+
+ if (can_read && !should_read_vsyscall) {
+ printf("[FAIL]\tWe have read access, but we shouldn't\n");
+ return 1;
+ } else if (!can_read && should_read_vsyscall) {
+ printf("[FAIL]\tWe don't have read access, but we should\n");
+ return 1;
+ } else {
+ printf("[OK]\tgot expected result\n");
+ }
+#endif
+
+ return 0;
+}
+
+
+#ifdef __x86_64__
+#define X86_EFLAGS_TF (1UL << 8)
+static volatile sig_atomic_t num_vsyscall_traps;
+
+static unsigned long get_eflags(void)
+{
+ unsigned long eflags;
+ asm volatile ("pushfq\n\tpopq %0" : "=rm" (eflags));
+ return eflags;
+}
+
+static void set_eflags(unsigned long eflags)
+{
+ asm volatile ("pushq %0\n\tpopfq" : : "rm" (eflags) : "flags");
+}
+
+static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
+{
+ ucontext_t *ctx = (ucontext_t *)ctx_void;
+ unsigned long ip = ctx->uc_mcontext.gregs[REG_RIP];
+
+ if (((ip ^ 0xffffffffff600000UL) & ~0xfffUL) == 0)
+ num_vsyscall_traps++;
+}
+
+static int test_emulation(void)
+{
+ time_t tmp;
+ bool is_native;
+
+ if (!vtime)
+ return 0;
+
+ printf("[RUN]\tchecking that vsyscalls are emulated\n");
+ sethandler(SIGTRAP, sigtrap, 0);
+ set_eflags(get_eflags() | X86_EFLAGS_TF);
+ vtime(&tmp);
+ set_eflags(get_eflags() & ~X86_EFLAGS_TF);
+
+ /*
+ * If vsyscalls are emulated, we expect a single trap in the
+ * vsyscall page -- the call instruction will trap with RIP
+ * pointing to the entry point before emulation takes over.
+ * In native mode, we expect two traps, since whatever code
+ * the vsyscall page contains will be more than just a ret
+ * instruction.
+ */
+ is_native = (num_vsyscall_traps > 1);
+
+ printf("[%s]\tvsyscalls are %s (%d instructions in vsyscall page)\n",
+ (is_native ? "FAIL" : "OK"),
+ (is_native ? "native" : "emulated"),
+ (int)num_vsyscall_traps);
+
+ return is_native;
+}
+#endif
+
+int main(int argc, char **argv)
+{
+ int nerrs = 0;
+
+ init_vdso();
+ nerrs += init_vsys();
+
+ nerrs += test_gtod();
+ nerrs += test_time();
+ nerrs += test_getcpu(0);
+ nerrs += test_getcpu(1);
+
+ sethandler(SIGSEGV, sigsegv, 0);
+ nerrs += test_vsys_r();
+
+#ifdef __x86_64__
+ nerrs += test_emulation();
+#endif
+
+ return nerrs ? 1 : 0;
+}
diff --git a/tools/testing/selftests/x86/thunks.S b/tools/testing/selftests/x86/thunks.S
new file mode 100644
index 000000000..ce8a995bb
--- /dev/null
+++ b/tools/testing/selftests/x86/thunks.S
@@ -0,0 +1,67 @@
+/*
+ * thunks.S - assembly helpers for mixed-bitness code
+ * Copyright (c) 2015 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * These are little helpers that make it easier to switch bitness on
+ * the fly.
+ */
+
+ .text
+
+ .global call32_from_64
+ .type call32_from_64, @function
+call32_from_64:
+ // rdi: stack to use
+ // esi: function to call
+
+ // Save registers
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushfq
+
+ // Switch stacks
+ mov %rsp,(%rdi)
+ mov %rdi,%rsp
+
+ // Switch to compatibility mode
+ pushq $0x23 /* USER32_CS */
+ pushq $1f
+ lretq
+
+1:
+ .code32
+ // Call the function
+ call *%esi
+ // Switch back to long mode
+ jmp $0x33,$1f
+ .code64
+
+1:
+ // Restore the stack
+ mov (%rsp),%rsp
+
+ // Restore registers
+ popfq
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbp
+ popq %rbx
+
+ ret
+
+.size call32_from_64, .-call32_from_64
diff --git a/tools/testing/selftests/x86/thunks_32.S b/tools/testing/selftests/x86/thunks_32.S
new file mode 100644
index 000000000..29b644bb9
--- /dev/null
+++ b/tools/testing/selftests/x86/thunks_32.S
@@ -0,0 +1,55 @@
+/*
+ * thunks_32.S - assembly helpers for mixed-bitness code
+ * Copyright (c) 2015 Denys Vlasenko
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * These are little helpers that make it easier to switch bitness on
+ * the fly.
+ */
+
+ .text
+ .code32
+
+ .global call64_from_32
+ .type call32_from_64, @function
+
+ // 4(%esp): function to call
+call64_from_32:
+ // Fetch function address
+ mov 4(%esp), %eax
+
+ // Save registers which are callee-clobbered by 64-bit ABI
+ push %ecx
+ push %edx
+ push %esi
+ push %edi
+
+ // Switch to long mode
+ jmp $0x33,$1f
+1: .code64
+
+ // Call the function
+ call *%rax
+
+ // Switch to compatibility mode
+ push $0x23 /* USER32_CS */
+ .code32; push $1f; .code64 /* hack: can't have X86_64_32S relocation in 32-bit ELF */
+ lretq
+1: .code32
+
+ pop %edi
+ pop %esi
+ pop %edx
+ pop %ecx
+
+ ret
+
+.size call64_from_32, .-call64_from_32
diff --git a/tools/testing/selftests/x86/trivial_32bit_program.c b/tools/testing/selftests/x86/trivial_32bit_program.c
new file mode 100644
index 000000000..fabdf0f51
--- /dev/null
+++ b/tools/testing/selftests/x86/trivial_32bit_program.c
@@ -0,0 +1,18 @@
+/*
+ * Trivial program to check that we have a valid 32-bit build environment.
+ * Copyright (c) 2015 Andy Lutomirski
+ * GPL v2
+ */
+
+#ifndef __i386__
+# error wrong architecture
+#endif
+
+#include <stdio.h>
+
+int main()
+{
+ printf("\n");
+
+ return 0;
+}
diff --git a/tools/testing/selftests/x86/trivial_64bit_program.c b/tools/testing/selftests/x86/trivial_64bit_program.c
new file mode 100644
index 000000000..05c6a41b3
--- /dev/null
+++ b/tools/testing/selftests/x86/trivial_64bit_program.c
@@ -0,0 +1,18 @@
+/*
+ * Trivial program to check that we have a valid 64-bit build environment.
+ * Copyright (c) 2015 Andy Lutomirski
+ * GPL v2
+ */
+
+#ifndef __x86_64__
+# error wrong architecture
+#endif
+
+#include <stdio.h>
+
+int main()
+{
+ printf("\n");
+
+ return 0;
+}
diff --git a/tools/testing/selftests/x86/trivial_program.c b/tools/testing/selftests/x86/trivial_program.c
new file mode 100644
index 000000000..46a447163
--- /dev/null
+++ b/tools/testing/selftests/x86/trivial_program.c
@@ -0,0 +1,10 @@
+/* Trivial program to check that compilation with certain flags is working. */
+
+#include <stdio.h>
+
+int
+main(void)
+{
+ puts("");
+ return 0;
+}
diff --git a/tools/testing/selftests/x86/unwind_vdso.c b/tools/testing/selftests/x86/unwind_vdso.c
new file mode 100644
index 000000000..00a26a82f
--- /dev/null
+++ b/tools/testing/selftests/x86/unwind_vdso.c
@@ -0,0 +1,211 @@
+/*
+ * unwind_vdso.c - tests unwind info for AT_SYSINFO in the vDSO
+ * Copyright (c) 2014-2015 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * This tests __kernel_vsyscall's unwind info.
+ */
+
+#define _GNU_SOURCE
+
+#include <features.h>
+#include <stdio.h>
+
+#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ < 16
+
+int main()
+{
+ /* We need getauxval(). */
+ printf("[SKIP]\tGLIBC before 2.16 cannot compile this test\n");
+ return 0;
+}
+
+#else
+
+#include <sys/time.h>
+#include <stdlib.h>
+#include <syscall.h>
+#include <unistd.h>
+#include <string.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+#include <signal.h>
+#include <sys/ucontext.h>
+#include <err.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <sys/ptrace.h>
+#include <sys/user.h>
+#include <sys/ucontext.h>
+#include <link.h>
+#include <sys/auxv.h>
+#include <dlfcn.h>
+#include <unwind.h>
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+ int flags)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = handler;
+ sa.sa_flags = SA_SIGINFO | flags;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+}
+
+#ifdef __x86_64__
+# define WIDTH "q"
+#else
+# define WIDTH "l"
+#endif
+
+static unsigned long get_eflags(void)
+{
+ unsigned long eflags;
+ asm volatile ("pushf" WIDTH "\n\tpop" WIDTH " %0" : "=rm" (eflags));
+ return eflags;
+}
+
+static void set_eflags(unsigned long eflags)
+{
+ asm volatile ("push" WIDTH " %0\n\tpopf" WIDTH
+ : : "rm" (eflags) : "flags");
+}
+
+#define X86_EFLAGS_TF (1UL << 8)
+
+static volatile sig_atomic_t nerrs;
+static unsigned long sysinfo;
+static bool got_sysinfo = false;
+static unsigned long return_address;
+
+struct unwind_state {
+ unsigned long ip; /* trap source */
+ int depth; /* -1 until we hit the trap source */
+};
+
+_Unwind_Reason_Code trace_fn(struct _Unwind_Context * ctx, void *opaque)
+{
+ struct unwind_state *state = opaque;
+ unsigned long ip = _Unwind_GetIP(ctx);
+
+ if (state->depth == -1) {
+ if (ip == state->ip)
+ state->depth = 0;
+ else
+ return _URC_NO_REASON; /* Not there yet */
+ }
+ printf("\t 0x%lx\n", ip);
+
+ if (ip == return_address) {
+ /* Here we are. */
+ unsigned long eax = _Unwind_GetGR(ctx, 0);
+ unsigned long ecx = _Unwind_GetGR(ctx, 1);
+ unsigned long edx = _Unwind_GetGR(ctx, 2);
+ unsigned long ebx = _Unwind_GetGR(ctx, 3);
+ unsigned long ebp = _Unwind_GetGR(ctx, 5);
+ unsigned long esi = _Unwind_GetGR(ctx, 6);
+ unsigned long edi = _Unwind_GetGR(ctx, 7);
+ bool ok = (eax == SYS_getpid || eax == getpid()) &&
+ ebx == 1 && ecx == 2 && edx == 3 &&
+ esi == 4 && edi == 5 && ebp == 6;
+
+ if (!ok)
+ nerrs++;
+ printf("[%s]\t NR = %ld, args = %ld, %ld, %ld, %ld, %ld, %ld\n",
+ (ok ? "OK" : "FAIL"),
+ eax, ebx, ecx, edx, esi, edi, ebp);
+
+ return _URC_NORMAL_STOP;
+ } else {
+ state->depth++;
+ return _URC_NO_REASON;
+ }
+}
+
+static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
+{
+ ucontext_t *ctx = (ucontext_t *)ctx_void;
+ struct unwind_state state;
+ unsigned long ip = ctx->uc_mcontext.gregs[REG_EIP];
+
+ if (!got_sysinfo && ip == sysinfo) {
+ got_sysinfo = true;
+
+ /* Find the return address. */
+ return_address = *(unsigned long *)(unsigned long)ctx->uc_mcontext.gregs[REG_ESP];
+
+ printf("\tIn vsyscall at 0x%lx, returning to 0x%lx\n",
+ ip, return_address);
+ }
+
+ if (!got_sysinfo)
+ return; /* Not there yet */
+
+ if (ip == return_address) {
+ ctx->uc_mcontext.gregs[REG_EFL] &= ~X86_EFLAGS_TF;
+ printf("\tVsyscall is done\n");
+ return;
+ }
+
+ printf("\tSIGTRAP at 0x%lx\n", ip);
+
+ state.ip = ip;
+ state.depth = -1;
+ _Unwind_Backtrace(trace_fn, &state);
+}
+
+int main()
+{
+ sysinfo = getauxval(AT_SYSINFO);
+ printf("\tAT_SYSINFO is 0x%lx\n", sysinfo);
+
+ Dl_info info;
+ if (!dladdr((void *)sysinfo, &info)) {
+ printf("[WARN]\tdladdr failed on AT_SYSINFO\n");
+ } else {
+ printf("[OK]\tAT_SYSINFO maps to %s, loaded at 0x%p\n",
+ info.dli_fname, info.dli_fbase);
+ }
+
+ sethandler(SIGTRAP, sigtrap, 0);
+
+ syscall(SYS_getpid); /* Force symbol binding without TF set. */
+ printf("[RUN]\tSet TF and check a fast syscall\n");
+ set_eflags(get_eflags() | X86_EFLAGS_TF);
+ syscall(SYS_getpid, 1, 2, 3, 4, 5, 6);
+ if (!got_sysinfo) {
+ set_eflags(get_eflags() & ~X86_EFLAGS_TF);
+
+ /*
+ * The most likely cause of this is that you're on Debian or
+ * a Debian-based distro, you're missing libc6-i686, and you're
+ * affected by libc/19006 (https://sourceware.org/PR19006).
+ */
+ printf("[WARN]\tsyscall(2) didn't enter AT_SYSINFO\n");
+ }
+
+ if (get_eflags() & X86_EFLAGS_TF) {
+ printf("[FAIL]\tTF is still set\n");
+ nerrs++;
+ }
+
+ if (nerrs) {
+ printf("[FAIL]\tThere were errors\n");
+ return 1;
+ } else {
+ printf("[OK]\tAll is well\n");
+ return 0;
+ }
+}
+
+#endif /* New enough libc */
diff --git a/tools/testing/selftests/x86/vdso_restorer.c b/tools/testing/selftests/x86/vdso_restorer.c
new file mode 100644
index 000000000..cb038424a
--- /dev/null
+++ b/tools/testing/selftests/x86/vdso_restorer.c
@@ -0,0 +1,88 @@
+/*
+ * vdso_restorer.c - tests vDSO-based signal restore
+ * Copyright (c) 2015 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * This makes sure that sa_restorer == NULL keeps working on 32-bit
+ * configurations. Modern glibc doesn't use it under any circumstances,
+ * so it's easy to overlook breakage.
+ *
+ * 64-bit userspace has never supported sa_restorer == NULL, so this is
+ * 32-bit only.
+ */
+
+#define _GNU_SOURCE
+
+#include <err.h>
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <sys/syscall.h>
+
+/* Open-code this -- the headers are too messy to easily use them. */
+struct real_sigaction {
+ void *handler;
+ unsigned long flags;
+ void *restorer;
+ unsigned int mask[2];
+};
+
+static volatile sig_atomic_t handler_called;
+
+static void handler_with_siginfo(int sig, siginfo_t *info, void *ctx_void)
+{
+ handler_called = 1;
+}
+
+static void handler_without_siginfo(int sig)
+{
+ handler_called = 1;
+}
+
+int main()
+{
+ int nerrs = 0;
+ struct real_sigaction sa;
+
+ memset(&sa, 0, sizeof(sa));
+ sa.handler = handler_with_siginfo;
+ sa.flags = SA_SIGINFO;
+ sa.restorer = NULL; /* request kernel-provided restorer */
+
+ if (syscall(SYS_rt_sigaction, SIGUSR1, &sa, NULL, 8) != 0)
+ err(1, "raw rt_sigaction syscall");
+
+ raise(SIGUSR1);
+
+ if (handler_called) {
+ printf("[OK]\tSA_SIGINFO handler returned successfully\n");
+ } else {
+ printf("[FAIL]\tSA_SIGINFO handler was not called\n");
+ nerrs++;
+ }
+
+ sa.flags = 0;
+ sa.handler = handler_without_siginfo;
+ if (syscall(SYS_sigaction, SIGUSR1, &sa, 0) != 0)
+ err(1, "raw sigaction syscall");
+ handler_called = 0;
+
+ raise(SIGUSR1);
+
+ if (handler_called) {
+ printf("[OK]\t!SA_SIGINFO handler returned successfully\n");
+ } else {
+ printf("[FAIL]\t!SA_SIGINFO handler was not called\n");
+ nerrs++;
+ }
+}