summaryrefslogtreecommitdiffstats
path: root/tools/testing/selftests/vm
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-06 01:02:30 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-06 01:02:30 +0000
commit76cb841cb886eef6b3bee341a2266c76578724ad (patch)
treef5892e5ba6cc11949952a6ce4ecbe6d516d6ce58 /tools/testing/selftests/vm
parentInitial commit. (diff)
downloadlinux-76cb841cb886eef6b3bee341a2266c76578724ad.tar.xz
linux-76cb841cb886eef6b3bee341a2266c76578724ad.zip
Adding upstream version 4.19.249.upstream/4.19.249
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'tools/testing/selftests/vm')
-rw-r--r--tools/testing/selftests/vm/.gitignore15
-rw-r--r--tools/testing/selftests/vm/Makefile34
-rw-r--r--tools/testing/selftests/vm/compaction_test.c230
-rw-r--r--tools/testing/selftests/vm/config2
-rw-r--r--tools/testing/selftests/vm/gup_benchmark.c93
-rw-r--r--tools/testing/selftests/vm/hugepage-mmap.c93
-rw-r--r--tools/testing/selftests/vm/hugepage-shm.c101
-rw-r--r--tools/testing/selftests/vm/map_hugetlb.c84
-rw-r--r--tools/testing/selftests/vm/map_populate.c113
-rw-r--r--tools/testing/selftests/vm/mlock-random-test.c294
-rw-r--r--tools/testing/selftests/vm/mlock2-tests.c520
-rw-r--r--tools/testing/selftests/vm/mlock2.h63
-rw-r--r--tools/testing/selftests/vm/on-fault-limit.c48
-rwxr-xr-xtools/testing/selftests/vm/run_vmtests214
-rw-r--r--tools/testing/selftests/vm/thuge-gen.c257
-rw-r--r--tools/testing/selftests/vm/transhuge-stress.c144
-rw-r--r--tools/testing/selftests/vm/userfaultfd.c1333
-rw-r--r--tools/testing/selftests/vm/va_128TBswitch.c297
-rw-r--r--tools/testing/selftests/vm/virtual_address_range.c139
19 files changed, 4074 insertions, 0 deletions
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
new file mode 100644
index 000000000..af5ff83f6
--- /dev/null
+++ b/tools/testing/selftests/vm/.gitignore
@@ -0,0 +1,15 @@
+hugepage-mmap
+hugepage-shm
+map_hugetlb
+map_populate
+thuge-gen
+compaction_test
+mlock2-tests
+on-fault-limit
+transhuge-stress
+userfaultfd
+mlock-intersect-test
+mlock-random-test
+virtual_address_range
+gup_benchmark
+va_128TBswitch
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
new file mode 100644
index 000000000..2cf3dc49b
--- /dev/null
+++ b/tools/testing/selftests/vm/Makefile
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for vm selftests
+
+ifndef OUTPUT
+ OUTPUT := $(shell pwd)
+endif
+
+CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS)
+LDLIBS = -lrt
+TEST_GEN_FILES = compaction_test
+TEST_GEN_FILES += gup_benchmark
+TEST_GEN_FILES += hugepage-mmap
+TEST_GEN_FILES += hugepage-shm
+TEST_GEN_FILES += map_hugetlb
+TEST_GEN_FILES += map_populate
+TEST_GEN_FILES += mlock-random-test
+TEST_GEN_FILES += mlock2-tests
+TEST_GEN_FILES += on-fault-limit
+TEST_GEN_FILES += thuge-gen
+TEST_GEN_FILES += transhuge-stress
+TEST_GEN_FILES += userfaultfd
+TEST_GEN_FILES += va_128TBswitch
+TEST_GEN_FILES += virtual_address_range
+
+TEST_PROGS := run_vmtests
+
+TEST_FILES := test_vmalloc.sh
+
+KSFT_KHDR_INSTALL := 1
+include ../lib.mk
+
+$(OUTPUT)/userfaultfd: LDLIBS += -lpthread
+
+$(OUTPUT)/mlock-random-test: LDLIBS += -lcap
diff --git a/tools/testing/selftests/vm/compaction_test.c b/tools/testing/selftests/vm/compaction_test.c
new file mode 100644
index 000000000..bcec71250
--- /dev/null
+++ b/tools/testing/selftests/vm/compaction_test.c
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * A test for the patch "Allow compaction of unevictable pages".
+ * With this patch we should be able to allocate at least 1/4
+ * of RAM in huge pages. Without the patch much less is
+ * allocated.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "../kselftest.h"
+
+#define MAP_SIZE 1048576
+
+struct map_list {
+ void *map;
+ struct map_list *next;
+};
+
+int read_memory_info(unsigned long *memfree, unsigned long *hugepagesize)
+{
+ char buffer[256] = {0};
+ char *cmd = "cat /proc/meminfo | grep -i memfree | grep -o '[0-9]*'";
+ FILE *cmdfile = popen(cmd, "r");
+
+ if (!(fgets(buffer, sizeof(buffer), cmdfile))) {
+ perror("Failed to read meminfo\n");
+ return -1;
+ }
+
+ pclose(cmdfile);
+
+ *memfree = atoll(buffer);
+ cmd = "cat /proc/meminfo | grep -i hugepagesize | grep -o '[0-9]*'";
+ cmdfile = popen(cmd, "r");
+
+ if (!(fgets(buffer, sizeof(buffer), cmdfile))) {
+ perror("Failed to read meminfo\n");
+ return -1;
+ }
+
+ pclose(cmdfile);
+ *hugepagesize = atoll(buffer);
+
+ return 0;
+}
+
+int prereq(void)
+{
+ char allowed;
+ int fd;
+
+ fd = open("/proc/sys/vm/compact_unevictable_allowed",
+ O_RDONLY | O_NONBLOCK);
+ if (fd < 0) {
+ perror("Failed to open\n"
+ "/proc/sys/vm/compact_unevictable_allowed\n");
+ return -1;
+ }
+
+ if (read(fd, &allowed, sizeof(char)) != sizeof(char)) {
+ perror("Failed to read from\n"
+ "/proc/sys/vm/compact_unevictable_allowed\n");
+ close(fd);
+ return -1;
+ }
+
+ close(fd);
+ if (allowed == '1')
+ return 0;
+
+ return -1;
+}
+
+int check_compaction(unsigned long mem_free, unsigned int hugepage_size)
+{
+ int fd;
+ int compaction_index = 0;
+ char initial_nr_hugepages[10] = {0};
+ char nr_hugepages[10] = {0};
+
+ /* We want to test with 80% of available memory. Else, OOM killer comes
+ in to play */
+ mem_free = mem_free * 0.8;
+
+ fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK);
+ if (fd < 0) {
+ perror("Failed to open /proc/sys/vm/nr_hugepages");
+ return -1;
+ }
+
+ if (read(fd, initial_nr_hugepages, sizeof(initial_nr_hugepages)) <= 0) {
+ perror("Failed to read from /proc/sys/vm/nr_hugepages");
+ goto close_fd;
+ }
+
+ /* Start with the initial condition of 0 huge pages*/
+ if (write(fd, "0", sizeof(char)) != sizeof(char)) {
+ perror("Failed to write 0 to /proc/sys/vm/nr_hugepages\n");
+ goto close_fd;
+ }
+
+ lseek(fd, 0, SEEK_SET);
+
+ /* Request a large number of huge pages. The Kernel will allocate
+ as much as it can */
+ if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) {
+ perror("Failed to write 100000 to /proc/sys/vm/nr_hugepages\n");
+ goto close_fd;
+ }
+
+ lseek(fd, 0, SEEK_SET);
+
+ if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) {
+ perror("Failed to re-read from /proc/sys/vm/nr_hugepages\n");
+ goto close_fd;
+ }
+
+ /* We should have been able to request at least 1/3 rd of the memory in
+ huge pages */
+ compaction_index = mem_free/(atoi(nr_hugepages) * hugepage_size);
+
+ if (compaction_index > 3) {
+ printf("No of huge pages allocated = %d\n",
+ (atoi(nr_hugepages)));
+ fprintf(stderr, "ERROR: Less that 1/%d of memory is available\n"
+ "as huge pages\n", compaction_index);
+ goto close_fd;
+ }
+
+ printf("No of huge pages allocated = %d\n",
+ (atoi(nr_hugepages)));
+
+ lseek(fd, 0, SEEK_SET);
+
+ if (write(fd, initial_nr_hugepages, strlen(initial_nr_hugepages))
+ != strlen(initial_nr_hugepages)) {
+ perror("Failed to write value to /proc/sys/vm/nr_hugepages\n");
+ goto close_fd;
+ }
+
+ close(fd);
+ return 0;
+
+ close_fd:
+ close(fd);
+ printf("Not OK. Compaction test failed.");
+ return -1;
+}
+
+
+int main(int argc, char **argv)
+{
+ struct rlimit lim;
+ struct map_list *list, *entry;
+ size_t page_size, i;
+ void *map = NULL;
+ unsigned long mem_free = 0;
+ unsigned long hugepage_size = 0;
+ unsigned long mem_fragmentable = 0;
+
+ if (prereq() != 0) {
+ printf("Either the sysctl compact_unevictable_allowed is not\n"
+ "set to 1 or couldn't read the proc file.\n"
+ "Skipping the test\n");
+ return KSFT_SKIP;
+ }
+
+ lim.rlim_cur = RLIM_INFINITY;
+ lim.rlim_max = RLIM_INFINITY;
+ if (setrlimit(RLIMIT_MEMLOCK, &lim)) {
+ perror("Failed to set rlimit:\n");
+ return -1;
+ }
+
+ page_size = getpagesize();
+
+ list = NULL;
+
+ if (read_memory_info(&mem_free, &hugepage_size) != 0) {
+ printf("ERROR: Cannot read meminfo\n");
+ return -1;
+ }
+
+ mem_fragmentable = mem_free * 0.8 / 1024;
+
+ while (mem_fragmentable > 0) {
+ map = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE | MAP_LOCKED, -1, 0);
+ if (map == MAP_FAILED)
+ break;
+
+ entry = malloc(sizeof(struct map_list));
+ if (!entry) {
+ munmap(map, MAP_SIZE);
+ break;
+ }
+ entry->map = map;
+ entry->next = list;
+ list = entry;
+
+ /* Write something (in this case the address of the map) to
+ * ensure that KSM can't merge the mapped pages
+ */
+ for (i = 0; i < MAP_SIZE; i += page_size)
+ *(unsigned long *)(map + i) = (unsigned long)map + i;
+
+ mem_fragmentable--;
+ }
+
+ for (entry = list; entry != NULL; entry = entry->next) {
+ munmap(entry->map, MAP_SIZE);
+ if (!entry->next)
+ break;
+ entry = entry->next;
+ }
+
+ if (check_compaction(mem_free, hugepage_size) == 0)
+ return 0;
+
+ return -1;
+}
diff --git a/tools/testing/selftests/vm/config b/tools/testing/selftests/vm/config
new file mode 100644
index 000000000..1c0d76cb5
--- /dev/null
+++ b/tools/testing/selftests/vm/config
@@ -0,0 +1,2 @@
+CONFIG_SYSVIPC=y
+CONFIG_USERFAULTFD=y
diff --git a/tools/testing/selftests/vm/gup_benchmark.c b/tools/testing/selftests/vm/gup_benchmark.c
new file mode 100644
index 000000000..17da711f2
--- /dev/null
+++ b/tools/testing/selftests/vm/gup_benchmark.c
@@ -0,0 +1,93 @@
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/prctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <linux/types.h>
+
+#define MB (1UL << 20)
+#define PAGE_SIZE sysconf(_SC_PAGESIZE)
+
+#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark)
+
+struct gup_benchmark {
+ __u64 delta_usec;
+ __u64 addr;
+ __u64 size;
+ __u32 nr_pages_per_call;
+ __u32 flags;
+ __u64 expansion[10]; /* For future use */
+};
+
+int main(int argc, char **argv)
+{
+ struct gup_benchmark gup;
+ unsigned long size = 128 * MB;
+ int i, fd, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0;
+ char *p;
+
+ while ((opt = getopt(argc, argv, "m:r:n:tT")) != -1) {
+ switch (opt) {
+ case 'm':
+ size = atoi(optarg) * MB;
+ break;
+ case 'r':
+ repeats = atoi(optarg);
+ break;
+ case 'n':
+ nr_pages = atoi(optarg);
+ break;
+ case 't':
+ thp = 1;
+ break;
+ case 'T':
+ thp = 0;
+ break;
+ case 'w':
+ write = 1;
+ break;
+ default:
+ return -1;
+ }
+ }
+
+ gup.nr_pages_per_call = nr_pages;
+ gup.flags = write;
+
+ fd = open("/sys/kernel/debug/gup_benchmark", O_RDWR);
+ if (fd == -1)
+ perror("open"), exit(1);
+
+ p = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (p == MAP_FAILED)
+ perror("mmap"), exit(1);
+ gup.addr = (unsigned long)p;
+
+ if (thp == 1)
+ madvise(p, size, MADV_HUGEPAGE);
+ else if (thp == 0)
+ madvise(p, size, MADV_NOHUGEPAGE);
+
+ for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE)
+ p[0] = 0;
+
+ for (i = 0; i < repeats; i++) {
+ gup.size = size;
+ if (ioctl(fd, GUP_FAST_BENCHMARK, &gup))
+ perror("ioctl"), exit(1);
+
+ printf("Time: %lld us", gup.delta_usec);
+ if (gup.size != size)
+ printf(", truncated (size: %lld)", gup.size);
+ printf("\n");
+ }
+
+ return 0;
+}
diff --git a/tools/testing/selftests/vm/hugepage-mmap.c b/tools/testing/selftests/vm/hugepage-mmap.c
new file mode 100644
index 000000000..93f9e7b81
--- /dev/null
+++ b/tools/testing/selftests/vm/hugepage-mmap.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugepage-mmap:
+ *
+ * Example of using huge page memory in a user application using the mmap
+ * system call. Before running this application, make sure that the
+ * administrator has mounted the hugetlbfs filesystem (on some directory
+ * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this
+ * example, the app is requesting memory of size 256MB that is backed by
+ * huge pages.
+ *
+ * For the ia64 architecture, the Linux kernel reserves Region number 4 for
+ * huge pages. That means that if one requires a fixed address, a huge page
+ * aligned address starting with 0x800000... will be required. If a fixed
+ * address is not required, the kernel will select an address in the proper
+ * range.
+ * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#define FILE_NAME "huge/hugepagefile"
+#define LENGTH (256UL*1024*1024)
+#define PROTECTION (PROT_READ | PROT_WRITE)
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define FLAGS (MAP_SHARED | MAP_FIXED)
+#else
+#define ADDR (void *)(0x0UL)
+#define FLAGS (MAP_SHARED)
+#endif
+
+static void check_bytes(char *addr)
+{
+ printf("First hex is %x\n", *((unsigned int *)addr));
+}
+
+static void write_bytes(char *addr)
+{
+ unsigned long i;
+
+ for (i = 0; i < LENGTH; i++)
+ *(addr + i) = (char)i;
+}
+
+static int read_bytes(char *addr)
+{
+ unsigned long i;
+
+ check_bytes(addr);
+ for (i = 0; i < LENGTH; i++)
+ if (*(addr + i) != (char)i) {
+ printf("Mismatch at %lu\n", i);
+ return 1;
+ }
+ return 0;
+}
+
+int main(void)
+{
+ void *addr;
+ int fd, ret;
+
+ fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755);
+ if (fd < 0) {
+ perror("Open failed");
+ exit(1);
+ }
+
+ addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0);
+ if (addr == MAP_FAILED) {
+ perror("mmap");
+ unlink(FILE_NAME);
+ exit(1);
+ }
+
+ printf("Returned address is %p\n", addr);
+ check_bytes(addr);
+ write_bytes(addr);
+ ret = read_bytes(addr);
+
+ munmap(addr, LENGTH);
+ close(fd);
+ unlink(FILE_NAME);
+
+ return ret;
+}
diff --git a/tools/testing/selftests/vm/hugepage-shm.c b/tools/testing/selftests/vm/hugepage-shm.c
new file mode 100644
index 000000000..e2527f320
--- /dev/null
+++ b/tools/testing/selftests/vm/hugepage-shm.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugepage-shm:
+ *
+ * Example of using huge page memory in a user application using Sys V shared
+ * memory system calls. In this example the app is requesting 256MB of
+ * memory that is backed by huge pages. The application uses the flag
+ * SHM_HUGETLB in the shmget system call to inform the kernel that it is
+ * requesting huge pages.
+ *
+ * For the ia64 architecture, the Linux kernel reserves Region number 4 for
+ * huge pages. That means that if one requires a fixed address, a huge page
+ * aligned address starting with 0x800000... will be required. If a fixed
+ * address is not required, the kernel will select an address in the proper
+ * range.
+ * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
+ *
+ * Note: The default shared memory limit is quite low on many kernels,
+ * you may need to increase it via:
+ *
+ * echo 268435456 > /proc/sys/kernel/shmmax
+ *
+ * This will increase the maximum size per shared memory segment to 256MB.
+ * The other limit that you will hit eventually is shmall which is the
+ * total amount of shared memory in pages. To set it to 16GB on a system
+ * with a 4kB pagesize do:
+ *
+ * echo 4194304 > /proc/sys/kernel/shmall
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/mman.h>
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+#define LENGTH (256UL*1024*1024)
+
+#define dprintf(x) printf(x)
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define SHMAT_FLAGS (SHM_RND)
+#else
+#define ADDR (void *)(0x0UL)
+#define SHMAT_FLAGS (0)
+#endif
+
+int main(void)
+{
+ int shmid;
+ unsigned long i;
+ char *shmaddr;
+
+ shmid = shmget(2, LENGTH, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
+ if (shmid < 0) {
+ perror("shmget");
+ exit(1);
+ }
+ printf("shmid: 0x%x\n", shmid);
+
+ shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS);
+ if (shmaddr == (char *)-1) {
+ perror("Shared memory attach failure");
+ shmctl(shmid, IPC_RMID, NULL);
+ exit(2);
+ }
+ printf("shmaddr: %p\n", shmaddr);
+
+ dprintf("Starting the writes:\n");
+ for (i = 0; i < LENGTH; i++) {
+ shmaddr[i] = (char)(i);
+ if (!(i % (1024 * 1024)))
+ dprintf(".");
+ }
+ dprintf("\n");
+
+ dprintf("Starting the Check...");
+ for (i = 0; i < LENGTH; i++)
+ if (shmaddr[i] != (char)i) {
+ printf("\nIndex %lu mismatched\n", i);
+ exit(3);
+ }
+ dprintf("Done.\n");
+
+ if (shmdt((const void *)shmaddr) != 0) {
+ perror("Detach failure");
+ shmctl(shmid, IPC_RMID, NULL);
+ exit(4);
+ }
+
+ shmctl(shmid, IPC_RMID, NULL);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/vm/map_hugetlb.c b/tools/testing/selftests/vm/map_hugetlb.c
new file mode 100644
index 000000000..9b777fa95
--- /dev/null
+++ b/tools/testing/selftests/vm/map_hugetlb.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Example of using hugepage memory in a user application using the mmap
+ * system call with MAP_HUGETLB flag. Before running this program make
+ * sure the administrator has allocated enough default sized huge pages
+ * to cover the 256 MB allocation.
+ *
+ * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages.
+ * That means the addresses starting with 0x800000... will need to be
+ * specified. Specifying a fixed address is not required on ppc64, i386
+ * or x86_64.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#define LENGTH (256UL*1024*1024)
+#define PROTECTION (PROT_READ | PROT_WRITE)
+
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB 0x40000 /* arch specific */
+#endif
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED)
+#else
+#define ADDR (void *)(0x0UL)
+#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
+#endif
+
+static void check_bytes(char *addr)
+{
+ printf("First hex is %x\n", *((unsigned int *)addr));
+}
+
+static void write_bytes(char *addr)
+{
+ unsigned long i;
+
+ for (i = 0; i < LENGTH; i++)
+ *(addr + i) = (char)i;
+}
+
+static int read_bytes(char *addr)
+{
+ unsigned long i;
+
+ check_bytes(addr);
+ for (i = 0; i < LENGTH; i++)
+ if (*(addr + i) != (char)i) {
+ printf("Mismatch at %lu\n", i);
+ return 1;
+ }
+ return 0;
+}
+
+int main(void)
+{
+ void *addr;
+ int ret;
+
+ addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, -1, 0);
+ if (addr == MAP_FAILED) {
+ perror("mmap");
+ exit(1);
+ }
+
+ printf("Returned address is %p\n", addr);
+ check_bytes(addr);
+ write_bytes(addr);
+ ret = read_bytes(addr);
+
+ /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */
+ if (munmap(addr, LENGTH)) {
+ perror("munmap");
+ exit(1);
+ }
+
+ return ret;
+}
diff --git a/tools/testing/selftests/vm/map_populate.c b/tools/testing/selftests/vm/map_populate.c
new file mode 100644
index 000000000..6b8aeaa0b
--- /dev/null
+++ b/tools/testing/selftests/vm/map_populate.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018 Dmitry Safonov, Arista Networks
+ *
+ * MAP_POPULATE | MAP_PRIVATE should COW VMA pages.
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#ifndef MMAP_SZ
+#define MMAP_SZ 4096
+#endif
+
+#define BUG_ON(condition, description) \
+ do { \
+ if (condition) { \
+ fprintf(stderr, "[FAIL]\t%s:%d\t%s:%s\n", __func__, \
+ __LINE__, (description), strerror(errno)); \
+ exit(1); \
+ } \
+ } while (0)
+
+static int parent_f(int sock, unsigned long *smap, int child)
+{
+ int status, ret;
+
+ ret = read(sock, &status, sizeof(int));
+ BUG_ON(ret <= 0, "read(sock)");
+
+ *smap = 0x22222BAD;
+ ret = msync(smap, MMAP_SZ, MS_SYNC);
+ BUG_ON(ret, "msync()");
+
+ ret = write(sock, &status, sizeof(int));
+ BUG_ON(ret <= 0, "write(sock)");
+
+ waitpid(child, &status, 0);
+ BUG_ON(!WIFEXITED(status), "child in unexpected state");
+
+ return WEXITSTATUS(status);
+}
+
+static int child_f(int sock, unsigned long *smap, int fd)
+{
+ int ret, buf = 0;
+
+ smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_POPULATE, fd, 0);
+ BUG_ON(smap == MAP_FAILED, "mmap()");
+
+ BUG_ON(*smap != 0xdeadbabe, "MAP_PRIVATE | MAP_POPULATE changed file");
+
+ ret = write(sock, &buf, sizeof(int));
+ BUG_ON(ret <= 0, "write(sock)");
+
+ ret = read(sock, &buf, sizeof(int));
+ BUG_ON(ret <= 0, "read(sock)");
+
+ BUG_ON(*smap == 0x22222BAD, "MAP_POPULATE didn't COW private page");
+ BUG_ON(*smap != 0xdeadbabe, "mapping was corrupted");
+
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ int sock[2], child, ret;
+ FILE *ftmp;
+ unsigned long *smap;
+
+ ftmp = tmpfile();
+ BUG_ON(ftmp == 0, "tmpfile()");
+
+ ret = ftruncate(fileno(ftmp), MMAP_SZ);
+ BUG_ON(ret, "ftruncate()");
+
+ smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fileno(ftmp), 0);
+ BUG_ON(smap == MAP_FAILED, "mmap()");
+
+ *smap = 0xdeadbabe;
+ /* Probably unnecessary, but let it be. */
+ ret = msync(smap, MMAP_SZ, MS_SYNC);
+ BUG_ON(ret, "msync()");
+
+ ret = socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sock);
+ BUG_ON(ret, "socketpair()");
+
+ child = fork();
+ BUG_ON(child == -1, "fork()");
+
+ if (child) {
+ ret = close(sock[0]);
+ BUG_ON(ret, "close()");
+
+ return parent_f(sock[1], smap, child);
+ }
+
+ ret = close(sock[1]);
+ BUG_ON(ret, "close()");
+
+ return child_f(sock[0], smap, fileno(ftmp));
+}
diff --git a/tools/testing/selftests/vm/mlock-random-test.c b/tools/testing/selftests/vm/mlock-random-test.c
new file mode 100644
index 000000000..ff4d72eb7
--- /dev/null
+++ b/tools/testing/selftests/vm/mlock-random-test.c
@@ -0,0 +1,294 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * It tests the mlock/mlock2() when they are invoked
+ * on randomly memory region.
+ */
+#include <unistd.h>
+#include <sys/resource.h>
+#include <sys/capability.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <time.h>
+#include "mlock2.h"
+
+#define CHUNK_UNIT (128 * 1024)
+#define MLOCK_RLIMIT_SIZE (CHUNK_UNIT * 2)
+#define MLOCK_WITHIN_LIMIT_SIZE CHUNK_UNIT
+#define MLOCK_OUTOF_LIMIT_SIZE (CHUNK_UNIT * 3)
+
+#define TEST_LOOP 100
+#define PAGE_ALIGN(size, ps) (((size) + ((ps) - 1)) & ~((ps) - 1))
+
+int set_cap_limits(rlim_t max)
+{
+ struct rlimit new;
+ cap_t cap = cap_init();
+
+ new.rlim_cur = max;
+ new.rlim_max = max;
+ if (setrlimit(RLIMIT_MEMLOCK, &new)) {
+ perror("setrlimit() returns error\n");
+ return -1;
+ }
+
+ /* drop capabilities including CAP_IPC_LOCK */
+ if (cap_set_proc(cap)) {
+ perror("cap_set_proc() returns error\n");
+ return -2;
+ }
+
+ return 0;
+}
+
+int get_proc_locked_vm_size(void)
+{
+ FILE *f;
+ int ret = -1;
+ char line[1024] = {0};
+ unsigned long lock_size = 0;
+
+ f = fopen("/proc/self/status", "r");
+ if (!f) {
+ perror("fopen");
+ return -1;
+ }
+
+ while (fgets(line, 1024, f)) {
+ if (strstr(line, "VmLck")) {
+ ret = sscanf(line, "VmLck:\t%8lu kB", &lock_size);
+ if (ret <= 0) {
+ printf("sscanf() on VmLck error: %s: %d\n",
+ line, ret);
+ fclose(f);
+ return -1;
+ }
+ fclose(f);
+ return (int)(lock_size << 10);
+ }
+ }
+
+ perror("cann't parse VmLck in /proc/self/status\n");
+ fclose(f);
+ return -1;
+}
+
+/*
+ * Get the MMUPageSize of the memory region including input
+ * address from proc file.
+ *
+ * return value: on error case, 0 will be returned.
+ * Otherwise the page size(in bytes) is returned.
+ */
+int get_proc_page_size(unsigned long addr)
+{
+ FILE *smaps;
+ char *line;
+ unsigned long mmupage_size = 0;
+ size_t size;
+
+ smaps = seek_to_smaps_entry(addr);
+ if (!smaps) {
+ printf("Unable to parse /proc/self/smaps\n");
+ return 0;
+ }
+
+ while (getline(&line, &size, smaps) > 0) {
+ if (!strstr(line, "MMUPageSize")) {
+ free(line);
+ line = NULL;
+ size = 0;
+ continue;
+ }
+
+ /* found the MMUPageSize of this section */
+ if (sscanf(line, "MMUPageSize: %8lu kB",
+ &mmupage_size) < 1) {
+ printf("Unable to parse smaps entry for Size:%s\n",
+ line);
+ break;
+ }
+
+ }
+ free(line);
+ if (smaps)
+ fclose(smaps);
+ return mmupage_size << 10;
+}
+
+/*
+ * Test mlock/mlock2() on provided memory chunk.
+ * It expects the mlock/mlock2() to be successful (within rlimit)
+ *
+ * With allocated memory chunk [p, p + alloc_size), this
+ * test will choose start/len randomly to perform mlock/mlock2
+ * [start, start + len] memory range. The range is within range
+ * of the allocated chunk.
+ *
+ * The memory region size alloc_size is within the rlimit.
+ * So we always expect a success of mlock/mlock2.
+ *
+ * VmLck is assumed to be 0 before this test.
+ *
+ * return value: 0 - success
+ * else: failure
+ */
+int test_mlock_within_limit(char *p, int alloc_size)
+{
+ int i;
+ int ret = 0;
+ int locked_vm_size = 0;
+ struct rlimit cur;
+ int page_size = 0;
+
+ getrlimit(RLIMIT_MEMLOCK, &cur);
+ if (cur.rlim_cur < alloc_size) {
+ printf("alloc_size[%d] < %u rlimit,lead to mlock failure\n",
+ alloc_size, (unsigned int)cur.rlim_cur);
+ return -1;
+ }
+
+ srand(time(NULL));
+ for (i = 0; i < TEST_LOOP; i++) {
+ /*
+ * - choose mlock/mlock2 randomly
+ * - choose lock_size randomly but lock_size < alloc_size
+ * - choose start_offset randomly but p+start_offset+lock_size
+ * < p+alloc_size
+ */
+ int is_mlock = !!(rand() % 2);
+ int lock_size = rand() % alloc_size;
+ int start_offset = rand() % (alloc_size - lock_size);
+
+ if (is_mlock)
+ ret = mlock(p + start_offset, lock_size);
+ else
+ ret = mlock2_(p + start_offset, lock_size,
+ MLOCK_ONFAULT);
+
+ if (ret) {
+ printf("%s() failure at |%p(%d)| mlock:|%p(%d)|\n",
+ is_mlock ? "mlock" : "mlock2",
+ p, alloc_size,
+ p + start_offset, lock_size);
+ return ret;
+ }
+ }
+
+ /*
+ * Check VmLck left by the tests.
+ */
+ locked_vm_size = get_proc_locked_vm_size();
+ page_size = get_proc_page_size((unsigned long)p);
+ if (page_size == 0) {
+ printf("cannot get proc MMUPageSize\n");
+ return -1;
+ }
+
+ if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size) {
+ printf("test_mlock_within_limit() left VmLck:%d on %d chunk\n",
+ locked_vm_size, alloc_size);
+ return -1;
+ }
+
+ return 0;
+}
+
+
+/*
+ * We expect the mlock/mlock2() to be fail (outof limitation)
+ *
+ * With allocated memory chunk [p, p + alloc_size), this
+ * test will randomly choose start/len and perform mlock/mlock2
+ * on [start, start+len] range.
+ *
+ * The memory region size alloc_size is above the rlimit.
+ * And the len to be locked is higher than rlimit.
+ * So we always expect a failure of mlock/mlock2.
+ * No locked page number should be increased as a side effect.
+ *
+ * return value: 0 - success
+ * else: failure
+ */
+int test_mlock_outof_limit(char *p, int alloc_size)
+{
+ int i;
+ int ret = 0;
+ int locked_vm_size = 0, old_locked_vm_size = 0;
+ struct rlimit cur;
+
+ getrlimit(RLIMIT_MEMLOCK, &cur);
+ if (cur.rlim_cur >= alloc_size) {
+ printf("alloc_size[%d] >%u rlimit, violates test condition\n",
+ alloc_size, (unsigned int)cur.rlim_cur);
+ return -1;
+ }
+
+ old_locked_vm_size = get_proc_locked_vm_size();
+ srand(time(NULL));
+ for (i = 0; i < TEST_LOOP; i++) {
+ int is_mlock = !!(rand() % 2);
+ int lock_size = (rand() % (alloc_size - cur.rlim_cur))
+ + cur.rlim_cur;
+ int start_offset = rand() % (alloc_size - lock_size);
+
+ if (is_mlock)
+ ret = mlock(p + start_offset, lock_size);
+ else
+ ret = mlock2_(p + start_offset, lock_size,
+ MLOCK_ONFAULT);
+ if (ret == 0) {
+ printf("%s() succeeds? on %p(%d) mlock%p(%d)\n",
+ is_mlock ? "mlock" : "mlock2",
+ p, alloc_size,
+ p + start_offset, lock_size);
+ return -1;
+ }
+ }
+
+ locked_vm_size = get_proc_locked_vm_size();
+ if (locked_vm_size != old_locked_vm_size) {
+ printf("tests leads to new mlocked page: old[%d], new[%d]\n",
+ old_locked_vm_size,
+ locked_vm_size);
+ return -1;
+ }
+
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ char *p = NULL;
+ int ret = 0;
+
+ if (set_cap_limits(MLOCK_RLIMIT_SIZE))
+ return -1;
+
+ p = malloc(MLOCK_WITHIN_LIMIT_SIZE);
+ if (p == NULL) {
+ perror("malloc() failure\n");
+ return -1;
+ }
+ ret = test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE);
+ if (ret)
+ return ret;
+ munlock(p, MLOCK_WITHIN_LIMIT_SIZE);
+ free(p);
+
+
+ p = malloc(MLOCK_OUTOF_LIMIT_SIZE);
+ if (p == NULL) {
+ perror("malloc() failure\n");
+ return -1;
+ }
+ ret = test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE);
+ if (ret)
+ return ret;
+ munlock(p, MLOCK_OUTOF_LIMIT_SIZE);
+ free(p);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/vm/mlock2-tests.c b/tools/testing/selftests/vm/mlock2-tests.c
new file mode 100644
index 000000000..11b2301f3
--- /dev/null
+++ b/tools/testing/selftests/vm/mlock2-tests.c
@@ -0,0 +1,520 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sys/mman.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <stdbool.h>
+#include "mlock2.h"
+
+#include "../kselftest.h"
+
+struct vm_boundaries {
+ unsigned long start;
+ unsigned long end;
+};
+
+static int get_vm_area(unsigned long addr, struct vm_boundaries *area)
+{
+ FILE *file;
+ int ret = 1;
+ char line[1024] = {0};
+ char *end_addr;
+ char *stop;
+ unsigned long start;
+ unsigned long end;
+
+ if (!area)
+ return ret;
+
+ file = fopen("/proc/self/maps", "r");
+ if (!file) {
+ perror("fopen");
+ return ret;
+ }
+
+ memset(area, 0, sizeof(struct vm_boundaries));
+
+ while(fgets(line, 1024, file)) {
+ end_addr = strchr(line, '-');
+ if (!end_addr) {
+ printf("cannot parse /proc/self/maps\n");
+ goto out;
+ }
+ *end_addr = '\0';
+ end_addr++;
+ stop = strchr(end_addr, ' ');
+ if (!stop) {
+ printf("cannot parse /proc/self/maps\n");
+ goto out;
+ }
+ stop = '\0';
+
+ sscanf(line, "%lx", &start);
+ sscanf(end_addr, "%lx", &end);
+
+ if (start <= addr && end > addr) {
+ area->start = start;
+ area->end = end;
+ ret = 0;
+ goto out;
+ }
+ }
+out:
+ fclose(file);
+ return ret;
+}
+
+#define VMFLAGS "VmFlags:"
+
+static bool is_vmflag_set(unsigned long addr, const char *vmflag)
+{
+ char *line = NULL;
+ char *flags;
+ size_t size = 0;
+ bool ret = false;
+ FILE *smaps;
+
+ smaps = seek_to_smaps_entry(addr);
+ if (!smaps) {
+ printf("Unable to parse /proc/self/smaps\n");
+ goto out;
+ }
+
+ while (getline(&line, &size, smaps) > 0) {
+ if (!strstr(line, VMFLAGS)) {
+ free(line);
+ line = NULL;
+ size = 0;
+ continue;
+ }
+
+ flags = line + strlen(VMFLAGS);
+ ret = (strstr(flags, vmflag) != NULL);
+ goto out;
+ }
+
+out:
+ free(line);
+ fclose(smaps);
+ return ret;
+}
+
+#define SIZE "Size:"
+#define RSS "Rss:"
+#define LOCKED "lo"
+
+static unsigned long get_value_for_name(unsigned long addr, const char *name)
+{
+ char *line = NULL;
+ size_t size = 0;
+ char *value_ptr;
+ FILE *smaps = NULL;
+ unsigned long value = -1UL;
+
+ smaps = seek_to_smaps_entry(addr);
+ if (!smaps) {
+ printf("Unable to parse /proc/self/smaps\n");
+ goto out;
+ }
+
+ while (getline(&line, &size, smaps) > 0) {
+ if (!strstr(line, name)) {
+ free(line);
+ line = NULL;
+ size = 0;
+ continue;
+ }
+
+ value_ptr = line + strlen(name);
+ if (sscanf(value_ptr, "%lu kB", &value) < 1) {
+ printf("Unable to parse smaps entry for Size\n");
+ goto out;
+ }
+ break;
+ }
+
+out:
+ if (smaps)
+ fclose(smaps);
+ free(line);
+ return value;
+}
+
+static bool is_vma_lock_on_fault(unsigned long addr)
+{
+ bool locked;
+ unsigned long vma_size, vma_rss;
+
+ locked = is_vmflag_set(addr, LOCKED);
+ if (!locked)
+ return false;
+
+ vma_size = get_value_for_name(addr, SIZE);
+ vma_rss = get_value_for_name(addr, RSS);
+
+ /* only one page is faulted in */
+ return (vma_rss < vma_size);
+}
+
+#define PRESENT_BIT 0x8000000000000000ULL
+#define PFN_MASK 0x007FFFFFFFFFFFFFULL
+#define UNEVICTABLE_BIT (1UL << 18)
+
+static int lock_check(unsigned long addr)
+{
+ bool locked;
+ unsigned long vma_size, vma_rss;
+
+ locked = is_vmflag_set(addr, LOCKED);
+ if (!locked)
+ return false;
+
+ vma_size = get_value_for_name(addr, SIZE);
+ vma_rss = get_value_for_name(addr, RSS);
+
+ return (vma_rss == vma_size);
+}
+
+static int unlock_lock_check(char *map)
+{
+ if (is_vmflag_set((unsigned long)map, LOCKED)) {
+ printf("VMA flag %s is present on page 1 after unlock\n", LOCKED);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int test_mlock_lock()
+{
+ char *map;
+ int ret = 1;
+ unsigned long page_size = getpagesize();
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (map == MAP_FAILED) {
+ perror("test_mlock_locked mmap");
+ goto out;
+ }
+
+ if (mlock2_(map, 2 * page_size, 0)) {
+ if (errno == ENOSYS) {
+ printf("Cannot call new mlock family, skipping test\n");
+ _exit(KSFT_SKIP);
+ }
+ perror("mlock2(0)");
+ goto unmap;
+ }
+
+ if (!lock_check((unsigned long)map))
+ goto unmap;
+
+ /* Now unlock and recheck attributes */
+ if (munlock(map, 2 * page_size)) {
+ perror("munlock()");
+ goto unmap;
+ }
+
+ ret = unlock_lock_check(map);
+
+unmap:
+ munmap(map, 2 * page_size);
+out:
+ return ret;
+}
+
+static int onfault_check(char *map)
+{
+ *map = 'a';
+ if (!is_vma_lock_on_fault((unsigned long)map)) {
+ printf("VMA is not marked for lock on fault\n");
+ return 1;
+ }
+
+ return 0;
+}
+
+static int unlock_onfault_check(char *map)
+{
+ unsigned long page_size = getpagesize();
+
+ if (is_vma_lock_on_fault((unsigned long)map) ||
+ is_vma_lock_on_fault((unsigned long)map + page_size)) {
+ printf("VMA is still lock on fault after unlock\n");
+ return 1;
+ }
+
+ return 0;
+}
+
+static int test_mlock_onfault()
+{
+ char *map;
+ int ret = 1;
+ unsigned long page_size = getpagesize();
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (map == MAP_FAILED) {
+ perror("test_mlock_locked mmap");
+ goto out;
+ }
+
+ if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
+ if (errno == ENOSYS) {
+ printf("Cannot call new mlock family, skipping test\n");
+ _exit(KSFT_SKIP);
+ }
+ perror("mlock2(MLOCK_ONFAULT)");
+ goto unmap;
+ }
+
+ if (onfault_check(map))
+ goto unmap;
+
+ /* Now unlock and recheck attributes */
+ if (munlock(map, 2 * page_size)) {
+ if (errno == ENOSYS) {
+ printf("Cannot call new mlock family, skipping test\n");
+ _exit(KSFT_SKIP);
+ }
+ perror("munlock()");
+ goto unmap;
+ }
+
+ ret = unlock_onfault_check(map);
+unmap:
+ munmap(map, 2 * page_size);
+out:
+ return ret;
+}
+
+static int test_lock_onfault_of_present()
+{
+ char *map;
+ int ret = 1;
+ unsigned long page_size = getpagesize();
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (map == MAP_FAILED) {
+ perror("test_mlock_locked mmap");
+ goto out;
+ }
+
+ *map = 'a';
+
+ if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
+ if (errno == ENOSYS) {
+ printf("Cannot call new mlock family, skipping test\n");
+ _exit(KSFT_SKIP);
+ }
+ perror("mlock2(MLOCK_ONFAULT)");
+ goto unmap;
+ }
+
+ if (!is_vma_lock_on_fault((unsigned long)map) ||
+ !is_vma_lock_on_fault((unsigned long)map + page_size)) {
+ printf("VMA with present pages is not marked lock on fault\n");
+ goto unmap;
+ }
+ ret = 0;
+unmap:
+ munmap(map, 2 * page_size);
+out:
+ return ret;
+}
+
+static int test_munlockall()
+{
+ char *map;
+ int ret = 1;
+ unsigned long page_size = getpagesize();
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+
+ if (map == MAP_FAILED) {
+ perror("test_munlockall mmap");
+ goto out;
+ }
+
+ if (mlockall(MCL_CURRENT)) {
+ perror("mlockall(MCL_CURRENT)");
+ goto out;
+ }
+
+ if (!lock_check((unsigned long)map))
+ goto unmap;
+
+ if (munlockall()) {
+ perror("munlockall()");
+ goto unmap;
+ }
+
+ if (unlock_lock_check(map))
+ goto unmap;
+
+ munmap(map, 2 * page_size);
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+
+ if (map == MAP_FAILED) {
+ perror("test_munlockall second mmap");
+ goto out;
+ }
+
+ if (mlockall(MCL_CURRENT | MCL_ONFAULT)) {
+ perror("mlockall(MCL_CURRENT | MCL_ONFAULT)");
+ goto unmap;
+ }
+
+ if (onfault_check(map))
+ goto unmap;
+
+ if (munlockall()) {
+ perror("munlockall()");
+ goto unmap;
+ }
+
+ if (unlock_onfault_check(map))
+ goto unmap;
+
+ if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
+ perror("mlockall(MCL_CURRENT | MCL_FUTURE)");
+ goto out;
+ }
+
+ if (!lock_check((unsigned long)map))
+ goto unmap;
+
+ if (munlockall()) {
+ perror("munlockall()");
+ goto unmap;
+ }
+
+ ret = unlock_lock_check(map);
+
+unmap:
+ munmap(map, 2 * page_size);
+out:
+ munlockall();
+ return ret;
+}
+
+static int test_vma_management(bool call_mlock)
+{
+ int ret = 1;
+ void *map;
+ unsigned long page_size = getpagesize();
+ struct vm_boundaries page1;
+ struct vm_boundaries page2;
+ struct vm_boundaries page3;
+
+ map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (map == MAP_FAILED) {
+ perror("mmap()");
+ return ret;
+ }
+
+ if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) {
+ if (errno == ENOSYS) {
+ printf("Cannot call new mlock family, skipping test\n");
+ _exit(KSFT_SKIP);
+ }
+ perror("mlock(ONFAULT)\n");
+ goto out;
+ }
+
+ if (get_vm_area((unsigned long)map, &page1) ||
+ get_vm_area((unsigned long)map + page_size, &page2) ||
+ get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+ printf("couldn't find mapping in /proc/self/maps\n");
+ goto out;
+ }
+
+ /*
+ * Before we unlock a portion, we need to that all three pages are in
+ * the same VMA. If they are not we abort this test (Note that this is
+ * not a failure)
+ */
+ if (page1.start != page2.start || page2.start != page3.start) {
+ printf("VMAs are not merged to start, aborting test\n");
+ ret = 0;
+ goto out;
+ }
+
+ if (munlock(map + page_size, page_size)) {
+ perror("munlock()");
+ goto out;
+ }
+
+ if (get_vm_area((unsigned long)map, &page1) ||
+ get_vm_area((unsigned long)map + page_size, &page2) ||
+ get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+ printf("couldn't find mapping in /proc/self/maps\n");
+ goto out;
+ }
+
+ /* All three VMAs should be different */
+ if (page1.start == page2.start || page2.start == page3.start) {
+ printf("failed to split VMA for munlock\n");
+ goto out;
+ }
+
+ /* Now unlock the first and third page and check the VMAs again */
+ if (munlock(map, page_size * 3)) {
+ perror("munlock()");
+ goto out;
+ }
+
+ if (get_vm_area((unsigned long)map, &page1) ||
+ get_vm_area((unsigned long)map + page_size, &page2) ||
+ get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+ printf("couldn't find mapping in /proc/self/maps\n");
+ goto out;
+ }
+
+ /* Now all three VMAs should be the same */
+ if (page1.start != page2.start || page2.start != page3.start) {
+ printf("failed to merge VMAs after munlock\n");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ munmap(map, 3 * page_size);
+ return ret;
+}
+
+static int test_mlockall(int (test_function)(bool call_mlock))
+{
+ int ret = 1;
+
+ if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) {
+ perror("mlockall");
+ return ret;
+ }
+
+ ret = test_function(false);
+ munlockall();
+ return ret;
+}
+
+int main(int argc, char **argv)
+{
+ int ret = 0;
+ ret += test_mlock_lock();
+ ret += test_mlock_onfault();
+ ret += test_munlockall();
+ ret += test_lock_onfault_of_present();
+ ret += test_vma_management(true);
+ ret += test_mlockall(test_vma_management);
+ return ret;
+}
diff --git a/tools/testing/selftests/vm/mlock2.h b/tools/testing/selftests/vm/mlock2.h
new file mode 100644
index 000000000..2a6e76c22
--- /dev/null
+++ b/tools/testing/selftests/vm/mlock2.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <syscall.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifndef MLOCK_ONFAULT
+#define MLOCK_ONFAULT 1
+#endif
+
+#ifndef MCL_ONFAULT
+#define MCL_ONFAULT (MCL_FUTURE << 1)
+#endif
+
+static int mlock2_(void *start, size_t len, int flags)
+{
+#ifdef __NR_mlock2
+ return syscall(__NR_mlock2, start, len, flags);
+#else
+ errno = ENOSYS;
+ return -1;
+#endif
+}
+
+static FILE *seek_to_smaps_entry(unsigned long addr)
+{
+ FILE *file;
+ char *line = NULL;
+ size_t size = 0;
+ unsigned long start, end;
+ char perms[5];
+ unsigned long offset;
+ char dev[32];
+ unsigned long inode;
+ char path[BUFSIZ];
+
+ file = fopen("/proc/self/smaps", "r");
+ if (!file) {
+ perror("fopen smaps");
+ _exit(1);
+ }
+
+ while (getline(&line, &size, file) > 0) {
+ if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n",
+ &start, &end, perms, &offset, dev, &inode, path) < 6)
+ goto next;
+
+ if (start <= addr && addr < end)
+ goto out;
+
+next:
+ free(line);
+ line = NULL;
+ size = 0;
+ }
+
+ fclose(file);
+ file = NULL;
+
+out:
+ free(line);
+ return file;
+}
diff --git a/tools/testing/selftests/vm/on-fault-limit.c b/tools/testing/selftests/vm/on-fault-limit.c
new file mode 100644
index 000000000..634d87dfb
--- /dev/null
+++ b/tools/testing/selftests/vm/on-fault-limit.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <sys/mman.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#ifndef MCL_ONFAULT
+#define MCL_ONFAULT (MCL_FUTURE << 1)
+#endif
+
+static int test_limit(void)
+{
+ int ret = 1;
+ struct rlimit lims;
+ void *map;
+
+ if (getrlimit(RLIMIT_MEMLOCK, &lims)) {
+ perror("getrlimit");
+ return ret;
+ }
+
+ if (mlockall(MCL_ONFAULT | MCL_FUTURE)) {
+ perror("mlockall");
+ return ret;
+ }
+
+ map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
+ if (map != MAP_FAILED)
+ printf("mmap should have failed, but didn't\n");
+ else {
+ ret = 0;
+ munmap(map, 2 * lims.rlim_max);
+ }
+
+ munlockall();
+ return ret;
+}
+
+int main(int argc, char **argv)
+{
+ int ret = 0;
+
+ ret += test_limit();
+ return ret;
+}
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests
new file mode 100755
index 000000000..584a91ae4
--- /dev/null
+++ b/tools/testing/selftests/vm/run_vmtests
@@ -0,0 +1,214 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#please run as root
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+mnt=./huge
+exitcode=0
+
+#get huge pagesize and freepages from /proc/meminfo
+while read name size unit; do
+ if [ "$name" = "HugePages_Free:" ]; then
+ freepgs=$size
+ fi
+ if [ "$name" = "Hugepagesize:" ]; then
+ hpgsize_KB=$size
+ fi
+done < /proc/meminfo
+
+# Simple hugetlbfs tests have a hardcoded minimum requirement of
+# huge pages totaling 256MB (262144KB) in size. The userfaultfd
+# hugetlb test requires a minimum of 2 * nr_cpus huge pages. Take
+# both of these requirements into account and attempt to increase
+# number of huge pages available.
+nr_cpus=$(nproc)
+hpgsize_MB=$((hpgsize_KB / 1024))
+half_ufd_size_MB=$((((nr_cpus * hpgsize_MB + 127) / 128) * 128))
+needmem_KB=$((half_ufd_size_MB * 2 * 1024))
+
+#set proper nr_hugepages
+if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then
+ nr_hugepgs=`cat /proc/sys/vm/nr_hugepages`
+ needpgs=$((needmem_KB / hpgsize_KB))
+ tries=2
+ while [ $tries -gt 0 ] && [ $freepgs -lt $needpgs ]; do
+ lackpgs=$(( $needpgs - $freepgs ))
+ echo 3 > /proc/sys/vm/drop_caches
+ echo $(( $lackpgs + $nr_hugepgs )) > /proc/sys/vm/nr_hugepages
+ if [ $? -ne 0 ]; then
+ echo "Please run this test as root"
+ exit $ksft_skip
+ fi
+ while read name size unit; do
+ if [ "$name" = "HugePages_Free:" ]; then
+ freepgs=$size
+ fi
+ done < /proc/meminfo
+ tries=$((tries - 1))
+ done
+ if [ $freepgs -lt $needpgs ]; then
+ printf "Not enough huge pages available (%d < %d)\n" \
+ $freepgs $needpgs
+ exit 1
+ fi
+else
+ echo "no hugetlbfs support in kernel?"
+ exit 1
+fi
+
+mkdir $mnt
+mount -t hugetlbfs none $mnt
+
+echo "---------------------"
+echo "running hugepage-mmap"
+echo "---------------------"
+./hugepage-mmap
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+shmmax=`cat /proc/sys/kernel/shmmax`
+shmall=`cat /proc/sys/kernel/shmall`
+echo 268435456 > /proc/sys/kernel/shmmax
+echo 4194304 > /proc/sys/kernel/shmall
+echo "--------------------"
+echo "running hugepage-shm"
+echo "--------------------"
+./hugepage-shm
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+echo $shmmax > /proc/sys/kernel/shmmax
+echo $shmall > /proc/sys/kernel/shmall
+
+echo "-------------------"
+echo "running map_hugetlb"
+echo "-------------------"
+./map_hugetlb
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "NOTE: The above hugetlb tests provide minimal coverage. Use"
+echo " https://github.com/libhugetlbfs/libhugetlbfs.git for"
+echo " hugetlb regression testing."
+
+echo "-------------------"
+echo "running userfaultfd"
+echo "-------------------"
+./userfaultfd anon 128 32
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "---------------------------"
+echo "running userfaultfd_hugetlb"
+echo "---------------------------"
+# Test requires source and destination huge pages. Size of source
+# (half_ufd_size_MB) is passed as argument to test.
+./userfaultfd hugetlb $half_ufd_size_MB 32 $mnt/ufd_test_file
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+rm -f $mnt/ufd_test_file
+
+echo "-------------------------"
+echo "running userfaultfd_shmem"
+echo "-------------------------"
+./userfaultfd shmem 128 32
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+#cleanup
+umount $mnt
+rm -rf $mnt
+echo $nr_hugepgs > /proc/sys/vm/nr_hugepages
+
+echo "-----------------------"
+echo "running compaction_test"
+echo "-----------------------"
+./compaction_test
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "----------------------"
+echo "running on-fault-limit"
+echo "----------------------"
+sudo -u nobody ./on-fault-limit
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "--------------------"
+echo "running map_populate"
+echo "--------------------"
+./map_populate
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "--------------------"
+echo "running mlock2-tests"
+echo "--------------------"
+./mlock2-tests
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "-----------------------------"
+echo "running virtual_address_range"
+echo "-----------------------------"
+./virtual_address_range
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "-----------------------------"
+echo "running virtual address 128TB switch test"
+echo "-----------------------------"
+./va_128TBswitch
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+exit $exitcode
diff --git a/tools/testing/selftests/vm/thuge-gen.c b/tools/testing/selftests/vm/thuge-gen.c
new file mode 100644
index 000000000..361ef7192
--- /dev/null
+++ b/tools/testing/selftests/vm/thuge-gen.c
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Test selecting other page sizes for mmap/shmget.
+
+ Before running this huge pages for each huge page size must have been
+ reserved.
+ For large pages beyond MAX_ORDER (like 1GB on x86) boot options must be used.
+ Also shmmax must be increased.
+ And you need to run as root to work around some weird permissions in shm.
+ And nothing using huge pages should run in parallel.
+ When the program aborts you may need to clean up the shm segments with
+ ipcrm -m by hand, like this
+ sudo ipcs | awk '$1 == "0x00000000" {print $2}' | xargs -n1 sudo ipcrm -m
+ (warning this will remove all if someone else uses them) */
+
+#define _GNU_SOURCE 1
+#include <sys/mman.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/stat.h>
+#include <glob.h>
+#include <assert.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <string.h>
+
+#define err(x) perror(x), exit(1)
+
+#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
+#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
+#define MAP_HUGE_SHIFT 26
+#define MAP_HUGE_MASK 0x3f
+#if !defined(MAP_HUGETLB)
+#define MAP_HUGETLB 0x40000
+#endif
+
+#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */
+#define SHM_HUGE_SHIFT 26
+#define SHM_HUGE_MASK 0x3f
+#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT)
+#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT)
+
+#define NUM_PAGESIZES 5
+
+#define NUM_PAGES 4
+
+#define Dprintf(fmt...) // printf(fmt)
+
+unsigned long page_sizes[NUM_PAGESIZES];
+int num_page_sizes;
+
+int ilog2(unsigned long v)
+{
+ int l = 0;
+ while ((1UL << l) < v)
+ l++;
+ return l;
+}
+
+void find_pagesizes(void)
+{
+ glob_t g;
+ int i;
+ glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g);
+ assert(g.gl_pathc <= NUM_PAGESIZES);
+ for (i = 0; i < g.gl_pathc; i++) {
+ sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB",
+ &page_sizes[i]);
+ page_sizes[i] <<= 10;
+ printf("Found %luMB\n", page_sizes[i] >> 20);
+ }
+ num_page_sizes = g.gl_pathc;
+ globfree(&g);
+}
+
+unsigned long default_huge_page_size(void)
+{
+ unsigned long hps = 0;
+ char *line = NULL;
+ size_t linelen = 0;
+ FILE *f = fopen("/proc/meminfo", "r");
+ if (!f)
+ return 0;
+ while (getline(&line, &linelen, f) > 0) {
+ if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) {
+ hps <<= 10;
+ break;
+ }
+ }
+ free(line);
+ return hps;
+}
+
+void show(unsigned long ps)
+{
+ char buf[100];
+ if (ps == getpagesize())
+ return;
+ printf("%luMB: ", ps >> 20);
+ fflush(stdout);
+ snprintf(buf, sizeof buf,
+ "cat /sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
+ ps >> 10);
+ system(buf);
+}
+
+unsigned long read_sysfs(int warn, char *fmt, ...)
+{
+ char *line = NULL;
+ size_t linelen = 0;
+ char buf[100];
+ FILE *f;
+ va_list ap;
+ unsigned long val = 0;
+
+ va_start(ap, fmt);
+ vsnprintf(buf, sizeof buf, fmt, ap);
+ va_end(ap);
+
+ f = fopen(buf, "r");
+ if (!f) {
+ if (warn)
+ printf("missing %s\n", buf);
+ return 0;
+ }
+ if (getline(&line, &linelen, f) > 0) {
+ sscanf(line, "%lu", &val);
+ }
+ fclose(f);
+ free(line);
+ return val;
+}
+
+unsigned long read_free(unsigned long ps)
+{
+ return read_sysfs(ps != getpagesize(),
+ "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
+ ps >> 10);
+}
+
+void test_mmap(unsigned long size, unsigned flags)
+{
+ char *map;
+ unsigned long before, after;
+ int err;
+
+ before = read_free(size);
+ map = mmap(NULL, size*NUM_PAGES, PROT_READ|PROT_WRITE,
+ MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB|flags, -1, 0);
+
+ if (map == (char *)-1) err("mmap");
+ memset(map, 0xff, size*NUM_PAGES);
+ after = read_free(size);
+ Dprintf("before %lu after %lu diff %ld size %lu\n",
+ before, after, before - after, size);
+ assert(size == getpagesize() || (before - after) == NUM_PAGES);
+ show(size);
+ err = munmap(map, size);
+ assert(!err);
+}
+
+void test_shmget(unsigned long size, unsigned flags)
+{
+ int id;
+ unsigned long before, after;
+ int err;
+
+ before = read_free(size);
+ id = shmget(IPC_PRIVATE, size * NUM_PAGES, IPC_CREAT|0600|flags);
+ if (id < 0) err("shmget");
+
+ struct shm_info i;
+ if (shmctl(id, SHM_INFO, (void *)&i) < 0) err("shmctl");
+ Dprintf("alloc %lu res %lu\n", i.shm_tot, i.shm_rss);
+
+
+ Dprintf("id %d\n", id);
+ char *map = shmat(id, NULL, 0600);
+ if (map == (char*)-1) err("shmat");
+
+ shmctl(id, IPC_RMID, NULL);
+
+ memset(map, 0xff, size*NUM_PAGES);
+ after = read_free(size);
+
+ Dprintf("before %lu after %lu diff %ld size %lu\n",
+ before, after, before - after, size);
+ assert(size == getpagesize() || (before - after) == NUM_PAGES);
+ show(size);
+ err = shmdt(map);
+ assert(!err);
+}
+
+void sanity_checks(void)
+{
+ int i;
+ unsigned long largest = getpagesize();
+
+ for (i = 0; i < num_page_sizes; i++) {
+ if (page_sizes[i] > largest)
+ largest = page_sizes[i];
+
+ if (read_free(page_sizes[i]) < NUM_PAGES) {
+ printf("Not enough huge pages for page size %lu MB, need %u\n",
+ page_sizes[i] >> 20,
+ NUM_PAGES);
+ exit(0);
+ }
+ }
+
+ if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest) {
+ printf("Please do echo %lu > /proc/sys/kernel/shmmax", largest * NUM_PAGES);
+ exit(0);
+ }
+
+#if defined(__x86_64__)
+ if (largest != 1U<<30) {
+ printf("No GB pages available on x86-64\n"
+ "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES);
+ exit(0);
+ }
+#endif
+}
+
+int main(void)
+{
+ int i;
+ unsigned default_hps = default_huge_page_size();
+
+ find_pagesizes();
+
+ sanity_checks();
+
+ for (i = 0; i < num_page_sizes; i++) {
+ unsigned long ps = page_sizes[i];
+ int arg = ilog2(ps) << MAP_HUGE_SHIFT;
+ printf("Testing %luMB mmap with shift %x\n", ps >> 20, arg);
+ test_mmap(ps, MAP_HUGETLB | arg);
+ }
+ printf("Testing default huge mmap\n");
+ test_mmap(default_hps, SHM_HUGETLB);
+
+ puts("Testing non-huge shmget");
+ test_shmget(getpagesize(), 0);
+
+ for (i = 0; i < num_page_sizes; i++) {
+ unsigned long ps = page_sizes[i];
+ int arg = ilog2(ps) << SHM_HUGE_SHIFT;
+ printf("Testing %luMB shmget with shift %x\n", ps >> 20, arg);
+ test_shmget(ps, SHM_HUGETLB | arg);
+ }
+ puts("default huge shmget");
+ test_shmget(default_hps, SHM_HUGETLB);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/vm/transhuge-stress.c b/tools/testing/selftests/vm/transhuge-stress.c
new file mode 100644
index 000000000..fd7f1b4a9
--- /dev/null
+++ b/tools/testing/selftests/vm/transhuge-stress.c
@@ -0,0 +1,144 @@
+/*
+ * Stress test for transparent huge pages, memory compaction and migration.
+ *
+ * Authors: Konstantin Khlebnikov <koct9i@gmail.com>
+ *
+ * This is free and unencumbered software released into the public domain.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <err.h>
+#include <time.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/mman.h>
+
+#define PAGE_SHIFT 12
+#define HPAGE_SHIFT 21
+
+#define PAGE_SIZE (1 << PAGE_SHIFT)
+#define HPAGE_SIZE (1 << HPAGE_SHIFT)
+
+#define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0)
+#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1))
+
+int pagemap_fd;
+
+int64_t allocate_transhuge(void *ptr)
+{
+ uint64_t ent[2];
+
+ /* drop pmd */
+ if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE,
+ MAP_FIXED | MAP_ANONYMOUS |
+ MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr)
+ errx(2, "mmap transhuge");
+
+ if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE))
+ err(2, "MADV_HUGEPAGE");
+
+ /* allocate transparent huge page */
+ *(volatile void **)ptr = ptr;
+
+ if (pread(pagemap_fd, ent, sizeof(ent),
+ (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent))
+ err(2, "read pagemap");
+
+ if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) &&
+ PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) &&
+ !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1)))
+ return PAGEMAP_PFN(ent[0]);
+
+ return -1;
+}
+
+int main(int argc, char **argv)
+{
+ size_t ram, len;
+ void *ptr, *p;
+ struct timespec a, b;
+ double s;
+ uint8_t *map;
+ size_t map_len;
+
+ ram = sysconf(_SC_PHYS_PAGES);
+ if (ram > SIZE_MAX / sysconf(_SC_PAGESIZE) / 4)
+ ram = SIZE_MAX / 4;
+ else
+ ram *= sysconf(_SC_PAGESIZE);
+
+ if (argc == 1)
+ len = ram;
+ else if (!strcmp(argv[1], "-h"))
+ errx(1, "usage: %s [size in MiB]", argv[0]);
+ else
+ len = atoll(argv[1]) << 20;
+
+ warnx("allocate %zd transhuge pages, using %zd MiB virtual memory"
+ " and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20,
+ len >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1));
+
+ pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+ if (pagemap_fd < 0)
+ err(2, "open pagemap");
+
+ len -= len % HPAGE_SIZE;
+ ptr = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0);
+ if (ptr == MAP_FAILED)
+ err(2, "initial mmap");
+ ptr += HPAGE_SIZE - (uintptr_t)ptr % HPAGE_SIZE;
+
+ if (madvise(ptr, len, MADV_HUGEPAGE))
+ err(2, "MADV_HUGEPAGE");
+
+ map_len = ram >> (HPAGE_SHIFT - 1);
+ map = malloc(map_len);
+ if (!map)
+ errx(2, "map malloc");
+
+ while (1) {
+ int nr_succeed = 0, nr_failed = 0, nr_pages = 0;
+
+ memset(map, 0, map_len);
+
+ clock_gettime(CLOCK_MONOTONIC, &a);
+ for (p = ptr; p < ptr + len; p += HPAGE_SIZE) {
+ int64_t pfn;
+
+ pfn = allocate_transhuge(p);
+
+ if (pfn < 0) {
+ nr_failed++;
+ } else {
+ size_t idx = pfn >> (HPAGE_SHIFT - PAGE_SHIFT);
+
+ nr_succeed++;
+ if (idx >= map_len) {
+ map = realloc(map, idx + 1);
+ if (!map)
+ errx(2, "map realloc");
+ memset(map + map_len, 0, idx + 1 - map_len);
+ map_len = idx + 1;
+ }
+ if (!map[idx])
+ nr_pages++;
+ map[idx] = 1;
+ }
+
+ /* split transhuge page, keep last page */
+ if (madvise(p, HPAGE_SIZE - PAGE_SIZE, MADV_DONTNEED))
+ err(2, "MADV_DONTNEED");
+ }
+ clock_gettime(CLOCK_MONOTONIC, &b);
+ s = b.tv_sec - a.tv_sec + (b.tv_nsec - a.tv_nsec) / 1000000000.;
+
+ warnx("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t"
+ "%4d succeed, %4d failed, %4d different pages",
+ s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20),
+ nr_succeed, nr_failed, nr_pages);
+ }
+}
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
new file mode 100644
index 000000000..b2c7043c0
--- /dev/null
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -0,0 +1,1333 @@
+/*
+ * Stress userfaultfd syscall.
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * This test allocates two virtual areas and bounces the physical
+ * memory across the two virtual areas (from area_src to area_dst)
+ * using userfaultfd.
+ *
+ * There are three threads running per CPU:
+ *
+ * 1) one per-CPU thread takes a per-page pthread_mutex in a random
+ * page of the area_dst (while the physical page may still be in
+ * area_src), and increments a per-page counter in the same page,
+ * and checks its value against a verification region.
+ *
+ * 2) another per-CPU thread handles the userfaults generated by
+ * thread 1 above. userfaultfd blocking reads or poll() modes are
+ * exercised interleaved.
+ *
+ * 3) one last per-CPU thread transfers the memory in the background
+ * at maximum bandwidth (if not already transferred by thread
+ * 2). Each cpu thread takes cares of transferring a portion of the
+ * area.
+ *
+ * When all threads of type 3 completed the transfer, one bounce is
+ * complete. area_src and area_dst are then swapped. All threads are
+ * respawned and so the bounce is immediately restarted in the
+ * opposite direction.
+ *
+ * per-CPU threads 1 by triggering userfaults inside
+ * pthread_mutex_lock will also verify the atomicity of the memory
+ * transfer (UFFDIO_COPY).
+ *
+ * The program takes two parameters: the amounts of physical memory in
+ * megabytes (MiB) of the area and the number of bounces to execute.
+ *
+ * # 100MiB 99999 bounces
+ * ./userfaultfd 100 99999
+ *
+ * # 1GiB 99 bounces
+ * ./userfaultfd 1000 99
+ *
+ * # 10MiB-~6GiB 999 bounces, continue forever unless an error triggers
+ * while ./userfaultfd $[RANDOM % 6000 + 10] 999; do true; done
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <signal.h>
+#include <poll.h>
+#include <string.h>
+#include <linux/mman.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/wait.h>
+#include <pthread.h>
+#include <linux/userfaultfd.h>
+#include <setjmp.h>
+#include <stdbool.h>
+
+#include "../kselftest.h"
+
+#ifdef __NR_userfaultfd
+
+static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
+
+#define BOUNCE_RANDOM (1<<0)
+#define BOUNCE_RACINGFAULTS (1<<1)
+#define BOUNCE_VERIFY (1<<2)
+#define BOUNCE_POLL (1<<3)
+static int bounces;
+
+#define TEST_ANON 1
+#define TEST_HUGETLB 2
+#define TEST_SHMEM 3
+static int test_type;
+
+/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
+#define ALARM_INTERVAL_SECS 10
+static volatile bool test_uffdio_copy_eexist = true;
+static volatile bool test_uffdio_zeropage_eexist = true;
+
+static bool map_shared;
+static int huge_fd;
+static char *huge_fd_off0;
+static unsigned long long *count_verify;
+static int uffd, uffd_flags, finished, *pipefd;
+static char *area_src, *area_src_alias, *area_dst, *area_dst_alias;
+static char *zeropage;
+pthread_attr_t attr;
+
+/* pthread_mutex_t starts at page offset 0 */
+#define area_mutex(___area, ___nr) \
+ ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
+/*
+ * count is placed in the page after pthread_mutex_t naturally aligned
+ * to avoid non alignment faults on non-x86 archs.
+ */
+#define area_count(___area, ___nr) \
+ ((volatile unsigned long long *) ((unsigned long) \
+ ((___area) + (___nr)*page_size + \
+ sizeof(pthread_mutex_t) + \
+ sizeof(unsigned long long) - 1) & \
+ ~(unsigned long)(sizeof(unsigned long long) \
+ - 1)))
+
+static int anon_release_pages(char *rel_area)
+{
+ int ret = 0;
+
+ if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) {
+ perror("madvise");
+ ret = 1;
+ }
+
+ return ret;
+}
+
+static void anon_allocate_area(void **alloc_area)
+{
+ *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (*alloc_area == MAP_FAILED) {
+ fprintf(stderr, "mmap of anonymous memory failed");
+ *alloc_area = NULL;
+ }
+}
+
+static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
+{
+}
+
+/* HugeTLB memory */
+static int hugetlb_release_pages(char *rel_area)
+{
+ int ret = 0;
+
+ if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ rel_area == huge_fd_off0 ? 0 :
+ nr_pages * page_size,
+ nr_pages * page_size)) {
+ perror("fallocate");
+ ret = 1;
+ }
+
+ return ret;
+}
+
+
+static void hugetlb_allocate_area(void **alloc_area)
+{
+ void *area_alias = NULL;
+ char **alloc_area_alias;
+ *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+ (map_shared ? MAP_SHARED : MAP_PRIVATE) |
+ MAP_HUGETLB,
+ huge_fd, *alloc_area == area_src ? 0 :
+ nr_pages * page_size);
+ if (*alloc_area == MAP_FAILED) {
+ fprintf(stderr, "mmap of hugetlbfs file failed\n");
+ *alloc_area = NULL;
+ }
+
+ if (map_shared) {
+ area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_HUGETLB,
+ huge_fd, *alloc_area == area_src ? 0 :
+ nr_pages * page_size);
+ if (area_alias == MAP_FAILED) {
+ if (munmap(*alloc_area, nr_pages * page_size) < 0)
+ perror("hugetlb munmap"), exit(1);
+ *alloc_area = NULL;
+ return;
+ }
+ }
+ if (*alloc_area == area_src) {
+ huge_fd_off0 = *alloc_area;
+ alloc_area_alias = &area_src_alias;
+ } else {
+ alloc_area_alias = &area_dst_alias;
+ }
+ if (area_alias)
+ *alloc_area_alias = area_alias;
+}
+
+static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
+{
+ if (!map_shared)
+ return;
+ /*
+ * We can't zap just the pagetable with hugetlbfs because
+ * MADV_DONTEED won't work. So exercise -EEXIST on a alias
+ * mapping where the pagetables are not established initially,
+ * this way we'll exercise the -EEXEC at the fs level.
+ */
+ *start = (unsigned long) area_dst_alias + offset;
+}
+
+/* Shared memory */
+static int shmem_release_pages(char *rel_area)
+{
+ int ret = 0;
+
+ if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) {
+ perror("madvise");
+ ret = 1;
+ }
+
+ return ret;
+}
+
+static void shmem_allocate_area(void **alloc_area)
+{
+ *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+ if (*alloc_area == MAP_FAILED) {
+ fprintf(stderr, "shared memory mmap failed\n");
+ *alloc_area = NULL;
+ }
+}
+
+struct uffd_test_ops {
+ unsigned long expected_ioctls;
+ void (*allocate_area)(void **alloc_area);
+ int (*release_pages)(char *rel_area);
+ void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
+};
+
+#define ANON_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \
+ (1 << _UFFDIO_COPY) | \
+ (1 << _UFFDIO_ZEROPAGE))
+
+static struct uffd_test_ops anon_uffd_test_ops = {
+ .expected_ioctls = ANON_EXPECTED_IOCTLS,
+ .allocate_area = anon_allocate_area,
+ .release_pages = anon_release_pages,
+ .alias_mapping = noop_alias_mapping,
+};
+
+static struct uffd_test_ops shmem_uffd_test_ops = {
+ .expected_ioctls = ANON_EXPECTED_IOCTLS,
+ .allocate_area = shmem_allocate_area,
+ .release_pages = shmem_release_pages,
+ .alias_mapping = noop_alias_mapping,
+};
+
+static struct uffd_test_ops hugetlb_uffd_test_ops = {
+ .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC,
+ .allocate_area = hugetlb_allocate_area,
+ .release_pages = hugetlb_release_pages,
+ .alias_mapping = hugetlb_alias_mapping,
+};
+
+static struct uffd_test_ops *uffd_test_ops;
+
+static int my_bcmp(char *str1, char *str2, size_t n)
+{
+ unsigned long i;
+ for (i = 0; i < n; i++)
+ if (str1[i] != str2[i])
+ return 1;
+ return 0;
+}
+
+static void *locking_thread(void *arg)
+{
+ unsigned long cpu = (unsigned long) arg;
+ struct random_data rand;
+ unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
+ int32_t rand_nr;
+ unsigned long long count;
+ char randstate[64];
+ unsigned int seed;
+ time_t start;
+
+ if (bounces & BOUNCE_RANDOM) {
+ seed = (unsigned int) time(NULL) - bounces;
+ if (!(bounces & BOUNCE_RACINGFAULTS))
+ seed += cpu;
+ bzero(&rand, sizeof(rand));
+ bzero(&randstate, sizeof(randstate));
+ if (initstate_r(seed, randstate, sizeof(randstate), &rand))
+ fprintf(stderr, "srandom_r error\n"), exit(1);
+ } else {
+ page_nr = -bounces;
+ if (!(bounces & BOUNCE_RACINGFAULTS))
+ page_nr += cpu * nr_pages_per_cpu;
+ }
+
+ while (!finished) {
+ if (bounces & BOUNCE_RANDOM) {
+ if (random_r(&rand, &rand_nr))
+ fprintf(stderr, "random_r 1 error\n"), exit(1);
+ page_nr = rand_nr;
+ if (sizeof(page_nr) > sizeof(rand_nr)) {
+ if (random_r(&rand, &rand_nr))
+ fprintf(stderr, "random_r 2 error\n"), exit(1);
+ page_nr |= (((unsigned long) rand_nr) << 16) <<
+ 16;
+ }
+ } else
+ page_nr += 1;
+ page_nr %= nr_pages;
+
+ start = time(NULL);
+ if (bounces & BOUNCE_VERIFY) {
+ count = *area_count(area_dst, page_nr);
+ if (!count)
+ fprintf(stderr,
+ "page_nr %lu wrong count %Lu %Lu\n",
+ page_nr, count,
+ count_verify[page_nr]), exit(1);
+
+
+ /*
+ * We can't use bcmp (or memcmp) because that
+ * returns 0 erroneously if the memory is
+ * changing under it (even if the end of the
+ * page is never changing and always
+ * different).
+ */
+#if 1
+ if (!my_bcmp(area_dst + page_nr * page_size, zeropage,
+ page_size))
+ fprintf(stderr,
+ "my_bcmp page_nr %lu wrong count %Lu %Lu\n",
+ page_nr, count,
+ count_verify[page_nr]), exit(1);
+#else
+ unsigned long loops;
+
+ loops = 0;
+ /* uncomment the below line to test with mutex */
+ /* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */
+ while (!bcmp(area_dst + page_nr * page_size, zeropage,
+ page_size)) {
+ loops += 1;
+ if (loops > 10)
+ break;
+ }
+ /* uncomment below line to test with mutex */
+ /* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */
+ if (loops) {
+ fprintf(stderr,
+ "page_nr %lu all zero thread %lu %p %lu\n",
+ page_nr, cpu, area_dst + page_nr * page_size,
+ loops);
+ if (loops > 10)
+ exit(1);
+ }
+#endif
+ }
+
+ pthread_mutex_lock(area_mutex(area_dst, page_nr));
+ count = *area_count(area_dst, page_nr);
+ if (count != count_verify[page_nr]) {
+ fprintf(stderr,
+ "page_nr %lu memory corruption %Lu %Lu\n",
+ page_nr, count,
+ count_verify[page_nr]), exit(1);
+ }
+ count++;
+ *area_count(area_dst, page_nr) = count_verify[page_nr] = count;
+ pthread_mutex_unlock(area_mutex(area_dst, page_nr));
+
+ if (time(NULL) - start > 1)
+ fprintf(stderr,
+ "userfault too slow %ld "
+ "possible false positive with overcommit\n",
+ time(NULL) - start);
+ }
+
+ return NULL;
+}
+
+static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
+ unsigned long offset)
+{
+ uffd_test_ops->alias_mapping(&uffdio_copy->dst,
+ uffdio_copy->len,
+ offset);
+ if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
+ /* real retval in ufdio_copy.copy */
+ if (uffdio_copy->copy != -EEXIST)
+ fprintf(stderr, "UFFDIO_COPY retry error %Ld\n",
+ uffdio_copy->copy), exit(1);
+ } else {
+ fprintf(stderr, "UFFDIO_COPY retry unexpected %Ld\n",
+ uffdio_copy->copy), exit(1);
+ }
+}
+
+static int __copy_page(int ufd, unsigned long offset, bool retry)
+{
+ struct uffdio_copy uffdio_copy;
+
+ if (offset >= nr_pages * page_size)
+ fprintf(stderr, "unexpected offset %lu\n",
+ offset), exit(1);
+ uffdio_copy.dst = (unsigned long) area_dst + offset;
+ uffdio_copy.src = (unsigned long) area_src + offset;
+ uffdio_copy.len = page_size;
+ uffdio_copy.mode = 0;
+ uffdio_copy.copy = 0;
+ if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
+ /* real retval in ufdio_copy.copy */
+ if (uffdio_copy.copy != -EEXIST)
+ fprintf(stderr, "UFFDIO_COPY error %Ld\n",
+ uffdio_copy.copy), exit(1);
+ } else if (uffdio_copy.copy != page_size) {
+ fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n",
+ uffdio_copy.copy), exit(1);
+ } else {
+ if (test_uffdio_copy_eexist && retry) {
+ test_uffdio_copy_eexist = false;
+ retry_copy_page(ufd, &uffdio_copy, offset);
+ }
+ return 1;
+ }
+ return 0;
+}
+
+static int copy_page_retry(int ufd, unsigned long offset)
+{
+ return __copy_page(ufd, offset, true);
+}
+
+static int copy_page(int ufd, unsigned long offset)
+{
+ return __copy_page(ufd, offset, false);
+}
+
+static void *uffd_poll_thread(void *arg)
+{
+ unsigned long cpu = (unsigned long) arg;
+ struct pollfd pollfd[2];
+ struct uffd_msg msg;
+ struct uffdio_register uffd_reg;
+ int ret;
+ unsigned long offset;
+ char tmp_chr;
+ unsigned long userfaults = 0;
+
+ pollfd[0].fd = uffd;
+ pollfd[0].events = POLLIN;
+ pollfd[1].fd = pipefd[cpu*2];
+ pollfd[1].events = POLLIN;
+
+ for (;;) {
+ ret = poll(pollfd, 2, -1);
+ if (!ret)
+ fprintf(stderr, "poll error %d\n", ret), exit(1);
+ if (ret < 0)
+ perror("poll"), exit(1);
+ if (pollfd[1].revents & POLLIN) {
+ if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
+ fprintf(stderr, "read pipefd error\n"),
+ exit(1);
+ break;
+ }
+ if (!(pollfd[0].revents & POLLIN))
+ fprintf(stderr, "pollfd[0].revents %d\n",
+ pollfd[0].revents), exit(1);
+ ret = read(uffd, &msg, sizeof(msg));
+ if (ret < 0) {
+ if (errno == EAGAIN)
+ continue;
+ perror("nonblocking read error"), exit(1);
+ }
+ switch (msg.event) {
+ default:
+ fprintf(stderr, "unexpected msg event %u\n",
+ msg.event), exit(1);
+ break;
+ case UFFD_EVENT_PAGEFAULT:
+ if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
+ fprintf(stderr, "unexpected write fault\n"), exit(1);
+ offset = (char *)(unsigned long)msg.arg.pagefault.address -
+ area_dst;
+ offset &= ~(page_size-1);
+ if (copy_page(uffd, offset))
+ userfaults++;
+ break;
+ case UFFD_EVENT_FORK:
+ close(uffd);
+ uffd = msg.arg.fork.ufd;
+ pollfd[0].fd = uffd;
+ break;
+ case UFFD_EVENT_REMOVE:
+ uffd_reg.range.start = msg.arg.remove.start;
+ uffd_reg.range.len = msg.arg.remove.end -
+ msg.arg.remove.start;
+ if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
+ fprintf(stderr, "remove failure\n"), exit(1);
+ break;
+ case UFFD_EVENT_REMAP:
+ area_dst = (char *)(unsigned long)msg.arg.remap.to;
+ break;
+ }
+ }
+ return (void *)userfaults;
+}
+
+pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static void *uffd_read_thread(void *arg)
+{
+ unsigned long *this_cpu_userfaults;
+ struct uffd_msg msg;
+ unsigned long offset;
+ int ret;
+
+ this_cpu_userfaults = (unsigned long *) arg;
+ *this_cpu_userfaults = 0;
+
+ pthread_mutex_unlock(&uffd_read_mutex);
+ /* from here cancellation is ok */
+
+ for (;;) {
+ ret = read(uffd, &msg, sizeof(msg));
+ if (ret != sizeof(msg)) {
+ if (ret < 0)
+ perror("blocking read error"), exit(1);
+ else
+ fprintf(stderr, "short read\n"), exit(1);
+ }
+ if (msg.event != UFFD_EVENT_PAGEFAULT)
+ fprintf(stderr, "unexpected msg event %u\n",
+ msg.event), exit(1);
+ if (bounces & BOUNCE_VERIFY &&
+ msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
+ fprintf(stderr, "unexpected write fault\n"), exit(1);
+ offset = (char *)(unsigned long)msg.arg.pagefault.address -
+ area_dst;
+ offset &= ~(page_size-1);
+ if (copy_page(uffd, offset))
+ (*this_cpu_userfaults)++;
+ }
+ return (void *)NULL;
+}
+
+static void *background_thread(void *arg)
+{
+ unsigned long cpu = (unsigned long) arg;
+ unsigned long page_nr;
+
+ for (page_nr = cpu * nr_pages_per_cpu;
+ page_nr < (cpu+1) * nr_pages_per_cpu;
+ page_nr++)
+ copy_page_retry(uffd, page_nr * page_size);
+
+ return NULL;
+}
+
+static int stress(unsigned long *userfaults)
+{
+ unsigned long cpu;
+ pthread_t locking_threads[nr_cpus];
+ pthread_t uffd_threads[nr_cpus];
+ pthread_t background_threads[nr_cpus];
+ void **_userfaults = (void **) userfaults;
+
+ finished = 0;
+ for (cpu = 0; cpu < nr_cpus; cpu++) {
+ if (pthread_create(&locking_threads[cpu], &attr,
+ locking_thread, (void *)cpu))
+ return 1;
+ if (bounces & BOUNCE_POLL) {
+ if (pthread_create(&uffd_threads[cpu], &attr,
+ uffd_poll_thread, (void *)cpu))
+ return 1;
+ } else {
+ if (pthread_create(&uffd_threads[cpu], &attr,
+ uffd_read_thread,
+ &_userfaults[cpu]))
+ return 1;
+ pthread_mutex_lock(&uffd_read_mutex);
+ }
+ if (pthread_create(&background_threads[cpu], &attr,
+ background_thread, (void *)cpu))
+ return 1;
+ }
+ for (cpu = 0; cpu < nr_cpus; cpu++)
+ if (pthread_join(background_threads[cpu], NULL))
+ return 1;
+
+ /*
+ * Be strict and immediately zap area_src, the whole area has
+ * been transferred already by the background treads. The
+ * area_src could then be faulted in in a racy way by still
+ * running uffdio_threads reading zeropages after we zapped
+ * area_src (but they're guaranteed to get -EEXIST from
+ * UFFDIO_COPY without writing zero pages into area_dst
+ * because the background threads already completed).
+ */
+ if (uffd_test_ops->release_pages(area_src))
+ return 1;
+
+ for (cpu = 0; cpu < nr_cpus; cpu++) {
+ char c;
+ if (bounces & BOUNCE_POLL) {
+ if (write(pipefd[cpu*2+1], &c, 1) != 1) {
+ fprintf(stderr, "pipefd write error\n");
+ return 1;
+ }
+ if (pthread_join(uffd_threads[cpu], &_userfaults[cpu]))
+ return 1;
+ } else {
+ if (pthread_cancel(uffd_threads[cpu]))
+ return 1;
+ if (pthread_join(uffd_threads[cpu], NULL))
+ return 1;
+ }
+ }
+
+ finished = 1;
+ for (cpu = 0; cpu < nr_cpus; cpu++)
+ if (pthread_join(locking_threads[cpu], NULL))
+ return 1;
+
+ return 0;
+}
+
+static int userfaultfd_open(int features)
+{
+ struct uffdio_api uffdio_api;
+
+ uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+ if (uffd < 0) {
+ fprintf(stderr,
+ "userfaultfd syscall not available in this kernel\n");
+ return 1;
+ }
+ uffd_flags = fcntl(uffd, F_GETFD, NULL);
+
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = features;
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
+ fprintf(stderr, "UFFDIO_API\n");
+ return 1;
+ }
+ if (uffdio_api.api != UFFD_API) {
+ fprintf(stderr, "UFFDIO_API error %Lu\n", uffdio_api.api);
+ return 1;
+ }
+
+ return 0;
+}
+
+sigjmp_buf jbuf, *sigbuf;
+
+static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
+{
+ if (sig == SIGBUS) {
+ if (sigbuf)
+ siglongjmp(*sigbuf, 1);
+ abort();
+ }
+}
+
+/*
+ * For non-cooperative userfaultfd test we fork() a process that will
+ * generate pagefaults, will mremap the area monitored by the
+ * userfaultfd and at last this process will release the monitored
+ * area.
+ * For the anonymous and shared memory the area is divided into two
+ * parts, the first part is accessed before mremap, and the second
+ * part is accessed after mremap. Since hugetlbfs does not support
+ * mremap, the entire monitored area is accessed in a single pass for
+ * HUGETLB_TEST.
+ * The release of the pages currently generates event for shmem and
+ * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
+ * for hugetlb.
+ * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register
+ * monitored area, generate pagefaults and test that signal is delivered.
+ * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2
+ * test robustness use case - we release monitored area, fork a process
+ * that will generate pagefaults and verify signal is generated.
+ * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal
+ * feature. Using monitor thread, verify no userfault events are generated.
+ */
+static int faulting_process(int signal_test)
+{
+ unsigned long nr;
+ unsigned long long count;
+ unsigned long split_nr_pages;
+ unsigned long lastnr;
+ struct sigaction act;
+ unsigned long signalled = 0;
+
+ if (test_type != TEST_HUGETLB)
+ split_nr_pages = (nr_pages + 1) / 2;
+ else
+ split_nr_pages = nr_pages;
+
+ if (signal_test) {
+ sigbuf = &jbuf;
+ memset(&act, 0, sizeof(act));
+ act.sa_sigaction = sighndl;
+ act.sa_flags = SA_SIGINFO;
+ if (sigaction(SIGBUS, &act, 0)) {
+ perror("sigaction");
+ return 1;
+ }
+ lastnr = (unsigned long)-1;
+ }
+
+ for (nr = 0; nr < split_nr_pages; nr++) {
+ if (signal_test) {
+ if (sigsetjmp(*sigbuf, 1) != 0) {
+ if (nr == lastnr) {
+ fprintf(stderr, "Signal repeated\n");
+ return 1;
+ }
+
+ lastnr = nr;
+ if (signal_test == 1) {
+ if (copy_page(uffd, nr * page_size))
+ signalled++;
+ } else {
+ signalled++;
+ continue;
+ }
+ }
+ }
+
+ count = *area_count(area_dst, nr);
+ if (count != count_verify[nr]) {
+ fprintf(stderr,
+ "nr %lu memory corruption %Lu %Lu\n",
+ nr, count,
+ count_verify[nr]), exit(1);
+ }
+ }
+
+ if (signal_test)
+ return signalled != split_nr_pages;
+
+ if (test_type == TEST_HUGETLB)
+ return 0;
+
+ area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
+ if (area_dst == MAP_FAILED)
+ perror("mremap"), exit(1);
+
+ for (; nr < nr_pages; nr++) {
+ count = *area_count(area_dst, nr);
+ if (count != count_verify[nr]) {
+ fprintf(stderr,
+ "nr %lu memory corruption %Lu %Lu\n",
+ nr, count,
+ count_verify[nr]), exit(1);
+ }
+ }
+
+ if (uffd_test_ops->release_pages(area_dst))
+ return 1;
+
+ for (nr = 0; nr < nr_pages; nr++) {
+ if (my_bcmp(area_dst + nr * page_size, zeropage, page_size))
+ fprintf(stderr, "nr %lu is not zero\n", nr), exit(1);
+ }
+
+ return 0;
+}
+
+static void retry_uffdio_zeropage(int ufd,
+ struct uffdio_zeropage *uffdio_zeropage,
+ unsigned long offset)
+{
+ uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
+ uffdio_zeropage->range.len,
+ offset);
+ if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
+ if (uffdio_zeropage->zeropage != -EEXIST)
+ fprintf(stderr, "UFFDIO_ZEROPAGE retry error %Ld\n",
+ uffdio_zeropage->zeropage), exit(1);
+ } else {
+ fprintf(stderr, "UFFDIO_ZEROPAGE retry unexpected %Ld\n",
+ uffdio_zeropage->zeropage), exit(1);
+ }
+}
+
+static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
+{
+ struct uffdio_zeropage uffdio_zeropage;
+ int ret;
+ unsigned long has_zeropage;
+
+ has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE);
+
+ if (offset >= nr_pages * page_size)
+ fprintf(stderr, "unexpected offset %lu\n",
+ offset), exit(1);
+ uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
+ uffdio_zeropage.range.len = page_size;
+ uffdio_zeropage.mode = 0;
+ ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
+ if (ret) {
+ /* real retval in ufdio_zeropage.zeropage */
+ if (has_zeropage) {
+ if (uffdio_zeropage.zeropage == -EEXIST)
+ fprintf(stderr, "UFFDIO_ZEROPAGE -EEXIST\n"),
+ exit(1);
+ else
+ fprintf(stderr, "UFFDIO_ZEROPAGE error %Ld\n",
+ uffdio_zeropage.zeropage), exit(1);
+ } else {
+ if (uffdio_zeropage.zeropage != -EINVAL)
+ fprintf(stderr,
+ "UFFDIO_ZEROPAGE not -EINVAL %Ld\n",
+ uffdio_zeropage.zeropage), exit(1);
+ }
+ } else if (has_zeropage) {
+ if (uffdio_zeropage.zeropage != page_size) {
+ fprintf(stderr, "UFFDIO_ZEROPAGE unexpected %Ld\n",
+ uffdio_zeropage.zeropage), exit(1);
+ } else {
+ if (test_uffdio_zeropage_eexist && retry) {
+ test_uffdio_zeropage_eexist = false;
+ retry_uffdio_zeropage(ufd, &uffdio_zeropage,
+ offset);
+ }
+ return 1;
+ }
+ } else {
+ fprintf(stderr,
+ "UFFDIO_ZEROPAGE succeeded %Ld\n",
+ uffdio_zeropage.zeropage), exit(1);
+ }
+
+ return 0;
+}
+
+static int uffdio_zeropage(int ufd, unsigned long offset)
+{
+ return __uffdio_zeropage(ufd, offset, false);
+}
+
+/* exercise UFFDIO_ZEROPAGE */
+static int userfaultfd_zeropage_test(void)
+{
+ struct uffdio_register uffdio_register;
+ unsigned long expected_ioctls;
+
+ printf("testing UFFDIO_ZEROPAGE: ");
+ fflush(stdout);
+
+ if (uffd_test_ops->release_pages(area_dst))
+ return 1;
+
+ if (userfaultfd_open(0) < 0)
+ return 1;
+ uffdio_register.range.start = (unsigned long) area_dst;
+ uffdio_register.range.len = nr_pages * page_size;
+ uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+ fprintf(stderr, "register failure\n"), exit(1);
+
+ expected_ioctls = uffd_test_ops->expected_ioctls;
+ if ((uffdio_register.ioctls & expected_ioctls) !=
+ expected_ioctls)
+ fprintf(stderr,
+ "unexpected missing ioctl for anon memory\n"),
+ exit(1);
+
+ if (uffdio_zeropage(uffd, 0)) {
+ if (my_bcmp(area_dst, zeropage, page_size))
+ fprintf(stderr, "zeropage is not zero\n"), exit(1);
+ }
+
+ close(uffd);
+ printf("done.\n");
+ return 0;
+}
+
+static int userfaultfd_events_test(void)
+{
+ struct uffdio_register uffdio_register;
+ unsigned long expected_ioctls;
+ unsigned long userfaults;
+ pthread_t uffd_mon;
+ int err, features;
+ pid_t pid;
+ char c;
+
+ printf("testing events (fork, remap, remove): ");
+ fflush(stdout);
+
+ if (uffd_test_ops->release_pages(area_dst))
+ return 1;
+
+ features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
+ UFFD_FEATURE_EVENT_REMOVE;
+ if (userfaultfd_open(features) < 0)
+ return 1;
+ fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+
+ uffdio_register.range.start = (unsigned long) area_dst;
+ uffdio_register.range.len = nr_pages * page_size;
+ uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+ fprintf(stderr, "register failure\n"), exit(1);
+
+ expected_ioctls = uffd_test_ops->expected_ioctls;
+ if ((uffdio_register.ioctls & expected_ioctls) !=
+ expected_ioctls)
+ fprintf(stderr,
+ "unexpected missing ioctl for anon memory\n"),
+ exit(1);
+
+ if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, NULL))
+ perror("uffd_poll_thread create"), exit(1);
+
+ pid = fork();
+ if (pid < 0)
+ perror("fork"), exit(1);
+
+ if (!pid)
+ return faulting_process(0);
+
+ waitpid(pid, &err, 0);
+ if (err)
+ fprintf(stderr, "faulting process failed\n"), exit(1);
+
+ if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
+ perror("pipe write"), exit(1);
+ if (pthread_join(uffd_mon, (void **)&userfaults))
+ return 1;
+
+ close(uffd);
+ printf("userfaults: %ld\n", userfaults);
+
+ return userfaults != nr_pages;
+}
+
+static int userfaultfd_sig_test(void)
+{
+ struct uffdio_register uffdio_register;
+ unsigned long expected_ioctls;
+ unsigned long userfaults;
+ pthread_t uffd_mon;
+ int err, features;
+ pid_t pid;
+ char c;
+
+ printf("testing signal delivery: ");
+ fflush(stdout);
+
+ if (uffd_test_ops->release_pages(area_dst))
+ return 1;
+
+ features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
+ if (userfaultfd_open(features) < 0)
+ return 1;
+ fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+
+ uffdio_register.range.start = (unsigned long) area_dst;
+ uffdio_register.range.len = nr_pages * page_size;
+ uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+ fprintf(stderr, "register failure\n"), exit(1);
+
+ expected_ioctls = uffd_test_ops->expected_ioctls;
+ if ((uffdio_register.ioctls & expected_ioctls) !=
+ expected_ioctls)
+ fprintf(stderr,
+ "unexpected missing ioctl for anon memory\n"),
+ exit(1);
+
+ if (faulting_process(1))
+ fprintf(stderr, "faulting process failed\n"), exit(1);
+
+ if (uffd_test_ops->release_pages(area_dst))
+ return 1;
+
+ if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, NULL))
+ perror("uffd_poll_thread create"), exit(1);
+
+ pid = fork();
+ if (pid < 0)
+ perror("fork"), exit(1);
+
+ if (!pid)
+ exit(faulting_process(2));
+
+ waitpid(pid, &err, 0);
+ if (err)
+ fprintf(stderr, "faulting process failed\n"), exit(1);
+
+ if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
+ perror("pipe write"), exit(1);
+ if (pthread_join(uffd_mon, (void **)&userfaults))
+ return 1;
+
+ printf("done.\n");
+ if (userfaults)
+ fprintf(stderr, "Signal test failed, userfaults: %ld\n",
+ userfaults);
+ close(uffd);
+ return userfaults != 0;
+}
+static int userfaultfd_stress(void)
+{
+ void *area;
+ char *tmp_area;
+ unsigned long nr;
+ struct uffdio_register uffdio_register;
+ unsigned long cpu;
+ int err;
+ unsigned long userfaults[nr_cpus];
+
+ uffd_test_ops->allocate_area((void **)&area_src);
+ if (!area_src)
+ return 1;
+ uffd_test_ops->allocate_area((void **)&area_dst);
+ if (!area_dst)
+ return 1;
+
+ if (userfaultfd_open(0) < 0)
+ return 1;
+
+ count_verify = malloc(nr_pages * sizeof(unsigned long long));
+ if (!count_verify) {
+ perror("count_verify");
+ return 1;
+ }
+
+ for (nr = 0; nr < nr_pages; nr++) {
+ *area_mutex(area_src, nr) = (pthread_mutex_t)
+ PTHREAD_MUTEX_INITIALIZER;
+ count_verify[nr] = *area_count(area_src, nr) = 1;
+ /*
+ * In the transition between 255 to 256, powerpc will
+ * read out of order in my_bcmp and see both bytes as
+ * zero, so leave a placeholder below always non-zero
+ * after the count, to avoid my_bcmp to trigger false
+ * positives.
+ */
+ *(area_count(area_src, nr) + 1) = 1;
+ }
+
+ pipefd = malloc(sizeof(int) * nr_cpus * 2);
+ if (!pipefd) {
+ perror("pipefd");
+ return 1;
+ }
+ for (cpu = 0; cpu < nr_cpus; cpu++) {
+ if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) {
+ perror("pipe");
+ return 1;
+ }
+ }
+
+ if (posix_memalign(&area, page_size, page_size)) {
+ fprintf(stderr, "out of memory\n");
+ return 1;
+ }
+ zeropage = area;
+ bzero(zeropage, page_size);
+
+ pthread_mutex_lock(&uffd_read_mutex);
+
+ pthread_attr_init(&attr);
+ pthread_attr_setstacksize(&attr, 16*1024*1024);
+
+ err = 0;
+ while (bounces--) {
+ unsigned long expected_ioctls;
+
+ printf("bounces: %d, mode:", bounces);
+ if (bounces & BOUNCE_RANDOM)
+ printf(" rnd");
+ if (bounces & BOUNCE_RACINGFAULTS)
+ printf(" racing");
+ if (bounces & BOUNCE_VERIFY)
+ printf(" ver");
+ if (bounces & BOUNCE_POLL)
+ printf(" poll");
+ printf(", ");
+ fflush(stdout);
+
+ if (bounces & BOUNCE_POLL)
+ fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+ else
+ fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
+
+ /* register */
+ uffdio_register.range.start = (unsigned long) area_dst;
+ uffdio_register.range.len = nr_pages * page_size;
+ uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
+ fprintf(stderr, "register failure\n");
+ return 1;
+ }
+ expected_ioctls = uffd_test_ops->expected_ioctls;
+ if ((uffdio_register.ioctls & expected_ioctls) !=
+ expected_ioctls) {
+ fprintf(stderr,
+ "unexpected missing ioctl for anon memory\n");
+ return 1;
+ }
+
+ if (area_dst_alias) {
+ uffdio_register.range.start = (unsigned long)
+ area_dst_alias;
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
+ fprintf(stderr, "register failure alias\n");
+ return 1;
+ }
+ }
+
+ /*
+ * The madvise done previously isn't enough: some
+ * uffd_thread could have read userfaults (one of
+ * those already resolved by the background thread)
+ * and it may be in the process of calling
+ * UFFDIO_COPY. UFFDIO_COPY will read the zapped
+ * area_src and it would map a zero page in it (of
+ * course such a UFFDIO_COPY is perfectly safe as it'd
+ * return -EEXIST). The problem comes at the next
+ * bounce though: that racing UFFDIO_COPY would
+ * generate zeropages in the area_src, so invalidating
+ * the previous MADV_DONTNEED. Without this additional
+ * MADV_DONTNEED those zeropages leftovers in the
+ * area_src would lead to -EEXIST failure during the
+ * next bounce, effectively leaving a zeropage in the
+ * area_dst.
+ *
+ * Try to comment this out madvise to see the memory
+ * corruption being caught pretty quick.
+ *
+ * khugepaged is also inhibited to collapse THP after
+ * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
+ * required to MADV_DONTNEED here.
+ */
+ if (uffd_test_ops->release_pages(area_dst))
+ return 1;
+
+ /* bounce pass */
+ if (stress(userfaults))
+ return 1;
+
+ /* unregister */
+ if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
+ fprintf(stderr, "unregister failure\n");
+ return 1;
+ }
+ if (area_dst_alias) {
+ uffdio_register.range.start = (unsigned long) area_dst;
+ if (ioctl(uffd, UFFDIO_UNREGISTER,
+ &uffdio_register.range)) {
+ fprintf(stderr, "unregister failure alias\n");
+ return 1;
+ }
+ }
+
+ /* verification */
+ if (bounces & BOUNCE_VERIFY) {
+ for (nr = 0; nr < nr_pages; nr++) {
+ if (*area_count(area_dst, nr) != count_verify[nr]) {
+ fprintf(stderr,
+ "error area_count %Lu %Lu %lu\n",
+ *area_count(area_src, nr),
+ count_verify[nr],
+ nr);
+ err = 1;
+ bounces = 0;
+ }
+ }
+ }
+
+ /* prepare next bounce */
+ tmp_area = area_src;
+ area_src = area_dst;
+ area_dst = tmp_area;
+
+ tmp_area = area_src_alias;
+ area_src_alias = area_dst_alias;
+ area_dst_alias = tmp_area;
+
+ printf("userfaults:");
+ for (cpu = 0; cpu < nr_cpus; cpu++)
+ printf(" %lu", userfaults[cpu]);
+ printf("\n");
+ }
+
+ if (err)
+ return err;
+
+ close(uffd);
+ return userfaultfd_zeropage_test() || userfaultfd_sig_test()
+ || userfaultfd_events_test();
+}
+
+/*
+ * Copied from mlock2-tests.c
+ */
+unsigned long default_huge_page_size(void)
+{
+ unsigned long hps = 0;
+ char *line = NULL;
+ size_t linelen = 0;
+ FILE *f = fopen("/proc/meminfo", "r");
+
+ if (!f)
+ return 0;
+ while (getline(&line, &linelen, f) > 0) {
+ if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) {
+ hps <<= 10;
+ break;
+ }
+ }
+
+ free(line);
+ fclose(f);
+ return hps;
+}
+
+static void set_test_type(const char *type)
+{
+ if (!strcmp(type, "anon")) {
+ test_type = TEST_ANON;
+ uffd_test_ops = &anon_uffd_test_ops;
+ } else if (!strcmp(type, "hugetlb")) {
+ test_type = TEST_HUGETLB;
+ uffd_test_ops = &hugetlb_uffd_test_ops;
+ } else if (!strcmp(type, "hugetlb_shared")) {
+ map_shared = true;
+ test_type = TEST_HUGETLB;
+ uffd_test_ops = &hugetlb_uffd_test_ops;
+ } else if (!strcmp(type, "shmem")) {
+ map_shared = true;
+ test_type = TEST_SHMEM;
+ uffd_test_ops = &shmem_uffd_test_ops;
+ } else {
+ fprintf(stderr, "Unknown test type: %s\n", type), exit(1);
+ }
+
+ if (test_type == TEST_HUGETLB)
+ page_size = default_huge_page_size();
+ else
+ page_size = sysconf(_SC_PAGE_SIZE);
+
+ if (!page_size)
+ fprintf(stderr, "Unable to determine page size\n"),
+ exit(2);
+ if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
+ > page_size)
+ fprintf(stderr, "Impossible to run this test\n"), exit(2);
+}
+
+static void sigalrm(int sig)
+{
+ if (sig != SIGALRM)
+ abort();
+ test_uffdio_copy_eexist = true;
+ test_uffdio_zeropage_eexist = true;
+ alarm(ALARM_INTERVAL_SECS);
+}
+
+int main(int argc, char **argv)
+{
+ if (argc < 4)
+ fprintf(stderr, "Usage: <test type> <MiB> <bounces> [hugetlbfs_file]\n"),
+ exit(1);
+
+ if (signal(SIGALRM, sigalrm) == SIG_ERR)
+ fprintf(stderr, "failed to arm SIGALRM"), exit(1);
+ alarm(ALARM_INTERVAL_SECS);
+
+ set_test_type(argv[1]);
+
+ nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+ nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size /
+ nr_cpus;
+ if (!nr_pages_per_cpu) {
+ fprintf(stderr, "invalid MiB\n");
+ fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
+ }
+
+ bounces = atoi(argv[3]);
+ if (bounces <= 0) {
+ fprintf(stderr, "invalid bounces\n");
+ fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
+ }
+ nr_pages = nr_pages_per_cpu * nr_cpus;
+
+ if (test_type == TEST_HUGETLB) {
+ if (argc < 5)
+ fprintf(stderr, "Usage: hugetlb <MiB> <bounces> <hugetlbfs_file>\n"),
+ exit(1);
+ huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
+ if (huge_fd < 0) {
+ fprintf(stderr, "Open of %s failed", argv[3]);
+ perror("open");
+ exit(1);
+ }
+ if (ftruncate(huge_fd, 0)) {
+ fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]);
+ perror("ftruncate");
+ exit(1);
+ }
+ }
+ printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
+ nr_pages, nr_pages_per_cpu);
+ return userfaultfd_stress();
+}
+
+#else /* __NR_userfaultfd */
+
+#warning "missing __NR_userfaultfd definition"
+
+int main(void)
+{
+ printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
+ return KSFT_SKIP;
+}
+
+#endif /* __NR_userfaultfd */
diff --git a/tools/testing/selftests/vm/va_128TBswitch.c b/tools/testing/selftests/vm/va_128TBswitch.c
new file mode 100644
index 000000000..e7fe734c3
--- /dev/null
+++ b/tools/testing/selftests/vm/va_128TBswitch.c
@@ -0,0 +1,297 @@
+/*
+ *
+ * Authors: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+ * Authors: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <stdio.h>
+#include <sys/mman.h>
+#include <string.h>
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
+#ifdef __powerpc64__
+#define PAGE_SIZE (64 << 10)
+/*
+ * This will work with 16M and 2M hugepage size
+ */
+#define HUGETLB_SIZE (16 << 20)
+#else
+#define PAGE_SIZE (4 << 10)
+#define HUGETLB_SIZE (2 << 20)
+#endif
+
+/*
+ * >= 128TB is the hint addr value we used to select
+ * large address space.
+ */
+#define ADDR_SWITCH_HINT (1UL << 47)
+#define LOW_ADDR ((void *) (1UL << 30))
+#define HIGH_ADDR ((void *) (1UL << 48))
+
+struct testcase {
+ void *addr;
+ unsigned long size;
+ unsigned long flags;
+ const char *msg;
+ unsigned int low_addr_required:1;
+ unsigned int keep_mapped:1;
+};
+
+static struct testcase testcases[] = {
+ {
+ /*
+ * If stack is moved, we could possibly allocate
+ * this at the requested address.
+ */
+ .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
+ .size = PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)",
+ .low_addr_required = 1,
+ },
+ {
+ /*
+ * We should never allocate at the requested address or above it
+ * The len cross the 128TB boundary. Without MAP_FIXED
+ * we will always search in the lower address space.
+ */
+ .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, (2 * PAGE_SIZE))",
+ .low_addr_required = 1,
+ },
+ {
+ /*
+ * Exact mapping at 128TB, the area is free we should get that
+ * even without MAP_FIXED.
+ */
+ .addr = ((void *)(ADDR_SWITCH_HINT)),
+ .size = PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *)(ADDR_SWITCH_HINT),
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+ .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)",
+ },
+ {
+ .addr = NULL,
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(NULL)",
+ .low_addr_required = 1,
+ },
+ {
+ .addr = LOW_ADDR,
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(LOW_ADDR)",
+ .low_addr_required = 1,
+ },
+ {
+ .addr = HIGH_ADDR,
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(HIGH_ADDR)",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = HIGH_ADDR,
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(HIGH_ADDR) again",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = HIGH_ADDR,
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+ .msg = "mmap(HIGH_ADDR, MAP_FIXED)",
+ },
+ {
+ .addr = (void *) -1,
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(-1)",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *) -1,
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(-1) again",
+ },
+ {
+ .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
+ .size = PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)",
+ .low_addr_required = 1,
+ },
+ {
+ .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE),
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2 * PAGE_SIZE)",
+ .low_addr_required = 1,
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE / 2),
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE/2 , 2 * PAGE_SIZE)",
+ .low_addr_required = 1,
+ .keep_mapped = 1,
+ },
+ {
+ .addr = ((void *)(ADDR_SWITCH_HINT)),
+ .size = PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)",
+ },
+ {
+ .addr = (void *)(ADDR_SWITCH_HINT),
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+ .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)",
+ },
+};
+
+static struct testcase hugetlb_testcases[] = {
+ {
+ .addr = NULL,
+ .size = HUGETLB_SIZE,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(NULL, MAP_HUGETLB)",
+ .low_addr_required = 1,
+ },
+ {
+ .addr = LOW_ADDR,
+ .size = HUGETLB_SIZE,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(LOW_ADDR, MAP_HUGETLB)",
+ .low_addr_required = 1,
+ },
+ {
+ .addr = HIGH_ADDR,
+ .size = HUGETLB_SIZE,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(HIGH_ADDR, MAP_HUGETLB)",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = HIGH_ADDR,
+ .size = HUGETLB_SIZE,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = HIGH_ADDR,
+ .size = HUGETLB_SIZE,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+ .msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)",
+ },
+ {
+ .addr = (void *) -1,
+ .size = HUGETLB_SIZE,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(-1, MAP_HUGETLB)",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *) -1,
+ .size = HUGETLB_SIZE,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(-1, MAP_HUGETLB) again",
+ },
+ {
+ .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE),
+ .size = 2 * HUGETLB_SIZE,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2*HUGETLB_SIZE, MAP_HUGETLB)",
+ .low_addr_required = 1,
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *)(ADDR_SWITCH_HINT),
+ .size = 2 * HUGETLB_SIZE,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+ .msg = "mmap(ADDR_SWITCH_HINT , 2*HUGETLB_SIZE, MAP_FIXED | MAP_HUGETLB)",
+ },
+};
+
+static int run_test(struct testcase *test, int count)
+{
+ void *p;
+ int i, ret = 0;
+
+ for (i = 0; i < count; i++) {
+ struct testcase *t = test + i;
+
+ p = mmap(t->addr, t->size, PROT_READ | PROT_WRITE, t->flags, -1, 0);
+
+ printf("%s: %p - ", t->msg, p);
+
+ if (p == MAP_FAILED) {
+ printf("FAILED\n");
+ ret = 1;
+ continue;
+ }
+
+ if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) {
+ printf("FAILED\n");
+ ret = 1;
+ } else {
+ /*
+ * Do a dereference of the address returned so that we catch
+ * bugs in page fault handling
+ */
+ memset(p, 0, t->size);
+ printf("OK\n");
+ }
+ if (!t->keep_mapped)
+ munmap(p, t->size);
+ }
+
+ return ret;
+}
+
+static int supported_arch(void)
+{
+#if defined(__powerpc64__)
+ return 1;
+#elif defined(__x86_64__)
+ return 1;
+#else
+ return 0;
+#endif
+}
+
+int main(int argc, char **argv)
+{
+ int ret;
+
+ if (!supported_arch())
+ return 0;
+
+ ret = run_test(testcases, ARRAY_SIZE(testcases));
+ if (argc == 2 && !strcmp(argv[1], "--run-hugetlb"))
+ ret = run_test(hugetlb_testcases, ARRAY_SIZE(hugetlb_testcases));
+ return ret;
+}
diff --git a/tools/testing/selftests/vm/virtual_address_range.c b/tools/testing/selftests/vm/virtual_address_range.c
new file mode 100644
index 000000000..1830d66a6
--- /dev/null
+++ b/tools/testing/selftests/vm/virtual_address_range.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright 2017, Anshuman Khandual, IBM Corp.
+ * Licensed under GPLv2.
+ *
+ * Works on architectures which support 128TB virtual
+ * address range and beyond.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <sys/time.h>
+
+/*
+ * Maximum address range mapped with a single mmap()
+ * call is little bit more than 16GB. Hence 16GB is
+ * chosen as the single chunk size for address space
+ * mapping.
+ */
+#define MAP_CHUNK_SIZE 17179869184UL /* 16GB */
+
+/*
+ * Address space till 128TB is mapped without any hint
+ * and is enabled by default. Address space beyond 128TB
+ * till 512TB is obtained by passing hint address as the
+ * first argument into mmap() system call.
+ *
+ * The process heap address space is divided into two
+ * different areas one below 128TB and one above 128TB
+ * till it reaches 512TB. One with size 128TB and the
+ * other being 384TB.
+ *
+ * On Arm64 the address space is 256TB and no high mappings
+ * are supported so far.
+ */
+
+#define NR_CHUNKS_128TB 8192UL /* Number of 16GB chunks for 128TB */
+#define NR_CHUNKS_256TB (NR_CHUNKS_128TB * 2UL)
+#define NR_CHUNKS_384TB (NR_CHUNKS_128TB * 3UL)
+
+#define ADDR_MARK_128TB (1UL << 47) /* First address beyond 128TB */
+#define ADDR_MARK_256TB (1UL << 48) /* First address beyond 256TB */
+
+#ifdef __aarch64__
+#define HIGH_ADDR_MARK ADDR_MARK_256TB
+#define HIGH_ADDR_SHIFT 49
+#define NR_CHUNKS_LOW NR_CHUNKS_256TB
+#define NR_CHUNKS_HIGH 0
+#else
+#define HIGH_ADDR_MARK ADDR_MARK_128TB
+#define HIGH_ADDR_SHIFT 48
+#define NR_CHUNKS_LOW NR_CHUNKS_128TB
+#define NR_CHUNKS_HIGH NR_CHUNKS_384TB
+#endif
+
+static char *hind_addr(void)
+{
+ int bits = HIGH_ADDR_SHIFT + rand() % (63 - HIGH_ADDR_SHIFT);
+
+ return (char *) (1UL << bits);
+}
+
+static int validate_addr(char *ptr, int high_addr)
+{
+ unsigned long addr = (unsigned long) ptr;
+
+ if (high_addr) {
+ if (addr < HIGH_ADDR_MARK) {
+ printf("Bad address %lx\n", addr);
+ return 1;
+ }
+ return 0;
+ }
+
+ if (addr > HIGH_ADDR_MARK) {
+ printf("Bad address %lx\n", addr);
+ return 1;
+ }
+ return 0;
+}
+
+static int validate_lower_address_hint(void)
+{
+ char *ptr;
+
+ ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ |
+ PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ if (ptr == MAP_FAILED)
+ return 0;
+
+ return 1;
+}
+
+int main(int argc, char *argv[])
+{
+ char *ptr[NR_CHUNKS_LOW];
+ char *hptr[NR_CHUNKS_HIGH];
+ char *hint;
+ unsigned long i, lchunks, hchunks;
+
+ for (i = 0; i < NR_CHUNKS_LOW; i++) {
+ ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ if (ptr[i] == MAP_FAILED) {
+ if (validate_lower_address_hint())
+ return 1;
+ break;
+ }
+
+ if (validate_addr(ptr[i], 0))
+ return 1;
+ }
+ lchunks = i;
+
+ for (i = 0; i < NR_CHUNKS_HIGH; i++) {
+ hint = hind_addr();
+ hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ if (hptr[i] == MAP_FAILED)
+ break;
+
+ if (validate_addr(hptr[i], 1))
+ return 1;
+ }
+ hchunks = i;
+
+ for (i = 0; i < lchunks; i++)
+ munmap(ptr[i], MAP_CHUNK_SIZE);
+
+ for (i = 0; i < hchunks; i++)
+ munmap(hptr[i], MAP_CHUNK_SIZE);
+
+ return 0;
+}