Adding upstream version 4.19.249.upstream/4.19.249 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-06 01:02:30 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-06 01:02:30 +0000
commit: 76cb841cb886eef6b3bee341a2266c76578724ad (patch)
tree: f5892e5ba6cc11949952a6ce4ecbe6d516d6ce58 /tools/testing/selftests/vm
parent: Initial commit. (diff)
download: linux-upstream.tar.xz
linux-upstream.zip
19 files changed, 4074 insertions, 0 deletions
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
new file mode 100644
index 000000000..af5ff83f6
--- /dev/null
+++ b/tools/testing/selftests/vm/.gitignore
@@ -0,0 +1,15 @@
+hugepage-mmap
+hugepage-shm
+map_hugetlb
+map_populate
+thuge-gen
+compaction_test
+mlock2-tests
+on-fault-limit
+transhuge-stress
+userfaultfd
+mlock-intersect-test
+mlock-random-test
+virtual_address_range
+gup_benchmark
+va_128TBswitch
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
new file mode 100644
index 000000000..2cf3dc49b
--- /dev/null
+++ b/tools/testing/selftests/vm/Makefile
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for vm selftests
+
+ifndef OUTPUT
+  OUTPUT := $(shell pwd)
+endif
+
+CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS)
+LDLIBS = -lrt
+TEST_GEN_FILES = compaction_test
+TEST_GEN_FILES += gup_benchmark
+TEST_GEN_FILES += hugepage-mmap
+TEST_GEN_FILES += hugepage-shm
+TEST_GEN_FILES += map_hugetlb
+TEST_GEN_FILES += map_populate
+TEST_GEN_FILES += mlock-random-test
+TEST_GEN_FILES += mlock2-tests
+TEST_GEN_FILES += on-fault-limit
+TEST_GEN_FILES += thuge-gen
+TEST_GEN_FILES += transhuge-stress
+TEST_GEN_FILES += userfaultfd
+TEST_GEN_FILES += va_128TBswitch
+TEST_GEN_FILES += virtual_address_range
+
+TEST_PROGS := run_vmtests
+
+TEST_FILES := test_vmalloc.sh
+
+KSFT_KHDR_INSTALL := 1
+include ../lib.mk
+
+$(OUTPUT)/userfaultfd: LDLIBS += -lpthread
+
+$(OUTPUT)/mlock-random-test: LDLIBS += -lcap
diff --git a/tools/testing/selftests/vm/compaction_test.c b/tools/testing/selftests/vm/compaction_test.c
new file mode 100644
index 000000000..bcec71250
--- /dev/null
+++ b/tools/testing/selftests/vm/compaction_test.c
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * A test for the patch "Allow compaction of unevictable pages".
+ * With this patch we should be able to allocate at least 1/4
+ * of RAM in huge pages. Without the patch much less is
+ * allocated.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "../kselftest.h"
+
+#define MAP_SIZE 1048576
+
+struct map_list {
+	void *map;
+	struct map_list *next;
+};
+
+int read_memory_info(unsigned long *memfree, unsigned long *hugepagesize)
+{
+	char  buffer[256] = {0};
+	char *cmd = "cat /proc/meminfo | grep -i memfree | grep -o '[0-9]*'";
+	FILE *cmdfile = popen(cmd, "r");
+
+	if (!(fgets(buffer, sizeof(buffer), cmdfile))) {
+		perror("Failed to read meminfo\n");
+		return -1;
+	}
+
+	pclose(cmdfile);
+
+	*memfree = atoll(buffer);
+	cmd = "cat /proc/meminfo | grep -i hugepagesize | grep -o '[0-9]*'";
+	cmdfile = popen(cmd, "r");
+
+	if (!(fgets(buffer, sizeof(buffer), cmdfile))) {
+		perror("Failed to read meminfo\n");
+		return -1;
+	}
+
+	pclose(cmdfile);
+	*hugepagesize = atoll(buffer);
+
+	return 0;
+}
+
+int prereq(void)
+{
+	char allowed;
+	int fd;
+
+	fd = open("/proc/sys/vm/compact_unevictable_allowed",
+		  O_RDONLY | O_NONBLOCK);
+	if (fd < 0) {
+		perror("Failed to open\n"
+		       "/proc/sys/vm/compact_unevictable_allowed\n");
+		return -1;
+	}
+
+	if (read(fd, &allowed, sizeof(char)) != sizeof(char)) {
+		perror("Failed to read from\n"
+		       "/proc/sys/vm/compact_unevictable_allowed\n");
+		close(fd);
+		return -1;
+	}
+
+	close(fd);
+	if (allowed == '1')
+		return 0;
+
+	return -1;
+}
+
+int check_compaction(unsigned long mem_free, unsigned int hugepage_size)
+{
+	int fd;
+	int compaction_index = 0;
+	char initial_nr_hugepages[10] = {0};
+	char nr_hugepages[10] = {0};
+
+	/* We want to test with 80% of available memory. Else, OOM killer comes
+	   in to play */
+	mem_free = mem_free * 0.8;
+
+	fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK);
+	if (fd < 0) {
+		perror("Failed to open /proc/sys/vm/nr_hugepages");
+		return -1;
+	}
+
+	if (read(fd, initial_nr_hugepages, sizeof(initial_nr_hugepages)) <= 0) {
+		perror("Failed to read from /proc/sys/vm/nr_hugepages");
+		goto close_fd;
+	}
+
+	/* Start with the initial condition of 0 huge pages*/
+	if (write(fd, "0", sizeof(char)) != sizeof(char)) {
+		perror("Failed to write 0 to /proc/sys/vm/nr_hugepages\n");
+		goto close_fd;
+	}
+
+	lseek(fd, 0, SEEK_SET);
+
+	/* Request a large number of huge pages. The Kernel will allocate
+	   as much as it can */
+	if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) {
+		perror("Failed to write 100000 to /proc/sys/vm/nr_hugepages\n");
+		goto close_fd;
+	}
+
+	lseek(fd, 0, SEEK_SET);
+
+	if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) {
+		perror("Failed to re-read from /proc/sys/vm/nr_hugepages\n");
+		goto close_fd;
+	}
+
+	/* We should have been able to request at least 1/3 rd of the memory in
+	   huge pages */
+	compaction_index = mem_free/(atoi(nr_hugepages) * hugepage_size);
+
+	if (compaction_index > 3) {
+		printf("No of huge pages allocated = %d\n",
+		       (atoi(nr_hugepages)));
+		fprintf(stderr, "ERROR: Less that 1/%d of memory is available\n"
+			"as huge pages\n", compaction_index);
+		goto close_fd;
+	}
+
+	printf("No of huge pages allocated = %d\n",
+	       (atoi(nr_hugepages)));
+
+	lseek(fd, 0, SEEK_SET);
+
+	if (write(fd, initial_nr_hugepages, strlen(initial_nr_hugepages))
+	    != strlen(initial_nr_hugepages)) {
+		perror("Failed to write value to /proc/sys/vm/nr_hugepages\n");
+		goto close_fd;
+	}
+
+	close(fd);
+	return 0;
+
+ close_fd:
+	close(fd);
+	printf("Not OK. Compaction test failed.");
+	return -1;
+}
+
+
+int main(int argc, char **argv)
+{
+	struct rlimit lim;
+	struct map_list *list, *entry;
+	size_t page_size, i;
+	void *map = NULL;
+	unsigned long mem_free = 0;
+	unsigned long hugepage_size = 0;
+	unsigned long mem_fragmentable = 0;
+
+	if (prereq() != 0) {
+		printf("Either the sysctl compact_unevictable_allowed is not\n"
+		       "set to 1 or couldn't read the proc file.\n"
+		       "Skipping the test\n");
+		return KSFT_SKIP;
+	}
+
+	lim.rlim_cur = RLIM_INFINITY;
+	lim.rlim_max = RLIM_INFINITY;
+	if (setrlimit(RLIMIT_MEMLOCK, &lim)) {
+		perror("Failed to set rlimit:\n");
+		return -1;
+	}
+
+	page_size = getpagesize();
+
+	list = NULL;
+
+	if (read_memory_info(&mem_free, &hugepage_size) != 0) {
+		printf("ERROR: Cannot read meminfo\n");
+		return -1;
+	}
+
+	mem_fragmentable = mem_free * 0.8 / 1024;
+
+	while (mem_fragmentable > 0) {
+		map = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
+			   MAP_ANONYMOUS | MAP_PRIVATE | MAP_LOCKED, -1, 0);
+		if (map == MAP_FAILED)
+			break;
+
+		entry = malloc(sizeof(struct map_list));
+		if (!entry) {
+			munmap(map, MAP_SIZE);
+			break;
+		}
+		entry->map = map;
+		entry->next = list;
+		list = entry;
+
+		/* Write something (in this case the address of the map) to
+		 * ensure that KSM can't merge the mapped pages
+		 */
+		for (i = 0; i < MAP_SIZE; i += page_size)
+			*(unsigned long *)(map + i) = (unsigned long)map + i;
+
+		mem_fragmentable--;
+	}
+
+	for (entry = list; entry != NULL; entry = entry->next) {
+		munmap(entry->map, MAP_SIZE);
+		if (!entry->next)
+			break;
+		entry = entry->next;
+	}
+
+	if (check_compaction(mem_free, hugepage_size) == 0)
+		return 0;
+
+	return -1;
+}
diff --git a/tools/testing/selftests/vm/config b/tools/testing/selftests/vm/config
new file mode 100644
index 000000000..1c0d76cb5
--- /dev/null
+++ b/tools/testing/selftests/vm/config
@@ -0,0 +1,2 @@
+CONFIG_SYSVIPC=y
+CONFIG_USERFAULTFD=y
diff --git a/tools/testing/selftests/vm/gup_benchmark.c b/tools/testing/selftests/vm/gup_benchmark.c
new file mode 100644
index 000000000..17da711f2
--- /dev/null
+++ b/tools/testing/selftests/vm/gup_benchmark.c
@@ -0,0 +1,93 @@
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/prctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <linux/types.h>
+
+#define MB (1UL << 20)
+#define PAGE_SIZE sysconf(_SC_PAGESIZE)
+
+#define GUP_FAST_BENCHMARK	_IOWR('g', 1, struct gup_benchmark)
+
+struct gup_benchmark {
+	__u64 delta_usec;
+	__u64 addr;
+	__u64 size;
+	__u32 nr_pages_per_call;
+	__u32 flags;
+	__u64 expansion[10];	/* For future use */
+};
+
+int main(int argc, char **argv)
+{
+	struct gup_benchmark gup;
+	unsigned long size = 128 * MB;
+	int i, fd, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0;
+	char *p;
+
+	while ((opt = getopt(argc, argv, "m:r:n:tT")) != -1) {
+		switch (opt) {
+		case 'm':
+			size = atoi(optarg) * MB;
+			break;
+		case 'r':
+			repeats = atoi(optarg);
+			break;
+		case 'n':
+			nr_pages = atoi(optarg);
+			break;
+		case 't':
+			thp = 1;
+			break;
+		case 'T':
+			thp = 0;
+			break;
+		case 'w':
+			write = 1;
+			break;
+		default:
+			return -1;
+		}
+	}
+
+	gup.nr_pages_per_call = nr_pages;
+	gup.flags = write;
+
+	fd = open("/sys/kernel/debug/gup_benchmark", O_RDWR);
+	if (fd == -1)
+		perror("open"), exit(1);
+
+	p = mmap(NULL, size, PROT_READ | PROT_WRITE,
+			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	if (p == MAP_FAILED)
+		perror("mmap"), exit(1);
+	gup.addr = (unsigned long)p;
+
+	if (thp == 1)
+		madvise(p, size, MADV_HUGEPAGE);
+	else if (thp == 0)
+		madvise(p, size, MADV_NOHUGEPAGE);
+
+	for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE)
+		p[0] = 0;
+
+	for (i = 0; i < repeats; i++) {
+		gup.size = size;
+		if (ioctl(fd, GUP_FAST_BENCHMARK, &gup))
+			perror("ioctl"), exit(1);
+
+		printf("Time: %lld us", gup.delta_usec);
+		if (gup.size != size)
+			printf(", truncated (size: %lld)", gup.size);
+		printf("\n");
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/vm/hugepage-mmap.c b/tools/testing/selftests/vm/hugepage-mmap.c
new file mode 100644
index 000000000..93f9e7b81
--- /dev/null
+++ b/tools/testing/selftests/vm/hugepage-mmap.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugepage-mmap:
+ *
+ * Example of using huge page memory in a user application using the mmap
+ * system call.  Before running this application, make sure that the
+ * administrator has mounted the hugetlbfs filesystem (on some directory
+ * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this
+ * example, the app is requesting memory of size 256MB that is backed by
+ * huge pages.
+ *
+ * For the ia64 architecture, the Linux kernel reserves Region number 4 for
+ * huge pages.  That means that if one requires a fixed address, a huge page
+ * aligned address starting with 0x800000... will be required.  If a fixed
+ * address is not required, the kernel will select an address in the proper
+ * range.
+ * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#define FILE_NAME "huge/hugepagefile"
+#define LENGTH (256UL*1024*1024)
+#define PROTECTION (PROT_READ | PROT_WRITE)
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define FLAGS (MAP_SHARED | MAP_FIXED)
+#else
+#define ADDR (void *)(0x0UL)
+#define FLAGS (MAP_SHARED)
+#endif
+
+static void check_bytes(char *addr)
+{
+	printf("First hex is %x\n", *((unsigned int *)addr));
+}
+
+static void write_bytes(char *addr)
+{
+	unsigned long i;
+
+	for (i = 0; i < LENGTH; i++)
+		*(addr + i) = (char)i;
+}
+
+static int read_bytes(char *addr)
+{
+	unsigned long i;
+
+	check_bytes(addr);
+	for (i = 0; i < LENGTH; i++)
+		if (*(addr + i) != (char)i) {
+			printf("Mismatch at %lu\n", i);
+			return 1;
+		}
+	return 0;
+}
+
+int main(void)
+{
+	void *addr;
+	int fd, ret;
+
+	fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755);
+	if (fd < 0) {
+		perror("Open failed");
+		exit(1);
+	}
+
+	addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0);
+	if (addr == MAP_FAILED) {
+		perror("mmap");
+		unlink(FILE_NAME);
+		exit(1);
+	}
+
+	printf("Returned address is %p\n", addr);
+	check_bytes(addr);
+	write_bytes(addr);
+	ret = read_bytes(addr);
+
+	munmap(addr, LENGTH);
+	close(fd);
+	unlink(FILE_NAME);
+
+	return ret;
+}
diff --git a/tools/testing/selftests/vm/hugepage-shm.c b/tools/testing/selftests/vm/hugepage-shm.c
new file mode 100644
index 000000000..e2527f320
--- /dev/null
+++ b/tools/testing/selftests/vm/hugepage-shm.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugepage-shm:
+ *
+ * Example of using huge page memory in a user application using Sys V shared
+ * memory system calls.  In this example the app is requesting 256MB of
+ * memory that is backed by huge pages.  The application uses the flag
+ * SHM_HUGETLB in the shmget system call to inform the kernel that it is
+ * requesting huge pages.
+ *
+ * For the ia64 architecture, the Linux kernel reserves Region number 4 for
+ * huge pages.  That means that if one requires a fixed address, a huge page
+ * aligned address starting with 0x800000... will be required.  If a fixed
+ * address is not required, the kernel will select an address in the proper
+ * range.
+ * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
+ *
+ * Note: The default shared memory limit is quite low on many kernels,
+ * you may need to increase it via:
+ *
+ * echo 268435456 > /proc/sys/kernel/shmmax
+ *
+ * This will increase the maximum size per shared memory segment to 256MB.
+ * The other limit that you will hit eventually is shmall which is the
+ * total amount of shared memory in pages. To set it to 16GB on a system
+ * with a 4kB pagesize do:
+ *
+ * echo 4194304 > /proc/sys/kernel/shmall
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/mman.h>
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+#define LENGTH (256UL*1024*1024)
+
+#define dprintf(x)  printf(x)
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define SHMAT_FLAGS (SHM_RND)
+#else
+#define ADDR (void *)(0x0UL)
+#define SHMAT_FLAGS (0)
+#endif
+
+int main(void)
+{
+	int shmid;
+	unsigned long i;
+	char *shmaddr;
+
+	shmid = shmget(2, LENGTH, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
+	if (shmid < 0) {
+		perror("shmget");
+		exit(1);
+	}
+	printf("shmid: 0x%x\n", shmid);
+
+	shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS);
+	if (shmaddr == (char *)-1) {
+		perror("Shared memory attach failure");
+		shmctl(shmid, IPC_RMID, NULL);
+		exit(2);
+	}
+	printf("shmaddr: %p\n", shmaddr);
+
+	dprintf("Starting the writes:\n");
+	for (i = 0; i < LENGTH; i++) {
+		shmaddr[i] = (char)(i);
+		if (!(i % (1024 * 1024)))
+			dprintf(".");
+	}
+	dprintf("\n");
+
+	dprintf("Starting the Check...");
+	for (i = 0; i < LENGTH; i++)
+		if (shmaddr[i] != (char)i) {
+			printf("\nIndex %lu mismatched\n", i);
+			exit(3);
+		}
+	dprintf("Done.\n");
+
+	if (shmdt((const void *)shmaddr) != 0) {
+		perror("Detach failure");
+		shmctl(shmid, IPC_RMID, NULL);
+		exit(4);
+	}
+
+	shmctl(shmid, IPC_RMID, NULL);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/vm/map_hugetlb.c b/tools/testing/selftests/vm/map_hugetlb.c
new file mode 100644
index 000000000..9b777fa95
--- /dev/null
+++ b/tools/testing/selftests/vm/map_hugetlb.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Example of using hugepage memory in a user application using the mmap
+ * system call with MAP_HUGETLB flag.  Before running this program make
+ * sure the administrator has allocated enough default sized huge pages
+ * to cover the 256 MB allocation.
+ *
+ * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages.
+ * That means the addresses starting with 0x800000... will need to be
+ * specified.  Specifying a fixed address is not required on ppc64, i386
+ * or x86_64.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#define LENGTH (256UL*1024*1024)
+#define PROTECTION (PROT_READ | PROT_WRITE)
+
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB 0x40000 /* arch specific */
+#endif
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED)
+#else
+#define ADDR (void *)(0x0UL)
+#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
+#endif
+
+static void check_bytes(char *addr)
+{
+	printf("First hex is %x\n", *((unsigned int *)addr));
+}
+
+static void write_bytes(char *addr)
+{
+	unsigned long i;
+
+	for (i = 0; i < LENGTH; i++)
+		*(addr + i) = (char)i;
+}
+
+static int read_bytes(char *addr)
+{
+	unsigned long i;
+
+	check_bytes(addr);
+	for (i = 0; i < LENGTH; i++)
+		if (*(addr + i) != (char)i) {
+			printf("Mismatch at %lu\n", i);
+			return 1;
+		}
+	return 0;
+}
+
+int main(void)
+{
+	void *addr;
+	int ret;
+
+	addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, -1, 0);
+	if (addr == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+
+	printf("Returned address is %p\n", addr);
+	check_bytes(addr);
+	write_bytes(addr);
+	ret = read_bytes(addr);
+
+	/* munmap() length of MAP_HUGETLB memory must be hugepage aligned */
+	if (munmap(addr, LENGTH)) {
+		perror("munmap");
+		exit(1);
+	}
+
+	return ret;
+}
diff --git a/tools/testing/selftests/vm/map_populate.c b/tools/testing/selftests/vm/map_populate.c
new file mode 100644
index 000000000..6b8aeaa0b
--- /dev/null
+++ b/tools/testing/selftests/vm/map_populate.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018 Dmitry Safonov, Arista Networks
+ *
+ * MAP_POPULATE | MAP_PRIVATE should COW VMA pages.
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#ifndef MMAP_SZ
+#define MMAP_SZ		4096
+#endif
+
+#define BUG_ON(condition, description)					\
+	do {								\
+		if (condition) {					\
+			fprintf(stderr, "[FAIL]\t%s:%d\t%s:%s\n", __func__, \
+				__LINE__, (description), strerror(errno)); \
+			exit(1);					\
+		}							\
+	} while (0)
+
+static int parent_f(int sock, unsigned long *smap, int child)
+{
+	int status, ret;
+
+	ret = read(sock, &status, sizeof(int));
+	BUG_ON(ret <= 0, "read(sock)");
+
+	*smap = 0x22222BAD;
+	ret = msync(smap, MMAP_SZ, MS_SYNC);
+	BUG_ON(ret, "msync()");
+
+	ret = write(sock, &status, sizeof(int));
+	BUG_ON(ret <= 0, "write(sock)");
+
+	waitpid(child, &status, 0);
+	BUG_ON(!WIFEXITED(status), "child in unexpected state");
+
+	return WEXITSTATUS(status);
+}
+
+static int child_f(int sock, unsigned long *smap, int fd)
+{
+	int ret, buf = 0;
+
+	smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_POPULATE, fd, 0);
+	BUG_ON(smap == MAP_FAILED, "mmap()");
+
+	BUG_ON(*smap != 0xdeadbabe, "MAP_PRIVATE | MAP_POPULATE changed file");
+
+	ret = write(sock, &buf, sizeof(int));
+	BUG_ON(ret <= 0, "write(sock)");
+
+	ret = read(sock, &buf, sizeof(int));
+	BUG_ON(ret <= 0, "read(sock)");
+
+	BUG_ON(*smap == 0x22222BAD, "MAP_POPULATE didn't COW private page");
+	BUG_ON(*smap != 0xdeadbabe, "mapping was corrupted");
+
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	int sock[2], child, ret;
+	FILE *ftmp;
+	unsigned long *smap;
+
+	ftmp = tmpfile();
+	BUG_ON(ftmp == 0, "tmpfile()");
+
+	ret = ftruncate(fileno(ftmp), MMAP_SZ);
+	BUG_ON(ret, "ftruncate()");
+
+	smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE,
+			MAP_SHARED, fileno(ftmp), 0);
+	BUG_ON(smap == MAP_FAILED, "mmap()");
+
+	*smap = 0xdeadbabe;
+	/* Probably unnecessary, but let it be. */
+	ret = msync(smap, MMAP_SZ, MS_SYNC);
+	BUG_ON(ret, "msync()");
+
+	ret = socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sock);
+	BUG_ON(ret, "socketpair()");
+
+	child = fork();
+	BUG_ON(child == -1, "fork()");
+
+	if (child) {
+		ret = close(sock[0]);
+		BUG_ON(ret, "close()");
+
+		return parent_f(sock[1], smap, child);
+	}
+
+	ret = close(sock[1]);
+	BUG_ON(ret, "close()");
+
+	return child_f(sock[0], smap, fileno(ftmp));
+}
diff --git a/tools/testing/selftests/vm/mlock-random-test.c b/tools/testing/selftests/vm/mlock-random-test.c
new file mode 100644
index 000000000..ff4d72eb7
--- /dev/null
+++ b/tools/testing/selftests/vm/mlock-random-test.c
@@ -0,0 +1,294 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * It tests the mlock/mlock2() when they are invoked
+ * on randomly memory region.
+ */
+#include <unistd.h>
+#include <sys/resource.h>
+#include <sys/capability.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <time.h>
+#include "mlock2.h"
+
+#define CHUNK_UNIT (128 * 1024)
+#define MLOCK_RLIMIT_SIZE (CHUNK_UNIT * 2)
+#define MLOCK_WITHIN_LIMIT_SIZE CHUNK_UNIT
+#define MLOCK_OUTOF_LIMIT_SIZE (CHUNK_UNIT * 3)
+
+#define TEST_LOOP 100
+#define PAGE_ALIGN(size, ps) (((size) + ((ps) - 1)) & ~((ps) - 1))
+
+int set_cap_limits(rlim_t max)
+{
+	struct rlimit new;
+	cap_t cap = cap_init();
+
+	new.rlim_cur = max;
+	new.rlim_max = max;
+	if (setrlimit(RLIMIT_MEMLOCK, &new)) {
+		perror("setrlimit() returns error\n");
+		return -1;
+	}
+
+	/* drop capabilities including CAP_IPC_LOCK */
+	if (cap_set_proc(cap)) {
+		perror("cap_set_proc() returns error\n");
+		return -2;
+	}
+
+	return 0;
+}
+
+int get_proc_locked_vm_size(void)
+{
+	FILE *f;
+	int ret = -1;
+	char line[1024] = {0};
+	unsigned long lock_size = 0;
+
+	f = fopen("/proc/self/status", "r");
+	if (!f) {
+		perror("fopen");
+		return -1;
+	}
+
+	while (fgets(line, 1024, f)) {
+		if (strstr(line, "VmLck")) {
+			ret = sscanf(line, "VmLck:\t%8lu kB", &lock_size);
+			if (ret <= 0) {
+				printf("sscanf() on VmLck error: %s: %d\n",
+						line, ret);
+				fclose(f);
+				return -1;
+			}
+			fclose(f);
+			return (int)(lock_size << 10);
+		}
+	}
+
+	perror("cann't parse VmLck in /proc/self/status\n");
+	fclose(f);
+	return -1;
+}
+
+/*
+ * Get the MMUPageSize of the memory region including input
+ * address from proc file.
+ *
+ * return value: on error case, 0 will be returned.
+ * Otherwise the page size(in bytes) is returned.
+ */
+int get_proc_page_size(unsigned long addr)
+{
+	FILE *smaps;
+	char *line;
+	unsigned long mmupage_size = 0;
+	size_t size;
+
+	smaps = seek_to_smaps_entry(addr);
+	if (!smaps) {
+		printf("Unable to parse /proc/self/smaps\n");
+		return 0;
+	}
+
+	while (getline(&line, &size, smaps) > 0) {
+		if (!strstr(line, "MMUPageSize")) {
+			free(line);
+			line = NULL;
+			size = 0;
+			continue;
+		}
+
+		/* found the MMUPageSize of this section */
+		if (sscanf(line, "MMUPageSize:    %8lu kB",
+					&mmupage_size) < 1) {
+			printf("Unable to parse smaps entry for Size:%s\n",
+					line);
+			break;
+		}
+
+	}
+	free(line);
+	if (smaps)
+		fclose(smaps);
+	return mmupage_size << 10;
+}
+
+/*
+ * Test mlock/mlock2() on provided memory chunk.
+ * It expects the mlock/mlock2() to be successful (within rlimit)
+ *
+ * With allocated memory chunk [p, p + alloc_size), this
+ * test will choose start/len randomly to perform mlock/mlock2
+ * [start, start +  len] memory range. The range is within range
+ * of the allocated chunk.
+ *
+ * The memory region size alloc_size is within the rlimit.
+ * So we always expect a success of mlock/mlock2.
+ *
+ * VmLck is assumed to be 0 before this test.
+ *
+ *    return value: 0 - success
+ *    else: failure
+ */
+int test_mlock_within_limit(char *p, int alloc_size)
+{
+	int i;
+	int ret = 0;
+	int locked_vm_size = 0;
+	struct rlimit cur;
+	int page_size = 0;
+
+	getrlimit(RLIMIT_MEMLOCK, &cur);
+	if (cur.rlim_cur < alloc_size) {
+		printf("alloc_size[%d] < %u rlimit,lead to mlock failure\n",
+				alloc_size, (unsigned int)cur.rlim_cur);
+		return -1;
+	}
+
+	srand(time(NULL));
+	for (i = 0; i < TEST_LOOP; i++) {
+		/*
+		 * - choose mlock/mlock2 randomly
+		 * - choose lock_size randomly but lock_size < alloc_size
+		 * - choose start_offset randomly but p+start_offset+lock_size
+		 *   < p+alloc_size
+		 */
+		int is_mlock = !!(rand() % 2);
+		int lock_size = rand() % alloc_size;
+		int start_offset = rand() % (alloc_size - lock_size);
+
+		if (is_mlock)
+			ret = mlock(p + start_offset, lock_size);
+		else
+			ret = mlock2_(p + start_offset, lock_size,
+				       MLOCK_ONFAULT);
+
+		if (ret) {
+			printf("%s() failure at |%p(%d)| mlock:|%p(%d)|\n",
+					is_mlock ? "mlock" : "mlock2",
+					p, alloc_size,
+					p + start_offset, lock_size);
+			return ret;
+		}
+	}
+
+	/*
+	 * Check VmLck left by the tests.
+	 */
+	locked_vm_size = get_proc_locked_vm_size();
+	page_size = get_proc_page_size((unsigned long)p);
+	if (page_size == 0) {
+		printf("cannot get proc MMUPageSize\n");
+		return -1;
+	}
+
+	if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size) {
+		printf("test_mlock_within_limit() left VmLck:%d on %d chunk\n",
+				locked_vm_size, alloc_size);
+		return -1;
+	}
+
+	return 0;
+}
+
+
+/*
+ * We expect the mlock/mlock2() to be fail (outof limitation)
+ *
+ * With allocated memory chunk [p, p + alloc_size), this
+ * test will randomly choose start/len and perform mlock/mlock2
+ * on [start, start+len] range.
+ *
+ * The memory region size alloc_size is above the rlimit.
+ * And the len to be locked is higher than rlimit.
+ * So we always expect a failure of mlock/mlock2.
+ * No locked page number should be increased as a side effect.
+ *
+ *    return value: 0 - success
+ *    else: failure
+ */
+int test_mlock_outof_limit(char *p, int alloc_size)
+{
+	int i;
+	int ret = 0;
+	int locked_vm_size = 0, old_locked_vm_size = 0;
+	struct rlimit cur;
+
+	getrlimit(RLIMIT_MEMLOCK, &cur);
+	if (cur.rlim_cur >= alloc_size) {
+		printf("alloc_size[%d] >%u rlimit, violates test condition\n",
+				alloc_size, (unsigned int)cur.rlim_cur);
+		return -1;
+	}
+
+	old_locked_vm_size = get_proc_locked_vm_size();
+	srand(time(NULL));
+	for (i = 0; i < TEST_LOOP; i++) {
+		int is_mlock = !!(rand() % 2);
+		int lock_size = (rand() % (alloc_size - cur.rlim_cur))
+			+ cur.rlim_cur;
+		int start_offset = rand() % (alloc_size - lock_size);
+
+		if (is_mlock)
+			ret = mlock(p + start_offset, lock_size);
+		else
+			ret = mlock2_(p + start_offset, lock_size,
+					MLOCK_ONFAULT);
+		if (ret == 0) {
+			printf("%s() succeeds? on %p(%d) mlock%p(%d)\n",
+					is_mlock ? "mlock" : "mlock2",
+					p, alloc_size,
+					p + start_offset, lock_size);
+			return -1;
+		}
+	}
+
+	locked_vm_size = get_proc_locked_vm_size();
+	if (locked_vm_size != old_locked_vm_size) {
+		printf("tests leads to new mlocked page: old[%d], new[%d]\n",
+				old_locked_vm_size,
+				locked_vm_size);
+		return -1;
+	}
+
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	char *p = NULL;
+	int ret = 0;
+
+	if (set_cap_limits(MLOCK_RLIMIT_SIZE))
+		return -1;
+
+	p = malloc(MLOCK_WITHIN_LIMIT_SIZE);
+	if (p == NULL) {
+		perror("malloc() failure\n");
+		return -1;
+	}
+	ret = test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE);
+	if (ret)
+		return ret;
+	munlock(p, MLOCK_WITHIN_LIMIT_SIZE);
+	free(p);
+
+
+	p = malloc(MLOCK_OUTOF_LIMIT_SIZE);
+	if (p == NULL) {
+		perror("malloc() failure\n");
+		return -1;
+	}
+	ret = test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE);
+	if (ret)
+		return ret;
+	munlock(p, MLOCK_OUTOF_LIMIT_SIZE);
+	free(p);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/vm/mlock2-tests.c b/tools/testing/selftests/vm/mlock2-tests.c
new file mode 100644
index 000000000..11b2301f3
--- /dev/null
+++ b/tools/testing/selftests/vm/mlock2-tests.c
@@ -0,0 +1,520 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sys/mman.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <stdbool.h>
+#include "mlock2.h"
+
+#include "../kselftest.h"
+
+struct vm_boundaries {
+	unsigned long start;
+	unsigned long end;
+};
+
+static int get_vm_area(unsigned long addr, struct vm_boundaries *area)
+{
+	FILE *file;
+	int ret = 1;
+	char line[1024] = {0};
+	char *end_addr;
+	char *stop;
+	unsigned long start;
+	unsigned long end;
+
+	if (!area)
+		return ret;
+
+	file = fopen("/proc/self/maps", "r");
+	if (!file) {
+		perror("fopen");
+		return ret;
+	}
+
+	memset(area, 0, sizeof(struct vm_boundaries));
+
+	while(fgets(line, 1024, file)) {
+		end_addr = strchr(line, '-');
+		if (!end_addr) {
+			printf("cannot parse /proc/self/maps\n");
+			goto out;
+		}
+		*end_addr = '\0';
+		end_addr++;
+		stop = strchr(end_addr, ' ');
+		if (!stop) {
+			printf("cannot parse /proc/self/maps\n");
+			goto out;
+		}
+		stop = '\0';
+
+		sscanf(line, "%lx", &start);
+		sscanf(end_addr, "%lx", &end);
+
+		if (start <= addr && end > addr) {
+			area->start = start;
+			area->end = end;
+			ret = 0;
+			goto out;
+		}
+	}
+out:
+	fclose(file);
+	return ret;
+}
+
+#define VMFLAGS "VmFlags:"
+
+static bool is_vmflag_set(unsigned long addr, const char *vmflag)
+{
+	char *line = NULL;
+	char *flags;
+	size_t size = 0;
+	bool ret = false;
+	FILE *smaps;
+
+	smaps = seek_to_smaps_entry(addr);
+	if (!smaps) {
+		printf("Unable to parse /proc/self/smaps\n");
+		goto out;
+	}
+
+	while (getline(&line, &size, smaps) > 0) {
+		if (!strstr(line, VMFLAGS)) {
+			free(line);
+			line = NULL;
+			size = 0;
+			continue;
+		}
+
+		flags = line + strlen(VMFLAGS);
+		ret = (strstr(flags, vmflag) != NULL);
+		goto out;
+	}
+
+out:
+	free(line);
+	fclose(smaps);
+	return ret;
+}
+
+#define SIZE "Size:"
+#define RSS  "Rss:"
+#define LOCKED "lo"
+
+static unsigned long get_value_for_name(unsigned long addr, const char *name)
+{
+	char *line = NULL;
+	size_t size = 0;
+	char *value_ptr;
+	FILE *smaps = NULL;
+	unsigned long value = -1UL;
+
+	smaps = seek_to_smaps_entry(addr);
+	if (!smaps) {
+		printf("Unable to parse /proc/self/smaps\n");
+		goto out;
+	}
+
+	while (getline(&line, &size, smaps) > 0) {
+		if (!strstr(line, name)) {
+			free(line);
+			line = NULL;
+			size = 0;
+			continue;
+		}
+
+		value_ptr = line + strlen(name);
+		if (sscanf(value_ptr, "%lu kB", &value) < 1) {
+			printf("Unable to parse smaps entry for Size\n");
+			goto out;
+		}
+		break;
+	}
+
+out:
+	if (smaps)
+		fclose(smaps);
+	free(line);
+	return value;
+}
+
+static bool is_vma_lock_on_fault(unsigned long addr)
+{
+	bool locked;
+	unsigned long vma_size, vma_rss;
+
+	locked = is_vmflag_set(addr, LOCKED);
+	if (!locked)
+		return false;
+
+	vma_size = get_value_for_name(addr, SIZE);
+	vma_rss = get_value_for_name(addr, RSS);
+
+	/* only one page is faulted in */
+	return (vma_rss < vma_size);
+}
+
+#define PRESENT_BIT     0x8000000000000000ULL
+#define PFN_MASK        0x007FFFFFFFFFFFFFULL
+#define UNEVICTABLE_BIT (1UL << 18)
+
+static int lock_check(unsigned long addr)
+{
+	bool locked;
+	unsigned long vma_size, vma_rss;
+
+	locked = is_vmflag_set(addr, LOCKED);
+	if (!locked)
+		return false;
+
+	vma_size = get_value_for_name(addr, SIZE);
+	vma_rss = get_value_for_name(addr, RSS);
+
+	return (vma_rss == vma_size);
+}
+
+static int unlock_lock_check(char *map)
+{
+	if (is_vmflag_set((unsigned long)map, LOCKED)) {
+		printf("VMA flag %s is present on page 1 after unlock\n", LOCKED);
+		return 1;
+	}
+
+	return 0;
+}
+
+static int test_mlock_lock()
+{
+	char *map;
+	int ret = 1;
+	unsigned long page_size = getpagesize();
+
+	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	if (map == MAP_FAILED) {
+		perror("test_mlock_locked mmap");
+		goto out;
+	}
+
+	if (mlock2_(map, 2 * page_size, 0)) {
+		if (errno == ENOSYS) {
+			printf("Cannot call new mlock family, skipping test\n");
+			_exit(KSFT_SKIP);
+		}
+		perror("mlock2(0)");
+		goto unmap;
+	}
+
+	if (!lock_check((unsigned long)map))
+		goto unmap;
+
+	/* Now unlock and recheck attributes */
+	if (munlock(map, 2 * page_size)) {
+		perror("munlock()");
+		goto unmap;
+	}
+
+	ret = unlock_lock_check(map);
+
+unmap:
+	munmap(map, 2 * page_size);
+out:
+	return ret;
+}
+
+static int onfault_check(char *map)
+{
+	*map = 'a';
+	if (!is_vma_lock_on_fault((unsigned long)map)) {
+		printf("VMA is not marked for lock on fault\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+static int unlock_onfault_check(char *map)
+{
+	unsigned long page_size = getpagesize();
+
+	if (is_vma_lock_on_fault((unsigned long)map) ||
+	    is_vma_lock_on_fault((unsigned long)map + page_size)) {
+		printf("VMA is still lock on fault after unlock\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+static int test_mlock_onfault()
+{
+	char *map;
+	int ret = 1;
+	unsigned long page_size = getpagesize();
+
+	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	if (map == MAP_FAILED) {
+		perror("test_mlock_locked mmap");
+		goto out;
+	}
+
+	if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
+		if (errno == ENOSYS) {
+			printf("Cannot call new mlock family, skipping test\n");
+			_exit(KSFT_SKIP);
+		}
+		perror("mlock2(MLOCK_ONFAULT)");
+		goto unmap;
+	}
+
+	if (onfault_check(map))
+		goto unmap;
+
+	/* Now unlock and recheck attributes */
+	if (munlock(map, 2 * page_size)) {
+		if (errno == ENOSYS) {
+			printf("Cannot call new mlock family, skipping test\n");
+			_exit(KSFT_SKIP);
+		}
+		perror("munlock()");
+		goto unmap;
+	}
+
+	ret = unlock_onfault_check(map);
+unmap:
+	munmap(map, 2 * page_size);
+out:
+	return ret;
+}
+
+static int test_lock_onfault_of_present()
+{
+	char *map;
+	int ret = 1;
+	unsigned long page_size = getpagesize();
+
+	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	if (map == MAP_FAILED) {
+		perror("test_mlock_locked mmap");
+		goto out;
+	}
+
+	*map = 'a';
+
+	if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
+		if (errno == ENOSYS) {
+			printf("Cannot call new mlock family, skipping test\n");
+			_exit(KSFT_SKIP);
+		}
+		perror("mlock2(MLOCK_ONFAULT)");
+		goto unmap;
+	}
+
+	if (!is_vma_lock_on_fault((unsigned long)map) ||
+	    !is_vma_lock_on_fault((unsigned long)map + page_size)) {
+		printf("VMA with present pages is not marked lock on fault\n");
+		goto unmap;
+	}
+	ret = 0;
+unmap:
+	munmap(map, 2 * page_size);
+out:
+	return ret;
+}
+
+static int test_munlockall()
+{
+	char *map;
+	int ret = 1;
+	unsigned long page_size = getpagesize();
+
+	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+
+	if (map == MAP_FAILED) {
+		perror("test_munlockall mmap");
+		goto out;
+	}
+
+	if (mlockall(MCL_CURRENT)) {
+		perror("mlockall(MCL_CURRENT)");
+		goto out;
+	}
+
+	if (!lock_check((unsigned long)map))
+		goto unmap;
+
+	if (munlockall()) {
+		perror("munlockall()");
+		goto unmap;
+	}
+
+	if (unlock_lock_check(map))
+		goto unmap;
+
+	munmap(map, 2 * page_size);
+
+	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+
+	if (map == MAP_FAILED) {
+		perror("test_munlockall second mmap");
+		goto out;
+	}
+
+	if (mlockall(MCL_CURRENT | MCL_ONFAULT)) {
+		perror("mlockall(MCL_CURRENT | MCL_ONFAULT)");
+		goto unmap;
+	}
+
+	if (onfault_check(map))
+		goto unmap;
+
+	if (munlockall()) {
+		perror("munlockall()");
+		goto unmap;
+	}
+
+	if (unlock_onfault_check(map))
+		goto unmap;
+
+	if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
+		perror("mlockall(MCL_CURRENT | MCL_FUTURE)");
+		goto out;
+	}
+
+	if (!lock_check((unsigned long)map))
+		goto unmap;
+
+	if (munlockall()) {
+		perror("munlockall()");
+		goto unmap;
+	}
+
+	ret = unlock_lock_check(map);
+
+unmap:
+	munmap(map, 2 * page_size);
+out:
+	munlockall();
+	return ret;
+}
+
+static int test_vma_management(bool call_mlock)
+{
+	int ret = 1;
+	void *map;
+	unsigned long page_size = getpagesize();
+	struct vm_boundaries page1;
+	struct vm_boundaries page2;
+	struct vm_boundaries page3;
+
+	map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	if (map == MAP_FAILED) {
+		perror("mmap()");
+		return ret;
+	}
+
+	if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) {
+		if (errno == ENOSYS) {
+			printf("Cannot call new mlock family, skipping test\n");
+			_exit(KSFT_SKIP);
+		}
+		perror("mlock(ONFAULT)\n");
+		goto out;
+	}
+
+	if (get_vm_area((unsigned long)map, &page1) ||
+	    get_vm_area((unsigned long)map + page_size, &page2) ||
+	    get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+		printf("couldn't find mapping in /proc/self/maps\n");
+		goto out;
+	}
+
+	/*
+	 * Before we unlock a portion, we need to that all three pages are in
+	 * the same VMA.  If they are not we abort this test (Note that this is
+	 * not a failure)
+	 */
+	if (page1.start != page2.start || page2.start != page3.start) {
+		printf("VMAs are not merged to start, aborting test\n");
+		ret = 0;
+		goto out;
+	}
+
+	if (munlock(map + page_size, page_size)) {
+		perror("munlock()");
+		goto out;
+	}
+
+	if (get_vm_area((unsigned long)map, &page1) ||
+	    get_vm_area((unsigned long)map + page_size, &page2) ||
+	    get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+		printf("couldn't find mapping in /proc/self/maps\n");
+		goto out;
+	}
+
+	/* All three VMAs should be different */
+	if (page1.start == page2.start || page2.start == page3.start) {
+		printf("failed to split VMA for munlock\n");
+		goto out;
+	}
+
+	/* Now unlock the first and third page and check the VMAs again */
+	if (munlock(map, page_size * 3)) {
+		perror("munlock()");
+		goto out;
+	}
+
+	if (get_vm_area((unsigned long)map, &page1) ||
+	    get_vm_area((unsigned long)map + page_size, &page2) ||
+	    get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+		printf("couldn't find mapping in /proc/self/maps\n");
+		goto out;
+	}
+
+	/* Now all three VMAs should be the same */
+	if (page1.start != page2.start || page2.start != page3.start) {
+		printf("failed to merge VMAs after munlock\n");
+		goto out;
+	}
+
+	ret = 0;
+out:
+	munmap(map, 3 * page_size);
+	return ret;
+}
+
+static int test_mlockall(int (test_function)(bool call_mlock))
+{
+	int ret = 1;
+
+	if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) {
+		perror("mlockall");
+		return ret;
+	}
+
+	ret = test_function(false);
+	munlockall();
+	return ret;
+}
+
+int main(int argc, char **argv)
+{
+	int ret = 0;
+	ret += test_mlock_lock();
+	ret += test_mlock_onfault();
+	ret += test_munlockall();
+	ret += test_lock_onfault_of_present();
+	ret += test_vma_management(true);
+	ret += test_mlockall(test_vma_management);
+	return ret;
+}
diff --git a/tools/testing/selftests/vm/mlock2.h b/tools/testing/selftests/vm/mlock2.h
new file mode 100644
index 000000000..2a6e76c22
--- /dev/null
+++ b/tools/testing/selftests/vm/mlock2.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <syscall.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifndef MLOCK_ONFAULT
+#define MLOCK_ONFAULT 1
+#endif
+
+#ifndef MCL_ONFAULT
+#define MCL_ONFAULT (MCL_FUTURE << 1)
+#endif
+
+static int mlock2_(void *start, size_t len, int flags)
+{
+#ifdef __NR_mlock2
+	return syscall(__NR_mlock2, start, len, flags);
+#else
+	errno = ENOSYS;
+	return -1;
+#endif
+}
+
+static FILE *seek_to_smaps_entry(unsigned long addr)
+{
+	FILE *file;
+	char *line = NULL;
+	size_t size = 0;
+	unsigned long start, end;
+	char perms[5];
+	unsigned long offset;
+	char dev[32];
+	unsigned long inode;
+	char path[BUFSIZ];
+
+	file = fopen("/proc/self/smaps", "r");
+	if (!file) {
+		perror("fopen smaps");
+		_exit(1);
+	}
+
+	while (getline(&line, &size, file) > 0) {
+		if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n",
+			   &start, &end, perms, &offset, dev, &inode, path) < 6)
+			goto next;
+
+		if (start <= addr && addr < end)
+			goto out;
+
+next:
+		free(line);
+		line = NULL;
+		size = 0;
+	}
+
+	fclose(file);
+	file = NULL;
+
+out:
+	free(line);
+	return file;
+}
diff --git a/tools/testing/selftests/vm/on-fault-limit.c b/tools/testing/selftests/vm/on-fault-limit.c
new file mode 100644
index 000000000..634d87dfb
--- /dev/null
+++ b/tools/testing/selftests/vm/on-fault-limit.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <sys/mman.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#ifndef MCL_ONFAULT
+#define MCL_ONFAULT (MCL_FUTURE << 1)
+#endif
+
+static int test_limit(void)
+{
+	int ret = 1;
+	struct rlimit lims;
+	void *map;
+
+	if (getrlimit(RLIMIT_MEMLOCK, &lims)) {
+		perror("getrlimit");
+		return ret;
+	}
+
+	if (mlockall(MCL_ONFAULT | MCL_FUTURE)) {
+		perror("mlockall");
+		return ret;
+	}
+
+	map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
+	if (map != MAP_FAILED)
+		printf("mmap should have failed, but didn't\n");
+	else {
+		ret = 0;
+		munmap(map, 2 * lims.rlim_max);
+	}
+
+	munlockall();
+	return ret;
+}
+
+int main(int argc, char **argv)
+{
+	int ret = 0;
+
+	ret += test_limit();
+	return ret;
+}
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests
new file mode 100755
index 000000000..584a91ae4
--- /dev/null
+++ b/tools/testing/selftests/vm/run_vmtests
@@ -0,0 +1,214 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#please run as root
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+mnt=./huge
+exitcode=0
+
+#get huge pagesize and freepages from /proc/meminfo
+while read name size unit; do
+	if [ "$name" = "HugePages_Free:" ]; then
+		freepgs=$size
+	fi
+	if [ "$name" = "Hugepagesize:" ]; then
+		hpgsize_KB=$size
+	fi
+done < /proc/meminfo
+
+# Simple hugetlbfs tests have a hardcoded minimum requirement of
+# huge pages totaling 256MB (262144KB) in size.  The userfaultfd
+# hugetlb test requires a minimum of 2 * nr_cpus huge pages.  Take
+# both of these requirements into account and attempt to increase
+# number of huge pages available.
+nr_cpus=$(nproc)
+hpgsize_MB=$((hpgsize_KB / 1024))
+half_ufd_size_MB=$((((nr_cpus * hpgsize_MB + 127) / 128) * 128))
+needmem_KB=$((half_ufd_size_MB * 2 * 1024))
+
+#set proper nr_hugepages
+if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then
+	nr_hugepgs=`cat /proc/sys/vm/nr_hugepages`
+	needpgs=$((needmem_KB / hpgsize_KB))
+	tries=2
+	while [ $tries -gt 0 ] && [ $freepgs -lt $needpgs ]; do
+		lackpgs=$(( $needpgs - $freepgs ))
+		echo 3 > /proc/sys/vm/drop_caches
+		echo $(( $lackpgs + $nr_hugepgs )) > /proc/sys/vm/nr_hugepages
+		if [ $? -ne 0 ]; then
+			echo "Please run this test as root"
+			exit $ksft_skip
+		fi
+		while read name size unit; do
+			if [ "$name" = "HugePages_Free:" ]; then
+				freepgs=$size
+			fi
+		done < /proc/meminfo
+		tries=$((tries - 1))
+	done
+	if [ $freepgs -lt $needpgs ]; then
+		printf "Not enough huge pages available (%d < %d)\n" \
+		       $freepgs $needpgs
+		exit 1
+	fi
+else
+	echo "no hugetlbfs support in kernel?"
+	exit 1
+fi
+
+mkdir $mnt
+mount -t hugetlbfs none $mnt
+
+echo "---------------------"
+echo "running hugepage-mmap"
+echo "---------------------"
+./hugepage-mmap
+if [ $? -ne 0 ]; then
+	echo "[FAIL]"
+	exitcode=1
+else
+	echo "[PASS]"
+fi
+
+shmmax=`cat /proc/sys/kernel/shmmax`
+shmall=`cat /proc/sys/kernel/shmall`
+echo 268435456 > /proc/sys/kernel/shmmax
+echo 4194304 > /proc/sys/kernel/shmall
+echo "--------------------"
+echo "running hugepage-shm"
+echo "--------------------"
+./hugepage-shm
+if [ $? -ne 0 ]; then
+	echo "[FAIL]"
+	exitcode=1
+else
+	echo "[PASS]"
+fi
+echo $shmmax > /proc/sys/kernel/shmmax
+echo $shmall > /proc/sys/kernel/shmall
+
+echo "-------------------"
+echo "running map_hugetlb"
+echo "-------------------"
+./map_hugetlb
+if [ $? -ne 0 ]; then
+	echo "[FAIL]"
+	exitcode=1
+else
+	echo "[PASS]"
+fi
+
+echo "NOTE: The above hugetlb tests provide minimal coverage.  Use"
+echo "      https://github.com/libhugetlbfs/libhugetlbfs.git for"
+echo "      hugetlb regression testing."
+
+echo "-------------------"
+echo "running userfaultfd"
+echo "-------------------"
+./userfaultfd anon 128 32
+if [ $? -ne 0 ]; then
+	echo "[FAIL]"
+	exitcode=1
+else
+	echo "[PASS]"
+fi
+
+echo "---------------------------"
+echo "running userfaultfd_hugetlb"
+echo "---------------------------"
+# Test requires source and destination huge pages.  Size of source
+# (half_ufd_size_MB) is passed as argument to test.
+./userfaultfd hugetlb $half_ufd_size_MB 32 $mnt/ufd_test_file
+if [ $? -ne 0 ]; then
+	echo "[FAIL]"
+	exitcode=1
+else
+	echo "[PASS]"
+fi
+rm -f $mnt/ufd_test_file
+
+echo "-------------------------"
+echo "running userfaultfd_shmem"
+echo "-------------------------"
+./userfaultfd shmem 128 32
+if [ $? -ne 0 ]; then
+	echo "[FAIL]"
+	exitcode=1
+else
+	echo "[PASS]"
+fi
+
+#cleanup
+umount $mnt
+rm -rf $mnt
+echo $nr_hugepgs > /proc/sys/vm/nr_hugepages
+
+echo "-----------------------"
+echo "running compaction_test"
+echo "-----------------------"
+./compaction_test
+if [ $? -ne 0 ]; then
+	echo "[FAIL]"
+	exitcode=1
+else
+	echo "[PASS]"
+fi
+
+echo "----------------------"
+echo "running on-fault-limit"
+echo "----------------------"
+sudo -u nobody ./on-fault-limit
+if [ $? -ne 0 ]; then
+	echo "[FAIL]"
+	exitcode=1
+else
+	echo "[PASS]"
+fi
+
+echo "--------------------"
+echo "running map_populate"
+echo "--------------------"
+./map_populate
+if [ $? -ne 0 ]; then
+	echo "[FAIL]"
+	exitcode=1
+else
+	echo "[PASS]"
+fi
+
+echo "--------------------"
+echo "running mlock2-tests"
+echo "--------------------"
+./mlock2-tests
+if [ $? -ne 0 ]; then
+	echo "[FAIL]"
+	exitcode=1
+else
+	echo "[PASS]"
+fi
+
+echo "-----------------------------"
+echo "running virtual_address_range"
+echo "-----------------------------"
+./virtual_address_range
+if [ $? -ne 0 ]; then
+	echo "[FAIL]"
+	exitcode=1
+else
+	echo "[PASS]"
+fi
+
+echo "-----------------------------"
+echo "running virtual address 128TB switch test"
+echo "-----------------------------"
+./va_128TBswitch
+if [ $? -ne 0 ]; then
+    echo "[FAIL]"
+    exitcode=1
+else
+    echo "[PASS]"
+fi
+
+exit $exitcode
diff --git a/tools/testing/selftests/vm/thuge-gen.c b/tools/testing/selftests/vm/thuge-gen.c
new file mode 100644
index 000000000..361ef7192
--- /dev/null
+++ b/tools/testing/selftests/vm/thuge-gen.c
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Test selecting other page sizes for mmap/shmget.
+
+   Before running this huge pages for each huge page size must have been
+   reserved.
+   For large pages beyond MAX_ORDER (like 1GB on x86) boot options must be used.
+   Also shmmax must be increased.
+   And you need to run as root to work around some weird permissions in shm.
+   And nothing using huge pages should run in parallel.
+   When the program aborts you may need to clean up the shm segments with
+   ipcrm -m by hand, like this
+   sudo ipcs | awk '$1 == "0x00000000" {print $2}' | xargs -n1 sudo ipcrm -m
+   (warning this will remove all if someone else uses them) */
+
+#define _GNU_SOURCE 1
+#include <sys/mman.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/stat.h>
+#include <glob.h>
+#include <assert.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <string.h>
+
+#define err(x) perror(x), exit(1)
+
+#define MAP_HUGE_2MB    (21 << MAP_HUGE_SHIFT)
+#define MAP_HUGE_1GB    (30 << MAP_HUGE_SHIFT)
+#define MAP_HUGE_SHIFT  26
+#define MAP_HUGE_MASK   0x3f
+#if !defined(MAP_HUGETLB)
+#define MAP_HUGETLB	0x40000
+#endif
+
+#define SHM_HUGETLB     04000   /* segment will use huge TLB pages */
+#define SHM_HUGE_SHIFT  26
+#define SHM_HUGE_MASK   0x3f
+#define SHM_HUGE_2MB    (21 << SHM_HUGE_SHIFT)
+#define SHM_HUGE_1GB    (30 << SHM_HUGE_SHIFT)
+
+#define NUM_PAGESIZES   5
+
+#define NUM_PAGES 4
+
+#define Dprintf(fmt...) // printf(fmt)
+
+unsigned long page_sizes[NUM_PAGESIZES];
+int num_page_sizes;
+
+int ilog2(unsigned long v)
+{
+	int l = 0;
+	while ((1UL << l) < v)
+		l++;
+	return l;
+}
+
+void find_pagesizes(void)
+{
+	glob_t g;
+	int i;
+	glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g);
+	assert(g.gl_pathc <= NUM_PAGESIZES);
+	for (i = 0; i < g.gl_pathc; i++) {
+		sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB",
+				&page_sizes[i]);
+		page_sizes[i] <<= 10;
+		printf("Found %luMB\n", page_sizes[i] >> 20);
+	}
+	num_page_sizes = g.gl_pathc;
+	globfree(&g);
+}
+
+unsigned long default_huge_page_size(void)
+{
+	unsigned long hps = 0;
+	char *line = NULL;
+	size_t linelen = 0;
+	FILE *f = fopen("/proc/meminfo", "r");
+	if (!f)
+		return 0;
+	while (getline(&line, &linelen, f) > 0) {
+		if (sscanf(line, "Hugepagesize:       %lu kB", &hps) == 1) {
+			hps <<= 10;
+			break;
+		}
+	}
+	free(line);
+	return hps;
+}
+
+void show(unsigned long ps)
+{
+	char buf[100];
+	if (ps == getpagesize())
+		return;
+	printf("%luMB: ", ps >> 20);
+	fflush(stdout);
+	snprintf(buf, sizeof buf,
+		"cat /sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
+		ps >> 10);
+	system(buf);
+}
+
+unsigned long read_sysfs(int warn, char *fmt, ...)
+{
+	char *line = NULL;
+	size_t linelen = 0;
+	char buf[100];
+	FILE *f;
+	va_list ap;
+	unsigned long val = 0;
+
+	va_start(ap, fmt);
+	vsnprintf(buf, sizeof buf, fmt, ap);
+	va_end(ap);
+
+	f = fopen(buf, "r");
+	if (!f) {
+		if (warn)
+			printf("missing %s\n", buf);
+		return 0;
+	}
+	if (getline(&line, &linelen, f) > 0) {
+		sscanf(line, "%lu", &val);
+	}
+	fclose(f);
+	free(line);
+	return val;
+}
+
+unsigned long read_free(unsigned long ps)
+{
+	return read_sysfs(ps != getpagesize(),
+			"/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
+			ps >> 10);
+}
+
+void test_mmap(unsigned long size, unsigned flags)
+{
+	char *map;
+	unsigned long before, after;
+	int err;
+
+	before = read_free(size);
+	map = mmap(NULL, size*NUM_PAGES, PROT_READ|PROT_WRITE,
+			MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB|flags, -1, 0);
+
+	if (map == (char *)-1) err("mmap");
+	memset(map, 0xff, size*NUM_PAGES);
+	after = read_free(size);
+	Dprintf("before %lu after %lu diff %ld size %lu\n",
+		before, after, before - after, size);
+	assert(size == getpagesize() || (before - after) == NUM_PAGES);
+	show(size);
+	err = munmap(map, size);
+	assert(!err);
+}
+
+void test_shmget(unsigned long size, unsigned flags)
+{
+	int id;
+	unsigned long before, after;
+	int err;
+
+	before = read_free(size);
+	id = shmget(IPC_PRIVATE, size * NUM_PAGES, IPC_CREAT|0600|flags);
+	if (id < 0) err("shmget");
+
+	struct shm_info i;
+	if (shmctl(id, SHM_INFO, (void *)&i) < 0) err("shmctl");
+	Dprintf("alloc %lu res %lu\n", i.shm_tot, i.shm_rss);
+
+
+	Dprintf("id %d\n", id);
+	char *map = shmat(id, NULL, 0600);
+	if (map == (char*)-1) err("shmat");
+
+	shmctl(id, IPC_RMID, NULL);
+
+	memset(map, 0xff, size*NUM_PAGES);
+	after = read_free(size);
+
+	Dprintf("before %lu after %lu diff %ld size %lu\n",
+		before, after, before - after, size);
+	assert(size == getpagesize() || (before - after) == NUM_PAGES);
+	show(size);
+	err = shmdt(map);
+	assert(!err);
+}
+
+void sanity_checks(void)
+{
+	int i;
+	unsigned long largest = getpagesize();
+
+	for (i = 0; i < num_page_sizes; i++) {
+		if (page_sizes[i] > largest)
+			largest = page_sizes[i];
+
+		if (read_free(page_sizes[i]) < NUM_PAGES) {
+			printf("Not enough huge pages for page size %lu MB, need %u\n",
+				page_sizes[i] >> 20,
+				NUM_PAGES);
+			exit(0);
+		}
+	}
+
+	if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest) {
+		printf("Please do echo %lu > /proc/sys/kernel/shmmax", largest * NUM_PAGES);
+		exit(0);
+	}
+
+#if defined(__x86_64__)
+	if (largest != 1U<<30) {
+		printf("No GB pages available on x86-64\n"
+		       "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES);
+		exit(0);
+	}
+#endif
+}
+
+int main(void)
+{
+	int i;
+	unsigned default_hps = default_huge_page_size();
+
+	find_pagesizes();
+
+	sanity_checks();
+
+	for (i = 0; i < num_page_sizes; i++) {
+		unsigned long ps = page_sizes[i];
+		int arg = ilog2(ps) << MAP_HUGE_SHIFT;
+		printf("Testing %luMB mmap with shift %x\n", ps >> 20, arg);
+		test_mmap(ps, MAP_HUGETLB | arg);
+	}
+	printf("Testing default huge mmap\n");
+	test_mmap(default_hps, SHM_HUGETLB);
+
+	puts("Testing non-huge shmget");
+	test_shmget(getpagesize(), 0);
+
+	for (i = 0; i < num_page_sizes; i++) {
+		unsigned long ps = page_sizes[i];
+		int arg = ilog2(ps) << SHM_HUGE_SHIFT;
+		printf("Testing %luMB shmget with shift %x\n", ps >> 20, arg);
+		test_shmget(ps, SHM_HUGETLB | arg);
+	}
+	puts("default huge shmget");
+	test_shmget(default_hps, SHM_HUGETLB);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/vm/transhuge-stress.c b/tools/testing/selftests/vm/transhuge-stress.c
new file mode 100644
index 000000000..fd7f1b4a9
--- /dev/null
+++ b/tools/testing/selftests/vm/transhuge-stress.c
@@ -0,0 +1,144 @@
+/*
+ * Stress test for transparent huge pages, memory compaction and migration.
+ *
+ * Authors: Konstantin Khlebnikov <koct9i@gmail.com>
+ *
+ * This is free and unencumbered software released into the public domain.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <err.h>
+#include <time.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/mman.h>
+
+#define PAGE_SHIFT 12
+#define HPAGE_SHIFT 21
+
+#define PAGE_SIZE (1 << PAGE_SHIFT)
+#define HPAGE_SIZE (1 << HPAGE_SHIFT)
+
+#define PAGEMAP_PRESENT(ent)	(((ent) & (1ull << 63)) != 0)
+#define PAGEMAP_PFN(ent)	((ent) & ((1ull << 55) - 1))
+
+int pagemap_fd;
+
+int64_t allocate_transhuge(void *ptr)
+{
+	uint64_t ent[2];
+
+	/* drop pmd */
+	if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE,
+				MAP_FIXED | MAP_ANONYMOUS |
+				MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr)
+		errx(2, "mmap transhuge");
+
+	if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE))
+		err(2, "MADV_HUGEPAGE");
+
+	/* allocate transparent huge page */
+	*(volatile void **)ptr = ptr;
+
+	if (pread(pagemap_fd, ent, sizeof(ent),
+			(uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent))
+		err(2, "read pagemap");
+
+	if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) &&
+	    PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) &&
+	    !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1)))
+		return PAGEMAP_PFN(ent[0]);
+
+	return -1;
+}
+
+int main(int argc, char **argv)
+{
+	size_t ram, len;
+	void *ptr, *p;
+	struct timespec a, b;
+	double s;
+	uint8_t *map;
+	size_t map_len;
+
+	ram = sysconf(_SC_PHYS_PAGES);
+	if (ram > SIZE_MAX / sysconf(_SC_PAGESIZE) / 4)
+		ram = SIZE_MAX / 4;
+	else
+		ram *= sysconf(_SC_PAGESIZE);
+
+	if (argc == 1)
+		len = ram;
+	else if (!strcmp(argv[1], "-h"))
+		errx(1, "usage: %s [size in MiB]", argv[0]);
+	else
+		len = atoll(argv[1]) << 20;
+
+	warnx("allocate %zd transhuge pages, using %zd MiB virtual memory"
+	      " and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20,
+	      len >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1));
+
+	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+	if (pagemap_fd < 0)
+		err(2, "open pagemap");
+
+	len -= len % HPAGE_SIZE;
+	ptr = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE,
+			MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0);
+	if (ptr == MAP_FAILED)
+		err(2, "initial mmap");
+	ptr += HPAGE_SIZE - (uintptr_t)ptr % HPAGE_SIZE;
+
+	if (madvise(ptr, len, MADV_HUGEPAGE))
+		err(2, "MADV_HUGEPAGE");
+
+	map_len = ram >> (HPAGE_SHIFT - 1);
+	map = malloc(map_len);
+	if (!map)
+		errx(2, "map malloc");
+
+	while (1) {
+		int nr_succeed = 0, nr_failed = 0, nr_pages = 0;
+
+		memset(map, 0, map_len);
+
+		clock_gettime(CLOCK_MONOTONIC, &a);
+		for (p = ptr; p < ptr + len; p += HPAGE_SIZE) {
+			int64_t pfn;
+
+			pfn = allocate_transhuge(p);
+
+			if (pfn < 0) {
+				nr_failed++;
+			} else {
+				size_t idx = pfn >> (HPAGE_SHIFT - PAGE_SHIFT);
+
+				nr_succeed++;
+				if (idx >= map_len) {
+					map = realloc(map, idx + 1);
+					if (!map)
+						errx(2, "map realloc");
+					memset(map + map_len, 0, idx + 1 - map_len);
+					map_len = idx + 1;
+				}
+				if (!map[idx])
+					nr_pages++;
+				map[idx] = 1;
+			}
+
+			/* split transhuge page, keep last page */
+			if (madvise(p, HPAGE_SIZE - PAGE_SIZE, MADV_DONTNEED))
+				err(2, "MADV_DONTNEED");
+		}
+		clock_gettime(CLOCK_MONOTONIC, &b);
+		s = b.tv_sec - a.tv_sec + (b.tv_nsec - a.tv_nsec) / 1000000000.;
+
+		warnx("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t"
+		      "%4d succeed, %4d failed, %4d different pages",
+		      s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20),
+		      nr_succeed, nr_failed, nr_pages);
+	}
+}
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
new file mode 100644
index 000000000..b2c7043c0
--- /dev/null
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -0,0 +1,1333 @@
+/*
+ * Stress userfaultfd syscall.
+ *
+ *  Copyright (C) 2015  Red Hat, Inc.
+ *
+ *  This work is licensed under the terms of the GNU GPL, version 2. See
+ *  the COPYING file in the top-level directory.
+ *
+ * This test allocates two virtual areas and bounces the physical
+ * memory across the two virtual areas (from area_src to area_dst)
+ * using userfaultfd.
+ *
+ * There are three threads running per CPU:
+ *
+ * 1) one per-CPU thread takes a per-page pthread_mutex in a random
+ *    page of the area_dst (while the physical page may still be in
+ *    area_src), and increments a per-page counter in the same page,
+ *    and checks its value against a verification region.
+ *
+ * 2) another per-CPU thread handles the userfaults generated by
+ *    thread 1 above. userfaultfd blocking reads or poll() modes are
+ *    exercised interleaved.
+ *
+ * 3) one last per-CPU thread transfers the memory in the background
+ *    at maximum bandwidth (if not already transferred by thread
+ *    2). Each cpu thread takes cares of transferring a portion of the
+ *    area.
+ *
+ * When all threads of type 3 completed the transfer, one bounce is
+ * complete. area_src and area_dst are then swapped. All threads are
+ * respawned and so the bounce is immediately restarted in the
+ * opposite direction.
+ *
+ * per-CPU threads 1 by triggering userfaults inside
+ * pthread_mutex_lock will also verify the atomicity of the memory
+ * transfer (UFFDIO_COPY).
+ *
+ * The program takes two parameters: the amounts of physical memory in
+ * megabytes (MiB) of the area and the number of bounces to execute.
+ *
+ * # 100MiB 99999 bounces
+ * ./userfaultfd 100 99999
+ *
+ * # 1GiB 99 bounces
+ * ./userfaultfd 1000 99
+ *
+ * # 10MiB-~6GiB 999 bounces, continue forever unless an error triggers
+ * while ./userfaultfd $[RANDOM % 6000 + 10] 999; do true; done
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <signal.h>
+#include <poll.h>
+#include <string.h>
+#include <linux/mman.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/wait.h>
+#include <pthread.h>
+#include <linux/userfaultfd.h>
+#include <setjmp.h>
+#include <stdbool.h>
+
+#include "../kselftest.h"
+
+#ifdef __NR_userfaultfd
+
+static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
+
+#define BOUNCE_RANDOM		(1<<0)
+#define BOUNCE_RACINGFAULTS	(1<<1)
+#define BOUNCE_VERIFY		(1<<2)
+#define BOUNCE_POLL		(1<<3)
+static int bounces;
+
+#define TEST_ANON	1
+#define TEST_HUGETLB	2
+#define TEST_SHMEM	3
+static int test_type;
+
+/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
+#define ALARM_INTERVAL_SECS 10
+static volatile bool test_uffdio_copy_eexist = true;
+static volatile bool test_uffdio_zeropage_eexist = true;
+
+static bool map_shared;
+static int huge_fd;
+static char *huge_fd_off0;
+static unsigned long long *count_verify;
+static int uffd, uffd_flags, finished, *pipefd;
+static char *area_src, *area_src_alias, *area_dst, *area_dst_alias;
+static char *zeropage;
+pthread_attr_t attr;
+
+/* pthread_mutex_t starts at page offset 0 */
+#define area_mutex(___area, ___nr)					\
+	((pthread_mutex_t *) ((___area) + (___nr)*page_size))
+/*
+ * count is placed in the page after pthread_mutex_t naturally aligned
+ * to avoid non alignment faults on non-x86 archs.
+ */
+#define area_count(___area, ___nr)					\
+	((volatile unsigned long long *) ((unsigned long)		\
+				 ((___area) + (___nr)*page_size +	\
+				  sizeof(pthread_mutex_t) +		\
+				  sizeof(unsigned long long) - 1) &	\
+				 ~(unsigned long)(sizeof(unsigned long long) \
+						  -  1)))
+
+static int anon_release_pages(char *rel_area)
+{
+	int ret = 0;
+
+	if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) {
+		perror("madvise");
+		ret = 1;
+	}
+
+	return ret;
+}
+
+static void anon_allocate_area(void **alloc_area)
+{
+	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+			   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	if (*alloc_area == MAP_FAILED) {
+		fprintf(stderr, "mmap of anonymous memory failed");
+		*alloc_area = NULL;
+	}
+}
+
+static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
+{
+}
+
+/* HugeTLB memory */
+static int hugetlb_release_pages(char *rel_area)
+{
+	int ret = 0;
+
+	if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+				rel_area == huge_fd_off0 ? 0 :
+				nr_pages * page_size,
+				nr_pages * page_size)) {
+		perror("fallocate");
+		ret = 1;
+	}
+
+	return ret;
+}
+
+
+static void hugetlb_allocate_area(void **alloc_area)
+{
+	void *area_alias = NULL;
+	char **alloc_area_alias;
+	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+			   (map_shared ? MAP_SHARED : MAP_PRIVATE) |
+			   MAP_HUGETLB,
+			   huge_fd, *alloc_area == area_src ? 0 :
+			   nr_pages * page_size);
+	if (*alloc_area == MAP_FAILED) {
+		fprintf(stderr, "mmap of hugetlbfs file failed\n");
+		*alloc_area = NULL;
+	}
+
+	if (map_shared) {
+		area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+				  MAP_SHARED | MAP_HUGETLB,
+				  huge_fd, *alloc_area == area_src ? 0 :
+				  nr_pages * page_size);
+		if (area_alias == MAP_FAILED) {
+			if (munmap(*alloc_area, nr_pages * page_size) < 0)
+				perror("hugetlb munmap"), exit(1);
+			*alloc_area = NULL;
+			return;
+		}
+	}
+	if (*alloc_area == area_src) {
+		huge_fd_off0 = *alloc_area;
+		alloc_area_alias = &area_src_alias;
+	} else {
+		alloc_area_alias = &area_dst_alias;
+	}
+	if (area_alias)
+		*alloc_area_alias = area_alias;
+}
+
+static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
+{
+	if (!map_shared)
+		return;
+	/*
+	 * We can't zap just the pagetable with hugetlbfs because
+	 * MADV_DONTEED won't work. So exercise -EEXIST on a alias
+	 * mapping where the pagetables are not established initially,
+	 * this way we'll exercise the -EEXEC at the fs level.
+	 */
+	*start = (unsigned long) area_dst_alias + offset;
+}
+
+/* Shared memory */
+static int shmem_release_pages(char *rel_area)
+{
+	int ret = 0;
+
+	if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) {
+		perror("madvise");
+		ret = 1;
+	}
+
+	return ret;
+}
+
+static void shmem_allocate_area(void **alloc_area)
+{
+	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+			   MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+	if (*alloc_area == MAP_FAILED) {
+		fprintf(stderr, "shared memory mmap failed\n");
+		*alloc_area = NULL;
+	}
+}
+
+struct uffd_test_ops {
+	unsigned long expected_ioctls;
+	void (*allocate_area)(void **alloc_area);
+	int (*release_pages)(char *rel_area);
+	void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
+};
+
+#define ANON_EXPECTED_IOCTLS		((1 << _UFFDIO_WAKE) | \
+					 (1 << _UFFDIO_COPY) | \
+					 (1 << _UFFDIO_ZEROPAGE))
+
+static struct uffd_test_ops anon_uffd_test_ops = {
+	.expected_ioctls = ANON_EXPECTED_IOCTLS,
+	.allocate_area	= anon_allocate_area,
+	.release_pages	= anon_release_pages,
+	.alias_mapping = noop_alias_mapping,
+};
+
+static struct uffd_test_ops shmem_uffd_test_ops = {
+	.expected_ioctls = ANON_EXPECTED_IOCTLS,
+	.allocate_area	= shmem_allocate_area,
+	.release_pages	= shmem_release_pages,
+	.alias_mapping = noop_alias_mapping,
+};
+
+static struct uffd_test_ops hugetlb_uffd_test_ops = {
+	.expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC,
+	.allocate_area	= hugetlb_allocate_area,
+	.release_pages	= hugetlb_release_pages,
+	.alias_mapping = hugetlb_alias_mapping,
+};
+
+static struct uffd_test_ops *uffd_test_ops;
+
+static int my_bcmp(char *str1, char *str2, size_t n)
+{
+	unsigned long i;
+	for (i = 0; i < n; i++)
+		if (str1[i] != str2[i])
+			return 1;
+	return 0;
+}
+
+static void *locking_thread(void *arg)
+{
+	unsigned long cpu = (unsigned long) arg;
+	struct random_data rand;
+	unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
+	int32_t rand_nr;
+	unsigned long long count;
+	char randstate[64];
+	unsigned int seed;
+	time_t start;
+
+	if (bounces & BOUNCE_RANDOM) {
+		seed = (unsigned int) time(NULL) - bounces;
+		if (!(bounces & BOUNCE_RACINGFAULTS))
+			seed += cpu;
+		bzero(&rand, sizeof(rand));
+		bzero(&randstate, sizeof(randstate));
+		if (initstate_r(seed, randstate, sizeof(randstate), &rand))
+			fprintf(stderr, "srandom_r error\n"), exit(1);
+	} else {
+		page_nr = -bounces;
+		if (!(bounces & BOUNCE_RACINGFAULTS))
+			page_nr += cpu * nr_pages_per_cpu;
+	}
+
+	while (!finished) {
+		if (bounces & BOUNCE_RANDOM) {
+			if (random_r(&rand, &rand_nr))
+				fprintf(stderr, "random_r 1 error\n"), exit(1);
+			page_nr = rand_nr;
+			if (sizeof(page_nr) > sizeof(rand_nr)) {
+				if (random_r(&rand, &rand_nr))
+					fprintf(stderr, "random_r 2 error\n"), exit(1);
+				page_nr |= (((unsigned long) rand_nr) << 16) <<
+					   16;
+			}
+		} else
+			page_nr += 1;
+		page_nr %= nr_pages;
+
+		start = time(NULL);
+		if (bounces & BOUNCE_VERIFY) {
+			count = *area_count(area_dst, page_nr);
+			if (!count)
+				fprintf(stderr,
+					"page_nr %lu wrong count %Lu %Lu\n",
+					page_nr, count,
+					count_verify[page_nr]), exit(1);
+
+
+			/*
+			 * We can't use bcmp (or memcmp) because that
+			 * returns 0 erroneously if the memory is
+			 * changing under it (even if the end of the
+			 * page is never changing and always
+			 * different).
+			 */
+#if 1
+			if (!my_bcmp(area_dst + page_nr * page_size, zeropage,
+				     page_size))
+				fprintf(stderr,
+					"my_bcmp page_nr %lu wrong count %Lu %Lu\n",
+					page_nr, count,
+					count_verify[page_nr]), exit(1);
+#else
+			unsigned long loops;
+
+			loops = 0;
+			/* uncomment the below line to test with mutex */
+			/* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */
+			while (!bcmp(area_dst + page_nr * page_size, zeropage,
+				     page_size)) {
+				loops += 1;
+				if (loops > 10)
+					break;
+			}
+			/* uncomment below line to test with mutex */
+			/* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */
+			if (loops) {
+				fprintf(stderr,
+					"page_nr %lu all zero thread %lu %p %lu\n",
+					page_nr, cpu, area_dst + page_nr * page_size,
+					loops);
+				if (loops > 10)
+					exit(1);
+			}
+#endif
+		}
+
+		pthread_mutex_lock(area_mutex(area_dst, page_nr));
+		count = *area_count(area_dst, page_nr);
+		if (count != count_verify[page_nr]) {
+			fprintf(stderr,
+				"page_nr %lu memory corruption %Lu %Lu\n",
+				page_nr, count,
+				count_verify[page_nr]), exit(1);
+		}
+		count++;
+		*area_count(area_dst, page_nr) = count_verify[page_nr] = count;
+		pthread_mutex_unlock(area_mutex(area_dst, page_nr));
+
+		if (time(NULL) - start > 1)
+			fprintf(stderr,
+				"userfault too slow %ld "
+				"possible false positive with overcommit\n",
+				time(NULL) - start);
+	}
+
+	return NULL;
+}
+
+static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
+			    unsigned long offset)
+{
+	uffd_test_ops->alias_mapping(&uffdio_copy->dst,
+				     uffdio_copy->len,
+				     offset);
+	if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
+		/* real retval in ufdio_copy.copy */
+		if (uffdio_copy->copy != -EEXIST)
+			fprintf(stderr, "UFFDIO_COPY retry error %Ld\n",
+				uffdio_copy->copy), exit(1);
+	} else {
+		fprintf(stderr,	"UFFDIO_COPY retry unexpected %Ld\n",
+			uffdio_copy->copy), exit(1);
+	}
+}
+
+static int __copy_page(int ufd, unsigned long offset, bool retry)
+{
+	struct uffdio_copy uffdio_copy;
+
+	if (offset >= nr_pages * page_size)
+		fprintf(stderr, "unexpected offset %lu\n",
+			offset), exit(1);
+	uffdio_copy.dst = (unsigned long) area_dst + offset;
+	uffdio_copy.src = (unsigned long) area_src + offset;
+	uffdio_copy.len = page_size;
+	uffdio_copy.mode = 0;
+	uffdio_copy.copy = 0;
+	if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
+		/* real retval in ufdio_copy.copy */
+		if (uffdio_copy.copy != -EEXIST)
+			fprintf(stderr, "UFFDIO_COPY error %Ld\n",
+				uffdio_copy.copy), exit(1);
+	} else if (uffdio_copy.copy != page_size) {
+		fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n",
+			uffdio_copy.copy), exit(1);
+	} else {
+		if (test_uffdio_copy_eexist && retry) {
+			test_uffdio_copy_eexist = false;
+			retry_copy_page(ufd, &uffdio_copy, offset);
+		}
+		return 1;
+	}
+	return 0;
+}
+
+static int copy_page_retry(int ufd, unsigned long offset)
+{
+	return __copy_page(ufd, offset, true);
+}
+
+static int copy_page(int ufd, unsigned long offset)
+{
+	return __copy_page(ufd, offset, false);
+}
+
+static void *uffd_poll_thread(void *arg)
+{
+	unsigned long cpu = (unsigned long) arg;
+	struct pollfd pollfd[2];
+	struct uffd_msg msg;
+	struct uffdio_register uffd_reg;
+	int ret;
+	unsigned long offset;
+	char tmp_chr;
+	unsigned long userfaults = 0;
+
+	pollfd[0].fd = uffd;
+	pollfd[0].events = POLLIN;
+	pollfd[1].fd = pipefd[cpu*2];
+	pollfd[1].events = POLLIN;
+
+	for (;;) {
+		ret = poll(pollfd, 2, -1);
+		if (!ret)
+			fprintf(stderr, "poll error %d\n", ret), exit(1);
+		if (ret < 0)
+			perror("poll"), exit(1);
+		if (pollfd[1].revents & POLLIN) {
+			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
+				fprintf(stderr, "read pipefd error\n"),
+					exit(1);
+			break;
+		}
+		if (!(pollfd[0].revents & POLLIN))
+			fprintf(stderr, "pollfd[0].revents %d\n",
+				pollfd[0].revents), exit(1);
+		ret = read(uffd, &msg, sizeof(msg));
+		if (ret < 0) {
+			if (errno == EAGAIN)
+				continue;
+			perror("nonblocking read error"), exit(1);
+		}
+		switch (msg.event) {
+		default:
+			fprintf(stderr, "unexpected msg event %u\n",
+				msg.event), exit(1);
+			break;
+		case UFFD_EVENT_PAGEFAULT:
+			if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
+				fprintf(stderr, "unexpected write fault\n"), exit(1);
+			offset = (char *)(unsigned long)msg.arg.pagefault.address -
+				area_dst;
+			offset &= ~(page_size-1);
+			if (copy_page(uffd, offset))
+				userfaults++;
+			break;
+		case UFFD_EVENT_FORK:
+			close(uffd);
+			uffd = msg.arg.fork.ufd;
+			pollfd[0].fd = uffd;
+			break;
+		case UFFD_EVENT_REMOVE:
+			uffd_reg.range.start = msg.arg.remove.start;
+			uffd_reg.range.len = msg.arg.remove.end -
+				msg.arg.remove.start;
+			if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
+				fprintf(stderr, "remove failure\n"), exit(1);
+			break;
+		case UFFD_EVENT_REMAP:
+			area_dst = (char *)(unsigned long)msg.arg.remap.to;
+			break;
+		}
+	}
+	return (void *)userfaults;
+}
+
+pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static void *uffd_read_thread(void *arg)
+{
+	unsigned long *this_cpu_userfaults;
+	struct uffd_msg msg;
+	unsigned long offset;
+	int ret;
+
+	this_cpu_userfaults = (unsigned long *) arg;
+	*this_cpu_userfaults = 0;
+
+	pthread_mutex_unlock(&uffd_read_mutex);
+	/* from here cancellation is ok */
+
+	for (;;) {
+		ret = read(uffd, &msg, sizeof(msg));
+		if (ret != sizeof(msg)) {
+			if (ret < 0)
+				perror("blocking read error"), exit(1);
+			else
+				fprintf(stderr, "short read\n"), exit(1);
+		}
+		if (msg.event != UFFD_EVENT_PAGEFAULT)
+			fprintf(stderr, "unexpected msg event %u\n",
+				msg.event), exit(1);
+		if (bounces & BOUNCE_VERIFY &&
+		    msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
+			fprintf(stderr, "unexpected write fault\n"), exit(1);
+		offset = (char *)(unsigned long)msg.arg.pagefault.address -
+			 area_dst;
+		offset &= ~(page_size-1);
+		if (copy_page(uffd, offset))
+			(*this_cpu_userfaults)++;
+	}
+	return (void *)NULL;
+}
+
+static void *background_thread(void *arg)
+{
+	unsigned long cpu = (unsigned long) arg;
+	unsigned long page_nr;
+
+	for (page_nr = cpu * nr_pages_per_cpu;
+	     page_nr < (cpu+1) * nr_pages_per_cpu;
+	     page_nr++)
+		copy_page_retry(uffd, page_nr * page_size);
+
+	return NULL;
+}
+
+static int stress(unsigned long *userfaults)
+{
+	unsigned long cpu;
+	pthread_t locking_threads[nr_cpus];
+	pthread_t uffd_threads[nr_cpus];
+	pthread_t background_threads[nr_cpus];
+	void **_userfaults = (void **) userfaults;
+
+	finished = 0;
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		if (pthread_create(&locking_threads[cpu], &attr,
+				   locking_thread, (void *)cpu))
+			return 1;
+		if (bounces & BOUNCE_POLL) {
+			if (pthread_create(&uffd_threads[cpu], &attr,
+					   uffd_poll_thread, (void *)cpu))
+				return 1;
+		} else {
+			if (pthread_create(&uffd_threads[cpu], &attr,
+					   uffd_read_thread,
+					   &_userfaults[cpu]))
+				return 1;
+			pthread_mutex_lock(&uffd_read_mutex);
+		}
+		if (pthread_create(&background_threads[cpu], &attr,
+				   background_thread, (void *)cpu))
+			return 1;
+	}
+	for (cpu = 0; cpu < nr_cpus; cpu++)
+		if (pthread_join(background_threads[cpu], NULL))
+			return 1;
+
+	/*
+	 * Be strict and immediately zap area_src, the whole area has
+	 * been transferred already by the background treads. The
+	 * area_src could then be faulted in in a racy way by still
+	 * running uffdio_threads reading zeropages after we zapped
+	 * area_src (but they're guaranteed to get -EEXIST from
+	 * UFFDIO_COPY without writing zero pages into area_dst
+	 * because the background threads already completed).
+	 */
+	if (uffd_test_ops->release_pages(area_src))
+		return 1;
+
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		char c;
+		if (bounces & BOUNCE_POLL) {
+			if (write(pipefd[cpu*2+1], &c, 1) != 1) {
+				fprintf(stderr, "pipefd write error\n");
+				return 1;
+			}
+			if (pthread_join(uffd_threads[cpu], &_userfaults[cpu]))
+				return 1;
+		} else {
+			if (pthread_cancel(uffd_threads[cpu]))
+				return 1;
+			if (pthread_join(uffd_threads[cpu], NULL))
+				return 1;
+		}
+	}
+
+	finished = 1;
+	for (cpu = 0; cpu < nr_cpus; cpu++)
+		if (pthread_join(locking_threads[cpu], NULL))
+			return 1;
+
+	return 0;
+}
+
+static int userfaultfd_open(int features)
+{
+	struct uffdio_api uffdio_api;
+
+	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+	if (uffd < 0) {
+		fprintf(stderr,
+			"userfaultfd syscall not available in this kernel\n");
+		return 1;
+	}
+	uffd_flags = fcntl(uffd, F_GETFD, NULL);
+
+	uffdio_api.api = UFFD_API;
+	uffdio_api.features = features;
+	if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
+		fprintf(stderr, "UFFDIO_API\n");
+		return 1;
+	}
+	if (uffdio_api.api != UFFD_API) {
+		fprintf(stderr, "UFFDIO_API error %Lu\n", uffdio_api.api);
+		return 1;
+	}
+
+	return 0;
+}
+
+sigjmp_buf jbuf, *sigbuf;
+
+static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
+{
+	if (sig == SIGBUS) {
+		if (sigbuf)
+			siglongjmp(*sigbuf, 1);
+		abort();
+	}
+}
+
+/*
+ * For non-cooperative userfaultfd test we fork() a process that will
+ * generate pagefaults, will mremap the area monitored by the
+ * userfaultfd and at last this process will release the monitored
+ * area.
+ * For the anonymous and shared memory the area is divided into two
+ * parts, the first part is accessed before mremap, and the second
+ * part is accessed after mremap. Since hugetlbfs does not support
+ * mremap, the entire monitored area is accessed in a single pass for
+ * HUGETLB_TEST.
+ * The release of the pages currently generates event for shmem and
+ * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
+ * for hugetlb.
+ * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register
+ * monitored area, generate pagefaults and test that signal is delivered.
+ * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2
+ * test robustness use case - we release monitored area, fork a process
+ * that will generate pagefaults and verify signal is generated.
+ * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal
+ * feature. Using monitor thread, verify no userfault events are generated.
+ */
+static int faulting_process(int signal_test)
+{
+	unsigned long nr;
+	unsigned long long count;
+	unsigned long split_nr_pages;
+	unsigned long lastnr;
+	struct sigaction act;
+	unsigned long signalled = 0;
+
+	if (test_type != TEST_HUGETLB)
+		split_nr_pages = (nr_pages + 1) / 2;
+	else
+		split_nr_pages = nr_pages;
+
+	if (signal_test) {
+		sigbuf = &jbuf;
+		memset(&act, 0, sizeof(act));
+		act.sa_sigaction = sighndl;
+		act.sa_flags = SA_SIGINFO;
+		if (sigaction(SIGBUS, &act, 0)) {
+			perror("sigaction");
+			return 1;
+		}
+		lastnr = (unsigned long)-1;
+	}
+
+	for (nr = 0; nr < split_nr_pages; nr++) {
+		if (signal_test) {
+			if (sigsetjmp(*sigbuf, 1) != 0) {
+				if (nr == lastnr) {
+					fprintf(stderr, "Signal repeated\n");
+					return 1;
+				}
+
+				lastnr = nr;
+				if (signal_test == 1) {
+					if (copy_page(uffd, nr * page_size))
+						signalled++;
+				} else {
+					signalled++;
+					continue;
+				}
+			}
+		}
+
+		count = *area_count(area_dst, nr);
+		if (count != count_verify[nr]) {
+			fprintf(stderr,
+				"nr %lu memory corruption %Lu %Lu\n",
+				nr, count,
+				count_verify[nr]), exit(1);
+		}
+	}
+
+	if (signal_test)
+		return signalled != split_nr_pages;
+
+	if (test_type == TEST_HUGETLB)
+		return 0;
+
+	area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
+			  MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
+	if (area_dst == MAP_FAILED)
+		perror("mremap"), exit(1);
+
+	for (; nr < nr_pages; nr++) {
+		count = *area_count(area_dst, nr);
+		if (count != count_verify[nr]) {
+			fprintf(stderr,
+				"nr %lu memory corruption %Lu %Lu\n",
+				nr, count,
+				count_verify[nr]), exit(1);
+		}
+	}
+
+	if (uffd_test_ops->release_pages(area_dst))
+		return 1;
+
+	for (nr = 0; nr < nr_pages; nr++) {
+		if (my_bcmp(area_dst + nr * page_size, zeropage, page_size))
+			fprintf(stderr, "nr %lu is not zero\n", nr), exit(1);
+	}
+
+	return 0;
+}
+
+static void retry_uffdio_zeropage(int ufd,
+				  struct uffdio_zeropage *uffdio_zeropage,
+				  unsigned long offset)
+{
+	uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
+				     uffdio_zeropage->range.len,
+				     offset);
+	if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
+		if (uffdio_zeropage->zeropage != -EEXIST)
+			fprintf(stderr, "UFFDIO_ZEROPAGE retry error %Ld\n",
+				uffdio_zeropage->zeropage), exit(1);
+	} else {
+		fprintf(stderr, "UFFDIO_ZEROPAGE retry unexpected %Ld\n",
+			uffdio_zeropage->zeropage), exit(1);
+	}
+}
+
+static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
+{
+	struct uffdio_zeropage uffdio_zeropage;
+	int ret;
+	unsigned long has_zeropage;
+
+	has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE);
+
+	if (offset >= nr_pages * page_size)
+		fprintf(stderr, "unexpected offset %lu\n",
+			offset), exit(1);
+	uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
+	uffdio_zeropage.range.len = page_size;
+	uffdio_zeropage.mode = 0;
+	ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
+	if (ret) {
+		/* real retval in ufdio_zeropage.zeropage */
+		if (has_zeropage) {
+			if (uffdio_zeropage.zeropage == -EEXIST)
+				fprintf(stderr, "UFFDIO_ZEROPAGE -EEXIST\n"),
+					exit(1);
+			else
+				fprintf(stderr, "UFFDIO_ZEROPAGE error %Ld\n",
+					uffdio_zeropage.zeropage), exit(1);
+		} else {
+			if (uffdio_zeropage.zeropage != -EINVAL)
+				fprintf(stderr,
+					"UFFDIO_ZEROPAGE not -EINVAL %Ld\n",
+					uffdio_zeropage.zeropage), exit(1);
+		}
+	} else if (has_zeropage) {
+		if (uffdio_zeropage.zeropage != page_size) {
+			fprintf(stderr, "UFFDIO_ZEROPAGE unexpected %Ld\n",
+				uffdio_zeropage.zeropage), exit(1);
+		} else {
+			if (test_uffdio_zeropage_eexist && retry) {
+				test_uffdio_zeropage_eexist = false;
+				retry_uffdio_zeropage(ufd, &uffdio_zeropage,
+						      offset);
+			}
+			return 1;
+		}
+	} else {
+		fprintf(stderr,
+			"UFFDIO_ZEROPAGE succeeded %Ld\n",
+			uffdio_zeropage.zeropage), exit(1);
+	}
+
+	return 0;
+}
+
+static int uffdio_zeropage(int ufd, unsigned long offset)
+{
+	return __uffdio_zeropage(ufd, offset, false);
+}
+
+/* exercise UFFDIO_ZEROPAGE */
+static int userfaultfd_zeropage_test(void)
+{
+	struct uffdio_register uffdio_register;
+	unsigned long expected_ioctls;
+
+	printf("testing UFFDIO_ZEROPAGE: ");
+	fflush(stdout);
+
+	if (uffd_test_ops->release_pages(area_dst))
+		return 1;
+
+	if (userfaultfd_open(0) < 0)
+		return 1;
+	uffdio_register.range.start = (unsigned long) area_dst;
+	uffdio_register.range.len = nr_pages * page_size;
+	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+		fprintf(stderr, "register failure\n"), exit(1);
+
+	expected_ioctls = uffd_test_ops->expected_ioctls;
+	if ((uffdio_register.ioctls & expected_ioctls) !=
+	    expected_ioctls)
+		fprintf(stderr,
+			"unexpected missing ioctl for anon memory\n"),
+			exit(1);
+
+	if (uffdio_zeropage(uffd, 0)) {
+		if (my_bcmp(area_dst, zeropage, page_size))
+			fprintf(stderr, "zeropage is not zero\n"), exit(1);
+	}
+
+	close(uffd);
+	printf("done.\n");
+	return 0;
+}
+
+static int userfaultfd_events_test(void)
+{
+	struct uffdio_register uffdio_register;
+	unsigned long expected_ioctls;
+	unsigned long userfaults;
+	pthread_t uffd_mon;
+	int err, features;
+	pid_t pid;
+	char c;
+
+	printf("testing events (fork, remap, remove): ");
+	fflush(stdout);
+
+	if (uffd_test_ops->release_pages(area_dst))
+		return 1;
+
+	features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
+		UFFD_FEATURE_EVENT_REMOVE;
+	if (userfaultfd_open(features) < 0)
+		return 1;
+	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+
+	uffdio_register.range.start = (unsigned long) area_dst;
+	uffdio_register.range.len = nr_pages * page_size;
+	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+		fprintf(stderr, "register failure\n"), exit(1);
+
+	expected_ioctls = uffd_test_ops->expected_ioctls;
+	if ((uffdio_register.ioctls & expected_ioctls) !=
+	    expected_ioctls)
+		fprintf(stderr,
+			"unexpected missing ioctl for anon memory\n"),
+			exit(1);
+
+	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, NULL))
+		perror("uffd_poll_thread create"), exit(1);
+
+	pid = fork();
+	if (pid < 0)
+		perror("fork"), exit(1);
+
+	if (!pid)
+		return faulting_process(0);
+
+	waitpid(pid, &err, 0);
+	if (err)
+		fprintf(stderr, "faulting process failed\n"), exit(1);
+
+	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
+		perror("pipe write"), exit(1);
+	if (pthread_join(uffd_mon, (void **)&userfaults))
+		return 1;
+
+	close(uffd);
+	printf("userfaults: %ld\n", userfaults);
+
+	return userfaults != nr_pages;
+}
+
+static int userfaultfd_sig_test(void)
+{
+	struct uffdio_register uffdio_register;
+	unsigned long expected_ioctls;
+	unsigned long userfaults;
+	pthread_t uffd_mon;
+	int err, features;
+	pid_t pid;
+	char c;
+
+	printf("testing signal delivery: ");
+	fflush(stdout);
+
+	if (uffd_test_ops->release_pages(area_dst))
+		return 1;
+
+	features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
+	if (userfaultfd_open(features) < 0)
+		return 1;
+	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+
+	uffdio_register.range.start = (unsigned long) area_dst;
+	uffdio_register.range.len = nr_pages * page_size;
+	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+		fprintf(stderr, "register failure\n"), exit(1);
+
+	expected_ioctls = uffd_test_ops->expected_ioctls;
+	if ((uffdio_register.ioctls & expected_ioctls) !=
+	    expected_ioctls)
+		fprintf(stderr,
+			"unexpected missing ioctl for anon memory\n"),
+			exit(1);
+
+	if (faulting_process(1))
+		fprintf(stderr, "faulting process failed\n"), exit(1);
+
+	if (uffd_test_ops->release_pages(area_dst))
+		return 1;
+
+	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, NULL))
+		perror("uffd_poll_thread create"), exit(1);
+
+	pid = fork();
+	if (pid < 0)
+		perror("fork"), exit(1);
+
+	if (!pid)
+		exit(faulting_process(2));
+
+	waitpid(pid, &err, 0);
+	if (err)
+		fprintf(stderr, "faulting process failed\n"), exit(1);
+
+	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
+		perror("pipe write"), exit(1);
+	if (pthread_join(uffd_mon, (void **)&userfaults))
+		return 1;
+
+	printf("done.\n");
+	if (userfaults)
+		fprintf(stderr, "Signal test failed, userfaults: %ld\n",
+			userfaults);
+	close(uffd);
+	return userfaults != 0;
+}
+static int userfaultfd_stress(void)
+{
+	void *area;
+	char *tmp_area;
+	unsigned long nr;
+	struct uffdio_register uffdio_register;
+	unsigned long cpu;
+	int err;
+	unsigned long userfaults[nr_cpus];
+
+	uffd_test_ops->allocate_area((void **)&area_src);
+	if (!area_src)
+		return 1;
+	uffd_test_ops->allocate_area((void **)&area_dst);
+	if (!area_dst)
+		return 1;
+
+	if (userfaultfd_open(0) < 0)
+		return 1;
+
+	count_verify = malloc(nr_pages * sizeof(unsigned long long));
+	if (!count_verify) {
+		perror("count_verify");
+		return 1;
+	}
+
+	for (nr = 0; nr < nr_pages; nr++) {
+		*area_mutex(area_src, nr) = (pthread_mutex_t)
+			PTHREAD_MUTEX_INITIALIZER;
+		count_verify[nr] = *area_count(area_src, nr) = 1;
+		/*
+		 * In the transition between 255 to 256, powerpc will
+		 * read out of order in my_bcmp and see both bytes as
+		 * zero, so leave a placeholder below always non-zero
+		 * after the count, to avoid my_bcmp to trigger false
+		 * positives.
+		 */
+		*(area_count(area_src, nr) + 1) = 1;
+	}
+
+	pipefd = malloc(sizeof(int) * nr_cpus * 2);
+	if (!pipefd) {
+		perror("pipefd");
+		return 1;
+	}
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) {
+			perror("pipe");
+			return 1;
+		}
+	}
+
+	if (posix_memalign(&area, page_size, page_size)) {
+		fprintf(stderr, "out of memory\n");
+		return 1;
+	}
+	zeropage = area;
+	bzero(zeropage, page_size);
+
+	pthread_mutex_lock(&uffd_read_mutex);
+
+	pthread_attr_init(&attr);
+	pthread_attr_setstacksize(&attr, 16*1024*1024);
+
+	err = 0;
+	while (bounces--) {
+		unsigned long expected_ioctls;
+
+		printf("bounces: %d, mode:", bounces);
+		if (bounces & BOUNCE_RANDOM)
+			printf(" rnd");
+		if (bounces & BOUNCE_RACINGFAULTS)
+			printf(" racing");
+		if (bounces & BOUNCE_VERIFY)
+			printf(" ver");
+		if (bounces & BOUNCE_POLL)
+			printf(" poll");
+		printf(", ");
+		fflush(stdout);
+
+		if (bounces & BOUNCE_POLL)
+			fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+		else
+			fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
+
+		/* register */
+		uffdio_register.range.start = (unsigned long) area_dst;
+		uffdio_register.range.len = nr_pages * page_size;
+		uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+		if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
+			fprintf(stderr, "register failure\n");
+			return 1;
+		}
+		expected_ioctls = uffd_test_ops->expected_ioctls;
+		if ((uffdio_register.ioctls & expected_ioctls) !=
+		    expected_ioctls) {
+			fprintf(stderr,
+				"unexpected missing ioctl for anon memory\n");
+			return 1;
+		}
+
+		if (area_dst_alias) {
+			uffdio_register.range.start = (unsigned long)
+				area_dst_alias;
+			if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
+				fprintf(stderr, "register failure alias\n");
+				return 1;
+			}
+		}
+
+		/*
+		 * The madvise done previously isn't enough: some
+		 * uffd_thread could have read userfaults (one of
+		 * those already resolved by the background thread)
+		 * and it may be in the process of calling
+		 * UFFDIO_COPY. UFFDIO_COPY will read the zapped
+		 * area_src and it would map a zero page in it (of
+		 * course such a UFFDIO_COPY is perfectly safe as it'd
+		 * return -EEXIST). The problem comes at the next
+		 * bounce though: that racing UFFDIO_COPY would
+		 * generate zeropages in the area_src, so invalidating
+		 * the previous MADV_DONTNEED. Without this additional
+		 * MADV_DONTNEED those zeropages leftovers in the
+		 * area_src would lead to -EEXIST failure during the
+		 * next bounce, effectively leaving a zeropage in the
+		 * area_dst.
+		 *
+		 * Try to comment this out madvise to see the memory
+		 * corruption being caught pretty quick.
+		 *
+		 * khugepaged is also inhibited to collapse THP after
+		 * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
+		 * required to MADV_DONTNEED here.
+		 */
+		if (uffd_test_ops->release_pages(area_dst))
+			return 1;
+
+		/* bounce pass */
+		if (stress(userfaults))
+			return 1;
+
+		/* unregister */
+		if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
+			fprintf(stderr, "unregister failure\n");
+			return 1;
+		}
+		if (area_dst_alias) {
+			uffdio_register.range.start = (unsigned long) area_dst;
+			if (ioctl(uffd, UFFDIO_UNREGISTER,
+				  &uffdio_register.range)) {
+				fprintf(stderr, "unregister failure alias\n");
+				return 1;
+			}
+		}
+
+		/* verification */
+		if (bounces & BOUNCE_VERIFY) {
+			for (nr = 0; nr < nr_pages; nr++) {
+				if (*area_count(area_dst, nr) != count_verify[nr]) {
+					fprintf(stderr,
+						"error area_count %Lu %Lu %lu\n",
+						*area_count(area_src, nr),
+						count_verify[nr],
+						nr);
+					err = 1;
+					bounces = 0;
+				}
+			}
+		}
+
+		/* prepare next bounce */
+		tmp_area = area_src;
+		area_src = area_dst;
+		area_dst = tmp_area;
+
+		tmp_area = area_src_alias;
+		area_src_alias = area_dst_alias;
+		area_dst_alias = tmp_area;
+
+		printf("userfaults:");
+		for (cpu = 0; cpu < nr_cpus; cpu++)
+			printf(" %lu", userfaults[cpu]);
+		printf("\n");
+	}
+
+	if (err)
+		return err;
+
+	close(uffd);
+	return userfaultfd_zeropage_test() || userfaultfd_sig_test()
+		|| userfaultfd_events_test();
+}
+
+/*
+ * Copied from mlock2-tests.c
+ */
+unsigned long default_huge_page_size(void)
+{
+	unsigned long hps = 0;
+	char *line = NULL;
+	size_t linelen = 0;
+	FILE *f = fopen("/proc/meminfo", "r");
+
+	if (!f)
+		return 0;
+	while (getline(&line, &linelen, f) > 0) {
+		if (sscanf(line, "Hugepagesize:       %lu kB", &hps) == 1) {
+			hps <<= 10;
+			break;
+		}
+	}
+
+	free(line);
+	fclose(f);
+	return hps;
+}
+
+static void set_test_type(const char *type)
+{
+	if (!strcmp(type, "anon")) {
+		test_type = TEST_ANON;
+		uffd_test_ops = &anon_uffd_test_ops;
+	} else if (!strcmp(type, "hugetlb")) {
+		test_type = TEST_HUGETLB;
+		uffd_test_ops = &hugetlb_uffd_test_ops;
+	} else if (!strcmp(type, "hugetlb_shared")) {
+		map_shared = true;
+		test_type = TEST_HUGETLB;
+		uffd_test_ops = &hugetlb_uffd_test_ops;
+	} else if (!strcmp(type, "shmem")) {
+		map_shared = true;
+		test_type = TEST_SHMEM;
+		uffd_test_ops = &shmem_uffd_test_ops;
+	} else {
+		fprintf(stderr, "Unknown test type: %s\n", type), exit(1);
+	}
+
+	if (test_type == TEST_HUGETLB)
+		page_size = default_huge_page_size();
+	else
+		page_size = sysconf(_SC_PAGE_SIZE);
+
+	if (!page_size)
+		fprintf(stderr, "Unable to determine page size\n"),
+				exit(2);
+	if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
+	    > page_size)
+		fprintf(stderr, "Impossible to run this test\n"), exit(2);
+}
+
+static void sigalrm(int sig)
+{
+	if (sig != SIGALRM)
+		abort();
+	test_uffdio_copy_eexist = true;
+	test_uffdio_zeropage_eexist = true;
+	alarm(ALARM_INTERVAL_SECS);
+}
+
+int main(int argc, char **argv)
+{
+	if (argc < 4)
+		fprintf(stderr, "Usage: <test type> <MiB> <bounces> [hugetlbfs_file]\n"),
+				exit(1);
+
+	if (signal(SIGALRM, sigalrm) == SIG_ERR)
+		fprintf(stderr, "failed to arm SIGALRM"), exit(1);
+	alarm(ALARM_INTERVAL_SECS);
+
+	set_test_type(argv[1]);
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size /
+		nr_cpus;
+	if (!nr_pages_per_cpu) {
+		fprintf(stderr, "invalid MiB\n");
+		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
+	}
+
+	bounces = atoi(argv[3]);
+	if (bounces <= 0) {
+		fprintf(stderr, "invalid bounces\n");
+		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
+	}
+	nr_pages = nr_pages_per_cpu * nr_cpus;
+
+	if (test_type == TEST_HUGETLB) {
+		if (argc < 5)
+			fprintf(stderr, "Usage: hugetlb <MiB> <bounces> <hugetlbfs_file>\n"),
+				exit(1);
+		huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
+		if (huge_fd < 0) {
+			fprintf(stderr, "Open of %s failed", argv[3]);
+			perror("open");
+			exit(1);
+		}
+		if (ftruncate(huge_fd, 0)) {
+			fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]);
+			perror("ftruncate");
+			exit(1);
+		}
+	}
+	printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
+	       nr_pages, nr_pages_per_cpu);
+	return userfaultfd_stress();
+}
+
+#else /* __NR_userfaultfd */
+
+#warning "missing __NR_userfaultfd definition"
+
+int main(void)
+{
+	printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
+	return KSFT_SKIP;
+}
+
+#endif /* __NR_userfaultfd */
diff --git a/tools/testing/selftests/vm/va_128TBswitch.c b/tools/testing/selftests/vm/va_128TBswitch.c
new file mode 100644
index 000000000..e7fe734c3
--- /dev/null
+++ b/tools/testing/selftests/vm/va_128TBswitch.c
@@ -0,0 +1,297 @@
+/*
+ *
+ * Authors: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+ * Authors: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <stdio.h>
+#include <sys/mman.h>
+#include <string.h>
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
+#ifdef __powerpc64__
+#define PAGE_SIZE	(64 << 10)
+/*
+ * This will work with 16M and 2M hugepage size
+ */
+#define HUGETLB_SIZE	(16 << 20)
+#else
+#define PAGE_SIZE	(4 << 10)
+#define HUGETLB_SIZE	(2 << 20)
+#endif
+
+/*
+ * >= 128TB is the hint addr value we used to select
+ * large address space.
+ */
+#define ADDR_SWITCH_HINT (1UL << 47)
+#define LOW_ADDR	((void *) (1UL << 30))
+#define HIGH_ADDR	((void *) (1UL << 48))
+
+struct testcase {
+	void *addr;
+	unsigned long size;
+	unsigned long flags;
+	const char *msg;
+	unsigned int low_addr_required:1;
+	unsigned int keep_mapped:1;
+};
+
+static struct testcase testcases[] = {
+	{
+		/*
+		 * If stack is moved, we could possibly allocate
+		 * this at the requested address.
+		 */
+		.addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
+		.size = PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)",
+		.low_addr_required = 1,
+	},
+	{
+		/*
+		 * We should never allocate at the requested address or above it
+		 * The len cross the 128TB boundary. Without MAP_FIXED
+		 * we will always search in the lower address space.
+		 */
+		.addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, (2 * PAGE_SIZE))",
+		.low_addr_required = 1,
+	},
+	{
+		/*
+		 * Exact mapping at 128TB, the area is free we should get that
+		 * even without MAP_FIXED.
+		 */
+		.addr = ((void *)(ADDR_SWITCH_HINT)),
+		.size = PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)",
+		.keep_mapped = 1,
+	},
+	{
+		.addr = (void *)(ADDR_SWITCH_HINT),
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+		.msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)",
+	},
+	{
+		.addr = NULL,
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(NULL)",
+		.low_addr_required = 1,
+	},
+	{
+		.addr = LOW_ADDR,
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(LOW_ADDR)",
+		.low_addr_required = 1,
+	},
+	{
+		.addr = HIGH_ADDR,
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(HIGH_ADDR)",
+		.keep_mapped = 1,
+	},
+	{
+		.addr = HIGH_ADDR,
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(HIGH_ADDR) again",
+		.keep_mapped = 1,
+	},
+	{
+		.addr = HIGH_ADDR,
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+		.msg = "mmap(HIGH_ADDR, MAP_FIXED)",
+	},
+	{
+		.addr = (void *) -1,
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(-1)",
+		.keep_mapped = 1,
+	},
+	{
+		.addr = (void *) -1,
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(-1) again",
+	},
+	{
+		.addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
+		.size = PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)",
+		.low_addr_required = 1,
+	},
+	{
+		.addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE),
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2 * PAGE_SIZE)",
+		.low_addr_required = 1,
+		.keep_mapped = 1,
+	},
+	{
+		.addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE / 2),
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE/2 , 2 * PAGE_SIZE)",
+		.low_addr_required = 1,
+		.keep_mapped = 1,
+	},
+	{
+		.addr = ((void *)(ADDR_SWITCH_HINT)),
+		.size = PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)",
+	},
+	{
+		.addr = (void *)(ADDR_SWITCH_HINT),
+		.size = 2 * PAGE_SIZE,
+		.flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+		.msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)",
+	},
+};
+
+static struct testcase hugetlb_testcases[] = {
+	{
+		.addr = NULL,
+		.size = HUGETLB_SIZE,
+		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(NULL, MAP_HUGETLB)",
+		.low_addr_required = 1,
+	},
+	{
+		.addr = LOW_ADDR,
+		.size = HUGETLB_SIZE,
+		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(LOW_ADDR, MAP_HUGETLB)",
+		.low_addr_required = 1,
+	},
+	{
+		.addr = HIGH_ADDR,
+		.size = HUGETLB_SIZE,
+		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(HIGH_ADDR, MAP_HUGETLB)",
+		.keep_mapped = 1,
+	},
+	{
+		.addr = HIGH_ADDR,
+		.size = HUGETLB_SIZE,
+		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again",
+		.keep_mapped = 1,
+	},
+	{
+		.addr = HIGH_ADDR,
+		.size = HUGETLB_SIZE,
+		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+		.msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)",
+	},
+	{
+		.addr = (void *) -1,
+		.size = HUGETLB_SIZE,
+		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(-1, MAP_HUGETLB)",
+		.keep_mapped = 1,
+	},
+	{
+		.addr = (void *) -1,
+		.size = HUGETLB_SIZE,
+		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(-1, MAP_HUGETLB) again",
+	},
+	{
+		.addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE),
+		.size = 2 * HUGETLB_SIZE,
+		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+		.msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2*HUGETLB_SIZE, MAP_HUGETLB)",
+		.low_addr_required = 1,
+		.keep_mapped = 1,
+	},
+	{
+		.addr = (void *)(ADDR_SWITCH_HINT),
+		.size = 2 * HUGETLB_SIZE,
+		.flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+		.msg = "mmap(ADDR_SWITCH_HINT , 2*HUGETLB_SIZE, MAP_FIXED | MAP_HUGETLB)",
+	},
+};
+
+static int run_test(struct testcase *test, int count)
+{
+	void *p;
+	int i, ret = 0;
+
+	for (i = 0; i < count; i++) {
+		struct testcase *t = test + i;
+
+		p = mmap(t->addr, t->size, PROT_READ | PROT_WRITE, t->flags, -1, 0);
+
+		printf("%s: %p - ", t->msg, p);
+
+		if (p == MAP_FAILED) {
+			printf("FAILED\n");
+			ret = 1;
+			continue;
+		}
+
+		if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) {
+			printf("FAILED\n");
+			ret = 1;
+		} else {
+			/*
+			 * Do a dereference of the address returned so that we catch
+			 * bugs in page fault handling
+			 */
+			memset(p, 0, t->size);
+			printf("OK\n");
+		}
+		if (!t->keep_mapped)
+			munmap(p, t->size);
+	}
+
+	return ret;
+}
+
+static int supported_arch(void)
+{
+#if defined(__powerpc64__)
+	return 1;
+#elif defined(__x86_64__)
+	return 1;
+#else
+	return 0;
+#endif
+}
+
+int main(int argc, char **argv)
+{
+	int ret;
+
+	if (!supported_arch())
+		return 0;
+
+	ret = run_test(testcases, ARRAY_SIZE(testcases));
+	if (argc == 2 && !strcmp(argv[1], "--run-hugetlb"))
+		ret = run_test(hugetlb_testcases, ARRAY_SIZE(hugetlb_testcases));
+	return ret;
+}
diff --git a/tools/testing/selftests/vm/virtual_address_range.c b/tools/testing/selftests/vm/virtual_address_range.c
new file mode 100644
index 000000000..1830d66a6
--- /dev/null
+++ b/tools/testing/selftests/vm/virtual_address_range.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright 2017, Anshuman Khandual, IBM Corp.
+ * Licensed under GPLv2.
+ *
+ * Works on architectures which support 128TB virtual
+ * address range and beyond.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <sys/time.h>
+
+/*
+ * Maximum address range mapped with a single mmap()
+ * call is little bit more than 16GB. Hence 16GB is
+ * chosen as the single chunk size for address space
+ * mapping.
+ */
+#define MAP_CHUNK_SIZE   17179869184UL /* 16GB */
+
+/*
+ * Address space till 128TB is mapped without any hint
+ * and is enabled by default. Address space beyond 128TB
+ * till 512TB is obtained by passing hint address as the
+ * first argument into mmap() system call.
+ *
+ * The process heap address space is divided into two
+ * different areas one below 128TB and one above 128TB
+ * till it reaches 512TB. One with size 128TB and the
+ * other being 384TB.
+ *
+ * On Arm64 the address space is 256TB and no high mappings
+ * are supported so far.
+ */
+
+#define NR_CHUNKS_128TB   8192UL /* Number of 16GB chunks for 128TB */
+#define NR_CHUNKS_256TB   (NR_CHUNKS_128TB * 2UL)
+#define NR_CHUNKS_384TB   (NR_CHUNKS_128TB * 3UL)
+
+#define ADDR_MARK_128TB  (1UL << 47) /* First address beyond 128TB */
+#define ADDR_MARK_256TB  (1UL << 48) /* First address beyond 256TB */
+
+#ifdef __aarch64__
+#define HIGH_ADDR_MARK  ADDR_MARK_256TB
+#define HIGH_ADDR_SHIFT 49
+#define NR_CHUNKS_LOW   NR_CHUNKS_256TB
+#define NR_CHUNKS_HIGH  0
+#else
+#define HIGH_ADDR_MARK  ADDR_MARK_128TB
+#define HIGH_ADDR_SHIFT 48
+#define NR_CHUNKS_LOW   NR_CHUNKS_128TB
+#define NR_CHUNKS_HIGH  NR_CHUNKS_384TB
+#endif
+
+static char *hind_addr(void)
+{
+	int bits = HIGH_ADDR_SHIFT + rand() % (63 - HIGH_ADDR_SHIFT);
+
+	return (char *) (1UL << bits);
+}
+
+static int validate_addr(char *ptr, int high_addr)
+{
+	unsigned long addr = (unsigned long) ptr;
+
+	if (high_addr) {
+		if (addr < HIGH_ADDR_MARK) {
+			printf("Bad address %lx\n", addr);
+			return 1;
+		}
+		return 0;
+	}
+
+	if (addr > HIGH_ADDR_MARK) {
+		printf("Bad address %lx\n", addr);
+		return 1;
+	}
+	return 0;
+}
+
+static int validate_lower_address_hint(void)
+{
+	char *ptr;
+
+	ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ |
+			PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+	if (ptr == MAP_FAILED)
+		return 0;
+
+	return 1;
+}
+
+int main(int argc, char *argv[])
+{
+	char *ptr[NR_CHUNKS_LOW];
+	char *hptr[NR_CHUNKS_HIGH];
+	char *hint;
+	unsigned long i, lchunks, hchunks;
+
+	for (i = 0; i < NR_CHUNKS_LOW; i++) {
+		ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE,
+					MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+		if (ptr[i] == MAP_FAILED) {
+			if (validate_lower_address_hint())
+				return 1;
+			break;
+		}
+
+		if (validate_addr(ptr[i], 0))
+			return 1;
+	}
+	lchunks = i;
+
+	for (i = 0; i < NR_CHUNKS_HIGH; i++) {
+		hint = hind_addr();
+		hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE,
+					MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+		if (hptr[i] == MAP_FAILED)
+			break;
+
+		if (validate_addr(hptr[i], 1))
+			return 1;
+	}
+	hchunks = i;
+
+	for (i = 0; i < lchunks; i++)
+		munmap(ptr[i], MAP_CHUNK_SIZE);
+
+	for (i = 0; i < hchunks; i++)
+		munmap(hptr[i], MAP_CHUNK_SIZE);
+
+	return 0;
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-06 01:02:30 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-06 01:02:30 +0000
commit	76cb841cb886eef6b3bee341a2266c76578724ad (patch)
tree	f5892e5ba6cc11949952a6ce4ecbe6d516d6ce58 /tools/testing/selftests/vm
parent	Initial commit. (diff)
download	linux-upstream.tar.xz linux-upstream.zip