Adding upstream version 6.1.76.upstream/6.1.76 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:49:45 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:49:45 +0000
commit: 2c3c1048746a4622d8c89a29670120dc8fab93c4 (patch)
tree: 848558de17fb3008cdf4d861b01ac7781903ce39 /tools/testing/selftests/cgroup
parent: Initial commit. (diff)
download: linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.tar.xz
linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.zip
16 files changed, 6267 insertions, 0 deletions
diff --git a/tools/testing/selftests/cgroup/.gitignore b/tools/testing/selftests/cgroup/.gitignore
new file mode 100644
index 000000000..c4a57e69f
--- /dev/null
+++ b/tools/testing/selftests/cgroup/.gitignore
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+test_memcontrol
+test_core
+test_freezer
+test_kmem
+test_kill
+test_cpu
+wait_inotify
diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile
new file mode 100644
index 000000000..3d263747d
--- /dev/null
+++ b/tools/testing/selftests/cgroup/Makefile
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: GPL-2.0
+CFLAGS += -Wall -pthread
+
+all: ${HELPER_PROGS}
+
+TEST_FILES     := with_stress.sh
+TEST_PROGS     := test_stress.sh test_cpuset_prs.sh
+TEST_GEN_FILES := wait_inotify
+TEST_GEN_PROGS = test_memcontrol
+TEST_GEN_PROGS += test_kmem
+TEST_GEN_PROGS += test_core
+TEST_GEN_PROGS += test_freezer
+TEST_GEN_PROGS += test_kill
+TEST_GEN_PROGS += test_cpu
+
+LOCAL_HDRS += $(selfdir)/clone3/clone3_selftests.h $(selfdir)/pidfd/pidfd.h
+
+include ../lib.mk
+
+$(OUTPUT)/test_memcontrol: cgroup_util.c
+$(OUTPUT)/test_kmem: cgroup_util.c
+$(OUTPUT)/test_core: cgroup_util.c
+$(OUTPUT)/test_freezer: cgroup_util.c
+$(OUTPUT)/test_kill: cgroup_util.c
+$(OUTPUT)/test_cpu: cgroup_util.c
diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c
new file mode 100644
index 000000000..e8bbbdb77
--- /dev/null
+++ b/tools/testing/selftests/cgroup/cgroup_util.c
@@ -0,0 +1,657 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/limits.h>
+#include <poll.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/inotify.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "cgroup_util.h"
+#include "../clone3/clone3_selftests.h"
+
+/* Returns read len on success, or -errno on failure. */
+static ssize_t read_text(const char *path, char *buf, size_t max_len)
+{
+	ssize_t len;
+	int fd;
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		return -errno;
+
+	len = read(fd, buf, max_len - 1);
+
+	if (len >= 0)
+		buf[len] = 0;
+
+	close(fd);
+	return len < 0 ? -errno : len;
+}
+
+/* Returns written len on success, or -errno on failure. */
+static ssize_t write_text(const char *path, char *buf, ssize_t len)
+{
+	int fd;
+
+	fd = open(path, O_WRONLY | O_APPEND);
+	if (fd < 0)
+		return -errno;
+
+	len = write(fd, buf, len);
+	close(fd);
+	return len < 0 ? -errno : len;
+}
+
+char *cg_name(const char *root, const char *name)
+{
+	size_t len = strlen(root) + strlen(name) + 2;
+	char *ret = malloc(len);
+
+	snprintf(ret, len, "%s/%s", root, name);
+
+	return ret;
+}
+
+char *cg_name_indexed(const char *root, const char *name, int index)
+{
+	size_t len = strlen(root) + strlen(name) + 10;
+	char *ret = malloc(len);
+
+	snprintf(ret, len, "%s/%s_%d", root, name, index);
+
+	return ret;
+}
+
+char *cg_control(const char *cgroup, const char *control)
+{
+	size_t len = strlen(cgroup) + strlen(control) + 2;
+	char *ret = malloc(len);
+
+	snprintf(ret, len, "%s/%s", cgroup, control);
+
+	return ret;
+}
+
+/* Returns 0 on success, or -errno on failure. */
+int cg_read(const char *cgroup, const char *control, char *buf, size_t len)
+{
+	char path[PATH_MAX];
+	ssize_t ret;
+
+	snprintf(path, sizeof(path), "%s/%s", cgroup, control);
+
+	ret = read_text(path, buf, len);
+	return ret >= 0 ? 0 : ret;
+}
+
+int cg_read_strcmp(const char *cgroup, const char *control,
+		   const char *expected)
+{
+	size_t size;
+	char *buf;
+	int ret;
+
+	/* Handle the case of comparing against empty string */
+	if (!expected)
+		return -1;
+	else
+		size = strlen(expected) + 1;
+
+	buf = malloc(size);
+	if (!buf)
+		return -1;
+
+	if (cg_read(cgroup, control, buf, size)) {
+		free(buf);
+		return -1;
+	}
+
+	ret = strcmp(expected, buf);
+	free(buf);
+	return ret;
+}
+
+int cg_read_strstr(const char *cgroup, const char *control, const char *needle)
+{
+	char buf[PAGE_SIZE];
+
+	if (cg_read(cgroup, control, buf, sizeof(buf)))
+		return -1;
+
+	return strstr(buf, needle) ? 0 : -1;
+}
+
+long cg_read_long(const char *cgroup, const char *control)
+{
+	char buf[128];
+
+	if (cg_read(cgroup, control, buf, sizeof(buf)))
+		return -1;
+
+	return atol(buf);
+}
+
+long cg_read_key_long(const char *cgroup, const char *control, const char *key)
+{
+	char buf[PAGE_SIZE];
+	char *ptr;
+
+	if (cg_read(cgroup, control, buf, sizeof(buf)))
+		return -1;
+
+	ptr = strstr(buf, key);
+	if (!ptr)
+		return -1;
+
+	return atol(ptr + strlen(key));
+}
+
+long cg_read_lc(const char *cgroup, const char *control)
+{
+	char buf[PAGE_SIZE];
+	const char delim[] = "\n";
+	char *line;
+	long cnt = 0;
+
+	if (cg_read(cgroup, control, buf, sizeof(buf)))
+		return -1;
+
+	for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
+		cnt++;
+
+	return cnt;
+}
+
+/* Returns 0 on success, or -errno on failure. */
+int cg_write(const char *cgroup, const char *control, char *buf)
+{
+	char path[PATH_MAX];
+	ssize_t len = strlen(buf), ret;
+
+	snprintf(path, sizeof(path), "%s/%s", cgroup, control);
+	ret = write_text(path, buf, len);
+	return ret == len ? 0 : ret;
+}
+
+int cg_write_numeric(const char *cgroup, const char *control, long value)
+{
+	char buf[64];
+	int ret;
+
+	ret = sprintf(buf, "%lu", value);
+	if (ret < 0)
+		return ret;
+
+	return cg_write(cgroup, control, buf);
+}
+
+int cg_find_unified_root(char *root, size_t len)
+{
+	char buf[10 * PAGE_SIZE];
+	char *fs, *mount, *type;
+	const char delim[] = "\n\t ";
+
+	if (read_text("/proc/self/mounts", buf, sizeof(buf)) <= 0)
+		return -1;
+
+	/*
+	 * Example:
+	 * cgroup /sys/fs/cgroup cgroup2 rw,seclabel,noexec,relatime 0 0
+	 */
+	for (fs = strtok(buf, delim); fs; fs = strtok(NULL, delim)) {
+		mount = strtok(NULL, delim);
+		type = strtok(NULL, delim);
+		strtok(NULL, delim);
+		strtok(NULL, delim);
+		strtok(NULL, delim);
+
+		if (strcmp(type, "cgroup2") == 0) {
+			strncpy(root, mount, len);
+			return 0;
+		}
+	}
+
+	return -1;
+}
+
+int cg_create(const char *cgroup)
+{
+	return mkdir(cgroup, 0755);
+}
+
+int cg_wait_for_proc_count(const char *cgroup, int count)
+{
+	char buf[10 * PAGE_SIZE] = {0};
+	int attempts;
+	char *ptr;
+
+	for (attempts = 10; attempts >= 0; attempts--) {
+		int nr = 0;
+
+		if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf)))
+			break;
+
+		for (ptr = buf; *ptr; ptr++)
+			if (*ptr == '\n')
+				nr++;
+
+		if (nr >= count)
+			return 0;
+
+		usleep(100000);
+	}
+
+	return -1;
+}
+
+int cg_killall(const char *cgroup)
+{
+	char buf[PAGE_SIZE];
+	char *ptr = buf;
+
+	/* If cgroup.kill exists use it. */
+	if (!cg_write(cgroup, "cgroup.kill", "1"))
+		return 0;
+
+	if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf)))
+		return -1;
+
+	while (ptr < buf + sizeof(buf)) {
+		int pid = strtol(ptr, &ptr, 10);
+
+		if (pid == 0)
+			break;
+		if (*ptr)
+			ptr++;
+		else
+			break;
+		if (kill(pid, SIGKILL))
+			return -1;
+	}
+
+	return 0;
+}
+
+int cg_destroy(const char *cgroup)
+{
+	int ret;
+
+retry:
+	ret = rmdir(cgroup);
+	if (ret && errno == EBUSY) {
+		cg_killall(cgroup);
+		usleep(100);
+		goto retry;
+	}
+
+	if (ret && errno == ENOENT)
+		ret = 0;
+
+	return ret;
+}
+
+int cg_enter(const char *cgroup, int pid)
+{
+	char pidbuf[64];
+
+	snprintf(pidbuf, sizeof(pidbuf), "%d", pid);
+	return cg_write(cgroup, "cgroup.procs", pidbuf);
+}
+
+int cg_enter_current(const char *cgroup)
+{
+	return cg_write(cgroup, "cgroup.procs", "0");
+}
+
+int cg_enter_current_thread(const char *cgroup)
+{
+	return cg_write(cgroup, "cgroup.threads", "0");
+}
+
+int cg_run(const char *cgroup,
+	   int (*fn)(const char *cgroup, void *arg),
+	   void *arg)
+{
+	int pid, retcode;
+
+	pid = fork();
+	if (pid < 0) {
+		return pid;
+	} else if (pid == 0) {
+		char buf[64];
+
+		snprintf(buf, sizeof(buf), "%d", getpid());
+		if (cg_write(cgroup, "cgroup.procs", buf))
+			exit(EXIT_FAILURE);
+		exit(fn(cgroup, arg));
+	} else {
+		waitpid(pid, &retcode, 0);
+		if (WIFEXITED(retcode))
+			return WEXITSTATUS(retcode);
+		else
+			return -1;
+	}
+}
+
+pid_t clone_into_cgroup(int cgroup_fd)
+{
+#ifdef CLONE_ARGS_SIZE_VER2
+	pid_t pid;
+
+	struct __clone_args args = {
+		.flags = CLONE_INTO_CGROUP,
+		.exit_signal = SIGCHLD,
+		.cgroup = cgroup_fd,
+	};
+
+	pid = sys_clone3(&args, sizeof(struct __clone_args));
+	/*
+	 * Verify that this is a genuine test failure:
+	 * ENOSYS -> clone3() not available
+	 * E2BIG  -> CLONE_INTO_CGROUP not available
+	 */
+	if (pid < 0 && (errno == ENOSYS || errno == E2BIG))
+		goto pretend_enosys;
+
+	return pid;
+
+pretend_enosys:
+#endif
+	errno = ENOSYS;
+	return -ENOSYS;
+}
+
+int clone_reap(pid_t pid, int options)
+{
+	int ret;
+	siginfo_t info = {
+		.si_signo = 0,
+	};
+
+again:
+	ret = waitid(P_PID, pid, &info, options | __WALL | __WNOTHREAD);
+	if (ret < 0) {
+		if (errno == EINTR)
+			goto again;
+		return -1;
+	}
+
+	if (options & WEXITED) {
+		if (WIFEXITED(info.si_status))
+			return WEXITSTATUS(info.si_status);
+	}
+
+	if (options & WSTOPPED) {
+		if (WIFSTOPPED(info.si_status))
+			return WSTOPSIG(info.si_status);
+	}
+
+	if (options & WCONTINUED) {
+		if (WIFCONTINUED(info.si_status))
+			return 0;
+	}
+
+	return -1;
+}
+
+int dirfd_open_opath(const char *dir)
+{
+	return open(dir, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW | O_PATH);
+}
+
+#define close_prot_errno(fd)                                                   \
+	if (fd >= 0) {                                                         \
+		int _e_ = errno;                                               \
+		close(fd);                                                     \
+		errno = _e_;                                                   \
+	}
+
+static int clone_into_cgroup_run_nowait(const char *cgroup,
+					int (*fn)(const char *cgroup, void *arg),
+					void *arg)
+{
+	int cgroup_fd;
+	pid_t pid;
+
+	cgroup_fd =  dirfd_open_opath(cgroup);
+	if (cgroup_fd < 0)
+		return -1;
+
+	pid = clone_into_cgroup(cgroup_fd);
+	close_prot_errno(cgroup_fd);
+	if (pid == 0)
+		exit(fn(cgroup, arg));
+
+	return pid;
+}
+
+int cg_run_nowait(const char *cgroup,
+		  int (*fn)(const char *cgroup, void *arg),
+		  void *arg)
+{
+	int pid;
+
+	pid = clone_into_cgroup_run_nowait(cgroup, fn, arg);
+	if (pid > 0)
+		return pid;
+
+	/* Genuine test failure. */
+	if (pid < 0 && errno != ENOSYS)
+		return -1;
+
+	pid = fork();
+	if (pid == 0) {
+		char buf[64];
+
+		snprintf(buf, sizeof(buf), "%d", getpid());
+		if (cg_write(cgroup, "cgroup.procs", buf))
+			exit(EXIT_FAILURE);
+		exit(fn(cgroup, arg));
+	}
+
+	return pid;
+}
+
+int get_temp_fd(void)
+{
+	return open(".", O_TMPFILE | O_RDWR | O_EXCL);
+}
+
+int alloc_pagecache(int fd, size_t size)
+{
+	char buf[PAGE_SIZE];
+	struct stat st;
+	int i;
+
+	if (fstat(fd, &st))
+		goto cleanup;
+
+	size += st.st_size;
+
+	if (ftruncate(fd, size))
+		goto cleanup;
+
+	for (i = 0; i < size; i += sizeof(buf))
+		read(fd, buf, sizeof(buf));
+
+	return 0;
+
+cleanup:
+	return -1;
+}
+
+int alloc_anon(const char *cgroup, void *arg)
+{
+	size_t size = (unsigned long)arg;
+	char *buf, *ptr;
+
+	buf = malloc(size);
+	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
+		*ptr = 0;
+
+	free(buf);
+	return 0;
+}
+
+int is_swap_enabled(void)
+{
+	char buf[PAGE_SIZE];
+	const char delim[] = "\n";
+	int cnt = 0;
+	char *line;
+
+	if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0)
+		return -1;
+
+	for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
+		cnt++;
+
+	return cnt > 1;
+}
+
+int set_oom_adj_score(int pid, int score)
+{
+	char path[PATH_MAX];
+	int fd, len;
+
+	sprintf(path, "/proc/%d/oom_score_adj", pid);
+
+	fd = open(path, O_WRONLY | O_APPEND);
+	if (fd < 0)
+		return fd;
+
+	len = dprintf(fd, "%d", score);
+	if (len < 0) {
+		close(fd);
+		return len;
+	}
+
+	close(fd);
+	return 0;
+}
+
+int proc_mount_contains(const char *option)
+{
+	char buf[4 * PAGE_SIZE];
+	ssize_t read;
+
+	read = read_text("/proc/mounts", buf, sizeof(buf));
+	if (read < 0)
+		return read;
+
+	return strstr(buf, option) != NULL;
+}
+
+ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size)
+{
+	char path[PATH_MAX];
+	ssize_t ret;
+
+	if (!pid)
+		snprintf(path, sizeof(path), "/proc/%s/%s",
+			 thread ? "thread-self" : "self", item);
+	else
+		snprintf(path, sizeof(path), "/proc/%d/%s", pid, item);
+
+	ret = read_text(path, buf, size);
+	return ret < 0 ? -1 : ret;
+}
+
+int proc_read_strstr(int pid, bool thread, const char *item, const char *needle)
+{
+	char buf[PAGE_SIZE];
+
+	if (proc_read_text(pid, thread, item, buf, sizeof(buf)) < 0)
+		return -1;
+
+	return strstr(buf, needle) ? 0 : -1;
+}
+
+int clone_into_cgroup_run_wait(const char *cgroup)
+{
+	int cgroup_fd;
+	pid_t pid;
+
+	cgroup_fd =  dirfd_open_opath(cgroup);
+	if (cgroup_fd < 0)
+		return -1;
+
+	pid = clone_into_cgroup(cgroup_fd);
+	close_prot_errno(cgroup_fd);
+	if (pid < 0)
+		return -1;
+
+	if (pid == 0)
+		exit(EXIT_SUCCESS);
+
+	/*
+	 * We don't care whether this fails. We only care whether the initial
+	 * clone succeeded.
+	 */
+	(void)clone_reap(pid, WEXITED);
+	return 0;
+}
+
+static int __prepare_for_wait(const char *cgroup, const char *filename)
+{
+	int fd, ret = -1;
+
+	fd = inotify_init1(0);
+	if (fd == -1)
+		return fd;
+
+	ret = inotify_add_watch(fd, cg_control(cgroup, filename), IN_MODIFY);
+	if (ret == -1) {
+		close(fd);
+		fd = -1;
+	}
+
+	return fd;
+}
+
+int cg_prepare_for_wait(const char *cgroup)
+{
+	return __prepare_for_wait(cgroup, "cgroup.events");
+}
+
+int memcg_prepare_for_wait(const char *cgroup)
+{
+	return __prepare_for_wait(cgroup, "memory.events");
+}
+
+int cg_wait_for(int fd)
+{
+	int ret = -1;
+	struct pollfd fds = {
+		.fd = fd,
+		.events = POLLIN,
+	};
+
+	while (true) {
+		ret = poll(&fds, 1, 10000);
+
+		if (ret == -1) {
+			if (errno == EINTR)
+				continue;
+
+			break;
+		}
+
+		if (ret > 0 && fds.revents & POLLIN) {
+			ret = 0;
+			break;
+		}
+	}
+
+	return ret;
+}
diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/cgroup_util.h
new file mode 100644
index 000000000..c92df4e5d
--- /dev/null
+++ b/tools/testing/selftests/cgroup/cgroup_util.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <stdbool.h>
+#include <stdlib.h>
+
+#include "../kselftest.h"
+
+#define PAGE_SIZE 4096
+
+#define MB(x) (x << 20)
+
+#define USEC_PER_SEC	1000000L
+#define NSEC_PER_SEC	1000000000L
+
+/*
+ * Checks if two given values differ by less than err% of their sum.
+ */
+static inline int values_close(long a, long b, int err)
+{
+	return abs(a - b) <= (a + b) / 100 * err;
+}
+
+extern int cg_find_unified_root(char *root, size_t len);
+extern char *cg_name(const char *root, const char *name);
+extern char *cg_name_indexed(const char *root, const char *name, int index);
+extern char *cg_control(const char *cgroup, const char *control);
+extern int cg_create(const char *cgroup);
+extern int cg_destroy(const char *cgroup);
+extern int cg_read(const char *cgroup, const char *control,
+		   char *buf, size_t len);
+extern int cg_read_strcmp(const char *cgroup, const char *control,
+			  const char *expected);
+extern int cg_read_strstr(const char *cgroup, const char *control,
+			  const char *needle);
+extern long cg_read_long(const char *cgroup, const char *control);
+long cg_read_key_long(const char *cgroup, const char *control, const char *key);
+extern long cg_read_lc(const char *cgroup, const char *control);
+extern int cg_write(const char *cgroup, const char *control, char *buf);
+int cg_write_numeric(const char *cgroup, const char *control, long value);
+extern int cg_run(const char *cgroup,
+		  int (*fn)(const char *cgroup, void *arg),
+		  void *arg);
+extern int cg_enter(const char *cgroup, int pid);
+extern int cg_enter_current(const char *cgroup);
+extern int cg_enter_current_thread(const char *cgroup);
+extern int cg_run_nowait(const char *cgroup,
+			 int (*fn)(const char *cgroup, void *arg),
+			 void *arg);
+extern int get_temp_fd(void);
+extern int alloc_pagecache(int fd, size_t size);
+extern int alloc_anon(const char *cgroup, void *arg);
+extern int is_swap_enabled(void);
+extern int set_oom_adj_score(int pid, int score);
+extern int cg_wait_for_proc_count(const char *cgroup, int count);
+extern int cg_killall(const char *cgroup);
+int proc_mount_contains(const char *option);
+extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size);
+extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle);
+extern pid_t clone_into_cgroup(int cgroup_fd);
+extern int clone_reap(pid_t pid, int options);
+extern int clone_into_cgroup_run_wait(const char *cgroup);
+extern int dirfd_open_opath(const char *dir);
+extern int cg_prepare_for_wait(const char *cgroup);
+extern int memcg_prepare_for_wait(const char *cgroup);
+extern int cg_wait_for(int fd);
diff --git a/tools/testing/selftests/cgroup/config b/tools/testing/selftests/cgroup/config
new file mode 100644
index 000000000..97d549ee8
--- /dev/null
+++ b/tools/testing/selftests/cgroup/config
@@ -0,0 +1,7 @@
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_MEMCG=y
+CONFIG_MEMCG_KMEM=y
+CONFIG_PAGE_COUNTER=y
diff --git a/tools/testing/selftests/cgroup/memcg_protection.m b/tools/testing/selftests/cgroup/memcg_protection.m
new file mode 100644
index 000000000..051daa347
--- /dev/null
+++ b/tools/testing/selftests/cgroup/memcg_protection.m
@@ -0,0 +1,89 @@
+% SPDX-License-Identifier: GPL-2.0
+%
+% run as: octave-cli memcg_protection.m
+%
+% This script simulates reclaim protection behavior on a single level of memcg
+% hierarchy to illustrate how overcommitted protection spreads among siblings
+% (as it depends also on their current consumption).
+%
+% Simulation assumes siblings consumed the initial amount of memory (w/out
+% reclaim) and then the reclaim starts, all memory is reclaimable, i.e. treated
+% same. It simulates only non-low reclaim and assumes all memory.min = 0.
+%
+% Input configurations
+% --------------------
+% E number	parent effective protection
+% n vector	nominal protection of siblings set at the given level (memory.low)
+% c vector	current consumption -,,- (memory.current)
+
+% example from testcase (values in GB)
+E = 50 / 1024;
+n = [75 25 0 500 ] / 1024;
+c = [50 50 50 0] / 1024;
+
+% Reclaim parameters
+% ------------------
+
+% Minimal reclaim amount (GB)
+cluster = 32*4 / 2**20;
+
+% Reclaim coefficient (think as 0.5^sc->priority)
+alpha = .1
+
+% Simulation parameters
+% ---------------------
+epsilon = 1e-7;
+timeout = 1000;
+
+% Simulation loop
+% ---------------
+
+ch = [];
+eh = [];
+rh = [];
+
+for t = 1:timeout
+        % low_usage
+        u = min(c, n);
+        siblings = sum(u);
+
+        % effective_protection()
+        protected = min(n, c);                % start with nominal
+        e = protected * min(1, E / siblings); % normalize overcommit
+
+        % recursive protection
+        unclaimed = max(0, E - siblings);
+        parent_overuse = sum(c) - siblings;
+        if (unclaimed > 0 && parent_overuse > 0)
+                overuse = max(0, c - protected);
+                e += unclaimed * (overuse / parent_overuse);
+        endif
+
+        % get_scan_count()
+        r = alpha * c;             % assume all memory is in a single LRU list
+
+        % commit 1bc63fb1272b ("mm, memcg: make scan aggression always exclude protection")
+        sz = max(e, c);
+        r .*= (1 - (e+epsilon) ./ (sz+epsilon));
+
+        % uncomment to debug prints
+        % e, c, r
+
+        % nothing to reclaim, reached equilibrium
+        if max(r) < epsilon
+                break;
+        endif
+
+        % SWAP_CLUSTER_MAX roundup
+        r = max(r, (r > epsilon) .* cluster);
+        % XXX here I do parallel reclaim of all siblings
+        % in reality reclaim is serialized and each sibling recalculates own residual
+        c = max(c - r, 0);
+
+        ch = [ch ; c];
+        eh = [eh ; e];
+        rh = [rh ; r];
+endfor
+
+t
+c, e
diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c
new file mode 100644
index 000000000..600123503
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_core.c
@@ -0,0 +1,888 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define _GNU_SOURCE
+#include <linux/limits.h>
+#include <linux/sched.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <errno.h>
+#include <signal.h>
+#include <string.h>
+#include <pthread.h>
+
+#include "../kselftest.h"
+#include "cgroup_util.h"
+
+static int touch_anon(char *buf, size_t size)
+{
+	int fd;
+	char *pos = buf;
+
+	fd = open("/dev/urandom", O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	while (size > 0) {
+		ssize_t ret = read(fd, pos, size);
+
+		if (ret < 0) {
+			if (errno != EINTR) {
+				close(fd);
+				return -1;
+			}
+		} else {
+			pos += ret;
+			size -= ret;
+		}
+	}
+	close(fd);
+
+	return 0;
+}
+
+static int alloc_and_touch_anon_noexit(const char *cgroup, void *arg)
+{
+	int ppid = getppid();
+	size_t size = (size_t)arg;
+	void *buf;
+
+	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
+		   0, 0);
+	if (buf == MAP_FAILED)
+		return -1;
+
+	if (touch_anon((char *)buf, size)) {
+		munmap(buf, size);
+		return -1;
+	}
+
+	while (getppid() == ppid)
+		sleep(1);
+
+	munmap(buf, size);
+	return 0;
+}
+
+/*
+ * Create a child process that allocates and touches 100MB, then waits to be
+ * killed. Wait until the child is attached to the cgroup, kill all processes
+ * in that cgroup and wait until "cgroup.procs" is empty. At this point try to
+ * destroy the empty cgroup. The test helps detect race conditions between
+ * dying processes leaving the cgroup and cgroup destruction path.
+ */
+static int test_cgcore_destroy(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *cg_test = NULL;
+	int child_pid;
+	char buf[PAGE_SIZE];
+
+	cg_test = cg_name(root, "cg_test");
+
+	if (!cg_test)
+		goto cleanup;
+
+	for (int i = 0; i < 10; i++) {
+		if (cg_create(cg_test))
+			goto cleanup;
+
+		child_pid = cg_run_nowait(cg_test, alloc_and_touch_anon_noexit,
+					  (void *) MB(100));
+
+		if (child_pid < 0)
+			goto cleanup;
+
+		/* wait for the child to enter cgroup */
+		if (cg_wait_for_proc_count(cg_test, 1))
+			goto cleanup;
+
+		if (cg_killall(cg_test))
+			goto cleanup;
+
+		/* wait for cgroup to be empty */
+		while (1) {
+			if (cg_read(cg_test, "cgroup.procs", buf, sizeof(buf)))
+				goto cleanup;
+			if (buf[0] == '\0')
+				break;
+			usleep(1000);
+		}
+
+		if (rmdir(cg_test))
+			goto cleanup;
+
+		if (waitpid(child_pid, NULL, 0) < 0)
+			goto cleanup;
+	}
+	ret = KSFT_PASS;
+cleanup:
+	if (cg_test)
+		cg_destroy(cg_test);
+	free(cg_test);
+	return ret;
+}
+
+/*
+ * A(0) - B(0) - C(1)
+ *        \ D(0)
+ *
+ * A, B and C's "populated" fields would be 1 while D's 0.
+ * test that after the one process in C is moved to root,
+ * A,B and C's "populated" fields would flip to "0" and file
+ * modified events will be generated on the
+ * "cgroup.events" files of both cgroups.
+ */
+static int test_cgcore_populated(const char *root)
+{
+	int ret = KSFT_FAIL;
+	int err;
+	char *cg_test_a = NULL, *cg_test_b = NULL;
+	char *cg_test_c = NULL, *cg_test_d = NULL;
+	int cgroup_fd = -EBADF;
+	pid_t pid;
+
+	cg_test_a = cg_name(root, "cg_test_a");
+	cg_test_b = cg_name(root, "cg_test_a/cg_test_b");
+	cg_test_c = cg_name(root, "cg_test_a/cg_test_b/cg_test_c");
+	cg_test_d = cg_name(root, "cg_test_a/cg_test_b/cg_test_d");
+
+	if (!cg_test_a || !cg_test_b || !cg_test_c || !cg_test_d)
+		goto cleanup;
+
+	if (cg_create(cg_test_a))
+		goto cleanup;
+
+	if (cg_create(cg_test_b))
+		goto cleanup;
+
+	if (cg_create(cg_test_c))
+		goto cleanup;
+
+	if (cg_create(cg_test_d))
+		goto cleanup;
+
+	if (cg_enter_current(cg_test_c))
+		goto cleanup;
+
+	if (cg_read_strcmp(cg_test_a, "cgroup.events", "populated 1\n"))
+		goto cleanup;
+
+	if (cg_read_strcmp(cg_test_b, "cgroup.events", "populated 1\n"))
+		goto cleanup;
+
+	if (cg_read_strcmp(cg_test_c, "cgroup.events", "populated 1\n"))
+		goto cleanup;
+
+	if (cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n"))
+		goto cleanup;
+
+	if (cg_enter_current(root))
+		goto cleanup;
+
+	if (cg_read_strcmp(cg_test_a, "cgroup.events", "populated 0\n"))
+		goto cleanup;
+
+	if (cg_read_strcmp(cg_test_b, "cgroup.events", "populated 0\n"))
+		goto cleanup;
+
+	if (cg_read_strcmp(cg_test_c, "cgroup.events", "populated 0\n"))
+		goto cleanup;
+
+	if (cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n"))
+		goto cleanup;
+
+	/* Test that we can directly clone into a new cgroup. */
+	cgroup_fd = dirfd_open_opath(cg_test_d);
+	if (cgroup_fd < 0)
+		goto cleanup;
+
+	pid = clone_into_cgroup(cgroup_fd);
+	if (pid < 0) {
+		if (errno == ENOSYS)
+			goto cleanup_pass;
+		goto cleanup;
+	}
+
+	if (pid == 0) {
+		if (raise(SIGSTOP))
+			exit(EXIT_FAILURE);
+		exit(EXIT_SUCCESS);
+	}
+
+	err = cg_read_strcmp(cg_test_d, "cgroup.events", "populated 1\n");
+
+	(void)clone_reap(pid, WSTOPPED);
+	(void)kill(pid, SIGCONT);
+	(void)clone_reap(pid, WEXITED);
+
+	if (err)
+		goto cleanup;
+
+	if (cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n"))
+		goto cleanup;
+
+	/* Remove cgroup. */
+	if (cg_test_d) {
+		cg_destroy(cg_test_d);
+		free(cg_test_d);
+		cg_test_d = NULL;
+	}
+
+	pid = clone_into_cgroup(cgroup_fd);
+	if (pid < 0)
+		goto cleanup_pass;
+	if (pid == 0)
+		exit(EXIT_SUCCESS);
+	(void)clone_reap(pid, WEXITED);
+	goto cleanup;
+
+cleanup_pass:
+	ret = KSFT_PASS;
+
+cleanup:
+	if (cg_test_d)
+		cg_destroy(cg_test_d);
+	if (cg_test_c)
+		cg_destroy(cg_test_c);
+	if (cg_test_b)
+		cg_destroy(cg_test_b);
+	if (cg_test_a)
+		cg_destroy(cg_test_a);
+	free(cg_test_d);
+	free(cg_test_c);
+	free(cg_test_b);
+	free(cg_test_a);
+	if (cgroup_fd >= 0)
+		close(cgroup_fd);
+	return ret;
+}
+
+/*
+ * A (domain threaded) - B (threaded) - C (domain)
+ *
+ * test that C can't be used until it is turned into a
+ * threaded cgroup.  "cgroup.type" file will report "domain (invalid)" in
+ * these cases. Operations which fail due to invalid topology use
+ * EOPNOTSUPP as the errno.
+ */
+static int test_cgcore_invalid_domain(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *grandparent = NULL, *parent = NULL, *child = NULL;
+
+	grandparent = cg_name(root, "cg_test_grandparent");
+	parent = cg_name(root, "cg_test_grandparent/cg_test_parent");
+	child = cg_name(root, "cg_test_grandparent/cg_test_parent/cg_test_child");
+	if (!parent || !child || !grandparent)
+		goto cleanup;
+
+	if (cg_create(grandparent))
+		goto cleanup;
+
+	if (cg_create(parent))
+		goto cleanup;
+
+	if (cg_create(child))
+		goto cleanup;
+
+	if (cg_write(parent, "cgroup.type", "threaded"))
+		goto cleanup;
+
+	if (cg_read_strcmp(child, "cgroup.type", "domain invalid\n"))
+		goto cleanup;
+
+	if (!cg_enter_current(child))
+		goto cleanup;
+
+	if (errno != EOPNOTSUPP)
+		goto cleanup;
+
+	if (!clone_into_cgroup_run_wait(child))
+		goto cleanup;
+
+	if (errno == ENOSYS)
+		goto cleanup_pass;
+
+	if (errno != EOPNOTSUPP)
+		goto cleanup;
+
+cleanup_pass:
+	ret = KSFT_PASS;
+
+cleanup:
+	cg_enter_current(root);
+	if (child)
+		cg_destroy(child);
+	if (parent)
+		cg_destroy(parent);
+	if (grandparent)
+		cg_destroy(grandparent);
+	free(child);
+	free(parent);
+	free(grandparent);
+	return ret;
+}
+
+/*
+ * Test that when a child becomes threaded
+ * the parent type becomes domain threaded.
+ */
+static int test_cgcore_parent_becomes_threaded(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *parent = NULL, *child = NULL;
+
+	parent = cg_name(root, "cg_test_parent");
+	child = cg_name(root, "cg_test_parent/cg_test_child");
+	if (!parent || !child)
+		goto cleanup;
+
+	if (cg_create(parent))
+		goto cleanup;
+
+	if (cg_create(child))
+		goto cleanup;
+
+	if (cg_write(child, "cgroup.type", "threaded"))
+		goto cleanup;
+
+	if (cg_read_strcmp(parent, "cgroup.type", "domain threaded\n"))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (child)
+		cg_destroy(child);
+	if (parent)
+		cg_destroy(parent);
+	free(child);
+	free(parent);
+	return ret;
+
+}
+
+/*
+ * Test that there's no internal process constrain on threaded cgroups.
+ * You can add threads/processes on a parent with a controller enabled.
+ */
+static int test_cgcore_no_internal_process_constraint_on_threads(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *parent = NULL, *child = NULL;
+
+	if (cg_read_strstr(root, "cgroup.controllers", "cpu") ||
+	    cg_write(root, "cgroup.subtree_control", "+cpu")) {
+		ret = KSFT_SKIP;
+		goto cleanup;
+	}
+
+	parent = cg_name(root, "cg_test_parent");
+	child = cg_name(root, "cg_test_parent/cg_test_child");
+	if (!parent || !child)
+		goto cleanup;
+
+	if (cg_create(parent))
+		goto cleanup;
+
+	if (cg_create(child))
+		goto cleanup;
+
+	if (cg_write(parent, "cgroup.type", "threaded"))
+		goto cleanup;
+
+	if (cg_write(child, "cgroup.type", "threaded"))
+		goto cleanup;
+
+	if (cg_write(parent, "cgroup.subtree_control", "+cpu"))
+		goto cleanup;
+
+	if (cg_enter_current(parent))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	cg_enter_current(root);
+	cg_enter_current(root);
+	if (child)
+		cg_destroy(child);
+	if (parent)
+		cg_destroy(parent);
+	free(child);
+	free(parent);
+	return ret;
+}
+
+/*
+ * Test that you can't enable a controller on a child if it's not enabled
+ * on the parent.
+ */
+static int test_cgcore_top_down_constraint_enable(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *parent = NULL, *child = NULL;
+
+	parent = cg_name(root, "cg_test_parent");
+	child = cg_name(root, "cg_test_parent/cg_test_child");
+	if (!parent || !child)
+		goto cleanup;
+
+	if (cg_create(parent))
+		goto cleanup;
+
+	if (cg_create(child))
+		goto cleanup;
+
+	if (!cg_write(child, "cgroup.subtree_control", "+memory"))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (child)
+		cg_destroy(child);
+	if (parent)
+		cg_destroy(parent);
+	free(child);
+	free(parent);
+	return ret;
+}
+
+/*
+ * Test that you can't disable a controller on a parent
+ * if it's enabled in a child.
+ */
+static int test_cgcore_top_down_constraint_disable(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *parent = NULL, *child = NULL;
+
+	parent = cg_name(root, "cg_test_parent");
+	child = cg_name(root, "cg_test_parent/cg_test_child");
+	if (!parent || !child)
+		goto cleanup;
+
+	if (cg_create(parent))
+		goto cleanup;
+
+	if (cg_create(child))
+		goto cleanup;
+
+	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
+		goto cleanup;
+
+	if (cg_write(child, "cgroup.subtree_control", "+memory"))
+		goto cleanup;
+
+	if (!cg_write(parent, "cgroup.subtree_control", "-memory"))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (child)
+		cg_destroy(child);
+	if (parent)
+		cg_destroy(parent);
+	free(child);
+	free(parent);
+	return ret;
+}
+
+/*
+ * Test internal process constraint.
+ * You can't add a pid to a domain parent if a controller is enabled.
+ */
+static int test_cgcore_internal_process_constraint(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *parent = NULL, *child = NULL;
+
+	parent = cg_name(root, "cg_test_parent");
+	child = cg_name(root, "cg_test_parent/cg_test_child");
+	if (!parent || !child)
+		goto cleanup;
+
+	if (cg_create(parent))
+		goto cleanup;
+
+	if (cg_create(child))
+		goto cleanup;
+
+	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
+		goto cleanup;
+
+	if (!cg_enter_current(parent))
+		goto cleanup;
+
+	if (!clone_into_cgroup_run_wait(parent))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (child)
+		cg_destroy(child);
+	if (parent)
+		cg_destroy(parent);
+	free(child);
+	free(parent);
+	return ret;
+}
+
+static void *dummy_thread_fn(void *arg)
+{
+	return (void *)(size_t)pause();
+}
+
+/*
+ * Test threadgroup migration.
+ * All threads of a process are migrated together.
+ */
+static int test_cgcore_proc_migration(const char *root)
+{
+	int ret = KSFT_FAIL;
+	int t, c_threads = 0, n_threads = 13;
+	char *src = NULL, *dst = NULL;
+	pthread_t threads[n_threads];
+
+	src = cg_name(root, "cg_src");
+	dst = cg_name(root, "cg_dst");
+	if (!src || !dst)
+		goto cleanup;
+
+	if (cg_create(src))
+		goto cleanup;
+	if (cg_create(dst))
+		goto cleanup;
+
+	if (cg_enter_current(src))
+		goto cleanup;
+
+	for (c_threads = 0; c_threads < n_threads; ++c_threads) {
+		if (pthread_create(&threads[c_threads], NULL, dummy_thread_fn, NULL))
+			goto cleanup;
+	}
+
+	cg_enter_current(dst);
+	if (cg_read_lc(dst, "cgroup.threads") != n_threads + 1)
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	for (t = 0; t < c_threads; ++t) {
+		pthread_cancel(threads[t]);
+	}
+
+	for (t = 0; t < c_threads; ++t) {
+		pthread_join(threads[t], NULL);
+	}
+
+	cg_enter_current(root);
+
+	if (dst)
+		cg_destroy(dst);
+	if (src)
+		cg_destroy(src);
+	free(dst);
+	free(src);
+	return ret;
+}
+
+static void *migrating_thread_fn(void *arg)
+{
+	int g, i, n_iterations = 1000;
+	char **grps = arg;
+	char lines[3][PATH_MAX];
+
+	for (g = 1; g < 3; ++g)
+		snprintf(lines[g], sizeof(lines[g]), "0::%s", grps[g] + strlen(grps[0]));
+
+	for (i = 0; i < n_iterations; ++i) {
+		cg_enter_current_thread(grps[(i % 2) + 1]);
+
+		if (proc_read_strstr(0, 1, "cgroup", lines[(i % 2) + 1]))
+			return (void *)-1;
+	}
+	return NULL;
+}
+
+/*
+ * Test single thread migration.
+ * Threaded cgroups allow successful migration of a thread.
+ */
+static int test_cgcore_thread_migration(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *dom = NULL;
+	char line[PATH_MAX];
+	char *grps[3] = { (char *)root, NULL, NULL };
+	pthread_t thr;
+	void *retval;
+
+	dom = cg_name(root, "cg_dom");
+	grps[1] = cg_name(root, "cg_dom/cg_src");
+	grps[2] = cg_name(root, "cg_dom/cg_dst");
+	if (!grps[1] || !grps[2] || !dom)
+		goto cleanup;
+
+	if (cg_create(dom))
+		goto cleanup;
+	if (cg_create(grps[1]))
+		goto cleanup;
+	if (cg_create(grps[2]))
+		goto cleanup;
+
+	if (cg_write(grps[1], "cgroup.type", "threaded"))
+		goto cleanup;
+	if (cg_write(grps[2], "cgroup.type", "threaded"))
+		goto cleanup;
+
+	if (cg_enter_current(grps[1]))
+		goto cleanup;
+
+	if (pthread_create(&thr, NULL, migrating_thread_fn, grps))
+		goto cleanup;
+
+	if (pthread_join(thr, &retval))
+		goto cleanup;
+
+	if (retval)
+		goto cleanup;
+
+	snprintf(line, sizeof(line), "0::%s", grps[1] + strlen(grps[0]));
+	if (proc_read_strstr(0, 1, "cgroup", line))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	cg_enter_current(root);
+	if (grps[2])
+		cg_destroy(grps[2]);
+	if (grps[1])
+		cg_destroy(grps[1]);
+	if (dom)
+		cg_destroy(dom);
+	free(grps[2]);
+	free(grps[1]);
+	free(dom);
+	return ret;
+}
+
+/*
+ * cgroup migration permission check should be performed based on the
+ * credentials at the time of open instead of write.
+ */
+static int test_cgcore_lesser_euid_open(const char *root)
+{
+	const uid_t test_euid = 65534;	/* usually nobody, any !root is fine */
+	int ret = KSFT_FAIL;
+	char *cg_test_a = NULL, *cg_test_b = NULL;
+	char *cg_test_a_procs = NULL, *cg_test_b_procs = NULL;
+	int cg_test_b_procs_fd = -1;
+	uid_t saved_uid;
+
+	cg_test_a = cg_name(root, "cg_test_a");
+	cg_test_b = cg_name(root, "cg_test_b");
+
+	if (!cg_test_a || !cg_test_b)
+		goto cleanup;
+
+	cg_test_a_procs = cg_name(cg_test_a, "cgroup.procs");
+	cg_test_b_procs = cg_name(cg_test_b, "cgroup.procs");
+
+	if (!cg_test_a_procs || !cg_test_b_procs)
+		goto cleanup;
+
+	if (cg_create(cg_test_a) || cg_create(cg_test_b))
+		goto cleanup;
+
+	if (cg_enter_current(cg_test_a))
+		goto cleanup;
+
+	if (chown(cg_test_a_procs, test_euid, -1) ||
+	    chown(cg_test_b_procs, test_euid, -1))
+		goto cleanup;
+
+	saved_uid = geteuid();
+	if (seteuid(test_euid))
+		goto cleanup;
+
+	cg_test_b_procs_fd = open(cg_test_b_procs, O_RDWR);
+
+	if (seteuid(saved_uid))
+		goto cleanup;
+
+	if (cg_test_b_procs_fd < 0)
+		goto cleanup;
+
+	if (write(cg_test_b_procs_fd, "0", 1) >= 0 || errno != EACCES)
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	cg_enter_current(root);
+	if (cg_test_b_procs_fd >= 0)
+		close(cg_test_b_procs_fd);
+	if (cg_test_b)
+		cg_destroy(cg_test_b);
+	if (cg_test_a)
+		cg_destroy(cg_test_a);
+	free(cg_test_b_procs);
+	free(cg_test_a_procs);
+	free(cg_test_b);
+	free(cg_test_a);
+	return ret;
+}
+
+struct lesser_ns_open_thread_arg {
+	const char	*path;
+	int		fd;
+	int		err;
+};
+
+static int lesser_ns_open_thread_fn(void *arg)
+{
+	struct lesser_ns_open_thread_arg *targ = arg;
+
+	targ->fd = open(targ->path, O_RDWR);
+	targ->err = errno;
+	return 0;
+}
+
+/*
+ * cgroup migration permission check should be performed based on the cgroup
+ * namespace at the time of open instead of write.
+ */
+static int test_cgcore_lesser_ns_open(const char *root)
+{
+	static char stack[65536];
+	const uid_t test_euid = 65534;	/* usually nobody, any !root is fine */
+	int ret = KSFT_FAIL;
+	char *cg_test_a = NULL, *cg_test_b = NULL;
+	char *cg_test_a_procs = NULL, *cg_test_b_procs = NULL;
+	int cg_test_b_procs_fd = -1;
+	struct lesser_ns_open_thread_arg targ = { .fd = -1 };
+	pid_t pid;
+	int status;
+
+	cg_test_a = cg_name(root, "cg_test_a");
+	cg_test_b = cg_name(root, "cg_test_b");
+
+	if (!cg_test_a || !cg_test_b)
+		goto cleanup;
+
+	cg_test_a_procs = cg_name(cg_test_a, "cgroup.procs");
+	cg_test_b_procs = cg_name(cg_test_b, "cgroup.procs");
+
+	if (!cg_test_a_procs || !cg_test_b_procs)
+		goto cleanup;
+
+	if (cg_create(cg_test_a) || cg_create(cg_test_b))
+		goto cleanup;
+
+	if (cg_enter_current(cg_test_b))
+		goto cleanup;
+
+	if (chown(cg_test_a_procs, test_euid, -1) ||
+	    chown(cg_test_b_procs, test_euid, -1))
+		goto cleanup;
+
+	targ.path = cg_test_b_procs;
+	pid = clone(lesser_ns_open_thread_fn, stack + sizeof(stack),
+		    CLONE_NEWCGROUP | CLONE_FILES | CLONE_VM | SIGCHLD,
+		    &targ);
+	if (pid < 0)
+		goto cleanup;
+
+	if (waitpid(pid, &status, 0) < 0)
+		goto cleanup;
+
+	if (!WIFEXITED(status))
+		goto cleanup;
+
+	cg_test_b_procs_fd = targ.fd;
+	if (cg_test_b_procs_fd < 0)
+		goto cleanup;
+
+	if (cg_enter_current(cg_test_a))
+		goto cleanup;
+
+	if ((status = write(cg_test_b_procs_fd, "0", 1)) >= 0 || errno != ENOENT)
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	cg_enter_current(root);
+	if (cg_test_b_procs_fd >= 0)
+		close(cg_test_b_procs_fd);
+	if (cg_test_b)
+		cg_destroy(cg_test_b);
+	if (cg_test_a)
+		cg_destroy(cg_test_a);
+	free(cg_test_b_procs);
+	free(cg_test_a_procs);
+	free(cg_test_b);
+	free(cg_test_a);
+	return ret;
+}
+
+#define T(x) { x, #x }
+struct corecg_test {
+	int (*fn)(const char *root);
+	const char *name;
+} tests[] = {
+	T(test_cgcore_internal_process_constraint),
+	T(test_cgcore_top_down_constraint_enable),
+	T(test_cgcore_top_down_constraint_disable),
+	T(test_cgcore_no_internal_process_constraint_on_threads),
+	T(test_cgcore_parent_becomes_threaded),
+	T(test_cgcore_invalid_domain),
+	T(test_cgcore_populated),
+	T(test_cgcore_proc_migration),
+	T(test_cgcore_thread_migration),
+	T(test_cgcore_destroy),
+	T(test_cgcore_lesser_euid_open),
+	T(test_cgcore_lesser_ns_open),
+};
+#undef T
+
+int main(int argc, char *argv[])
+{
+	char root[PATH_MAX];
+	int i, ret = EXIT_SUCCESS;
+
+	if (cg_find_unified_root(root, sizeof(root)))
+		ksft_exit_skip("cgroup v2 isn't mounted\n");
+
+	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
+		if (cg_write(root, "cgroup.subtree_control", "+memory"))
+			ksft_exit_skip("Failed to set memory controller\n");
+
+	for (i = 0; i < ARRAY_SIZE(tests); i++) {
+		switch (tests[i].fn(root)) {
+		case KSFT_PASS:
+			ksft_test_result_pass("%s\n", tests[i].name);
+			break;
+		case KSFT_SKIP:
+			ksft_test_result_skip("%s\n", tests[i].name);
+			break;
+		default:
+			ret = EXIT_FAILURE;
+			ksft_test_result_fail("%s\n", tests[i].name);
+			break;
+		}
+	}
+
+	return ret;
+}
diff --git a/tools/testing/selftests/cgroup/test_cpu.c b/tools/testing/selftests/cgroup/test_cpu.c
new file mode 100644
index 000000000..24020a2c6
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_cpu.c
@@ -0,0 +1,726 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <linux/limits.h>
+#include <sys/sysinfo.h>
+#include <sys/wait.h>
+#include <errno.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <time.h>
+
+#include "../kselftest.h"
+#include "cgroup_util.h"
+
+enum hog_clock_type {
+	// Count elapsed time using the CLOCK_PROCESS_CPUTIME_ID clock.
+	CPU_HOG_CLOCK_PROCESS,
+	// Count elapsed time using system wallclock time.
+	CPU_HOG_CLOCK_WALL,
+};
+
+struct cpu_hogger {
+	char *cgroup;
+	pid_t pid;
+	long usage;
+};
+
+struct cpu_hog_func_param {
+	int nprocs;
+	struct timespec ts;
+	enum hog_clock_type clock_type;
+};
+
+/*
+ * This test creates two nested cgroups with and without enabling
+ * the cpu controller.
+ */
+static int test_cpucg_subtree_control(const char *root)
+{
+	char *parent = NULL, *child = NULL, *parent2 = NULL, *child2 = NULL;
+	int ret = KSFT_FAIL;
+
+	// Create two nested cgroups with the cpu controller enabled.
+	parent = cg_name(root, "cpucg_test_0");
+	if (!parent)
+		goto cleanup;
+
+	if (cg_create(parent))
+		goto cleanup;
+
+	if (cg_write(parent, "cgroup.subtree_control", "+cpu"))
+		goto cleanup;
+
+	child = cg_name(parent, "cpucg_test_child");
+	if (!child)
+		goto cleanup;
+
+	if (cg_create(child))
+		goto cleanup;
+
+	if (cg_read_strstr(child, "cgroup.controllers", "cpu"))
+		goto cleanup;
+
+	// Create two nested cgroups without enabling the cpu controller.
+	parent2 = cg_name(root, "cpucg_test_1");
+	if (!parent2)
+		goto cleanup;
+
+	if (cg_create(parent2))
+		goto cleanup;
+
+	child2 = cg_name(parent2, "cpucg_test_child");
+	if (!child2)
+		goto cleanup;
+
+	if (cg_create(child2))
+		goto cleanup;
+
+	if (!cg_read_strstr(child2, "cgroup.controllers", "cpu"))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	cg_destroy(child);
+	free(child);
+	cg_destroy(child2);
+	free(child2);
+	cg_destroy(parent);
+	free(parent);
+	cg_destroy(parent2);
+	free(parent2);
+
+	return ret;
+}
+
+static void *hog_cpu_thread_func(void *arg)
+{
+	while (1)
+		;
+
+	return NULL;
+}
+
+static struct timespec
+timespec_sub(const struct timespec *lhs, const struct timespec *rhs)
+{
+	struct timespec zero = {
+		.tv_sec = 0,
+		.tv_nsec = 0,
+	};
+	struct timespec ret;
+
+	if (lhs->tv_sec < rhs->tv_sec)
+		return zero;
+
+	ret.tv_sec = lhs->tv_sec - rhs->tv_sec;
+
+	if (lhs->tv_nsec < rhs->tv_nsec) {
+		if (ret.tv_sec == 0)
+			return zero;
+
+		ret.tv_sec--;
+		ret.tv_nsec = NSEC_PER_SEC - rhs->tv_nsec + lhs->tv_nsec;
+	} else
+		ret.tv_nsec = lhs->tv_nsec - rhs->tv_nsec;
+
+	return ret;
+}
+
+static int hog_cpus_timed(const char *cgroup, void *arg)
+{
+	const struct cpu_hog_func_param *param =
+		(struct cpu_hog_func_param *)arg;
+	struct timespec ts_run = param->ts;
+	struct timespec ts_remaining = ts_run;
+	struct timespec ts_start;
+	int i, ret;
+
+	ret = clock_gettime(CLOCK_MONOTONIC, &ts_start);
+	if (ret != 0)
+		return ret;
+
+	for (i = 0; i < param->nprocs; i++) {
+		pthread_t tid;
+
+		ret = pthread_create(&tid, NULL, &hog_cpu_thread_func, NULL);
+		if (ret != 0)
+			return ret;
+	}
+
+	while (ts_remaining.tv_sec > 0 || ts_remaining.tv_nsec > 0) {
+		struct timespec ts_total;
+
+		ret = nanosleep(&ts_remaining, NULL);
+		if (ret && errno != EINTR)
+			return ret;
+
+		if (param->clock_type == CPU_HOG_CLOCK_PROCESS) {
+			ret = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts_total);
+			if (ret != 0)
+				return ret;
+		} else {
+			struct timespec ts_current;
+
+			ret = clock_gettime(CLOCK_MONOTONIC, &ts_current);
+			if (ret != 0)
+				return ret;
+
+			ts_total = timespec_sub(&ts_current, &ts_start);
+		}
+
+		ts_remaining = timespec_sub(&ts_run, &ts_total);
+	}
+
+	return 0;
+}
+
+/*
+ * Creates a cpu cgroup, burns a CPU for a few quanta, and verifies that
+ * cpu.stat shows the expected output.
+ */
+static int test_cpucg_stats(const char *root)
+{
+	int ret = KSFT_FAIL;
+	long usage_usec, user_usec, system_usec;
+	long usage_seconds = 2;
+	long expected_usage_usec = usage_seconds * USEC_PER_SEC;
+	char *cpucg;
+
+	cpucg = cg_name(root, "cpucg_test");
+	if (!cpucg)
+		goto cleanup;
+
+	if (cg_create(cpucg))
+		goto cleanup;
+
+	usage_usec = cg_read_key_long(cpucg, "cpu.stat", "usage_usec");
+	user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec");
+	system_usec = cg_read_key_long(cpucg, "cpu.stat", "system_usec");
+	if (usage_usec != 0 || user_usec != 0 || system_usec != 0)
+		goto cleanup;
+
+	struct cpu_hog_func_param param = {
+		.nprocs = 1,
+		.ts = {
+			.tv_sec = usage_seconds,
+			.tv_nsec = 0,
+		},
+		.clock_type = CPU_HOG_CLOCK_PROCESS,
+	};
+	if (cg_run(cpucg, hog_cpus_timed, (void *)&param))
+		goto cleanup;
+
+	usage_usec = cg_read_key_long(cpucg, "cpu.stat", "usage_usec");
+	user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec");
+	if (user_usec <= 0)
+		goto cleanup;
+
+	if (!values_close(usage_usec, expected_usage_usec, 1))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	cg_destroy(cpucg);
+	free(cpucg);
+
+	return ret;
+}
+
+static int
+run_cpucg_weight_test(
+		const char *root,
+		pid_t (*spawn_child)(const struct cpu_hogger *child),
+		int (*validate)(const struct cpu_hogger *children, int num_children))
+{
+	int ret = KSFT_FAIL, i;
+	char *parent = NULL;
+	struct cpu_hogger children[3] = {NULL};
+
+	parent = cg_name(root, "cpucg_test_0");
+	if (!parent)
+		goto cleanup;
+
+	if (cg_create(parent))
+		goto cleanup;
+
+	if (cg_write(parent, "cgroup.subtree_control", "+cpu"))
+		goto cleanup;
+
+	for (i = 0; i < ARRAY_SIZE(children); i++) {
+		children[i].cgroup = cg_name_indexed(parent, "cpucg_child", i);
+		if (!children[i].cgroup)
+			goto cleanup;
+
+		if (cg_create(children[i].cgroup))
+			goto cleanup;
+
+		if (cg_write_numeric(children[i].cgroup, "cpu.weight",
+					50 * (i + 1)))
+			goto cleanup;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(children); i++) {
+		pid_t pid = spawn_child(&children[i]);
+		if (pid <= 0)
+			goto cleanup;
+		children[i].pid = pid;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(children); i++) {
+		int retcode;
+
+		waitpid(children[i].pid, &retcode, 0);
+		if (!WIFEXITED(retcode))
+			goto cleanup;
+		if (WEXITSTATUS(retcode))
+			goto cleanup;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(children); i++)
+		children[i].usage = cg_read_key_long(children[i].cgroup,
+				"cpu.stat", "usage_usec");
+
+	if (validate(children, ARRAY_SIZE(children)))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+cleanup:
+	for (i = 0; i < ARRAY_SIZE(children); i++) {
+		cg_destroy(children[i].cgroup);
+		free(children[i].cgroup);
+	}
+	cg_destroy(parent);
+	free(parent);
+
+	return ret;
+}
+
+static pid_t weight_hog_ncpus(const struct cpu_hogger *child, int ncpus)
+{
+	long usage_seconds = 10;
+	struct cpu_hog_func_param param = {
+		.nprocs = ncpus,
+		.ts = {
+			.tv_sec = usage_seconds,
+			.tv_nsec = 0,
+		},
+		.clock_type = CPU_HOG_CLOCK_WALL,
+	};
+	return cg_run_nowait(child->cgroup, hog_cpus_timed, (void *)&param);
+}
+
+static pid_t weight_hog_all_cpus(const struct cpu_hogger *child)
+{
+	return weight_hog_ncpus(child, get_nprocs());
+}
+
+static int
+overprovision_validate(const struct cpu_hogger *children, int num_children)
+{
+	int ret = KSFT_FAIL, i;
+
+	for (i = 0; i < num_children - 1; i++) {
+		long delta;
+
+		if (children[i + 1].usage <= children[i].usage)
+			goto cleanup;
+
+		delta = children[i + 1].usage - children[i].usage;
+		if (!values_close(delta, children[0].usage, 35))
+			goto cleanup;
+	}
+
+	ret = KSFT_PASS;
+cleanup:
+	return ret;
+}
+
+/*
+ * First, this test creates the following hierarchy:
+ * A
+ * A/B     cpu.weight = 50
+ * A/C     cpu.weight = 100
+ * A/D     cpu.weight = 150
+ *
+ * A separate process is then created for each child cgroup which spawns as
+ * many threads as there are cores, and hogs each CPU as much as possible
+ * for some time interval.
+ *
+ * Once all of the children have exited, we verify that each child cgroup
+ * was given proportional runtime as informed by their cpu.weight.
+ */
+static int test_cpucg_weight_overprovisioned(const char *root)
+{
+	return run_cpucg_weight_test(root, weight_hog_all_cpus,
+			overprovision_validate);
+}
+
+static pid_t weight_hog_one_cpu(const struct cpu_hogger *child)
+{
+	return weight_hog_ncpus(child, 1);
+}
+
+static int
+underprovision_validate(const struct cpu_hogger *children, int num_children)
+{
+	int ret = KSFT_FAIL, i;
+
+	for (i = 0; i < num_children - 1; i++) {
+		if (!values_close(children[i + 1].usage, children[0].usage, 15))
+			goto cleanup;
+	}
+
+	ret = KSFT_PASS;
+cleanup:
+	return ret;
+}
+
+/*
+ * First, this test creates the following hierarchy:
+ * A
+ * A/B     cpu.weight = 50
+ * A/C     cpu.weight = 100
+ * A/D     cpu.weight = 150
+ *
+ * A separate process is then created for each child cgroup which spawns a
+ * single thread that hogs a CPU. The testcase is only run on systems that
+ * have at least one core per-thread in the child processes.
+ *
+ * Once all of the children have exited, we verify that each child cgroup
+ * had roughly the same runtime despite having different cpu.weight.
+ */
+static int test_cpucg_weight_underprovisioned(const char *root)
+{
+	// Only run the test if there are enough cores to avoid overprovisioning
+	// the system.
+	if (get_nprocs() < 4)
+		return KSFT_SKIP;
+
+	return run_cpucg_weight_test(root, weight_hog_one_cpu,
+			underprovision_validate);
+}
+
+static int
+run_cpucg_nested_weight_test(const char *root, bool overprovisioned)
+{
+	int ret = KSFT_FAIL, i;
+	char *parent = NULL, *child = NULL;
+	struct cpu_hogger leaf[3] = {NULL};
+	long nested_leaf_usage, child_usage;
+	int nprocs = get_nprocs();
+
+	if (!overprovisioned) {
+		if (nprocs < 4)
+			/*
+			 * Only run the test if there are enough cores to avoid overprovisioning
+			 * the system.
+			 */
+			return KSFT_SKIP;
+		nprocs /= 4;
+	}
+
+	parent = cg_name(root, "cpucg_test");
+	child = cg_name(parent, "cpucg_child");
+	if (!parent || !child)
+		goto cleanup;
+
+	if (cg_create(parent))
+		goto cleanup;
+	if (cg_write(parent, "cgroup.subtree_control", "+cpu"))
+		goto cleanup;
+
+	if (cg_create(child))
+		goto cleanup;
+	if (cg_write(child, "cgroup.subtree_control", "+cpu"))
+		goto cleanup;
+	if (cg_write(child, "cpu.weight", "1000"))
+		goto cleanup;
+
+	for (i = 0; i < ARRAY_SIZE(leaf); i++) {
+		const char *ancestor;
+		long weight;
+
+		if (i == 0) {
+			ancestor = parent;
+			weight = 1000;
+		} else {
+			ancestor = child;
+			weight = 5000;
+		}
+		leaf[i].cgroup = cg_name_indexed(ancestor, "cpucg_leaf", i);
+		if (!leaf[i].cgroup)
+			goto cleanup;
+
+		if (cg_create(leaf[i].cgroup))
+			goto cleanup;
+
+		if (cg_write_numeric(leaf[i].cgroup, "cpu.weight", weight))
+			goto cleanup;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(leaf); i++) {
+		pid_t pid;
+		struct cpu_hog_func_param param = {
+			.nprocs = nprocs,
+			.ts = {
+				.tv_sec = 10,
+				.tv_nsec = 0,
+			},
+			.clock_type = CPU_HOG_CLOCK_WALL,
+		};
+
+		pid = cg_run_nowait(leaf[i].cgroup, hog_cpus_timed,
+				(void *)&param);
+		if (pid <= 0)
+			goto cleanup;
+		leaf[i].pid = pid;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(leaf); i++) {
+		int retcode;
+
+		waitpid(leaf[i].pid, &retcode, 0);
+		if (!WIFEXITED(retcode))
+			goto cleanup;
+		if (WEXITSTATUS(retcode))
+			goto cleanup;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(leaf); i++) {
+		leaf[i].usage = cg_read_key_long(leaf[i].cgroup,
+				"cpu.stat", "usage_usec");
+		if (leaf[i].usage <= 0)
+			goto cleanup;
+	}
+
+	nested_leaf_usage = leaf[1].usage + leaf[2].usage;
+	if (overprovisioned) {
+		if (!values_close(leaf[0].usage, nested_leaf_usage, 15))
+			goto cleanup;
+	} else if (!values_close(leaf[0].usage * 2, nested_leaf_usage, 15))
+		goto cleanup;
+
+
+	child_usage = cg_read_key_long(child, "cpu.stat", "usage_usec");
+	if (child_usage <= 0)
+		goto cleanup;
+	if (!values_close(child_usage, nested_leaf_usage, 1))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+cleanup:
+	for (i = 0; i < ARRAY_SIZE(leaf); i++) {
+		cg_destroy(leaf[i].cgroup);
+		free(leaf[i].cgroup);
+	}
+	cg_destroy(child);
+	free(child);
+	cg_destroy(parent);
+	free(parent);
+
+	return ret;
+}
+
+/*
+ * First, this test creates the following hierarchy:
+ * A
+ * A/B     cpu.weight = 1000
+ * A/C     cpu.weight = 1000
+ * A/C/D   cpu.weight = 5000
+ * A/C/E   cpu.weight = 5000
+ *
+ * A separate process is then created for each leaf, which spawn nproc threads
+ * that burn a CPU for a few seconds.
+ *
+ * Once all of those processes have exited, we verify that each of the leaf
+ * cgroups have roughly the same usage from cpu.stat.
+ */
+static int
+test_cpucg_nested_weight_overprovisioned(const char *root)
+{
+	return run_cpucg_nested_weight_test(root, true);
+}
+
+/*
+ * First, this test creates the following hierarchy:
+ * A
+ * A/B     cpu.weight = 1000
+ * A/C     cpu.weight = 1000
+ * A/C/D   cpu.weight = 5000
+ * A/C/E   cpu.weight = 5000
+ *
+ * A separate process is then created for each leaf, which nproc / 4 threads
+ * that burns a CPU for a few seconds.
+ *
+ * Once all of those processes have exited, we verify that each of the leaf
+ * cgroups have roughly the same usage from cpu.stat.
+ */
+static int
+test_cpucg_nested_weight_underprovisioned(const char *root)
+{
+	return run_cpucg_nested_weight_test(root, false);
+}
+
+/*
+ * This test creates a cgroup with some maximum value within a period, and
+ * verifies that a process in the cgroup is not overscheduled.
+ */
+static int test_cpucg_max(const char *root)
+{
+	int ret = KSFT_FAIL;
+	long usage_usec, user_usec;
+	long usage_seconds = 1;
+	long expected_usage_usec = usage_seconds * USEC_PER_SEC;
+	char *cpucg;
+
+	cpucg = cg_name(root, "cpucg_test");
+	if (!cpucg)
+		goto cleanup;
+
+	if (cg_create(cpucg))
+		goto cleanup;
+
+	if (cg_write(cpucg, "cpu.max", "1000"))
+		goto cleanup;
+
+	struct cpu_hog_func_param param = {
+		.nprocs = 1,
+		.ts = {
+			.tv_sec = usage_seconds,
+			.tv_nsec = 0,
+		},
+		.clock_type = CPU_HOG_CLOCK_WALL,
+	};
+	if (cg_run(cpucg, hog_cpus_timed, (void *)&param))
+		goto cleanup;
+
+	usage_usec = cg_read_key_long(cpucg, "cpu.stat", "usage_usec");
+	user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec");
+	if (user_usec <= 0)
+		goto cleanup;
+
+	if (user_usec >= expected_usage_usec)
+		goto cleanup;
+
+	if (values_close(usage_usec, expected_usage_usec, 95))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	cg_destroy(cpucg);
+	free(cpucg);
+
+	return ret;
+}
+
+/*
+ * This test verifies that a process inside of a nested cgroup whose parent
+ * group has a cpu.max value set, is properly throttled.
+ */
+static int test_cpucg_max_nested(const char *root)
+{
+	int ret = KSFT_FAIL;
+	long usage_usec, user_usec;
+	long usage_seconds = 1;
+	long expected_usage_usec = usage_seconds * USEC_PER_SEC;
+	char *parent, *child;
+
+	parent = cg_name(root, "cpucg_parent");
+	child = cg_name(parent, "cpucg_child");
+	if (!parent || !child)
+		goto cleanup;
+
+	if (cg_create(parent))
+		goto cleanup;
+
+	if (cg_write(parent, "cgroup.subtree_control", "+cpu"))
+		goto cleanup;
+
+	if (cg_create(child))
+		goto cleanup;
+
+	if (cg_write(parent, "cpu.max", "1000"))
+		goto cleanup;
+
+	struct cpu_hog_func_param param = {
+		.nprocs = 1,
+		.ts = {
+			.tv_sec = usage_seconds,
+			.tv_nsec = 0,
+		},
+		.clock_type = CPU_HOG_CLOCK_WALL,
+	};
+	if (cg_run(child, hog_cpus_timed, (void *)&param))
+		goto cleanup;
+
+	usage_usec = cg_read_key_long(child, "cpu.stat", "usage_usec");
+	user_usec = cg_read_key_long(child, "cpu.stat", "user_usec");
+	if (user_usec <= 0)
+		goto cleanup;
+
+	if (user_usec >= expected_usage_usec)
+		goto cleanup;
+
+	if (values_close(usage_usec, expected_usage_usec, 95))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	cg_destroy(child);
+	free(child);
+	cg_destroy(parent);
+	free(parent);
+
+	return ret;
+}
+
+#define T(x) { x, #x }
+struct cpucg_test {
+	int (*fn)(const char *root);
+	const char *name;
+} tests[] = {
+	T(test_cpucg_subtree_control),
+	T(test_cpucg_stats),
+	T(test_cpucg_weight_overprovisioned),
+	T(test_cpucg_weight_underprovisioned),
+	T(test_cpucg_nested_weight_overprovisioned),
+	T(test_cpucg_nested_weight_underprovisioned),
+	T(test_cpucg_max),
+	T(test_cpucg_max_nested),
+};
+#undef T
+
+int main(int argc, char *argv[])
+{
+	char root[PATH_MAX];
+	int i, ret = EXIT_SUCCESS;
+
+	if (cg_find_unified_root(root, sizeof(root)))
+		ksft_exit_skip("cgroup v2 isn't mounted\n");
+
+	if (cg_read_strstr(root, "cgroup.subtree_control", "cpu"))
+		if (cg_write(root, "cgroup.subtree_control", "+cpu"))
+			ksft_exit_skip("Failed to set cpu controller\n");
+
+	for (i = 0; i < ARRAY_SIZE(tests); i++) {
+		switch (tests[i].fn(root)) {
+		case KSFT_PASS:
+			ksft_test_result_pass("%s\n", tests[i].name);
+			break;
+		case KSFT_SKIP:
+			ksft_test_result_skip("%s\n", tests[i].name);
+			break;
+		default:
+			ret = EXIT_FAILURE;
+			ksft_test_result_fail("%s\n", tests[i].name);
+			break;
+		}
+	}
+
+	return ret;
+}
diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
new file mode 100755
index 000000000..a50324485
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
@@ -0,0 +1,675 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test for cpuset v2 partition root state (PRS)
+#
+# The sched verbose flag is set, if available, so that the console log
+# can be examined for the correct setting of scheduling domain.
+#
+
+skip_test() {
+	echo "$1"
+	echo "Test SKIPPED"
+	exit 0
+}
+
+[[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!"
+
+# Set sched verbose flag, if available
+[[ -d /sys/kernel/debug/sched ]] && echo Y > /sys/kernel/debug/sched/verbose
+
+# Get wait_inotify location
+WAIT_INOTIFY=$(cd $(dirname $0); pwd)/wait_inotify
+
+# Find cgroup v2 mount point
+CGROUP2=$(mount -t cgroup2 | head -1 | awk -e '{print $3}')
+[[ -n "$CGROUP2" ]] || skip_test "Cgroup v2 mount point not found!"
+
+CPUS=$(lscpu | grep "^CPU(s)" | sed -e "s/.*:[[:space:]]*//")
+[[ $CPUS -lt 8 ]] && skip_test "Test needs at least 8 cpus available!"
+
+# Set verbose flag and delay factor
+PROG=$1
+VERBOSE=
+DELAY_FACTOR=1
+while [[ "$1" = -* ]]
+do
+	case "$1" in
+		-v) VERBOSE=1
+		    break
+		    ;;
+		-d) DELAY_FACTOR=$2
+		    shift
+		    break
+		    ;;
+		*)  echo "Usage: $PROG [-v] [-d <delay-factor>"
+		    exit
+		    ;;
+	esac
+	shift
+done
+
+cd $CGROUP2
+echo +cpuset > cgroup.subtree_control
+[[ -d test ]] || mkdir test
+cd test
+
+# Pause in ms
+pause()
+{
+	DELAY=$1
+	LOOP=0
+	while [[ $LOOP -lt $DELAY_FACTOR ]]
+	do
+		sleep $DELAY
+		((LOOP++))
+	done
+	return 0
+}
+
+console_msg()
+{
+	MSG=$1
+	echo "$MSG"
+	echo "" > /dev/console
+	echo "$MSG" > /dev/console
+	pause 0.01
+}
+
+test_partition()
+{
+	EXPECTED_VAL=$1
+	echo $EXPECTED_VAL > cpuset.cpus.partition
+	[[ $? -eq 0 ]] || exit 1
+	ACTUAL_VAL=$(cat cpuset.cpus.partition)
+	[[ $ACTUAL_VAL != $EXPECTED_VAL ]] && {
+		echo "cpuset.cpus.partition: expect $EXPECTED_VAL, found $EXPECTED_VAL"
+		echo "Test FAILED"
+		exit 1
+	}
+}
+
+test_effective_cpus()
+{
+	EXPECTED_VAL=$1
+	ACTUAL_VAL=$(cat cpuset.cpus.effective)
+	[[ "$ACTUAL_VAL" != "$EXPECTED_VAL" ]] && {
+		echo "cpuset.cpus.effective: expect '$EXPECTED_VAL', found '$EXPECTED_VAL'"
+		echo "Test FAILED"
+		exit 1
+	}
+}
+
+# Adding current process to cgroup.procs as a test
+test_add_proc()
+{
+	OUTSTR="$1"
+	ERRMSG=$((echo $$ > cgroup.procs) |& cat)
+	echo $ERRMSG | grep -q "$OUTSTR"
+	[[ $? -ne 0 ]] && {
+		echo "cgroup.procs: expect '$OUTSTR', got '$ERRMSG'"
+		echo "Test FAILED"
+		exit 1
+	}
+	echo $$ > $CGROUP2/cgroup.procs	# Move out the task
+}
+
+#
+# Testing the new "isolated" partition root type
+#
+test_isolated()
+{
+	echo 2-3 > cpuset.cpus
+	TYPE=$(cat cpuset.cpus.partition)
+	[[ $TYPE = member ]] || echo member > cpuset.cpus.partition
+
+	console_msg "Change from member to root"
+	test_partition root
+
+	console_msg "Change from root to isolated"
+	test_partition isolated
+
+	console_msg "Change from isolated to member"
+	test_partition member
+
+	console_msg "Change from member to isolated"
+	test_partition isolated
+
+	console_msg "Change from isolated to root"
+	test_partition root
+
+	console_msg "Change from root to member"
+	test_partition member
+
+	#
+	# Testing partition root with no cpu
+	#
+	console_msg "Distribute all cpus to child partition"
+	echo +cpuset > cgroup.subtree_control
+	test_partition root
+
+	mkdir A1
+	cd A1
+	echo 2-3 > cpuset.cpus
+	test_partition root
+	test_effective_cpus 2-3
+	cd ..
+	test_effective_cpus ""
+
+	console_msg "Moving task to partition test"
+	test_add_proc "No space left"
+	cd A1
+	test_add_proc ""
+	cd ..
+
+	console_msg "Shrink and expand child partition"
+	cd A1
+	echo 2 > cpuset.cpus
+	cd ..
+	test_effective_cpus 3
+	cd A1
+	echo 2-3 > cpuset.cpus
+	cd ..
+	test_effective_cpus ""
+
+	# Cleaning up
+	console_msg "Cleaning up"
+	echo $$ > $CGROUP2/cgroup.procs
+	[[ -d A1 ]] && rmdir A1
+}
+
+#
+# Cpuset controller state transition test matrix.
+#
+# Cgroup test hierarchy
+#
+# test -- A1 -- A2 -- A3
+#      \- B1
+#
+#  P<v> = set cpus.partition (0:member, 1:root, 2:isolated, -1:root invalid)
+#  C<l> = add cpu-list
+#  S<p> = use prefix in subtree_control
+#  T    = put a task into cgroup
+#  O<c>-<v> = Write <v> to CPU online file of <c>
+#
+SETUP_A123_PARTITIONS="C1-3:P1:S+ C2-3:P1:S+ C3:P1"
+TEST_MATRIX=(
+	# test  old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate
+	# ----  ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------
+	"  S+    C0-1     .      .    C2-3    S+    C4-5     .      .     0 A2:0-1"
+	"  S+    C0-1     .      .    C2-3    P1      .      .      .     0 "
+	"  S+    C0-1     .      .    C2-3   P1:S+ C0-1:P1   .      .     0 "
+	"  S+    C0-1     .      .    C2-3   P1:S+  C1:P1    .      .     0 "
+	"  S+   C0-1:S+   .      .    C2-3     .      .      .     P1     0 "
+	"  S+   C0-1:P1   .      .    C2-3    S+     C1      .      .     0 "
+	"  S+   C0-1:P1   .      .    C2-3    S+    C1:P1    .      .     0 "
+	"  S+   C0-1:P1   .      .    C2-3    S+    C1:P1    .     P1     0 "
+	"  S+   C0-1:P1   .      .    C2-3   C4-5     .      .      .     0 A1:4-5"
+	"  S+   C0-1:P1   .      .    C2-3  S+:C4-5   .      .      .     0 A1:4-5"
+	"  S+    C0-1     .      .   C2-3:P1   .      .      .     C2     0 "
+	"  S+    C0-1     .      .   C2-3:P1   .      .      .    C4-5    0 B1:4-5"
+	"  S+ C0-3:P1:S+ C2-3:P1 .      .      .      .      .      .     0 A1:0-1,A2:2-3"
+	"  S+ C0-3:P1:S+ C2-3:P1 .      .     C1-3    .      .      .     0 A1:1,A2:2-3"
+	"  S+ C2-3:P1:S+  C3:P1  .      .     C3      .      .      .     0 A1:,A2:3 A1:P1,A2:P1"
+	"  S+ C2-3:P1:S+  C3:P1  .      .     C3      P0     .      .     0 A1:3,A2:3 A1:P1,A2:P0"
+	"  S+ C2-3:P1:S+  C2:P1  .      .     C2-4    .      .      .     0 A1:3-4,A2:2"
+	"  S+ C2-3:P1:S+  C3:P1  .      .     C3      .      .     C0-2   0 A1:,B1:0-2 A1:P1,A2:P1"
+	"  S+ $SETUP_A123_PARTITIONS    .     C2-3    .      .      .     0 A1:,A2:2,A3:3 A1:P1,A2:P1,A3:P1"
+
+	# CPU offlining cases:
+	"  S+    C0-1     .      .    C2-3    S+    C4-5     .     O2-0   0 A1:0-1,B1:3"
+	"  S+ C0-3:P1:S+ C2-3:P1 .      .     O2-0    .      .      .     0 A1:0-1,A2:3"
+	"  S+ C0-3:P1:S+ C2-3:P1 .      .     O2-0   O2-1    .      .     0 A1:0-1,A2:2-3"
+	"  S+ C0-3:P1:S+ C2-3:P1 .      .     O1-0    .      .      .     0 A1:0,A2:2-3"
+	"  S+ C0-3:P1:S+ C2-3:P1 .      .     O1-0   O1-1    .      .     0 A1:0-1,A2:2-3"
+	"  S+ C2-3:P1:S+  C3:P1  .      .     O3-0   O3-1    .      .     0 A1:2,A2:3 A1:P1,A2:P1"
+	"  S+ C2-3:P1:S+  C3:P2  .      .     O3-0   O3-1    .      .     0 A1:2,A2:3 A1:P1,A2:P2"
+	"  S+ C2-3:P1:S+  C3:P1  .      .     O2-0   O2-1    .      .     0 A1:2,A2:3 A1:P1,A2:P1"
+	"  S+ C2-3:P1:S+  C3:P2  .      .     O2-0   O2-1    .      .     0 A1:2,A2:3 A1:P1,A2:P2"
+	"  S+ C2-3:P1:S+  C3:P1  .      .     O2-0    .      .      .     0 A1:,A2:3 A1:P1,A2:P1"
+	"  S+ C2-3:P1:S+  C3:P1  .      .     O3-0    .      .      .     0 A1:2,A2: A1:P1,A2:P1"
+	"  S+ C2-3:P1:S+  C3:P1  .      .    T:O2-0   .      .      .     0 A1:3,A2:3 A1:P1,A2:P-1"
+	"  S+ C2-3:P1:S+  C3:P1  .      .      .    T:O3-0   .      .     0 A1:2,A2:2 A1:P1,A2:P-1"
+	"  S+ $SETUP_A123_PARTITIONS    .     O1-0    .      .      .     0 A1:,A2:2,A3:3 A1:P1,A2:P1,A3:P1"
+	"  S+ $SETUP_A123_PARTITIONS    .     O2-0    .      .      .     0 A1:1,A2:,A3:3 A1:P1,A2:P1,A3:P1"
+	"  S+ $SETUP_A123_PARTITIONS    .     O3-0    .      .      .     0 A1:1,A2:2,A3: A1:P1,A2:P1,A3:P1"
+	"  S+ $SETUP_A123_PARTITIONS    .    T:O1-0   .      .      .     0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P-1,A3:P-1"
+	"  S+ $SETUP_A123_PARTITIONS    .      .    T:O2-0   .      .     0 A1:1,A2:3,A3:3 A1:P1,A2:P1,A3:P-1"
+	"  S+ $SETUP_A123_PARTITIONS    .      .      .    T:O3-0   .     0 A1:1,A2:2,A3:2 A1:P1,A2:P1,A3:P-1"
+	"  S+ $SETUP_A123_PARTITIONS    .    T:O1-0  O1-1    .      .     0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1"
+	"  S+ $SETUP_A123_PARTITIONS    .      .    T:O2-0  O2-1    .     0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1"
+	"  S+ $SETUP_A123_PARTITIONS    .      .      .    T:O3-0  O3-1   0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1"
+	"  S+ $SETUP_A123_PARTITIONS    .    T:O1-0  O2-0   O1-1    .     0 A1:1,A2:,A3:3 A1:P1,A2:P1,A3:P1"
+	"  S+ $SETUP_A123_PARTITIONS    .    T:O1-0  O2-0   O2-1    .     0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P-1,A3:P-1"
+
+	# test  old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate
+	# ----  ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------
+	#
+	# Incorrect change to cpuset.cpus invalidates partition root
+	#
+	# Adding CPUs to partition root that are not in parent's
+	# cpuset.cpus is allowed, but those extra CPUs are ignored.
+	"  S+ C2-3:P1:S+ C3:P1   .      .      .     C2-4    .      .     0 A1:,A2:2-3 A1:P1,A2:P1"
+
+	# Taking away all CPUs from parent or itself if there are tasks
+	# will make the partition invalid.
+	"  S+ C2-3:P1:S+  C3:P1  .      .      T     C2-3    .      .     0 A1:2-3,A2:2-3 A1:P1,A2:P-1"
+	"  S+  C3:P1:S+    C3    .      .      T      P1     .      .     0 A1:3,A2:3 A1:P1,A2:P-1"
+	"  S+ $SETUP_A123_PARTITIONS    .    T:C2-3   .      .      .     0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P-1,A3:P-1"
+	"  S+ $SETUP_A123_PARTITIONS    . T:C2-3:C1-3 .      .      .     0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1"
+
+	# Changing a partition root to member makes child partitions invalid
+	"  S+ C2-3:P1:S+  C3:P1  .      .      P0     .      .      .     0 A1:2-3,A2:3 A1:P0,A2:P-1"
+	"  S+ $SETUP_A123_PARTITIONS    .     C2-3    P0     .      .     0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P0,A3:P-1"
+
+	# cpuset.cpus can contains cpus not in parent's cpuset.cpus as long
+	# as they overlap.
+	"  S+ C2-3:P1:S+  .      .      .      .   C3-4:P1   .      .     0 A1:2,A2:3 A1:P1,A2:P1"
+
+	# Deletion of CPUs distributed to child cgroup is allowed.
+	"  S+ C0-1:P1:S+ C1      .    C2-3   C4-5     .      .      .     0 A1:4-5,A2:4-5"
+
+	# To become a valid partition root, cpuset.cpus must overlap parent's
+	# cpuset.cpus.
+	"  S+   C0-1:P1   .      .    C2-3    S+   C4-5:P1   .      .     0 A1:0-1,A2:0-1 A1:P1,A2:P-1"
+
+	# Enabling partition with child cpusets is allowed
+	"  S+   C0-1:S+  C1      .    C2-3    P1      .      .      .     0 A1:0-1,A2:1 A1:P1"
+
+	# A partition root with non-partition root parent is invalid, but it
+	# can be made valid if its parent becomes a partition root too.
+	"  S+   C0-1:S+  C1      .    C2-3     .      P2     .      .     0 A1:0-1,A2:1 A1:P0,A2:P-2"
+	"  S+   C0-1:S+ C1:P2    .    C2-3     P1     .      .      .     0 A1:0,A2:1 A1:P1,A2:P2"
+
+	# A non-exclusive cpuset.cpus change will invalidate partition and its siblings
+	"  S+   C0-1:P1   .      .    C2-3   C0-2     .      .      .     0 A1:0-2,B1:2-3 A1:P-1,B1:P0"
+	"  S+   C0-1:P1   .      .  P1:C2-3  C0-2   .      .      .     0 A1:0-2,B1:2-3 A1:P-1,B1:P-1"
+	"  S+    C0-1     .      .  P1:C2-3  C0-2   .      .      .     0 A1:0-2,B1:2-3 A1:P0,B1:P-1"
+
+	# test  old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate
+	# ----  ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------
+	# Failure cases:
+
+	# A task cannot be added to a partition with no cpu
+	"  S+ C2-3:P1:S+  C3:P1  .      .    O2-0:T   .      .      .     1 A1:,A2:3 A1:P1,A2:P1"
+)
+
+#
+# Write to the cpu online file
+#  $1 - <c>-<v> where <c> = cpu number, <v> value to be written
+#
+write_cpu_online()
+{
+	CPU=${1%-*}
+	VAL=${1#*-}
+	CPUFILE=//sys/devices/system/cpu/cpu${CPU}/online
+	if [[ $VAL -eq 0 ]]
+	then
+		OFFLINE_CPUS="$OFFLINE_CPUS $CPU"
+	else
+		[[ -n "$OFFLINE_CPUS" ]] && {
+			OFFLINE_CPUS=$(echo $CPU $CPU $OFFLINE_CPUS | fmt -1 |\
+					sort | uniq -u)
+		}
+	fi
+	echo $VAL > $CPUFILE
+	pause 0.01
+}
+
+#
+# Set controller state
+#  $1 - cgroup directory
+#  $2 - state
+#  $3 - showerr
+#
+# The presence of ":" in state means transition from one to the next.
+#
+set_ctrl_state()
+{
+	TMPMSG=/tmp/.msg_$$
+	CGRP=$1
+	STATE=$2
+	SHOWERR=${3}${VERBOSE}
+	CTRL=${CTRL:=$CONTROLLER}
+	HASERR=0
+	REDIRECT="2> $TMPMSG"
+	[[ -z "$STATE" || "$STATE" = '.' ]] && return 0
+
+	rm -f $TMPMSG
+	for CMD in $(echo $STATE | sed -e "s/:/ /g")
+	do
+		TFILE=$CGRP/cgroup.procs
+		SFILE=$CGRP/cgroup.subtree_control
+		PFILE=$CGRP/cpuset.cpus.partition
+		CFILE=$CGRP/cpuset.cpus
+		S=$(expr substr $CMD 1 1)
+		if [[ $S = S ]]
+		then
+			PREFIX=${CMD#?}
+			COMM="echo ${PREFIX}${CTRL} > $SFILE"
+			eval $COMM $REDIRECT
+		elif [[ $S = C ]]
+		then
+			CPUS=${CMD#?}
+			COMM="echo $CPUS > $CFILE"
+			eval $COMM $REDIRECT
+		elif [[ $S = P ]]
+		then
+			VAL=${CMD#?}
+			case $VAL in
+			0)  VAL=member
+			    ;;
+			1)  VAL=root
+			    ;;
+			2)  VAL=isolated
+			    ;;
+			*)
+			    echo "Invalid partition state - $VAL"
+			    exit 1
+			    ;;
+			esac
+			COMM="echo $VAL > $PFILE"
+			eval $COMM $REDIRECT
+		elif [[ $S = O ]]
+		then
+			VAL=${CMD#?}
+			write_cpu_online $VAL
+		elif [[ $S = T ]]
+		then
+			COMM="echo 0 > $TFILE"
+			eval $COMM $REDIRECT
+		fi
+		RET=$?
+		[[ $RET -ne 0 ]] && {
+			[[ -n "$SHOWERR" ]] && {
+				echo "$COMM"
+				cat $TMPMSG
+			}
+			HASERR=1
+		}
+		pause 0.01
+		rm -f $TMPMSG
+	done
+	return $HASERR
+}
+
+set_ctrl_state_noerr()
+{
+	CGRP=$1
+	STATE=$2
+	[[ -d $CGRP ]] || mkdir $CGRP
+	set_ctrl_state $CGRP $STATE 1
+	[[ $? -ne 0 ]] && {
+		echo "ERROR: Failed to set $2 to cgroup $1!"
+		exit 1
+	}
+}
+
+online_cpus()
+{
+	[[ -n "OFFLINE_CPUS" ]] && {
+		for C in $OFFLINE_CPUS
+		do
+			write_cpu_online ${C}-1
+		done
+	}
+}
+
+#
+# Return 1 if the list of effective cpus isn't the same as the initial list.
+#
+reset_cgroup_states()
+{
+	echo 0 > $CGROUP2/cgroup.procs
+	online_cpus
+	rmdir A1/A2/A3 A1/A2 A1 B1 > /dev/null 2>&1
+	set_ctrl_state . S-
+	pause 0.01
+}
+
+dump_states()
+{
+	for DIR in A1 A1/A2 A1/A2/A3 B1
+	do
+		ECPUS=$DIR/cpuset.cpus.effective
+		PRS=$DIR/cpuset.cpus.partition
+		[[ -e $ECPUS ]] && echo "$ECPUS: $(cat $ECPUS)"
+		[[ -e $PRS   ]] && echo "$PRS: $(cat $PRS)"
+	done
+}
+
+#
+# Check effective cpus
+# $1 - check string, format: <cgroup>:<cpu-list>[,<cgroup>:<cpu-list>]*
+#
+check_effective_cpus()
+{
+	CHK_STR=$1
+	for CHK in $(echo $CHK_STR | sed -e "s/,/ /g")
+	do
+		set -- $(echo $CHK | sed -e "s/:/ /g")
+		CGRP=$1
+		CPUS=$2
+		[[ $CGRP = A2 ]] && CGRP=A1/A2
+		[[ $CGRP = A3 ]] && CGRP=A1/A2/A3
+		FILE=$CGRP/cpuset.cpus.effective
+		[[ -e $FILE ]] || return 1
+		[[ $CPUS = $(cat $FILE) ]] || return 1
+	done
+}
+
+#
+# Check cgroup states
+#  $1 - check string, format: <cgroup>:<state>[,<cgroup>:<state>]*
+#
+check_cgroup_states()
+{
+	CHK_STR=$1
+	for CHK in $(echo $CHK_STR | sed -e "s/,/ /g")
+	do
+		set -- $(echo $CHK | sed -e "s/:/ /g")
+		CGRP=$1
+		STATE=$2
+		FILE=
+		EVAL=$(expr substr $STATE 2 2)
+		[[ $CGRP = A2 ]] && CGRP=A1/A2
+		[[ $CGRP = A3 ]] && CGRP=A1/A2/A3
+
+		case $STATE in
+			P*) FILE=$CGRP/cpuset.cpus.partition
+			    ;;
+			*)  echo "Unknown state: $STATE!"
+			    exit 1
+			    ;;
+		esac
+		VAL=$(cat $FILE)
+
+		case "$VAL" in
+			member) VAL=0
+				;;
+			root)	VAL=1
+				;;
+			isolated)
+				VAL=2
+				;;
+			"root invalid"*)
+				VAL=-1
+				;;
+			"isolated invalid"*)
+				VAL=-2
+				;;
+		esac
+		[[ $EVAL != $VAL ]] && return 1
+	done
+	return 0
+}
+
+#
+# Run cpuset state transition test
+#  $1 - test matrix name
+#
+# This test is somewhat fragile as delays (sleep x) are added in various
+# places to make sure state changes are fully propagated before the next
+# action. These delays may need to be adjusted if running in a slower machine.
+#
+run_state_test()
+{
+	TEST=$1
+	CONTROLLER=cpuset
+	CPULIST=0-6
+	I=0
+	eval CNT="\${#$TEST[@]}"
+
+	reset_cgroup_states
+	echo $CPULIST > cpuset.cpus
+	echo root > cpuset.cpus.partition
+	console_msg "Running state transition test ..."
+
+	while [[ $I -lt $CNT ]]
+	do
+		echo "Running test $I ..." > /dev/console
+		eval set -- "\${$TEST[$I]}"
+		ROOT=$1
+		OLD_A1=$2
+		OLD_A2=$3
+		OLD_A3=$4
+		OLD_B1=$5
+		NEW_A1=$6
+		NEW_A2=$7
+		NEW_A3=$8
+		NEW_B1=$9
+		RESULT=${10}
+		ECPUS=${11}
+		STATES=${12}
+
+		set_ctrl_state_noerr .        $ROOT
+		set_ctrl_state_noerr A1       $OLD_A1
+		set_ctrl_state_noerr A1/A2    $OLD_A2
+		set_ctrl_state_noerr A1/A2/A3 $OLD_A3
+		set_ctrl_state_noerr B1       $OLD_B1
+		RETVAL=0
+		set_ctrl_state A1       $NEW_A1; ((RETVAL += $?))
+		set_ctrl_state A1/A2    $NEW_A2; ((RETVAL += $?))
+		set_ctrl_state A1/A2/A3 $NEW_A3; ((RETVAL += $?))
+		set_ctrl_state B1       $NEW_B1; ((RETVAL += $?))
+
+		[[ $RETVAL -ne $RESULT ]] && {
+			echo "Test $TEST[$I] failed result check!"
+			eval echo \"\${$TEST[$I]}\"
+			dump_states
+			online_cpus
+			exit 1
+		}
+
+		[[ -n "$ECPUS" && "$ECPUS" != . ]] && {
+			check_effective_cpus $ECPUS
+			[[ $? -ne 0 ]] && {
+				echo "Test $TEST[$I] failed effective CPU check!"
+				eval echo \"\${$TEST[$I]}\"
+				echo
+				dump_states
+				online_cpus
+				exit 1
+			}
+		}
+
+		[[ -n "$STATES" ]] && {
+			check_cgroup_states $STATES
+			[[ $? -ne 0 ]] && {
+				echo "FAILED: Test $TEST[$I] failed states check!"
+				eval echo \"\${$TEST[$I]}\"
+				echo
+				dump_states
+				online_cpus
+				exit 1
+			}
+		}
+
+		reset_cgroup_states
+		#
+		# Check to see if effective cpu list changes
+		#
+		pause 0.05
+		NEWLIST=$(cat cpuset.cpus.effective)
+		[[ $NEWLIST != $CPULIST ]] && {
+			echo "Effective cpus changed to $NEWLIST after test $I!"
+			exit 1
+		}
+		[[ -n "$VERBOSE" ]] && echo "Test $I done."
+		((I++))
+	done
+	echo "All $I tests of $TEST PASSED."
+
+	echo member > cpuset.cpus.partition
+}
+
+#
+# Wait for inotify event for the given file and read it
+# $1: cgroup file to wait for
+# $2: file to store the read result
+#
+wait_inotify()
+{
+	CGROUP_FILE=$1
+	OUTPUT_FILE=$2
+
+	$WAIT_INOTIFY $CGROUP_FILE
+	cat $CGROUP_FILE > $OUTPUT_FILE
+}
+
+#
+# Test if inotify events are properly generated when going into and out of
+# invalid partition state.
+#
+test_inotify()
+{
+	ERR=0
+	PRS=/tmp/.prs_$$
+	[[ -f $WAIT_INOTIFY ]] || {
+		echo "wait_inotify not found, inotify test SKIPPED."
+		return
+	}
+
+	pause 0.01
+	echo 1 > cpuset.cpus
+	echo 0 > cgroup.procs
+	echo root > cpuset.cpus.partition
+	pause 0.01
+	rm -f $PRS
+	wait_inotify $PWD/cpuset.cpus.partition $PRS &
+	pause 0.01
+	set_ctrl_state . "O1-0"
+	pause 0.01
+	check_cgroup_states ".:P-1"
+	if [[ $? -ne 0 ]]
+	then
+		echo "FAILED: Inotify test - partition not invalid"
+		ERR=1
+	elif [[ ! -f $PRS ]]
+	then
+		echo "FAILED: Inotify test - event not generated"
+		ERR=1
+		kill %1
+	elif [[ $(cat $PRS) != "root invalid"* ]]
+	then
+		echo "FAILED: Inotify test - incorrect state"
+		cat $PRS
+		ERR=1
+	fi
+	online_cpus
+	echo member > cpuset.cpus.partition
+	echo 0 > ../cgroup.procs
+	if [[ $ERR -ne 0 ]]
+	then
+		exit 1
+	else
+		echo "Inotify test PASSED"
+	fi
+}
+
+run_state_test TEST_MATRIX
+test_isolated
+test_inotify
+echo "All tests PASSED."
+cd ..
+rmdir test
diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c
new file mode 100644
index 000000000..ff519029f
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_freezer.c
@@ -0,0 +1,848 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <stdbool.h>
+#include <linux/limits.h>
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/wait.h>
+
+#include "../kselftest.h"
+#include "cgroup_util.h"
+
+#define DEBUG
+#ifdef DEBUG
+#define debug(args...) fprintf(stderr, args)
+#else
+#define debug(args...)
+#endif
+
+/*
+ * Check if the cgroup is frozen by looking at the cgroup.events::frozen value.
+ */
+static int cg_check_frozen(const char *cgroup, bool frozen)
+{
+	if (frozen) {
+		if (cg_read_strstr(cgroup, "cgroup.events", "frozen 1") != 0) {
+			debug("Cgroup %s isn't frozen\n", cgroup);
+			return -1;
+		}
+	} else {
+		/*
+		 * Check the cgroup.events::frozen value.
+		 */
+		if (cg_read_strstr(cgroup, "cgroup.events", "frozen 0") != 0) {
+			debug("Cgroup %s is frozen\n", cgroup);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Freeze the given cgroup.
+ */
+static int cg_freeze_nowait(const char *cgroup, bool freeze)
+{
+	return cg_write(cgroup, "cgroup.freeze", freeze ? "1" : "0");
+}
+
+/*
+ * Attach a task to the given cgroup and wait for a cgroup frozen event.
+ * All transient events (e.g. populated) are ignored.
+ */
+static int cg_enter_and_wait_for_frozen(const char *cgroup, int pid,
+					bool frozen)
+{
+	int fd, ret = -1;
+	int attempts;
+
+	fd = cg_prepare_for_wait(cgroup);
+	if (fd < 0)
+		return fd;
+
+	ret = cg_enter(cgroup, pid);
+	if (ret)
+		goto out;
+
+	for (attempts = 0; attempts < 10; attempts++) {
+		ret = cg_wait_for(fd);
+		if (ret)
+			break;
+
+		ret = cg_check_frozen(cgroup, frozen);
+		if (ret)
+			continue;
+	}
+
+out:
+	close(fd);
+	return ret;
+}
+
+/*
+ * Freeze the given cgroup and wait for the inotify signal.
+ * If there are no events in 10 seconds, treat this as an error.
+ * Then check that the cgroup is in the desired state.
+ */
+static int cg_freeze_wait(const char *cgroup, bool freeze)
+{
+	int fd, ret = -1;
+
+	fd = cg_prepare_for_wait(cgroup);
+	if (fd < 0)
+		return fd;
+
+	ret = cg_freeze_nowait(cgroup, freeze);
+	if (ret) {
+		debug("Error: cg_freeze_nowait() failed\n");
+		goto out;
+	}
+
+	ret = cg_wait_for(fd);
+	if (ret)
+		goto out;
+
+	ret = cg_check_frozen(cgroup, freeze);
+out:
+	close(fd);
+	return ret;
+}
+
+/*
+ * A simple process running in a sleep loop until being
+ * re-parented.
+ */
+static int child_fn(const char *cgroup, void *arg)
+{
+	int ppid = getppid();
+
+	while (getppid() == ppid)
+		usleep(1000);
+
+	return getppid() == ppid;
+}
+
+/*
+ * A simple test for the cgroup freezer: populated the cgroup with 100
+ * running processes and freeze it. Then unfreeze it. Then it kills all
+ * processes and destroys the cgroup.
+ */
+static int test_cgfreezer_simple(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *cgroup = NULL;
+	int i;
+
+	cgroup = cg_name(root, "cg_test_simple");
+	if (!cgroup)
+		goto cleanup;
+
+	if (cg_create(cgroup))
+		goto cleanup;
+
+	for (i = 0; i < 100; i++)
+		cg_run_nowait(cgroup, child_fn, NULL);
+
+	if (cg_wait_for_proc_count(cgroup, 100))
+		goto cleanup;
+
+	if (cg_check_frozen(cgroup, false))
+		goto cleanup;
+
+	if (cg_freeze_wait(cgroup, true))
+		goto cleanup;
+
+	if (cg_freeze_wait(cgroup, false))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (cgroup)
+		cg_destroy(cgroup);
+	free(cgroup);
+	return ret;
+}
+
+/*
+ * The test creates the following hierarchy:
+ *       A
+ *    / / \ \
+ *   B  E  I K
+ *  /\  |
+ * C  D F
+ *      |
+ *      G
+ *      |
+ *      H
+ *
+ * with a process in C, H and 3 processes in K.
+ * Then it tries to freeze and unfreeze the whole tree.
+ */
+static int test_cgfreezer_tree(const char *root)
+{
+	char *cgroup[10] = {0};
+	int ret = KSFT_FAIL;
+	int i;
+
+	cgroup[0] = cg_name(root, "cg_test_tree_A");
+	if (!cgroup[0])
+		goto cleanup;
+
+	cgroup[1] = cg_name(cgroup[0], "B");
+	if (!cgroup[1])
+		goto cleanup;
+
+	cgroup[2] = cg_name(cgroup[1], "C");
+	if (!cgroup[2])
+		goto cleanup;
+
+	cgroup[3] = cg_name(cgroup[1], "D");
+	if (!cgroup[3])
+		goto cleanup;
+
+	cgroup[4] = cg_name(cgroup[0], "E");
+	if (!cgroup[4])
+		goto cleanup;
+
+	cgroup[5] = cg_name(cgroup[4], "F");
+	if (!cgroup[5])
+		goto cleanup;
+
+	cgroup[6] = cg_name(cgroup[5], "G");
+	if (!cgroup[6])
+		goto cleanup;
+
+	cgroup[7] = cg_name(cgroup[6], "H");
+	if (!cgroup[7])
+		goto cleanup;
+
+	cgroup[8] = cg_name(cgroup[0], "I");
+	if (!cgroup[8])
+		goto cleanup;
+
+	cgroup[9] = cg_name(cgroup[0], "K");
+	if (!cgroup[9])
+		goto cleanup;
+
+	for (i = 0; i < 10; i++)
+		if (cg_create(cgroup[i]))
+			goto cleanup;
+
+	cg_run_nowait(cgroup[2], child_fn, NULL);
+	cg_run_nowait(cgroup[7], child_fn, NULL);
+	cg_run_nowait(cgroup[9], child_fn, NULL);
+	cg_run_nowait(cgroup[9], child_fn, NULL);
+	cg_run_nowait(cgroup[9], child_fn, NULL);
+
+	/*
+	 * Wait until all child processes will enter
+	 * corresponding cgroups.
+	 */
+
+	if (cg_wait_for_proc_count(cgroup[2], 1) ||
+	    cg_wait_for_proc_count(cgroup[7], 1) ||
+	    cg_wait_for_proc_count(cgroup[9], 3))
+		goto cleanup;
+
+	/*
+	 * Freeze B.
+	 */
+	if (cg_freeze_wait(cgroup[1], true))
+		goto cleanup;
+
+	/*
+	 * Freeze F.
+	 */
+	if (cg_freeze_wait(cgroup[5], true))
+		goto cleanup;
+
+	/*
+	 * Freeze G.
+	 */
+	if (cg_freeze_wait(cgroup[6], true))
+		goto cleanup;
+
+	/*
+	 * Check that A and E are not frozen.
+	 */
+	if (cg_check_frozen(cgroup[0], false))
+		goto cleanup;
+
+	if (cg_check_frozen(cgroup[4], false))
+		goto cleanup;
+
+	/*
+	 * Freeze A. Check that A, B and E are frozen.
+	 */
+	if (cg_freeze_wait(cgroup[0], true))
+		goto cleanup;
+
+	if (cg_check_frozen(cgroup[1], true))
+		goto cleanup;
+
+	if (cg_check_frozen(cgroup[4], true))
+		goto cleanup;
+
+	/*
+	 * Unfreeze B, F and G
+	 */
+	if (cg_freeze_nowait(cgroup[1], false))
+		goto cleanup;
+
+	if (cg_freeze_nowait(cgroup[5], false))
+		goto cleanup;
+
+	if (cg_freeze_nowait(cgroup[6], false))
+		goto cleanup;
+
+	/*
+	 * Check that C and H are still frozen.
+	 */
+	if (cg_check_frozen(cgroup[2], true))
+		goto cleanup;
+
+	if (cg_check_frozen(cgroup[7], true))
+		goto cleanup;
+
+	/*
+	 * Unfreeze A. Check that A, C and K are not frozen.
+	 */
+	if (cg_freeze_wait(cgroup[0], false))
+		goto cleanup;
+
+	if (cg_check_frozen(cgroup[2], false))
+		goto cleanup;
+
+	if (cg_check_frozen(cgroup[9], false))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	for (i = 9; i >= 0 && cgroup[i]; i--) {
+		cg_destroy(cgroup[i]);
+		free(cgroup[i]);
+	}
+
+	return ret;
+}
+
+/*
+ * A fork bomb emulator.
+ */
+static int forkbomb_fn(const char *cgroup, void *arg)
+{
+	int ppid;
+
+	fork();
+	fork();
+
+	ppid = getppid();
+
+	while (getppid() == ppid)
+		usleep(1000);
+
+	return getppid() == ppid;
+}
+
+/*
+ * The test runs a fork bomb in a cgroup and tries to freeze it.
+ * Then it kills all processes and checks that cgroup isn't populated
+ * anymore.
+ */
+static int test_cgfreezer_forkbomb(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *cgroup = NULL;
+
+	cgroup = cg_name(root, "cg_forkbomb_test");
+	if (!cgroup)
+		goto cleanup;
+
+	if (cg_create(cgroup))
+		goto cleanup;
+
+	cg_run_nowait(cgroup, forkbomb_fn, NULL);
+
+	usleep(100000);
+
+	if (cg_freeze_wait(cgroup, true))
+		goto cleanup;
+
+	if (cg_killall(cgroup))
+		goto cleanup;
+
+	if (cg_wait_for_proc_count(cgroup, 0))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (cgroup)
+		cg_destroy(cgroup);
+	free(cgroup);
+	return ret;
+}
+
+/*
+ * The test creates a cgroups and freezes it. Then it creates a child cgroup
+ * and populates it with a task. After that it checks that the child cgroup
+ * is frozen and the parent cgroup remains frozen too.
+ */
+static int test_cgfreezer_mkdir(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *parent, *child = NULL;
+	int pid;
+
+	parent = cg_name(root, "cg_test_mkdir_A");
+	if (!parent)
+		goto cleanup;
+
+	child = cg_name(parent, "cg_test_mkdir_B");
+	if (!child)
+		goto cleanup;
+
+	if (cg_create(parent))
+		goto cleanup;
+
+	if (cg_freeze_wait(parent, true))
+		goto cleanup;
+
+	if (cg_create(child))
+		goto cleanup;
+
+	pid = cg_run_nowait(child, child_fn, NULL);
+	if (pid < 0)
+		goto cleanup;
+
+	if (cg_wait_for_proc_count(child, 1))
+		goto cleanup;
+
+	if (cg_check_frozen(child, true))
+		goto cleanup;
+
+	if (cg_check_frozen(parent, true))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (child)
+		cg_destroy(child);
+	free(child);
+	if (parent)
+		cg_destroy(parent);
+	free(parent);
+	return ret;
+}
+
+/*
+ * The test creates two nested cgroups, freezes the parent
+ * and removes the child. Then it checks that the parent cgroup
+ * remains frozen and it's possible to create a new child
+ * without unfreezing. The new child is frozen too.
+ */
+static int test_cgfreezer_rmdir(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *parent, *child = NULL;
+
+	parent = cg_name(root, "cg_test_rmdir_A");
+	if (!parent)
+		goto cleanup;
+
+	child = cg_name(parent, "cg_test_rmdir_B");
+	if (!child)
+		goto cleanup;
+
+	if (cg_create(parent))
+		goto cleanup;
+
+	if (cg_create(child))
+		goto cleanup;
+
+	if (cg_freeze_wait(parent, true))
+		goto cleanup;
+
+	if (cg_destroy(child))
+		goto cleanup;
+
+	if (cg_check_frozen(parent, true))
+		goto cleanup;
+
+	if (cg_create(child))
+		goto cleanup;
+
+	if (cg_check_frozen(child, true))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (child)
+		cg_destroy(child);
+	free(child);
+	if (parent)
+		cg_destroy(parent);
+	free(parent);
+	return ret;
+}
+
+/*
+ * The test creates two cgroups: A and B, runs a process in A
+ * and performs several migrations:
+ * 1) A (running) -> B (frozen)
+ * 2) B (frozen) -> A (running)
+ * 3) A (frozen) -> B (frozen)
+ *
+ * On each step it checks the actual state of both cgroups.
+ */
+static int test_cgfreezer_migrate(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *cgroup[2] = {0};
+	int pid;
+
+	cgroup[0] = cg_name(root, "cg_test_migrate_A");
+	if (!cgroup[0])
+		goto cleanup;
+
+	cgroup[1] = cg_name(root, "cg_test_migrate_B");
+	if (!cgroup[1])
+		goto cleanup;
+
+	if (cg_create(cgroup[0]))
+		goto cleanup;
+
+	if (cg_create(cgroup[1]))
+		goto cleanup;
+
+	pid = cg_run_nowait(cgroup[0], child_fn, NULL);
+	if (pid < 0)
+		goto cleanup;
+
+	if (cg_wait_for_proc_count(cgroup[0], 1))
+		goto cleanup;
+
+	/*
+	 * Migrate from A (running) to B (frozen)
+	 */
+	if (cg_freeze_wait(cgroup[1], true))
+		goto cleanup;
+
+	if (cg_enter_and_wait_for_frozen(cgroup[1], pid, true))
+		goto cleanup;
+
+	if (cg_check_frozen(cgroup[0], false))
+		goto cleanup;
+
+	/*
+	 * Migrate from B (frozen) to A (running)
+	 */
+	if (cg_enter_and_wait_for_frozen(cgroup[0], pid, false))
+		goto cleanup;
+
+	if (cg_check_frozen(cgroup[1], true))
+		goto cleanup;
+
+	/*
+	 * Migrate from A (frozen) to B (frozen)
+	 */
+	if (cg_freeze_wait(cgroup[0], true))
+		goto cleanup;
+
+	if (cg_enter_and_wait_for_frozen(cgroup[1], pid, true))
+		goto cleanup;
+
+	if (cg_check_frozen(cgroup[0], true))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (cgroup[0])
+		cg_destroy(cgroup[0]);
+	free(cgroup[0]);
+	if (cgroup[1])
+		cg_destroy(cgroup[1]);
+	free(cgroup[1]);
+	return ret;
+}
+
+/*
+ * The test checks that ptrace works with a tracing process in a frozen cgroup.
+ */
+static int test_cgfreezer_ptrace(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *cgroup = NULL;
+	siginfo_t siginfo;
+	int pid;
+
+	cgroup = cg_name(root, "cg_test_ptrace");
+	if (!cgroup)
+		goto cleanup;
+
+	if (cg_create(cgroup))
+		goto cleanup;
+
+	pid = cg_run_nowait(cgroup, child_fn, NULL);
+	if (pid < 0)
+		goto cleanup;
+
+	if (cg_wait_for_proc_count(cgroup, 1))
+		goto cleanup;
+
+	if (cg_freeze_wait(cgroup, true))
+		goto cleanup;
+
+	if (ptrace(PTRACE_SEIZE, pid, NULL, NULL))
+		goto cleanup;
+
+	if (ptrace(PTRACE_INTERRUPT, pid, NULL, NULL))
+		goto cleanup;
+
+	waitpid(pid, NULL, 0);
+
+	/*
+	 * Cgroup has to remain frozen, however the test task
+	 * is in traced state.
+	 */
+	if (cg_check_frozen(cgroup, true))
+		goto cleanup;
+
+	if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo))
+		goto cleanup;
+
+	if (ptrace(PTRACE_DETACH, pid, NULL, NULL))
+		goto cleanup;
+
+	if (cg_check_frozen(cgroup, true))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (cgroup)
+		cg_destroy(cgroup);
+	free(cgroup);
+	return ret;
+}
+
+/*
+ * Check if the process is stopped.
+ */
+static int proc_check_stopped(int pid)
+{
+	char buf[PAGE_SIZE];
+	int len;
+
+	len = proc_read_text(pid, 0, "stat", buf, sizeof(buf));
+	if (len == -1) {
+		debug("Can't get %d stat\n", pid);
+		return -1;
+	}
+
+	if (strstr(buf, "(test_freezer) T ") == NULL) {
+		debug("Process %d in the unexpected state: %s\n", pid, buf);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Test that it's possible to freeze a cgroup with a stopped process.
+ */
+static int test_cgfreezer_stopped(const char *root)
+{
+	int pid, ret = KSFT_FAIL;
+	char *cgroup = NULL;
+
+	cgroup = cg_name(root, "cg_test_stopped");
+	if (!cgroup)
+		goto cleanup;
+
+	if (cg_create(cgroup))
+		goto cleanup;
+
+	pid = cg_run_nowait(cgroup, child_fn, NULL);
+
+	if (cg_wait_for_proc_count(cgroup, 1))
+		goto cleanup;
+
+	if (kill(pid, SIGSTOP))
+		goto cleanup;
+
+	if (cg_check_frozen(cgroup, false))
+		goto cleanup;
+
+	if (cg_freeze_wait(cgroup, true))
+		goto cleanup;
+
+	if (cg_freeze_wait(cgroup, false))
+		goto cleanup;
+
+	if (proc_check_stopped(pid))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (cgroup)
+		cg_destroy(cgroup);
+	free(cgroup);
+	return ret;
+}
+
+/*
+ * Test that it's possible to freeze a cgroup with a ptraced process.
+ */
+static int test_cgfreezer_ptraced(const char *root)
+{
+	int pid, ret = KSFT_FAIL;
+	char *cgroup = NULL;
+	siginfo_t siginfo;
+
+	cgroup = cg_name(root, "cg_test_ptraced");
+	if (!cgroup)
+		goto cleanup;
+
+	if (cg_create(cgroup))
+		goto cleanup;
+
+	pid = cg_run_nowait(cgroup, child_fn, NULL);
+
+	if (cg_wait_for_proc_count(cgroup, 1))
+		goto cleanup;
+
+	if (ptrace(PTRACE_SEIZE, pid, NULL, NULL))
+		goto cleanup;
+
+	if (ptrace(PTRACE_INTERRUPT, pid, NULL, NULL))
+		goto cleanup;
+
+	waitpid(pid, NULL, 0);
+
+	if (cg_check_frozen(cgroup, false))
+		goto cleanup;
+
+	if (cg_freeze_wait(cgroup, true))
+		goto cleanup;
+
+	/*
+	 * cg_check_frozen(cgroup, true) will fail here,
+	 * because the task in in the TRACEd state.
+	 */
+	if (cg_freeze_wait(cgroup, false))
+		goto cleanup;
+
+	if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo))
+		goto cleanup;
+
+	if (ptrace(PTRACE_DETACH, pid, NULL, NULL))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (cgroup)
+		cg_destroy(cgroup);
+	free(cgroup);
+	return ret;
+}
+
+static int vfork_fn(const char *cgroup, void *arg)
+{
+	int pid = vfork();
+
+	if (pid == 0)
+		while (true)
+			sleep(1);
+
+	return pid;
+}
+
+/*
+ * Test that it's possible to freeze a cgroup with a process,
+ * which called vfork() and is waiting for a child.
+ */
+static int test_cgfreezer_vfork(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *cgroup = NULL;
+
+	cgroup = cg_name(root, "cg_test_vfork");
+	if (!cgroup)
+		goto cleanup;
+
+	if (cg_create(cgroup))
+		goto cleanup;
+
+	cg_run_nowait(cgroup, vfork_fn, NULL);
+
+	if (cg_wait_for_proc_count(cgroup, 2))
+		goto cleanup;
+
+	if (cg_freeze_wait(cgroup, true))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (cgroup)
+		cg_destroy(cgroup);
+	free(cgroup);
+	return ret;
+}
+
+#define T(x) { x, #x }
+struct cgfreezer_test {
+	int (*fn)(const char *root);
+	const char *name;
+} tests[] = {
+	T(test_cgfreezer_simple),
+	T(test_cgfreezer_tree),
+	T(test_cgfreezer_forkbomb),
+	T(test_cgfreezer_mkdir),
+	T(test_cgfreezer_rmdir),
+	T(test_cgfreezer_migrate),
+	T(test_cgfreezer_ptrace),
+	T(test_cgfreezer_stopped),
+	T(test_cgfreezer_ptraced),
+	T(test_cgfreezer_vfork),
+};
+#undef T
+
+int main(int argc, char *argv[])
+{
+	char root[PATH_MAX];
+	int i, ret = EXIT_SUCCESS;
+
+	if (cg_find_unified_root(root, sizeof(root)))
+		ksft_exit_skip("cgroup v2 isn't mounted\n");
+	for (i = 0; i < ARRAY_SIZE(tests); i++) {
+		switch (tests[i].fn(root)) {
+		case KSFT_PASS:
+			ksft_test_result_pass("%s\n", tests[i].name);
+			break;
+		case KSFT_SKIP:
+			ksft_test_result_skip("%s\n", tests[i].name);
+			break;
+		default:
+			ret = EXIT_FAILURE;
+			ksft_test_result_fail("%s\n", tests[i].name);
+			break;
+		}
+	}
+
+	return ret;
+}
diff --git a/tools/testing/selftests/cgroup/test_kill.c b/tools/testing/selftests/cgroup/test_kill.c
new file mode 100644
index 000000000..615369031
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_kill.c
@@ -0,0 +1,297 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <errno.h>
+#include <linux/limits.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+#include "../pidfd/pidfd.h"
+#include "cgroup_util.h"
+
+/*
+ * Kill the given cgroup and wait for the inotify signal.
+ * If there are no events in 10 seconds, treat this as an error.
+ * Then check that the cgroup is in the desired state.
+ */
+static int cg_kill_wait(const char *cgroup)
+{
+	int fd, ret = -1;
+
+	fd = cg_prepare_for_wait(cgroup);
+	if (fd < 0)
+		return fd;
+
+	ret = cg_write(cgroup, "cgroup.kill", "1");
+	if (ret)
+		goto out;
+
+	ret = cg_wait_for(fd);
+	if (ret)
+		goto out;
+
+out:
+	close(fd);
+	return ret;
+}
+
+/*
+ * A simple process running in a sleep loop until being
+ * re-parented.
+ */
+static int child_fn(const char *cgroup, void *arg)
+{
+	int ppid = getppid();
+
+	while (getppid() == ppid)
+		usleep(1000);
+
+	return getppid() == ppid;
+}
+
+static int test_cgkill_simple(const char *root)
+{
+	pid_t pids[100];
+	int ret = KSFT_FAIL;
+	char *cgroup = NULL;
+	int i;
+
+	cgroup = cg_name(root, "cg_test_simple");
+	if (!cgroup)
+		goto cleanup;
+
+	if (cg_create(cgroup))
+		goto cleanup;
+
+	for (i = 0; i < 100; i++)
+		pids[i] = cg_run_nowait(cgroup, child_fn, NULL);
+
+	if (cg_wait_for_proc_count(cgroup, 100))
+		goto cleanup;
+
+	if (cg_read_strcmp(cgroup, "cgroup.events", "populated 1\n"))
+		goto cleanup;
+
+	if (cg_kill_wait(cgroup))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	for (i = 0; i < 100; i++)
+		wait_for_pid(pids[i]);
+
+	if (ret == KSFT_PASS &&
+	    cg_read_strcmp(cgroup, "cgroup.events", "populated 0\n"))
+		ret = KSFT_FAIL;
+
+	if (cgroup)
+		cg_destroy(cgroup);
+	free(cgroup);
+	return ret;
+}
+
+/*
+ * The test creates the following hierarchy:
+ *       A
+ *    / / \ \
+ *   B  E  I K
+ *  /\  |
+ * C  D F
+ *      |
+ *      G
+ *      |
+ *      H
+ *
+ * with a process in C, H and 3 processes in K.
+ * Then it tries to kill the whole tree.
+ */
+static int test_cgkill_tree(const char *root)
+{
+	pid_t pids[5];
+	char *cgroup[10] = {0};
+	int ret = KSFT_FAIL;
+	int i;
+
+	cgroup[0] = cg_name(root, "cg_test_tree_A");
+	if (!cgroup[0])
+		goto cleanup;
+
+	cgroup[1] = cg_name(cgroup[0], "B");
+	if (!cgroup[1])
+		goto cleanup;
+
+	cgroup[2] = cg_name(cgroup[1], "C");
+	if (!cgroup[2])
+		goto cleanup;
+
+	cgroup[3] = cg_name(cgroup[1], "D");
+	if (!cgroup[3])
+		goto cleanup;
+
+	cgroup[4] = cg_name(cgroup[0], "E");
+	if (!cgroup[4])
+		goto cleanup;
+
+	cgroup[5] = cg_name(cgroup[4], "F");
+	if (!cgroup[5])
+		goto cleanup;
+
+	cgroup[6] = cg_name(cgroup[5], "G");
+	if (!cgroup[6])
+		goto cleanup;
+
+	cgroup[7] = cg_name(cgroup[6], "H");
+	if (!cgroup[7])
+		goto cleanup;
+
+	cgroup[8] = cg_name(cgroup[0], "I");
+	if (!cgroup[8])
+		goto cleanup;
+
+	cgroup[9] = cg_name(cgroup[0], "K");
+	if (!cgroup[9])
+		goto cleanup;
+
+	for (i = 0; i < 10; i++)
+		if (cg_create(cgroup[i]))
+			goto cleanup;
+
+	pids[0] = cg_run_nowait(cgroup[2], child_fn, NULL);
+	pids[1] = cg_run_nowait(cgroup[7], child_fn, NULL);
+	pids[2] = cg_run_nowait(cgroup[9], child_fn, NULL);
+	pids[3] = cg_run_nowait(cgroup[9], child_fn, NULL);
+	pids[4] = cg_run_nowait(cgroup[9], child_fn, NULL);
+
+	/*
+	 * Wait until all child processes will enter
+	 * corresponding cgroups.
+	 */
+
+	if (cg_wait_for_proc_count(cgroup[2], 1) ||
+	    cg_wait_for_proc_count(cgroup[7], 1) ||
+	    cg_wait_for_proc_count(cgroup[9], 3))
+		goto cleanup;
+
+	/*
+	 * Kill A and check that we get an empty notification.
+	 */
+	if (cg_kill_wait(cgroup[0]))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	for (i = 0; i < 5; i++)
+		wait_for_pid(pids[i]);
+
+	if (ret == KSFT_PASS &&
+	    cg_read_strcmp(cgroup[0], "cgroup.events", "populated 0\n"))
+		ret = KSFT_FAIL;
+
+	for (i = 9; i >= 0 && cgroup[i]; i--) {
+		cg_destroy(cgroup[i]);
+		free(cgroup[i]);
+	}
+
+	return ret;
+}
+
+static int forkbomb_fn(const char *cgroup, void *arg)
+{
+	int ppid;
+
+	fork();
+	fork();
+
+	ppid = getppid();
+
+	while (getppid() == ppid)
+		usleep(1000);
+
+	return getppid() == ppid;
+}
+
+/*
+ * The test runs a fork bomb in a cgroup and tries to kill it.
+ */
+static int test_cgkill_forkbomb(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *cgroup = NULL;
+	pid_t pid = -ESRCH;
+
+	cgroup = cg_name(root, "cg_forkbomb_test");
+	if (!cgroup)
+		goto cleanup;
+
+	if (cg_create(cgroup))
+		goto cleanup;
+
+	pid = cg_run_nowait(cgroup, forkbomb_fn, NULL);
+	if (pid < 0)
+		goto cleanup;
+
+	usleep(100000);
+
+	if (cg_kill_wait(cgroup))
+		goto cleanup;
+
+	if (cg_wait_for_proc_count(cgroup, 0))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (pid > 0)
+		wait_for_pid(pid);
+
+	if (ret == KSFT_PASS &&
+	    cg_read_strcmp(cgroup, "cgroup.events", "populated 0\n"))
+		ret = KSFT_FAIL;
+
+	if (cgroup)
+		cg_destroy(cgroup);
+	free(cgroup);
+	return ret;
+}
+
+#define T(x) { x, #x }
+struct cgkill_test {
+	int (*fn)(const char *root);
+	const char *name;
+} tests[] = {
+	T(test_cgkill_simple),
+	T(test_cgkill_tree),
+	T(test_cgkill_forkbomb),
+};
+#undef T
+
+int main(int argc, char *argv[])
+{
+	char root[PATH_MAX];
+	int i, ret = EXIT_SUCCESS;
+
+	if (cg_find_unified_root(root, sizeof(root)))
+		ksft_exit_skip("cgroup v2 isn't mounted\n");
+	for (i = 0; i < ARRAY_SIZE(tests); i++) {
+		switch (tests[i].fn(root)) {
+		case KSFT_PASS:
+			ksft_test_result_pass("%s\n", tests[i].name);
+			break;
+		case KSFT_SKIP:
+			ksft_test_result_skip("%s\n", tests[i].name);
+			break;
+		default:
+			ret = EXIT_FAILURE;
+			ksft_test_result_fail("%s\n", tests[i].name);
+			break;
+		}
+	}
+
+	return ret;
+}
diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c
new file mode 100644
index 000000000..258ddc565
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_kmem.c
@@ -0,0 +1,456 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <linux/limits.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <errno.h>
+#include <sys/sysinfo.h>
+#include <pthread.h>
+
+#include "../kselftest.h"
+#include "cgroup_util.h"
+
+
+/*
+ * Memory cgroup charging is performed using percpu batches 64 pages
+ * big (look at MEMCG_CHARGE_BATCH), whereas memory.stat is exact. So
+ * the maximum discrepancy between charge and vmstat entries is number
+ * of cpus multiplied by 64 pages.
+ */
+#define MAX_VMSTAT_ERROR (4096 * 64 * get_nprocs())
+
+
+static int alloc_dcache(const char *cgroup, void *arg)
+{
+	unsigned long i;
+	struct stat st;
+	char buf[128];
+
+	for (i = 0; i < (unsigned long)arg; i++) {
+		snprintf(buf, sizeof(buf),
+			"/something-non-existent-with-a-long-name-%64lu-%d",
+			 i, getpid());
+		stat(buf, &st);
+	}
+
+	return 0;
+}
+
+/*
+ * This test allocates 100000 of negative dentries with long names.
+ * Then it checks that "slab" in memory.stat is larger than 1M.
+ * Then it sets memory.high to 1M and checks that at least 1/2
+ * of slab memory has been reclaimed.
+ */
+static int test_kmem_basic(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *cg = NULL;
+	long slab0, slab1, current;
+
+	cg = cg_name(root, "kmem_basic_test");
+	if (!cg)
+		goto cleanup;
+
+	if (cg_create(cg))
+		goto cleanup;
+
+	if (cg_run(cg, alloc_dcache, (void *)100000))
+		goto cleanup;
+
+	slab0 = cg_read_key_long(cg, "memory.stat", "slab ");
+	if (slab0 < (1 << 20))
+		goto cleanup;
+
+	cg_write(cg, "memory.high", "1M");
+	slab1 = cg_read_key_long(cg, "memory.stat", "slab ");
+	if (slab1 <= 0)
+		goto cleanup;
+
+	current = cg_read_long(cg, "memory.current");
+	if (current <= 0)
+		goto cleanup;
+
+	if (slab1 < slab0 / 2 && current < slab0 / 2)
+		ret = KSFT_PASS;
+cleanup:
+	cg_destroy(cg);
+	free(cg);
+
+	return ret;
+}
+
+static void *alloc_kmem_fn(void *arg)
+{
+	alloc_dcache(NULL, (void *)100);
+	return NULL;
+}
+
+static int alloc_kmem_smp(const char *cgroup, void *arg)
+{
+	int nr_threads = 2 * get_nprocs();
+	pthread_t *tinfo;
+	unsigned long i;
+	int ret = -1;
+
+	tinfo = calloc(nr_threads, sizeof(pthread_t));
+	if (tinfo == NULL)
+		return -1;
+
+	for (i = 0; i < nr_threads; i++) {
+		if (pthread_create(&tinfo[i], NULL, &alloc_kmem_fn,
+				   (void *)i)) {
+			free(tinfo);
+			return -1;
+		}
+	}
+
+	for (i = 0; i < nr_threads; i++) {
+		ret = pthread_join(tinfo[i], NULL);
+		if (ret)
+			break;
+	}
+
+	free(tinfo);
+	return ret;
+}
+
+static int cg_run_in_subcgroups(const char *parent,
+				int (*fn)(const char *cgroup, void *arg),
+				void *arg, int times)
+{
+	char *child;
+	int i;
+
+	for (i = 0; i < times; i++) {
+		child = cg_name_indexed(parent, "child", i);
+		if (!child)
+			return -1;
+
+		if (cg_create(child)) {
+			cg_destroy(child);
+			free(child);
+			return -1;
+		}
+
+		if (cg_run(child, fn, NULL)) {
+			cg_destroy(child);
+			free(child);
+			return -1;
+		}
+
+		cg_destroy(child);
+		free(child);
+	}
+
+	return 0;
+}
+
+/*
+ * The test creates and destroys a large number of cgroups. In each cgroup it
+ * allocates some slab memory (mostly negative dentries) using 2 * NR_CPUS
+ * threads. Then it checks the sanity of numbers on the parent level:
+ * the total size of the cgroups should be roughly equal to
+ * anon + file + slab + kernel_stack.
+ */
+static int test_kmem_memcg_deletion(const char *root)
+{
+	long current, slab, anon, file, kernel_stack, pagetables, percpu, sock, sum;
+	int ret = KSFT_FAIL;
+	char *parent;
+
+	parent = cg_name(root, "kmem_memcg_deletion_test");
+	if (!parent)
+		goto cleanup;
+
+	if (cg_create(parent))
+		goto cleanup;
+
+	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
+		goto cleanup;
+
+	if (cg_run_in_subcgroups(parent, alloc_kmem_smp, NULL, 100))
+		goto cleanup;
+
+	current = cg_read_long(parent, "memory.current");
+	slab = cg_read_key_long(parent, "memory.stat", "slab ");
+	anon = cg_read_key_long(parent, "memory.stat", "anon ");
+	file = cg_read_key_long(parent, "memory.stat", "file ");
+	kernel_stack = cg_read_key_long(parent, "memory.stat", "kernel_stack ");
+	pagetables = cg_read_key_long(parent, "memory.stat", "pagetables ");
+	percpu = cg_read_key_long(parent, "memory.stat", "percpu ");
+	sock = cg_read_key_long(parent, "memory.stat", "sock ");
+	if (current < 0 || slab < 0 || anon < 0 || file < 0 ||
+	    kernel_stack < 0 || pagetables < 0 || percpu < 0 || sock < 0)
+		goto cleanup;
+
+	sum = slab + anon + file + kernel_stack + pagetables + percpu + sock;
+	if (abs(sum - current) < MAX_VMSTAT_ERROR) {
+		ret = KSFT_PASS;
+	} else {
+		printf("memory.current = %ld\n", current);
+		printf("slab + anon + file + kernel_stack = %ld\n", sum);
+		printf("slab = %ld\n", slab);
+		printf("anon = %ld\n", anon);
+		printf("file = %ld\n", file);
+		printf("kernel_stack = %ld\n", kernel_stack);
+		printf("pagetables = %ld\n", pagetables);
+		printf("percpu = %ld\n", percpu);
+		printf("sock = %ld\n", sock);
+	}
+
+cleanup:
+	cg_destroy(parent);
+	free(parent);
+
+	return ret;
+}
+
+/*
+ * The test reads the entire /proc/kpagecgroup. If the operation went
+ * successfully (and the kernel didn't panic), the test is treated as passed.
+ */
+static int test_kmem_proc_kpagecgroup(const char *root)
+{
+	unsigned long buf[128];
+	int ret = KSFT_FAIL;
+	ssize_t len;
+	int fd;
+
+	fd = open("/proc/kpagecgroup", O_RDONLY);
+	if (fd < 0)
+		return ret;
+
+	do {
+		len = read(fd, buf, sizeof(buf));
+	} while (len > 0);
+
+	if (len == 0)
+		ret = KSFT_PASS;
+
+	close(fd);
+	return ret;
+}
+
+static void *pthread_wait_fn(void *arg)
+{
+	sleep(100);
+	return NULL;
+}
+
+static int spawn_1000_threads(const char *cgroup, void *arg)
+{
+	int nr_threads = 1000;
+	pthread_t *tinfo;
+	unsigned long i;
+	long stack;
+	int ret = -1;
+
+	tinfo = calloc(nr_threads, sizeof(pthread_t));
+	if (tinfo == NULL)
+		return -1;
+
+	for (i = 0; i < nr_threads; i++) {
+		if (pthread_create(&tinfo[i], NULL, &pthread_wait_fn,
+				   (void *)i)) {
+			free(tinfo);
+			return(-1);
+		}
+	}
+
+	stack = cg_read_key_long(cgroup, "memory.stat", "kernel_stack ");
+	if (stack >= 4096 * 1000)
+		ret = 0;
+
+	free(tinfo);
+	return ret;
+}
+
+/*
+ * The test spawns a process, which spawns 1000 threads. Then it checks
+ * that memory.stat's kernel_stack is at least 1000 pages large.
+ */
+static int test_kmem_kernel_stacks(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *cg = NULL;
+
+	cg = cg_name(root, "kmem_kernel_stacks_test");
+	if (!cg)
+		goto cleanup;
+
+	if (cg_create(cg))
+		goto cleanup;
+
+	if (cg_run(cg, spawn_1000_threads, NULL))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+cleanup:
+	cg_destroy(cg);
+	free(cg);
+
+	return ret;
+}
+
+/*
+ * This test sequentionally creates 30 child cgroups, allocates some
+ * kernel memory in each of them, and deletes them. Then it checks
+ * that the number of dying cgroups on the parent level is 0.
+ */
+static int test_kmem_dead_cgroups(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *parent;
+	long dead;
+	int i;
+
+	parent = cg_name(root, "kmem_dead_cgroups_test");
+	if (!parent)
+		goto cleanup;
+
+	if (cg_create(parent))
+		goto cleanup;
+
+	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
+		goto cleanup;
+
+	if (cg_run_in_subcgroups(parent, alloc_dcache, (void *)100, 30))
+		goto cleanup;
+
+	for (i = 0; i < 5; i++) {
+		dead = cg_read_key_long(parent, "cgroup.stat",
+					"nr_dying_descendants ");
+		if (dead == 0) {
+			ret = KSFT_PASS;
+			break;
+		}
+		/*
+		 * Reclaiming cgroups might take some time,
+		 * let's wait a bit and repeat.
+		 */
+		sleep(1);
+	}
+
+cleanup:
+	cg_destroy(parent);
+	free(parent);
+
+	return ret;
+}
+
+/*
+ * This test creates a sub-tree with 1000 memory cgroups.
+ * Then it checks that the memory.current on the parent level
+ * is greater than 0 and approximates matches the percpu value
+ * from memory.stat.
+ */
+static int test_percpu_basic(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *parent, *child;
+	long current, percpu;
+	int i;
+
+	parent = cg_name(root, "percpu_basic_test");
+	if (!parent)
+		goto cleanup;
+
+	if (cg_create(parent))
+		goto cleanup;
+
+	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
+		goto cleanup;
+
+	for (i = 0; i < 1000; i++) {
+		child = cg_name_indexed(parent, "child", i);
+		if (!child)
+			return -1;
+
+		if (cg_create(child))
+			goto cleanup_children;
+
+		free(child);
+	}
+
+	current = cg_read_long(parent, "memory.current");
+	percpu = cg_read_key_long(parent, "memory.stat", "percpu ");
+
+	if (current > 0 && percpu > 0 && abs(current - percpu) <
+	    MAX_VMSTAT_ERROR)
+		ret = KSFT_PASS;
+	else
+		printf("memory.current %ld\npercpu %ld\n",
+		       current, percpu);
+
+cleanup_children:
+	for (i = 0; i < 1000; i++) {
+		child = cg_name_indexed(parent, "child", i);
+		cg_destroy(child);
+		free(child);
+	}
+
+cleanup:
+	cg_destroy(parent);
+	free(parent);
+
+	return ret;
+}
+
+#define T(x) { x, #x }
+struct kmem_test {
+	int (*fn)(const char *root);
+	const char *name;
+} tests[] = {
+	T(test_kmem_basic),
+	T(test_kmem_memcg_deletion),
+	T(test_kmem_proc_kpagecgroup),
+	T(test_kmem_kernel_stacks),
+	T(test_kmem_dead_cgroups),
+	T(test_percpu_basic),
+};
+#undef T
+
+int main(int argc, char **argv)
+{
+	char root[PATH_MAX];
+	int i, ret = EXIT_SUCCESS;
+
+	if (cg_find_unified_root(root, sizeof(root)))
+		ksft_exit_skip("cgroup v2 isn't mounted\n");
+
+	/*
+	 * Check that memory controller is available:
+	 * memory is listed in cgroup.controllers
+	 */
+	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
+		ksft_exit_skip("memory controller isn't available\n");
+
+	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
+		if (cg_write(root, "cgroup.subtree_control", "+memory"))
+			ksft_exit_skip("Failed to set memory controller\n");
+
+	for (i = 0; i < ARRAY_SIZE(tests); i++) {
+		switch (tests[i].fn(root)) {
+		case KSFT_PASS:
+			ksft_test_result_pass("%s\n", tests[i].name);
+			break;
+		case KSFT_SKIP:
+			ksft_test_result_skip("%s\n", tests[i].name);
+			break;
+		default:
+			ret = EXIT_FAILURE;
+			ksft_test_result_fail("%s\n", tests[i].name);
+			break;
+		}
+	}
+
+	return ret;
+}
diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
new file mode 100644
index 000000000..5a526a8e7
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_memcontrol.c
@@ -0,0 +1,1335 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#define _GNU_SOURCE
+
+#include <linux/limits.h>
+#include <linux/oom.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <sys/wait.h>
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <errno.h>
+#include <sys/mman.h>
+
+#include "../kselftest.h"
+#include "cgroup_util.h"
+
+static bool has_localevents;
+static bool has_recursiveprot;
+
+/*
+ * This test creates two nested cgroups with and without enabling
+ * the memory controller.
+ */
+static int test_memcg_subtree_control(const char *root)
+{
+	char *parent, *child, *parent2 = NULL, *child2 = NULL;
+	int ret = KSFT_FAIL;
+	char buf[PAGE_SIZE];
+
+	/* Create two nested cgroups with the memory controller enabled */
+	parent = cg_name(root, "memcg_test_0");
+	child = cg_name(root, "memcg_test_0/memcg_test_1");
+	if (!parent || !child)
+		goto cleanup_free;
+
+	if (cg_create(parent))
+		goto cleanup_free;
+
+	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
+		goto cleanup_parent;
+
+	if (cg_create(child))
+		goto cleanup_parent;
+
+	if (cg_read_strstr(child, "cgroup.controllers", "memory"))
+		goto cleanup_child;
+
+	/* Create two nested cgroups without enabling memory controller */
+	parent2 = cg_name(root, "memcg_test_1");
+	child2 = cg_name(root, "memcg_test_1/memcg_test_1");
+	if (!parent2 || !child2)
+		goto cleanup_free2;
+
+	if (cg_create(parent2))
+		goto cleanup_free2;
+
+	if (cg_create(child2))
+		goto cleanup_parent2;
+
+	if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
+		goto cleanup_all;
+
+	if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
+		goto cleanup_all;
+
+	ret = KSFT_PASS;
+
+cleanup_all:
+	cg_destroy(child2);
+cleanup_parent2:
+	cg_destroy(parent2);
+cleanup_free2:
+	free(parent2);
+	free(child2);
+cleanup_child:
+	cg_destroy(child);
+cleanup_parent:
+	cg_destroy(parent);
+cleanup_free:
+	free(parent);
+	free(child);
+
+	return ret;
+}
+
+static int alloc_anon_50M_check(const char *cgroup, void *arg)
+{
+	size_t size = MB(50);
+	char *buf, *ptr;
+	long anon, current;
+	int ret = -1;
+
+	buf = malloc(size);
+	if (buf == NULL) {
+		fprintf(stderr, "malloc() failed\n");
+		return -1;
+	}
+
+	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
+		*ptr = 0;
+
+	current = cg_read_long(cgroup, "memory.current");
+	if (current < size)
+		goto cleanup;
+
+	if (!values_close(size, current, 3))
+		goto cleanup;
+
+	anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
+	if (anon < 0)
+		goto cleanup;
+
+	if (!values_close(anon, current, 3))
+		goto cleanup;
+
+	ret = 0;
+cleanup:
+	free(buf);
+	return ret;
+}
+
+static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
+{
+	size_t size = MB(50);
+	int ret = -1;
+	long current, file;
+	int fd;
+
+	fd = get_temp_fd();
+	if (fd < 0)
+		return -1;
+
+	if (alloc_pagecache(fd, size))
+		goto cleanup;
+
+	current = cg_read_long(cgroup, "memory.current");
+	if (current < size)
+		goto cleanup;
+
+	file = cg_read_key_long(cgroup, "memory.stat", "file ");
+	if (file < 0)
+		goto cleanup;
+
+	if (!values_close(file, current, 10))
+		goto cleanup;
+
+	ret = 0;
+
+cleanup:
+	close(fd);
+	return ret;
+}
+
+/*
+ * This test create a memory cgroup, allocates
+ * some anonymous memory and some pagecache
+ * and check memory.current and some memory.stat values.
+ */
+static int test_memcg_current(const char *root)
+{
+	int ret = KSFT_FAIL;
+	long current;
+	char *memcg;
+
+	memcg = cg_name(root, "memcg_test");
+	if (!memcg)
+		goto cleanup;
+
+	if (cg_create(memcg))
+		goto cleanup;
+
+	current = cg_read_long(memcg, "memory.current");
+	if (current != 0)
+		goto cleanup;
+
+	if (cg_run(memcg, alloc_anon_50M_check, NULL))
+		goto cleanup;
+
+	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	cg_destroy(memcg);
+	free(memcg);
+
+	return ret;
+}
+
+static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
+{
+	int fd = (long)arg;
+	int ppid = getppid();
+
+	if (alloc_pagecache(fd, MB(50)))
+		return -1;
+
+	while (getppid() == ppid)
+		sleep(1);
+
+	return 0;
+}
+
+static int alloc_anon_noexit(const char *cgroup, void *arg)
+{
+	int ppid = getppid();
+	size_t size = (unsigned long)arg;
+	char *buf, *ptr;
+
+	buf = malloc(size);
+	if (buf == NULL) {
+		fprintf(stderr, "malloc() failed\n");
+		return -1;
+	}
+
+	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
+		*ptr = 0;
+
+	while (getppid() == ppid)
+		sleep(1);
+
+	free(buf);
+	return 0;
+}
+
+/*
+ * Wait until processes are killed asynchronously by the OOM killer
+ * If we exceed a timeout, fail.
+ */
+static int cg_test_proc_killed(const char *cgroup)
+{
+	int limit;
+
+	for (limit = 10; limit > 0; limit--) {
+		if (cg_read_strcmp(cgroup, "cgroup.procs", "") == 0)
+			return 0;
+
+		usleep(100000);
+	}
+	return -1;
+}
+
+/*
+ * First, this test creates the following hierarchy:
+ * A       memory.min = 0,    memory.max = 200M
+ * A/B     memory.min = 50M
+ * A/B/C   memory.min = 75M,  memory.current = 50M
+ * A/B/D   memory.min = 25M,  memory.current = 50M
+ * A/B/E   memory.min = 0,    memory.current = 50M
+ * A/B/F   memory.min = 500M, memory.current = 0
+ *
+ * (or memory.low if we test soft protection)
+ *
+ * Usages are pagecache and the test keeps a running
+ * process in every leaf cgroup.
+ * Then it creates A/G and creates a significant
+ * memory pressure in A.
+ *
+ * Then it checks actual memory usages and expects that:
+ * A/B    memory.current ~= 50M
+ * A/B/C  memory.current ~= 29M
+ * A/B/D  memory.current ~= 21M
+ * A/B/E  memory.current ~= 0
+ * A/B/F  memory.current  = 0
+ * (for origin of the numbers, see model in memcg_protection.m.)
+ *
+ * After that it tries to allocate more than there is
+ * unprotected memory in A available, and checks that:
+ * a) memory.min protects pagecache even in this case,
+ * b) memory.low allows reclaiming page cache with low events.
+ */
+static int test_memcg_protection(const char *root, bool min)
+{
+	int ret = KSFT_FAIL, rc;
+	char *parent[3] = {NULL};
+	char *children[4] = {NULL};
+	const char *attribute = min ? "memory.min" : "memory.low";
+	long c[4];
+	long current;
+	int i, attempts;
+	int fd;
+
+	fd = get_temp_fd();
+	if (fd < 0)
+		goto cleanup;
+
+	parent[0] = cg_name(root, "memcg_test_0");
+	if (!parent[0])
+		goto cleanup;
+
+	parent[1] = cg_name(parent[0], "memcg_test_1");
+	if (!parent[1])
+		goto cleanup;
+
+	parent[2] = cg_name(parent[0], "memcg_test_2");
+	if (!parent[2])
+		goto cleanup;
+
+	if (cg_create(parent[0]))
+		goto cleanup;
+
+	if (cg_read_long(parent[0], attribute)) {
+		/* No memory.min on older kernels is fine */
+		if (min)
+			ret = KSFT_SKIP;
+		goto cleanup;
+	}
+
+	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
+		goto cleanup;
+
+	if (cg_write(parent[0], "memory.max", "200M"))
+		goto cleanup;
+
+	if (cg_write(parent[0], "memory.swap.max", "0"))
+		goto cleanup;
+
+	if (cg_create(parent[1]))
+		goto cleanup;
+
+	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
+		goto cleanup;
+
+	if (cg_create(parent[2]))
+		goto cleanup;
+
+	for (i = 0; i < ARRAY_SIZE(children); i++) {
+		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
+		if (!children[i])
+			goto cleanup;
+
+		if (cg_create(children[i]))
+			goto cleanup;
+
+		if (i > 2)
+			continue;
+
+		cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
+			      (void *)(long)fd);
+	}
+
+	if (cg_write(parent[1],   attribute, "50M"))
+		goto cleanup;
+	if (cg_write(children[0], attribute, "75M"))
+		goto cleanup;
+	if (cg_write(children[1], attribute, "25M"))
+		goto cleanup;
+	if (cg_write(children[2], attribute, "0"))
+		goto cleanup;
+	if (cg_write(children[3], attribute, "500M"))
+		goto cleanup;
+
+	attempts = 0;
+	while (!values_close(cg_read_long(parent[1], "memory.current"),
+			     MB(150), 3)) {
+		if (attempts++ > 5)
+			break;
+		sleep(1);
+	}
+
+	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
+		goto cleanup;
+
+	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
+		goto cleanup;
+
+	for (i = 0; i < ARRAY_SIZE(children); i++)
+		c[i] = cg_read_long(children[i], "memory.current");
+
+	if (!values_close(c[0], MB(29), 10))
+		goto cleanup;
+
+	if (!values_close(c[1], MB(21), 10))
+		goto cleanup;
+
+	if (c[3] != 0)
+		goto cleanup;
+
+	rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
+	if (min && !rc)
+		goto cleanup;
+	else if (!min && rc) {
+		fprintf(stderr,
+			"memory.low prevents from allocating anon memory\n");
+		goto cleanup;
+	}
+
+	current = min ? MB(50) : MB(30);
+	if (!values_close(cg_read_long(parent[1], "memory.current"), current, 3))
+		goto cleanup;
+
+	if (min) {
+		ret = KSFT_PASS;
+		goto cleanup;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(children); i++) {
+		int no_low_events_index = 1;
+		long low, oom;
+
+		oom = cg_read_key_long(children[i], "memory.events", "oom ");
+		low = cg_read_key_long(children[i], "memory.events", "low ");
+
+		if (oom)
+			goto cleanup;
+		if (i <= no_low_events_index && low <= 0)
+			goto cleanup;
+		if (i > no_low_events_index && low)
+			goto cleanup;
+
+	}
+
+	ret = KSFT_PASS;
+
+cleanup:
+	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
+		if (!children[i])
+			continue;
+
+		cg_destroy(children[i]);
+		free(children[i]);
+	}
+
+	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
+		if (!parent[i])
+			continue;
+
+		cg_destroy(parent[i]);
+		free(parent[i]);
+	}
+	close(fd);
+	return ret;
+}
+
+static int test_memcg_min(const char *root)
+{
+	return test_memcg_protection(root, true);
+}
+
+static int test_memcg_low(const char *root)
+{
+	return test_memcg_protection(root, false);
+}
+
+static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
+{
+	size_t size = MB(50);
+	int ret = -1;
+	long current, high, max;
+	int fd;
+
+	high = cg_read_long(cgroup, "memory.high");
+	max = cg_read_long(cgroup, "memory.max");
+	if (high != MB(30) && max != MB(30))
+		return -1;
+
+	fd = get_temp_fd();
+	if (fd < 0)
+		return -1;
+
+	if (alloc_pagecache(fd, size))
+		goto cleanup;
+
+	current = cg_read_long(cgroup, "memory.current");
+	if (!values_close(current, MB(30), 5))
+		goto cleanup;
+
+	ret = 0;
+
+cleanup:
+	close(fd);
+	return ret;
+
+}
+
+/*
+ * This test checks that memory.high limits the amount of
+ * memory which can be consumed by either anonymous memory
+ * or pagecache.
+ */
+static int test_memcg_high(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *memcg;
+	long high;
+
+	memcg = cg_name(root, "memcg_test");
+	if (!memcg)
+		goto cleanup;
+
+	if (cg_create(memcg))
+		goto cleanup;
+
+	if (cg_read_strcmp(memcg, "memory.high", "max\n"))
+		goto cleanup;
+
+	if (cg_write(memcg, "memory.swap.max", "0"))
+		goto cleanup;
+
+	if (cg_write(memcg, "memory.high", "30M"))
+		goto cleanup;
+
+	if (cg_run(memcg, alloc_anon, (void *)MB(31)))
+		goto cleanup;
+
+	if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
+		goto cleanup;
+
+	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
+		goto cleanup;
+
+	high = cg_read_key_long(memcg, "memory.events", "high ");
+	if (high <= 0)
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	cg_destroy(memcg);
+	free(memcg);
+
+	return ret;
+}
+
+static int alloc_anon_mlock(const char *cgroup, void *arg)
+{
+	size_t size = (size_t)arg;
+	void *buf;
+
+	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
+		   0, 0);
+	if (buf == MAP_FAILED)
+		return -1;
+
+	mlock(buf, size);
+	munmap(buf, size);
+	return 0;
+}
+
+/*
+ * This test checks that memory.high is able to throttle big single shot
+ * allocation i.e. large allocation within one kernel entry.
+ */
+static int test_memcg_high_sync(const char *root)
+{
+	int ret = KSFT_FAIL, pid, fd = -1;
+	char *memcg;
+	long pre_high, pre_max;
+	long post_high, post_max;
+
+	memcg = cg_name(root, "memcg_test");
+	if (!memcg)
+		goto cleanup;
+
+	if (cg_create(memcg))
+		goto cleanup;
+
+	pre_high = cg_read_key_long(memcg, "memory.events", "high ");
+	pre_max = cg_read_key_long(memcg, "memory.events", "max ");
+	if (pre_high < 0 || pre_max < 0)
+		goto cleanup;
+
+	if (cg_write(memcg, "memory.swap.max", "0"))
+		goto cleanup;
+
+	if (cg_write(memcg, "memory.high", "30M"))
+		goto cleanup;
+
+	if (cg_write(memcg, "memory.max", "140M"))
+		goto cleanup;
+
+	fd = memcg_prepare_for_wait(memcg);
+	if (fd < 0)
+		goto cleanup;
+
+	pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
+	if (pid < 0)
+		goto cleanup;
+
+	cg_wait_for(fd);
+
+	post_high = cg_read_key_long(memcg, "memory.events", "high ");
+	post_max = cg_read_key_long(memcg, "memory.events", "max ");
+	if (post_high < 0 || post_max < 0)
+		goto cleanup;
+
+	if (pre_high == post_high || pre_max != post_max)
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (fd >= 0)
+		close(fd);
+	cg_destroy(memcg);
+	free(memcg);
+
+	return ret;
+}
+
+/*
+ * This test checks that memory.max limits the amount of
+ * memory which can be consumed by either anonymous memory
+ * or pagecache.
+ */
+static int test_memcg_max(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *memcg;
+	long current, max;
+
+	memcg = cg_name(root, "memcg_test");
+	if (!memcg)
+		goto cleanup;
+
+	if (cg_create(memcg))
+		goto cleanup;
+
+	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
+		goto cleanup;
+
+	if (cg_write(memcg, "memory.swap.max", "0"))
+		goto cleanup;
+
+	if (cg_write(memcg, "memory.max", "30M"))
+		goto cleanup;
+
+	/* Should be killed by OOM killer */
+	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
+		goto cleanup;
+
+	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
+		goto cleanup;
+
+	current = cg_read_long(memcg, "memory.current");
+	if (current > MB(30) || !current)
+		goto cleanup;
+
+	max = cg_read_key_long(memcg, "memory.events", "max ");
+	if (max <= 0)
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	cg_destroy(memcg);
+	free(memcg);
+
+	return ret;
+}
+
+/*
+ * This test checks that memory.reclaim reclaims the given
+ * amount of memory (from both anon and file, if possible).
+ */
+static int test_memcg_reclaim(const char *root)
+{
+	int ret = KSFT_FAIL, fd, retries;
+	char *memcg;
+	long current, expected_usage, to_reclaim;
+	char buf[64];
+
+	memcg = cg_name(root, "memcg_test");
+	if (!memcg)
+		goto cleanup;
+
+	if (cg_create(memcg))
+		goto cleanup;
+
+	current = cg_read_long(memcg, "memory.current");
+	if (current != 0)
+		goto cleanup;
+
+	fd = get_temp_fd();
+	if (fd < 0)
+		goto cleanup;
+
+	cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);
+
+	/*
+	 * If swap is enabled, try to reclaim from both anon and file, else try
+	 * to reclaim from file only.
+	 */
+	if (is_swap_enabled()) {
+		cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
+		expected_usage = MB(100);
+	} else
+		expected_usage = MB(50);
+
+	/*
+	 * Wait until current usage reaches the expected usage (or we run out of
+	 * retries).
+	 */
+	retries = 5;
+	while (!values_close(cg_read_long(memcg, "memory.current"),
+			    expected_usage, 10)) {
+		if (retries--) {
+			sleep(1);
+			continue;
+		} else {
+			fprintf(stderr,
+				"failed to allocate %ld for memcg reclaim test\n",
+				expected_usage);
+			goto cleanup;
+		}
+	}
+
+	/*
+	 * Reclaim until current reaches 30M, this makes sure we hit both anon
+	 * and file if swap is enabled.
+	 */
+	retries = 5;
+	while (true) {
+		int err;
+
+		current = cg_read_long(memcg, "memory.current");
+		to_reclaim = current - MB(30);
+
+		/*
+		 * We only keep looping if we get EAGAIN, which means we could
+		 * not reclaim the full amount.
+		 */
+		if (to_reclaim <= 0)
+			goto cleanup;
+
+
+		snprintf(buf, sizeof(buf), "%ld", to_reclaim);
+		err = cg_write(memcg, "memory.reclaim", buf);
+		if (!err) {
+			/*
+			 * If writing succeeds, then the written amount should have been
+			 * fully reclaimed (and maybe more).
+			 */
+			current = cg_read_long(memcg, "memory.current");
+			if (!values_close(current, MB(30), 3) && current > MB(30))
+				goto cleanup;
+			break;
+		}
+
+		/* The kernel could not reclaim the full amount, try again. */
+		if (err == -EAGAIN && retries--)
+			continue;
+
+		/* We got an unexpected error or ran out of retries. */
+		goto cleanup;
+	}
+
+	ret = KSFT_PASS;
+cleanup:
+	cg_destroy(memcg);
+	free(memcg);
+	close(fd);
+
+	return ret;
+}
+
+static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
+{
+	long mem_max = (long)arg;
+	size_t size = MB(50);
+	char *buf, *ptr;
+	long mem_current, swap_current;
+	int ret = -1;
+
+	buf = malloc(size);
+	if (buf == NULL) {
+		fprintf(stderr, "malloc() failed\n");
+		return -1;
+	}
+
+	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
+		*ptr = 0;
+
+	mem_current = cg_read_long(cgroup, "memory.current");
+	if (!mem_current || !values_close(mem_current, mem_max, 3))
+		goto cleanup;
+
+	swap_current = cg_read_long(cgroup, "memory.swap.current");
+	if (!swap_current ||
+	    !values_close(mem_current + swap_current, size, 3))
+		goto cleanup;
+
+	ret = 0;
+cleanup:
+	free(buf);
+	return ret;
+}
+
+/*
+ * This test checks that memory.swap.max limits the amount of
+ * anonymous memory which can be swapped out.
+ */
+static int test_memcg_swap_max(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *memcg;
+	long max;
+
+	if (!is_swap_enabled())
+		return KSFT_SKIP;
+
+	memcg = cg_name(root, "memcg_test");
+	if (!memcg)
+		goto cleanup;
+
+	if (cg_create(memcg))
+		goto cleanup;
+
+	if (cg_read_long(memcg, "memory.swap.current")) {
+		ret = KSFT_SKIP;
+		goto cleanup;
+	}
+
+	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
+		goto cleanup;
+
+	if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
+		goto cleanup;
+
+	if (cg_write(memcg, "memory.swap.max", "30M"))
+		goto cleanup;
+
+	if (cg_write(memcg, "memory.max", "30M"))
+		goto cleanup;
+
+	/* Should be killed by OOM killer */
+	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
+		goto cleanup;
+
+	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
+		goto cleanup;
+
+	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
+		goto cleanup;
+
+	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
+		goto cleanup;
+
+	max = cg_read_key_long(memcg, "memory.events", "max ");
+	if (max <= 0)
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	cg_destroy(memcg);
+	free(memcg);
+
+	return ret;
+}
+
+/*
+ * This test disables swapping and tries to allocate anonymous memory
+ * up to OOM. Then it checks for oom and oom_kill events in
+ * memory.events.
+ */
+static int test_memcg_oom_events(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *memcg;
+
+	memcg = cg_name(root, "memcg_test");
+	if (!memcg)
+		goto cleanup;
+
+	if (cg_create(memcg))
+		goto cleanup;
+
+	if (cg_write(memcg, "memory.max", "30M"))
+		goto cleanup;
+
+	if (cg_write(memcg, "memory.swap.max", "0"))
+		goto cleanup;
+
+	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
+		goto cleanup;
+
+	if (cg_read_strcmp(memcg, "cgroup.procs", ""))
+		goto cleanup;
+
+	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
+		goto cleanup;
+
+	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	cg_destroy(memcg);
+	free(memcg);
+
+	return ret;
+}
+
+struct tcp_server_args {
+	unsigned short port;
+	int ctl[2];
+};
+
+static int tcp_server(const char *cgroup, void *arg)
+{
+	struct tcp_server_args *srv_args = arg;
+	struct sockaddr_in6 saddr = { 0 };
+	socklen_t slen = sizeof(saddr);
+	int sk, client_sk, ctl_fd, yes = 1, ret = -1;
+
+	close(srv_args->ctl[0]);
+	ctl_fd = srv_args->ctl[1];
+
+	saddr.sin6_family = AF_INET6;
+	saddr.sin6_addr = in6addr_any;
+	saddr.sin6_port = htons(srv_args->port);
+
+	sk = socket(AF_INET6, SOCK_STREAM, 0);
+	if (sk < 0)
+		return ret;
+
+	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
+		goto cleanup;
+
+	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
+		write(ctl_fd, &errno, sizeof(errno));
+		goto cleanup;
+	}
+
+	if (listen(sk, 1))
+		goto cleanup;
+
+	ret = 0;
+	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
+		ret = -1;
+		goto cleanup;
+	}
+
+	client_sk = accept(sk, NULL, NULL);
+	if (client_sk < 0)
+		goto cleanup;
+
+	ret = -1;
+	for (;;) {
+		uint8_t buf[0x100000];
+
+		if (write(client_sk, buf, sizeof(buf)) <= 0) {
+			if (errno == ECONNRESET)
+				ret = 0;
+			break;
+		}
+	}
+
+	close(client_sk);
+
+cleanup:
+	close(sk);
+	return ret;
+}
+
+static int tcp_client(const char *cgroup, unsigned short port)
+{
+	const char server[] = "localhost";
+	struct addrinfo *ai;
+	char servport[6];
+	int retries = 0x10; /* nice round number */
+	int sk, ret;
+
+	snprintf(servport, sizeof(servport), "%hd", port);
+	ret = getaddrinfo(server, servport, NULL, &ai);
+	if (ret)
+		return ret;
+
+	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
+	if (sk < 0)
+		goto free_ainfo;
+
+	ret = connect(sk, ai->ai_addr, ai->ai_addrlen);
+	if (ret < 0)
+		goto close_sk;
+
+	ret = KSFT_FAIL;
+	while (retries--) {
+		uint8_t buf[0x100000];
+		long current, sock;
+
+		if (read(sk, buf, sizeof(buf)) <= 0)
+			goto close_sk;
+
+		current = cg_read_long(cgroup, "memory.current");
+		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");
+
+		if (current < 0 || sock < 0)
+			goto close_sk;
+
+		if (values_close(current, sock, 10)) {
+			ret = KSFT_PASS;
+			break;
+		}
+	}
+
+close_sk:
+	close(sk);
+free_ainfo:
+	freeaddrinfo(ai);
+	return ret;
+}
+
+/*
+ * This test checks socket memory accounting.
+ * The test forks a TCP server listens on a random port between 1000
+ * and 61000. Once it gets a client connection, it starts writing to
+ * its socket.
+ * The TCP client interleaves reads from the socket with check whether
+ * memory.current and memory.stat.sock are similar.
+ */
+static int test_memcg_sock(const char *root)
+{
+	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
+	unsigned short port;
+	char *memcg;
+
+	memcg = cg_name(root, "memcg_test");
+	if (!memcg)
+		goto cleanup;
+
+	if (cg_create(memcg))
+		goto cleanup;
+
+	while (bind_retries--) {
+		struct tcp_server_args args;
+
+		if (pipe(args.ctl))
+			goto cleanup;
+
+		port = args.port = 1000 + rand() % 60000;
+
+		pid = cg_run_nowait(memcg, tcp_server, &args);
+		if (pid < 0)
+			goto cleanup;
+
+		close(args.ctl[1]);
+		if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
+			goto cleanup;
+		close(args.ctl[0]);
+
+		if (!err)
+			break;
+		if (err != EADDRINUSE)
+			goto cleanup;
+
+		waitpid(pid, NULL, 0);
+	}
+
+	if (err == EADDRINUSE) {
+		ret = KSFT_SKIP;
+		goto cleanup;
+	}
+
+	if (tcp_client(memcg, port) != KSFT_PASS)
+		goto cleanup;
+
+	waitpid(pid, &err, 0);
+	if (WEXITSTATUS(err))
+		goto cleanup;
+
+	if (cg_read_long(memcg, "memory.current") < 0)
+		goto cleanup;
+
+	if (cg_read_key_long(memcg, "memory.stat", "sock "))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	cg_destroy(memcg);
+	free(memcg);
+
+	return ret;
+}
+
+/*
+ * This test disables swapping and tries to allocate anonymous memory
+ * up to OOM with memory.group.oom set. Then it checks that all
+ * processes in the leaf were killed. It also checks that oom_events
+ * were propagated to the parent level.
+ */
+static int test_memcg_oom_group_leaf_events(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *parent, *child;
+	long parent_oom_events;
+
+	parent = cg_name(root, "memcg_test_0");
+	child = cg_name(root, "memcg_test_0/memcg_test_1");
+
+	if (!parent || !child)
+		goto cleanup;
+
+	if (cg_create(parent))
+		goto cleanup;
+
+	if (cg_create(child))
+		goto cleanup;
+
+	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
+		goto cleanup;
+
+	if (cg_write(child, "memory.max", "50M"))
+		goto cleanup;
+
+	if (cg_write(child, "memory.swap.max", "0"))
+		goto cleanup;
+
+	if (cg_write(child, "memory.oom.group", "1"))
+		goto cleanup;
+
+	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
+	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
+	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
+	if (!cg_run(child, alloc_anon, (void *)MB(100)))
+		goto cleanup;
+
+	if (cg_test_proc_killed(child))
+		goto cleanup;
+
+	if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
+		goto cleanup;
+
+	parent_oom_events = cg_read_key_long(
+			parent, "memory.events", "oom_kill ");
+	/*
+	 * If memory_localevents is not enabled (the default), the parent should
+	 * count OOM events in its children groups. Otherwise, it should not
+	 * have observed any events.
+	 */
+	if (has_localevents && parent_oom_events != 0)
+		goto cleanup;
+	else if (!has_localevents && parent_oom_events <= 0)
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (child)
+		cg_destroy(child);
+	if (parent)
+		cg_destroy(parent);
+	free(child);
+	free(parent);
+
+	return ret;
+}
+
+/*
+ * This test disables swapping and tries to allocate anonymous memory
+ * up to OOM with memory.group.oom set. Then it checks that all
+ * processes in the parent and leaf were killed.
+ */
+static int test_memcg_oom_group_parent_events(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *parent, *child;
+
+	parent = cg_name(root, "memcg_test_0");
+	child = cg_name(root, "memcg_test_0/memcg_test_1");
+
+	if (!parent || !child)
+		goto cleanup;
+
+	if (cg_create(parent))
+		goto cleanup;
+
+	if (cg_create(child))
+		goto cleanup;
+
+	if (cg_write(parent, "memory.max", "80M"))
+		goto cleanup;
+
+	if (cg_write(parent, "memory.swap.max", "0"))
+		goto cleanup;
+
+	if (cg_write(parent, "memory.oom.group", "1"))
+		goto cleanup;
+
+	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
+	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
+	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
+
+	if (!cg_run(child, alloc_anon, (void *)MB(100)))
+		goto cleanup;
+
+	if (cg_test_proc_killed(child))
+		goto cleanup;
+	if (cg_test_proc_killed(parent))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (child)
+		cg_destroy(child);
+	if (parent)
+		cg_destroy(parent);
+	free(child);
+	free(parent);
+
+	return ret;
+}
+
+/*
+ * This test disables swapping and tries to allocate anonymous memory
+ * up to OOM with memory.group.oom set. Then it checks that all
+ * processes were killed except those set with OOM_SCORE_ADJ_MIN
+ */
+static int test_memcg_oom_group_score_events(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *memcg;
+	int safe_pid;
+
+	memcg = cg_name(root, "memcg_test_0");
+
+	if (!memcg)
+		goto cleanup;
+
+	if (cg_create(memcg))
+		goto cleanup;
+
+	if (cg_write(memcg, "memory.max", "50M"))
+		goto cleanup;
+
+	if (cg_write(memcg, "memory.swap.max", "0"))
+		goto cleanup;
+
+	if (cg_write(memcg, "memory.oom.group", "1"))
+		goto cleanup;
+
+	safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
+	if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
+		goto cleanup;
+
+	cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
+	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
+		goto cleanup;
+
+	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
+		goto cleanup;
+
+	if (kill(safe_pid, SIGKILL))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (memcg)
+		cg_destroy(memcg);
+	free(memcg);
+
+	return ret;
+}
+
+#define T(x) { x, #x }
+struct memcg_test {
+	int (*fn)(const char *root);
+	const char *name;
+} tests[] = {
+	T(test_memcg_subtree_control),
+	T(test_memcg_current),
+	T(test_memcg_min),
+	T(test_memcg_low),
+	T(test_memcg_high),
+	T(test_memcg_high_sync),
+	T(test_memcg_max),
+	T(test_memcg_reclaim),
+	T(test_memcg_oom_events),
+	T(test_memcg_swap_max),
+	T(test_memcg_sock),
+	T(test_memcg_oom_group_leaf_events),
+	T(test_memcg_oom_group_parent_events),
+	T(test_memcg_oom_group_score_events),
+};
+#undef T
+
+int main(int argc, char **argv)
+{
+	char root[PATH_MAX];
+	int i, proc_status, ret = EXIT_SUCCESS;
+
+	if (cg_find_unified_root(root, sizeof(root)))
+		ksft_exit_skip("cgroup v2 isn't mounted\n");
+
+	/*
+	 * Check that memory controller is available:
+	 * memory is listed in cgroup.controllers
+	 */
+	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
+		ksft_exit_skip("memory controller isn't available\n");
+
+	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
+		if (cg_write(root, "cgroup.subtree_control", "+memory"))
+			ksft_exit_skip("Failed to set memory controller\n");
+
+	proc_status = proc_mount_contains("memory_recursiveprot");
+	if (proc_status < 0)
+		ksft_exit_skip("Failed to query cgroup mount option\n");
+	has_recursiveprot = proc_status;
+
+	proc_status = proc_mount_contains("memory_localevents");
+	if (proc_status < 0)
+		ksft_exit_skip("Failed to query cgroup mount option\n");
+	has_localevents = proc_status;
+
+	for (i = 0; i < ARRAY_SIZE(tests); i++) {
+		switch (tests[i].fn(root)) {
+		case KSFT_PASS:
+			ksft_test_result_pass("%s\n", tests[i].name);
+			break;
+		case KSFT_SKIP:
+			ksft_test_result_skip("%s\n", tests[i].name);
+			break;
+		default:
+			ret = EXIT_FAILURE;
+			ksft_test_result_fail("%s\n", tests[i].name);
+			break;
+		}
+	}
+
+	return ret;
+}
diff --git a/tools/testing/selftests/cgroup/test_stress.sh b/tools/testing/selftests/cgroup/test_stress.sh
new file mode 100755
index 000000000..3c9c4554d
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_stress.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+./with_stress.sh -s subsys -s fork ${OUTPUT:-.}/test_core
diff --git a/tools/testing/selftests/cgroup/wait_inotify.c b/tools/testing/selftests/cgroup/wait_inotify.c
new file mode 100644
index 000000000..e11b431e1
--- /dev/null
+++ b/tools/testing/selftests/cgroup/wait_inotify.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Wait until an inotify event on the given cgroup file.
+ */
+#include <linux/limits.h>
+#include <sys/inotify.h>
+#include <sys/mman.h>
+#include <sys/ptrace.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+static const char usage[] = "Usage: %s [-v] <cgroup_file>\n";
+static char *file;
+static int verbose;
+
+static inline void fail_message(char *msg)
+{
+	fprintf(stderr, msg, file);
+	exit(1);
+}
+
+int main(int argc, char *argv[])
+{
+	char *cmd = argv[0];
+	int c, fd;
+	struct pollfd fds = { .events = POLLIN, };
+
+	while ((c = getopt(argc, argv, "v")) != -1) {
+		switch (c) {
+		case 'v':
+			verbose++;
+			break;
+		}
+		argv++, argc--;
+	}
+
+	if (argc != 2) {
+		fprintf(stderr, usage, cmd);
+		return -1;
+	}
+	file = argv[1];
+	fd = open(file, O_RDONLY);
+	if (fd < 0)
+		fail_message("Cgroup file %s not found!\n");
+	close(fd);
+
+	fd = inotify_init();
+	if (fd < 0)
+		fail_message("inotify_init() fails on %s!\n");
+	if (inotify_add_watch(fd, file, IN_MODIFY) < 0)
+		fail_message("inotify_add_watch() fails on %s!\n");
+	fds.fd = fd;
+
+	/*
+	 * poll waiting loop
+	 */
+	for (;;) {
+		int ret = poll(&fds, 1, 10000);
+
+		if (ret < 0) {
+			if (errno == EINTR)
+				continue;
+			perror("poll");
+			exit(1);
+		}
+		if ((ret > 0) && (fds.revents & POLLIN))
+			break;
+	}
+	if (verbose) {
+		struct inotify_event events[10];
+		long len;
+
+		usleep(1000);
+		len = read(fd, events, sizeof(events));
+		printf("Number of events read = %ld\n",
+			len/sizeof(struct inotify_event));
+	}
+	close(fd);
+	return 0;
+}
diff --git a/tools/testing/selftests/cgroup/with_stress.sh b/tools/testing/selftests/cgroup/with_stress.sh
new file mode 100755
index 000000000..e28c35008
--- /dev/null
+++ b/tools/testing/selftests/cgroup/with_stress.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+stress_fork()
+{
+	while true ; do
+		/usr/bin/true
+		sleep 0.01
+	done
+}
+
+stress_subsys()
+{
+	local verb=+
+	while true ; do
+		echo $verb$subsys_ctrl >$sysfs/cgroup.subtree_control
+		[ $verb = "+" ] && verb=- || verb=+
+		# incommensurable period with other stresses
+		sleep 0.011
+	done
+}
+
+init_and_check()
+{
+	sysfs=`mount -t cgroup2 | head -1 | awk '{ print $3 }'`
+	if [ ! -d "$sysfs" ]; then
+		echo "Skipping: cgroup2 is not mounted" >&2
+		exit $ksft_skip
+	fi
+
+	if ! echo +$subsys_ctrl >$sysfs/cgroup.subtree_control ; then
+		echo "Skipping: cannot enable $subsys_ctrl in $sysfs" >&2
+		exit $ksft_skip
+	fi
+
+	if ! echo -$subsys_ctrl >$sysfs/cgroup.subtree_control ; then
+		echo "Skipping: cannot disable $subsys_ctrl in $sysfs" >&2
+		exit $ksft_skip
+	fi
+}
+
+declare -a stresses
+declare -a stress_pids
+duration=5
+rc=0
+subsys_ctrl=cpuset
+sysfs=
+
+while getopts c:d:hs: opt; do
+	case $opt in
+	c)
+		subsys_ctrl=$OPTARG
+		;;
+	d)
+		duration=$OPTARG
+		;;
+	h)
+		echo "Usage $0 [ -s stress ] ... [ -d duration ] [-c controller] cmd args .."
+		echo -e "\t default duration $duration seconds"
+		echo -e "\t default controller $subsys_ctrl"
+		exit
+		;;
+	s)
+		func=stress_$OPTARG
+		if [ "x$(type -t $func)" != "xfunction" ] ; then
+			echo "Unknown stress $OPTARG"
+			exit 1
+		fi
+		stresses+=($func)
+		;;
+	esac
+done
+shift $((OPTIND - 1))
+
+init_and_check
+
+for s in ${stresses[*]} ; do
+	$s &
+	stress_pids+=($!)
+done
+
+
+time=0
+start=$(date +%s)
+
+while [ $time -lt $duration ] ; do
+	$*
+	rc=$?
+	[ $rc -eq 0 ] || break
+	time=$(($(date +%s) - $start))
+done
+
+for pid in ${stress_pids[*]} ; do
+	kill -SIGTERM $pid
+	wait $pid
+done
+
+exit $rc
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 18:49:45 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 18:49:45 +0000
commit	2c3c1048746a4622d8c89a29670120dc8fab93c4 (patch)
tree	848558de17fb3008cdf4d861b01ac7781903ce39 /tools/testing/selftests/cgroup
parent	Initial commit. (diff)
download	linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.tar.xz linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.zip