11 files changed, 2332 insertions, 0 deletions
diff --git a/tools/testing/selftests/pidfd/.gitignore b/tools/testing/selftests/pidfd/.gitignore
new file mode 100644
index 000000000..973198a3e
--- /dev/null
+++ b/tools/testing/selftests/pidfd/.gitignore
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+pidfd_open_test
+pidfd_poll_test
+pidfd_test
+pidfd_wait
+pidfd_fdinfo_test
+pidfd_getfd_test
+pidfd_setns_test
diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile
new file mode 100644
index 000000000..f4a2f28f9
--- /dev/null
+++ b/tools/testing/selftests/pidfd/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+CFLAGS += -g -I../../../../usr/include/ -pthread
+
+TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \
+	pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test
+
+include ../lib.mk
+
diff --git a/tools/testing/selftests/pidfd/config b/tools/testing/selftests/pidfd/config
new file mode 100644
index 000000000..f6f2965e1
--- /dev/null
+++ b/tools/testing/selftests/pidfd/config
@@ -0,0 +1,7 @@
+CONFIG_UTS_NS=y
+CONFIG_IPC_NS=y
+CONFIG_USER_NS=y
+CONFIG_PID_NS=y
+CONFIG_NET_NS=y
+CONFIG_CGROUPS=y
+CONFIG_CHECKPOINT_RESTORE=y
diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h
new file mode 100644
index 000000000..6922d6417
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd.h
@@ -0,0 +1,118 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __PIDFD_H
+#define __PIDFD_H
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/mount.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include "../kselftest.h"
+
+#ifndef P_PIDFD
+#define P_PIDFD 3
+#endif
+
+#ifndef CLONE_NEWTIME
+#define CLONE_NEWTIME 0x00000080
+#endif
+
+#ifndef CLONE_PIDFD
+#define CLONE_PIDFD 0x00001000
+#endif
+
+#ifndef __NR_pidfd_open
+#define __NR_pidfd_open -1
+#endif
+
+#ifndef __NR_pidfd_send_signal
+#define __NR_pidfd_send_signal -1
+#endif
+
+#ifndef __NR_clone3
+#define __NR_clone3 -1
+#endif
+
+#ifndef __NR_pidfd_getfd
+#define __NR_pidfd_getfd -1
+#endif
+
+#ifndef PIDFD_NONBLOCK
+#define PIDFD_NONBLOCK O_NONBLOCK
+#endif
+
+/*
+ * The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c
+ * That means, when it wraps around any pid < 300 will be skipped.
+ * So we need to use a pid > 300 in order to test recycling.
+ */
+#define PID_RECYCLE 1000
+
+/*
+ * Define a few custom error codes for the child process to clearly indicate
+ * what is happening. This way we can tell the difference between a system
+ * error, a test error, etc.
+ */
+#define PIDFD_PASS 0
+#define PIDFD_FAIL 1
+#define PIDFD_ERROR 2
+#define PIDFD_SKIP 3
+#define PIDFD_XFAIL 4
+
+static inline int wait_for_pid(pid_t pid)
+{
+	int status, ret;
+
+again:
+	ret = waitpid(pid, &status, 0);
+	if (ret == -1) {
+		if (errno == EINTR)
+			goto again;
+
+		ksft_print_msg("waitpid returned -1, errno=%d\n", errno);
+		return -1;
+	}
+
+	if (!WIFEXITED(status)) {
+		ksft_print_msg(
+		       "waitpid !WIFEXITED, WIFSIGNALED=%d, WTERMSIG=%d\n",
+		       WIFSIGNALED(status), WTERMSIG(status));
+		return -1;
+	}
+
+	ret = WEXITSTATUS(status);
+	ksft_print_msg("waitpid WEXITSTATUS=%d\n", ret);
+	return ret;
+}
+
+static inline int sys_pidfd_open(pid_t pid, unsigned int flags)
+{
+	return syscall(__NR_pidfd_open, pid, flags);
+}
+
+static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
+					unsigned int flags)
+{
+	return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags);
+}
+
+static inline int sys_pidfd_getfd(int pidfd, int fd, int flags)
+{
+	return syscall(__NR_pidfd_getfd, pidfd, fd, flags);
+}
+
+static inline int sys_memfd_create(const char *name, unsigned int flags)
+{
+	return syscall(__NR_memfd_create, name, flags);
+}
+
+#endif /* __PIDFD_H */
diff --git a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c
new file mode 100644
index 000000000..3bc46d615
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c
@@ -0,0 +1,310 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/types.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/wait.h>
+#include <sys/mman.h>
+
+#include "pidfd.h"
+#include "../kselftest.h"
+
+struct error {
+	int  code;
+	char msg[512];
+};
+
+static int error_set(struct error *err, int code, const char *fmt, ...)
+{
+	va_list args;
+	int r;
+
+	if (code == PIDFD_PASS || !err || err->code != PIDFD_PASS)
+		return code;
+
+	err->code = code;
+	va_start(args, fmt);
+	r = vsnprintf(err->msg, sizeof(err->msg), fmt, args);
+	assert((size_t)r < sizeof(err->msg));
+	va_end(args);
+
+	return code;
+}
+
+static void error_report(struct error *err, const char *test_name)
+{
+	switch (err->code) {
+	case PIDFD_ERROR:
+		ksft_exit_fail_msg("%s test: Fatal: %s\n", test_name, err->msg);
+		break;
+
+	case PIDFD_FAIL:
+		/* will be: not ok %d # error %s test: %s */
+		ksft_test_result_error("%s test: %s\n", test_name, err->msg);
+		break;
+
+	case PIDFD_SKIP:
+		/* will be: not ok %d # SKIP %s test: %s */
+		ksft_test_result_skip("%s test: %s\n", test_name, err->msg);
+		break;
+
+	case PIDFD_XFAIL:
+		ksft_test_result_pass("%s test: Expected failure: %s\n",
+				      test_name, err->msg);
+		break;
+
+	case PIDFD_PASS:
+		ksft_test_result_pass("%s test: Passed\n", test_name);
+		break;
+
+	default:
+		ksft_exit_fail_msg("%s test: Unknown code: %d %s\n",
+				   test_name, err->code, err->msg);
+		break;
+	}
+}
+
+static inline int error_check(struct error *err, const char *test_name)
+{
+	/* In case of error we bail out and terminate the test program */
+	if (err->code == PIDFD_ERROR)
+		error_report(err, test_name);
+
+	return err->code;
+}
+
+#define CHILD_STACK_SIZE 8192
+
+struct child {
+	char *stack;
+	pid_t pid;
+	int   fd;
+};
+
+static struct child clone_newns(int (*fn)(void *), void *args,
+				struct error *err)
+{
+	static int flags = CLONE_PIDFD | CLONE_NEWPID | CLONE_NEWNS | SIGCHLD;
+	struct child ret;
+
+	if (!(flags & CLONE_NEWUSER) && geteuid() != 0)
+		flags |= CLONE_NEWUSER;
+
+	ret.stack = mmap(NULL, CHILD_STACK_SIZE, PROT_READ | PROT_WRITE,
+			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
+	if (ret.stack == MAP_FAILED) {
+		error_set(err, -1, "mmap of stack failed (errno %d)", errno);
+		return ret;
+	}
+
+#ifdef __ia64__
+	ret.pid = __clone2(fn, ret.stack, CHILD_STACK_SIZE, flags, args, &ret.fd);
+#else
+	ret.pid = clone(fn, ret.stack + CHILD_STACK_SIZE, flags, args, &ret.fd);
+#endif
+
+	if (ret.pid < 0) {
+		error_set(err, PIDFD_ERROR, "clone failed (ret %d, errno %d)",
+			  ret.fd, errno);
+		return ret;
+	}
+
+	ksft_print_msg("New child: %d, fd: %d\n", ret.pid, ret.fd);
+
+	return ret;
+}
+
+static inline void child_close(struct child *child)
+{
+	close(child->fd);
+}
+
+static inline int child_join(struct child *child, struct error *err)
+{
+	int r;
+
+	r = wait_for_pid(child->pid);
+	if (r < 0)
+		error_set(err, PIDFD_ERROR, "waitpid failed (ret %d, errno %d)",
+			  r, errno);
+	else if (r > 0)
+		error_set(err, r, "child %d reported: %d", child->pid, r);
+
+	if (munmap(child->stack, CHILD_STACK_SIZE)) {
+		error_set(err, -1, "munmap of child stack failed (errno %d)", errno);
+		r = -1;
+	}
+
+	return r;
+}
+
+static inline int child_join_close(struct child *child, struct error *err)
+{
+	child_close(child);
+	return child_join(child, err);
+}
+
+static inline void trim_newline(char *str)
+{
+	char *pos = strrchr(str, '\n');
+
+	if (pos)
+		*pos = '\0';
+}
+
+static int verify_fdinfo(int pidfd, struct error *err, const char *prefix,
+			 size_t prefix_len, const char *expect, ...)
+{
+	char buffer[512] = {0, };
+	char path[512] = {0, };
+	va_list args;
+	FILE *f;
+	char *line = NULL;
+	size_t n = 0;
+	int found = 0;
+	int r;
+
+	va_start(args, expect);
+	r = vsnprintf(buffer, sizeof(buffer), expect, args);
+	assert((size_t)r < sizeof(buffer));
+	va_end(args);
+
+	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", pidfd);
+	f = fopen(path, "re");
+	if (!f)
+		return error_set(err, PIDFD_ERROR, "fdinfo open failed for %d",
+				 pidfd);
+
+	while (getline(&line, &n, f) != -1) {
+		char *val;
+
+		if (strncmp(line, prefix, prefix_len))
+			continue;
+
+		found = 1;
+
+		val = line + prefix_len;
+		r = strcmp(val, buffer);
+		if (r != 0) {
+			trim_newline(line);
+			trim_newline(buffer);
+			error_set(err, PIDFD_FAIL, "%s '%s' != '%s'",
+				  prefix, val, buffer);
+		}
+		break;
+	}
+
+	free(line);
+	fclose(f);
+
+	if (found == 0)
+		return error_set(err, PIDFD_FAIL, "%s not found for fd %d",
+				 prefix, pidfd);
+
+	return PIDFD_PASS;
+}
+
+static int child_fdinfo_nspid_test(void *args)
+{
+	struct error err;
+	int pidfd;
+	int r;
+
+	/* if we got no fd for the sibling, we are done */
+	if (!args)
+		return PIDFD_PASS;
+
+	/* verify that we can not resolve the pidfd for a process
+	 * in a sibling pid namespace, i.e. a pid namespace it is
+	 * not in our or a descended namespace
+	 */
+	r = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0);
+	if (r < 0) {
+		ksft_print_msg("Failed to remount / private\n");
+		return PIDFD_ERROR;
+	}
+
+	(void)umount2("/proc", MNT_DETACH);
+	r = mount("proc", "/proc", "proc", 0, NULL);
+	if (r < 0) {
+		ksft_print_msg("Failed to remount /proc\n");
+		return PIDFD_ERROR;
+	}
+
+	pidfd = *(int *)args;
+	r = verify_fdinfo(pidfd, &err, "NSpid:", 6, "\t0\n");
+
+	if (r != PIDFD_PASS)
+		ksft_print_msg("NSpid fdinfo check failed: %s\n", err.msg);
+
+	return r;
+}
+
+static void test_pidfd_fdinfo_nspid(void)
+{
+	struct child a, b;
+	struct error err = {0, };
+	const char *test_name = "pidfd check for NSpid in fdinfo";
+
+	/* Create a new child in a new pid and mount namespace */
+	a = clone_newns(child_fdinfo_nspid_test, NULL, &err);
+	error_check(&err, test_name);
+
+	/* Pass the pidfd representing the first child to the
+	 * second child, which will be in a sibling pid namespace,
+	 * which means that the fdinfo NSpid entry for the pidfd
+	 * should only contain '0'.
+	 */
+	b = clone_newns(child_fdinfo_nspid_test, &a.fd, &err);
+	error_check(&err, test_name);
+
+	/* The children will have pid 1 in the new pid namespace,
+	 * so the line must be 'NSPid:\t<pid>\t1'.
+	 */
+	verify_fdinfo(a.fd, &err, "NSpid:", 6, "\t%d\t%d\n", a.pid, 1);
+	verify_fdinfo(b.fd, &err, "NSpid:", 6, "\t%d\t%d\n", b.pid, 1);
+
+	/* wait for the process, check the exit status and set
+	 * 'err' accordingly, if it is not already set.
+	 */
+	child_join_close(&a, &err);
+	child_join_close(&b, &err);
+
+	error_report(&err, test_name);
+}
+
+static void test_pidfd_dead_fdinfo(void)
+{
+	struct child a;
+	struct error err = {0, };
+	const char *test_name = "pidfd check fdinfo for dead process";
+
+	/* Create a new child in a new pid and mount namespace */
+	a = clone_newns(child_fdinfo_nspid_test, NULL, &err);
+	error_check(&err, test_name);
+	child_join(&a, &err);
+
+	verify_fdinfo(a.fd, &err, "Pid:", 4, "\t-1\n");
+	verify_fdinfo(a.fd, &err, "NSpid:", 6, "\t-1\n");
+	child_close(&a);
+	error_report(&err, test_name);
+}
+
+int main(int argc, char **argv)
+{
+	ksft_print_header();
+	ksft_set_plan(2);
+
+	test_pidfd_fdinfo_nspid();
+	test_pidfd_dead_fdinfo();
+
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/pidfd/pidfd_getfd_test.c b/tools/testing/selftests/pidfd/pidfd_getfd_test.c
new file mode 100644
index 000000000..0930e2411
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_getfd_test.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/types.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <linux/kcmp.h>
+
+#include "pidfd.h"
+#include "../kselftest_harness.h"
+
+/*
+ * UNKNOWN_FD is an fd number that should never exist in the child, as it is
+ * used to check the negative case.
+ */
+#define UNKNOWN_FD 111
+#define UID_NOBODY 65535
+
+static int sys_kcmp(pid_t pid1, pid_t pid2, int type, unsigned long idx1,
+		    unsigned long idx2)
+{
+	return syscall(__NR_kcmp, pid1, pid2, type, idx1, idx2);
+}
+
+static int __child(int sk, int memfd)
+{
+	int ret;
+	char buf;
+
+	/*
+	 * Ensure we don't leave around a bunch of orphaned children if our
+	 * tests fail.
+	 */
+	ret = prctl(PR_SET_PDEATHSIG, SIGKILL);
+	if (ret) {
+		fprintf(stderr, "%s: Child could not set DEATHSIG\n",
+			strerror(errno));
+		return -1;
+	}
+
+	ret = send(sk, &memfd, sizeof(memfd), 0);
+	if (ret != sizeof(memfd)) {
+		fprintf(stderr, "%s: Child failed to send fd number\n",
+			strerror(errno));
+		return -1;
+	}
+
+	/*
+	 * The fixture setup is completed at this point. The tests will run.
+	 *
+	 * This blocking recv enables the parent to message the child.
+	 * Either we will read 'P' off of the sk, indicating that we need
+	 * to disable ptrace, or we will read a 0, indicating that the other
+	 * side has closed the sk. This occurs during fixture teardown time,
+	 * indicating that the child should exit.
+	 */
+	while ((ret = recv(sk, &buf, sizeof(buf), 0)) > 0) {
+		if (buf == 'P') {
+			ret = prctl(PR_SET_DUMPABLE, 0);
+			if (ret < 0) {
+				fprintf(stderr,
+					"%s: Child failed to disable ptrace\n",
+					strerror(errno));
+				return -1;
+			}
+		} else {
+			fprintf(stderr, "Child received unknown command %c\n",
+				buf);
+			return -1;
+		}
+		ret = send(sk, &buf, sizeof(buf), 0);
+		if (ret != 1) {
+			fprintf(stderr, "%s: Child failed to ack\n",
+				strerror(errno));
+			return -1;
+		}
+	}
+	if (ret < 0) {
+		fprintf(stderr, "%s: Child failed to read from socket\n",
+			strerror(errno));
+		return -1;
+	}
+
+	return 0;
+}
+
+static int child(int sk)
+{
+	int memfd, ret;
+
+	memfd = sys_memfd_create("test", 0);
+	if (memfd < 0) {
+		fprintf(stderr, "%s: Child could not create memfd\n",
+			strerror(errno));
+		ret = -1;
+	} else {
+		ret = __child(sk, memfd);
+		close(memfd);
+	}
+
+	close(sk);
+	return ret;
+}
+
+FIXTURE(child)
+{
+	/*
+	 * remote_fd is the number of the FD which we are trying to retrieve
+	 * from the child.
+	 */
+	int remote_fd;
+	/* pid points to the child which we are fetching FDs from */
+	pid_t pid;
+	/* pidfd is the pidfd of the child */
+	int pidfd;
+	/*
+	 * sk is our side of the socketpair used to communicate with the child.
+	 * When it is closed, the child will exit.
+	 */
+	int sk;
+};
+
+FIXTURE_SETUP(child)
+{
+	int ret, sk_pair[2];
+
+	ASSERT_EQ(0, socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) {
+		TH_LOG("%s: failed to create socketpair", strerror(errno));
+	}
+	self->sk = sk_pair[0];
+
+	self->pid = fork();
+	ASSERT_GE(self->pid, 0);
+
+	if (self->pid == 0) {
+		close(sk_pair[0]);
+		if (child(sk_pair[1]))
+			_exit(EXIT_FAILURE);
+		_exit(EXIT_SUCCESS);
+	}
+
+	close(sk_pair[1]);
+
+	self->pidfd = sys_pidfd_open(self->pid, 0);
+	ASSERT_GE(self->pidfd, 0);
+
+	/*
+	 * Wait for the child to complete setup. It'll send the remote memfd's
+	 * number when ready.
+	 */
+	ret = recv(sk_pair[0], &self->remote_fd, sizeof(self->remote_fd), 0);
+	ASSERT_EQ(sizeof(self->remote_fd), ret);
+}
+
+FIXTURE_TEARDOWN(child)
+{
+	EXPECT_EQ(0, close(self->pidfd));
+	EXPECT_EQ(0, close(self->sk));
+
+	EXPECT_EQ(0, wait_for_pid(self->pid));
+}
+
+TEST_F(child, disable_ptrace)
+{
+	int uid, fd;
+	char c;
+
+	/*
+	 * Turn into nobody if we're root, to avoid CAP_SYS_PTRACE
+	 *
+	 * The tests should run in their own process, so even this test fails,
+	 * it shouldn't result in subsequent tests failing.
+	 */
+	uid = getuid();
+	if (uid == 0)
+		ASSERT_EQ(0, seteuid(UID_NOBODY));
+
+	ASSERT_EQ(1, send(self->sk, "P", 1, 0));
+	ASSERT_EQ(1, recv(self->sk, &c, 1, 0));
+
+	fd = sys_pidfd_getfd(self->pidfd, self->remote_fd, 0);
+	EXPECT_EQ(-1, fd);
+	EXPECT_EQ(EPERM, errno);
+
+	if (uid == 0)
+		ASSERT_EQ(0, seteuid(0));
+}
+
+TEST_F(child, fetch_fd)
+{
+	int fd, ret;
+
+	fd = sys_pidfd_getfd(self->pidfd, self->remote_fd, 0);
+	ASSERT_GE(fd, 0);
+
+	ret = sys_kcmp(getpid(), self->pid, KCMP_FILE, fd, self->remote_fd);
+	if (ret < 0 && errno == ENOSYS)
+		SKIP(return, "kcmp() syscall not supported");
+	EXPECT_EQ(ret, 0);
+
+	ret = fcntl(fd, F_GETFD);
+	ASSERT_GE(ret, 0);
+	EXPECT_GE(ret & FD_CLOEXEC, 0);
+
+	close(fd);
+}
+
+TEST_F(child, test_unknown_fd)
+{
+	int fd;
+
+	fd = sys_pidfd_getfd(self->pidfd, UNKNOWN_FD, 0);
+	EXPECT_EQ(-1, fd) {
+		TH_LOG("getfd succeeded while fetching unknown fd");
+	};
+	EXPECT_EQ(EBADF, errno) {
+		TH_LOG("%s: getfd did not get EBADF", strerror(errno));
+	}
+}
+
+TEST(flags_set)
+{
+	ASSERT_EQ(-1, sys_pidfd_getfd(0, 0, 1));
+	EXPECT_EQ(errno, EINVAL);
+}
+
+#if __NR_pidfd_getfd == -1
+int main(void)
+{
+	fprintf(stderr, "__NR_pidfd_getfd undefined. The pidfd_getfd syscall is unavailable. Test aborting\n");
+	return KSFT_SKIP;
+}
+#else
+TEST_HARNESS_MAIN
+#endif
diff --git a/tools/testing/selftests/pidfd/pidfd_open_test.c b/tools/testing/selftests/pidfd/pidfd_open_test.c
new file mode 100644
index 000000000..8a59438cc
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_open_test.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <linux/types.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/mount.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "pidfd.h"
+#include "../kselftest.h"
+
+static int safe_int(const char *numstr, int *converted)
+{
+	char *err = NULL;
+	long sli;
+
+	errno = 0;
+	sli = strtol(numstr, &err, 0);
+	if (errno == ERANGE && (sli == LONG_MAX || sli == LONG_MIN))
+		return -ERANGE;
+
+	if (errno != 0 && sli == 0)
+		return -EINVAL;
+
+	if (err == numstr || *err != '\0')
+		return -EINVAL;
+
+	if (sli > INT_MAX || sli < INT_MIN)
+		return -ERANGE;
+
+	*converted = (int)sli;
+	return 0;
+}
+
+static int char_left_gc(const char *buffer, size_t len)
+{
+	size_t i;
+
+	for (i = 0; i < len; i++) {
+		if (buffer[i] == ' ' ||
+		    buffer[i] == '\t')
+			continue;
+
+		return i;
+	}
+
+	return 0;
+}
+
+static int char_right_gc(const char *buffer, size_t len)
+{
+	int i;
+
+	for (i = len - 1; i >= 0; i--) {
+		if (buffer[i] == ' '  ||
+		    buffer[i] == '\t' ||
+		    buffer[i] == '\n' ||
+		    buffer[i] == '\0')
+			continue;
+
+		return i + 1;
+	}
+
+	return 0;
+}
+
+static char *trim_whitespace_in_place(char *buffer)
+{
+	buffer += char_left_gc(buffer, strlen(buffer));
+	buffer[char_right_gc(buffer, strlen(buffer))] = '\0';
+	return buffer;
+}
+
+static pid_t get_pid_from_fdinfo_file(int pidfd, const char *key, size_t keylen)
+{
+	int ret;
+	char path[512];
+	FILE *f;
+	size_t n = 0;
+	pid_t result = -1;
+	char *line = NULL;
+
+	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", pidfd);
+
+	f = fopen(path, "re");
+	if (!f)
+		return -1;
+
+	while (getline(&line, &n, f) != -1) {
+		char *numstr;
+
+		if (strncmp(line, key, keylen))
+			continue;
+
+		numstr = trim_whitespace_in_place(line + 4);
+		ret = safe_int(numstr, &result);
+		if (ret < 0)
+			goto out;
+
+		break;
+	}
+
+out:
+	free(line);
+	fclose(f);
+	return result;
+}
+
+int main(int argc, char **argv)
+{
+	int pidfd = -1, ret = 1;
+	pid_t pid;
+
+	ksft_set_plan(3);
+
+	pidfd = sys_pidfd_open(-1, 0);
+	if (pidfd >= 0) {
+		ksft_print_msg(
+			"%s - succeeded to open pidfd for invalid pid -1\n",
+			strerror(errno));
+		goto on_error;
+	}
+	ksft_test_result_pass("do not allow invalid pid test: passed\n");
+
+	pidfd = sys_pidfd_open(getpid(), 1);
+	if (pidfd >= 0) {
+		ksft_print_msg(
+			"%s - succeeded to open pidfd with invalid flag value specified\n",
+			strerror(errno));
+		goto on_error;
+	}
+	ksft_test_result_pass("do not allow invalid flag test: passed\n");
+
+	pidfd = sys_pidfd_open(getpid(), 0);
+	if (pidfd < 0) {
+		ksft_print_msg("%s - failed to open pidfd\n", strerror(errno));
+		goto on_error;
+	}
+	ksft_test_result_pass("open a new pidfd test: passed\n");
+
+	pid = get_pid_from_fdinfo_file(pidfd, "Pid:", sizeof("Pid:") - 1);
+	ksft_print_msg("pidfd %d refers to process with pid %d\n", pidfd, pid);
+
+	ret = 0;
+
+on_error:
+	if (pidfd >= 0)
+		close(pidfd);
+
+	return !ret ? ksft_exit_pass() : ksft_exit_fail();
+}
diff --git a/tools/testing/selftests/pidfd/pidfd_poll_test.c b/tools/testing/selftests/pidfd/pidfd_poll_test.c
new file mode 100644
index 000000000..610811275
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_poll_test.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <linux/types.h>
+#include <poll.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "pidfd.h"
+#include "../kselftest.h"
+
+static bool timeout;
+
+static void handle_alarm(int sig)
+{
+	timeout = true;
+}
+
+int main(int argc, char **argv)
+{
+	struct pollfd fds;
+	int iter, nevents;
+	int nr_iterations = 10000;
+
+	fds.events = POLLIN;
+
+	if (argc > 2)
+		ksft_exit_fail_msg("Unexpected command line argument\n");
+
+	if (argc == 2) {
+		nr_iterations = atoi(argv[1]);
+		if (nr_iterations <= 0)
+			ksft_exit_fail_msg("invalid input parameter %s\n",
+					argv[1]);
+	}
+
+	ksft_print_msg("running pidfd poll test for %d iterations\n",
+		nr_iterations);
+
+	for (iter = 0; iter < nr_iterations; iter++) {
+		int pidfd;
+		int child_pid = fork();
+
+		if (child_pid < 0) {
+			if (errno == EAGAIN) {
+				iter--;
+				continue;
+			}
+			ksft_exit_fail_msg(
+				"%s - failed to fork a child process\n",
+				strerror(errno));
+		}
+
+		if (child_pid == 0) {
+			/* Child process just sleeps for a min and exits */
+			sleep(60);
+			exit(EXIT_SUCCESS);
+		}
+
+		/* Parent kills the child and waits for its death */
+		pidfd = sys_pidfd_open(child_pid, 0);
+		if (pidfd < 0)
+			ksft_exit_fail_msg("%s - pidfd_open failed\n",
+					strerror(errno));
+
+		/* Setup 3 sec alarm - plenty of time */
+		if (signal(SIGALRM, handle_alarm) == SIG_ERR)
+			ksft_exit_fail_msg("%s - signal failed\n",
+					strerror(errno));
+		alarm(3);
+
+		/* Send SIGKILL to the child */
+		if (sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0))
+			ksft_exit_fail_msg("%s - pidfd_send_signal failed\n",
+					strerror(errno));
+
+		/* Wait for the death notification */
+		fds.fd = pidfd;
+		nevents = poll(&fds, 1, -1);
+
+		/* Check for error conditions */
+		if (nevents < 0)
+			ksft_exit_fail_msg("%s - poll failed\n",
+					strerror(errno));
+
+		if (nevents != 1)
+			ksft_exit_fail_msg("unexpected poll result: %d\n",
+					nevents);
+
+		if (!(fds.revents & POLLIN))
+			ksft_exit_fail_msg(
+				"unexpected event type received: 0x%x\n",
+				fds.revents);
+
+		if (timeout)
+			ksft_exit_fail_msg(
+				"death notification wait timeout\n");
+
+		close(pidfd);
+		/* Wait for child to prevent zombies */
+		if (waitpid(child_pid, NULL, 0) < 0)
+			ksft_exit_fail_msg("%s - waitpid failed\n",
+					strerror(errno));
+
+	}
+
+	ksft_test_result_pass("pidfd poll test: pass\n");
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c
new file mode 100644
index 000000000..6e2f2cd40
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c
@@ -0,0 +1,559 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/types.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+
+#include "pidfd.h"
+#include "../clone3/clone3_selftests.h"
+#include "../kselftest_harness.h"
+
+enum {
+	PIDFD_NS_USER,
+	PIDFD_NS_MNT,
+	PIDFD_NS_PID,
+	PIDFD_NS_UTS,
+	PIDFD_NS_IPC,
+	PIDFD_NS_NET,
+	PIDFD_NS_CGROUP,
+	PIDFD_NS_PIDCLD,
+	PIDFD_NS_TIME,
+	PIDFD_NS_MAX
+};
+
+const struct ns_info {
+	const char *name;
+	int flag;
+} ns_info[] = {
+	[PIDFD_NS_USER]   = { "user",             CLONE_NEWUSER,   },
+	[PIDFD_NS_MNT]    = { "mnt",              CLONE_NEWNS,     },
+	[PIDFD_NS_PID]    = { "pid",              CLONE_NEWPID,    },
+	[PIDFD_NS_UTS]    = { "uts",              CLONE_NEWUTS,    },
+	[PIDFD_NS_IPC]    = { "ipc",              CLONE_NEWIPC,    },
+	[PIDFD_NS_NET]    = { "net",              CLONE_NEWNET,    },
+	[PIDFD_NS_CGROUP] = { "cgroup",           CLONE_NEWCGROUP, },
+	[PIDFD_NS_PIDCLD] = { "pid_for_children", 0,               },
+	[PIDFD_NS_TIME]	  = { "time",             CLONE_NEWTIME,   },
+};
+
+FIXTURE(current_nsset)
+{
+	pid_t pid;
+	int pidfd;
+	int nsfds[PIDFD_NS_MAX];
+
+	pid_t child_pid_exited;
+	int child_pidfd_exited;
+
+	pid_t child_pid1;
+	int child_pidfd1;
+	int child_nsfds1[PIDFD_NS_MAX];
+
+	pid_t child_pid2;
+	int child_pidfd2;
+	int child_nsfds2[PIDFD_NS_MAX];
+};
+
+static int sys_waitid(int which, pid_t pid, int options)
+{
+	return syscall(__NR_waitid, which, pid, NULL, options, NULL);
+}
+
+pid_t create_child(int *pidfd, unsigned flags)
+{
+	struct __clone_args args = {
+		.flags		= CLONE_PIDFD | flags,
+		.exit_signal	= SIGCHLD,
+		.pidfd		= ptr_to_u64(pidfd),
+	};
+
+	return sys_clone3(&args, sizeof(struct clone_args));
+}
+
+static bool switch_timens(void)
+{
+	int fd, ret;
+
+	if (unshare(CLONE_NEWTIME))
+		return false;
+
+	fd = open("/proc/self/ns/time_for_children", O_RDONLY | O_CLOEXEC);
+	if (fd < 0)
+		return false;
+
+	ret = setns(fd, CLONE_NEWTIME);
+	close(fd);
+	return ret == 0;
+}
+
+static ssize_t read_nointr(int fd, void *buf, size_t count)
+{
+	ssize_t ret;
+
+	do {
+		ret = read(fd, buf, count);
+	} while (ret < 0 && errno == EINTR);
+
+	return ret;
+}
+
+static ssize_t write_nointr(int fd, const void *buf, size_t count)
+{
+	ssize_t ret;
+
+	do {
+		ret = write(fd, buf, count);
+	} while (ret < 0 && errno == EINTR);
+
+	return ret;
+}
+
+FIXTURE_SETUP(current_nsset)
+{
+	int i, proc_fd, ret;
+	int ipc_sockets[2];
+	char c;
+
+	for (i = 0; i < PIDFD_NS_MAX; i++) {
+		self->nsfds[i]		= -EBADF;
+		self->child_nsfds1[i]	= -EBADF;
+		self->child_nsfds2[i]	= -EBADF;
+	}
+
+	proc_fd = open("/proc/self/ns", O_DIRECTORY | O_CLOEXEC);
+	ASSERT_GE(proc_fd, 0) {
+		TH_LOG("%m - Failed to open /proc/self/ns");
+	}
+
+	self->pid = getpid();
+	for (i = 0; i < PIDFD_NS_MAX; i++) {
+		const struct ns_info *info = &ns_info[i];
+		self->nsfds[i] = openat(proc_fd, info->name, O_RDONLY | O_CLOEXEC);
+		if (self->nsfds[i] < 0) {
+			EXPECT_EQ(errno, ENOENT) {
+				TH_LOG("%m - Failed to open %s namespace for process %d",
+				       info->name, self->pid);
+			}
+		}
+	}
+
+	self->pidfd = sys_pidfd_open(self->pid, 0);
+	EXPECT_GT(self->pidfd, 0) {
+		TH_LOG("%m - Failed to open pidfd for process %d", self->pid);
+	}
+
+	/* Create task that exits right away. */
+	self->child_pid_exited = create_child(&self->child_pidfd_exited,
+					      CLONE_NEWUSER | CLONE_NEWNET);
+	EXPECT_GT(self->child_pid_exited, 0);
+
+	if (self->child_pid_exited == 0)
+		_exit(EXIT_SUCCESS);
+
+	ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED | WNOWAIT), 0);
+
+	self->pidfd = sys_pidfd_open(self->pid, 0);
+	EXPECT_GE(self->pidfd, 0) {
+		TH_LOG("%m - Failed to open pidfd for process %d", self->pid);
+	}
+
+	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	EXPECT_EQ(ret, 0);
+
+	/* Create tasks that will be stopped. */
+	self->child_pid1 = create_child(&self->child_pidfd1,
+					CLONE_NEWUSER | CLONE_NEWNS |
+					CLONE_NEWCGROUP | CLONE_NEWIPC |
+					CLONE_NEWUTS | CLONE_NEWPID |
+					CLONE_NEWNET);
+	EXPECT_GE(self->child_pid1, 0);
+
+	if (self->child_pid1 == 0) {
+		close(ipc_sockets[0]);
+
+		if (!switch_timens())
+			_exit(EXIT_FAILURE);
+
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0)
+			_exit(EXIT_FAILURE);
+
+		close(ipc_sockets[1]);
+
+		pause();
+		_exit(EXIT_SUCCESS);
+	}
+
+	close(ipc_sockets[1]);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	close(ipc_sockets[0]);
+
+	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	EXPECT_EQ(ret, 0);
+
+	self->child_pid2 = create_child(&self->child_pidfd2,
+					CLONE_NEWUSER | CLONE_NEWNS |
+					CLONE_NEWCGROUP | CLONE_NEWIPC |
+					CLONE_NEWUTS | CLONE_NEWPID |
+					CLONE_NEWNET);
+	EXPECT_GE(self->child_pid2, 0);
+
+	if (self->child_pid2 == 0) {
+		close(ipc_sockets[0]);
+
+		if (!switch_timens())
+			_exit(EXIT_FAILURE);
+
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0)
+			_exit(EXIT_FAILURE);
+
+		close(ipc_sockets[1]);
+
+		pause();
+		_exit(EXIT_SUCCESS);
+	}
+
+	close(ipc_sockets[1]);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	close(ipc_sockets[0]);
+
+	for (i = 0; i < PIDFD_NS_MAX; i++) {
+		char p[100];
+
+		const struct ns_info *info = &ns_info[i];
+
+		self->nsfds[i] = openat(proc_fd, info->name, O_RDONLY | O_CLOEXEC);
+		if (self->nsfds[i] < 0) {
+			EXPECT_EQ(errno, ENOENT) {
+				TH_LOG("%m - Failed to open %s namespace for process %d",
+				       info->name, self->pid);
+			}
+		}
+
+		ret = snprintf(p, sizeof(p), "/proc/%d/ns/%s",
+			       self->child_pid1, info->name);
+		EXPECT_GT(ret, 0);
+		EXPECT_LT(ret, sizeof(p));
+
+		self->child_nsfds1[i] = open(p, O_RDONLY | O_CLOEXEC);
+		if (self->child_nsfds1[i] < 0) {
+			EXPECT_EQ(errno, ENOENT) {
+				TH_LOG("%m - Failed to open %s namespace for process %d",
+				       info->name, self->child_pid1);
+			}
+		}
+
+		ret = snprintf(p, sizeof(p), "/proc/%d/ns/%s",
+			       self->child_pid2, info->name);
+		EXPECT_GT(ret, 0);
+		EXPECT_LT(ret, sizeof(p));
+
+		self->child_nsfds2[i] = open(p, O_RDONLY | O_CLOEXEC);
+		if (self->child_nsfds2[i] < 0) {
+			EXPECT_EQ(errno, ENOENT) {
+				TH_LOG("%m - Failed to open %s namespace for process %d",
+				       info->name, self->child_pid1);
+			}
+		}
+	}
+
+	close(proc_fd);
+}
+
+FIXTURE_TEARDOWN(current_nsset)
+{
+	int i;
+
+	ASSERT_EQ(sys_pidfd_send_signal(self->child_pidfd1,
+					SIGKILL, NULL, 0), 0);
+	ASSERT_EQ(sys_pidfd_send_signal(self->child_pidfd2,
+					SIGKILL, NULL, 0), 0);
+
+	for (i = 0; i < PIDFD_NS_MAX; i++) {
+		if (self->nsfds[i] >= 0)
+			close(self->nsfds[i]);
+		if (self->child_nsfds1[i] >= 0)
+			close(self->child_nsfds1[i]);
+		if (self->child_nsfds2[i] >= 0)
+			close(self->child_nsfds2[i]);
+	}
+
+	if (self->child_pidfd1 >= 0)
+		EXPECT_EQ(0, close(self->child_pidfd1));
+	if (self->child_pidfd2 >= 0)
+		EXPECT_EQ(0, close(self->child_pidfd2));
+	ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED), 0);
+	ASSERT_EQ(sys_waitid(P_PID, self->child_pid1, WEXITED), 0);
+	ASSERT_EQ(sys_waitid(P_PID, self->child_pid2, WEXITED), 0);
+}
+
+static int preserve_ns(const int pid, const char *ns)
+{
+	int ret;
+	char path[50];
+
+	ret = snprintf(path, sizeof(path), "/proc/%d/ns/%s", pid, ns);
+	if (ret < 0 || (size_t)ret >= sizeof(path))
+		return -EIO;
+
+	return open(path, O_RDONLY | O_CLOEXEC);
+}
+
+static int in_same_namespace(int ns_fd1, pid_t pid2, const char *ns)
+{
+	int ns_fd2 = -EBADF;
+	int ret = -1;
+	struct stat ns_st1, ns_st2;
+
+	ret = fstat(ns_fd1, &ns_st1);
+	if (ret < 0)
+		return -1;
+
+	ns_fd2 = preserve_ns(pid2, ns);
+	if (ns_fd2 < 0)
+		return -1;
+
+	ret = fstat(ns_fd2, &ns_st2);
+	close(ns_fd2);
+	if (ret < 0)
+		return -1;
+
+	/* processes are in the same namespace */
+	if ((ns_st1.st_dev == ns_st2.st_dev) &&
+	    (ns_st1.st_ino == ns_st2.st_ino))
+		return 1;
+
+	/* processes are in different namespaces */
+	return 0;
+}
+
+/* Test that we can't pass garbage to the kernel. */
+TEST_F(current_nsset, invalid_flags)
+{
+	ASSERT_NE(setns(self->pidfd, 0), 0);
+	EXPECT_EQ(errno, EINVAL);
+
+	ASSERT_NE(setns(self->pidfd, -1), 0);
+	EXPECT_EQ(errno, EINVAL);
+
+	ASSERT_NE(setns(self->pidfd, CLONE_VM), 0);
+	EXPECT_EQ(errno, EINVAL);
+
+	ASSERT_NE(setns(self->pidfd, CLONE_NEWUSER | CLONE_VM), 0);
+	EXPECT_EQ(errno, EINVAL);
+}
+
+/* Test that we can't attach to a task that has already exited. */
+TEST_F(current_nsset, pidfd_exited_child)
+{
+	int i;
+	pid_t pid;
+
+	ASSERT_NE(setns(self->child_pidfd_exited, CLONE_NEWUSER | CLONE_NEWNET),
+		  0);
+	EXPECT_EQ(errno, ESRCH);
+
+	pid = getpid();
+	for (i = 0; i < PIDFD_NS_MAX; i++) {
+		const struct ns_info *info = &ns_info[i];
+		/* Verify that we haven't changed any namespaces. */
+		if (self->nsfds[i] >= 0)
+			ASSERT_EQ(in_same_namespace(self->nsfds[i], pid, info->name), 1);
+	}
+}
+
+TEST_F(current_nsset, pidfd_incremental_setns)
+{
+	int i;
+	pid_t pid;
+
+	pid = getpid();
+	for (i = 0; i < PIDFD_NS_MAX; i++) {
+		const struct ns_info *info = &ns_info[i];
+		int nsfd;
+
+		if (self->child_nsfds1[i] < 0)
+			continue;
+
+		if (info->flag) {
+			ASSERT_EQ(setns(self->child_pidfd1, info->flag), 0) {
+				TH_LOG("%m - Failed to setns to %s namespace of %d via pidfd %d",
+				       info->name, self->child_pid1,
+				       self->child_pidfd1);
+			}
+		}
+
+		/* Verify that we have changed to the correct namespaces. */
+		if (info->flag == CLONE_NEWPID)
+			nsfd = self->nsfds[i];
+		else
+			nsfd = self->child_nsfds1[i];
+		ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) {
+			TH_LOG("setns failed to place us correctly into %s namespace of %d via pidfd %d",
+			       info->name, self->child_pid1,
+			       self->child_pidfd1);
+		}
+		TH_LOG("Managed to correctly setns to %s namespace of %d via pidfd %d",
+		       info->name, self->child_pid1, self->child_pidfd1);
+	}
+}
+
+TEST_F(current_nsset, nsfd_incremental_setns)
+{
+	int i;
+	pid_t pid;
+
+	pid = getpid();
+	for (i = 0; i < PIDFD_NS_MAX; i++) {
+		const struct ns_info *info = &ns_info[i];
+		int nsfd;
+
+		if (self->child_nsfds1[i] < 0)
+			continue;
+
+		if (info->flag) {
+			ASSERT_EQ(setns(self->child_nsfds1[i], info->flag), 0) {
+				TH_LOG("%m - Failed to setns to %s namespace of %d via nsfd %d",
+				       info->name, self->child_pid1,
+				       self->child_nsfds1[i]);
+			}
+		}
+
+		/* Verify that we have changed to the correct namespaces. */
+		if (info->flag == CLONE_NEWPID)
+			nsfd = self->nsfds[i];
+		else
+			nsfd = self->child_nsfds1[i];
+		ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) {
+			TH_LOG("setns failed to place us correctly into %s namespace of %d via nsfd %d",
+			       info->name, self->child_pid1,
+			       self->child_nsfds1[i]);
+		}
+		TH_LOG("Managed to correctly setns to %s namespace of %d via nsfd %d",
+		       info->name, self->child_pid1, self->child_nsfds1[i]);
+	}
+}
+
+TEST_F(current_nsset, pidfd_one_shot_setns)
+{
+	unsigned flags = 0;
+	int i;
+	pid_t pid;
+
+	for (i = 0; i < PIDFD_NS_MAX; i++) {
+		const struct ns_info *info = &ns_info[i];
+
+		if (self->child_nsfds1[i] < 0)
+			continue;
+
+		flags |= info->flag;
+		TH_LOG("Adding %s namespace of %d to list of namespaces to attach to",
+		       info->name, self->child_pid1);
+	}
+
+	ASSERT_EQ(setns(self->child_pidfd1, flags), 0) {
+		TH_LOG("%m - Failed to setns to namespaces of %d",
+		       self->child_pid1);
+	}
+
+	pid = getpid();
+	for (i = 0; i < PIDFD_NS_MAX; i++) {
+		const struct ns_info *info = &ns_info[i];
+		int nsfd;
+
+		if (self->child_nsfds1[i] < 0)
+			continue;
+
+		/* Verify that we have changed to the correct namespaces. */
+		if (info->flag == CLONE_NEWPID)
+			nsfd = self->nsfds[i];
+		else
+			nsfd = self->child_nsfds1[i];
+		ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) {
+			TH_LOG("setns failed to place us correctly into %s namespace of %d",
+			       info->name, self->child_pid1);
+		}
+		TH_LOG("Managed to correctly setns to %s namespace of %d",
+		       info->name, self->child_pid1);
+	}
+}
+
+TEST_F(current_nsset, no_foul_play)
+{
+	unsigned flags = 0;
+	int i;
+
+	for (i = 0; i < PIDFD_NS_MAX; i++) {
+		const struct ns_info *info = &ns_info[i];
+
+		if (self->child_nsfds1[i] < 0)
+			continue;
+
+		flags |= info->flag;
+		if (info->flag) /* No use logging pid_for_children. */
+			TH_LOG("Adding %s namespace of %d to list of namespaces to attach to",
+			       info->name, self->child_pid1);
+	}
+
+	ASSERT_EQ(setns(self->child_pidfd1, flags), 0) {
+		TH_LOG("%m - Failed to setns to namespaces of %d vid pidfd %d",
+		       self->child_pid1, self->child_pidfd1);
+	}
+
+	/*
+	 * Can't setns to a user namespace outside of our hierarchy since we
+	 * don't have caps in there and didn't create it. That means that under
+	 * no circumstances should we be able to setns to any of the other
+	 * ones since they aren't owned by our user namespace.
+	 */
+	for (i = 0; i < PIDFD_NS_MAX; i++) {
+		const struct ns_info *info = &ns_info[i];
+
+		if (self->child_nsfds2[i] < 0 || !info->flag)
+			continue;
+
+		ASSERT_NE(setns(self->child_pidfd2, info->flag), 0) {
+			TH_LOG("Managed to setns to %s namespace of %d via pidfd %d",
+			       info->name, self->child_pid2,
+			       self->child_pidfd2);
+		}
+		TH_LOG("%m - Correctly failed to setns to %s namespace of %d via pidfd %d",
+		       info->name, self->child_pid2,
+		       self->child_pidfd2);
+
+		ASSERT_NE(setns(self->child_nsfds2[i], info->flag), 0) {
+			TH_LOG("Managed to setns to %s namespace of %d via nsfd %d",
+			       info->name, self->child_pid2,
+			       self->child_nsfds2[i]);
+		}
+		TH_LOG("%m - Correctly failed to setns to %s namespace of %d via nsfd %d",
+		       info->name, self->child_pid2,
+		       self->child_nsfds2[i]);
+	}
+}
+
+TEST(setns_einval)
+{
+	int fd;
+
+	fd = sys_memfd_create("rostock", 0);
+	EXPECT_GT(fd, 0);
+
+	ASSERT_NE(setns(fd, 0), 0);
+	EXPECT_EQ(errno, EINVAL);
+	close(fd);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/pidfd/pidfd_test.c b/tools/testing/selftests/pidfd/pidfd_test.c
new file mode 100644
index 000000000..79f543ad3
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_test.c
@@ -0,0 +1,573 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/types.h>
+#include <pthread.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/epoll.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "pidfd.h"
+#include "../kselftest.h"
+
+#define str(s) _str(s)
+#define _str(s) #s
+#define CHILD_THREAD_MIN_WAIT 3 /* seconds */
+
+#define MAX_EVENTS 5
+
+static bool have_pidfd_send_signal;
+
+static pid_t pidfd_clone(int flags, int *pidfd, int (*fn)(void *))
+{
+	size_t stack_size = 1024;
+	char *stack[1024] = { 0 };
+
+#ifdef __ia64__
+	return __clone2(fn, stack, stack_size, flags | SIGCHLD, NULL, pidfd);
+#else
+	return clone(fn, stack + stack_size, flags | SIGCHLD, NULL, pidfd);
+#endif
+}
+
+static int signal_received;
+
+static void set_signal_received_on_sigusr1(int sig)
+{
+	if (sig == SIGUSR1)
+		signal_received = 1;
+}
+
+/*
+ * Straightforward test to see whether pidfd_send_signal() works is to send
+ * a signal to ourself.
+ */
+static int test_pidfd_send_signal_simple_success(void)
+{
+	int pidfd, ret;
+	const char *test_name = "pidfd_send_signal send SIGUSR1";
+
+	if (!have_pidfd_send_signal) {
+		ksft_test_result_skip(
+			"%s test: pidfd_send_signal() syscall not supported\n",
+			test_name);
+		return 0;
+	}
+
+	pidfd = open("/proc/self", O_DIRECTORY | O_CLOEXEC);
+	if (pidfd < 0)
+		ksft_exit_fail_msg(
+			"%s test: Failed to open process file descriptor\n",
+			test_name);
+
+	signal(SIGUSR1, set_signal_received_on_sigusr1);
+
+	ret = sys_pidfd_send_signal(pidfd, SIGUSR1, NULL, 0);
+	close(pidfd);
+	if (ret < 0)
+		ksft_exit_fail_msg("%s test: Failed to send signal\n",
+				   test_name);
+
+	if (signal_received != 1)
+		ksft_exit_fail_msg("%s test: Failed to receive signal\n",
+				   test_name);
+
+	signal_received = 0;
+	ksft_test_result_pass("%s test: Sent signal\n", test_name);
+	return 0;
+}
+
+static int test_pidfd_send_signal_exited_fail(void)
+{
+	int pidfd, ret, saved_errno;
+	char buf[256];
+	pid_t pid;
+	const char *test_name = "pidfd_send_signal signal exited process";
+
+	if (!have_pidfd_send_signal) {
+		ksft_test_result_skip(
+			"%s test: pidfd_send_signal() syscall not supported\n",
+			test_name);
+		return 0;
+	}
+
+	pid = fork();
+	if (pid < 0)
+		ksft_exit_fail_msg("%s test: Failed to create new process\n",
+				   test_name);
+
+	if (pid == 0)
+		_exit(EXIT_SUCCESS);
+
+	snprintf(buf, sizeof(buf), "/proc/%d", pid);
+
+	pidfd = open(buf, O_DIRECTORY | O_CLOEXEC);
+
+	(void)wait_for_pid(pid);
+
+	if (pidfd < 0)
+		ksft_exit_fail_msg(
+			"%s test: Failed to open process file descriptor\n",
+			test_name);
+
+	ret = sys_pidfd_send_signal(pidfd, 0, NULL, 0);
+	saved_errno = errno;
+	close(pidfd);
+	if (ret == 0)
+		ksft_exit_fail_msg(
+			"%s test: Managed to send signal to process even though it should have failed\n",
+			test_name);
+
+	if (saved_errno != ESRCH)
+		ksft_exit_fail_msg(
+			"%s test: Expected to receive ESRCH as errno value but received %d instead\n",
+			test_name, saved_errno);
+
+	ksft_test_result_pass("%s test: Failed to send signal as expected\n",
+			      test_name);
+	return 0;
+}
+
+/*
+ * Maximum number of cycles we allow. This is equivalent to PID_MAX_DEFAULT.
+ * If users set a higher limit or we have cycled PIDFD_MAX_DEFAULT number of
+ * times then we skip the test to not go into an infinite loop or block for a
+ * long time.
+ */
+#define PIDFD_MAX_DEFAULT 0x8000
+
+static int test_pidfd_send_signal_recycled_pid_fail(void)
+{
+	int i, ret;
+	pid_t pid1;
+	const char *test_name = "pidfd_send_signal signal recycled pid";
+
+	if (!have_pidfd_send_signal) {
+		ksft_test_result_skip(
+			"%s test: pidfd_send_signal() syscall not supported\n",
+			test_name);
+		return 0;
+	}
+
+	ret = unshare(CLONE_NEWPID);
+	if (ret < 0) {
+		if (errno == EPERM) {
+			ksft_test_result_skip("%s test: Unsharing pid namespace not permitted\n",
+					      test_name);
+			return 0;
+		}
+		ksft_exit_fail_msg("%s test: Failed to unshare pid namespace\n",
+				   test_name);
+	}
+
+	ret = unshare(CLONE_NEWNS);
+	if (ret < 0) {
+		if (errno == EPERM) {
+			ksft_test_result_skip("%s test: Unsharing mount namespace not permitted\n",
+					      test_name);
+			return 0;
+		}
+		ksft_exit_fail_msg("%s test: Failed to unshare mount namespace\n",
+				   test_name);
+	}
+
+	ret = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0);
+	if (ret < 0)
+		ksft_exit_fail_msg("%s test: Failed to remount / private\n",
+				   test_name);
+
+	/* pid 1 in new pid namespace */
+	pid1 = fork();
+	if (pid1 < 0)
+		ksft_exit_fail_msg("%s test: Failed to create new process\n",
+				   test_name);
+
+	if (pid1 == 0) {
+		char buf[256];
+		pid_t pid2;
+		int pidfd = -1;
+
+		(void)umount2("/proc", MNT_DETACH);
+		ret = mount("proc", "/proc", "proc", 0, NULL);
+		if (ret < 0)
+			_exit(PIDFD_ERROR);
+
+		/* grab pid PID_RECYCLE */
+		for (i = 0; i <= PIDFD_MAX_DEFAULT; i++) {
+			pid2 = fork();
+			if (pid2 < 0)
+				_exit(PIDFD_ERROR);
+
+			if (pid2 == 0)
+				_exit(PIDFD_PASS);
+
+			if (pid2 == PID_RECYCLE) {
+				snprintf(buf, sizeof(buf), "/proc/%d", pid2);
+				ksft_print_msg("pid to recycle is %d\n", pid2);
+				pidfd = open(buf, O_DIRECTORY | O_CLOEXEC);
+			}
+
+			if (wait_for_pid(pid2))
+				_exit(PIDFD_ERROR);
+
+			if (pid2 >= PID_RECYCLE)
+				break;
+		}
+
+		/*
+		 * We want to be as predictable as we can so if we haven't been
+		 * able to grab pid PID_RECYCLE skip the test.
+		 */
+		if (pid2 != PID_RECYCLE) {
+			/* skip test */
+			close(pidfd);
+			_exit(PIDFD_SKIP);
+		}
+
+		if (pidfd < 0)
+			_exit(PIDFD_ERROR);
+
+		for (i = 0; i <= PIDFD_MAX_DEFAULT; i++) {
+			char c;
+			int pipe_fds[2];
+			pid_t recycled_pid;
+			int child_ret = PIDFD_PASS;
+
+			ret = pipe2(pipe_fds, O_CLOEXEC);
+			if (ret < 0)
+				_exit(PIDFD_ERROR);
+
+			recycled_pid = fork();
+			if (recycled_pid < 0)
+				_exit(PIDFD_ERROR);
+
+			if (recycled_pid == 0) {
+				close(pipe_fds[1]);
+				(void)read(pipe_fds[0], &c, 1);
+				close(pipe_fds[0]);
+
+				_exit(PIDFD_PASS);
+			}
+
+			/*
+			 * Stop the child so we can inspect whether we have
+			 * recycled pid PID_RECYCLE.
+			 */
+			close(pipe_fds[0]);
+			ret = kill(recycled_pid, SIGSTOP);
+			close(pipe_fds[1]);
+			if (ret) {
+				(void)wait_for_pid(recycled_pid);
+				_exit(PIDFD_ERROR);
+			}
+
+			/*
+			 * We have recycled the pid. Try to signal it. This
+			 * needs to fail since this is a different process than
+			 * the one the pidfd refers to.
+			 */
+			if (recycled_pid == PID_RECYCLE) {
+				ret = sys_pidfd_send_signal(pidfd, SIGCONT,
+							    NULL, 0);
+				if (ret && errno == ESRCH)
+					child_ret = PIDFD_XFAIL;
+				else
+					child_ret = PIDFD_FAIL;
+			}
+
+			/* let the process move on */
+			ret = kill(recycled_pid, SIGCONT);
+			if (ret)
+				(void)kill(recycled_pid, SIGKILL);
+
+			if (wait_for_pid(recycled_pid))
+				_exit(PIDFD_ERROR);
+
+			switch (child_ret) {
+			case PIDFD_FAIL:
+				/* fallthrough */
+			case PIDFD_XFAIL:
+				_exit(child_ret);
+			case PIDFD_PASS:
+				break;
+			default:
+				/* not reached */
+				_exit(PIDFD_ERROR);
+			}
+
+			/*
+			 * If the user set a custom pid_max limit we could be
+			 * in the millions.
+			 * Skip the test in this case.
+			 */
+			if (recycled_pid > PIDFD_MAX_DEFAULT)
+				_exit(PIDFD_SKIP);
+		}
+
+		/* failed to recycle pid */
+		_exit(PIDFD_SKIP);
+	}
+
+	ret = wait_for_pid(pid1);
+	switch (ret) {
+	case PIDFD_FAIL:
+		ksft_exit_fail_msg(
+			"%s test: Managed to signal recycled pid %d\n",
+			test_name, PID_RECYCLE);
+	case PIDFD_PASS:
+		ksft_exit_fail_msg("%s test: Failed to recycle pid %d\n",
+				   test_name, PID_RECYCLE);
+	case PIDFD_SKIP:
+		ksft_test_result_skip("%s test: Skipping test\n", test_name);
+		ret = 0;
+		break;
+	case PIDFD_XFAIL:
+		ksft_test_result_pass(
+			"%s test: Failed to signal recycled pid as expected\n",
+			test_name);
+		ret = 0;
+		break;
+	default /* PIDFD_ERROR */:
+		ksft_exit_fail_msg("%s test: Error while running tests\n",
+				   test_name);
+	}
+
+	return ret;
+}
+
+static int test_pidfd_send_signal_syscall_support(void)
+{
+	int pidfd, ret;
+	const char *test_name = "pidfd_send_signal check for support";
+
+	pidfd = open("/proc/self", O_DIRECTORY | O_CLOEXEC);
+	if (pidfd < 0)
+		ksft_exit_fail_msg(
+			"%s test: Failed to open process file descriptor\n",
+			test_name);
+
+	ret = sys_pidfd_send_signal(pidfd, 0, NULL, 0);
+	if (ret < 0) {
+		if (errno == ENOSYS) {
+			ksft_test_result_skip(
+				"%s test: pidfd_send_signal() syscall not supported\n",
+				test_name);
+			return 0;
+		}
+		ksft_exit_fail_msg("%s test: Failed to send signal\n",
+				   test_name);
+	}
+
+	have_pidfd_send_signal = true;
+	close(pidfd);
+	ksft_test_result_pass(
+		"%s test: pidfd_send_signal() syscall is supported. Tests can be executed\n",
+		test_name);
+	return 0;
+}
+
+static void *test_pidfd_poll_exec_thread(void *priv)
+{
+	ksft_print_msg("Child Thread: starting. pid %d tid %ld ; and sleeping\n",
+			getpid(), syscall(SYS_gettid));
+	ksft_print_msg("Child Thread: doing exec of sleep\n");
+
+	execl("/bin/sleep", "sleep", str(CHILD_THREAD_MIN_WAIT), (char *)NULL);
+
+	ksft_print_msg("Child Thread: DONE. pid %d tid %ld\n",
+			getpid(), syscall(SYS_gettid));
+	return NULL;
+}
+
+static void poll_pidfd(const char *test_name, int pidfd)
+{
+	int c;
+	int epoll_fd = epoll_create1(EPOLL_CLOEXEC);
+	struct epoll_event event, events[MAX_EVENTS];
+
+	if (epoll_fd == -1)
+		ksft_exit_fail_msg("%s test: Failed to create epoll file descriptor "
+				   "(errno %d)\n",
+				   test_name, errno);
+
+	event.events = EPOLLIN;
+	event.data.fd = pidfd;
+
+	if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pidfd, &event)) {
+		ksft_exit_fail_msg("%s test: Failed to add epoll file descriptor "
+				   "(errno %d)\n",
+				   test_name, errno);
+	}
+
+	c = epoll_wait(epoll_fd, events, MAX_EVENTS, 5000);
+	if (c != 1 || !(events[0].events & EPOLLIN))
+		ksft_exit_fail_msg("%s test: Unexpected epoll_wait result (c=%d, events=%x) ",
+				   "(errno %d)\n",
+				   test_name, c, events[0].events, errno);
+
+	close(epoll_fd);
+	return;
+
+}
+
+static int child_poll_exec_test(void *args)
+{
+	pthread_t t1;
+
+	ksft_print_msg("Child (pidfd): starting. pid %d tid %ld\n", getpid(),
+			syscall(SYS_gettid));
+	pthread_create(&t1, NULL, test_pidfd_poll_exec_thread, NULL);
+	/*
+	 * Exec in the non-leader thread will destroy the leader immediately.
+	 * If the wait in the parent returns too soon, the test fails.
+	 */
+	while (1)
+		sleep(1);
+}
+
+static void test_pidfd_poll_exec(int use_waitpid)
+{
+	int pid, pidfd = 0;
+	int status, ret;
+	time_t prog_start = time(NULL);
+	const char *test_name = "pidfd_poll check for premature notification on child thread exec";
+
+	ksft_print_msg("Parent: pid: %d\n", getpid());
+	pid = pidfd_clone(CLONE_PIDFD, &pidfd, child_poll_exec_test);
+	if (pid < 0)
+		ksft_exit_fail_msg("%s test: pidfd_clone failed (ret %d, errno %d)\n",
+				   test_name, pid, errno);
+
+	ksft_print_msg("Parent: Waiting for Child (%d) to complete.\n", pid);
+
+	if (use_waitpid) {
+		ret = waitpid(pid, &status, 0);
+		if (ret == -1)
+			ksft_print_msg("Parent: error\n");
+
+		if (ret == pid)
+			ksft_print_msg("Parent: Child process waited for.\n");
+	} else {
+		poll_pidfd(test_name, pidfd);
+	}
+
+	time_t prog_time = time(NULL) - prog_start;
+
+	ksft_print_msg("Time waited for child: %lu\n", prog_time);
+
+	close(pidfd);
+
+	if (prog_time < CHILD_THREAD_MIN_WAIT || prog_time > CHILD_THREAD_MIN_WAIT + 2)
+		ksft_exit_fail_msg("%s test: Failed\n", test_name);
+	else
+		ksft_test_result_pass("%s test: Passed\n", test_name);
+}
+
+static void *test_pidfd_poll_leader_exit_thread(void *priv)
+{
+	ksft_print_msg("Child Thread: starting. pid %d tid %ld ; and sleeping\n",
+			getpid(), syscall(SYS_gettid));
+	sleep(CHILD_THREAD_MIN_WAIT);
+	ksft_print_msg("Child Thread: DONE. pid %d tid %ld\n", getpid(), syscall(SYS_gettid));
+	return NULL;
+}
+
+static time_t *child_exit_secs;
+static int child_poll_leader_exit_test(void *args)
+{
+	pthread_t t1, t2;
+
+	ksft_print_msg("Child: starting. pid %d tid %ld\n", getpid(), syscall(SYS_gettid));
+	pthread_create(&t1, NULL, test_pidfd_poll_leader_exit_thread, NULL);
+	pthread_create(&t2, NULL, test_pidfd_poll_leader_exit_thread, NULL);
+
+	/*
+	 * glibc exit calls exit_group syscall, so explicity call exit only
+	 * so that only the group leader exits, leaving the threads alone.
+	 */
+	*child_exit_secs = time(NULL);
+	syscall(SYS_exit, 0);
+	/* Never reached, but appeases compiler thinking we should return. */
+	exit(0);
+}
+
+static void test_pidfd_poll_leader_exit(int use_waitpid)
+{
+	int pid, pidfd = 0;
+	int status, ret = 0;
+	const char *test_name = "pidfd_poll check for premature notification on non-empty"
+				"group leader exit";
+
+	child_exit_secs = mmap(NULL, sizeof *child_exit_secs, PROT_READ | PROT_WRITE,
+			MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+
+	if (child_exit_secs == MAP_FAILED)
+		ksft_exit_fail_msg("%s test: mmap failed (errno %d)\n",
+				   test_name, errno);
+
+	ksft_print_msg("Parent: pid: %d\n", getpid());
+	pid = pidfd_clone(CLONE_PIDFD, &pidfd, child_poll_leader_exit_test);
+	if (pid < 0)
+		ksft_exit_fail_msg("%s test: pidfd_clone failed (ret %d, errno %d)\n",
+				   test_name, pid, errno);
+
+	ksft_print_msg("Parent: Waiting for Child (%d) to complete.\n", pid);
+
+	if (use_waitpid) {
+		ret = waitpid(pid, &status, 0);
+		if (ret == -1)
+			ksft_print_msg("Parent: error\n");
+	} else {
+		/*
+		 * This sleep tests for the case where if the child exits, and is in
+		 * EXIT_ZOMBIE, but the thread group leader is non-empty, then the poll
+		 * doesn't prematurely return even though there are active threads
+		 */
+		sleep(1);
+		poll_pidfd(test_name, pidfd);
+	}
+
+	if (ret == pid)
+		ksft_print_msg("Parent: Child process waited for.\n");
+
+	time_t since_child_exit = time(NULL) - *child_exit_secs;
+
+	ksft_print_msg("Time since child exit: %lu\n", since_child_exit);
+
+	close(pidfd);
+
+	if (since_child_exit < CHILD_THREAD_MIN_WAIT ||
+			since_child_exit > CHILD_THREAD_MIN_WAIT + 2)
+		ksft_exit_fail_msg("%s test: Failed\n", test_name);
+	else
+		ksft_test_result_pass("%s test: Passed\n", test_name);
+}
+
+int main(int argc, char **argv)
+{
+	ksft_print_header();
+	ksft_set_plan(8);
+
+	test_pidfd_poll_exec(0);
+	test_pidfd_poll_exec(1);
+	test_pidfd_poll_leader_exit(0);
+	test_pidfd_poll_leader_exit(1);
+	test_pidfd_send_signal_syscall_support();
+	test_pidfd_send_signal_simple_success();
+	test_pidfd_send_signal_exited_fail();
+	test_pidfd_send_signal_recycled_pid_fail();
+
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/pidfd/pidfd_wait.c b/tools/testing/selftests/pidfd/pidfd_wait.c
new file mode 100644
index 000000000..17999e082
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_wait.c
@@ -0,0 +1,224 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sched.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "pidfd.h"
+#include "../kselftest_harness.h"
+
+#define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr)))
+
+/* Attempt to de-conflict with the selftests tree. */
+#ifndef SKIP
+#define SKIP(s, ...)	XFAIL(s, ##__VA_ARGS__)
+#endif
+
+static pid_t sys_clone3(struct clone_args *args)
+{
+	return syscall(__NR_clone3, args, sizeof(struct clone_args));
+}
+
+static int sys_waitid(int which, pid_t pid, siginfo_t *info, int options,
+		      struct rusage *ru)
+{
+	return syscall(__NR_waitid, which, pid, info, options, ru);
+}
+
+TEST(wait_simple)
+{
+	int pidfd = -1;
+	pid_t parent_tid = -1;
+	struct clone_args args = {
+		.parent_tid = ptr_to_u64(&parent_tid),
+		.pidfd = ptr_to_u64(&pidfd),
+		.flags = CLONE_PIDFD | CLONE_PARENT_SETTID,
+		.exit_signal = SIGCHLD,
+	};
+	pid_t pid;
+	siginfo_t info = {
+		.si_signo = 0,
+	};
+
+	pidfd = open("/proc/self", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+	ASSERT_GE(pidfd, 0);
+
+	pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+	ASSERT_NE(pid, 0);
+	EXPECT_EQ(close(pidfd), 0);
+	pidfd = -1;
+
+	pidfd = open("/dev/null", O_RDONLY | O_CLOEXEC);
+	ASSERT_GE(pidfd, 0);
+
+	pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+	ASSERT_NE(pid, 0);
+	EXPECT_EQ(close(pidfd), 0);
+	pidfd = -1;
+
+	pid = sys_clone3(&args);
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0)
+		exit(EXIT_SUCCESS);
+
+	pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+	ASSERT_GE(pid, 0);
+	ASSERT_EQ(WIFEXITED(info.si_status), true);
+	ASSERT_EQ(WEXITSTATUS(info.si_status), 0);
+	EXPECT_EQ(close(pidfd), 0);
+
+	ASSERT_EQ(info.si_signo, SIGCHLD);
+	ASSERT_EQ(info.si_code, CLD_EXITED);
+	ASSERT_EQ(info.si_pid, parent_tid);
+}
+
+TEST(wait_states)
+{
+	int pidfd = -1;
+	pid_t parent_tid = -1;
+	struct clone_args args = {
+		.parent_tid = ptr_to_u64(&parent_tid),
+		.pidfd = ptr_to_u64(&pidfd),
+		.flags = CLONE_PIDFD | CLONE_PARENT_SETTID,
+		.exit_signal = SIGCHLD,
+	};
+	int ret;
+	pid_t pid;
+	siginfo_t info = {
+		.si_signo = 0,
+	};
+
+	pid = sys_clone3(&args);
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		kill(getpid(), SIGSTOP);
+		kill(getpid(), SIGSTOP);
+		exit(EXIT_SUCCESS);
+	}
+
+	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED, NULL), 0);
+	ASSERT_EQ(info.si_signo, SIGCHLD);
+	ASSERT_EQ(info.si_code, CLD_STOPPED);
+	ASSERT_EQ(info.si_pid, parent_tid);
+
+	ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGCONT, NULL, 0), 0);
+
+	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WCONTINUED, NULL), 0);
+	ASSERT_EQ(info.si_signo, SIGCHLD);
+	ASSERT_EQ(info.si_code, CLD_CONTINUED);
+	ASSERT_EQ(info.si_pid, parent_tid);
+
+	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WUNTRACED, NULL), 0);
+	ASSERT_EQ(info.si_signo, SIGCHLD);
+	ASSERT_EQ(info.si_code, CLD_STOPPED);
+	ASSERT_EQ(info.si_pid, parent_tid);
+
+	ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0), 0);
+
+	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL), 0);
+	ASSERT_EQ(info.si_signo, SIGCHLD);
+	ASSERT_EQ(info.si_code, CLD_KILLED);
+	ASSERT_EQ(info.si_pid, parent_tid);
+
+	EXPECT_EQ(close(pidfd), 0);
+}
+
+TEST(wait_nonblock)
+{
+	int pidfd, status = 0;
+	unsigned int flags = 0;
+	pid_t parent_tid = -1;
+	struct clone_args args = {
+		.parent_tid = ptr_to_u64(&parent_tid),
+		.flags = CLONE_PARENT_SETTID,
+		.exit_signal = SIGCHLD,
+	};
+	int ret;
+	pid_t pid;
+	siginfo_t info = {
+		.si_signo = 0,
+	};
+
+	/*
+	 * Callers need to see ECHILD with non-blocking pidfds when no child
+	 * processes exists.
+	 */
+	pidfd = sys_pidfd_open(getpid(), PIDFD_NONBLOCK);
+	EXPECT_GE(pidfd, 0) {
+		/* pidfd_open() doesn't support PIDFD_NONBLOCK. */
+		ASSERT_EQ(errno, EINVAL);
+		SKIP(return, "Skipping PIDFD_NONBLOCK test");
+	}
+
+	ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+	ASSERT_LT(ret, 0);
+	ASSERT_EQ(errno, ECHILD);
+	EXPECT_EQ(close(pidfd), 0);
+
+	pid = sys_clone3(&args);
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		kill(getpid(), SIGSTOP);
+		exit(EXIT_SUCCESS);
+	}
+
+	pidfd = sys_pidfd_open(pid, PIDFD_NONBLOCK);
+	EXPECT_GE(pidfd, 0) {
+		/* pidfd_open() doesn't support PIDFD_NONBLOCK. */
+		ASSERT_EQ(errno, EINVAL);
+		SKIP(return, "Skipping PIDFD_NONBLOCK test");
+	}
+
+	flags = fcntl(pidfd, F_GETFL, 0);
+	ASSERT_GT(flags, 0);
+	ASSERT_GT((flags & O_NONBLOCK), 0);
+
+	/*
+	 * Callers need to see EAGAIN/EWOULDBLOCK with non-blocking pidfd when
+	 * child processes exist but none have exited.
+	 */
+	ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+	ASSERT_LT(ret, 0);
+	ASSERT_EQ(errno, EAGAIN);
+
+	/*
+	 * Callers need to continue seeing 0 with non-blocking pidfd and
+	 * WNOHANG raised explicitly when child processes exist but none have
+	 * exited.
+	 */
+	ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED | WNOHANG, NULL);
+	ASSERT_EQ(ret, 0);
+
+	ASSERT_EQ(fcntl(pidfd, F_SETFL, (flags & ~O_NONBLOCK)), 0);
+
+	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED, NULL), 0);
+	ASSERT_EQ(info.si_signo, SIGCHLD);
+	ASSERT_EQ(info.si_code, CLD_STOPPED);
+	ASSERT_EQ(info.si_pid, parent_tid);
+
+	ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGCONT, NULL, 0), 0);
+
+	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL), 0);
+	ASSERT_EQ(info.si_signo, SIGCHLD);
+	ASSERT_EQ(info.si_code, CLD_EXITED);
+	ASSERT_EQ(info.si_pid, parent_tid);
+
+	EXPECT_EQ(close(pidfd), 0);
+}
+
+TEST_HARNESS_MAIN