8 files changed, 1113 insertions, 0 deletions
diff --git a/samples/seccomp/.gitignore b/samples/seccomp/.gitignore
new file mode 100644
index 000000000..4a5a5b7db
--- /dev/null
+++ b/samples/seccomp/.gitignore
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+bpf-direct
+bpf-fancy
+dropper
+user-trap
diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile
new file mode 100644
index 000000000..c85ae0ed8
--- /dev/null
+++ b/samples/seccomp/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+userprogs-always-y += bpf-fancy dropper bpf-direct user-trap
+
+bpf-fancy-objs := bpf-fancy.o bpf-helper.o
+
+userccflags += -I usr/include
diff --git a/samples/seccomp/bpf-direct.c b/samples/seccomp/bpf-direct.c
new file mode 100644
index 000000000..c09e4a17a
--- /dev/null
+++ b/samples/seccomp/bpf-direct.c
@@ -0,0 +1,191 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Seccomp filter example for x86 (32-bit and 64-bit) with BPF macros
+ *
+ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * Author: Will Drewry <wad@chromium.org>
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications using prctl(PR_SET_SECCOMP, 2, ...).
+ */
+#if defined(__i386__) || defined(__x86_64__)
+#define SUPPORTED_ARCH 1
+#endif
+
+#if defined(SUPPORTED_ARCH)
+#define __USE_GNU 1
+#define _GNU_SOURCE 1
+
+#include <linux/types.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <linux/unistd.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <string.h>
+#include <sys/prctl.h>
+#include <unistd.h>
+
+#define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
+#define syscall_nr (offsetof(struct seccomp_data, nr))
+
+#if defined(__i386__)
+#define REG_RESULT	REG_EAX
+#define REG_SYSCALL	REG_EAX
+#define REG_ARG0	REG_EBX
+#define REG_ARG1	REG_ECX
+#define REG_ARG2	REG_EDX
+#define REG_ARG3	REG_ESI
+#define REG_ARG4	REG_EDI
+#define REG_ARG5	REG_EBP
+#elif defined(__x86_64__)
+#define REG_RESULT	REG_RAX
+#define REG_SYSCALL	REG_RAX
+#define REG_ARG0	REG_RDI
+#define REG_ARG1	REG_RSI
+#define REG_ARG2	REG_RDX
+#define REG_ARG3	REG_R10
+#define REG_ARG4	REG_R8
+#define REG_ARG5	REG_R9
+#endif
+
+#ifndef PR_SET_NO_NEW_PRIVS
+#define PR_SET_NO_NEW_PRIVS 38
+#endif
+
+#ifndef SYS_SECCOMP
+#define SYS_SECCOMP 1
+#endif
+
+static void emulator(int nr, siginfo_t *info, void *void_context)
+{
+	ucontext_t *ctx = (ucontext_t *)(void_context);
+	int syscall;
+	char *buf;
+	ssize_t bytes;
+	size_t len;
+	if (info->si_code != SYS_SECCOMP)
+		return;
+	if (!ctx)
+		return;
+	syscall = ctx->uc_mcontext.gregs[REG_SYSCALL];
+	buf = (char *) ctx->uc_mcontext.gregs[REG_ARG1];
+	len = (size_t) ctx->uc_mcontext.gregs[REG_ARG2];
+
+	if (syscall != __NR_write)
+		return;
+	if (ctx->uc_mcontext.gregs[REG_ARG0] != STDERR_FILENO)
+		return;
+	/* Redirect stderr messages to stdout. Doesn't handle EINTR, etc */
+	ctx->uc_mcontext.gregs[REG_RESULT] = -1;
+	if (write(STDOUT_FILENO, "[ERR] ", 6) > 0) {
+		bytes = write(STDOUT_FILENO, buf, len);
+		ctx->uc_mcontext.gregs[REG_RESULT] = bytes;
+	}
+	return;
+}
+
+static int install_emulator(void)
+{
+	struct sigaction act;
+	sigset_t mask;
+	memset(&act, 0, sizeof(act));
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGSYS);
+
+	act.sa_sigaction = &emulator;
+	act.sa_flags = SA_SIGINFO;
+	if (sigaction(SIGSYS, &act, NULL) < 0) {
+		perror("sigaction");
+		return -1;
+	}
+	if (sigprocmask(SIG_UNBLOCK, &mask, NULL)) {
+		perror("sigprocmask");
+		return -1;
+	}
+	return 0;
+}
+
+static int install_filter(void)
+{
+	struct sock_filter filter[] = {
+		/* Grab the system call number */
+		BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_nr),
+		/* Jump table for the allowed syscalls */
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_rt_sigreturn, 0, 1),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+#ifdef __NR_sigreturn
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_sigreturn, 0, 1),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+#endif
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit_group, 0, 1),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit, 0, 1),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_read, 1, 0),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_write, 3, 2),
+
+		/* Check that read is only using stdin. */
+		BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDIN_FILENO, 4, 0),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
+
+		/* Check that write is only using stdout */
+		BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDOUT_FILENO, 1, 0),
+		/* Trap attempts to write to stderr */
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDERR_FILENO, 1, 2),
+
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_TRAP),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
+	};
+	struct sock_fprog prog = {
+		.len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
+		.filter = filter,
+	};
+
+	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+		perror("prctl(NO_NEW_PRIVS)");
+		return 1;
+	}
+
+
+	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
+		perror("prctl");
+		return 1;
+	}
+	return 0;
+}
+
+#define payload(_c) (_c), sizeof((_c))
+int main(int argc, char **argv)
+{
+	char buf[4096];
+	ssize_t bytes = 0;
+	if (install_emulator())
+		return 1;
+	if (install_filter())
+		return 1;
+	syscall(__NR_write, STDOUT_FILENO,
+		payload("OHAI! WHAT IS YOUR NAME? "));
+	bytes = syscall(__NR_read, STDIN_FILENO, buf, sizeof(buf));
+	syscall(__NR_write, STDOUT_FILENO, payload("HELLO, "));
+	syscall(__NR_write, STDOUT_FILENO, buf, bytes);
+	syscall(__NR_write, STDERR_FILENO,
+		payload("Error message going to STDERR\n"));
+	return 0;
+}
+#else	/* SUPPORTED_ARCH */
+/*
+ * This sample is x86-only.  Since kernel samples are compiled with the
+ * host toolchain, a non-x86 host will result in using only the main()
+ * below.
+ */
+int main(void)
+{
+	return 1;
+}
+#endif	/* SUPPORTED_ARCH */
diff --git a/samples/seccomp/bpf-fancy.c b/samples/seccomp/bpf-fancy.c
new file mode 100644
index 000000000..1ccb43502
--- /dev/null
+++ b/samples/seccomp/bpf-fancy.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Seccomp BPF example using a macro-based generator.
+ *
+ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * Author: Will Drewry <wad@chromium.org>
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications using prctl(PR_ATTACH_SECCOMP_FILTER).
+ */
+
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <linux/unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/prctl.h>
+#include <unistd.h>
+
+#include "bpf-helper.h"
+
+#ifndef PR_SET_NO_NEW_PRIVS
+#define PR_SET_NO_NEW_PRIVS 38
+#endif
+
+int main(int argc, char **argv)
+{
+	struct bpf_labels l = {
+		.count = 0,
+	};
+	static const char msg1[] = "Please type something: ";
+	static const char msg2[] = "You typed: ";
+	char buf[256];
+	struct sock_filter filter[] = {
+		/* TODO: LOAD_SYSCALL_NR(arch) and enforce an arch */
+		LOAD_SYSCALL_NR,
+		SYSCALL(__NR_exit, ALLOW),
+		SYSCALL(__NR_exit_group, ALLOW),
+		SYSCALL(__NR_write, JUMP(&l, write_fd)),
+		SYSCALL(__NR_read, JUMP(&l, read)),
+		DENY,  /* Don't passthrough into a label */
+
+		LABEL(&l, read),
+		ARG(0),
+		JNE(STDIN_FILENO, DENY),
+		ARG(1),
+		JNE((unsigned long)buf, DENY),
+		ARG(2),
+		JGE(sizeof(buf), DENY),
+		ALLOW,
+
+		LABEL(&l, write_fd),
+		ARG(0),
+		JEQ(STDOUT_FILENO, JUMP(&l, write_buf)),
+		JEQ(STDERR_FILENO, JUMP(&l, write_buf)),
+		DENY,
+
+		LABEL(&l, write_buf),
+		ARG(1),
+		JEQ((unsigned long)msg1, JUMP(&l, msg1_len)),
+		JEQ((unsigned long)msg2, JUMP(&l, msg2_len)),
+		JEQ((unsigned long)buf, JUMP(&l, buf_len)),
+		DENY,
+
+		LABEL(&l, msg1_len),
+		ARG(2),
+		JLT(sizeof(msg1), ALLOW),
+		DENY,
+
+		LABEL(&l, msg2_len),
+		ARG(2),
+		JLT(sizeof(msg2), ALLOW),
+		DENY,
+
+		LABEL(&l, buf_len),
+		ARG(2),
+		JLT(sizeof(buf), ALLOW),
+		DENY,
+	};
+	struct sock_fprog prog = {
+		.filter = filter,
+		.len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
+	};
+	ssize_t bytes;
+	bpf_resolve_jumps(&l, filter, sizeof(filter)/sizeof(*filter));
+
+	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+		perror("prctl(NO_NEW_PRIVS)");
+		return 1;
+	}
+
+	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
+		perror("prctl(SECCOMP)");
+		return 1;
+	}
+	syscall(__NR_write, STDOUT_FILENO, msg1, strlen(msg1));
+	bytes = syscall(__NR_read, STDIN_FILENO, buf, sizeof(buf)-1);
+	bytes = (bytes > 0 ? bytes : 0);
+	syscall(__NR_write, STDERR_FILENO, msg2, strlen(msg2));
+	syscall(__NR_write, STDERR_FILENO, buf, bytes);
+	/* Now get killed */
+	syscall(__NR_write, STDERR_FILENO, msg2, strlen(msg2)+2);
+	return 0;
+}
diff --git a/samples/seccomp/bpf-helper.c b/samples/seccomp/bpf-helper.c
new file mode 100644
index 000000000..ae260d77a
--- /dev/null
+++ b/samples/seccomp/bpf-helper.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Seccomp BPF helper functions
+ *
+ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * Author: Will Drewry <wad@chromium.org>
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications using prctl(PR_ATTACH_SECCOMP_FILTER).
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "bpf-helper.h"
+
+int bpf_resolve_jumps(struct bpf_labels *labels,
+		      struct sock_filter *filter, size_t count)
+{
+	size_t i;
+
+	if (count < 1 || count > BPF_MAXINSNS)
+		return -1;
+	/*
+	* Walk it once, backwards, to build the label table and do fixups.
+	* Since backward jumps are disallowed by BPF, this is easy.
+	*/
+	for (i = 0; i < count; ++i) {
+		size_t offset = count - i - 1;
+		struct sock_filter *instr = &filter[offset];
+		if (instr->code != (BPF_JMP+BPF_JA))
+			continue;
+		switch ((instr->jt<<8)|instr->jf) {
+		case (JUMP_JT<<8)|JUMP_JF:
+			if (labels->labels[instr->k].location == 0xffffffff) {
+				fprintf(stderr, "Unresolved label: '%s'\n",
+					labels->labels[instr->k].label);
+				return 1;
+			}
+			instr->k = labels->labels[instr->k].location -
+				    (offset + 1);
+			instr->jt = 0;
+			instr->jf = 0;
+			continue;
+		case (LABEL_JT<<8)|LABEL_JF:
+			if (labels->labels[instr->k].location != 0xffffffff) {
+				fprintf(stderr, "Duplicate label use: '%s'\n",
+					labels->labels[instr->k].label);
+				return 1;
+			}
+			labels->labels[instr->k].location = offset;
+			instr->k = 0; /* fall through */
+			instr->jt = 0;
+			instr->jf = 0;
+			continue;
+		}
+	}
+	return 0;
+}
+
+/* Simple lookup table for labels. */
+__u32 seccomp_bpf_label(struct bpf_labels *labels, const char *label)
+{
+	struct __bpf_label *begin = labels->labels, *end;
+	int id;
+
+	if (labels->count == BPF_LABELS_MAX) {
+		fprintf(stderr, "Too many labels\n");
+		exit(1);
+	}
+	if (labels->count == 0) {
+		begin->label = label;
+		begin->location = 0xffffffff;
+		labels->count++;
+		return 0;
+	}
+	end = begin + labels->count;
+	for (id = 0; begin < end; ++begin, ++id) {
+		if (!strcmp(label, begin->label))
+			return id;
+	}
+	begin->label = label;
+	begin->location = 0xffffffff;
+	labels->count++;
+	return id;
+}
+
+void seccomp_bpf_print(struct sock_filter *filter, size_t count)
+{
+	struct sock_filter *end = filter + count;
+	for ( ; filter < end; ++filter)
+		printf("{ code=%u,jt=%u,jf=%u,k=%u },\n",
+			filter->code, filter->jt, filter->jf, filter->k);
+}
diff --git a/samples/seccomp/bpf-helper.h b/samples/seccomp/bpf-helper.h
new file mode 100644
index 000000000..0cc9816fe
--- /dev/null
+++ b/samples/seccomp/bpf-helper.h
@@ -0,0 +1,263 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Example wrapper around BPF macros.
+ *
+ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * Author: Will Drewry <wad@chromium.org>
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications using prctl(PR_SET_SECCOMP, 2, ...).
+ *
+ * No guarantees are provided with respect to the correctness
+ * or functionality of this code.
+ */
+#ifndef __BPF_HELPER_H__
+#define __BPF_HELPER_H__
+
+#include <asm/bitsperlong.h>	/* for __BITS_PER_LONG */
+#include <endian.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>	/* for seccomp_data */
+#include <linux/types.h>
+#include <linux/unistd.h>
+#include <stddef.h>
+
+#define BPF_LABELS_MAX 256
+struct bpf_labels {
+	int count;
+	struct __bpf_label {
+		const char *label;
+		__u32 location;
+	} labels[BPF_LABELS_MAX];
+};
+
+int bpf_resolve_jumps(struct bpf_labels *labels,
+		      struct sock_filter *filter, size_t count);
+__u32 seccomp_bpf_label(struct bpf_labels *labels, const char *label);
+void seccomp_bpf_print(struct sock_filter *filter, size_t count);
+
+#define JUMP_JT 0xff
+#define JUMP_JF 0xff
+#define LABEL_JT 0xfe
+#define LABEL_JF 0xfe
+
+#define ALLOW \
+	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
+#define DENY \
+	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL)
+#define JUMP(labels, label) \
+	BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \
+		 JUMP_JT, JUMP_JF)
+#define LABEL(labels, label) \
+	BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \
+		 LABEL_JT, LABEL_JF)
+#define SYSCALL(nr, jt) \
+	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (nr), 0, 1), \
+	jt
+
+/* Lame, but just an example */
+#define FIND_LABEL(labels, label) seccomp_bpf_label((labels), #label)
+
+#define EXPAND(...) __VA_ARGS__
+
+/* Ensure that we load the logically correct offset. */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)])
+#elif __BYTE_ORDER == __BIG_ENDIAN
+#define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32)
+#else
+#error "Unknown endianness"
+#endif
+
+/* Map all width-sensitive operations */
+#if __BITS_PER_LONG == 32
+
+#define JEQ(x, jt) JEQ32(x, EXPAND(jt))
+#define JNE(x, jt) JNE32(x, EXPAND(jt))
+#define JGT(x, jt) JGT32(x, EXPAND(jt))
+#define JLT(x, jt) JLT32(x, EXPAND(jt))
+#define JGE(x, jt) JGE32(x, EXPAND(jt))
+#define JLE(x, jt) JLE32(x, EXPAND(jt))
+#define JA(x, jt) JA32(x, EXPAND(jt))
+#define ARG(i) ARG_32(i)
+
+#elif __BITS_PER_LONG == 64
+
+/* Ensure that we load the logically correct offset. */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define ENDIAN(_lo, _hi) _lo, _hi
+#define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32)
+#elif __BYTE_ORDER == __BIG_ENDIAN
+#define ENDIAN(_lo, _hi) _hi, _lo
+#define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)])
+#endif
+
+union arg64 {
+	struct {
+		__u32 ENDIAN(lo32, hi32);
+	};
+	__u64 u64;
+};
+
+#define JEQ(x, jt) \
+	JEQ64(((union arg64){.u64 = (x)}).lo32, \
+	      ((union arg64){.u64 = (x)}).hi32, \
+	      EXPAND(jt))
+#define JGT(x, jt) \
+	JGT64(((union arg64){.u64 = (x)}).lo32, \
+	      ((union arg64){.u64 = (x)}).hi32, \
+	      EXPAND(jt))
+#define JGE(x, jt) \
+	JGE64(((union arg64){.u64 = (x)}).lo32, \
+	      ((union arg64){.u64 = (x)}).hi32, \
+	      EXPAND(jt))
+#define JNE(x, jt) \
+	JNE64(((union arg64){.u64 = (x)}).lo32, \
+	      ((union arg64){.u64 = (x)}).hi32, \
+	      EXPAND(jt))
+#define JLT(x, jt) \
+	JLT64(((union arg64){.u64 = (x)}).lo32, \
+	      ((union arg64){.u64 = (x)}).hi32, \
+	      EXPAND(jt))
+#define JLE(x, jt) \
+	JLE64(((union arg64){.u64 = (x)}).lo32, \
+	      ((union arg64){.u64 = (x)}).hi32, \
+	      EXPAND(jt))
+
+#define JA(x, jt) \
+	JA64(((union arg64){.u64 = (x)}).lo32, \
+	       ((union arg64){.u64 = (x)}).hi32, \
+	       EXPAND(jt))
+#define ARG(i) ARG_64(i)
+
+#else
+#error __BITS_PER_LONG value unusable.
+#endif
+
+/* Loads the arg into A */
+#define ARG_32(idx) \
+	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, LO_ARG(idx))
+
+/* Loads lo into M[0] and hi into M[1] and A */
+#define ARG_64(idx) \
+	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, LO_ARG(idx)), \
+	BPF_STMT(BPF_ST, 0), /* lo -> M[0] */ \
+	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, HI_ARG(idx)), \
+	BPF_STMT(BPF_ST, 1) /* hi -> M[1] */
+
+#define JEQ32(value, jt) \
+	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (value), 0, 1), \
+	jt
+
+#define JNE32(value, jt) \
+	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (value), 1, 0), \
+	jt
+
+#define JA32(value, jt) \
+	BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (value), 0, 1), \
+	jt
+
+#define JGE32(value, jt) \
+	BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (value), 0, 1), \
+	jt
+
+#define JGT32(value, jt) \
+	BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (value), 0, 1), \
+	jt
+
+#define JLE32(value, jt) \
+	BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (value), 1, 0), \
+	jt
+
+#define JLT32(value, jt) \
+	BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (value), 1, 0), \
+	jt
+
+/*
+ * All the JXX64 checks assume lo is saved in M[0] and hi is saved in both
+ * A and M[1]. This invariant is kept by restoring A if necessary.
+ */
+#define JEQ64(lo, hi, jt) \
+	/* if (hi != arg.hi) goto NOMATCH; */ \
+	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+	BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+	/* if (lo != arg.lo) goto NOMATCH; */ \
+	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (lo), 0, 2), \
+	BPF_STMT(BPF_LD+BPF_MEM, 1), \
+	jt, \
+	BPF_STMT(BPF_LD+BPF_MEM, 1)
+
+#define JNE64(lo, hi, jt) \
+	/* if (hi != arg.hi) goto MATCH; */ \
+	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 3), \
+	BPF_STMT(BPF_LD+BPF_MEM, 0), \
+	/* if (lo != arg.lo) goto MATCH; */ \
+	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (lo), 2, 0), \
+	BPF_STMT(BPF_LD+BPF_MEM, 1), \
+	jt, \
+	BPF_STMT(BPF_LD+BPF_MEM, 1)
+
+#define JA64(lo, hi, jt) \
+	/* if (hi & arg.hi) goto MATCH; */ \
+	BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (hi), 3, 0), \
+	BPF_STMT(BPF_LD+BPF_MEM, 0), \
+	/* if (lo & arg.lo) goto MATCH; */ \
+	BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (lo), 0, 2), \
+	BPF_STMT(BPF_LD+BPF_MEM, 1), \
+	jt, \
+	BPF_STMT(BPF_LD+BPF_MEM, 1)
+
+#define JGE64(lo, hi, jt) \
+	/* if (hi > arg.hi) goto MATCH; */ \
+	BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 4, 0), \
+	/* if (hi != arg.hi) goto NOMATCH; */ \
+	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+	BPF_STMT(BPF_LD+BPF_MEM, 0), \
+	/* if (lo >= arg.lo) goto MATCH; */ \
+	BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (lo), 0, 2), \
+	BPF_STMT(BPF_LD+BPF_MEM, 1), \
+	jt, \
+	BPF_STMT(BPF_LD+BPF_MEM, 1)
+
+#define JGT64(lo, hi, jt) \
+	/* if (hi > arg.hi) goto MATCH; */ \
+	BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 4, 0), \
+	/* if (hi != arg.hi) goto NOMATCH; */ \
+	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+	BPF_STMT(BPF_LD+BPF_MEM, 0), \
+	/* if (lo > arg.lo) goto MATCH; */ \
+	BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (lo), 0, 2), \
+	BPF_STMT(BPF_LD+BPF_MEM, 1), \
+	jt, \
+	BPF_STMT(BPF_LD+BPF_MEM, 1)
+
+#define JLE64(lo, hi, jt) \
+	/* if (hi < arg.hi) goto MATCH; */ \
+	BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (hi), 0, 4), \
+	/* if (hi != arg.hi) goto NOMATCH; */ \
+	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+	BPF_STMT(BPF_LD+BPF_MEM, 0), \
+	/* if (lo <= arg.lo) goto MATCH; */ \
+	BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (lo), 2, 0), \
+	BPF_STMT(BPF_LD+BPF_MEM, 1), \
+	jt, \
+	BPF_STMT(BPF_LD+BPF_MEM, 1)
+
+#define JLT64(lo, hi, jt) \
+	/* if (hi < arg.hi) goto MATCH; */ \
+	BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (hi), 0, 4), \
+	/* if (hi != arg.hi) goto NOMATCH; */ \
+	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+	BPF_STMT(BPF_LD+BPF_MEM, 0), \
+	/* if (lo < arg.lo) goto MATCH; */ \
+	BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (lo), 2, 0), \
+	BPF_STMT(BPF_LD+BPF_MEM, 1), \
+	jt, \
+	BPF_STMT(BPF_LD+BPF_MEM, 1)
+
+#define LOAD_SYSCALL_NR \
+	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
+		 offsetof(struct seccomp_data, nr))
+
+#endif  /* __BPF_HELPER_H__ */
diff --git a/samples/seccomp/dropper.c b/samples/seccomp/dropper.c
new file mode 100644
index 000000000..cc0648eb3
--- /dev/null
+++ b/samples/seccomp/dropper.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Naive system call dropper built on seccomp_filter.
+ *
+ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * Author: Will Drewry <wad@chromium.org>
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications using prctl(PR_SET_SECCOMP, 2, ...).
+ *
+ * When run, returns the specified errno for the specified
+ * system call number against the given architecture.
+ *
+ */
+
+#include <errno.h>
+#include <linux/audit.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <linux/unistd.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <sys/prctl.h>
+#include <unistd.h>
+
+static int install_filter(int nr, int arch, int error)
+{
+	struct sock_filter filter[] = {
+		BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
+			 (offsetof(struct seccomp_data, arch))),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, arch, 0, 3),
+		BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
+			 (offsetof(struct seccomp_data, nr))),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1),
+		BPF_STMT(BPF_RET+BPF_K,
+			 SECCOMP_RET_ERRNO|(error & SECCOMP_RET_DATA)),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+	};
+	struct sock_fprog prog = {
+		.len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
+		.filter = filter,
+	};
+	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+		perror("prctl(NO_NEW_PRIVS)");
+		return 1;
+	}
+	if (prctl(PR_SET_SECCOMP, 2, &prog)) {
+		perror("prctl(PR_SET_SECCOMP)");
+		return 1;
+	}
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	if (argc < 5) {
+		fprintf(stderr, "Usage:\n"
+			"dropper <syscall_nr> <arch> <errno> <prog> [<args>]\n"
+			"Hint:	AUDIT_ARCH_I386: 0x%X\n"
+			"	AUDIT_ARCH_X86_64: 0x%X\n"
+			"\n", AUDIT_ARCH_I386, AUDIT_ARCH_X86_64);
+		return 1;
+	}
+	if (install_filter(strtol(argv[1], NULL, 0), strtol(argv[2], NULL, 0),
+			   strtol(argv[3], NULL, 0)))
+		return 1;
+	execv(argv[4], &argv[4]);
+	printf("Failed to execv\n");
+	return 255;
+}
diff --git a/samples/seccomp/user-trap.c b/samples/seccomp/user-trap.c
new file mode 100644
index 000000000..20291ec64
--- /dev/null
+++ b/samples/seccomp/user-trap.c
@@ -0,0 +1,375 @@
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#include <stddef.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/user.h>
+#include <sys/ioctl.h>
+#include <sys/ptrace.h>
+#include <sys/mount.h>
+#include <linux/limits.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+static int seccomp(unsigned int op, unsigned int flags, void *args)
+{
+	errno = 0;
+	return syscall(__NR_seccomp, op, flags, args);
+}
+
+static int send_fd(int sock, int fd)
+{
+	struct msghdr msg = {};
+	struct cmsghdr *cmsg;
+	char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c';
+	struct iovec io = {
+		.iov_base = &c,
+		.iov_len = 1,
+	};
+
+	msg.msg_iov = &io;
+	msg.msg_iovlen = 1;
+	msg.msg_control = buf;
+	msg.msg_controllen = sizeof(buf);
+	cmsg = CMSG_FIRSTHDR(&msg);
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+	*((int *)CMSG_DATA(cmsg)) = fd;
+	msg.msg_controllen = cmsg->cmsg_len;
+
+	if (sendmsg(sock, &msg, 0) < 0) {
+		perror("sendmsg");
+		return -1;
+	}
+
+	return 0;
+}
+
+static int recv_fd(int sock)
+{
+	struct msghdr msg = {};
+	struct cmsghdr *cmsg;
+	char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c';
+	struct iovec io = {
+		.iov_base = &c,
+		.iov_len = 1,
+	};
+
+	msg.msg_iov = &io;
+	msg.msg_iovlen = 1;
+	msg.msg_control = buf;
+	msg.msg_controllen = sizeof(buf);
+
+	if (recvmsg(sock, &msg, 0) < 0) {
+		perror("recvmsg");
+		return -1;
+	}
+
+	cmsg = CMSG_FIRSTHDR(&msg);
+
+	return *((int *)CMSG_DATA(cmsg));
+}
+
+static int user_trap_syscall(int nr, unsigned int flags)
+{
+	struct sock_filter filter[] = {
+		BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
+			offsetof(struct seccomp_data, nr)),
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+	};
+
+	struct sock_fprog prog = {
+		.len = (unsigned short)ARRAY_SIZE(filter),
+		.filter = filter,
+	};
+
+	return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
+}
+
+static int handle_req(struct seccomp_notif *req,
+		      struct seccomp_notif_resp *resp, int listener)
+{
+	char path[PATH_MAX], source[PATH_MAX], target[PATH_MAX];
+	int ret = -1, mem;
+
+	resp->id = req->id;
+	resp->error = -EPERM;
+	resp->val = 0;
+
+	if (req->data.nr != __NR_mount) {
+		fprintf(stderr, "huh? trapped something besides mount? %d\n", req->data.nr);
+		return -1;
+	}
+
+	/* Only allow bind mounts. */
+	if (!(req->data.args[3] & MS_BIND))
+		return 0;
+
+	/*
+	 * Ok, let's read the task's memory to see where they wanted their
+	 * mount to go.
+	 */
+	snprintf(path, sizeof(path), "/proc/%d/mem", req->pid);
+	mem = open(path, O_RDONLY);
+	if (mem < 0) {
+		perror("open mem");
+		return -1;
+	}
+
+	/*
+	 * Now we avoid a TOCTOU: we referred to a pid by its pid, but since
+	 * the pid that made the syscall may have died, we need to confirm that
+	 * the pid is still valid after we open its /proc/pid/mem file. We can
+	 * ask the listener fd this as follows.
+	 *
+	 * Note that this check should occur *after* any task-specific
+	 * resources are opened, to make sure that the task has not died and
+	 * we're not wrongly reading someone else's state in order to make
+	 * decisions.
+	 */
+	if (ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req->id) < 0) {
+		fprintf(stderr, "task died before we could map its memory\n");
+		goto out;
+	}
+
+	/*
+	 * Phew, we've got the right /proc/pid/mem. Now we can read it. Note
+	 * that to avoid another TOCTOU, we should read all of the pointer args
+	 * before we decide to allow the syscall.
+	 */
+	if (lseek(mem, req->data.args[0], SEEK_SET) < 0) {
+		perror("seek");
+		goto out;
+	}
+
+	ret = read(mem, source, sizeof(source));
+	if (ret < 0) {
+		perror("read");
+		goto out;
+	}
+
+	if (lseek(mem, req->data.args[1], SEEK_SET) < 0) {
+		perror("seek");
+		goto out;
+	}
+
+	ret = read(mem, target, sizeof(target));
+	if (ret < 0) {
+		perror("read");
+		goto out;
+	}
+
+	/*
+	 * Our policy is to only allow bind mounts inside /tmp. This isn't very
+	 * interesting, because we could do unprivlieged bind mounts with user
+	 * namespaces already, but you get the idea.
+	 */
+	if (!strncmp(source, "/tmp/", 5) && !strncmp(target, "/tmp/", 5)) {
+		if (mount(source, target, NULL, req->data.args[3], NULL) < 0) {
+			ret = -1;
+			perror("actual mount");
+			goto out;
+		}
+		resp->error = 0;
+	}
+
+	/* Even if we didn't allow it because of policy, generating the
+	 * response was be a success, because we want to tell the worker EPERM.
+	 */
+	ret = 0;
+
+out:
+	close(mem);
+	return ret;
+}
+
+int main(void)
+{
+	int sk_pair[2], ret = 1, status, listener;
+	pid_t worker = 0 , tracer = 0;
+
+	if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair) < 0) {
+		perror("socketpair");
+		return 1;
+	}
+
+	worker = fork();
+	if (worker < 0) {
+		perror("fork");
+		goto close_pair;
+	}
+
+	if (worker == 0) {
+		listener = user_trap_syscall(__NR_mount,
+					     SECCOMP_FILTER_FLAG_NEW_LISTENER);
+		if (listener < 0) {
+			perror("seccomp");
+			exit(1);
+		}
+
+		/*
+		 * Drop privileges. We definitely can't mount as uid 1000.
+		 */
+		if (setuid(1000) < 0) {
+			perror("setuid");
+			exit(1);
+		}
+
+		/*
+		 * Send the listener to the parent; also serves as
+		 * synchronization.
+		 */
+		if (send_fd(sk_pair[1], listener) < 0)
+			exit(1);
+		close(listener);
+
+		if (mkdir("/tmp/foo", 0755) < 0) {
+			perror("mkdir");
+			exit(1);
+		}
+
+		/*
+		 * Try a bad mount just for grins.
+		 */
+		if (mount("/dev/sda", "/tmp/foo", NULL, 0, NULL) != -1) {
+			fprintf(stderr, "huh? mounted /dev/sda?\n");
+			exit(1);
+		}
+
+		if (errno != EPERM) {
+			perror("bad error from mount");
+			exit(1);
+		}
+
+		/*
+		 * Ok, we expect this one to succeed.
+		 */
+		if (mount("/tmp/foo", "/tmp/foo", NULL, MS_BIND, NULL) < 0) {
+			perror("mount");
+			exit(1);
+		}
+
+		exit(0);
+	}
+
+	/*
+	 * Get the listener from the child.
+	 */
+	listener = recv_fd(sk_pair[0]);
+	if (listener < 0)
+		goto out_kill;
+
+	/*
+	 * Fork a task to handle the requests. This isn't strictly necessary,
+	 * but it makes the particular writing of this sample easier, since we
+	 * can just wait ofr the tracee to exit and kill the tracer.
+	 */
+	tracer = fork();
+	if (tracer < 0) {
+		perror("fork");
+		goto out_kill;
+	}
+
+	if (tracer == 0) {
+		struct seccomp_notif *req;
+		struct seccomp_notif_resp *resp;
+		struct seccomp_notif_sizes sizes;
+
+		if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) < 0) {
+			perror("seccomp(GET_NOTIF_SIZES)");
+			goto out_close;
+		}
+
+		req = malloc(sizes.seccomp_notif);
+		if (!req)
+			goto out_close;
+
+		resp = malloc(sizes.seccomp_notif_resp);
+		if (!resp)
+			goto out_req;
+		memset(resp, 0, sizes.seccomp_notif_resp);
+
+		while (1) {
+			memset(req, 0, sizes.seccomp_notif);
+			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
+				perror("ioctl recv");
+				goto out_resp;
+			}
+
+			if (handle_req(req, resp, listener) < 0)
+				goto out_resp;
+
+			/*
+			 * ENOENT here means that the task may have gotten a
+			 * signal and restarted the syscall. It's up to the
+			 * handler to decide what to do in this case, but for
+			 * the sample code, we just ignore it. Probably
+			 * something better should happen, like undoing the
+			 * mount, or keeping track of the args to make sure we
+			 * don't do it again.
+			 */
+			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, resp) < 0 &&
+			    errno != ENOENT) {
+				perror("ioctl send");
+				goto out_resp;
+			}
+		}
+out_resp:
+		free(resp);
+out_req:
+		free(req);
+out_close:
+		close(listener);
+		exit(1);
+	}
+
+	close(listener);
+
+	if (waitpid(worker, &status, 0) != worker) {
+		perror("waitpid");
+		goto out_kill;
+	}
+
+	if (umount2("/tmp/foo", MNT_DETACH) < 0 && errno != EINVAL) {
+		perror("umount2");
+		goto out_kill;
+	}
+
+	if (remove("/tmp/foo") < 0 && errno != ENOENT) {
+		perror("remove");
+		exit(1);
+	}
+
+	if (!WIFEXITED(status) || WEXITSTATUS(status)) {
+		fprintf(stderr, "worker exited nonzero\n");
+		goto out_kill;
+	}
+
+	ret = 0;
+
+out_kill:
+	if (tracer > 0)
+		kill(tracer, SIGKILL);
+	if (worker > 0)
+		kill(worker, SIGKILL);
+
+close_pair:
+	close(sk_pair[0]);
+	close(sk_pair[1]);
+	return ret;
+}