summaryrefslogtreecommitdiffstats
path: root/man/man2/seccomp.2
diff options
context:
space:
mode:
Diffstat (limited to 'man/man2/seccomp.2')
-rw-r--r--man/man2/seccomp.21245
1 files changed, 1245 insertions, 0 deletions
diff --git a/man/man2/seccomp.2 b/man/man2/seccomp.2
new file mode 100644
index 0000000..83f5406
--- /dev/null
+++ b/man/man2/seccomp.2
@@ -0,0 +1,1245 @@
+.\" Copyright (C) 2014 Kees Cook <keescook@chromium.org>
+.\" and Copyright (C) 2012 Will Drewry <wad@chromium.org>
+.\" and Copyright (C) 2008, 2014,2017 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" and Copyright (C) 2017 Tyler Hicks <tyhicks@canonical.com>
+.\" and Copyright (C) 2020 Tycho Andersen <tycho@tycho.ws>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH seccomp 2 2024-05-02 "Linux man-pages (unreleased)"
+.SH NAME
+seccomp \- operate on Secure Computing state of the process
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/seccomp.h>" " /* Definition of " SECCOMP_* " constants */"
+.BR "#include <linux/filter.h>" " /* Definition of " "struct sock_fprog" " */"
+.BR "#include <linux/audit.h>" " /* Definition of " AUDIT_* " constants */"
+.BR "#include <linux/signal.h>" " /* Definition of " SIG* " constants */"
+.BR "#include <sys/ptrace.h>" " /* Definition of " PTRACE_* " constants */"
+.\" Kees Cook noted: Anything that uses SECCOMP_RET_TRACE returns will
+.\" need <sys/ptrace.h>
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.P
+.BI "int syscall(SYS_seccomp, unsigned int " operation ", unsigned int " flags ,
+.BI " void *" args );
+.fi
+.P
+.IR Note :
+glibc provides no wrapper for
+.BR seccomp (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+The
+.BR seccomp ()
+system call operates on the Secure Computing (seccomp) state of the
+calling process.
+.P
+Currently, Linux supports the following
+.I operation
+values:
+.TP
+.B SECCOMP_SET_MODE_STRICT
+The only system calls that the calling thread is permitted to make are
+.BR read (2),
+.BR write (2),
+.BR _exit (2)
+(but not
+.BR exit_group (2)),
+and
+.BR sigreturn (2).
+Other system calls result in the termination of the calling thread,
+or termination of the entire process with the
+.B SIGKILL
+signal when there is only one thread.
+Strict secure computing mode is useful for number-crunching
+applications that may need to execute untrusted byte code, perhaps
+obtained by reading from a pipe or socket.
+.IP
+Note that although the calling thread can no longer call
+.BR sigprocmask (2),
+it can use
+.BR sigreturn (2)
+to block all signals apart from
+.B SIGKILL
+and
+.BR SIGSTOP .
+This means that
+.BR alarm (2)
+(for example) is not sufficient for restricting the process's execution time.
+Instead, to reliably terminate the process,
+.B SIGKILL
+must be used.
+This can be done by using
+.BR timer_create (2)
+with
+.B SIGEV_SIGNAL
+and
+.I sigev_signo
+set to
+.BR SIGKILL ,
+or by using
+.BR setrlimit (2)
+to set the hard limit for
+.BR RLIMIT_CPU .
+.IP
+This operation is available only if the kernel is configured with
+.B CONFIG_SECCOMP
+enabled.
+.IP
+The value of
+.I flags
+must be 0, and
+.I args
+must be NULL.
+.IP
+This operation is functionally identical to the call:
+.IP
+.in +4n
+.EX
+prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
+.EE
+.in
+.TP
+.B SECCOMP_SET_MODE_FILTER
+The system calls allowed are defined by a pointer to a Berkeley Packet
+Filter (BPF) passed via
+.IR args .
+This argument is a pointer to a
+.IR "struct\~sock_fprog" ;
+it can be designed to filter arbitrary system calls and system call
+arguments.
+If the filter is invalid,
+.BR seccomp ()
+fails, returning
+.B EINVAL
+in
+.IR errno .
+.IP
+If
+.BR fork (2)
+or
+.BR clone (2)
+is allowed by the filter, any child processes will be constrained to
+the same system call filters as the parent.
+If
+.BR execve (2)
+is allowed,
+the existing filters will be preserved across a call to
+.BR execve (2).
+.IP
+In order to use the
+.B SECCOMP_SET_MODE_FILTER
+operation, either the calling thread must have the
+.B CAP_SYS_ADMIN
+capability in its user namespace, or the thread must already have the
+.I no_new_privs
+bit set.
+If that bit was not already set by an ancestor of this thread,
+the thread must make the following call:
+.IP
+.in +4n
+.EX
+prctl(PR_SET_NO_NEW_PRIVS, 1);
+.EE
+.in
+.IP
+Otherwise, the
+.B SECCOMP_SET_MODE_FILTER
+operation fails and returns
+.B EACCES
+in
+.IR errno .
+This requirement ensures that an unprivileged process cannot apply
+a malicious filter and then invoke a set-user-ID or
+other privileged program using
+.BR execve (2),
+thus potentially compromising that program.
+(Such a malicious filter might, for example, cause an attempt to use
+.BR setuid (2)
+to set the caller's user IDs to nonzero values to instead
+return 0 without actually making the system call.
+Thus, the program might be tricked into retaining superuser privileges
+in circumstances where it is possible to influence it to do
+dangerous things because it did not actually drop privileges.)
+.IP
+If
+.BR prctl (2)
+or
+.BR seccomp ()
+is allowed by the attached filter, further filters may be added.
+This will increase evaluation time, but allows for further reduction of
+the attack surface during execution of a thread.
+.IP
+The
+.B SECCOMP_SET_MODE_FILTER
+operation is available only if the kernel is configured with
+.B CONFIG_SECCOMP_FILTER
+enabled.
+.IP
+When
+.I flags
+is 0, this operation is functionally identical to the call:
+.IP
+.in +4n
+.EX
+prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, args);
+.EE
+.in
+.IP
+The recognized
+.I flags
+are:
+.RS
+.TP
+.BR SECCOMP_FILTER_FLAG_LOG " (since Linux 4.14)"
+.\" commit e66a39977985b1e69e17c4042cb290768eca9b02
+All filter return actions except
+.B SECCOMP_RET_ALLOW
+should be logged.
+An administrator may override this filter flag by preventing specific
+actions from being logged via the
+.I /proc/sys/kernel/seccomp/actions_logged
+file.
+.TP
+.BR SECCOMP_FILTER_FLAG_NEW_LISTENER " (since Linux 5.0)"
+.\" commit 6a21cc50f0c7f87dae5259f6cfefe024412313f6
+After successfully installing the filter program,
+return a new user-space notification file descriptor.
+(The close-on-exec flag is set for the file descriptor.)
+When the filter returns
+.B SECCOMP_RET_USER_NOTIF
+a notification will be sent to this file descriptor.
+.IP
+At most one seccomp filter using the
+.B SECCOMP_FILTER_FLAG_NEW_LISTENER
+flag can be installed for a thread.
+.IP
+See
+.BR seccomp_unotify (2)
+for further details.
+.TP
+.BR SECCOMP_FILTER_FLAG_SPEC_ALLOW " (since Linux 4.17)"
+.\" commit 00a02d0c502a06d15e07b857f8ff921e3e402675
+Disable Speculative Store Bypass mitigation.
+.TP
+.B SECCOMP_FILTER_FLAG_TSYNC
+When adding a new filter, synchronize all other threads of the calling
+process to the same seccomp filter tree.
+A "filter tree" is the ordered list of filters attached to a thread.
+(Attaching identical filters in separate
+.BR seccomp ()
+calls results in different filters from this perspective.)
+.IP
+If any thread cannot synchronize to the same filter tree,
+the call will not attach the new seccomp filter,
+and will fail, returning the first thread ID found that cannot synchronize.
+Synchronization will fail if another thread in the same process is in
+.B SECCOMP_MODE_STRICT
+or if it has attached new seccomp filters to itself,
+diverging from the calling thread's filter tree.
+.RE
+.TP
+.BR SECCOMP_GET_ACTION_AVAIL " (since Linux 4.14)"
+.\" commit d612b1fd8010d0d67b5287fe146b8b55bcbb8655
+Test to see if an action is supported by the kernel.
+This operation is helpful to confirm that the kernel knows
+of a more recently added filter return action
+since the kernel treats all unknown actions as
+.BR SECCOMP_RET_KILL_PROCESS .
+.IP
+The value of
+.I flags
+must be 0, and
+.I args
+must be a pointer to an unsigned 32-bit filter return action.
+.TP
+.BR SECCOMP_GET_NOTIF_SIZES " (since Linux 5.0)"
+.\" commit 6a21cc50f0c7f87dae5259f6cfefe024412313f6
+Get the sizes of the seccomp user-space notification structures.
+Since these structures may evolve and grow over time,
+this command can be used to determine how
+much memory to allocate for sending and receiving notifications.
+.IP
+The value of
+.I flags
+must be 0, and
+.I args
+must be a pointer to a
+.IR "struct seccomp_notif_sizes" ,
+which has the following form:
+.IP
+.EX
+struct seccomp_notif_sizes
+ __u16 seccomp_notif; /* Size of notification structure */
+ __u16 seccomp_notif_resp; /* Size of response structure */
+ __u16 seccomp_data; /* Size of \[aq]struct seccomp_data\[aq] */
+};
+.EE
+.IP
+See
+.BR seccomp_unotify (2)
+for further details.
+.\"
+.SS Filters
+When adding filters via
+.BR SECCOMP_SET_MODE_FILTER ,
+.I args
+points to a filter program:
+.P
+.in +4n
+.EX
+struct sock_fprog {
+ unsigned short len; /* Number of BPF instructions */
+ struct sock_filter *filter; /* Pointer to array of
+ BPF instructions */
+};
+.EE
+.in
+.P
+Each program must contain one or more BPF instructions:
+.P
+.in +4n
+.EX
+struct sock_filter { /* Filter block */
+ __u16 code; /* Actual filter code */
+ __u8 jt; /* Jump true */
+ __u8 jf; /* Jump false */
+ __u32 k; /* Generic multiuse field */
+};
+.EE
+.in
+.P
+When executing the instructions, the BPF program operates on the
+system call information made available (i.e., use the
+.B BPF_ABS
+addressing mode) as a (read-only)
+.\" Quoting Kees Cook:
+.\" If BPF even allows changing the data, it's not copied back to
+.\" the syscall when it runs. Anything wanting to do things like
+.\" that would need to use ptrace to catch the call and directly
+.\" modify the registers before continuing with the call.
+buffer of the following form:
+.P
+.in +4n
+.EX
+struct seccomp_data {
+ int nr; /* System call number */
+ __u32 arch; /* AUDIT_ARCH_* value
+ (see <linux/audit.h>) */
+ __u64 instruction_pointer; /* CPU instruction pointer */
+ __u64 args[6]; /* Up to 6 system call arguments */
+};
+.EE
+.in
+.P
+Because numbering of system calls varies between architectures and
+some architectures (e.g., x86-64) allow user-space code to use
+the calling conventions of multiple architectures
+(and the convention being used may vary over the life of a process that uses
+.BR execve (2)
+to execute binaries that employ the different conventions),
+it is usually necessary to verify the value of the
+.I arch
+field.
+.P
+It is strongly recommended to use an allow-list approach whenever
+possible because such an approach is more robust and simple.
+A deny-list will have to be updated whenever a potentially
+dangerous system call is added (or a dangerous flag or option if those
+are deny-listed), and it is often possible to alter the
+representation of a value without altering its meaning, leading to
+a deny-list bypass.
+See also
+.I Caveats
+below.
+.P
+The
+.I arch
+field is not unique for all calling conventions.
+The x86-64 ABI and the x32 ABI both use
+.B AUDIT_ARCH_X86_64
+as
+.IR arch ,
+and they run on the same processors.
+Instead, the mask
+.B __X32_SYSCALL_BIT
+is used on the system call number to tell the two ABIs apart.
+.\" As noted by Dave Drysdale in a note at the end of
+.\" https://lwn.net/Articles/604515/
+.\" One additional detail to point out for the x32 ABI case:
+.\" the syscall number gets a high bit set (__X32_SYSCALL_BIT),
+.\" to mark it as an x32 call.
+.\"
+.\" If x32 support is included in the kernel, then __SYSCALL_MASK
+.\" will have a value that is not all-ones, and this will trigger
+.\" an extra instruction in system_call to mask off the extra bit,
+.\" so that the syscall table indexing still works.
+.P
+This means that a policy must either deny all syscalls with
+.B __X32_SYSCALL_BIT
+or it must recognize syscalls with and without
+.B __X32_SYSCALL_BIT
+set.
+A list of system calls to be denied based on
+.I nr
+that does not also contain
+.I nr
+values with
+.B __X32_SYSCALL_BIT
+set can be bypassed by a malicious program that sets
+.BR __X32_SYSCALL_BIT .
+.P
+Additionally, kernels prior to Linux 5.4 incorrectly permitted
+.I nr
+in the ranges 512-547 as well as the corresponding non-x32 syscalls ORed
+with
+.BR __X32_SYSCALL_BIT .
+For example,
+.I nr
+== 521 and
+.I nr
+== (101 |
+.BR __X32_SYSCALL_BIT )
+would result in invocations of
+.BR ptrace (2)
+with potentially confused x32-vs-x86_64 semantics in the kernel.
+Policies intended to work on kernels before Linux 5.4 must ensure that they
+deny or otherwise correctly handle these system calls.
+On Linux 5.4 and newer,
+.\" commit 6365b842aae4490ebfafadfc6bb27a6d3cc54757
+such system calls will fail with the error
+.BR ENOSYS ,
+without doing anything.
+.P
+The
+.I instruction_pointer
+field provides the address of the machine-language instruction that
+performed the system call.
+This might be useful in conjunction with the use of
+.IR /proc/ pid /maps
+to perform checks based on which region (mapping) of the program
+made the system call.
+(Probably, it is wise to lock down the
+.BR mmap (2)
+and
+.BR mprotect (2)
+system calls to prevent the program from subverting such checks.)
+.P
+When checking values from
+.IR args ,
+keep in mind that arguments are often
+silently truncated before being processed, but after the seccomp check.
+For example, this happens if the i386 ABI is used on an
+x86-64 kernel: although the kernel will normally not look beyond
+the 32 lowest bits of the arguments, the values of the full
+64-bit registers will be present in the seccomp data.
+A less surprising example is that if the x86-64 ABI is used to perform
+a system call that takes an argument of type
+.IR int ,
+the more-significant half of the argument register is ignored by
+the system call, but visible in the seccomp data.
+.P
+A seccomp filter returns a 32-bit value consisting of two parts:
+the most significant 16 bits
+(corresponding to the mask defined by the constant
+.BR SECCOMP_RET_ACTION_FULL )
+contain one of the "action" values listed below;
+the least significant 16-bits (defined by the constant
+.BR SECCOMP_RET_DATA )
+are "data" to be associated with this return value.
+.P
+If multiple filters exist, they are \fIall\fP executed,
+in reverse order of their addition to the filter tree\[em]that is,
+the most recently installed filter is executed first.
+(Note that all filters will be called
+even if one of the earlier filters returns
+.BR SECCOMP_RET_KILL .
+This is done to simplify the kernel code and to provide a
+tiny speed-up in the execution of sets of filters by
+avoiding a check for this uncommon case.)
+.\" From an Aug 2015 conversation with Kees Cook where I asked why *all*
+.\" filters are applied even if one of the early filters returns
+.\" SECCOMP_RET_KILL:
+.\"
+.\" It's just because it would be an optimization that would only speed up
+.\" the RET_KILL case, but it's the uncommon one and the one that doesn't
+.\" benefit meaningfully from such a change (you need to kill the process
+.\" really quickly?). We would speed up killing a program at the (albeit
+.\" tiny) expense to all other filtered programs. Best to keep the filter
+.\" execution logic clear, simple, and as fast as possible for all
+.\" filters.
+The return value for the evaluation of a given system call is the first-seen
+action value of highest precedence (along with its accompanying data)
+returned by execution of all of the filters.
+.P
+In decreasing order of precedence,
+the action values that may be returned by a seccomp filter are:
+.TP
+.BR SECCOMP_RET_KILL_PROCESS " (since Linux 4.14)"
+.\" commit 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945
+.\" commit 0466bdb99e8744bc9befa8d62a317f0fd7fd7421
+This value results in immediate termination of the process,
+with a core dump.
+The system call is not executed.
+By contrast with
+.B SECCOMP_RET_KILL_THREAD
+below, all threads in the thread group are terminated.
+(For a discussion of thread groups, see the description of the
+.B CLONE_THREAD
+flag in
+.BR clone (2).)
+.IP
+The process terminates
+.I "as though"
+killed by a
+.B SIGSYS
+signal.
+Even if a signal handler has been registered for
+.BR SIGSYS ,
+the handler will be ignored in this case and the process always terminates.
+To a parent process that is waiting on this process (using
+.BR waitpid (2)
+or similar), the returned
+.I wstatus
+will indicate that its child was terminated as though by a
+.B SIGSYS
+signal.
+.TP
+.BR SECCOMP_RET_KILL_THREAD " (or " SECCOMP_RET_KILL )
+This value results in immediate termination of the thread
+that made the system call.
+The system call is not executed.
+Other threads in the same thread group will continue to execute.
+.IP
+The thread terminates
+.I "as though"
+killed by a
+.B SIGSYS
+signal.
+See
+.B SECCOMP_RET_KILL_PROCESS
+above.
+.IP
+.\" See these commits:
+.\" seccomp: dump core when using SECCOMP_RET_KILL
+.\" (b25e67161c295c98acda92123b2dd1e7d8642901)
+.\" seccomp: Only dump core when single-threaded
+.\" (d7276e321ff8a53106a59c85ca46d03e34288893)
+Before Linux 4.11,
+any process terminated in this way would not trigger a coredump
+(even though
+.B SIGSYS
+is documented in
+.BR signal (7)
+as having a default action of termination with a core dump).
+Since Linux 4.11,
+a single-threaded process will dump core if terminated in this way.
+.IP
+With the addition of
+.B SECCOMP_RET_KILL_PROCESS
+in Linux 4.14,
+.B SECCOMP_RET_KILL_THREAD
+was added as a synonym for
+.BR SECCOMP_RET_KILL ,
+in order to more clearly distinguish the two actions.
+.IP
+.BR Note :
+the use of
+.B SECCOMP_RET_KILL_THREAD
+to kill a single thread in a multithreaded process is likely to leave the
+process in a permanently inconsistent and possibly corrupt state.
+.TP
+.B SECCOMP_RET_TRAP
+This value results in the kernel sending a thread-directed
+.B SIGSYS
+signal to the triggering thread.
+(The system call is not executed.)
+Various fields will be set in the
+.I siginfo_t
+structure (see
+.BR sigaction (2))
+associated with signal:
+.RS
+.IP \[bu] 3
+.I si_signo
+will contain
+.BR SIGSYS .
+.IP \[bu]
+.I si_call_addr
+will show the address of the system call instruction.
+.IP \[bu]
+.I si_syscall
+and
+.I si_arch
+will indicate which system call was attempted.
+.IP \[bu]
+.I si_code
+will contain
+.BR SYS_SECCOMP .
+.IP \[bu]
+.I si_errno
+will contain the
+.B SECCOMP_RET_DATA
+portion of the filter return value.
+.RE
+.IP
+The program counter will be as though the system call happened
+(i.e., the program counter will not point to the system call instruction).
+The return value register will contain an architecture\-dependent value;
+if resuming execution, set it to something appropriate for the system call.
+(The architecture dependency is because replacing it with
+.B ENOSYS
+could overwrite some useful information.)
+.TP
+.B SECCOMP_RET_ERRNO
+This value results in the
+.B SECCOMP_RET_DATA
+portion of the filter's return value being passed to user space as the
+.I errno
+value without executing the system call.
+.TP
+.BR SECCOMP_RET_USER_NOTIF " (since Linux 5.0)"
+.\" commit 6a21cc50f0c7f87dae5259f6cfefe024412313f6
+Forward the system call to an attached user-space supervisor
+process to allow that process to decide what to do with the system call.
+If there is no attached supervisor (either
+because the filter was not installed with the
+.B SECCOMP_FILTER_FLAG_NEW_LISTENER
+flag or because the file descriptor was closed), the filter returns
+.B ENOSYS
+(similar to what happens when a filter returns
+.B SECCOMP_RET_TRACE
+and there is no tracer).
+See
+.BR seccomp_unotify (2)
+for further details.
+.IP
+Note that the supervisor process will not be notified
+if another filter returns an action value with a precedence greater than
+.BR SECCOMP_RET_USER_NOTIF .
+.TP
+.B SECCOMP_RET_TRACE
+When returned, this value will cause the kernel to attempt to notify a
+.BR ptrace (2)-based
+tracer prior to executing the system call.
+If there is no tracer present,
+the system call is not executed and returns a failure status with
+.I errno
+set to
+.BR ENOSYS .
+.IP
+A tracer will be notified if it requests
+.B PTRACE_O_TRACESECCOMP
+using
+.IR ptrace(PTRACE_SETOPTIONS) .
+The tracer will be notified of a
+.B PTRACE_EVENT_SECCOMP
+and the
+.B SECCOMP_RET_DATA
+portion of the filter's return value will be available to the tracer via
+.BR PTRACE_GETEVENTMSG .
+.IP
+The tracer can skip the system call by changing the system call number
+to \-1.
+Alternatively, the tracer can change the system call
+requested by changing the system call to a valid system call number.
+If the tracer asks to skip the system call, then the system call will
+appear to return the value that the tracer puts in the return value register.
+.IP
+.\" This was changed in ce6526e8afa4.
+.\" A related hole, using PTRACE_SYSCALL instead of SECCOMP_RET_TRACE, was
+.\" changed in arch-specific commits, e.g. 93e35efb8de4 for X86 and
+.\" 0f3912fd934c for ARM.
+Before Linux 4.8, the seccomp check will not be run again after the tracer is
+notified.
+(This means that, on older kernels, seccomp-based sandboxes
+.B "must not"
+allow use of
+.BR ptrace (2)\[em]even
+of other
+sandboxed processes\[em]without extreme care;
+ptracers can use this mechanism to escape from the seccomp sandbox.)
+.IP
+Note that a tracer process will not be notified
+if another filter returns an action value with a precedence greater than
+.BR SECCOMP_RET_TRACE .
+.TP
+.BR SECCOMP_RET_LOG " (since Linux 4.14)"
+.\" commit 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4
+This value results in the system call being executed after
+the filter return action is logged.
+An administrator may override the logging of this action via
+the
+.I /proc/sys/kernel/seccomp/actions_logged
+file.
+.TP
+.B SECCOMP_RET_ALLOW
+This value results in the system call being executed.
+.P
+If an action value other than one of the above is specified,
+then the filter action is treated as either
+.B SECCOMP_RET_KILL_PROCESS
+(since Linux 4.14)
+.\" commit 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945
+or
+.B SECCOMP_RET_KILL_THREAD
+(in Linux 4.13 and earlier).
+.\"
+.SS /proc interfaces
+The files in the directory
+.I /proc/sys/kernel/seccomp
+provide additional seccomp information and configuration:
+.TP
+.IR actions_avail " (since Linux 4.14)"
+.\" commit 8e5f1ad116df6b0de65eac458d5e7c318d1c05af
+A read-only ordered list of seccomp filter return actions in string form.
+The ordering, from left-to-right, is in decreasing order of precedence.
+The list represents the set of seccomp filter return actions
+supported by the kernel.
+.TP
+.IR actions_logged " (since Linux 4.14)"
+.\" commit 0ddec0fc8900201c0897b87b762b7c420436662f
+A read-write ordered list of seccomp filter return actions that
+are allowed to be logged.
+Writes to the file do not need to be in ordered form but reads from
+the file will be ordered in the same way as the
+.I actions_avail
+file.
+.IP
+It is important to note that the value of
+.I actions_logged
+does not prevent certain filter return actions from being logged when
+the audit subsystem is configured to audit a task.
+If the action is not found in the
+.I actions_logged
+file, the final decision on whether to audit the action for that task is
+ultimately left up to the audit subsystem to decide for all filter return
+actions other than
+.BR SECCOMP_RET_ALLOW .
+.IP
+The "allow" string is not accepted in the
+.I actions_logged
+file as it is not possible to log
+.B SECCOMP_RET_ALLOW
+actions.
+Attempting to write "allow" to the file will fail with the error
+.BR EINVAL .
+.\"
+.SS Audit logging of seccomp actions
+.\" commit 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4
+Since Linux 4.14, the kernel provides the facility to log the
+actions returned by seccomp filters in the audit log.
+The kernel makes the decision to log an action based on
+the action type, whether or not the action is present in the
+.I actions_logged
+file, and whether kernel auditing is enabled
+(e.g., via the kernel boot option
+.IR audit=1 ).
+.\" or auditing could be enabled via the netlink API (AUDIT_SET)
+The rules are as follows:
+.IP \[bu] 3
+If the action is
+.BR SECCOMP_RET_ALLOW ,
+the action is not logged.
+.IP \[bu]
+Otherwise, if the action is either
+.B SECCOMP_RET_KILL_PROCESS
+or
+.BR SECCOMP_RET_KILL_THREAD ,
+and that action appears in the
+.I actions_logged
+file, the action is logged.
+.IP \[bu]
+Otherwise, if the filter has requested logging (the
+.B SECCOMP_FILTER_FLAG_LOG
+flag)
+and the action appears in the
+.I actions_logged
+file, the action is logged.
+.IP \[bu]
+Otherwise, if kernel auditing is enabled and the process is being audited
+.RB ( autrace (8)),
+the action is logged.
+.IP \[bu]
+Otherwise, the action is not logged.
+.SH RETURN VALUE
+On success,
+.BR seccomp ()
+returns 0.
+On error, if
+.B SECCOMP_FILTER_FLAG_TSYNC
+was used,
+the return value is the ID of the thread
+that caused the synchronization failure.
+(This ID is a kernel thread ID of the type returned by
+.BR clone (2)
+and
+.BR gettid (2).)
+On other errors, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.BR seccomp ()
+can fail for the following reasons:
+.TP
+.B EACCES
+The caller did not have the
+.B CAP_SYS_ADMIN
+capability in its user namespace, or had not set
+.I no_new_privs
+before using
+.BR SECCOMP_SET_MODE_FILTER .
+.TP
+.B EBUSY
+While installing a new filter, the
+.B SECCOMP_FILTER_FLAG_NEW_LISTENER
+flag was specified,
+but a previous filter had already been installed with that flag.
+.TP
+.B EFAULT
+.I args
+was not a valid address.
+.TP
+.B EINVAL
+.I operation
+is unknown or is not supported by this kernel version or configuration.
+.TP
+.B EINVAL
+The specified
+.I flags
+are invalid for the given
+.IR operation .
+.TP
+.B EINVAL
+.I operation
+included
+.BR BPF_ABS ,
+but the specified offset was not aligned to a 32-bit boundary or exceeded
+.IR "sizeof(struct\~seccomp_data)" .
+.TP
+.B EINVAL
+.\" See kernel/seccomp.c::seccomp_may_assign_mode() in Linux 3.18 sources
+A secure computing mode has already been set, and
+.I operation
+differs from the existing setting.
+.TP
+.B EINVAL
+.I operation
+specified
+.BR SECCOMP_SET_MODE_FILTER ,
+but the filter program pointed to by
+.I args
+was not valid or the length of the filter program was zero or exceeded
+.B BPF_MAXINSNS
+(4096) instructions.
+.TP
+.B ENOMEM
+Out of memory.
+.TP
+.B ENOMEM
+.\" ENOMEM in kernel/seccomp.c::seccomp_attach_filter() in Linux 3.18 sources
+The total length of all filter programs attached
+to the calling thread would exceed
+.B MAX_INSNS_PER_PATH
+(32768) instructions.
+Note that for the purposes of calculating this limit,
+each already existing filter program incurs an
+overhead penalty of 4 instructions.
+.TP
+.B EOPNOTSUPP
+.I operation
+specified
+.BR SECCOMP_GET_ACTION_AVAIL ,
+but the kernel does not support the filter return action specified by
+.IR args .
+.TP
+.B ESRCH
+Another thread caused a failure during thread sync, but its ID could not
+be determined.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 3.17.
+.\" FIXME . Add glibc version
+.SH NOTES
+Rather than hand-coding seccomp filters as shown in the example below,
+you may prefer to employ the
+.I libseccomp
+library, which provides a front-end for generating seccomp filters.
+.P
+The
+.I Seccomp
+field of the
+.IR /proc/ pid /status
+file provides a method of viewing the seccomp mode of a process; see
+.BR proc (5).
+.P
+.BR seccomp ()
+provides a superset of the functionality provided by the
+.BR prctl (2)
+.B PR_SET_SECCOMP
+operation (which does not support
+.IR flags ).
+.P
+Since Linux 4.4, the
+.BR ptrace (2)
+.B PTRACE_SECCOMP_GET_FILTER
+operation can be used to dump a process's seccomp filters.
+.\"
+.SS Architecture support for seccomp BPF
+Architecture support for seccomp BPF filtering
+.\" Check by grepping for HAVE_ARCH_SECCOMP_FILTER in Kconfig files in
+.\" kernel source. Last checked in Linux 4.16-rc source.
+is available on the following architectures:
+.IP \[bu] 3
+x86-64, i386, x32 (since Linux 3.5)
+.PD 0
+.IP \[bu]
+ARM (since Linux 3.8)
+.IP \[bu]
+s390 (since Linux 3.8)
+.IP \[bu]
+MIPS (since Linux 3.16)
+.IP \[bu]
+ARM-64 (since Linux 3.19)
+.IP \[bu]
+PowerPC (since Linux 4.3)
+.IP \[bu]
+Tile (since Linux 4.3)
+.IP \[bu]
+PA-RISC (since Linux 4.6)
+.\" User mode Linux since Linux 4.6
+.PD
+.\"
+.SS Caveats
+There are various subtleties to consider when applying seccomp filters
+to a program, including the following:
+.IP \[bu] 3
+Some traditional system calls have user-space implementations in the
+.BR vdso (7)
+on many architectures.
+Notable examples include
+.BR clock_gettime (2),
+.BR gettimeofday (2),
+and
+.BR time (2).
+On such architectures,
+seccomp filtering for these system calls will have no effect.
+(However, there are cases where the
+.BR vdso (7)
+implementations may fall back to invoking the true system call,
+in which case seccomp filters would see the system call.)
+.IP \[bu]
+Seccomp filtering is based on system call numbers.
+However, applications typically do not directly invoke system calls,
+but instead call wrapper functions in the C library which
+in turn invoke the system calls.
+Consequently, one must be aware of the following:
+.RS
+.IP \[bu] 3
+The glibc wrappers for some traditional system calls may actually
+employ system calls with different names in the kernel.
+For example, the
+.BR exit (2)
+wrapper function actually employs the
+.BR exit_group (2)
+system call, and the
+.BR fork (2)
+wrapper function actually calls
+.BR clone (2).
+.IP \[bu]
+The behavior of wrapper functions may vary across architectures,
+according to the range of system calls provided on those architectures.
+In other words, the same wrapper function may invoke
+different system calls on different architectures.
+.IP \[bu]
+Finally, the behavior of wrapper functions can change across glibc versions.
+For example, in older versions, the glibc wrapper function for
+.BR open (2)
+invoked the system call of the same name,
+but starting in glibc 2.26, the implementation switched to calling
+.BR openat (2)
+on all architectures.
+.RE
+.P
+The consequence of the above points is that it may be necessary
+to filter for a system call other than might be expected.
+Various manual pages in Section 2 provide helpful details
+about the differences between wrapper functions and
+the underlying system calls in subsections entitled
+.IR "C library/kernel differences" .
+.P
+Furthermore, note that the application of seccomp filters
+even risks causing bugs in an application,
+when the filters cause unexpected failures for legitimate operations
+that the application might need to perform.
+Such bugs may not easily be discovered when testing the seccomp
+filters if the bugs occur in rarely used application code paths.
+.\"
+.SS Seccomp-specific BPF details
+Note the following BPF details specific to seccomp filters:
+.IP \[bu] 3
+The
+.B BPF_H
+and
+.B BPF_B
+size modifiers are not supported: all operations must load and store
+(4-byte) words
+.RB ( BPF_W ).
+.IP \[bu]
+To access the contents of the
+.I seccomp_data
+buffer, use the
+.B BPF_ABS
+addressing mode modifier.
+.IP \[bu]
+The
+.B BPF_LEN
+addressing mode modifier yields an immediate mode operand
+whose value is the size of the
+.I seccomp_data
+buffer.
+.SH EXAMPLES
+The program below accepts four or more arguments.
+The first three arguments are a system call number,
+a numeric architecture identifier, and an error number.
+The program uses these values to construct a BPF filter
+that is used at run time to perform the following checks:
+.IP \[bu] 3
+If the program is not running on the specified architecture,
+the BPF filter causes system calls to fail with the error
+.BR ENOSYS .
+.IP \[bu]
+If the program attempts to execute the system call with the specified number,
+the BPF filter causes the system call to fail, with
+.I errno
+being set to the specified error number.
+.P
+The remaining command-line arguments specify
+the pathname and additional arguments of a program
+that the example program should attempt to execute using
+.BR execv (3)
+(a library function that employs the
+.BR execve (2)
+system call).
+Some example runs of the program are shown below.
+.P
+First, we display the architecture that we are running on (x86-64)
+and then construct a shell function that looks up system call
+numbers on this architecture:
+.P
+.in +4n
+.EX
+$ \fBuname \-m\fP
+x86_64
+$ \fBsyscall_nr() {
+ cat /usr/src/linux/arch/x86/syscalls/syscall_64.tbl | \e
+ awk \[aq]$2 != "x32" && $3 == "\[aq]$1\[aq]" { print $1 }\[aq]
+}\fP
+.EE
+.in
+.P
+When the BPF filter rejects a system call (case [2] above),
+it causes the system call to fail with the error number
+specified on the command line.
+In the experiments shown here, we'll use error number 99:
+.P
+.in +4n
+.EX
+$ \fBerrno 99\fP
+EADDRNOTAVAIL 99 Cannot assign requested address
+.EE
+.in
+.P
+In the following example, we attempt to run the command
+.BR whoami (1),
+but the BPF filter rejects the
+.BR execve (2)
+system call, so that the command is not even executed:
+.P
+.in +4n
+.EX
+$ \fBsyscall_nr execve\fP
+59
+$ \fB./a.out\fP
+Usage: ./a.out <syscall_nr> <arch> <errno> <prog> [<args>]
+Hint for <arch>: AUDIT_ARCH_I386: 0x40000003
+ AUDIT_ARCH_X86_64: 0xC000003E
+$ \fB./a.out 59 0xC000003E 99 /bin/whoami\fP
+execv: Cannot assign requested address
+.EE
+.in
+.P
+In the next example, the BPF filter rejects the
+.BR write (2)
+system call, so that, although it is successfully started, the
+.BR whoami (1)
+command is not able to write output:
+.P
+.in +4n
+.EX
+$ \fBsyscall_nr write\fP
+1
+$ \fB./a.out 1 0xC000003E 99 /bin/whoami\fP
+.EE
+.in
+.P
+In the final example,
+the BPF filter rejects a system call that is not used by the
+.BR whoami (1)
+command, so it is able to successfully execute and produce output:
+.P
+.in +4n
+.EX
+$ \fBsyscall_nr preadv\fP
+295
+$ \fB./a.out 295 0xC000003E 99 /bin/whoami\fP
+cecilia
+.EE
+.in
+.SS Program source
+.\" SRC BEGIN (seccomp.c)
+.EX
+#include <linux/audit.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/prctl.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+\&
+#define X32_SYSCALL_BIT 0x40000000
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+\&
+static int
+install_filter(int syscall_nr, unsigned int t_arch, int f_errno)
+{
+ unsigned int upper_nr_limit = 0xffffffff;
+\&
+ /* Assume that AUDIT_ARCH_X86_64 means the normal x86\-64 ABI
+ (in the x32 ABI, all system calls have bit 30 set in the
+ \[aq]nr\[aq] field, meaning the numbers are >= X32_SYSCALL_BIT). */
+ if (t_arch == AUDIT_ARCH_X86_64)
+ upper_nr_limit = X32_SYSCALL_BIT \- 1;
+\&
+ struct sock_filter filter[] = {
+ /* [0] Load architecture from \[aq]seccomp_data\[aq] buffer into
+ accumulator. */
+ BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+ (offsetof(struct seccomp_data, arch))),
+\&
+ /* [1] Jump forward 5 instructions if architecture does not
+ match \[aq]t_arch\[aq]. */
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, t_arch, 0, 5),
+\&
+ /* [2] Load system call number from \[aq]seccomp_data\[aq] buffer into
+ accumulator. */
+ BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+ (offsetof(struct seccomp_data, nr))),
+\&
+ /* [3] Check ABI \- only needed for x86\-64 in deny\-list use
+ cases. Use BPF_JGT instead of checking against the bit
+ mask to avoid having to reload the syscall number. */
+ BPF_JUMP(BPF_JMP | BPF_JGT | BPF_K, upper_nr_limit, 3, 0),
+\&
+ /* [4] Jump forward 1 instruction if system call number
+ does not match \[aq]syscall_nr\[aq]. */
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, syscall_nr, 0, 1),
+\&
+ /* [5] Matching architecture and system call: don\[aq]t execute
+ the system call, and return \[aq]f_errno\[aq] in \[aq]errno\[aq]. */
+ BPF_STMT(BPF_RET | BPF_K,
+ SECCOMP_RET_ERRNO | (f_errno & SECCOMP_RET_DATA)),
+\&
+ /* [6] Destination of system call number mismatch: allow other
+ system calls. */
+ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
+\&
+ /* [7] Destination of architecture mismatch: kill process. */
+ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
+ };
+\&
+ struct sock_fprog prog = {
+ .len = ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+\&
+ if (syscall(SYS_seccomp, SECCOMP_SET_MODE_FILTER, 0, &prog)) {
+ perror("seccomp");
+ return 1;
+ }
+\&
+ return 0;
+}
+\&
+int
+main(int argc, char *argv[])
+{
+ if (argc < 5) {
+ fprintf(stderr, "Usage: "
+ "%s <syscall_nr> <arch> <errno> <prog> [<args>]\en"
+ "Hint for <arch>: AUDIT_ARCH_I386: 0x%X\en"
+ " AUDIT_ARCH_X86_64: 0x%X\en"
+ "\en", argv[0], AUDIT_ARCH_I386, AUDIT_ARCH_X86_64);
+ exit(EXIT_FAILURE);
+ }
+\&
+ if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ perror("prctl");
+ exit(EXIT_FAILURE);
+ }
+\&
+ if (install_filter(strtol(argv[1], NULL, 0),
+ strtoul(argv[2], NULL, 0),
+ strtol(argv[3], NULL, 0)))
+ exit(EXIT_FAILURE);
+\&
+ execv(argv[4], &argv[4]);
+ perror("execv");
+ exit(EXIT_FAILURE);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR bpfc (1),
+.BR strace (1),
+.BR bpf (2),
+.BR prctl (2),
+.BR ptrace (2),
+.BR seccomp_unotify (2),
+.BR sigaction (2),
+.BR proc (5),
+.BR signal (7),
+.BR socket (7)
+.P
+Various pages from the
+.I libseccomp
+library, including:
+.BR scmp_sys_resolver (1),
+.BR seccomp_export_bpf (3),
+.BR seccomp_init (3),
+.BR seccomp_load (3),
+and
+.BR seccomp_rule_add (3).
+.P
+The kernel source files
+.I Documentation/networking/filter.txt
+and
+.I Documentation/userspace\-api/seccomp_filter.rst
+.\" commit c061f33f35be0ccc80f4b8e0aea5dfd2ed7e01a3
+(or
+.I Documentation/prctl/seccomp_filter.txt
+before Linux 4.13).
+.P
+McCanne, S.\& and Jacobson, V.\& (1992)
+.IR "The BSD Packet Filter: A New Architecture for User-level Packet Capture" ,
+Proceedings of the USENIX Winter 1993 Conference
+.UR http://www.tcpdump.org/papers/bpf\-usenix93.pdf
+.UE