diff options
Diffstat (limited to 'man2/seccomp.2')
-rw-r--r-- | man2/seccomp.2 | 1245 |
1 files changed, 1245 insertions, 0 deletions
diff --git a/man2/seccomp.2 b/man2/seccomp.2 new file mode 100644 index 0000000..6b32eec --- /dev/null +++ b/man2/seccomp.2 @@ -0,0 +1,1245 @@ +.\" Copyright (C) 2014 Kees Cook <keescook@chromium.org> +.\" and Copyright (C) 2012 Will Drewry <wad@chromium.org> +.\" and Copyright (C) 2008, 2014,2017 Michael Kerrisk <mtk.manpages@gmail.com> +.\" and Copyright (C) 2017 Tyler Hicks <tyhicks@canonical.com> +.\" and Copyright (C) 2020 Tycho Andersen <tycho@tycho.ws> +.\" +.\" SPDX-License-Identifier: Linux-man-pages-copyleft +.\" +.TH seccomp 2 2023-05-03 "Linux man-pages 6.05.01" +.SH NAME +seccomp \- operate on Secure Computing state of the process +.SH LIBRARY +Standard C library +.RI ( libc ", " \-lc ) +.SH SYNOPSIS +.nf +.BR "#include <linux/seccomp.h>" " /* Definition of " SECCOMP_* " constants */" +.BR "#include <linux/filter.h>" " /* Definition of " "struct sock_fprog" " */" +.BR "#include <linux/audit.h>" " /* Definition of " AUDIT_* " constants */" +.BR "#include <linux/signal.h>" " /* Definition of " SIG* " constants */" +.BR "#include <sys/ptrace.h>" " /* Definition of " PTRACE_* " constants */" +.\" Kees Cook noted: Anything that uses SECCOMP_RET_TRACE returns will +.\" need <sys/ptrace.h> +.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */" +.B #include <unistd.h> +.PP +.BI "int syscall(SYS_seccomp, unsigned int " operation ", unsigned int " flags , +.BI " void *" args ); +.fi +.PP +.IR Note : +glibc provides no wrapper for +.BR seccomp (), +necessitating the use of +.BR syscall (2). +.SH DESCRIPTION +The +.BR seccomp () +system call operates on the Secure Computing (seccomp) state of the +calling process. +.PP +Currently, Linux supports the following +.I operation +values: +.TP +.B SECCOMP_SET_MODE_STRICT +The only system calls that the calling thread is permitted to make are +.BR read (2), +.BR write (2), +.BR _exit (2) +(but not +.BR exit_group (2)), +and +.BR sigreturn (2). 
+Other system calls result in the termination of the calling thread, +or termination of the entire process with the +.B SIGKILL +signal when there is only one thread. +Strict secure computing mode is useful for number-crunching +applications that may need to execute untrusted byte code, perhaps +obtained by reading from a pipe or socket. +.IP +Note that although the calling thread can no longer call +.BR sigprocmask (2), +it can use +.BR sigreturn (2) +to block all signals apart from +.B SIGKILL +and +.BR SIGSTOP . +This means that +.BR alarm (2) +(for example) is not sufficient for restricting the process's execution time. +Instead, to reliably terminate the process, +.B SIGKILL +must be used. +This can be done by using +.BR timer_create (2) +with +.B SIGEV_SIGNAL +and +.I sigev_signo +set to +.BR SIGKILL , +or by using +.BR setrlimit (2) +to set the hard limit for +.BR RLIMIT_CPU . +.IP +This operation is available only if the kernel is configured with +.B CONFIG_SECCOMP +enabled. +.IP +The value of +.I flags +must be 0, and +.I args +must be NULL. +.IP +This operation is functionally identical to the call: +.IP +.in +4n +.EX +prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT); +.EE +.in +.TP +.B SECCOMP_SET_MODE_FILTER +The system calls allowed are defined by a pointer to a Berkeley Packet +Filter (BPF) passed via +.IR args . +This argument is a pointer to a +.IR "struct\~sock_fprog" ; +it can be designed to filter arbitrary system calls and system call +arguments. +If the filter is invalid, +.BR seccomp () +fails, returning +.B EINVAL +in +.IR errno . +.IP +If +.BR fork (2) +or +.BR clone (2) +is allowed by the filter, any child processes will be constrained to +the same system call filters as the parent. +If +.BR execve (2) +is allowed, +the existing filters will be preserved across a call to +.BR execve (2). 
+.IP +In order to use the +.B SECCOMP_SET_MODE_FILTER +operation, either the calling thread must have the +.B CAP_SYS_ADMIN +capability in its user namespace, or the thread must already have the +.I no_new_privs +bit set. +If that bit was not already set by an ancestor of this thread, +the thread must make the following call: +.IP +.in +4n +.EX +prctl(PR_SET_NO_NEW_PRIVS, 1); +.EE +.in +.IP +Otherwise, the +.B SECCOMP_SET_MODE_FILTER +operation fails and returns +.B EACCES +in +.IR errno . +This requirement ensures that an unprivileged process cannot apply +a malicious filter and then invoke a set-user-ID or +other privileged program using +.BR execve (2), +thus potentially compromising that program. +(Such a malicious filter might, for example, cause an attempt to use +.BR setuid (2) +to set the caller's user IDs to nonzero values to instead +return 0 without actually making the system call. +Thus, the program might be tricked into retaining superuser privileges +in circumstances where it is possible to influence it to do +dangerous things because it did not actually drop privileges.) +.IP +If +.BR prctl (2) +or +.BR seccomp () +is allowed by the attached filter, further filters may be added. +This will increase evaluation time, but allows for further reduction of +the attack surface during execution of a thread. +.IP +The +.B SECCOMP_SET_MODE_FILTER +operation is available only if the kernel is configured with +.B CONFIG_SECCOMP_FILTER +enabled. +.IP +When +.I flags +is 0, this operation is functionally identical to the call: +.IP +.in +4n +.EX +prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, args); +.EE +.in +.IP +The recognized +.I flags +are: +.RS +.TP +.BR SECCOMP_FILTER_FLAG_LOG " (since Linux 4.14)" +.\" commit e66a39977985b1e69e17c4042cb290768eca9b02 +All filter return actions except +.B SECCOMP_RET_ALLOW +should be logged. 
+An administrator may override this filter flag by preventing specific +actions from being logged via the +.I /proc/sys/kernel/seccomp/actions_logged +file. +.TP +.BR SECCOMP_FILTER_FLAG_NEW_LISTENER " (since Linux 5.0)" +.\" commit 6a21cc50f0c7f87dae5259f6cfefe024412313f6 +After successfully installing the filter program, +return a new user-space notification file descriptor. +(The close-on-exec flag is set for the file descriptor.) +When the filter returns +.B SECCOMP_RET_USER_NOTIF +a notification will be sent to this file descriptor. +.IP +At most one seccomp filter using the +.B SECCOMP_FILTER_FLAG_NEW_LISTENER +flag can be installed for a thread. +.IP +See +.BR seccomp_unotify (2) +for further details. +.TP +.BR SECCOMP_FILTER_FLAG_SPEC_ALLOW " (since Linux 4.17)" +.\" commit 00a02d0c502a06d15e07b857f8ff921e3e402675 +Disable Speculative Store Bypass mitigation. +.TP +.B SECCOMP_FILTER_FLAG_TSYNC +When adding a new filter, synchronize all other threads of the calling +process to the same seccomp filter tree. +A "filter tree" is the ordered list of filters attached to a thread. +(Attaching identical filters in separate +.BR seccomp () +calls results in different filters from this perspective.) +.IP +If any thread cannot synchronize to the same filter tree, +the call will not attach the new seccomp filter, +and will fail, returning the first thread ID found that cannot synchronize. +Synchronization will fail if another thread in the same process is in +.B SECCOMP_MODE_STRICT +or if it has attached new seccomp filters to itself, +diverging from the calling thread's filter tree. +.RE +.TP +.BR SECCOMP_GET_ACTION_AVAIL " (since Linux 4.14)" +.\" commit d612b1fd8010d0d67b5287fe146b8b55bcbb8655 +Test to see if an action is supported by the kernel. +This operation is helpful to confirm that the kernel knows +of a more recently added filter return action +since the kernel treats all unknown actions as +.BR SECCOMP_RET_KILL_PROCESS . 
+.IP +The value of +.I flags +must be 0, and +.I args +must be a pointer to an unsigned 32-bit filter return action. +.TP +.BR SECCOMP_GET_NOTIF_SIZES " (since Linux 5.0)" +.\" commit 6a21cc50f0c7f87dae5259f6cfefe024412313f6 +Get the sizes of the seccomp user-space notification structures. +Since these structures may evolve and grow over time, +this command can be used to determine how +much memory to allocate for sending and receiving notifications. +.IP +The value of +.I flags +must be 0, and +.I args +must be a pointer to a +.IR "struct seccomp_notif_sizes" , +which has the following form: +.IP +.EX +struct seccomp_notif_sizes { + __u16 seccomp_notif; /* Size of notification structure */ + __u16 seccomp_notif_resp; /* Size of response structure */ + __u16 seccomp_data; /* Size of \[aq]struct seccomp_data\[aq] */ +}; +.EE +.IP +See +.BR seccomp_unotify (2) +for further details. +.\" +.SS Filters +When adding filters via +.BR SECCOMP_SET_MODE_FILTER , +.I args +points to a filter program: +.PP +.in +4n +.EX +struct sock_fprog { + unsigned short len; /* Number of BPF instructions */ + struct sock_filter *filter; /* Pointer to array of + BPF instructions */ +}; +.EE +.in +.PP +Each program must contain one or more BPF instructions: +.PP +.in +4n +.EX +struct sock_filter { /* Filter block */ + __u16 code; /* Actual filter code */ + __u8 jt; /* Jump true */ + __u8 jf; /* Jump false */ + __u32 k; /* Generic multiuse field */ +}; +.EE +.in +.PP +When executing the instructions, the BPF program operates on the +system call information made available (i.e., use the +.B BPF_ABS +addressing mode) as a (read-only) +.\" Quoting Kees Cook: +.\" If BPF even allows changing the data, it's not copied back to +.\" the syscall when it runs. Anything wanting to do things like +.\" that would need to use ptrace to catch the call and directly +.\" modify the registers before continuing with the call. 
+buffer of the following form: +.PP +.in +4n +.EX +struct seccomp_data { + int nr; /* System call number */ + __u32 arch; /* AUDIT_ARCH_* value + (see <linux/audit.h>) */ + __u64 instruction_pointer; /* CPU instruction pointer */ + __u64 args[6]; /* Up to 6 system call arguments */ +}; +.EE +.in +.PP +Because numbering of system calls varies between architectures and +some architectures (e.g., x86-64) allow user-space code to use +the calling conventions of multiple architectures +(and the convention being used may vary over the life of a process that uses +.BR execve (2) +to execute binaries that employ the different conventions), +it is usually necessary to verify the value of the +.I arch +field. +.PP +It is strongly recommended to use an allow-list approach whenever +possible because such an approach is more robust and simple. +A deny-list will have to be updated whenever a potentially +dangerous system call is added (or a dangerous flag or option if those +are deny-listed), and it is often possible to alter the +representation of a value without altering its meaning, leading to +a deny-list bypass. +See also +.I Caveats +below. +.PP +The +.I arch +field is not unique for all calling conventions. +The x86-64 ABI and the x32 ABI both use +.B AUDIT_ARCH_X86_64 +as +.IR arch , +and they run on the same processors. +Instead, the mask +.B __X32_SYSCALL_BIT +is used on the system call number to tell the two ABIs apart. +.\" As noted by Dave Drysdale in a note at the end of +.\" https://lwn.net/Articles/604515/ +.\" One additional detail to point out for the x32 ABI case: +.\" the syscall number gets a high bit set (__X32_SYSCALL_BIT), +.\" to mark it as an x32 call. +.\" +.\" If x32 support is included in the kernel, then __SYSCALL_MASK +.\" will have a value that is not all-ones, and this will trigger +.\" an extra instruction in system_call to mask off the extra bit, +.\" so that the syscall table indexing still works. 
+.PP +This means that a policy must either deny all syscalls with +.B __X32_SYSCALL_BIT +or it must recognize syscalls with and without +.B __X32_SYSCALL_BIT +set. +A list of system calls to be denied based on +.I nr +that does not also contain +.I nr +values with +.B __X32_SYSCALL_BIT +set can be bypassed by a malicious program that sets +.BR __X32_SYSCALL_BIT . +.PP +Additionally, kernels prior to Linux 5.4 incorrectly permitted +.I nr +in the ranges 512-547 as well as the corresponding non-x32 syscalls ORed +with +.BR __X32_SYSCALL_BIT . +For example, +.I nr +== 521 and +.I nr +== (101 | +.BR __X32_SYSCALL_BIT ) +would result in invocations of +.BR ptrace (2) +with potentially confused x32-vs-x86_64 semantics in the kernel. +Policies intended to work on kernels before Linux 5.4 must ensure that they +deny or otherwise correctly handle these system calls. +On Linux 5.4 and newer, +.\" commit 6365b842aae4490ebfafadfc6bb27a6d3cc54757 +such system calls will fail with the error +.BR ENOSYS , +without doing anything. +.PP +The +.I instruction_pointer +field provides the address of the machine-language instruction that +performed the system call. +This might be useful in conjunction with the use of +.IR /proc/ pid /maps +to perform checks based on which region (mapping) of the program +made the system call. +(Probably, it is wise to lock down the +.BR mmap (2) +and +.BR mprotect (2) +system calls to prevent the program from subverting such checks.) +.PP +When checking values from +.IR args , +keep in mind that arguments are often +silently truncated before being processed, but after the seccomp check. +For example, this happens if the i386 ABI is used on an +x86-64 kernel: although the kernel will normally not look beyond +the 32 lowest bits of the arguments, the values of the full +64-bit registers will be present in the seccomp data. 
+A less surprising example is that if the x86-64 ABI is used to perform +a system call that takes an argument of type +.IR int , +the more-significant half of the argument register is ignored by +the system call, but visible in the seccomp data. +.PP +A seccomp filter returns a 32-bit value consisting of two parts: +the most significant 16 bits +(corresponding to the mask defined by the constant +.BR SECCOMP_RET_ACTION_FULL ) +contain one of the "action" values listed below; +the least significant 16 bits (defined by the constant +.BR SECCOMP_RET_DATA ) +are "data" to be associated with this return value. +.PP +If multiple filters exist, they are \fIall\fP executed, +in reverse order of their addition to the filter tree\[em]that is, +the most recently installed filter is executed first. +(Note that all filters will be called +even if one of the earlier filters returns +.BR SECCOMP_RET_KILL . +This is done to simplify the kernel code and to provide a +tiny speed-up in the execution of sets of filters by +avoiding a check for this uncommon case.) +.\" From an Aug 2015 conversation with Kees Cook where I asked why *all* +.\" filters are applied even if one of the early filters returns +.\" SECCOMP_RET_KILL: +.\" +.\" It's just because it would be an optimization that would only speed up +.\" the RET_KILL case, but it's the uncommon one and the one that doesn't +.\" benefit meaningfully from such a change (you need to kill the process +.\" really quickly?). We would speed up killing a program at the (albeit +.\" tiny) expense to all other filtered programs. Best to keep the filter +.\" execution logic clear, simple, and as fast as possible for all +.\" filters. +The return value for the evaluation of a given system call is the first-seen +action value of highest precedence (along with its accompanying data) +returned by execution of all of the filters. 
+.PP +In decreasing order of precedence, +the action values that may be returned by a seccomp filter are: +.TP +.BR SECCOMP_RET_KILL_PROCESS " (since Linux 4.14)" +.\" commit 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945 +.\" commit 0466bdb99e8744bc9befa8d62a317f0fd7fd7421 +This value results in immediate termination of the process, +with a core dump. +The system call is not executed. +By contrast with +.B SECCOMP_RET_KILL_THREAD +below, all threads in the thread group are terminated. +(For a discussion of thread groups, see the description of the +.B CLONE_THREAD +flag in +.BR clone (2).) +.IP +The process terminates +.I "as though" +killed by a +.B SIGSYS +signal. +Even if a signal handler has been registered for +.BR SIGSYS , +the handler will be ignored in this case and the process always terminates. +To a parent process that is waiting on this process (using +.BR waitpid (2) +or similar), the returned +.I wstatus +will indicate that its child was terminated as though by a +.B SIGSYS +signal. +.TP +.BR SECCOMP_RET_KILL_THREAD " (or " SECCOMP_RET_KILL ) +This value results in immediate termination of the thread +that made the system call. +The system call is not executed. +Other threads in the same thread group will continue to execute. +.IP +The thread terminates +.I "as though" +killed by a +.B SIGSYS +signal. +See +.B SECCOMP_RET_KILL_PROCESS +above. +.IP +.\" See these commits: +.\" seccomp: dump core when using SECCOMP_RET_KILL +.\" (b25e67161c295c98acda92123b2dd1e7d8642901) +.\" seccomp: Only dump core when single-threaded +.\" (d7276e321ff8a53106a59c85ca46d03e34288893) +Before Linux 4.11, +any process terminated in this way would not trigger a coredump +(even though +.B SIGSYS +is documented in +.BR signal (7) +as having a default action of termination with a core dump). +Since Linux 4.11, +a single-threaded process will dump core if terminated in this way. 
+.IP +With the addition of +.B SECCOMP_RET_KILL_PROCESS +in Linux 4.14, +.B SECCOMP_RET_KILL_THREAD +was added as a synonym for +.BR SECCOMP_RET_KILL , +in order to more clearly distinguish the two actions. +.IP +.BR Note : +the use of +.B SECCOMP_RET_KILL_THREAD +to kill a single thread in a multithreaded process is likely to leave the +process in a permanently inconsistent and possibly corrupt state. +.TP +.B SECCOMP_RET_TRAP +This value results in the kernel sending a thread-directed +.B SIGSYS +signal to the triggering thread. +(The system call is not executed.) +Various fields will be set in the +.I siginfo_t +structure (see +.BR sigaction (2)) +associated with the signal: +.RS +.IP \[bu] 3 +.I si_signo +will contain +.BR SIGSYS . +.IP \[bu] +.I si_call_addr +will show the address of the system call instruction. +.IP \[bu] +.I si_syscall +and +.I si_arch +will indicate which system call was attempted. +.IP \[bu] +.I si_code +will contain +.BR SYS_SECCOMP . +.IP \[bu] +.I si_errno +will contain the +.B SECCOMP_RET_DATA +portion of the filter return value. +.RE +.IP +The program counter will be as though the system call happened +(i.e., the program counter will not point to the system call instruction). +The return value register will contain an architecture\-dependent value; +if resuming execution, set it to something appropriate for the system call. +(The architecture dependency is because replacing it with +.B ENOSYS +could overwrite some useful information.) +.TP +.B SECCOMP_RET_ERRNO +This value results in the +.B SECCOMP_RET_DATA +portion of the filter's return value being passed to user space as the +.I errno +value without executing the system call. +.TP +.BR SECCOMP_RET_USER_NOTIF " (since Linux 5.0)" +.\" commit 6a21cc50f0c7f87dae5259f6cfefe024412313f6 +Forward the system call to an attached user-space supervisor +process to allow that process to decide what to do with the system call. 
+If there is no attached supervisor (either +because the filter was not installed with the +.B SECCOMP_FILTER_FLAG_NEW_LISTENER +flag or because the file descriptor was closed), the filter returns +.B ENOSYS +(similar to what happens when a filter returns +.B SECCOMP_RET_TRACE +and there is no tracer). +See +.BR seccomp_unotify (2) +for further details. +.IP +Note that the supervisor process will not be notified +if another filter returns an action value with a precedence greater than +.BR SECCOMP_RET_USER_NOTIF . +.TP +.B SECCOMP_RET_TRACE +When returned, this value will cause the kernel to attempt to notify a +.BR ptrace (2)-based +tracer prior to executing the system call. +If there is no tracer present, +the system call is not executed and returns a failure status with +.I errno +set to +.BR ENOSYS . +.IP +A tracer will be notified if it requests +.B PTRACE_O_TRACESECCOMP +using +.IR ptrace(PTRACE_SETOPTIONS) . +The tracer will be notified of a +.B PTRACE_EVENT_SECCOMP +and the +.B SECCOMP_RET_DATA +portion of the filter's return value will be available to the tracer via +.BR PTRACE_GETEVENTMSG . +.IP +The tracer can skip the system call by changing the system call number +to \-1. +Alternatively, the tracer can change the system call +requested by changing the system call to a valid system call number. +If the tracer asks to skip the system call, then the system call will +appear to return the value that the tracer puts in the return value register. +.IP +.\" This was changed in ce6526e8afa4. +.\" A related hole, using PTRACE_SYSCALL instead of SECCOMP_RET_TRACE, was +.\" changed in arch-specific commits, e.g. 93e35efb8de4 for X86 and +.\" 0f3912fd934c for ARM. +Before Linux 4.8, the seccomp check will not be run again after the tracer is +notified. 
+(This means that, on older kernels, seccomp-based sandboxes +.B "must not" +allow use of +.BR ptrace (2)\[em]even +of other +sandboxed processes\[em]without extreme care; +ptracers can use this mechanism to escape from the seccomp sandbox.) +.IP +Note that a tracer process will not be notified +if another filter returns an action value with a precedence greater than +.BR SECCOMP_RET_TRACE . +.TP +.BR SECCOMP_RET_LOG " (since Linux 4.14)" +.\" commit 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4 +This value results in the system call being executed after +the filter return action is logged. +An administrator may override the logging of this action via +the +.I /proc/sys/kernel/seccomp/actions_logged +file. +.TP +.B SECCOMP_RET_ALLOW +This value results in the system call being executed. +.PP +If an action value other than one of the above is specified, +then the filter action is treated as either +.B SECCOMP_RET_KILL_PROCESS +(since Linux 4.14) +.\" commit 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945 +or +.B SECCOMP_RET_KILL_THREAD +(in Linux 4.13 and earlier). +.\" +.SS /proc interfaces +The files in the directory +.I /proc/sys/kernel/seccomp +provide additional seccomp information and configuration: +.TP +.IR actions_avail " (since Linux 4.14)" +.\" commit 8e5f1ad116df6b0de65eac458d5e7c318d1c05af +A read-only ordered list of seccomp filter return actions in string form. +The ordering, from left-to-right, is in decreasing order of precedence. +The list represents the set of seccomp filter return actions +supported by the kernel. +.TP +.IR actions_logged " (since Linux 4.14)" +.\" commit 0ddec0fc8900201c0897b87b762b7c420436662f +A read-write ordered list of seccomp filter return actions that +are allowed to be logged. +Writes to the file do not need to be in ordered form but reads from +the file will be ordered in the same way as the +.I actions_avail +file. 
+.IP +It is important to note that the value of +.I actions_logged +does not prevent certain filter return actions from being logged when +the audit subsystem is configured to audit a task. +If the action is not found in the +.I actions_logged +file, the final decision on whether to audit the action for that task is +ultimately left up to the audit subsystem to decide for all filter return +actions other than +.BR SECCOMP_RET_ALLOW . +.IP +The "allow" string is not accepted in the +.I actions_logged +file as it is not possible to log +.B SECCOMP_RET_ALLOW +actions. +Attempting to write "allow" to the file will fail with the error +.BR EINVAL . +.\" +.SS Audit logging of seccomp actions +.\" commit 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4 +Since Linux 4.14, the kernel provides the facility to log the +actions returned by seccomp filters in the audit log. +The kernel makes the decision to log an action based on +the action type, whether or not the action is present in the +.I actions_logged +file, and whether kernel auditing is enabled +(e.g., via the kernel boot option +.IR audit=1 ). +.\" or auditing could be enabled via the netlink API (AUDIT_SET) +The rules are as follows: +.IP \[bu] 3 +If the action is +.BR SECCOMP_RET_ALLOW , +the action is not logged. +.IP \[bu] +Otherwise, if the action is either +.B SECCOMP_RET_KILL_PROCESS +or +.BR SECCOMP_RET_KILL_THREAD , +and that action appears in the +.I actions_logged +file, the action is logged. +.IP \[bu] +Otherwise, if the filter has requested logging (the +.B SECCOMP_FILTER_FLAG_LOG +flag) +and the action appears in the +.I actions_logged +file, the action is logged. +.IP \[bu] +Otherwise, if kernel auditing is enabled and the process is being audited +.RB ( autrace (8)), +the action is logged. +.IP \[bu] +Otherwise, the action is not logged. +.SH RETURN VALUE +On success, +.BR seccomp () +returns 0. 
+On error, if +.B SECCOMP_FILTER_FLAG_TSYNC +was used, +the return value is the ID of the thread +that caused the synchronization failure. +(This ID is a kernel thread ID of the type returned by +.BR clone (2) +and +.BR gettid (2).) +On other errors, \-1 is returned, and +.I errno +is set to indicate the error. +.SH ERRORS +.BR seccomp () +can fail for the following reasons: +.TP +.B EACCES +The caller did not have the +.B CAP_SYS_ADMIN +capability in its user namespace, or had not set +.I no_new_privs +before using +.BR SECCOMP_SET_MODE_FILTER . +.TP +.B EBUSY +While installing a new filter, the +.B SECCOMP_FILTER_FLAG_NEW_LISTENER +flag was specified, +but a previous filter had already been installed with that flag. +.TP +.B EFAULT +.I args +was not a valid address. +.TP +.B EINVAL +.I operation +is unknown or is not supported by this kernel version or configuration. +.TP +.B EINVAL +The specified +.I flags +are invalid for the given +.IR operation . +.TP +.B EINVAL +.I operation +included +.BR BPF_ABS , +but the specified offset was not aligned to a 32-bit boundary or exceeded +.IR "sizeof(struct\~seccomp_data)" . +.TP +.B EINVAL +.\" See kernel/seccomp.c::seccomp_may_assign_mode() in Linux 3.18 sources +A secure computing mode has already been set, and +.I operation +differs from the existing setting. +.TP +.B EINVAL +.I operation +specified +.BR SECCOMP_SET_MODE_FILTER , +but the filter program pointed to by +.I args +was not valid or the length of the filter program was zero or exceeded +.B BPF_MAXINSNS +(4096) instructions. +.TP +.B ENOMEM +Out of memory. +.TP +.B ENOMEM +.\" ENOMEM in kernel/seccomp.c::seccomp_attach_filter() in Linux 3.18 sources +The total length of all filter programs attached +to the calling thread would exceed +.B MAX_INSNS_PER_PATH +(32768) instructions. +Note that for the purposes of calculating this limit, +each already existing filter program incurs an +overhead penalty of 4 instructions. 
+.TP +.B EOPNOTSUPP +.I operation +specified +.BR SECCOMP_GET_ACTION_AVAIL , +but the kernel does not support the filter return action specified by +.IR args . +.TP +.B ESRCH +Another thread caused a failure during thread sync, but its ID could not +be determined. +.SH STANDARDS +Linux. +.SH HISTORY +Linux 3.17. +.\" FIXME . Add glibc version +.SH NOTES +Rather than hand-coding seccomp filters as shown in the example below, +you may prefer to employ the +.I libseccomp +library, which provides a front-end for generating seccomp filters. +.PP +The +.I Seccomp +field of the +.IR /proc/ pid /status +file provides a method of viewing the seccomp mode of a process; see +.BR proc (5). +.PP +.BR seccomp () +provides a superset of the functionality provided by the +.BR prctl (2) +.B PR_SET_SECCOMP +operation (which does not support +.IR flags ). +.PP +Since Linux 4.4, the +.BR ptrace (2) +.B PTRACE_SECCOMP_GET_FILTER +operation can be used to dump a process's seccomp filters. +.\" +.SS Architecture support for seccomp BPF +Architecture support for seccomp BPF filtering +.\" Check by grepping for HAVE_ARCH_SECCOMP_FILTER in Kconfig files in +.\" kernel source. Last checked in Linux 4.16-rc source. +is available on the following architectures: +.IP \[bu] 3 +x86-64, i386, x32 (since Linux 3.5) +.PD 0 +.IP \[bu] +ARM (since Linux 3.8) +.IP \[bu] +s390 (since Linux 3.8) +.IP \[bu] +MIPS (since Linux 3.16) +.IP \[bu] +ARM-64 (since Linux 3.19) +.IP \[bu] +PowerPC (since Linux 4.3) +.IP \[bu] +Tile (since Linux 4.3) +.IP \[bu] +PA-RISC (since Linux 4.6) +.\" User mode Linux since Linux 4.6 +.PD +.\" +.SS Caveats +There are various subtleties to consider when applying seccomp filters +to a program, including the following: +.IP \[bu] 3 +Some traditional system calls have user-space implementations in the +.BR vdso (7) +on many architectures. +Notable examples include +.BR clock_gettime (2), +.BR gettimeofday (2), +and +.BR time (2). 
+On such architectures, +seccomp filtering for these system calls will have no effect. +(However, there are cases where the +.BR vdso (7) +implementations may fall back to invoking the true system call, +in which case seccomp filters would see the system call.) +.IP \[bu] +Seccomp filtering is based on system call numbers. +However, applications typically do not directly invoke system calls, +but instead call wrapper functions in the C library which +in turn invoke the system calls. +Consequently, one must be aware of the following: +.RS +.IP \[bu] 3 +The glibc wrappers for some traditional system calls may actually +employ system calls with different names in the kernel. +For example, the +.BR exit (2) +wrapper function actually employs the +.BR exit_group (2) +system call, and the +.BR fork (2) +wrapper function actually calls +.BR clone (2). +.IP \[bu] +The behavior of wrapper functions may vary across architectures, +according to the range of system calls provided on those architectures. +In other words, the same wrapper function may invoke +different system calls on different architectures. +.IP \[bu] +Finally, the behavior of wrapper functions can change across glibc versions. +For example, in older versions, the glibc wrapper function for +.BR open (2) +invoked the system call of the same name, +but starting in glibc 2.26, the implementation switched to calling +.BR openat (2) +on all architectures. +.RE +.PP +The consequence of the above points is that it may be necessary +to filter for a system call other than might be expected. +Various manual pages in Section 2 provide helpful details +about the differences between wrapper functions and +the underlying system calls in subsections entitled +.IR "C library/kernel differences" . +.PP +Furthermore, note that the application of seccomp filters +even risks causing bugs in an application, +when the filters cause unexpected failures for legitimate operations +that the application might need to perform. 
+Such bugs may not easily be discovered when testing the seccomp +filters if the bugs occur in rarely used application code paths. +.\" +.SS Seccomp-specific BPF details +Note the following BPF details specific to seccomp filters: +.IP \[bu] 3 +The +.B BPF_H +and +.B BPF_B +size modifiers are not supported: all operations must load and store +(4-byte) words +.RB ( BPF_W ). +.IP \[bu] +To access the contents of the +.I seccomp_data +buffer, use the +.B BPF_ABS +addressing mode modifier. +.IP \[bu] +The +.B BPF_LEN +addressing mode modifier yields an immediate mode operand +whose value is the size of the +.I seccomp_data +buffer. +.SH EXAMPLES +The program below accepts four or more arguments. +The first three arguments are a system call number, +a numeric architecture identifier, and an error number. +The program uses these values to construct a BPF filter +that is used at run time to perform the following checks: +.IP \[bu] 3 +If the program is not running on the specified architecture, +the BPF filter causes system calls to fail with the error +.BR ENOSYS . +.IP \[bu] +If the program attempts to execute the system call with the specified number, +the BPF filter causes the system call to fail, with +.I errno +being set to the specified error number. +.PP +The remaining command-line arguments specify +the pathname and additional arguments of a program +that the example program should attempt to execute using +.BR execv (3) +(a library function that employs the +.BR execve (2) +system call). +Some example runs of the program are shown below. 
+.PP +First, we display the architecture that we are running on (x86-64) +and then construct a shell function that looks up system call +numbers on this architecture: +.PP +.in +4n +.EX +$ \fBuname \-m\fP +x86_64 +$ \fBsyscall_nr() { + cat /usr/src/linux/arch/x86/syscalls/syscall_64.tbl | \e + awk \[aq]$2 != "x32" && $3 == "\[aq]$1\[aq]" { print $1 }\[aq] +}\fP +.EE +.in +.PP +When the BPF filter rejects a system call (the second case above), +it causes the system call to fail with the error number +specified on the command line. +In the experiments shown here, we'll use error number 99: +.PP +.in +4n +.EX +$ \fBerrno 99\fP +EADDRNOTAVAIL 99 Cannot assign requested address +.EE +.in +.PP +In the following example, we attempt to run the command +.BR whoami (1), +but the BPF filter rejects the +.BR execve (2) +system call, so that the command is not even executed: +.PP +.in +4n +.EX +$ \fBsyscall_nr execve\fP +59 +$ \fB./a.out\fP +Usage: ./a.out <syscall_nr> <arch> <errno> <prog> [<args>] +Hint for <arch>: AUDIT_ARCH_I386: 0x40000003 + AUDIT_ARCH_X86_64: 0xC000003E +$ \fB./a.out 59 0xC000003E 99 /bin/whoami\fP +execv: Cannot assign requested address +.EE +.in +.PP +In the next example, the BPF filter rejects the +.BR write (2) +system call, so that, although it is successfully started, the +.BR whoami (1) +command is not able to write output: +.PP +.in +4n +.EX +$ \fBsyscall_nr write\fP +1 +$ \fB./a.out 1 0xC000003E 99 /bin/whoami\fP +.EE +.in +.PP +In the final example, +the BPF filter rejects a system call that is not used by the +.BR whoami (1) +command, so it is able to successfully execute and produce output: +.PP +.in +4n +.EX +$ \fBsyscall_nr preadv\fP +295 +$ \fB./a.out 295 0xC000003E 99 /bin/whoami\fP +cecilia +.EE +.in +.SS Program source +.\" SRC BEGIN (seccomp.c) +.EX +#include <linux/audit.h> +#include <linux/filter.h> +#include <linux/seccomp.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/prctl.h> +#include <sys/syscall.h>
+#include <unistd.h> +\& +#define X32_SYSCALL_BIT 0x40000000 +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) +\& +static int +install_filter(int syscall_nr, unsigned int t_arch, int f_errno) +{ + unsigned int upper_nr_limit = 0xffffffff; +\& + /* Assume that AUDIT_ARCH_X86_64 means the normal x86\-64 ABI + (in the x32 ABI, all system calls have bit 30 set in the + \[aq]nr\[aq] field, meaning the numbers are >= X32_SYSCALL_BIT). */ + if (t_arch == AUDIT_ARCH_X86_64) + upper_nr_limit = X32_SYSCALL_BIT \- 1; +\& + struct sock_filter filter[] = { + /* [0] Load architecture from \[aq]seccomp_data\[aq] buffer into + accumulator. */ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + (offsetof(struct seccomp_data, arch))), +\& + /* [1] Jump forward 5 instructions if architecture does not + match \[aq]t_arch\[aq]. */ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, t_arch, 0, 5), +\& + /* [2] Load system call number from \[aq]seccomp_data\[aq] buffer into + accumulator. */ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + (offsetof(struct seccomp_data, nr))), +\& + /* [3] Check ABI \- only needed for x86\-64 in deny\-list use + cases. Use BPF_JGT instead of checking against the bit + mask to avoid having to reload the syscall number. */ + BPF_JUMP(BPF_JMP | BPF_JGT | BPF_K, upper_nr_limit, 3, 0), +\& + /* [4] Jump forward 1 instruction if system call number + does not match \[aq]syscall_nr\[aq]. */ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, syscall_nr, 0, 1), +\& + /* [5] Matching architecture and system call: don\[aq]t execute + the system call, and return \[aq]f_errno\[aq] in \[aq]errno\[aq]. */ + BPF_STMT(BPF_RET | BPF_K, + SECCOMP_RET_ERRNO | (f_errno & SECCOMP_RET_DATA)), +\& + /* [6] Destination of system call number mismatch: allow other + system calls. */ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), +\& + /* [7] Destination of architecture mismatch: kill process. 
*/ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS), + }; +\& + struct sock_fprog prog = { + .len = ARRAY_SIZE(filter), + .filter = filter, + }; +\& + if (syscall(SYS_seccomp, SECCOMP_SET_MODE_FILTER, 0, &prog)) { + perror("seccomp"); + return 1; + } +\& + return 0; +} +\& +int +main(int argc, char *argv[]) +{ + if (argc < 5) { + fprintf(stderr, "Usage: " + "%s <syscall_nr> <arch> <errno> <prog> [<args>]\en" + "Hint for <arch>: AUDIT_ARCH_I386: 0x%X\en" + " AUDIT_ARCH_X86_64: 0x%X\en" + "\en", argv[0], AUDIT_ARCH_I386, AUDIT_ARCH_X86_64); + exit(EXIT_FAILURE); + } +\& + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { + perror("prctl"); + exit(EXIT_FAILURE); + } +\& + if (install_filter(strtol(argv[1], NULL, 0), + strtoul(argv[2], NULL, 0), + strtol(argv[3], NULL, 0))) + exit(EXIT_FAILURE); +\& + execv(argv[4], &argv[4]); + perror("execv"); + exit(EXIT_FAILURE); +} +.EE +.\" SRC END +.SH SEE ALSO +.BR bpfc (1), +.BR strace (1), +.BR bpf (2), +.BR prctl (2), +.BR ptrace (2), +.BR seccomp_unotify (2), +.BR sigaction (2), +.BR proc (5), +.BR signal (7), +.BR socket (7) +.PP +Various pages from the +.I libseccomp +library, including: +.BR scmp_sys_resolver (1), +.BR seccomp_export_bpf (3), +.BR seccomp_init (3), +.BR seccomp_load (3), +and +.BR seccomp_rule_add (3). +.PP +The kernel source files +.I Documentation/networking/filter.txt +and +.I Documentation/userspace\-api/seccomp_filter.rst +.\" commit c061f33f35be0ccc80f4b8e0aea5dfd2ed7e01a3 +(or +.I Documentation/prctl/seccomp_filter.txt +before Linux 4.13). +.PP +McCanne, S.\& and Jacobson, V.\& (1992) +.IR "The BSD Packet Filter: A New Architecture for User-level Packet Capture" , +Proceedings of the USENIX Winter 1993 Conference +.UR http://www.tcpdump.org/papers/bpf\-usenix93.pdf +.UE |