diff options
Diffstat (limited to 'man2/seccomp.2')
-rw-r--r-- | man2/seccomp.2 | 1245 |
1 files changed, 0 insertions, 1245 deletions
diff --git a/man2/seccomp.2 b/man2/seccomp.2 deleted file mode 100644 index b3f8026..0000000 --- a/man2/seccomp.2 +++ /dev/null @@ -1,1245 +0,0 @@ -.\" Copyright (C) 2014 Kees Cook <keescook@chromium.org> -.\" and Copyright (C) 2012 Will Drewry <wad@chromium.org> -.\" and Copyright (C) 2008, 2014,2017 Michael Kerrisk <mtk.manpages@gmail.com> -.\" and Copyright (C) 2017 Tyler Hicks <tyhicks@canonical.com> -.\" and Copyright (C) 2020 Tycho Andersen <tycho@tycho.ws> -.\" -.\" SPDX-License-Identifier: Linux-man-pages-copyleft -.\" -.TH seccomp 2 2023-10-31 "Linux man-pages 6.7" -.SH NAME -seccomp \- operate on Secure Computing state of the process -.SH LIBRARY -Standard C library -.RI ( libc ", " \-lc ) -.SH SYNOPSIS -.nf -.BR "#include <linux/seccomp.h>" " /* Definition of " SECCOMP_* " constants */" -.BR "#include <linux/filter.h>" " /* Definition of " "struct sock_fprog" " */" -.BR "#include <linux/audit.h>" " /* Definition of " AUDIT_* " constants */" -.BR "#include <linux/signal.h>" " /* Definition of " SIG* " constants */" -.BR "#include <sys/ptrace.h>" " /* Definition of " PTRACE_* " constants */" -.\" Kees Cook noted: Anything that uses SECCOMP_RET_TRACE returns will -.\" need <sys/ptrace.h> -.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */" -.B #include <unistd.h> -.P -.BI "int syscall(SYS_seccomp, unsigned int " operation ", unsigned int " flags , -.BI " void *" args ); -.fi -.P -.IR Note : -glibc provides no wrapper for -.BR seccomp (), -necessitating the use of -.BR syscall (2). -.SH DESCRIPTION -The -.BR seccomp () -system call operates on the Secure Computing (seccomp) state of the -calling process. -.P -Currently, Linux supports the following -.I operation -values: -.TP -.B SECCOMP_SET_MODE_STRICT -The only system calls that the calling thread is permitted to make are -.BR read (2), -.BR write (2), -.BR _exit (2) -(but not -.BR exit_group (2)), -and -.BR sigreturn (2). -Other system calls result in the termination of the calling thread, -or termination of the entire process with the -.B SIGKILL -signal when there is only one thread. -Strict secure computing mode is useful for number-crunching -applications that may need to execute untrusted byte code, perhaps -obtained by reading from a pipe or socket. -.IP -Note that although the calling thread can no longer call -.BR sigprocmask (2), -it can use -.BR sigreturn (2) -to block all signals apart from -.B SIGKILL -and -.BR SIGSTOP . -This means that -.BR alarm (2) -(for example) is not sufficient for restricting the process's execution time. -Instead, to reliably terminate the process, -.B SIGKILL -must be used. -This can be done by using -.BR timer_create (2) -with -.B SIGEV_SIGNAL -and -.I sigev_signo -set to -.BR SIGKILL , -or by using -.BR setrlimit (2) -to set the hard limit for -.BR RLIMIT_CPU . -.IP -This operation is available only if the kernel is configured with -.B CONFIG_SECCOMP -enabled. -.IP -The value of -.I flags -must be 0, and -.I args -must be NULL. -.IP -This operation is functionally identical to the call: -.IP -.in +4n -.EX -prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT); -.EE -.in -.TP -.B SECCOMP_SET_MODE_FILTER -The system calls allowed are defined by a pointer to a Berkeley Packet -Filter (BPF) passed via -.IR args . -This argument is a pointer to a -.IR "struct\~sock_fprog" ; -it can be designed to filter arbitrary system calls and system call -arguments. -If the filter is invalid, -.BR seccomp () -fails, returning -.B EINVAL -in -.IR errno . -.IP -If -.BR fork (2) -or -.BR clone (2) -is allowed by the filter, any child processes will be constrained to -the same system call filters as the parent. -If -.BR execve (2) -is allowed, -the existing filters will be preserved across a call to -.BR execve (2). -.IP -In order to use the -.B SECCOMP_SET_MODE_FILTER -operation, either the calling thread must have the -.B CAP_SYS_ADMIN -capability in its user namespace, or the thread must already have the -.I no_new_privs -bit set. -If that bit was not already set by an ancestor of this thread, -the thread must make the following call: -.IP -.in +4n -.EX -prctl(PR_SET_NO_NEW_PRIVS, 1); -.EE -.in -.IP -Otherwise, the -.B SECCOMP_SET_MODE_FILTER -operation fails and returns -.B EACCES -in -.IR errno . -This requirement ensures that an unprivileged process cannot apply -a malicious filter and then invoke a set-user-ID or -other privileged program using -.BR execve (2), -thus potentially compromising that program. -(Such a malicious filter might, for example, cause an attempt to use -.BR setuid (2) -to set the caller's user IDs to nonzero values to instead -return 0 without actually making the system call. -Thus, the program might be tricked into retaining superuser privileges -in circumstances where it is possible to influence it to do -dangerous things because it did not actually drop privileges.) -.IP -If -.BR prctl (2) -or -.BR seccomp () -is allowed by the attached filter, further filters may be added. -This will increase evaluation time, but allows for further reduction of -the attack surface during execution of a thread. -.IP -The -.B SECCOMP_SET_MODE_FILTER -operation is available only if the kernel is configured with -.B CONFIG_SECCOMP_FILTER -enabled. -.IP -When -.I flags -is 0, this operation is functionally identical to the call: -.IP -.in +4n -.EX -prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, args); -.EE -.in -.IP -The recognized -.I flags -are: -.RS -.TP -.BR SECCOMP_FILTER_FLAG_LOG " (since Linux 4.14)" -.\" commit e66a39977985b1e69e17c4042cb290768eca9b02 -All filter return actions except -.B SECCOMP_RET_ALLOW -should be logged. -An administrator may override this filter flag by preventing specific -actions from being logged via the -.I /proc/sys/kernel/seccomp/actions_logged -file. -.TP -.BR SECCOMP_FILTER_FLAG_NEW_LISTENER " (since Linux 5.0)" -.\" commit 6a21cc50f0c7f87dae5259f6cfefe024412313f6 -After successfully installing the filter program, -return a new user-space notification file descriptor. -(The close-on-exec flag is set for the file descriptor.) -When the filter returns -.B SECCOMP_RET_USER_NOTIF -a notification will be sent to this file descriptor. -.IP -At most one seccomp filter using the -.B SECCOMP_FILTER_FLAG_NEW_LISTENER -flag can be installed for a thread. -.IP -See -.BR seccomp_unotify (2) -for further details. -.TP -.BR SECCOMP_FILTER_FLAG_SPEC_ALLOW " (since Linux 4.17)" -.\" commit 00a02d0c502a06d15e07b857f8ff921e3e402675 -Disable Speculative Store Bypass mitigation. -.TP -.B SECCOMP_FILTER_FLAG_TSYNC -When adding a new filter, synchronize all other threads of the calling -process to the same seccomp filter tree. -A "filter tree" is the ordered list of filters attached to a thread. -(Attaching identical filters in separate -.BR seccomp () -calls results in different filters from this perspective.) -.IP -If any thread cannot synchronize to the same filter tree, -the call will not attach the new seccomp filter, -and will fail, returning the first thread ID found that cannot synchronize. -Synchronization will fail if another thread in the same process is in -.B SECCOMP_MODE_STRICT -or if it has attached new seccomp filters to itself, -diverging from the calling thread's filter tree. -.RE -.TP -.BR SECCOMP_GET_ACTION_AVAIL " (since Linux 4.14)" -.\" commit d612b1fd8010d0d67b5287fe146b8b55bcbb8655 -Test to see if an action is supported by the kernel. -This operation is helpful to confirm that the kernel knows -of a more recently added filter return action -since the kernel treats all unknown actions as -.BR SECCOMP_RET_KILL_PROCESS . -.IP -The value of -.I flags -must be 0, and -.I args -must be a pointer to an unsigned 32-bit filter return action. -.TP -.BR SECCOMP_GET_NOTIF_SIZES " (since Linux 5.0)" -.\" commit 6a21cc50f0c7f87dae5259f6cfefe024412313f6 -Get the sizes of the seccomp user-space notification structures. -Since these structures may evolve and grow over time, -this command can be used to determine how -much memory to allocate for sending and receiving notifications. -.IP -The value of -.I flags -must be 0, and -.I args -must be a pointer to a -.IR "struct seccomp_notif_sizes" , -which has the following form: -.IP -.EX -struct seccomp_notif_sizes - __u16 seccomp_notif; /* Size of notification structure */ - __u16 seccomp_notif_resp; /* Size of response structure */ - __u16 seccomp_data; /* Size of \[aq]struct seccomp_data\[aq] */ -}; -.EE -.IP -See -.BR seccomp_unotify (2) -for further details. -.\" -.SS Filters -When adding filters via -.BR SECCOMP_SET_MODE_FILTER , -.I args -points to a filter program: -.P -.in +4n -.EX -struct sock_fprog { - unsigned short len; /* Number of BPF instructions */ - struct sock_filter *filter; /* Pointer to array of - BPF instructions */ -}; -.EE -.in -.P -Each program must contain one or more BPF instructions: -.P -.in +4n -.EX -struct sock_filter { /* Filter block */ - __u16 code; /* Actual filter code */ - __u8 jt; /* Jump true */ - __u8 jf; /* Jump false */ - __u32 k; /* Generic multiuse field */ -}; -.EE -.in -.P -When executing the instructions, the BPF program operates on the -system call information made available (i.e., use the -.B BPF_ABS -addressing mode) as a (read-only) -.\" Quoting Kees Cook: -.\" If BPF even allows changing the data, it's not copied back to -.\" the syscall when it runs. Anything wanting to do things like -.\" that would need to use ptrace to catch the call and directly -.\" modify the registers before continuing with the call. -buffer of the following form: -.P -.in +4n -.EX -struct seccomp_data { - int nr; /* System call number */ - __u32 arch; /* AUDIT_ARCH_* value - (see <linux/audit.h>) */ - __u64 instruction_pointer; /* CPU instruction pointer */ - __u64 args[6]; /* Up to 6 system call arguments */ -}; -.EE -.in -.P -Because numbering of system calls varies between architectures and -some architectures (e.g., x86-64) allow user-space code to use -the calling conventions of multiple architectures -(and the convention being used may vary over the life of a process that uses -.BR execve (2) -to execute binaries that employ the different conventions), -it is usually necessary to verify the value of the -.I arch -field. -.P -It is strongly recommended to use an allow-list approach whenever -possible because such an approach is more robust and simple. -A deny-list will have to be updated whenever a potentially -dangerous system call is added (or a dangerous flag or option if those -are deny-listed), and it is often possible to alter the -representation of a value without altering its meaning, leading to -a deny-list bypass. -See also -.I Caveats -below. -.P -The -.I arch -field is not unique for all calling conventions. -The x86-64 ABI and the x32 ABI both use -.B AUDIT_ARCH_X86_64 -as -.IR arch , -and they run on the same processors. -Instead, the mask -.B __X32_SYSCALL_BIT -is used on the system call number to tell the two ABIs apart. -.\" As noted by Dave Drysdale in a note at the end of -.\" https://lwn.net/Articles/604515/ -.\" One additional detail to point out for the x32 ABI case: -.\" the syscall number gets a high bit set (__X32_SYSCALL_BIT), -.\" to mark it as an x32 call. -.\" -.\" If x32 support is included in the kernel, then __SYSCALL_MASK -.\" will have a value that is not all-ones, and this will trigger -.\" an extra instruction in system_call to mask off the extra bit, -.\" so that the syscall table indexing still works. -.P -This means that a policy must either deny all syscalls with -.B __X32_SYSCALL_BIT -or it must recognize syscalls with and without -.B __X32_SYSCALL_BIT -set. -A list of system calls to be denied based on -.I nr -that does not also contain -.I nr -values with -.B __X32_SYSCALL_BIT -set can be bypassed by a malicious program that sets -.BR __X32_SYSCALL_BIT . -.P -Additionally, kernels prior to Linux 5.4 incorrectly permitted -.I nr -in the ranges 512-547 as well as the corresponding non-x32 syscalls ORed -with -.BR __X32_SYSCALL_BIT . -For example, -.I nr -== 521 and -.I nr -== (101 | -.BR __X32_SYSCALL_BIT ) -would result in invocations of -.BR ptrace (2) -with potentially confused x32-vs-x86_64 semantics in the kernel. -Policies intended to work on kernels before Linux 5.4 must ensure that they -deny or otherwise correctly handle these system calls. -On Linux 5.4 and newer, -.\" commit 6365b842aae4490ebfafadfc6bb27a6d3cc54757 -such system calls will fail with the error -.BR ENOSYS , -without doing anything. -.P -The -.I instruction_pointer -field provides the address of the machine-language instruction that -performed the system call. -This might be useful in conjunction with the use of -.IR /proc/ pid /maps -to perform checks based on which region (mapping) of the program -made the system call. -(Probably, it is wise to lock down the -.BR mmap (2) -and -.BR mprotect (2) -system calls to prevent the program from subverting such checks.) -.P -When checking values from -.IR args , -keep in mind that arguments are often -silently truncated before being processed, but after the seccomp check. -For example, this happens if the i386 ABI is used on an -x86-64 kernel: although the kernel will normally not look beyond -the 32 lowest bits of the arguments, the values of the full -64-bit registers will be present in the seccomp data. -A less surprising example is that if the x86-64 ABI is used to perform -a system call that takes an argument of type -.IR int , -the more-significant half of the argument register is ignored by -the system call, but visible in the seccomp data. -.P -A seccomp filter returns a 32-bit value consisting of two parts: -the most significant 16 bits -(corresponding to the mask defined by the constant -.BR SECCOMP_RET_ACTION_FULL ) -contain one of the "action" values listed below; -the least significant 16-bits (defined by the constant -.BR SECCOMP_RET_DATA ) -are "data" to be associated with this return value. -.P -If multiple filters exist, they are \fIall\fP executed, -in reverse order of their addition to the filter tree\[em]that is, -the most recently installed filter is executed first. -(Note that all filters will be called -even if one of the earlier filters returns -.BR SECCOMP_RET_KILL . -This is done to simplify the kernel code and to provide a -tiny speed-up in the execution of sets of filters by -avoiding a check for this uncommon case.) -.\" From an Aug 2015 conversation with Kees Cook where I asked why *all* -.\" filters are applied even if one of the early filters returns -.\" SECCOMP_RET_KILL: -.\" -.\" It's just because it would be an optimization that would only speed up -.\" the RET_KILL case, but it's the uncommon one and the one that doesn't -.\" benefit meaningfully from such a change (you need to kill the process -.\" really quickly?). We would speed up killing a program at the (albeit -.\" tiny) expense to all other filtered programs. Best to keep the filter -.\" execution logic clear, simple, and as fast as possible for all -.\" filters. -The return value for the evaluation of a given system call is the first-seen -action value of highest precedence (along with its accompanying data) -returned by execution of all of the filters. -.P -In decreasing order of precedence, -the action values that may be returned by a seccomp filter are: -.TP -.BR SECCOMP_RET_KILL_PROCESS " (since Linux 4.14)" -.\" commit 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945 -.\" commit 0466bdb99e8744bc9befa8d62a317f0fd7fd7421 -This value results in immediate termination of the process, -with a core dump. -The system call is not executed. -By contrast with -.B SECCOMP_RET_KILL_THREAD -below, all threads in the thread group are terminated. -(For a discussion of thread groups, see the description of the -.B CLONE_THREAD -flag in -.BR clone (2).) -.IP -The process terminates -.I "as though" -killed by a -.B SIGSYS -signal. -Even if a signal handler has been registered for -.BR SIGSYS , -the handler will be ignored in this case and the process always terminates. -To a parent process that is waiting on this process (using -.BR waitpid (2) -or similar), the returned -.I wstatus -will indicate that its child was terminated as though by a -.B SIGSYS -signal. -.TP -.BR SECCOMP_RET_KILL_THREAD " (or " SECCOMP_RET_KILL ) -This value results in immediate termination of the thread -that made the system call. -The system call is not executed. -Other threads in the same thread group will continue to execute. -.IP -The thread terminates -.I "as though" -killed by a -.B SIGSYS -signal. -See -.B SECCOMP_RET_KILL_PROCESS -above. -.IP -.\" See these commits: -.\" seccomp: dump core when using SECCOMP_RET_KILL -.\" (b25e67161c295c98acda92123b2dd1e7d8642901) -.\" seccomp: Only dump core when single-threaded -.\" (d7276e321ff8a53106a59c85ca46d03e34288893) -Before Linux 4.11, -any process terminated in this way would not trigger a coredump -(even though -.B SIGSYS -is documented in -.BR signal (7) -as having a default action of termination with a core dump). -Since Linux 4.11, -a single-threaded process will dump core if terminated in this way. -.IP -With the addition of -.B SECCOMP_RET_KILL_PROCESS -in Linux 4.14, -.B SECCOMP_RET_KILL_THREAD -was added as a synonym for -.BR SECCOMP_RET_KILL , -in order to more clearly distinguish the two actions. -.IP -.BR Note : -the use of -.B SECCOMP_RET_KILL_THREAD -to kill a single thread in a multithreaded process is likely to leave the -process in a permanently inconsistent and possibly corrupt state. -.TP -.B SECCOMP_RET_TRAP -This value results in the kernel sending a thread-directed -.B SIGSYS -signal to the triggering thread. -(The system call is not executed.) -Various fields will be set in the -.I siginfo_t -structure (see -.BR sigaction (2)) -associated with signal: -.RS -.IP \[bu] 3 -.I si_signo -will contain -.BR SIGSYS . -.IP \[bu] -.I si_call_addr -will show the address of the system call instruction. -.IP \[bu] -.I si_syscall -and -.I si_arch -will indicate which system call was attempted. -.IP \[bu] -.I si_code -will contain -.BR SYS_SECCOMP . -.IP \[bu] -.I si_errno -will contain the -.B SECCOMP_RET_DATA -portion of the filter return value. -.RE -.IP -The program counter will be as though the system call happened -(i.e., the program counter will not point to the system call instruction). -The return value register will contain an architecture\-dependent value; -if resuming execution, set it to something appropriate for the system call. -(The architecture dependency is because replacing it with -.B ENOSYS -could overwrite some useful information.) -.TP -.B SECCOMP_RET_ERRNO -This value results in the -.B SECCOMP_RET_DATA -portion of the filter's return value being passed to user space as the -.I errno -value without executing the system call. -.TP -.BR SECCOMP_RET_USER_NOTIF " (since Linux 5.0)" -.\" commit 6a21cc50f0c7f87dae5259f6cfefe024412313f6 -Forward the system call to an attached user-space supervisor -process to allow that process to decide what to do with the system call. -If there is no attached supervisor (either -because the filter was not installed with the -.B SECCOMP_FILTER_FLAG_NEW_LISTENER -flag or because the file descriptor was closed), the filter returns -.B ENOSYS -(similar to what happens when a filter returns -.B SECCOMP_RET_TRACE -and there is no tracer). -See -.BR seccomp_unotify (2) -for further details. -.IP -Note that the supervisor process will not be notified -if another filter returns an action value with a precedence greater than -.BR SECCOMP_RET_USER_NOTIF . -.TP -.B SECCOMP_RET_TRACE -When returned, this value will cause the kernel to attempt to notify a -.BR ptrace (2)-based -tracer prior to executing the system call. -If there is no tracer present, -the system call is not executed and returns a failure status with -.I errno -set to -.BR ENOSYS . -.IP -A tracer will be notified if it requests -.B PTRACE_O_TRACESECCOMP -using -.IR ptrace(PTRACE_SETOPTIONS) . -The tracer will be notified of a -.B PTRACE_EVENT_SECCOMP -and the -.B SECCOMP_RET_DATA -portion of the filter's return value will be available to the tracer via -.BR PTRACE_GETEVENTMSG . -.IP -The tracer can skip the system call by changing the system call number -to \-1. -Alternatively, the tracer can change the system call -requested by changing the system call to a valid system call number. -If the tracer asks to skip the system call, then the system call will -appear to return the value that the tracer puts in the return value register. -.IP -.\" This was changed in ce6526e8afa4. -.\" A related hole, using PTRACE_SYSCALL instead of SECCOMP_RET_TRACE, was -.\" changed in arch-specific commits, e.g. 93e35efb8de4 for X86 and -.\" 0f3912fd934c for ARM. -Before Linux 4.8, the seccomp check will not be run again after the tracer is -notified. -(This means that, on older kernels, seccomp-based sandboxes -.B "must not" -allow use of -.BR ptrace (2)\[em]even -of other -sandboxed processes\[em]without extreme care; -ptracers can use this mechanism to escape from the seccomp sandbox.) -.IP -Note that a tracer process will not be notified -if another filter returns an action value with a precedence greater than -.BR SECCOMP_RET_TRACE . -.TP -.BR SECCOMP_RET_LOG " (since Linux 4.14)" -.\" commit 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4 -This value results in the system call being executed after -the filter return action is logged. -An administrator may override the logging of this action via -the -.I /proc/sys/kernel/seccomp/actions_logged -file. -.TP -.B SECCOMP_RET_ALLOW -This value results in the system call being executed. -.P -If an action value other than one of the above is specified, -then the filter action is treated as either -.B SECCOMP_RET_KILL_PROCESS -(since Linux 4.14) -.\" commit 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945 -or -.B SECCOMP_RET_KILL_THREAD -(in Linux 4.13 and earlier). -.\" -.SS /proc interfaces -The files in the directory -.I /proc/sys/kernel/seccomp -provide additional seccomp information and configuration: -.TP -.IR actions_avail " (since Linux 4.14)" -.\" commit 8e5f1ad116df6b0de65eac458d5e7c318d1c05af -A read-only ordered list of seccomp filter return actions in string form. -The ordering, from left-to-right, is in decreasing order of precedence. -The list represents the set of seccomp filter return actions -supported by the kernel. -.TP -.IR actions_logged " (since Linux 4.14)" -.\" commit 0ddec0fc8900201c0897b87b762b7c420436662f -A read-write ordered list of seccomp filter return actions that -are allowed to be logged. -Writes to the file do not need to be in ordered form but reads from -the file will be ordered in the same way as the -.I actions_avail -file. -.IP -It is important to note that the value of -.I actions_logged -does not prevent certain filter return actions from being logged when -the audit subsystem is configured to audit a task. -If the action is not found in the -.I actions_logged -file, the final decision on whether to audit the action for that task is -ultimately left up to the audit subsystem to decide for all filter return -actions other than -.BR SECCOMP_RET_ALLOW . -.IP -The "allow" string is not accepted in the -.I actions_logged -file as it is not possible to log -.B SECCOMP_RET_ALLOW -actions. -Attempting to write "allow" to the file will fail with the error -.BR EINVAL . -.\" -.SS Audit logging of seccomp actions -.\" commit 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4 -Since Linux 4.14, the kernel provides the facility to log the -actions returned by seccomp filters in the audit log. -The kernel makes the decision to log an action based on -the action type, whether or not the action is present in the -.I actions_logged -file, and whether kernel auditing is enabled -(e.g., via the kernel boot option -.IR audit=1 ). -.\" or auditing could be enabled via the netlink API (AUDIT_SET) -The rules are as follows: -.IP \[bu] 3 -If the action is -.BR SECCOMP_RET_ALLOW , -the action is not logged. -.IP \[bu] -Otherwise, if the action is either -.B SECCOMP_RET_KILL_PROCESS -or -.BR SECCOMP_RET_KILL_THREAD , -and that action appears in the -.I actions_logged -file, the action is logged. -.IP \[bu] -Otherwise, if the filter has requested logging (the -.B SECCOMP_FILTER_FLAG_LOG -flag) -and the action appears in the -.I actions_logged -file, the action is logged. -.IP \[bu] -Otherwise, if kernel auditing is enabled and the process is being audited -.RB ( autrace (8)), -the action is logged. -.IP \[bu] -Otherwise, the action is not logged. -.SH RETURN VALUE -On success, -.BR seccomp () -returns 0. -On error, if -.B SECCOMP_FILTER_FLAG_TSYNC -was used, -the return value is the ID of the thread -that caused the synchronization failure. -(This ID is a kernel thread ID of the type returned by -.BR clone (2) -and -.BR gettid (2).) -On other errors, \-1 is returned, and -.I errno -is set to indicate the error. -.SH ERRORS -.BR seccomp () -can fail for the following reasons: -.TP -.B EACCES -The caller did not have the -.B CAP_SYS_ADMIN -capability in its user namespace, or had not set -.I no_new_privs -before using -.BR SECCOMP_SET_MODE_FILTER . -.TP -.B EBUSY -While installing a new filter, the -.B SECCOMP_FILTER_FLAG_NEW_LISTENER -flag was specified, -but a previous filter had already been installed with that flag. -.TP -.B EFAULT -.I args -was not a valid address. -.TP -.B EINVAL -.I operation -is unknown or is not supported by this kernel version or configuration. -.TP -.B EINVAL -The specified -.I flags -are invalid for the given -.IR operation . -.TP -.B EINVAL -.I operation -included -.BR BPF_ABS , -but the specified offset was not aligned to a 32-bit boundary or exceeded -.IR "sizeof(struct\~seccomp_data)" . -.TP -.B EINVAL -.\" See kernel/seccomp.c::seccomp_may_assign_mode() in Linux 3.18 sources -A secure computing mode has already been set, and -.I operation -differs from the existing setting. -.TP -.B EINVAL -.I operation -specified -.BR SECCOMP_SET_MODE_FILTER , -but the filter program pointed to by -.I args -was not valid or the length of the filter program was zero or exceeded -.B BPF_MAXINSNS -(4096) instructions. -.TP -.B ENOMEM -Out of memory. -.TP -.B ENOMEM -.\" ENOMEM in kernel/seccomp.c::seccomp_attach_filter() in Linux 3.18 sources -The total length of all filter programs attached -to the calling thread would exceed -.B MAX_INSNS_PER_PATH -(32768) instructions. -Note that for the purposes of calculating this limit, -each already existing filter program incurs an -overhead penalty of 4 instructions. -.TP -.B EOPNOTSUPP -.I operation -specified -.BR SECCOMP_GET_ACTION_AVAIL , -but the kernel does not support the filter return action specified by -.IR args . -.TP -.B ESRCH -Another thread caused a failure during thread sync, but its ID could not -be determined. -.SH STANDARDS -Linux. -.SH HISTORY -Linux 3.17. -.\" FIXME . Add glibc version -.SH NOTES -Rather than hand-coding seccomp filters as shown in the example below, -you may prefer to employ the -.I libseccomp -library, which provides a front-end for generating seccomp filters. -.P -The -.I Seccomp -field of the -.IR /proc/ pid /status -file provides a method of viewing the seccomp mode of a process; see -.BR proc (5). -.P -.BR seccomp () -provides a superset of the functionality provided by the -.BR prctl (2) -.B PR_SET_SECCOMP -operation (which does not support -.IR flags ). -.P -Since Linux 4.4, the -.BR ptrace (2) -.B PTRACE_SECCOMP_GET_FILTER -operation can be used to dump a process's seccomp filters. -.\" -.SS Architecture support for seccomp BPF -Architecture support for seccomp BPF filtering -.\" Check by grepping for HAVE_ARCH_SECCOMP_FILTER in Kconfig files in -.\" kernel source. Last checked in Linux 4.16-rc source. -is available on the following architectures: -.IP \[bu] 3 -x86-64, i386, x32 (since Linux 3.5) -.PD 0 -.IP \[bu] -ARM (since Linux 3.8) -.IP \[bu] -s390 (since Linux 3.8) -.IP \[bu] -MIPS (since Linux 3.16) -.IP \[bu] -ARM-64 (since Linux 3.19) -.IP \[bu] -PowerPC (since Linux 4.3) -.IP \[bu] -Tile (since Linux 4.3) -.IP \[bu] -PA-RISC (since Linux 4.6) -.\" User mode Linux since Linux 4.6 -.PD -.\" -.SS Caveats -There are various subtleties to consider when applying seccomp filters -to a program, including the following: -.IP \[bu] 3 -Some traditional system calls have user-space implementations in the -.BR vdso (7) -on many architectures. -Notable examples include -.BR clock_gettime (2), -.BR gettimeofday (2), -and -.BR time (2). -On such architectures, -seccomp filtering for these system calls will have no effect. -(However, there are cases where the -.BR vdso (7) -implementations may fall back to invoking the true system call, -in which case seccomp filters would see the system call.) -.IP \[bu] -Seccomp filtering is based on system call numbers. -However, applications typically do not directly invoke system calls, -but instead call wrapper functions in the C library which -in turn invoke the system calls. -Consequently, one must be aware of the following: -.RS -.IP \[bu] 3 -The glibc wrappers for some traditional system calls may actually -employ system calls with different names in the kernel. -For example, the -.BR exit (2) -wrapper function actually employs the -.BR exit_group (2) -system call, and the -.BR fork (2) -wrapper function actually calls -.BR clone (2). -.IP \[bu] -The behavior of wrapper functions may vary across architectures, -according to the range of system calls provided on those architectures. -In other words, the same wrapper function may invoke -different system calls on different architectures. -.IP \[bu] -Finally, the behavior of wrapper functions can change across glibc versions. -For example, in older versions, the glibc wrapper function for -.BR open (2) -invoked the system call of the same name, -but starting in glibc 2.26, the implementation switched to calling -.BR openat (2) -on all architectures. -.RE -.P -The consequence of the above points is that it may be necessary -to filter for a system call other than might be expected. -Various manual pages in Section 2 provide helpful details -about the differences between wrapper functions and -the underlying system calls in subsections entitled -.IR "C library/kernel differences" . -.P -Furthermore, note that the application of seccomp filters -even risks causing bugs in an application, -when the filters cause unexpected failures for legitimate operations -that the application might need to perform. -Such bugs may not easily be discovered when testing the seccomp -filters if the bugs occur in rarely used application code paths. -.\" -.SS Seccomp-specific BPF details -Note the following BPF details specific to seccomp filters: -.IP \[bu] 3 -The -.B BPF_H -and -.B BPF_B -size modifiers are not supported: all operations must load and store -(4-byte) words -.RB ( BPF_W ). -.IP \[bu] -To access the contents of the -.I seccomp_data -buffer, use the -.B BPF_ABS -addressing mode modifier. -.IP \[bu] -The -.B BPF_LEN -addressing mode modifier yields an immediate mode operand -whose value is the size of the -.I seccomp_data -buffer. -.SH EXAMPLES -The program below accepts four or more arguments. -The first three arguments are a system call number, -a numeric architecture identifier, and an error number. -The program uses these values to construct a BPF filter -that is used at run time to perform the following checks: -.IP \[bu] 3 -If the program is not running on the specified architecture, -the BPF filter causes system calls to fail with the error -.BR ENOSYS . -.IP \[bu] -If the program attempts to execute the system call with the specified number, -the BPF filter causes the system call to fail, with -.I errno -being set to the specified error number. -.P -The remaining command-line arguments specify -the pathname and additional arguments of a program -that the example program should attempt to execute using -.BR execv (3) -(a library function that employs the -.BR execve (2) -system call). -Some example runs of the program are shown below. -.P -First, we display the architecture that we are running on (x86-64) -and then construct a shell function that looks up system call -numbers on this architecture: -.P -.in +4n -.EX -$ \fBuname \-m\fP -x86_64 -$ \fBsyscall_nr() { - cat /usr/src/linux/arch/x86/syscalls/syscall_64.tbl | \e - awk \[aq]$2 != "x32" && $3 == "\[aq]$1\[aq]" { print $1 }\[aq] -}\fP -.EE -.in -.P -When the BPF filter rejects a system call (case [2] above), -it causes the system call to fail with the error number -specified on the command line. -In the experiments shown here, we'll use error number 99: -.P -.in +4n -.EX -$ \fBerrno 99\fP -EADDRNOTAVAIL 99 Cannot assign requested address -.EE -.in -.P -In the following example, we attempt to run the command -.BR whoami (1), -but the BPF filter rejects the -.BR execve (2) -system call, so that the command is not even executed: -.P -.in +4n -.EX -$ \fBsyscall_nr execve\fP -59 -$ \fB./a.out\fP -Usage: ./a.out <syscall_nr> <arch> <errno> <prog> [<args>] -Hint for <arch>: AUDIT_ARCH_I386: 0x40000003 - AUDIT_ARCH_X86_64: 0xC000003E -$ \fB./a.out 59 0xC000003E 99 /bin/whoami\fP -execv: Cannot assign requested address -.EE -.in -.P -In the next example, the BPF filter rejects the -.BR write (2) -system call, so that, although it is successfully started, the -.BR whoami (1) -command is not able to write output: -.P -.in +4n -.EX -$ \fBsyscall_nr write\fP -1 -$ \fB./a.out 1 0xC000003E 99 /bin/whoami\fP -.EE -.in -.P -In the final example, -the BPF filter rejects a system call that is not used by the -.BR whoami (1) -command, so it is able to successfully execute and produce output: -.P -.in +4n -.EX -$ \fBsyscall_nr preadv\fP -295 -$ \fB./a.out 295 0xC000003E 99 /bin/whoami\fP -cecilia -.EE -.in -.SS Program source -.\" SRC BEGIN (seccomp.c) -.EX -#include <linux/audit.h> -#include <linux/filter.h> -#include <linux/seccomp.h> -#include <stddef.h> -#include <stdio.h> -#include <stdlib.h> -#include <sys/prctl.h> -#include <sys/syscall.h> -#include <unistd.h> -\& -#define X32_SYSCALL_BIT 0x40000000 -#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) -\& -static int -install_filter(int syscall_nr, unsigned int t_arch, int f_errno) -{ - unsigned int upper_nr_limit = 0xffffffff; -\& - /* Assume that AUDIT_ARCH_X86_64 means the normal x86\-64 ABI - (in the x32 ABI, all system calls have bit 30 set in the - \[aq]nr\[aq] field, meaning the numbers are >= X32_SYSCALL_BIT). */ - if (t_arch == AUDIT_ARCH_X86_64) - upper_nr_limit = X32_SYSCALL_BIT \- 1; -\& - struct sock_filter filter[] = { - /* [0] Load architecture from \[aq]seccomp_data\[aq] buffer into - accumulator. */ - BPF_STMT(BPF_LD | BPF_W | BPF_ABS, - (offsetof(struct seccomp_data, arch))), -\& - /* [1] Jump forward 5 instructions if architecture does not - match \[aq]t_arch\[aq]. */ - BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, t_arch, 0, 5), -\& - /* [2] Load system call number from \[aq]seccomp_data\[aq] buffer into - accumulator. */ - BPF_STMT(BPF_LD | BPF_W | BPF_ABS, - (offsetof(struct seccomp_data, nr))), -\& - /* [3] Check ABI \- only needed for x86\-64 in deny\-list use - cases. Use BPF_JGT instead of checking against the bit - mask to avoid having to reload the syscall number. */ - BPF_JUMP(BPF_JMP | BPF_JGT | BPF_K, upper_nr_limit, 3, 0), -\& - /* [4] Jump forward 1 instruction if system call number - does not match \[aq]syscall_nr\[aq]. */ - BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, syscall_nr, 0, 1), -\& - /* [5] Matching architecture and system call: don\[aq]t execute - the system call, and return \[aq]f_errno\[aq] in \[aq]errno\[aq]. */ - BPF_STMT(BPF_RET | BPF_K, - SECCOMP_RET_ERRNO | (f_errno & SECCOMP_RET_DATA)), -\& - /* [6] Destination of system call number mismatch: allow other - system calls. */ - BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), -\& - /* [7] Destination of architecture mismatch: kill process. */ - BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS), - }; -\& - struct sock_fprog prog = { - .len = ARRAY_SIZE(filter), - .filter = filter, - }; -\& - if (syscall(SYS_seccomp, SECCOMP_SET_MODE_FILTER, 0, &prog)) { - perror("seccomp"); - return 1; - } -\& - return 0; -} -\& -int -main(int argc, char *argv[]) -{ - if (argc < 5) { - fprintf(stderr, "Usage: " - "%s <syscall_nr> <arch> <errno> <prog> [<args>]\en" - "Hint for <arch>: AUDIT_ARCH_I386: 0x%X\en" - " AUDIT_ARCH_X86_64: 0x%X\en" - "\en", argv[0], AUDIT_ARCH_I386, AUDIT_ARCH_X86_64); - exit(EXIT_FAILURE); - } -\& - if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { - perror("prctl"); - exit(EXIT_FAILURE); - } -\& - if (install_filter(strtol(argv[1], NULL, 0), - strtoul(argv[2], NULL, 0), - strtol(argv[3], NULL, 0))) - exit(EXIT_FAILURE); -\& - execv(argv[4], &argv[4]); - perror("execv"); - exit(EXIT_FAILURE); -} -.EE -.\" SRC END -.SH SEE ALSO -.BR bpfc (1), -.BR strace (1), -.BR bpf (2), -.BR prctl (2), -.BR ptrace (2), -.BR seccomp_unotify (2), -.BR sigaction (2), -.BR proc (5), -.BR signal (7), -.BR socket (7) -.P -Various pages from the -.I libseccomp -library, including: -.BR scmp_sys_resolver (1), -.BR seccomp_export_bpf (3), -.BR seccomp_init (3), -.BR seccomp_load (3), -and -.BR seccomp_rule_add (3). -.P -The kernel source files -.I Documentation/networking/filter.txt -and -.I Documentation/userspace\-api/seccomp_filter.rst -.\" commit c061f33f35be0ccc80f4b8e0aea5dfd2ed7e01a3 -(or -.I Documentation/prctl/seccomp_filter.txt -before Linux 4.13). -.P -McCanne, S.\& and Jacobson, V.\& (1992) -.IR "The BSD Packet Filter: A New Architecture for User-level Packet Capture" , -Proceedings of the USENIX Winter 1993 Conference -.UR http://www.tcpdump.org/papers/bpf\-usenix93.pdf -.UE |